From 9ccea796219f4e5382510b7b00f4cbc26c78bada Mon Sep 17 00:00:00 2001
From: Jessica Black <me@jessica.black>
Date: Mon, 3 Jun 2024 11:26:20 -0700
Subject: [PATCH] Extract from `foundation-libs`

---
 .github/CODEOWNERS                           |   1 +
 .github/workflows/check-dynamic.yml          |   2 +-
 Cargo.toml                                   |  15 +-
 README.md                                    |  30 +-
 src/fingerprint.rs                           | 230 +++++++++++
 src/lib.rs                                   | 399 ++++++++++++++++++-
 src/main.rs                                  |   9 -
 src/serialize.rs                             |  55 +++
 src/stream.rs                                |  91 +++++
 src/tests.rs                                 | 146 +++++++
 testdata/eftest.key                          |   1 +
 testdata/facebook-folly-Version.cpp          |  23 ++
 testdata/facebook-folly-Version.cpp.stripped |   4 +
 13 files changed, 960 insertions(+), 46 deletions(-)
 create mode 100644 .github/CODEOWNERS
 create mode 100644 src/fingerprint.rs
 delete mode 100644 src/main.rs
 create mode 100644 src/serialize.rs
 create mode 100644 src/stream.rs
 create mode 100644 src/tests.rs
 create mode 100644 testdata/eftest.key
 create mode 100644 testdata/facebook-folly-Version.cpp
 create mode 100644 testdata/facebook-folly-Version.cpp.stripped

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 0000000..2a0d5ea
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1 @@
+* @fossas/analysis
diff --git a/.github/workflows/check-dynamic.yml b/.github/workflows/check-dynamic.yml
index 2a7ae0e..b9c2855 100644
--- a/.github/workflows/check-dynamic.yml
+++ b/.github/workflows/check-dynamic.yml
@@ -13,7 +13,7 @@ jobs:
           setup: echo "no setup"
           build: cargo build
         - host: macos-latest
-          setup: rustup target add aarch64-apple-darwin
+          setup: rustup target add aarch64-apple-darwin && rustup target add x86_64-apple-darwin
           build: cargo build --target aarch64-apple-darwin && cargo build --target x86_64-apple-darwin
 
     runs-on: ${{ matrix.settings.host }}
diff --git a/Cargo.toml b/Cargo.toml
index 6184e61..3b40fff 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,8 +1,15 @@
 [package]
-name = "template-rust"
-version = "0.1.0"
+name = "fingerprint"
+version = "1.0.1"
 edition = "2021"
 
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
 [dependencies]
+getset = "0.1.2"
+hex = "0.4.3"
+iter-read = "0.3.1"
+serde = { version = "1.0.140", features = ["derive"] }
+thiserror = "1.0.31"
+sha2 = "0.10.6"
+
+[dev-dependencies]
+typed-builder = "0.10.0"
diff --git a/README.md b/README.md
index cffb3b1..2586b63 100644
--- a/README.md
+++ b/README.md
@@ -1,29 +1 @@
-# template-rust
-
-Template repository for a Rust project.
-
-TODOs for a new project:
-- [ ] Change the license if MPL2 is not appropriate for the project. Make sure to do this before adding any code.
-- [ ] Ensure the dev docs (in particular the release and compatibility semantics) are valid for this project.
-- [ ] Set [CODEOWNERS] to the team that owns the repository.
-- [ ] Create an API user in [FOSSA] and store it as a secret named `FOSSA_API_KEY`.
-  - Consider naming it with the pattern `ci-{REPO_NAME}`. For example, `ci-template-rust`.
-- [ ] Update repository permissions as appropriate. Generally, the CODEOWNER team is set as admin.
-- [ ] Update branch protection rules as appropriate.
-- [ ] Update repository features and settings. Recommended defaults:
-  - [ ] Turn off all features (Wikis, Issues, Sponsorships, Discussions, Projects); FOSSA uses other systems for these.
-  - [ ] Only allow squash merging.
-  - [ ] Always suggest updating PR branches.
-  - [ ] Allow auto-merge.
-  - [ ] Automatically delete head branches.
-
-Then just edit the included Rust project, or remove it and `cargo init` your project, and get going!
-
-[codeowners]: https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
-[fossa]: https://app.fossa.com
-
-# recommendations
-
-- If publishing a Linux binary, consider providing two: one that [statically links libc](./docs/dev/reference/static-binary.md), and one that doesn't.
-- If publishing a macOS binary, consider providing two: one for [Intel and one for M-series CPUs](./docs/dev/reference/macos-arch.md).
-- If this application may be used on AWS Graviton or similar, consider providing an ARM build for Linux as well.
+# lib-fingerprint
diff --git a/src/fingerprint.rs b/src/fingerprint.rs
new file mode 100644
index 0000000..53b3f8d
--- /dev/null
+++ b/src/fingerprint.rs
@@ -0,0 +1,230 @@
+use std::io::{self, BufRead, BufReader, Cursor, Read, Write};
+
+use iter_read::IterRead;
+use sha2::{Digest, Sha256};
+
+use crate::{stream::ConvertCRLFToLF, CommentStrippedSHA256, Error, Fingerprint, RawSHA256};
+
+/// Fingerprint the file using the [`RawSHA256`] kind.
+pub fn raw<R: BufRead>(stream: &mut R) -> Result<Fingerprint<RawSHA256>, Error> {
+    // Sniff the head of the stream; that sample alone decides binary versus text handling.
+    let BinaryCheck { read: head, is_binary } = content_is_binary(stream)?;
+
+    // The sniffed bytes were consumed from `stream`, so replay them ahead of the remainder.
+    let mut replayed = Cursor::new(head).chain(stream);
+    let mut hasher = Sha256::new();
+    let hashed = if is_binary {
+        content_binary(&mut replayed, &mut hasher)
+    } else {
+        content_text(&mut replayed, &mut hasher)
+    };
+    hashed?;
+    Fingerprint::from_digest(hasher)
+}
+
+/// Fingerprint the file using the [`CommentStrippedSHA256`] kind.
+///
+/// Returns `Ok(None)` when the content is binary or is not valid UTF-8,
+/// since comment stripping only applies to text.
+pub fn comment_stripped<R: BufRead>(
+    stream: &mut R,
+) -> Result<Option<Fingerprint<CommentStrippedSHA256>>, Error> {
+    // Read the start of the stream, and decide whether to treat the rest of the stream as binary based on that.
+    let BinaryCheck { read, is_binary } = content_is_binary(stream)?;
+    if is_binary {
+        return Ok(None);
+    }
+
+    // Chain the part of the stream already read to evaluate binary along with the rest of the stream.
+    let mut stream = Cursor::new(read).chain(stream);
+    let mut hasher = Sha256::new();
+    match content_stripped(&mut stream, &mut hasher) {
+        Ok(()) => Fingerprint::from_digest(hasher).map(Some),
+        // A binary file can slip past the zero-byte sniff above; line reading then fails on
+        // invalid UTF-8, which std reports as `io::ErrorKind::InvalidData`.
+        // Match on the error kind rather than the message text: the wording of std's
+        // error messages is not part of its API, while the kind is documented.
+        Err(Error::IO(err)) if err.kind() == io::ErrorKind::InvalidData => Ok(None),
+        Err(err) => Err(err),
+    }
+}
+
+/// The result of checking a file for whether it is binary.
+pub(crate) struct BinaryCheck {
+    pub(crate) read: Vec<u8>,
+    pub(crate) is_binary: bool,
+}
+
+/// Inspect the file to determine if it is binary.
+///
+/// Uses the same method as git: "is there a zero byte in the first 8000 bytes of the file"
+///
+/// The sampled bytes are handed back in [`BinaryCheck`] because they have been consumed from `stream`.
+pub(crate) fn content_is_binary<R: Read>(stream: &mut R) -> Result<BinaryCheck, io::Error> {
+    let mut read = Vec::new();
+    stream.take(8000).read_to_end(&mut read)?;
+
+    let is_binary = read.contains(&0);
+    Ok(BinaryCheck { read, is_binary })
+}
+
+/// Reads the exact contents of a binary file without modification.
+pub(crate) fn content_binary(stream: &mut impl BufRead, w: &mut impl Write) -> Result<(), Error> {
+    io::copy(stream, w)?;
+    Ok(())
+}
+
+/// Reads text files in a platform independent manner.
+///
+/// Specifically:
+/// - All text encodings are ignored; this function operates on raw bytes.
+/// - `git` implementations on Windows typically check out files with `\r\n` line endings,
+///   while *nix checks them out with `\n`.
+///   To be platform independent, any `\r\n` byte sequences found are converted to a single `\n`.
+pub(crate) fn content_text(stream: &mut impl BufRead, w: &mut impl Write) -> Result<(), Error> {
+    let stream = BufReader::new(stream).bytes().crlf_to_lf().fuse();
+    io::copy(&mut IterRead::new(stream), w)?;
+    Ok(())
+}
+
+/// Hashes code files while removing C-style comments and blank lines in a platform independent manner.
+///
+/// Specifically:
+/// - All text encodings are treated as utf8.
+/// - `git` implementations on Windows typically check out files with `\r\n` line endings,
+///   while *nix checks them out with `\n`.
+///   To be platform independent, any `\r\n` byte sequences found are converted to a single `\n`.
+/// - C-style comments are removed:
+///   - `//` is considered the start of a single line comment; these bytes and any other bytes until right before a `\n` are removed.
+///   - `/*` is considered the start of a multi line comment; these bytes and any other bytes until after a `*/` is read are removed.
+///   - This function does not check for escaped comments.
+/// - Each line is trimmed of surrounding whitespace; lines left empty are dropped, so contiguous `\n` bytes collapse to a single `\n`.
+/// - The final `\n` byte is removed, unless the input ends with lines that strip down to nothing (the `\n` written before them remains).
+pub(crate) fn content_stripped(stream: &mut impl BufRead, w: &mut impl Write) -> Result<(), Error> {
+    let mut buffered_output_line = String::new();
+    let mut is_multiline_active = false;
+
+    for line in stream.lines() {
+        let mut line = line?;
+
+        // At this point we know we have a new line coming. If a previous line is buffered and ready to write, do so now.
+        // Write it with a trailing newline; an empty buffer means the previous line stripped to nothing and is skipped.
+        if !buffered_output_line.is_empty() {
+            writeln!(w, "{buffered_output_line}")?;
+        }
+
+        (line, is_multiline_active) = clean_line(line, is_multiline_active);
+        line.trim().clone_into(&mut buffered_output_line);
+    }
+
+    // Now that we're done reading the input stream, if there's a buffered output line write it *without a trailing newline*.
+    write!(w, "{buffered_output_line}")?;
+    Ok(())
+}
+
+/// Part comment stripping, part state machine. Cleans lines of comments based on whether a previous invocation
+/// detected the start of a multi line comment; returns the cleaned line and whether a multi line comment is still open.
+///
+/// This is very much not an ideal function: it scans the line multiple times instead of being forward-looking-only,
+/// and the dual responsibility makes it complicated. We should fix this, but moving forward for now.
+fn clean_line(line: String, is_multiline_active: bool) -> (String, bool) {
+    if is_multiline_active {
+        if let Some(end) = line.find("*/") {
+            return clean_line(line[end + 2..].to_string(), false); // resume normal cleaning after the two-byte `*/`
+        }
+
+        (String::new(), true) // no terminator on this line: the whole line is inside the comment
+    } else if let Some(start) = line.find("/*") { // NOTE(review): `/*` is tested before `//`, so a `/*` inside a `//` comment still opens a multi line comment
+        let before_multi = line[..start].to_string();
+        let (after_multi, is_multi) = clean_line(line[start + 2..].to_string(), true);
+        (before_multi + &after_multi, is_multi)
+    } else if let Some(start) = line.find("//") {
+        (line[..start].to_string(), false)
+    } else {
+        (line, false)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    //! Tests for internal logic.
+
+    use super::*;
+
+    /// Inspired by the Haskell implementation: https://github.com/fossas/fossa-cli/blob/8de74b71b80d77321d64f94d7573773e49306772/test/App/Fossa/VSI/testdata/multi_line_comment.c#L1-L10
+    #[test]
+    fn comment_strip_mixed() {
+        let content = r#"/*
+ * This is a placeholder file used to test comment stripping code.
+*/
+
+int main() {
+  int code = 0;
+  // code = 1;
+
+
+
+
+  return code; // perfect
+}
+"#;
+        let expected = r#"int main() {
+int code = 0;
+return code;
+}"#;
+
+        let mut buf = Vec::new();
+        content_stripped(&mut Cursor::new(content), &mut buf).expect("must fingerprint");
+        assert_eq!(expected, String::from_utf8_lossy(&buf));
+    }
+
+    /// Copied from the Go implementation: https://github.com/fossas/basis/blob/6b0a1ce7ca5d88d033732f6dcfebd90b8f143038/sherlock/pkg/lib/indexer/cleaned/strip_comments_internal_test.go#L71-L79
+    #[test]
+    fn comment_strip_single_line_comments() {
+        let content = " content1 \n content2 //comment \n content3 ";
+        let expected = "content1\ncontent2\ncontent3";
+
+        let mut buf = Vec::new();
+        content_stripped(&mut Cursor::new(content), &mut buf).expect("must fingerprint");
+        assert_eq!(expected, String::from_utf8_lossy(&buf));
+    }
+
+    /// Copied from the Go implementation: https://github.com/fossas/basis/blob/6b0a1ce7ca5d88d033732f6dcfebd90b8f143038/sherlock/pkg/lib/indexer/cleaned/strip_comments_internal_test.go#L89-L97
+    #[test]
+    fn comment_strip_multi_line_comments() {
+        let content =
+            " content1 \n  content2 /* begin comment \n end comment */ content3 \n content4 ";
+        let expected = "content1\ncontent2\ncontent3\ncontent4";
+
+        let mut buf = Vec::new();
+        content_stripped(&mut Cursor::new(content), &mut buf).expect("must fingerprint");
+        assert_eq!(expected, String::from_utf8_lossy(&buf));
+    }
+
+    #[test]
+    fn comment_strip_cr() {
+        let content = "hello world\r\nanother line\r\na final line\n";
+        let expected = "hello world\nanother line\na final line";
+
+        let mut buf = Vec::new();
+        content_stripped(&mut Cursor::new(content), &mut buf).expect("must fingerprint");
+        assert_eq!(expected, String::from_utf8_lossy(&buf));
+    }
+
+    #[test]
+    fn comment_strip_real_source() {
+        let content = include_bytes!("../testdata/facebook-folly-Version.cpp");
+        let expected = include_str!("../testdata/facebook-folly-Version.cpp.stripped");
+
+        let mut buf = Vec::new();
+        content_stripped(&mut Cursor::new(content), &mut buf).expect("must process");
+
+        assert_eq!(normalize_lf(expected), String::from_utf8_lossy(&buf));
+    }
+
+    /// Windows CI checks out CRLF. Normalize it to be LF only.
+    /// This function should only be applied to testing values, not responses from the functions being tested.
+    fn normalize_lf(input: impl Into<String>) -> String {
+        input.into().replace("\r\n", "\n")
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 4ee70a2..1b86daf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,398 @@
+//! A fingerprint is a unique identifier for a file's contents.
+//!
+//! Fingerprints come in multiple "kinds", which are represented by textual identifiers.
+//! Fingerprints themselves are represented as binary blobs.
+//!
+//! Fingerprint kinds MUST maintain exact implementation compatibility; once the algorithm for a given kind
+//! has been created and its fingerprints have been crawled, it can't be changed. If a change is needed,
+//! that has to be a new kind of fingerprint.
+//!
+//! This rule means that we start out with two kinds that existed prior to this library being created,
+//! which have specific rules about how to compute the fingerprint, and specific text identifiers.
+//!
+//! For more information, refer to the documentation for the types below.
+
+#![deny(unsafe_code)]
+#![deny(missing_docs)]
+#![warn(rust_2018_idioms)]
+#![deny(clippy::unwrap_used)]
+
+use std::{
+    fmt::Display,
+    fs::File,
+    io::{self, BufRead, BufReader, Seek},
+    marker::PhantomData,
+    path::Path,
+};
+
+use crate::fingerprint::BinaryCheck;
+use getset::Getters;
+use serde::{Deserialize, Serialize};
+use sha2::{Digest, Sha256};
+use thiserror::Error;
 #[cfg(test)]
-mod tests {
-    #[test]
-    fn lib_works() {}
+use typed_builder::TypedBuilder;
+
+mod fingerprint;
+pub mod serialize;
+mod stream;
+
+/// Errors that may be encountered during fingerprinting.
+#[derive(Debug, Error)]
+#[non_exhaustive]
+pub enum Error {
+    /// A generic IO error occurred while reading the content to be hashed.
+    /// This error may be retried, but if it fails multiple times it's generally not recoverable.
+    #[error("i/o error: {0}")]
+    IO(#[from] io::Error),
+}
+
+/// Fingerprint kinds MUST maintain exact implementation compatibility; once the algorithm for a given kind
+/// has been created and its fingerprints have been crawled, it can't be changed. If a change is needed,
+/// that has to be a new kind of fingerprint. Similarly, the text representation for a given algorithm
+/// cannot change either: some services assume certain things about the fingerprints that we cannot easily change
+/// (for example, the VSI Forensics Service assumes all files have a `sha_256` fingerprint).
+///
+/// This is because fingerprints form the backbone of how VSI operates:
+/// - FOSSA CLI creates them.
+/// - The VSI Forensics Service assumes certain things about them.
+/// - The VSI Cloud Store assumes certain things about them.
+/// - The VSI Cloud Store's Crawlers create them.
+/// - Crawlers and FOSSA CLI must create them in the same way.
+/// - ... and all of this has to be compatible with the fingerprinting in the MVP store, which formed the initial basis of VSI.
+///
+/// All valid fingerprint kinds implement this trait.
+///
+/// This trait is sealed, indicating nothing outside this module may implement it.
+///
+/// ### Future work
+///
+/// The current implementation of `Kind` causes an issue when we want to actually send kind information
+/// across a serialization boundary, because `Kind`s aren't concrete and therefore aren't
+/// generally serializable.
+///
+/// Specifically, this is an issue for `FinalizeRevision` and `CheckRevision` methods in the VSI Cloud Store,
+/// where it's not simple to send a list of `Kind`s used to fingerprint a set of files,
+/// and it's not simple to then retrieve that list from the API.
+///
+/// Instead, for `FinalizeRevision`, clients are forced to:
+/// - Know what kinds of fingerprints are possible, separately.
+/// - Manually call `.to_string` on those kinds to get a list of kinds used.
+/// - Send them as opaque strings.
+/// And for `CheckRevision`, clients are forced to:
+/// - Manually compare the API result (which is a set of opaque strings) against known kinds, using the `to_string` method.
+/// And the server is required to treat all this as opaque strings.
+///
+/// To make this less error prone, this is all handled in this library under the `serialize` module,
+/// and it works for now so it's not a massive problem. But if we have ideas for how to improve this for the future,
+/// we should do them.
+pub trait Kind: private::Sealed {}
+
+/// Represents a fingerprint derived by hashing the raw contents of a file with the SHA256 algorithm.
+///
+/// This is the default kind of fingerprint, and the kind of fingerprint with the maximal comparison signal,
+/// as the raw SHA256 hash of two files matching indicates that the two files are exactly the same content.
+/// It's also the fingerprint kind that works for literally all kinds of files, whereas other fingerprint kinds
+/// generally require specific circumstances: `CommentStrippedSHA256` requires that the file is text, and
+/// hypothetical future fingerprint kinds such as something based on an AST would require that the file is source code.
+///
+/// This fingerprint kind has been finalized and may not change (except to fix a bug).
+#[derive(Clone, Eq, PartialEq, Debug, Default, Hash, Serialize, Deserialize)]
+pub struct RawSHA256;
+
+impl private::Sealed for RawSHA256 {}
+impl Kind for RawSHA256 {}
+
+impl Display for RawSHA256 {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "sha_256")
+    }
+}
+
+/// Represents a fingerprint derived by hashing the contents of a file with the SHA256 algorithm
+/// after performing basic C-style comment stripping.
+///
+/// This fingerprint kind has been finalized and may not change (except to fix a bug).
+#[derive(Clone, Eq, PartialEq, Debug, Default, Hash, Serialize, Deserialize)]
+pub struct CommentStrippedSHA256;
+
+impl private::Sealed for CommentStrippedSHA256 {}
+impl Kind for CommentStrippedSHA256 {}
+
+impl Display for CommentStrippedSHA256 {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "comment_stripped:sha_256")
+    }
+}
+
+/// An array of bytes representing a fingerprint's content.
+///
+/// Must be encoded as hex to be compatible with the FOSSA backend.
+#[derive(Clone, Eq, PartialEq, Hash, Debug, Default)]
+pub struct Blob(Vec<u8>);
+
+impl Blob {
+    fn from_digest<D: Digest>(digest: D) -> Result<Blob, Error> {
+        let buf = digest.finalize().as_slice().to_vec();
+        Ok(Blob(buf))
+    }
+
+    /// Reference the bytes inside the blob.
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+impl Serialize for Blob {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        serializer.serialize_str(&hex::encode(&self.0))
+    }
+}
+
+impl<'de> Deserialize<'de> for Blob {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let s = String::deserialize(deserializer)?;
+        let b = hex::decode(s).map_err(serde::de::Error::custom)?;
+        Ok(Self(b))
+    }
+}
+
+/// Fingerprints need to be hashable by their `Kind` and `Content` values
+/// for the VSI Cloud Store to properly interact with them.
+pub trait Hashable {
+    /// Create a new hash from a fingerprint kind and a fingerprint.
+    fn to_hash(&self) -> Vec<u8>;
+}
+
+/// An opaque, deterministic value for the file's contents.
+/// If two fingerprints are the same, the contents of the files used to create the fingerprints are the same.
+#[derive(Clone, Eq, PartialEq, Hash, Default, Debug, Getters)]
+#[cfg_attr(test, derive(TypedBuilder))]
+#[getset(get = "pub")]
+pub struct Fingerprint<K: Kind> {
+    #[getset(skip)]
+    kind: PhantomData<K>,
+    /// The content of the blob.
+    content: Blob,
+}
+
+impl<K: Kind> Serialize for Fingerprint<K> {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        self.content.serialize(serializer)
+    }
+}
+
+impl<'de, K: Kind> Deserialize<'de> for Fingerprint<K> {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        Ok(Self {
+            content: Blob::deserialize(deserializer)?,
+            kind: PhantomData {},
+        })
+    }
+}
+
+impl<K> Fingerprint<K>
+where
+    K: Kind,
+{
+    fn new(content: Blob) -> Self {
+        Self {
+            content,
+            kind: PhantomData {},
+        }
+    }
+
+    fn from_digest<D: Digest>(digest: D) -> Result<Self, Error> {
+        let content = Blob::from_digest(digest)?;
+        Ok(Fingerprint::new(content))
+    }
+}
+
+impl Hashable for Fingerprint<RawSHA256> {
+    /// Create a new hash from a fingerprint kind and a fingerprint
+    fn to_hash(&self) -> Vec<u8> {
+        let mut bs = RawSHA256.to_string().as_bytes().to_vec();
+        bs.extend_from_slice(self.content.as_bytes());
+        Sha256::digest(&bs).to_vec()
+    }
+}
+
+impl Hashable for Fingerprint<CommentStrippedSHA256> {
+    /// Create a new hash from a fingerprint kind and a fingerprint
+    fn to_hash(&self) -> Vec<u8> {
+        let mut bs = CommentStrippedSHA256.to_string().as_bytes().to_vec();
+        bs.extend_from_slice(self.content.as_bytes());
+        Sha256::digest(&bs).to_vec()
+    }
+}
+
+impl<K> Display for Fingerprint<K>
+where
+    K: Kind,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", hex::encode(&self.content.0))
+    }
+}
+
+/// The result of eagerly running all fingerprint [`Kind`]s on some given content.
+///
+/// When creating a [`Combined`], the same content is run through each [`Kind`].
+/// Any [`Kind`] that does not apply to the content (its fingerprint function returns `None`) is silently dropped from the [`Combined`] data structure.
+///
+/// For example, this means that if [`Combined`] is created over a binary file, [`CommentStrippedSHA256`] is not
+/// in the resulting data structure, because that kind of fingerprint requires UTF8 encoded text content to run.
+#[derive(Clone, Hash, Eq, PartialEq, Default, Debug, Getters, Serialize, Deserialize)]
+#[cfg_attr(test, derive(TypedBuilder))]
+#[getset(get = "pub")]
+pub struct Combined {
+    /// This fingerprint is derived regardless of the kind of file.
+    // Important: if this struct is changed, update `serialize::kind::kinds_evaluated` to reflect the change.
+    // `kinds_evaluated` may be replaced by a macro in the future.
+    #[serde(rename = "sha_256")]
+    raw: Fingerprint<RawSHA256>,
+    /// The fingerprint derived when the file is a text file, and any C-style comments have been removed.
+    #[serde(rename = "comment_stripped:sha_256")]
+    comment_stripped: Option<Fingerprint<CommentStrippedSHA256>>,
+}
+
+impl Combined {
+    /// Collect the hash of every fingerprint held by this value, the equivalent
+    /// of calling `Fingerprint::to_hash` on each `Fingerprint` stored in this struct.
+    ///
+    /// The raw fingerprint is always present and comes first.
+    ///
+    /// For `Option` fingerprints, a `None` value contributes nothing to the
+    /// resulting vector.
+    pub fn to_hashes(&self) -> Vec<Vec<u8>> {
+        let mut hashes = vec![self.raw.to_hash()];
+        let stripped = self.comment_stripped.as_ref();
+        hashes.extend(stripped.map(|fingerprint| fingerprint.to_hash()));
+        hashes
+    }
+}
+
+impl Display for Combined {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if let Some(comment_stripped) = &self.comment_stripped {
+            write!(
+                f,
+                "{}({}); {}({})",
+                RawSHA256, self.raw, CommentStrippedSHA256, comment_stripped,
+            )
+        } else {
+            write!(f, "{}({})", RawSHA256, self.raw())
+        }
+    }
+}
+
+/// Fingerprint the provided file with all fingerprint [`Kind`]s.
+pub fn fingerprint(path: &Path) -> Result<Combined, Error> {
+    let mut file = BufReader::new(File::open(path)?);
+    fingerprint_stream(&mut file)
+}
+
+/// Fingerprint the provided stream (typically a file handle) with all fingerprint [`Kind`]s.
+pub fn fingerprint_stream<R: BufRead + Send + Seek + 'static>(
+    stream: &mut R,
+) -> Result<Combined, Error> {
+    let raw = fingerprint::raw(stream)?;
+    stream.seek(io::SeekFrom::Start(0))?;
+    let comment_stripped = fingerprint::comment_stripped(stream)?;
+    Ok(Combined {
+        raw,
+        comment_stripped,
+    })
+}
+
+/// The result of eagerly running all fingerprint [`Kind`]s on some given content.
+///
+/// This structure is equivalent to [`Combined`], but each fingerprint is a tuple of the computed fingerprint
+/// plus the content that was processed to make the fingerprint.
+#[derive(Clone, Hash, Eq, PartialEq, Debug, Getters, Serialize, Deserialize)]
+#[getset(get = "pub")]
+pub struct Processed {
+    /// Whether the file was detected to be binary.
+    detected_as_binary: bool,
+
+    /// This fingerprint is derived regardless of the kind of file.
+    raw: (Fingerprint<RawSHA256>, String),
+
+    /// The fingerprint derived when the file is a text file, and any C-style comments have been removed.
+    comment_stripped: Option<(Fingerprint<CommentStrippedSHA256>, String)>,
+}
+
+/// Process the provided file with all fingerprint [`Kind`]s.
+///
+/// # Performance
+///
+/// This function is intended to be used for debugging;
+/// it outputs much more data and is much more expensive in terms of IO
+/// as compared to the standard fingerprint functions.
+pub fn process(path: &Path) -> Result<Processed, Error> {
+    let mut file = BufReader::new(File::open(path)?);
+    process_stream(&mut file)
+}
+
+/// Process the provided stream (typically a file handle) with all fingerprint [`Kind`]s.
+///
+/// # Performance
+///
+/// This function is intended to be used for debugging;
+/// it outputs much more data and is much more expensive in terms of IO
+/// as compared to the standard fingerprint functions.
+pub fn process_stream<R: BufRead + Send + Seek + 'static>(
+    stream: &mut R,
+) -> Result<Processed, Error> {
+    let BinaryCheck { is_binary, .. } = fingerprint::content_is_binary(stream)?;
+    stream.seek(io::SeekFrom::Start(0))?;
+
+    let raw = fingerprint::raw(stream)?;
+    stream.seek(io::SeekFrom::Start(0))?;
+
+    let mut raw_content = Vec::new();
+    if is_binary {
+        fingerprint::content_binary(stream, &mut raw_content)?;
+    } else {
+        fingerprint::content_text(stream, &mut raw_content)?;
+    }
+    stream.seek(io::SeekFrom::Start(0))?;
+
+    let comment_stripped = fingerprint::comment_stripped(stream)?;
+    stream.seek(io::SeekFrom::Start(0))?;
+
+    Ok(Processed {
+        detected_as_binary: is_binary,
+        raw: (raw, lossy_string(raw_content)),
+        comment_stripped: if let Some(comment_stripped) = comment_stripped {
+            let mut stripped_content = Vec::new();
+            fingerprint::content_stripped(stream, &mut stripped_content)?;
+            Some((comment_stripped, lossy_string(stripped_content)))
+        } else {
+            None
+        },
+    })
+}
+
+fn lossy_string(v: Vec<u8>) -> String {
+    String::from_utf8_lossy(&v).to_string()
+}
+
+#[cfg(test)]
+mod tests;
+
+mod private {
+    pub trait Sealed {}
 }
diff --git a/src/main.rs b/src/main.rs
deleted file mode 100644
index ae16713..0000000
--- a/src/main.rs
+++ /dev/null
@@ -1,9 +0,0 @@
-fn main() {
-    println!("Hello, world!");
-}
-
-#[cfg(test)]
-mod tests {
-    #[test]
-    fn bin_works() {}
-}
diff --git a/src/serialize.rs b/src/serialize.rs
new file mode 100644
index 0000000..4e74f9d
--- /dev/null
+++ b/src/serialize.rs
@@ -0,0 +1,55 @@
+//! Contains helpers for serializing.
+//!
+//! Most serialization work is handled by serde, but we needed additional custom logic for kinds,
+//! so here we are.
+
+/// Contains helpers for serializing fingerprint kinds.
+pub mod kind {
+    use std::collections::HashSet;
+
+    use serde::{Deserialize, Serialize};
+
+    use crate::{CommentStrippedSHA256, RawSHA256};
+
+    /// The stringified version of a [`Kind`](crate::Kind).
+    #[derive(Clone, Eq, PartialEq, Debug, Hash, Serialize, Deserialize)]
+    pub struct SerializedKind(String);
+
+    impl SerializedKind {
+        /// Create a new instance from a `String`.
+        /// Be careful: this method is intended only for use when serializing;
+        /// it is possible to create nonsensical values with this method.
+        pub fn new(inner: impl ToString) -> Self {
+            Self(inner.to_string())
+        }
+
+        /// Extract the inner `String` for this instance.
+        pub fn into_inner(self) -> String {
+            self.0
+        }
+    }
+
+    /// Return the kinds used to evaluate a [`crate::Combined`] output by
+    /// this version of this crate.
+    ///
+    /// All kinds _evaluated_ for a `Combined` are included, whether the `Combined`
+    /// actually included those kinds or not.
+    ///
+    /// Specifically: Even if a `Combined` does not concretely include a `CommentStrippedSHA256`
+    /// fingerprint, it is still included in the serialized list of kinds, because it was
+    /// something that the fingerprint algorithm _considered_ for the file that is
+    /// represented by a `Combined` value.
+    pub fn kinds_evaluated() -> HashSet<SerializedKind> {
+        [RawSHA256.to_string(), CommentStrippedSHA256.to_string()]
+            .into_iter()
+            .map(SerializedKind)
+            .collect()
+    }
+
+    /// If the previous set of kinds contains all of the kinds we would now emit
+    /// (ignoring kinds we wouldn't emit), we should not re-fingerprint the files.
+    pub fn would_evaluate_new_kinds(previously_evaluated: &HashSet<SerializedKind>) -> bool {
+        let would_be_evaluated = kinds_evaluated();
+        !would_be_evaluated.is_subset(previously_evaluated)
+    }
+}
diff --git a/src/stream.rs b/src/stream.rs
new file mode 100644
index 0000000..8dea6af
--- /dev/null
+++ b/src/stream.rs
@@ -0,0 +1,91 @@
+//! Utilities for streaming byte oriented operations.
+
+use std::{io, iter::Peekable};
+
+const LF_CHAR: u8 = b'\n';
+const CR_CHAR: u8 = b'\r';
+
+/// Convenience trait representing an iterator of a byte stream (as returned from `Read::bytes`).
+/// Automatically implemented.
+pub(crate) trait ByteIterator: Iterator<Item = io::Result<u8>> {}
+impl<I> ByteIterator for I where I: Iterator<Item = io::Result<u8>> {}
+
+/// Iterator adapter that collapses each `\r\n` byte pair in a stream into a single `\n`, and drops a `\r` that ends the stream.
+pub(crate) struct CRLFToLF<I: ByteIterator> {
+    iter: Peekable<I>,
+}
+
+impl<I> Iterator for CRLFToLF<I>
+where
+    I: ByteIterator,
+{
+    type Item = io::Result<u8>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // If the read byte is `\r`, check the byte after that (the "upcoming" byte):
+        // - If this is the end of the stream, just drop the `\r`.
+        // - If the upcoming byte is `\n`, drop the currently read `\r` by re-running.
+        // - If the upcoming byte is neither of those things, emit the `\r`.
+        //
+        // The goal here is to replicate the following Haskell reference:
+        // https://github.com/fossas/fossa-cli/blob/bde67a0157b8b8b8472056bea843a30d4e495271/src/App/Fossa/VSI/Fingerprint.hs#L88-L89
+        // which effectively splits the stream into chunks on `\n` boundaries, then from each chunk trims the final `\r` if it exists.
+        // A `\r` immediately preceding the end of a stream is dropped because that would have been a final chunk in the Haskell version,
+        // which then would have had its trailing `\r` dropped.
+        match self.iter.next()? {
+            Ok(byte) => {
+                if byte == CR_CHAR {
+                    if let Ok(next) = self.iter.peek()? {
+                        if next == &LF_CHAR {
+                            return self.next();
+                        }
+                    }
+                }
+                Some(Ok(byte))
+            }
+            Err(e) => Some(Err(e)),
+        }
+    }
+}
+
+pub(crate) trait ConvertCRLFToLF {
+    /// Collapses each `\r\n` byte pair in a stream into a single `\n`, and drops a `\r` that ends the stream.
+    fn crlf_to_lf(self) -> CRLFToLF<Self>
+    where
+        Self: Sized,
+        Self: ByteIterator;
+}
+
+impl<I> ConvertCRLFToLF for I
+where
+    I: ByteIterator,
+{
+    fn crlf_to_lf(self) -> CRLFToLF<Self> {
+        CRLFToLF {
+            iter: self.peekable(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    //! Tests for internal logic.
+
+    use std::io::{Cursor, Read};
+
+    use super::*;
+
+    #[test]
+    fn crlf_to_lf_works() {
+        let content = b"hello\r\neveryone\nin\r\nthe\nworld\r";
+        let expected = b"hello\neveryone\nin\nthe\nworld".to_vec();
+
+        let processed = Cursor::new(content)
+            .bytes()
+            .crlf_to_lf()
+            .collect::<io::Result<Vec<u8>>>()
+            .expect("should not error");
+
+        assert_eq!(expected, processed);
+    }
+}
diff --git a/src/tests.rs b/src/tests.rs
new file mode 100644
index 0000000..1944ba6
--- /dev/null
+++ b/src/tests.rs
@@ -0,0 +1,146 @@
+//! Tests for the external API.
+
+use std::io::Cursor;
+
+use sha2::{Digest, Sha256};
+
+use crate::serialize::kind::{kinds_evaluated, would_evaluate_new_kinds, SerializedKind};
+
+use super::*;
+
+fn hash(content: &[u8]) -> Vec<u8> {
+    let mut hasher = Sha256::new();
+    hasher.update(content);
+    hasher.finalize().as_slice().to_vec()
+}
+
+fn make_fingerprint<K: Kind>(content: &[u8]) -> Fingerprint<K> {
+    Fingerprint::builder()
+        .content(Blob(hash(content)))
+        .kind(PhantomData {})
+        .build()
+}
+
+#[test]
+fn fp_getters() {
+    let content = Blob(hash(b"hello world"));
+    let fp = Fingerprint::builder()
+        .content(content.clone())
+        .kind(PhantomData::<RawSHA256> {})
+        .build();
+
+    assert_eq!(&content, fp.content())
+}
+
+#[test]
+fn combined_getters() {
+    let raw = make_fingerprint::<RawSHA256>(b"hello world raw");
+    let comment_stripped =
+        make_fingerprint::<CommentStrippedSHA256>(b"hello world comment stripped");
+    let combined = Combined::builder()
+        .raw(raw.clone())
+        .comment_stripped(Some(comment_stripped.clone()))
+        .build();
+
+    assert_eq!(&raw, combined.raw());
+    assert_eq!(&Some(comment_stripped), combined.comment_stripped());
+
+    let combined = Combined::builder()
+        .raw(raw.clone())
+        .comment_stripped(None)
+        .build();
+    assert_eq!(&raw, combined.raw());
+    assert_eq!(&None, combined.comment_stripped());
+}
+
+#[test]
+fn fingerprints_binary_file() {
+    let content = vec![1, 2, 3, 0, 1, 2, 3];
+    let combined = fingerprint_stream(&mut Cursor::new(content.clone())).expect("should not error");
+    let expected_fingerprint = make_fingerprint::<RawSHA256>(&content);
+
+    assert_eq!(combined.raw, expected_fingerprint);
+    assert_eq!(combined.comment_stripped, None);
+}
+
+#[test]
+fn fingerprints_text_file() {
+    let content = b"hello world";
+
+    let combined = fingerprint_stream(&mut Cursor::new(content)).expect("should not error");
+    let expected_fingerprint = make_fingerprint::<RawSHA256>(content);
+    assert_eq!(combined.raw, expected_fingerprint);
+
+    let expected_fingerprint = make_fingerprint::<CommentStrippedSHA256>(content);
+    assert_eq!(combined.comment_stripped, Some(expected_fingerprint));
+}
+
+#[test]
+fn fingerprints_text_file_stripping_cr() {
+    let content = b"hello world\r\nanother line\r\na final line\n";
+    let cr_stripped = b"hello world\nanother line\na final line\n";
+
+    let combined = fingerprint_stream(&mut Cursor::new(content)).expect("should not error");
+    let expected_fingerprint = make_fingerprint::<RawSHA256>(cr_stripped);
+    assert_eq!(combined.raw, expected_fingerprint, "raw");
+
+    let comment_stripped = b"hello world\nanother line\na final line";
+    let expected_fingerprint = make_fingerprint::<CommentStrippedSHA256>(comment_stripped);
+    assert_eq!(
+        combined.comment_stripped,
+        Some(expected_fingerprint),
+        "comment stripped"
+    );
+}
+
+#[test]
+fn fingerprints_binary_file_appearing_as_text() {
+    // Sourced from `git@github.com:chromium/chromium.git` at `tools/origin_trials/eftest.key` on commit 49249345609d505c8bb8b0b5a42ff4b68b9e6d41.
+    let content = include_bytes!("../testdata/eftest.key");
+    let combined = fingerprint_stream(&mut Cursor::new(content)).expect("should not error");
+    let expected_fingerprint = make_fingerprint::<RawSHA256>(content);
+    assert_eq!(combined.raw, expected_fingerprint);
+    assert_eq!(combined.comment_stripped, None);
+}
+
+#[test]
+fn comment_stripped_does_not_fingerprint_binary_file() {
+    let content = vec![1, 2, 3, 0, 1, 2, 3];
+    let combined = fingerprint_stream(&mut Cursor::new(content)).expect("should not error");
+    assert_eq!(combined.comment_stripped, None);
+}
+
+#[test]
+fn comment_stripped_fingerprint_text_file() {
+    let content = br#"/*
+ * This is a placeholder file used to test comment stripping code.
+*/
+
+int main() {
+  int code = 0;
+  // code = 1;
+
+  return code; // perfect
+}
+"#;
+
+    let combined = fingerprint_stream(&mut Cursor::new(content)).expect("should not error");
+    let expected = String::from("44fc8f68ab633c7ca0240a66e4ff038c0f2412fe69d14b6f052556edaa1b9160");
+    assert_eq!(
+        combined.comment_stripped.map(|fp| fp.to_string()),
+        Some(expected)
+    );
+}
+
+#[test]
+fn evaluate_kinds() {
+    let mut evaluated = kinds_evaluated();
+    assert!(!would_evaluate_new_kinds(&evaluated));
+
+    evaluated.remove(&SerializedKind::new(CommentStrippedSHA256.to_string()));
+    assert!(would_evaluate_new_kinds(&evaluated));
+
+    let mut evaluated = kinds_evaluated();
+    evaluated.insert(SerializedKind::new("some other kind"));
+    assert!(!would_evaluate_new_kinds(&evaluated));
+}
diff --git a/testdata/eftest.key b/testdata/eftest.key
new file mode 100644
index 0000000..3a7a7a9
--- /dev/null
+++ b/testdata/eftest.key
@@ -0,0 +1 @@
+�g��*CLg��(ɐ������v�,K�u��:��(pҚ�Y�+���d����V:�
\ No newline at end of file
diff --git a/testdata/facebook-folly-Version.cpp b/testdata/facebook-folly-Version.cpp
new file mode 100644
index 0000000..b1e667c
--- /dev/null
+++ b/testdata/facebook-folly-Version.cpp
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/VersionCheck.h>
+
+namespace folly { namespace detail {
+
+FOLLY_VERSION_CHECK(folly, FOLLY_VERSION)
+
+}}  // namespaces
diff --git a/testdata/facebook-folly-Version.cpp.stripped b/testdata/facebook-folly-Version.cpp.stripped
new file mode 100644
index 0000000..a04745f
--- /dev/null
+++ b/testdata/facebook-folly-Version.cpp.stripped
@@ -0,0 +1,4 @@
+#include <folly/VersionCheck.h>
+namespace folly { namespace detail {
+FOLLY_VERSION_CHECK(folly, FOLLY_VERSION)
+}}
\ No newline at end of file