From 9ccea796219f4e5382510b7b00f4cbc26c78bada Mon Sep 17 00:00:00 2001 From: Jessica Black Date: Mon, 3 Jun 2024 11:26:20 -0700 Subject: [PATCH] Extract from `foundation-libs` --- .github/CODEOWNERS | 1 + .github/workflows/check-dynamic.yml | 2 +- Cargo.toml | 15 +- README.md | 30 +- src/fingerprint.rs | 230 +++++++++++ src/lib.rs | 399 ++++++++++++++++++- src/main.rs | 9 - src/serialize.rs | 55 +++ src/stream.rs | 91 +++++ src/tests.rs | 146 +++++++ testdata/eftest.key | 1 + testdata/facebook-folly-Version.cpp | 23 ++ testdata/facebook-folly-Version.cpp.stripped | 4 + 13 files changed, 960 insertions(+), 46 deletions(-) create mode 100644 .github/CODEOWNERS create mode 100644 src/fingerprint.rs delete mode 100644 src/main.rs create mode 100644 src/serialize.rs create mode 100644 src/stream.rs create mode 100644 src/tests.rs create mode 100644 testdata/eftest.key create mode 100644 testdata/facebook-folly-Version.cpp create mode 100644 testdata/facebook-folly-Version.cpp.stripped diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..2a0d5ea --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @fossas/analysis diff --git a/.github/workflows/check-dynamic.yml b/.github/workflows/check-dynamic.yml index 2a7ae0e..b9c2855 100644 --- a/.github/workflows/check-dynamic.yml +++ b/.github/workflows/check-dynamic.yml @@ -13,7 +13,7 @@ jobs: setup: echo "no setup" build: cargo build - host: macos-latest - setup: rustup target add aarch64-apple-darwin + setup: rustup target add aarch64-apple-darwin && rustup target add x86_64-apple-darwin build: cargo build --target aarch64-apple-darwin && cargo build --target x86_64-apple-darwin runs-on: ${{ matrix.settings.host }} diff --git a/Cargo.toml b/Cargo.toml index 6184e61..3b40fff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,15 @@ [package] -name = "template-rust" -version = "0.1.0" +name = "fingerprint" +version = "1.0.1" edition = "2021" -# See more keys and their definitions at 
https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] +getset = "0.1.2" +hex = "0.4.3" +iter-read = "0.3.1" +serde = { version = "1.0.140", features = ["derive"] } +thiserror = "1.0.31" +sha2 = "0.10.6" + +[dev-dependencies] +typed-builder = "0.10.0" diff --git a/README.md b/README.md index cffb3b1..2586b63 100644 --- a/README.md +++ b/README.md @@ -1,29 +1 @@ -# template-rust - -Template repository for a Rust project. - -TODOs for a new project: -- [ ] Change the license if MPL2 is not appropriate for the project. Make sure to do this before adding any code. -- [ ] Ensure the dev docs (in particular the release and compatibility semantics) are valid for this project. -- [ ] Set [CODEOWNERS] to the team that owns the repository. -- [ ] Create an API user in [FOSSA] and store it as a secret named `FOSSA_API_KEY`. - - Consider naming it with the pattern `ci-{REPO_NAME}`. For example, `ci-template-rust`. -- [ ] Update repository permissions as appropriate. Generally, the CODEOWNER team is set as admin. -- [ ] Update branch protection rules as appropriate. -- [ ] Update repository features and settings. Recommended defaults: - - [ ] Turn off all features (Wikis, Issues, Sponsorships, Discussions, Projects); FOSSA uses other systems for these. - - [ ] Only allow squash merging. - - [ ] Always suggest updating PR branches. - - [ ] Allow auto-merge. - - [ ] Automatically delete head branches. - -Then just edit the included Rust project, or remove it and `cargo init` your project, and get going! - -[codeowners]: https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners -[fossa]: https://app.fossa.com - -# recommendations - -- If publishing a Linux binary, consider providing two: one that [statically links libc](./docs/dev/reference/static-binary.md), and one that doesn't. 
-- If publishing a macOS binary, consider providing two: one for [Intel and one for M-series CPUs](./docs/dev/reference/macos-arch.md). -- If this application may be used on AWS Graviton or similar, consider providing an ARM build for Linux as well. +# lib-fingerprint diff --git a/src/fingerprint.rs b/src/fingerprint.rs new file mode 100644 index 0000000..53b3f8d --- /dev/null +++ b/src/fingerprint.rs @@ -0,0 +1,230 @@ +use std::io::{self, BufRead, BufReader, Cursor, Read, Write}; + +use iter_read::IterRead; +use sha2::{Digest, Sha256}; + +use crate::{stream::ConvertCRLFToLF, CommentStrippedSHA256, Error, Fingerprint, RawSHA256}; + +/// Fingerprint the file using the [`RawSHA256`] kind. +pub fn raw(stream: &mut R) -> Result, Error> { + // Read the start of the stream, and decide whether to treat the rest of the stream as binary based on that. + let BinaryCheck { read, is_binary } = content_is_binary(stream)?; + + // Chain the part of the stream already read to evaluate binary along with the rest of the stream. + let mut stream = Cursor::new(read).chain(stream); + let mut hasher = Sha256::new(); + if is_binary { + content_binary(&mut stream, &mut hasher)?; + } else { + content_text(&mut stream, &mut hasher)?; + } + + Fingerprint::from_digest(hasher) +} + +/// Fingerprint the file using the [`CommentStrippedSHA256`] kind. +pub fn comment_stripped( + stream: &mut R, +) -> Result>, Error> { + // Read the start of the stream, and decide whether to treat the rest of the stream as binary based on that. + let BinaryCheck { read, is_binary } = content_is_binary(stream)?; + if is_binary { + return Ok(None); + } + + // Chain the part of the stream already read to evaluate binary along with the rest of the stream. + let mut stream = Cursor::new(read).chain(stream); + let mut hasher = Sha256::new(); + match content_stripped(&mut stream, &mut hasher) { + Ok(_) => Some(Fingerprint::from_digest(hasher)).transpose(), + Err(err) => { + // The `io::Error` type is opaque. 
+ // Handle the case of attempting to comment strip a binary file. + if err.to_string().to_lowercase().contains("utf-8") { + Ok(None) + } else { + Err(err) + } + } + } +} + +/// The result of checking a file for whether it is binary. +pub(crate) struct BinaryCheck { + pub(crate) read: Vec, + pub(crate) is_binary: bool, +} + +/// Inspect the file to determine if it is binary. +/// +/// Uses the same method as git: "is there a zero byte in the first 8000 bytes of the file" +pub(crate) fn content_is_binary(stream: &mut R) -> Result { + let mut buf = Vec::new(); + stream.take(8000).read_to_end(&mut buf)?; + let is_binary = buf.contains(&0); + Ok(BinaryCheck { + read: buf, + is_binary, + }) +} + +/// Reads the exact contents of a binary file without modification. +pub(crate) fn content_binary(stream: &mut impl BufRead, w: &mut impl Write) -> Result<(), Error> { + io::copy(stream, w)?; + Ok(()) +} + +/// Reads text files in a platform independent manner. +/// +/// Specifically: +/// - All text encodings are ignored; this function operates on raw bytes. +/// - `git` implementations on Windows typically check out files with `\r\n` line endings, +/// while *nix checks them out with `\n`. +/// To be platform independent, any `\r\n` byte sequences found are converted to a single `\n`. +pub(crate) fn content_text(stream: &mut impl BufRead, w: &mut impl Write) -> Result<(), Error> { + let stream = BufReader::new(stream).bytes().crlf_to_lf().fuse(); + io::copy(&mut IterRead::new(stream), w)?; + Ok(()) +} + +/// Hashes code files while removing C-style comments and blank lines in a platform independent manner. +/// +/// Specifically: +/// - All text encodings are treated as utf8. +/// - `git` implementations on Windows typically check out files with `\r\n` line endings, +/// while *nix checks them out with `\n`. +/// To be platform independent, any `\r\n` byte sequences found are converted to a single `\n`. 
+/// - C-style comments are removed: +/// - `//` is considered the start of a single line comment; these bytes and any other bytes until right before a `\n` are removed. +/// - `/*` is considered the start of a multi line comment; these bytes and any other bytes until after a `*/` is read are removed. +/// - This function does not check for escaped comments. +/// - Any sequence of multiple contiguous `\n` bytes are collapsed to a single `\n` byte. +/// - The final `\n` byte is removed from the end of the stream if present. +pub(crate) fn content_stripped(stream: &mut impl BufRead, w: &mut impl Write) -> Result<(), Error> { + let mut buffered_output_line = String::new(); + let mut is_multiline_active = false; + + for line in stream.lines() { + let mut line = line?; + + // At this point we know we have a new line coming. If a previous line is buffered and ready to write, do so now. + // Write it with a trailing newline because we know we'll be writing a following line. + if !buffered_output_line.is_empty() { + writeln!(w, "{buffered_output_line}")?; + } + + (line, is_multiline_active) = clean_line(line, is_multiline_active); + line.trim().clone_into(&mut buffered_output_line); + } + + // Now that we're done reading the input stream, if there's a buffered output line write it *without a trailing newline*. + write!(w, "{buffered_output_line}")?; + Ok(()) +} + +/// Part comment stripping, part state machine. Cleans lines of comments based on whether a previous invocation +/// detected the start of a multi line comment. +/// +/// This is very much not an ideal function: it scans the line multiple times instead of being forward-looking-only, +/// and the dual responsibility makes it complicated. We should fix this, but moving forward for now. 
+fn clean_line(line: String, is_multiline_active: bool) -> (String, bool) { + if is_multiline_active { + if let Some(end) = line.find("*/") { + return clean_line(line[end + 2..].to_string(), false); + } + + (String::new(), true) + } else if let Some(start) = line.find("/*") { + let before_multi = line[..start].to_string(); + let (after_multi, is_multi) = clean_line(line[start + 2..].to_string(), true); + (before_multi + &after_multi, is_multi) + } else if let Some(start) = line.find("//") { + (line[..start].to_string(), false) + } else { + (line, false) + } +} + +#[cfg(test)] +mod tests { + //! Tests for internal logic. + + use super::*; + + /// Inspired by the Haskell implementation: https://github.com/fossas/fossa-cli/blob/8de74b71b80d77321d64f94d7573773e49306772/test/App/Fossa/VSI/testdata/multi_line_comment.c#L1-L10 + #[test] + fn comment_strip_mixed() { + let content = r#"/* + * This is a placeholder file used to test comment stripping code. +*/ + +int main() { + int code = 0; + // code = 1; + + + + + return code; // perfect +} +"#; + let expected = r#"int main() { +int code = 0; +return code; +}"#; + + let mut buf = Vec::new(); + content_stripped(&mut Cursor::new(content), &mut buf).expect("must fingerprint"); + assert_eq!(expected, String::from_utf8_lossy(&buf)); + } + + /// Copied from the Go implementation: https://github.com/fossas/basis/blob/6b0a1ce7ca5d88d033732f6dcfebd90b8f143038/sherlock/pkg/lib/indexer/cleaned/strip_comments_internal_test.go#L71-L79 + #[test] + fn comment_strip_single_line_comments() { + let content = " content1 \n content2 //comment \n content3 "; + let expected = "content1\ncontent2\ncontent3"; + + let mut buf = Vec::new(); + content_stripped(&mut Cursor::new(content), &mut buf).expect("must fingerprint"); + assert_eq!(expected, String::from_utf8_lossy(&buf)); + } + + /// Copied from the Go implementation: 
https://github.com/fossas/basis/blob/6b0a1ce7ca5d88d033732f6dcfebd90b8f143038/sherlock/pkg/lib/indexer/cleaned/strip_comments_internal_test.go#L89-L97 + #[test] + fn comment_strip_multi_line_comments() { + let content = + " content1 \n content2 /* begin comment \n end comment */ content3 \n content4 "; + let expected = "content1\ncontent2\ncontent3\ncontent4"; + + let mut buf = Vec::new(); + content_stripped(&mut Cursor::new(content), &mut buf).expect("must fingerprint"); + assert_eq!(expected, String::from_utf8_lossy(&buf)); + } + + #[test] + fn comment_strip_cr() { + let content = "hello world\r\nanother line\r\na final line\n"; + let expected = "hello world\nanother line\na final line"; + + let mut buf = Vec::new(); + content_stripped(&mut Cursor::new(content), &mut buf).expect("must fingerprint"); + assert_eq!(expected, String::from_utf8_lossy(&buf)); + } + + #[test] + fn comment_strip_real_source() { + let content = include_bytes!("../testdata/facebook-folly-Version.cpp"); + let expected = include_str!("../testdata/facebook-folly-Version.cpp.stripped"); + + let mut buf = Vec::new(); + content_stripped(&mut Cursor::new(content), &mut buf).expect("must process"); + + assert_eq!(normalize_lf(expected), String::from_utf8_lossy(&buf)); + } + + /// Windows CI checks out CRLF. Normalize it to be LF only. + /// This function should only be applied to testing values, not responses from the functions being tested. + fn normalize_lf(input: impl Into) -> String { + input.into().replace("\r\n", "\n") + } +} diff --git a/src/lib.rs b/src/lib.rs index 4ee70a2..1b86daf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,398 @@ +//! A fingerprint is a unique identifier for a file's contents. +//! +//! Fingerprints come in multiple "kinds", which are represented by textual identifiers. +//! Fingerprints themselves are represented as binary blobs. +//! +//! Fingerprint kinds MUST maintain exact implementation compatibility; once the algorithm for a given kind +//! 
has been created and its fingerprints have been crawled, it can't be changed. If a change is needed, +//! that has to be a new kind of fingerprint. +//! +//! This rule means that we start out with two kinds that existed prior to this library being created, +//! which have specific rules about how to compute the fingerprint, and specific text identifiers. +//! +//! For more information, refer to the documentation for the types below. + +#![deny(unsafe_code)] +#![deny(missing_docs)] +#![warn(rust_2018_idioms)] +#![deny(clippy::unwrap_used)] + +use std::{ + fmt::Display, + fs::File, + io::{self, BufRead, BufReader, Seek}, + marker::PhantomData, + path::Path, +}; + +use crate::fingerprint::BinaryCheck; +use getset::Getters; +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; +use thiserror::Error; #[cfg(test)] -mod tests { - #[test] - fn lib_works() {} +use typed_builder::TypedBuilder; + +mod fingerprint; +pub mod serialize; +mod stream; + +/// Errors that may be encountered during fingerprinting. +#[derive(Debug, Error)] +#[non_exhaustive] +pub enum Error { + /// A generic IO error occurred while reading the content to be hashed. + /// This error may be retried, but if it fails multiple times it's generally not recoverable. + #[error("i/o error: {0}")] + IO(#[from] io::Error), +} + +/// Fingerprint kinds MUST maintain exact implementation compatibility; once the algorithm for a given kind +/// has been created and its fingerprints have been crawled, it can't be changed. If a change is needed, +/// that has to be a new kind of fingerprint. Similarly, the text representation for a given algorithm +/// cannot change either: some services assume certain things about the fingerprints that we cannot easily change +/// (for example, the VSI Forensics Service assumes all files have a `sha_256` fingerprint). +/// +/// This is because fingerprints form the backbone of how VSI operates: +/// - FOSSA CLI creates them. 
+/// - The VSI Forensics Service assumes certain things about them. +/// - The VSI Cloud Store assumes certain things about them. +/// - The VSI Cloud Store's Crawlers create them. +/// - Crawlers and FOSSA CLI must create them in the same way. +/// - ... and all of this has to be compatible with the fingerprinting in the MVP store, which formed the initial basis of VSI. +/// +/// All valid fingerprint kinds implement this trait. +/// +/// This trait is sealed, indicating nothing outside this module may implement it. +/// +/// ### Future work +/// +/// The current implementation of `Kind` causes an issue when we want to actually send kind information +/// across a serialization boundary, because `Kind`s aren't concrete and therefore aren't +/// generally serializable. +/// +/// Specifically, this is an issue for `FinalizeRevision` and `CheckRevision` methods in the VSI Cloud Store, +/// where it's not simple to send a list of `Kind`s used to fingerprint a set of files, +/// and it's not simple to then retrieve that list from the API. +/// +/// Instead, for `FinalizeRevision`, clients are forced to: +/// - Know what kinds of fingerprints are possible, separately. +/// - Manually call `.to_string` on those kinds to get a list of kinds used. +/// - Send them as opaque strings. +/// And for `CheckRevision`, clients are forced to: +/// - Manually compare the API result (which is a set of opaque strings) against known kinds, using the `to_string` method. +/// And the server is required to treat all this as opaque strings. +/// +/// To make this less error prone, this is all handled in this library under the `serialize` module, +/// and it works for now so it's not a massive problem. But if we have ideas for how to improve this for the future, +/// we should do them. +pub trait Kind: private::Sealed {} + +/// Represents a fingerprint derived by hashing the raw contents of a file with the SHA256 algorithm. 
+/// +/// This is the default kind of fingerprint, and the kind of fingerprint with the maximal comparison signal, +/// as the raw SHA256 hash of two files matching indicates that the two files are exactly the same content. +/// It's also the fingerprint kind that works for literally all kinds of files, whereas other fingerprint kinds +/// generally require specific circumstances: `CommentStrippedSHA256` requires that the file is text, and +/// hypothetical future fingerprint kinds such as something based on an AST would require that the file is source code. +/// +/// This fingerprint kind has been finalized and may not change (except to fix a bug). +#[derive(Clone, Eq, PartialEq, Debug, Default, Hash, Serialize, Deserialize)] +pub struct RawSHA256; + +impl private::Sealed for RawSHA256 {} +impl Kind for RawSHA256 {} + +impl Display for RawSHA256 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "sha_256") + } +} + +/// Represents a fingerprint derived by hashing the contents of a file with the SHA256 algorithm +/// after performing basic C-style comment stripping. +/// +/// This fingerprint kind has been finalized and may not change (except to fix a bug). +#[derive(Clone, Eq, PartialEq, Debug, Default, Hash, Serialize, Deserialize)] +pub struct CommentStrippedSHA256; + +impl private::Sealed for CommentStrippedSHA256 {} +impl Kind for CommentStrippedSHA256 {} + +impl Display for CommentStrippedSHA256 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "comment_stripped:sha_256") + } +} + +/// An array of bytes representing a fingerprint's content. +/// +/// Must be encoded as hex to be compatible with the FOSSA backend. +#[derive(Clone, Eq, PartialEq, Hash, Debug, Default)] +pub struct Blob(Vec); + +impl Blob { + fn from_digest(digest: D) -> Result { + let buf = digest.finalize().as_slice().to_vec(); + Ok(Blob(buf)) + } + + /// Reference the bytes inside the blob. 
+ pub fn as_bytes(&self) -> &[u8] { + &self.0 + } +} + +impl Serialize for Blob { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.serialize_str(&hex::encode(&self.0)) + } +} + +impl<'de> Deserialize<'de> for Blob { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + let b = hex::decode(s).map_err(serde::de::Error::custom)?; + Ok(Self(b)) + } +} + +/// Fingerprints need to be hashable by their `Kind` and `Content` values +/// for the VSI Cloud Store to properly interact with them. +pub trait Hashable { + /// Create a new hash from a fingerprint kind and a fingerprint. + fn to_hash(&self) -> Vec; +} + +/// An opaque, deterministic value for the file's contents. +/// If two fingerprints are the same, the contents of the files used to create the fingerprints are the same. +#[derive(Clone, Eq, PartialEq, Hash, Default, Debug, Getters)] +#[cfg_attr(test, derive(TypedBuilder))] +#[getset(get = "pub")] +pub struct Fingerprint { + #[getset(skip)] + kind: PhantomData, + /// The content of the blob. 
+ content: Blob, +} + +impl Serialize for Fingerprint { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.content.serialize(serializer) + } +} + +impl<'de, K: Kind> Deserialize<'de> for Fingerprint { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + Ok(Self { + content: Blob::deserialize(deserializer)?, + kind: PhantomData {}, + }) + } +} + +impl Fingerprint +where + K: Kind, +{ + fn new(content: Blob) -> Self { + Self { + content, + kind: PhantomData {}, + } + } + + fn from_digest(digest: D) -> Result { + let content = Blob::from_digest(digest)?; + Ok(Fingerprint::new(content)) + } +} + +impl Hashable for Fingerprint { + /// Create a new hash from a fingerprint kind and a fingerprint + fn to_hash(&self) -> Vec { + let mut bs = RawSHA256.to_string().as_bytes().to_vec(); + bs.extend_from_slice(self.content.as_bytes()); + Sha256::digest(&bs).to_vec() + } +} + +impl Hashable for Fingerprint { + /// Create a new hash from a fingerprint kind and a fingerprint + fn to_hash(&self) -> Vec { + let mut bs = CommentStrippedSHA256.to_string().as_bytes().to_vec(); + bs.extend_from_slice(self.content.as_bytes()); + Sha256::digest(&bs).to_vec() + } +} + +impl Display for Fingerprint +where + K: Kind, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", hex::encode(&self.content.0)) + } +} + +/// The result of eagerly running all fingerprint [`Kind`]s on some given content. +/// +/// When creating a [`Combined`], the same content is run through each [`Kind`]. +/// Any [`Kind`] that does not support the content is silently dropped from the [`Combined`] data structure (its field is `None`). +/// +/// For example, this means that if [`Combined`] is created over a binary file, [`CommentStrippedSHA256`] is not +/// in the resulting data structure, because that kind of fingerprint requires UTF8 encoded text content to run. 
+#[derive(Clone, Hash, Eq, PartialEq, Default, Debug, Getters, Serialize, Deserialize)] +#[cfg_attr(test, derive(TypedBuilder))] +#[getset(get = "pub")] +pub struct Combined { + /// This fingerprint is derived regardless of the kind of file. + // Important: if this struct is changed, update `serialize::kind::kinds_evaluated` to reflect the change. + // `kinds_evaluated` may be replaced by a macro in the future. + #[serde(rename = "sha_256")] + raw: Fingerprint, + /// The fingerprint derived when the file is a text file, and any C-style comments have been removed. + #[serde(rename = "comment_stripped:sha_256")] + comment_stripped: Option>, +} + +impl Combined { + /// Create a vector of fingerprint hashes, the equivalent of running + /// `Fingerprint::to_hash` on each `Fingerprint` stored in this struct. + /// + /// For `Option` fingerprints, a `None` value is dropped from the + /// resulting vector. + pub fn to_hashes(&self) -> Vec> { + let raw = self.raw.to_hash(); + if let Some(stripped) = &self.comment_stripped { + vec![raw, stripped.to_hash()] + } else { + vec![raw] + } + } +} + +impl Display for Combined { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Some(comment_stripped) = &self.comment_stripped { + write!( + f, + "{}({}); {}({})", + RawSHA256, self.raw, CommentStrippedSHA256, comment_stripped, + ) + } else { + write!(f, "{}({})", RawSHA256, self.raw()) + } + } +} + +/// Fingerprint the provided file with all fingerprint [`Kind`]s. +pub fn fingerprint(path: &Path) -> Result { + let mut file = BufReader::new(File::open(path)?); + fingerprint_stream(&mut file) +} + +/// Fingerprint the provided stream (typically a file handle) with all fingerprint [`Kind`]s. 
+pub fn fingerprint_stream( + stream: &mut R, +) -> Result { + let raw = fingerprint::raw(stream)?; + stream.seek(io::SeekFrom::Start(0))?; + let comment_stripped = fingerprint::comment_stripped(stream)?; + Ok(Combined { + raw, + comment_stripped, + }) +} + +/// The result of eagerly running all fingerprint [`Kind`]s on some given content. +/// +/// This structure is equivalent to [`Combined`], but each fingerprint is a tuple of the computed fingerprint +/// plus the content that was processed to make the fingerprint. +#[derive(Clone, Hash, Eq, PartialEq, Debug, Getters, Serialize, Deserialize)] +#[getset(get = "pub")] +pub struct Processed { + /// Whether the file was detected to be binary. + detected_as_binary: bool, + + /// This fingerprint is derived regardless of the kind of file. + raw: (Fingerprint, String), + + /// The fingerprint derived when the file is a text file, and any C-style comments have been removed. + comment_stripped: Option<(Fingerprint, String)>, +} + +/// Process the provided file with all fingerprint [`Kind`]s. +/// +/// # Performance +/// +/// This function is intended to be used for debugging; +/// it outputs much more data and is much more expensive in terms of IO +/// as compared to the standard fingerprint functions. +pub fn process(path: &Path) -> Result { + let mut file = BufReader::new(File::open(path)?); + process_stream(&mut file) +} + +/// Process the provided stream (typically a file handle) with all fingerprint [`Kind`]s. +/// +/// # Performance +/// +/// This function is intended to be used for debugging; +/// it outputs much more data and is much more expensive in terms of IO +/// as compared to the standard fingerprint functions. +pub fn process_stream( + stream: &mut R, +) -> Result { + let BinaryCheck { is_binary, .. 
} = fingerprint::content_is_binary(stream)?; + stream.seek(io::SeekFrom::Start(0))?; + + let raw = fingerprint::raw(stream)?; + stream.seek(io::SeekFrom::Start(0))?; + + let mut raw_content = Vec::new(); + if is_binary { + fingerprint::content_binary(stream, &mut raw_content)?; + } else { + fingerprint::content_text(stream, &mut raw_content)?; + } + stream.seek(io::SeekFrom::Start(0))?; + + let comment_stripped = fingerprint::comment_stripped(stream)?; + stream.seek(io::SeekFrom::Start(0))?; + + Ok(Processed { + detected_as_binary: is_binary, + raw: (raw, lossy_string(raw_content)), + comment_stripped: if let Some(comment_stripped) = comment_stripped { + let mut stripped_content = Vec::new(); + fingerprint::content_stripped(stream, &mut stripped_content)?; + Some((comment_stripped, lossy_string(stripped_content))) + } else { + None + }, + }) +} + +fn lossy_string(v: Vec) -> String { + String::from_utf8_lossy(&v).to_string() +} + +#[cfg(test)] +mod tests; + +mod private { + pub trait Sealed {} } diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index ae16713..0000000 --- a/src/main.rs +++ /dev/null @@ -1,9 +0,0 @@ -fn main() { - println!("Hello, world!"); -} - -#[cfg(test)] -mod tests { - #[test] - fn bin_works() {} -} diff --git a/src/serialize.rs b/src/serialize.rs new file mode 100644 index 0000000..4e74f9d --- /dev/null +++ b/src/serialize.rs @@ -0,0 +1,55 @@ +//! Contains helpers for serializing. +//! +//! Most serialization work is handled by serde, but we needed additional custom logic for kinds, +//! so here we are. + +/// Contains helpers for serializing fingerprint kinds. +pub mod kind { + use std::collections::HashSet; + + use serde::{Deserialize, Serialize}; + + use crate::{CommentStrippedSHA256, RawSHA256}; + + /// The stringified version of a [`Kind`]. + #[derive(Clone, Eq, PartialEq, Debug, Hash, Serialize, Deserialize)] + pub struct SerializedKind(String); + + impl SerializedKind { + /// Create a new instance from a `String`. 
+ /// Be careful: this method is intended only for use when serializing; + /// it is possible to create nonsensical values with this method. + pub fn new(inner: impl ToString) -> Self { + Self(inner.to_string()) + } + + /// Extract the inner `String` for this instance. + pub fn into_inner(self) -> String { + self.0 + } + } + + /// Return the kinds used to evaluate a [`crate::Combined`] output by + /// this version of this crate. + /// + /// All kinds _evaluated_ for a `Combined` are included, whether the `Combined` + /// actually included those kinds or not. + /// + /// Specifically: Even if a `Combined` does not concretely include a `CommentStrippedSHA256` + /// fingerprint, it is still included in the serialized list of kinds, because it was + /// something that the fingerprint algorithm _considered_ for the file that is + /// represented by a `Combined` value. + pub fn kinds_evaluated() -> HashSet { + [RawSHA256.to_string(), CommentStrippedSHA256.to_string()] + .into_iter() + .map(SerializedKind) + .collect() + } + + /// If the previous set of kinds contains all of the kinds we would now emit + /// (ignoring kinds we wouldn't emit), we should not re-fingerprint the files. + pub fn would_evaluate_new_kinds(previously_evaluated: &HashSet) -> bool { + let would_be_evaluated = kinds_evaluated(); + !would_be_evaluated.is_subset(previously_evaluated) + } +} diff --git a/src/stream.rs b/src/stream.rs new file mode 100644 index 0000000..8dea6af --- /dev/null +++ b/src/stream.rs @@ -0,0 +1,91 @@ +//! Utilities for streaming byte oriented operations. + +use std::{io, iter::Peekable}; + +const LF_CHAR: u8 = b'\n'; +const CR_CHAR: u8 = b'\r'; + +/// Convenience trait representing an iterator of a byte stream (as returned from `Read::bytes`). +/// Automatically implemented. 
+pub(crate) trait ByteIterator: Iterator> {} +impl ByteIterator for I where I: Iterator> {} + +/// Implements the ability to drop `\r\n` byte pairs from a stream, converting each instance to a single `\n`. +pub(crate) struct CRLFToLF { + iter: Peekable, +} + +impl Iterator for CRLFToLF +where + I: ByteIterator, +{ + type Item = io::Result; + + fn next(&mut self) -> Option { + // If the read byte is `\r`, check the byte after that (the "upcoming" byte): + // - If this is the end of the stream, just drop the `\r`. + // - If the upcoming byte is `\n`, drop the currently read `\r` by re-running. + // - If the upcoming byte is neither of those things, emit the `\r`. + // + // The goal here is to replicate the following Haskell reference: + // https://github.com/fossas/fossa-cli/blob/bde67a0157b8b8b8472056bea843a30d4e495271/src/App/Fossa/VSI/Fingerprint.hs#L88-L89 + // which effectively splits the stream into chunks on `\n` boundaries, then from each chunk trims the final `\r` if it exists. + // A `\r` immediately preceding the end of a stream is dropped because that would have been a final chunk in the Haskell version, + // which then would have had its trailing `\r` dropped. + match self.iter.next()? { + Ok(byte) => { + if byte == CR_CHAR { + if let Ok(next) = self.iter.peek()? { + if next == &LF_CHAR { + return self.next(); + } + } + } + Some(Ok(byte)) + } + Err(e) => Some(Err(e)), + } + } +} + +pub(crate) trait ConvertCRLFToLF { + /// Drops `\r\n` byte pairs from a stream, converting each instance to a single `\n`. + fn crlf_to_lf(self) -> CRLFToLF + where + Self: Sized, + Self: ByteIterator; +} + +impl ConvertCRLFToLF for I +where + I: ByteIterator, +{ + fn crlf_to_lf(self) -> CRLFToLF { + CRLFToLF { + iter: self.peekable(), + } + } +} + +#[cfg(test)] +mod tests { + //! Tests for internal logic. 
+ + use std::io::{Cursor, Read}; + + use super::*; + + #[test] + fn crlf_to_lf_works() { + let content = b"hello\r\neveryone\nin\r\nthe\nworld\r"; + let expected = b"hello\neveryone\nin\nthe\nworld".to_vec(); + + let processed = Cursor::new(content) + .bytes() + .crlf_to_lf() + .collect::>>() + .expect("should not error"); + + assert_eq!(expected, processed); + } +} diff --git a/src/tests.rs b/src/tests.rs new file mode 100644 index 0000000..1944ba6 --- /dev/null +++ b/src/tests.rs @@ -0,0 +1,146 @@ +//! Tests for the external API. + +use std::io::Cursor; + +use sha2::{Digest, Sha256}; + +use crate::serialize::kind::{kinds_evaluated, would_evaluate_new_kinds, SerializedKind}; + +use super::*; + +fn hash(content: &[u8]) -> Vec { + let mut hasher = Sha256::new(); + hasher.update(content); + hasher.finalize().as_slice().to_vec() +} + +fn make_fingerprint(content: &[u8]) -> Fingerprint { + Fingerprint::builder() + .content(Blob(hash(content))) + .kind(PhantomData {}) + .build() +} + +#[test] +fn fp_getters() { + let content = Blob(hash(b"hello world")); + let fp = Fingerprint::builder() + .content(content.clone()) + .kind(PhantomData:: {}) + .build(); + + assert_eq!(&content, fp.content()) +} + +#[test] +fn combined_getters() { + let raw = make_fingerprint::(b"hello world raw"); + let comment_stripped = + make_fingerprint::(b"hello world comment stripped"); + let combined = Combined::builder() + .raw(raw.clone()) + .comment_stripped(Some(comment_stripped.clone())) + .build(); + + assert_eq!(&raw, combined.raw()); + assert_eq!(&Some(comment_stripped), combined.comment_stripped()); + + let combined = Combined::builder() + .raw(raw.clone()) + .comment_stripped(None) + .build(); + assert_eq!(&raw, combined.raw()); + assert_eq!(&None, combined.comment_stripped()); +} + +#[test] +fn fingerprints_binary_file() { + let content = vec![1, 2, 3, 0, 1, 2, 3]; + let combined = fingerprint_stream(&mut Cursor::new(content.clone())).expect("should not error"); + let 
expected_fingerprint = make_fingerprint::(&content); + + assert_eq!(combined.raw, expected_fingerprint); + assert_eq!(combined.comment_stripped, None); +} + +#[test] +fn fingerprints_text_file() { + let content = b"hello world"; + + let combined = fingerprint_stream(&mut Cursor::new(content)).expect("should not error"); + let expected_fingerprint = make_fingerprint::(content); + assert_eq!(combined.raw, expected_fingerprint); + + let expected_fingerprint = make_fingerprint::(content); + assert_eq!(combined.comment_stripped, Some(expected_fingerprint)); +} + +#[test] +fn fingerprints_text_file_stripping_cr() { + let content = b"hello world\r\nanother line\r\na final line\n"; + let cr_stripped = b"hello world\nanother line\na final line\n"; + + let combined = fingerprint_stream(&mut Cursor::new(content)).expect("should not error"); + let expected_fingerprint = make_fingerprint::(cr_stripped); + assert_eq!(combined.raw, expected_fingerprint, "raw"); + + let comment_stripped = b"hello world\nanother line\na final line"; + let expected_fingerprint = make_fingerprint::(comment_stripped); + assert_eq!( + combined.comment_stripped, + Some(expected_fingerprint), + "comment stripped" + ); +} + +#[test] +fn fingerprints_binary_file_appearing_as_text() { + // Sourced from `git@github.com:chromium/chromium.git` at `tools/origin_trials/eftest.key` on commit 49249345609d505c8bb8b0b5a42ff4b68b9e6d41. 
+ let content = include_bytes!("../testdata/eftest.key"); + let combined = fingerprint_stream(&mut Cursor::new(content)).expect("should not error"); + let expected_fingerprint = make_fingerprint::(content); + assert_eq!(combined.raw, expected_fingerprint); + assert_eq!(combined.comment_stripped, None); +} + +#[test] +fn comment_stripped_does_not_fingerprint_binary_file() { + let content = vec![1, 2, 3, 0, 1, 2, 3]; + let combined = fingerprint_stream(&mut Cursor::new(content)).expect("should not error"); + assert_eq!(combined.comment_stripped, None); +} + +#[test] +fn comment_stripped_fingerprint_text_file() { + let content = br#"/* + * This is a placeholder file used to test comment stripping code. +*/ + +int main() { + int code = 0; + // code = 1; + + return code; // perfect +} +"#; + + let combined = fingerprint_stream(&mut Cursor::new(content)).expect("should not error"); + let expected = String::from("44fc8f68ab633c7ca0240a66e4ff038c0f2412fe69d14b6f052556edaa1b9160"); + assert_eq!( + combined.comment_stripped.map(|fp| fp.to_string()), + Some(expected) + ); +} + +#[test] +fn evaluate_kinds() { + let mut evaluated = kinds_evaluated(); + assert!(!would_evaluate_new_kinds(&evaluated)); + + evaluated.remove(&SerializedKind::new(CommentStrippedSHA256.to_string())); + assert!(would_evaluate_new_kinds(&evaluated)); + + let mut evaluated = kinds_evaluated(); + evaluated.insert(SerializedKind::new("some other kind")); + assert!(!would_evaluate_new_kinds(&evaluated)); +} diff --git a/testdata/eftest.key b/testdata/eftest.key new file mode 100644 index 0000000..3a7a7a9 --- /dev/null +++ b/testdata/eftest.key @@ -0,0 +1 @@ +g* CLg(ɐv,Ku:(pҚ Y+dV: \ No newline at end of file diff --git a/testdata/facebook-folly-Version.cpp b/testdata/facebook-folly-Version.cpp new file mode 100644 index 0000000..b1e667c --- /dev/null +++ b/testdata/facebook-folly-Version.cpp @@ -0,0 +1,23 @@ +/* + * Copyright 2016 Facebook, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace folly { namespace detail { + +FOLLY_VERSION_CHECK(folly, FOLLY_VERSION) + +}} // namespaces diff --git a/testdata/facebook-folly-Version.cpp.stripped b/testdata/facebook-folly-Version.cpp.stripped new file mode 100644 index 0000000..a04745f --- /dev/null +++ b/testdata/facebook-folly-Version.cpp.stripped @@ -0,0 +1,4 @@ +#include +namespace folly { namespace detail { +FOLLY_VERSION_CHECK(folly, FOLLY_VERSION) +}} \ No newline at end of file