Skip to content

Commit

Permalink
Improve performance
Browse files Browse the repository at this point in the history
  • Loading branch information
jssblck committed Jun 15, 2024
1 parent e71a5ac commit 4433ada
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 15 deletions.
49 changes: 43 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@ use std::{
fs::File,
io::{BufRead, BufReader, Cursor, Seek},
path::Path,
thread::ScopedJoinHandle,
};

use getset::Getters;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use strum::{AsRefStr, Display, EnumIter, IntoEnumIterator, VariantNames};
use tap::Pipe;
use thiserror::Error;

mod fingerprint;
Expand Down Expand Up @@ -171,7 +173,7 @@ impl<'de> Deserialize<'de> for Kind {
}

/// An array of bytes representing a fingerprint's content.
#[derive(Clone, Eq, PartialEq, Hash, Default)]
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Default)]
pub struct Content(Vec<u8>);

impl Content {
Expand Down Expand Up @@ -400,7 +402,11 @@ impl<K: Into<Kind>, C: Into<Content>> From<(K, C)> for Fingerprint {
pub struct Combined(HashMap<Kind, Content>);

impl Combined {
/// Fingerprint the provided stream (typically a file handle) with all fingerprint [`Kind`]s.
/// Fingerprint the provided stream with all fingerprint [`Kind`]s.
///
/// Note: this forces fingerprinting to be performed serially
/// since the stream has to be seeked backwards for each fingerprinter;
/// if this is not desired consider [`Combined::from_file`] or [`Combined::from_buffer`].
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret)]
pub fn from_stream(mut stream: impl BufRead + Seek) -> Result<Self, Error> {
let mut fingerprints = Vec::new();
Expand All @@ -417,10 +423,22 @@ impl Combined {
}

/// Fingerprint the provided file with all fingerprint [`Kind`]s.
///
/// Note: this opens the file multiple times, once for each kind of fingerprint,
/// then runs each fingerprinter in its own thread.
/// If this is not desired consider [`Combined::from_stream`] or [`Combined::from_buffer`].
///
/// # Errors
///
/// Returns the first error, in [`Kind`] iteration order, reported by any of the
/// per-kind fingerprinting threads.
///
/// # Panics
///
/// A panic raised inside a fingerprinting thread is re-raised on the calling thread
/// when that thread's handle is joined.
#[tracing::instrument(level = tracing::Level::DEBUG, ret)]
pub fn from_file(path: &Path) -> Result<Self, Error> {
// NOTE(review): this listing is a rendered diff without +/- markers. The next two
// lines look like the pre-commit body (one buffered reader handed to `from_stream`)
// and are presumably removed by this commit; confirm against the raw diff.
let mut file = BufReader::new(File::open(path)?);
Self::from_stream(&mut file)
// Presumably the post-commit body: a scoped thread per `Kind`, so each closure can
// borrow `path` without requiring a `'static` lifetime.
std::thread::scope(|scope| {
// Spawn every thread up front (the `collect` forces the lazy `map`) so that all
// fingerprinters are running before any handle is joined.
let handles = Kind::iter()
.map(|kind| scope.spawn(move || Fingerprint::from_file(kind, path)))
.collect::<Vec<_>>();

// `flatten` presumably drops kinds that yielded no fingerprint; verify against the
// return type of `Fingerprint::from_file`.
match collapse_handles(handles) {
Ok(fps) => fps.into_iter().flatten().pipe(Combined::from).pipe(Ok),
Err(err) => Err(err),
}
})
}

/// Fingerprint the provided buffer with all fingerprint [`Kind`]s.
Expand All @@ -435,8 +453,13 @@ impl Combined {
/// of errors in the future it isn't a breaking change.
// The tracing span records only the buffer length (`buf` field), not its contents.
#[tracing::instrument(level = tracing::Level::DEBUG, fields(buf = %buf.as_ref().len()), ret)]
pub fn from_buffer(buf: impl AsRef<[u8]>) -> Result<Self, Error> {
// NOTE(review): this listing is a rendered diff without +/- markers. The next two
// lines look like the pre-commit body (wrap the buffer in a `Cursor` and defer to
// `from_stream`) and are presumably removed by this commit; confirm against the raw diff.
let mut content = Cursor::new(buf);
Self::from_stream(&mut content)
// Presumably the post-commit body: fingerprint the same borrowed bytes once per
// `Kind`, sequentially on the calling thread.
Kind::iter()
.map(|kind| Fingerprint::from_buffer(kind, buf.as_ref()))
// Collecting into `Result` stops at the first failing kind; `?` returns that error.
.collect::<Result<Vec<_>, _>>()?
.into_iter()
// Presumably drops kinds that yielded no fingerprint; verify against the return
// type of `Fingerprint::from_buffer`.
.flatten()
.pipe(Combined::from)
.pipe(Ok)
}

/// Create a new instance from a single fingerprint.
Expand Down Expand Up @@ -472,3 +495,17 @@ impl<I: IntoIterator<Item = F>, F: Into<Fingerprint>> From<I> for Combined {
)
}
}

/// Joins scoped thread handles in order and gathers their successful results.
///
/// Each handle is joined in the order given. The value of every thread that
/// returned `Ok` is appended to the output, preserving handle order.
///
/// # Errors
///
/// Returns the first `Err` produced by a thread, in handle order. Handles after
/// the failing one are not joined by this function.
///
/// # Panics
///
/// If a joined thread panicked, its panic payload is re-raised on the calling thread.
fn collapse_handles<T, E>(handles: Vec<ScopedJoinHandle<'_, Result<T, E>>>) -> Result<Vec<T>, E> {
    handles
        .into_iter()
        // Forward a worker's panic to the caller instead of turning it into an error value.
        .map(|handle| {
            handle
                .join()
                .unwrap_or_else(|payload| std::panic::resume_unwind(payload))
        })
        // Collecting into `Result` short-circuits on the first `Err`.
        .collect()
}
16 changes: 7 additions & 9 deletions tests/it/code_vsi.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
//! Tests for plain code files using legacy VSI fingerprints.
use std::io::Cursor;

use pretty_assertions::assert_eq;

use fingerprint::*;
Expand All @@ -11,7 +9,7 @@ use fingerprint::*;
///
/// ```ignore
/// let content = b"hello world";
/// let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
/// let combined = Combined::from_buffer(&content).expect("fingerprint");
/// assert_fingerprint_eq!(Kind::RawSha256, content, combined);
/// assert_fingerprint_eq!(Kind::CommentStrippedSha256, content, combined);
/// ```
Expand Down Expand Up @@ -69,15 +67,15 @@ fn combined_getters() {
/// Fingerprints a short byte vector containing a zero byte and checks both kinds:
/// `RawSha256` is compared against the content, `CommentStrippedSha256` against `None`.
#[test]
fn fingerprints_binary_file() {
let content = vec![1, 2, 3, 0, 1, 2, 3];
// NOTE(review): rendered diff without +/- markers. The `from_stream` line below is
// presumably the removed pre-commit call and the `from_buffer` line its replacement.
let combined = Combined::from_stream(&mut Cursor::new(content.clone())).expect("fingerprint");
let combined = Combined::from_buffer(&content).expect("fingerprint");
assert_fingerprint_eq!(Kind::RawSha256, &content, combined);
assert_fingerprint_eq!(Kind::CommentStrippedSha256, None, combined);
}

/// Fingerprints the text `hello world` and checks that both `RawSha256` and
/// `CommentStrippedSha256` are compared against the unmodified content.
#[test]
fn fingerprints_text_file() {
let content = b"hello world";
// NOTE(review): rendered diff without +/- markers. The `from_stream` line below is
// presumably the removed pre-commit call and the `from_buffer` line its replacement.
let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
assert_fingerprint_eq!(Kind::RawSha256, content, combined);
assert_fingerprint_eq!(Kind::CommentStrippedSha256, content, combined);
}
Expand All @@ -88,7 +86,7 @@ fn fingerprints_text_file_stripping_cr() {
let content_cs = b"hello world\nanother line\na final line";
let without_cr = b"hello world\nanother line\na final line\n";

let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
assert_fingerprint_eq!(Kind::RawSha256, without_cr, combined);
assert_fingerprint_eq!(Kind::CommentStrippedSha256, content_cs, combined);
}
Expand All @@ -97,15 +95,15 @@ fn fingerprints_text_file_stripping_cr() {
fn fingerprints_binary_file_appearing_as_text() {
// Sourced from `git@github.com:chromium/chromium.git` at `tools/origin_trials/eftest.key` on commit 49249345609d505c8bb8b0b5a42ff4b68b9e6d41.
let content = include_bytes!("../../testdata/eftest.key");
// NOTE(review): rendered diff without +/- markers. The `from_stream` line below is
// presumably the removed pre-commit call and the `from_buffer` line its replacement.
let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
// The raw fingerprint is compared against the file bytes; the comment-stripped one
// is compared against `None`.
assert_fingerprint_eq!(Kind::RawSha256, content, combined);
assert_fingerprint_eq!(Kind::CommentStrippedSha256, None, combined);
}

/// Checks only the comment-stripped kind for a byte vector containing a zero byte:
/// `CommentStrippedSha256` is compared against `None`.
#[test]
fn comment_stripped_does_not_fingerprint_binary_file() {
let content = vec![1, 2, 3, 0, 1, 2, 3];
// NOTE(review): rendered diff without +/- markers. The `from_stream` line below is
// presumably the removed pre-commit call and the `from_buffer` line its replacement.
let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
assert_fingerprint_eq!(Kind::CommentStrippedSha256, None, combined);
}

Expand All @@ -123,7 +121,7 @@ int main() {
}
"#;

let combined = Combined::from_stream(&mut Cursor::new(content)).expect("fingerprint");
let combined = Combined::from_buffer(content).expect("fingerprint");
let expected = Content::new(
hex::decode("44fc8f68ab633c7ca0240a66e4ff038c0f2412fe69d14b6f052556edaa1b9160")
.expect("decode hex literal"),
Expand Down

0 comments on commit 4433ada

Please sign in to comment.