Better filter noise
jssblck committed Oct 4, 2023
1 parent 8eb4457 commit 5b60c05
Showing 2 changed files with 82 additions and 106 deletions.
55 changes: 7 additions & 48 deletions extlib/millhone/src/cmd/analyze.rs
@@ -5,11 +5,7 @@ use std::{

use clap::Parser;
use getset::Getters;
use lazy_regex::regex_is_match;
use millhone::{
api::prelude::*,
extract::{ContentSnippet, Kind, Language, Target},
};
use millhone::{api::prelude::*, extract::ContentSnippet};
use rayon::prelude::*;
use stable_eyre::{
eyre::{bail, Context},
@@ -68,7 +64,7 @@ pub fn main(endpoint: &BaseUrl, opts: Subcommand) -> Result<(), Report> {
let snippet_opts = opts.extract().into();

let total_count_snippets = AtomicCounter::default();
let noise_count_snippets = AtomicCounter::default();
let total_count_matches = AtomicCounter::default();
let total_count_files = AtomicCounter::default();

WalkDir::new(root)
@@ -112,14 +108,10 @@ pub fn main(endpoint: &BaseUrl, opts: Subcommand) -> Result<(), Report> {
return None;
}

let total_snippet_count = snippets.len();
let snippets = snippets.into_iter().filter(|m| !snippet_is_noise(m)).collect::<HashSet<_>>();
let snippet_count = snippets.len();
let noisy_snippet_count = total_snippet_count - snippet_count;

debug!(path = %path.display(), %snippet_count, %noisy_snippet_count, "extracted snippets");
debug!(path = %path.display(), %snippet_count, "extracted snippets");
total_count_snippets.increment_by(snippet_count);
noise_count_snippets.increment_by(noisy_snippet_count);

(path, snippets).pipe(Some)
})
// The goal is to then parallelize API calls, so flatten collections of snippets.
@@ -140,6 +132,7 @@ pub fn main(endpoint: &BaseUrl, opts: Subcommand) -> Result<(), Report> {
for matched in matching_snippets.iter() {
trace!(%fingerprint, ?matched, "matched snippet");
}
total_count_matches.increment_by(matching_snippets.len());

MatchingSnippet::builder()
.found_in(found.snippet().file_path())
@@ -193,45 +186,11 @@ pub fn main(endpoint: &BaseUrl, opts: Subcommand) -> Result<(), Report> {
});

info!(
"Finished matching {} snippets out of {} files, of which {} were discarded as too noisy",
"Finished matching {} snippets out of {} files to {} matches",
total_count_snippets.into_inner(),
total_count_files.into_inner(),
noise_count_snippets.into_inner(),
total_count_matches.into_inner(),
);

Ok(())
}

/// Some snippets are just too noisy, for example basically every C program has an `int main`.
/// This function is meant to be used to filter such snippets.
#[tracing::instrument(level = "DEBUG", skip_all, fields(fingerprint = %m.snippet().fingerprint()), ret)]
fn snippet_is_noise(m: &ContentSnippet) -> bool {
const DEFAULT: bool = false;

// Empty snippets are automatically too noisy.
if regex_is_match!(r"^\s*$"B, m.content()) {
return true;
}

// Metadata is needed to determine the kind of strategy to use for determining noise.
let Ok((language, target, kind)) = m.snippet().parse_meta() else {
warn!(snippet = ?m.snippet(), "failed to parse snippet metadata");
return DEFAULT;
};

match language {
Language::C | Language::CPP => match target {
Target::Function => match kind {
Kind::Signature => contains_bytes(m.content(), b"int main"),
Kind::Body => regex_is_match!(r"\s*\{\s*\}\s*"B, m.content()),
Kind::Full => DEFAULT,
},
},
}
}

fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool {
haystack
.windows(needle.len())
.any(|window| window.eq(needle))
}
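
For context, the following is a minimal, dependency-free sketch (not part of this commit) of the two noise heuristics that the removed snippet_is_noise applied, and that the SnippetNoiseFilter trait added to extract.rs below reimplements: a C/C++ function signature mentioning `int main`, and an effectively empty function body. The empty-body check here is a stricter stand-in for the `\s*\{\s*\}\s*` regex used in the real code.

/// Byte-wise substring search, mirroring the `contains_bytes` helper above.
fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool {
    needle.len() <= haystack.len()
        && haystack.windows(needle.len()).any(|window| window == needle)
}

/// Stricter stand-in for the empty-body regex: after skipping ASCII whitespace,
/// nothing remains but a single pair of braces.
fn is_empty_body(content: &[u8]) -> bool {
    let mut significant = content.iter().copied().filter(|b| !b.is_ascii_whitespace());
    significant.next() == Some(b'{')
        && significant.next() == Some(b'}')
        && significant.next().is_none()
}

fn main() {
    // A signature that essentially every C program contains carries no signal.
    assert!(contains_bytes(b"int main(int argc, char **argv)", b"int main"));
    // Neither does an empty function body.
    assert!(is_empty_body(b"  {\n  }\n"));
    // A body with real statements is kept.
    assert!(!is_empty_body(b"{ return 0; }"));
    println!("noise heuristics behave as expected");
}
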
133 changes: 75 additions & 58 deletions extlib/millhone/src/extract.rs
@@ -14,12 +14,13 @@ use clap::{Parser, ValueEnum};
use derive_more::From;
use getset::{CopyGetters, Getters};
use itertools::Itertools;
use lazy_regex::regex_is_match;
use serde::{Deserialize, Serialize};
use snippets::{language::c99_tc3, language::cpp_98, Extractor};
use strum::{Display, EnumIter, EnumVariantNames, IntoEnumIterator, VariantNames};
use tap::{Pipe, Tap};
use thiserror::Error;
use tracing::{debug, trace};
use tracing::{debug, trace, warn};
use typed_builder::TypedBuilder;

/// Errors encountered in this module.
@@ -101,9 +102,7 @@ impl From<&Options> for snippets::Options {
}

/// The targets of snippets to extract.
#[derive(
Debug, Clone, Copy, Hash, PartialEq, Eq, ValueEnum, Display, EnumIter, EnumVariantNames,
)]
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, ValueEnum, Display, EnumIter)]
#[strum(serialize_all = "snake_case")]
pub enum Target {
/// Targets function definitions as snippets.
@@ -118,12 +117,8 @@ impl From<Target> for snippets::Target {
}
}

impl ParseEnumValue for Target {}

/// The kind of item this snippet represents.
#[derive(
Debug, Clone, Copy, Hash, PartialEq, Eq, ValueEnum, Display, EnumIter, EnumVariantNames,
)]
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, ValueEnum, Display, EnumIter)]
#[strum(serialize_all = "snake_case")]
pub enum Kind {
/// The signature of the snippet.
@@ -146,8 +141,6 @@ impl From<Kind> for snippets::Kind {
}
}

impl ParseEnumValue for Kind {}

/// The normalization used to extract this snippet.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Display)]
#[strum(serialize_all = "snake_case")]
@@ -177,24 +170,8 @@ impl Method {
}
}

impl Method {
/// Parse a `Method` from a string.
pub fn parse(input: &str) -> Result<Self, Error> {
if input == Self::Raw.to_string() {
Ok(Self::Raw)
} else {
let input = input.trim_start_matches("normalized(");
let input = input.trim_end_matches(')');
let transform = Transform::parse(input)?;
Self::Normalized(transform).pipe(Ok)
}
}
}

/// The normalization used to extract this snippet.
#[derive(
Debug, Clone, Copy, Hash, PartialEq, Eq, ValueEnum, Display, EnumIter, EnumVariantNames,
)]
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, ValueEnum, Display, EnumIter)]
#[strum(serialize_all = "snake_case")]
pub enum Transform {
/// Transform the text to have any comments removed and whitespace normalized.
@@ -217,23 +194,6 @@ impl From<Transform> for snippets::Transform {
}
}

impl ParseEnumValue for Transform {}

trait ParseEnumValue: std::fmt::Display + strum::IntoEnumIterator + strum::VariantNames {
/// Attempt to parse from a string.
fn parse(input: &str) -> Result<Self, Error> {
Self::iter()
.find(|variant| variant.to_string() == input)
.map(Ok)
.unwrap_or_else(|| {
Err(Error::Parse(
input.to_string(),
Self::VARIANTS.iter().map(|s| s.to_string()).collect(),
))
})
}
}

/// A snippet is a specific unit of code that the service wants to match later.
///
/// A snippet fingerprint is a deterministic representation of the code
@@ -382,11 +342,13 @@ impl Snippet {
match language {
Language::C => c99_tc3::Extractor::extract(opts, content)?
.pipe(collapse_raw)
.filter(|snippet| snippet.is_significant(content))
.map(|snippet| Self::from(scan_root, path, content, snippet))
.inspect(|snippet| trace!(?snippet, "extracted snippet"))
.collect::<HashSet<Self>>(),
Language::CPP => cpp_98::Extractor::extract(opts, content)?
.pipe(collapse_raw)
.filter(|snippet| snippet.is_significant(content))
.map(|snippet| Self::from(scan_root, path, content, snippet))
.inspect(|snippet| trace!(?snippet, "extracted snippet"))
.collect::<HashSet<Self>>(),
@@ -413,14 +375,6 @@ impl Snippet {
.col_end(self.col_end as _)
.build()
}

/// Attempt to parse the metadata of a snippet retrieved from the API.
pub fn parse_meta(&self) -> Result<(Language, Target, Kind), Error> {
let language = self.language().pipe_borrow(Language::from_str)?;
let kind = Kind::parse(&self.kind)?;
let target = Target::parse(&self.target)?;
(language, target, kind).pipe(Ok)
}
}

/// Equivalent to [`Snippet`], with the content copied from the input file.
@@ -459,11 +413,8 @@ impl ContentSnippet {
Snippet::from_content(scan_root, opts, path, language, &content)?
.into_iter()
.map(|snippet| {
let location = snippet.location();
Self {
snippet,
content: location.extract_from(&content).to_owned(),
}
let content = snippet.extract_content_from(&content).to_owned();
Self { snippet, content }
})
.collect::<HashSet<_>>()
.pipe(Ok)
@@ -764,6 +715,72 @@ fn collapse_raw<L>(
grouped.into_values().filter_map(|mut group| group.pop())
}

trait SnippetContent {
/// Extract the snippet content from a larger block of content.
fn extract_content_from<'a>(&self, content: &'a [u8]) -> &'a [u8];
}

impl<L> SnippetContent for snippets::Snippet<L> {
fn extract_content_from<'a>(&self, content: &'a [u8]) -> &'a [u8] {
self.metadata().location().extract_from(content)
}
}

impl SnippetContent for Snippet {
fn extract_content_from<'a>(&self, content: &'a [u8]) -> &'a [u8] {
self.location().extract_from(content)
}
}

trait SnippetNoiseFilter: SnippetContent {
/// Determine whether a snippet is significant.
/// Significant snippets are not noise and are not fully whitespace.
fn is_significant(&self, content: &[u8]) -> bool {
let content = self.extract_content_from(content);
!is_whitespace(content) && !self.is_noise(content)
}

/// Return whether the snippet is noise.
/// Callers who wish to filter snippets should prefer `is_significant`.
///
/// This is usually the function that snippets implement.
fn is_noise(&self, content: &[u8]) -> bool;
}

impl SnippetNoiseFilter for snippets::Snippet<c99_tc3::Language> {
fn is_noise(&self, content: &[u8]) -> bool {
match self.metadata().kind() {
snippets::Kind::Signature => contains_bytes(b"int main", content),
snippets::Kind::Body => regex_is_match!(r"\s*\{\s*\}\s*"B, content),
_ => false,
}
}
}

impl SnippetNoiseFilter for snippets::Snippet<cpp_98::Language> {
fn is_noise(&self, content: &[u8]) -> bool {
match self.metadata().kind() {
snippets::Kind::Signature => contains_bytes(b"int main", content),
snippets::Kind::Body => regex_is_match!(r"\s*\{\s*\}\s*"B, content),
_ => false,
}
}
}

fn is_whitespace(content: &[u8]) -> bool {
regex_is_match!(r"^\s*$"B, content)
}

fn contains_bytes(needle: &[u8], haystack: &[u8]) -> bool {
if needle.len() > haystack.len() {
return false;
}

haystack
.windows(needle.len())
.any(|window| window.eq(needle))
}

#[cfg(test)]
mod tests {
use std::collections::HashMap;

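As a closing illustration, here is a simplified sketch of the shape of the filter introduced above: a default is_significant that rejects whitespace-only content and otherwise defers to a per-snippet is_noise, mirroring the SnippetNoiseFilter trait in extract.rs. The types below are illustrative stand-ins, not the real snippets::Snippet<L> metadata types.

/// Illustrative stand-ins for the per-language snippet kinds in the real code.
struct SignatureSnippet;
struct BodySnippet;

trait NoiseFilter {
    /// Default composition, matching the shape of `is_significant` in the diff:
    /// significant means "not blank and not noise".
    fn is_significant(&self, content: &[u8]) -> bool {
        let blank = content.iter().all(|b| b.is_ascii_whitespace());
        !blank && !self.is_noise(content)
    }

    /// Per-kind heuristic; implementors override only this method.
    fn is_noise(&self, content: &[u8]) -> bool;
}

impl NoiseFilter for SignatureSnippet {
    fn is_noise(&self, content: &[u8]) -> bool {
        // A signature every C/C++ program has is too common to be useful.
        content.windows(b"int main".len()).any(|w| w == b"int main")
    }
}

impl NoiseFilter for BodySnippet {
    fn is_noise(&self, content: &[u8]) -> bool {
        // A body made only of braces and whitespace carries no signal.
        content
            .iter()
            .all(|b| b.is_ascii_whitespace() || *b == b'{' || *b == b'}')
    }
}

fn main() {
    assert!(!SignatureSnippet.is_significant(b"int main(void)"));
    assert!(SignatureSnippet.is_significant(b"size_t parse_header(const char *buf)"));
    assert!(!BodySnippet.is_significant(b"{ }"));
    assert!(BodySnippet.is_significant(b"{ return parse(buf); }"));
    println!("significance filter behaves as expected");
}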