Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] make the library no_std compatible #6

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
925 changes: 576 additions & 349 deletions Cargo.lock

Large diffs are not rendered by default.

22 changes: 8 additions & 14 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[package]
name = "lzjd"
name = "lzjd-bin"
version = "0.2.0"
authors = ["Henk Dieter <[email protected]>"]
edition = "2018"
Expand All @@ -11,22 +11,16 @@ categories = ["algorithms","compression","cryptography","filesystem","science"]
license = "GPL-3.0"
repository = "https://github.com/tweedegolf/lzjd-rs"

[workspace]
members = [".", "lib"]

[dependencies]
clap = "2.32.0"
lzjd = { path = "./lib" }
base64 = "0.10.1"
bincode = "1.1.2"
clap = "2.32.0"
failure = "0.1.5"
failure_derive = "0.1.5"
fasthash= "0.4.0"
bincode = "1.1.2"
crc = "1.8.1"
walkdir = "2.2.7"
num_cpus = "1.10.0"
rayon = "1.0.3"

[dev-dependencies]
rand = "0.6.5"
criterion = "0.2.10"

[[bench]]
name = "lzjd"
harness = false
walkdir = "2.2.7"
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
Rust implementation of the Lempel-Ziv Jaccard Distance (LZJD) algorithm, based on [jLZJD](https://github.com/EdwardRaff/jLZJD)

Main differences:

- Rust instead of Java
- Can use any hasher (executable uses CRC32) instead of just Murmur3
- Does not allocate memory for every unique hash; instead it keeps only the k=1024 smallest
- Based on Vec<u64> instead of IntSetNoRemove, which is more like HashMap
- Based on `Vec<u64>` instead of IntSetNoRemove, which is more like HashMap
- Hash files are considerably smaller if small sequences have been digested

```
```man
USAGE:
lzjd [FLAGS] [OPTIONS] <INPUT>...

Expand All @@ -30,8 +31,7 @@ ARGS:
<INPUT>... Sets the input file to use
```


See also:

- [Original paper](http://www.edwardraff.com/publications/alternative-ncd-lzjd.pdf)
- [Follow-up paper](https://arxiv.org/abs/1708.03346)
- [Follow-up paper](https://arxiv.org/abs/1708.03346)
35 changes: 35 additions & 0 deletions lib/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[package]
name = "lzjd"
version = "0.2.0" # !!! needs a major bump
authors = ["Henk Dieter <[email protected]>"]
edition = "2018"
exclude = ["in/*", "out/*"]
description = "Rust implementation of the LZJD algorithm (https://github.com/EdwardRaff/jLZJD)"
readme = "README.md"
keywords = ["lzjd", "edit", "distance", "Lempel", "Ziv"]
categories = ["algorithms","compression","cryptography","filesystem","science"]
license = "GPL-3.0"
repository = "https://github.com/tweedegolf/lzjd-rs"

[dependencies]
hashbrown = "0.9.1"
crc = { version = "1.8.1", default-features = false }
base64 = { version = "0.10.1", default-features = false }

# fasthash is mostly written in C and wrapped via FFI.
# A pure-Rust alternative is https://github.com/stusmall/murmur3, which
# still needs std by default but can be easily patched.
fasthash = "0.4.0"

bincode = "1.1.2" # no_std is a WIP, keep an eye on https://github.com/bincode-org/bincode/pull/339

failure = "0.1.5" # deprecated; switch dependency
failure_derive = "0.1.5"

[dev-dependencies]
criterion = "0.3.4"
rand = "0.8.3"

[[bench]]
name = "lzjd"
harness = false
File renamed without changes.
5 changes: 1 addition & 4 deletions src/crc32.rs → lib/src/crc32.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
//! Defines a wrapper around crc::crc32::Digest, implementing core::hash::Hasher
//! as well as a core::hash::BuildHasher which builds the hasher.
use crc::crc32::{self, Hasher32};

use std::hash::BuildHasher;
use std::hash::Hasher;
use core::hash::{Hasher, BuildHasher};

/// Wrapper around crc::crc32::Digest which implements core::hash::Hasher
pub struct CRC32Hasher {
Expand Down Expand Up @@ -39,4 +37,3 @@ impl BuildHasher for CRC32BuildHasher {
CRC32Hasher::new()
}
}

28 changes: 12 additions & 16 deletions src/lib.rs → lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,19 @@
//!
//! assert_eq!(lzjd, 0.5714285714285714);
//! ```

#![no_std]
extern crate alloc;
extern crate hashbrown;
#[macro_use]
extern crate failure_derive;
extern crate base64;
extern crate bincode;
extern crate fasthash;


use alloc::string::{String, ToString};

pub use crate::lz_dict::LZDict;
use std::io;

/// LZ dictionary implementation
pub mod lz_dict;
Expand All @@ -68,11 +75,6 @@ pub mod murmur3;

#[derive(Debug, Fail)]
pub enum LZJDError {
#[fail(display = "IO error: {}", err)]
Io {
#[cause]
err: io::Error,
},
#[fail(display = "Decode error: {}", err)]
Base64 {
#[cause]
Expand All @@ -99,27 +101,21 @@ impl From<bincode::Error> for LZJDError {
}
}

impl From<std::io::Error> for LZJDError {
fn from(err: std::io::Error) -> Self {
LZJDError::Io { err }
}
}

impl<'a> From<&'a str> for LZJDError {
fn from(msg: &'a str) -> Self {
LZJDError::Msg {
msg: msg.to_owned(),
msg: msg.to_string(),
}
}
}

pub type Result<T> = std::result::Result<T, LZJDError>;
pub type Result<T> = core::result::Result<T, LZJDError>;

#[cfg(test)]
mod tests {
use crate::crc32::CRC32BuildHasher;
use crate::*;
use std::f64::EPSILON;
use core::f64::EPSILON;

#[test]
fn test_optimized_dist() {
Expand Down
11 changes: 6 additions & 5 deletions src/lz_dict.rs → lib/src/lz_dict.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
use crate::Result;
use core::hash::BuildHasher;
use core::hash::Hasher;
use core::hash::{Hasher, BuildHasher};
use core::ops::Deref;
use std::collections::HashSet;
use alloc::vec;
use alloc::vec::Vec;
use alloc::string::String;
use hashbrown::HashSet;

/// A sorted list of the k smallest LZSet hashes
#[derive(Debug)]
Expand Down Expand Up @@ -185,8 +187,7 @@ impl From<LZDict> for Vec<i32> {
mod tests {
use crate::crc32::CRC32BuildHasher;
use crate::lz_dict::LZDict;
use std::f64::EPSILON;
use std::iter::*;
use core::f64::EPSILON;

fn is_sorted_and_unique<T: PartialOrd>(list: &[T]) -> bool {
if list.len() <= 1 {
Expand Down
2 changes: 1 addition & 1 deletion src/murmur3.rs → lib/src/murmur3.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use fasthash::FastHasher;
use std::hash::BuildHasher;
use core::hash::BuildHasher;

pub struct Murmur3BuildHasher;

Expand Down
5 changes: 1 addition & 4 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,10 @@ extern crate lzjd;
#[macro_use]
extern crate failure_derive;

mod crc32;
mod murmur3;
use lzjd::{murmur3, LZDict, LZJDError};

use murmur3::Murmur3BuildHasher;

use lzjd::{LZDict, LZJDError};

use std::fs::File;
use std::io::Write;
use std::io::{self, BufRead, BufReader, BufWriter, Read};
Expand Down