Skip to content

Commit

Permalink
working skipmers, i think!
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Nov 21, 2024
1 parent c8229d5 commit f5e4153
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 7 deletions.
3 changes: 1 addition & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ crate-type = ["cdylib"]
pyo3 = { version = "0.23.1", features = ["extension-module", "anyhow"] }
rayon = "1.10.0"
serde = { version = "1.0.215", features = ["derive"] }
sourmash = { version = "0.17.2", features = ["branchwater"] }
#sourmash = { version = "0.17.2", features = ["branchwater"] }
sourmash = {git = "https://github.com/sourmash-bio/sourmash/", branch = "try-skipmers", features = ["branchwater"]}
serde_json = "1.0.133"
niffler = "2.4.0"
log = "0.4.22"
Expand Down
2 changes: 1 addition & 1 deletion src/python/sourmash_plugin_branchwater/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ def main(self, args):
args.param_string = ["k=31,scaled=1000,dna"]

# Check and append 'dna' if no moltype is found in a param string
moltypes = ["dna", "protein", "dayhoff", "hp"]
moltypes = ["dna", "protein", "dayhoff", "hp", "skipmer"]
updated_param_strings = []

for param in args.param_string:
Expand Down
130 changes: 130 additions & 0 deletions src/python/tests/test_sketch.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import pytest
import csv
import pandas
import sourmash
from sourmash import index
Expand Down Expand Up @@ -1435,3 +1436,132 @@ def test_singlesketch_zip_output(runtmp):
runtmp.sourmash("sketch", "dna", fa1, "-o", output2)
sig2 = sourmash.load_one_signature(output2)
assert sig.minhash.hashes == sig2.minhash.hashes


def test_manysketch_skipmer(runtmp, capfd):
fa_csv = runtmp.output("db-fa.csv")

fa1 = get_test_data("short.fa")
fa2 = get_test_data("short2.fa")
fa3 = get_test_data("short3.fa")
protfa1 = get_test_data("short-protein.fa")

make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1])

output = runtmp.output("db.zip")

runtmp.sourmash(
"scripts",
"manysketch",
fa_csv,
"-o",
output,
"--param-str",
"dna,k=21,scaled=1",
"--param-str",
"skipmer,k=31,scaled=30",
)

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty
captured = capfd.readouterr()
print(captured.err)

idx = sourmash.load_file_as_index(output)
sigs = list(idx.signatures())
print(sigs)

assert len(sigs) == 3 # 3 dna, 3 skipmer. But sourmash can only read the DNA sigs!!
# check moltypes, etc!
dna_md5sums = {
"short": "1474578c5c46dd09da4c2df29cf86621",
"short2": "4efeebd26644278e36b9553e018a851a",
"short3": "f85747ac4f473c4a71c1740d009f512b",
}
for sig in sigs:
if sig.minhash.is_dna:
assert sig.minhash.ksize == 21
assert sig.minhash.scaled == 1
print("DNA: ", sig.name, sig.md5sum())
assert sig.md5sum() == dna_md5sums[sig.name]

# read the file with python and check sigs
import zipfile, gzip, json

with zipfile.ZipFile(output, "r") as zf:
# Check the manifest exists
assert "SOURMASH-MANIFEST.csv" in zf.namelist()

expected_signatures = [
{
"name": "short",
"ksize": 31,
"scaled": 30,
"moltype": "skipmer",
"md5sum": "c7b5b8d98c6fde00e2a25769d5eb470a",
},
{
"name": "short3",
"ksize": 31,
"scaled": 30,
"moltype": "skipmer",
"md5sum": "ed5bbe1b2a9a5cd83a6c8583e8122518",
},
{
"name": "short2",
"ksize": 31,
"scaled": 30,
"moltype": "skipmer",
"md5sum": "13c456b9a346284d0acdf49b3e529da6",
},
]
expected_signatures_dict = {exp["md5sum"]: exp for exp in expected_signatures}

# Read and parse the manifest
with zf.open("SOURMASH-MANIFEST.csv") as manifest_file:
manifest_data = manifest_file.read().decode("utf-8").splitlines()
manifest_data = [line for line in manifest_data if not line.startswith("#")]
manifest_reader = csv.DictReader(manifest_data)

for row in manifest_reader:
if row["moltype"] == "skipmer":
print("Manifest Row:", row)

# Validate row fields
md5sum = row["md5"]
assert (
md5sum in expected_signatures_dict
), f"Unexpected md5sum: {md5sum}"
expected = expected_signatures_dict[md5sum]
assert (
row["name"] == expected["name"]
), f"Name mismatch: {row['name']}"
assert (
int(row["ksize"]) == expected["ksize"]
), f"Ksize mismatch: {row['ksize']}"
assert (
row["moltype"] == expected["moltype"]
), f"Moltype mismatch: {row['moltype']}"

sig_path = row["internal_location"]
assert sig_path.startswith("signatures/")

# Extract and read the signature file
with zf.open(sig_path) as sig_gz:
with gzip.open(sig_gz, "rt") as sig_file:
sig_contents = json.load(sig_file)
print("Signature Contents:", sig_contents)

# Validate signature contents
sig_data = sig_contents[0]
print(sig_data)
siginfo = sig_data["signatures"][0]
assert (
siginfo["md5sum"] == md5sum
), f"MD5 mismatch: {siginfo['md5sum']}"
assert (
siginfo["ksize"] == expected["ksize"]
), f"Ksize mismatch: {siginfo['ksize']}"
assert (
siginfo["molecule"] == expected["moltype"]
), f"Moltype mismatch: {siginfo['molecule']}"
25 changes: 22 additions & 3 deletions src/utils/buildutils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ impl MultiSelection {
pub fn from_input_moltype(input_moltype: &str) -> Result<Self, SourmashError> {
// currently we don't allow translation. Will need to change this when we do.
// is there a better way to do this?
let mut moltypes = vec!["DNA"];
let mut moltypes = vec!["DNA", "skipmer"]; // change so default is just dna?
if input_moltype == "protein" {
moltypes = vec!["protein", "dayhoff", "hp"];
}
Expand Down Expand Up @@ -181,6 +181,15 @@ impl BuildRecord {
}
}

pub fn default_skipmer() -> Self {
Self {
moltype: "skipmer".to_string(),
ksize: 21,
scaled: 1000,
..Self::default_dna()
}
}

pub fn moltype(&self) -> HashFunctions {
self.moltype.as_str().try_into().unwrap()
}
Expand Down Expand Up @@ -420,6 +429,13 @@ impl BuildCollection {
Ok(mf.records.len())
}

pub fn skipmer_size(&self) -> Result<usize, SourmashError> {
let multiselection = MultiSelection::from_moltypes(vec!["skipmer"])?;
let mut mf = self.manifest.clone(); // temporary mutable copy
mf.select(&multiselection)?;
Ok(mf.records.len())
}

pub fn protein_size(&self) -> Result<usize, SourmashError> {
let multiselection = MultiSelection::from_moltypes(vec!["protein"])?;
let mut mf = self.manifest.clone(); // temporary mutable copy
Expand Down Expand Up @@ -466,7 +482,7 @@ impl BuildCollection {

pub fn parse_moltype(item: &str, current: &mut Option<String>) -> Result<String, String> {
let new_moltype = match item {
"protein" | "dna" | "dayhoff" | "hp" => item.to_string(),
"protein" | "dna" | "dayhoff" | "hp" | "skipmer" => item.to_string(),
_ => return Err(format!("unknown moltype '{}'", item)),
};

Expand Down Expand Up @@ -531,7 +547,7 @@ impl BuildCollection {
"abund" | "noabund" => {
Self::parse_abundance(item, &mut track_abundance)?;
}
"protein" | "dna" | "DNA" | "dayhoff" | "hp" => {
"protein" | "dna" | "DNA" | "dayhoff" | "hp" | "skipmer" => {
Self::parse_moltype(item, &mut moltype)?;
}
_ if item.starts_with("num=") => {
Expand Down Expand Up @@ -566,6 +582,7 @@ impl BuildCollection {
"protein" => BuildRecord::default_protein(),
"dayhoff" => BuildRecord::default_dayhoff(),
"hp" => BuildRecord::default_hp(),
"skipmer" => BuildRecord::default_skipmer(),
_ => {
return Err(format!(
"Error parsing params string '{}': Unsupported moltype '{}'",
Expand Down Expand Up @@ -660,6 +677,7 @@ impl BuildCollection {
.dna(record.moltype == "DNA")
.dayhoff(record.moltype == "dayhoff")
.hp(record.moltype == "hp")
.skipmer(record.moltype == "skipmer")
.num_hashes(record.num)
.track_abundance(record.with_abundance)
.build();
Expand Down Expand Up @@ -748,6 +766,7 @@ impl BuildCollection {
}
} else if (input_moltype == "DNA" || input_moltype == "dna")
&& rec.moltype() == HashFunctions::Murmur64Dna
|| rec.moltype() == HashFunctions::Murmur64Skipmer
{
sig.add_sequence(&record.seq(), true)
.context("Failed to add sequence")?;
Expand Down

0 comments on commit f5e4153

Please sign in to comment.