From 6df0360366381ebf2302eb864f4ff3de8470a4eb Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 13 Sep 2023 15:25:50 -0700 Subject: [PATCH] MRG: fix duplicate md5 in picklist problem (#2747) This PR changes picklists using `manifest`, `prefetch`, `search`, and `gather` coltypes to use a composite tuple of `(ident, md5short)` for selection. This should have essentially zero false positives, unlike the current behavior (which relies solely on md5). The basic problem solved: when two signatures with different signature metadata (name, etc) but the same md5sum were in a single index, grepping/extracting via metadata (e.g. with a name) would return all signatures with matching md5sum. This PR fixes this behavior, and also cleans up `SignaturePicklist` a fair bit. The only potential downsides: * the memory size of picklists will increase because it is storing `ident` not just `md5` * `name`/`ident` takes on more official meaning in sourmash (as in, they should be meaningful or at least distinct) Fixes #2617 Fixes https://github.com/sourmash-bio/sourmash/issues/2593 Closes https://github.com/sourmash-bio/sourmash/pull/2602 --- src/sourmash/index/sqlite_index.py | 10 +-- src/sourmash/manifest.py | 9 +- src/sourmash/picklist.py | 134 +++++++++++++++++++---------- src/sourmash/sourmash_args.py | 2 +- tests/test_cmd_signature.py | 27 +++++- tests/test_cmd_signature_grep.py | 24 ++++++ tests/test_lca.py | 4 +- tests/test_picklist.py | 114 +++++++++++++++++++++++- 8 files changed, 261 insertions(+), 63 deletions(-) diff --git a/src/sourmash/index/sqlite_index.py b/src/sourmash/index/sqlite_index.py index 16020b17e2..b16eb00b59 100644 --- a/src/sourmash/index/sqlite_index.py +++ b/src/sourmash/index/sqlite_index.py @@ -885,13 +885,9 @@ def picklist(self): def to_picklist(self): "Convert this manifest to a picklist." - pickset = set() - for row in self.rows: - pickset.add(row['md5']) - - picklist = SignaturePicklist('md5') - picklist.pickset = pickset - return picklist + pl = SignaturePicklist('manifest') + pl.pickset = { pl._get_value_for_manifest_row(row) for row in self.rows } + return pl @classmethod def _create_manifest_from_rows(cls, rows_iter, *, location=":memory:", diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py index bfd27eabb9..4bcf1b7dbc 100644 --- a/src/sourmash/manifest.py +++ b/src/sourmash/manifest.py @@ -8,7 +8,7 @@ from abc import abstractmethod import itertools -from sourmash.picklist import SignaturePicklist +from sourmash import picklist class BaseCollectionManifest: @@ -339,7 +339,8 @@ def __contains__(self, ss): def to_picklist(self): "Convert this manifest to a picklist." - picklist = SignaturePicklist('md5') - picklist.pickset = set(self._md5_set) + pl = picklist.SignaturePicklist('manifest') - return picklist + pl.pickset = { pl._get_value_for_manifest_row(row) for row in self.rows } + + return pl diff --git a/src/sourmash/picklist.py b/src/sourmash/picklist.py index 30d5c84f90..8f43aca739 100644 --- a/src/sourmash/picklist.py +++ b/src/sourmash/picklist.py @@ -1,4 +1,14 @@ -"Picklist code for extracting subsets of signatures." +"""Picklist code for extracting subsets of signatures. + +Picklists serve as a central inclusion/exclusion mechanism for sketches. +Each picklist object operates on a particular type of value, and uses +values specified by the user (if using an external picklist) or works with +the output of sourmash (manifests or search/prefetch/gather output). + +Two key features of picklists is that they can be passed into Index.select +and operate efficiently on manifests, so when used with e.g. zipfiles, +only the selected sketches are loaded. +""" import csv import os from enum import Enum @@ -10,6 +20,7 @@ preprocess['name'] = lambda x: x preprocess['md5'] = lambda x: x + # identifier matches/prefix foo - space delimited identifiers preprocess['identprefix'] = lambda x: x.split(' ')[0].split('.')[0] preprocess['ident'] = lambda x: x.split(' ')[0] @@ -18,6 +29,18 @@ preprocess['md5prefix8'] = lambda x: x[:8] preprocess['md5short'] = lambda x: x[:8] +# all meta-coltypes use the same preprocessing of tuple => (ident, md5short) +def combine_ident_md5(x): + "preprocess (name, md5) tup into (ident, md5short) tup" + name, md5 = x + ident = name.split(' ')[0] + md5 = md5[:8] + return (ident, md5) +preprocess['manifest'] = combine_ident_md5 +preprocess['prefetch'] = combine_ident_md5 +preprocess['gather'] = combine_ident_md5 +preprocess['search'] = combine_ident_md5 + class PickStyle(Enum): INCLUDE = 1 @@ -45,10 +68,11 @@ class SignaturePicklist: Identifiers are constructed by using the first space delimited word in the signature name. - You can also use 'gather', 'prefetch', 'search' and 'manifest' as + You can also use 'gather', 'prefetch', 'search', and 'manifest' as column types; these take the CSV output of 'gather', 'prefetch', - 'search', and 'sig manifest' as picklists. 'column' must be left - blank in this case: e.g. use 'pickfile.csv::gather'. + 'search', and 'manifest' as picklists. 'column' must be left + blank in this case: e.g. use 'pickfile.csv::gather'. These "meta-coltypes" + use composite selection on (ident, md5short) tuples. """ meta_coltypes = ('manifest', 'gather', 'prefetch', 'search') supported_coltypes = ('md5', 'md5prefix8', 'md5short', @@ -66,25 +90,16 @@ def __init__(self, coltype, *, pickfile=None, column_name=None, self.orig_coltype = coltype self.orig_colname = column_name - # if we're using gather or prefetch or manifest, set column_name + # if we're using gather, prefetch, manifest, or search, set column_name # automatically (after checks). if coltype in self.meta_coltypes: if column_name: raise ValueError(f"no column name allowed for coltype '{coltype}'") - if coltype == 'gather': - # for now, override => md5short in column md5 - coltype = 'md5prefix8' - column_name = 'md5' - elif coltype == 'prefetch': - # for now, override => md5short in column match_md5 - coltype = 'md5prefix8' - column_name = 'match_md5' - elif coltype == 'manifest' or coltype == 'search': - # for now, override => md5 - coltype = 'md5' - column_name = 'md5' - else: # should never be reached! - assert 0 + + if coltype == 'prefetch': + column_name = '(match_name, match_md5)' + else: + column_name = '(name, md5)' self.coltype = coltype self.pickfile = pickfile @@ -110,7 +125,7 @@ def from_picklist_args(cls, argstr): elif pickstyle_str == 'exclude': pickstyle = PickStyle.EXCLUDE else: - raise ValueError(f"invalid picklist 'pickstyle' argument, '{pickstyle_str}': must be 'include' or 'exclude'") + raise ValueError(f"invalid picklist 'pickstyle' argument 4: '{pickstyle_str}' must be 'include' or 'exclude'") if len(picklist) != 3: raise ValueError(f"invalid picklist argument '{argstr}'") @@ -122,14 +137,55 @@ def from_picklist_args(cls, argstr): pickstyle=pickstyle) def _get_sig_attribute(self, ss): - "for a given SourmashSignature, return attribute for this picklist." + "for a given SourmashSignature, return relevant picklist value." coltype = self.coltype - if coltype in ('md5', 'md5prefix8', 'md5short'): + if coltype in self.meta_coltypes: # gather, prefetch, search, manifest + q = (ss.name, ss.md5sum()) + elif coltype in ('md5', 'md5prefix8', 'md5short'): q = ss.md5sum() elif coltype in ('name', 'ident', 'identprefix'): q = ss.name else: - assert 0 + raise ValueError(f"picklist get_sig_attribute {coltype} has unhandled branch") + + return q + + def _get_value_for_manifest_row(self, row): + "return the picklist value from a manifest row" + if self.coltype in self.meta_coltypes: # gather, prefetch, search, manifest + q = (row['name'], row['md5']) + else: + if self.coltype == 'md5': + colkey = 'md5' + elif self.coltype in ('md5prefix8', 'md5short'): + colkey = 'md5short' + elif self.coltype in ('name', 'ident', 'identprefix'): + colkey = 'name' + else: + raise ValueError(f"picklist get_value_for_row {colkey} has unhandled branch") + + q = row.get(colkey) + + assert q + q = self.preprocess_fn(q) + + return q + + def _get_value_for_csv_row(self, row): + "return the picklist value from a CSV pickfile row - supplied by user, typically" + + # customize for each type of meta_coltypes + if self.coltype == 'manifest': + q = (row['name'], row['md5']) + elif self.coltype == 'prefetch': + q = (row['match_name'], row['match_md5']) + elif self.coltype in ('gather', 'search'): + q = (row['name'], row['md5']) + else: + q = row[self.column_name] + + if q: + q = self.preprocess_fn(q) return q @@ -140,20 +196,24 @@ def init(self, values=[]): self.pickset = set(values) return self.pickset - def load(self, pickfile, column_name, *, allow_empty=False): + def load(self, *, allow_empty=False): "load pickset, return num empty vals, and set of duplicate vals." from . import sourmash_args pickset = self.init() + pickfile = self.pickfile + coltype = self.coltype + column_name = self.column_name + if not os.path.exists(pickfile) or not os.path.isfile(pickfile): raise ValueError(f"pickfile '{pickfile}' must exist and be a regular file") n_empty_val = 0 dup_vals = set() - # CTB: not clear to me what a good "default" name would be for a - # picklist CSV inside a zip (default_csv_name). Maybe manifest? + # CTB note: for zipfiles, not clear to me what a good "default" name would be for a + # picklist CSV inside a zip (default_csv_name for FileInputCSV). with sourmash_args.FileInputCSV(pickfile) as r: self.pickfile = pickfile if not r.fieldnames: @@ -162,18 +222,15 @@ def load(self, pickfile, column_name, *, allow_empty=False): else: return 0, 0 - if column_name not in r.fieldnames: + if not (column_name in r.fieldnames or coltype in self.meta_coltypes): raise ValueError(f"column '{column_name}' not in pickfile '{pickfile}'") for row in r: - # pick out values from column - col = row[column_name] + col = self._get_value_for_csv_row(row) if not col: n_empty_val += 1 continue - col = self.preprocess_fn(col) - # look for duplicate values or empty values if col in pickset: dup_vals.add(col) @@ -210,17 +267,7 @@ def __contains__(self, ss): def matches_manifest_row(self, row): "does the given manifest row match this picklist?" - if self.coltype == 'md5': - colkey = 'md5' - elif self.coltype in ('md5prefix8', 'md5short'): - colkey = 'md5short' - elif self.coltype in ('name', 'ident', 'identprefix'): - colkey = 'name' - else: - assert 0 - - q = row[colkey] - q = self.preprocess_fn(q) + q = self._get_value_for_manifest_row(row) self.n_queries += 1 if self.pickstyle == PickStyle.INCLUDE: @@ -238,8 +285,7 @@ def matched_csv_row(self, row): This is used for examining matches/nomatches to original picklist file. """ - q = row[self.column_name] - q = self.preprocess_fn(q) + q = self._get_value_for_csv_row(row) self.n_queries += 1 if q in self.found: diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index f0b682f80c..8b149d7d1d 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -125,7 +125,7 @@ def load_picklist(args): notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") - n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name) + n_empty_val, dup_vals = picklist.load() except ValueError as exc: error("ERROR: could not load picklist.") error(str(exc)) diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 680924568a..28501191f0 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -2689,7 +2689,7 @@ def test_sig_extract_12_picklist_bad_pickstyle(runtmp): err = runtmp.last_result.err print(err) - assert "invalid picklist 'pickstyle' argument, 'XXX': must be 'include' or 'exclude'" in err + assert "invalid picklist 'pickstyle' argument 4: 'XXX' must be 'include' or 'exclude'" in err def test_sig_extract_12_picklist_bad_colname(runtmp): @@ -2764,6 +2764,31 @@ def test_sig_extract_11_pattern_exclude(runtmp): assert 'shewanella' not in n.lower(), n +def test_sig_extract_identical_md5s(runtmp): + # test that we properly handle different signatures with identical md5s + sig47 = utils.get_test_data('47.fa.sig') + ss = load_signatures(sig47) + sig = list(ss)[0] + new_sig = sig.to_mutable() + new_sig.name = 'foo' + sig47foo = runtmp.output('foo.sig') + # this was only a problem when the signatures are stored in the same file + with open(sig47foo, 'wt') as fp: + sourmash.save_signatures([new_sig, sig], fp) + + runtmp.run_sourmash('sig', 'extract', '--name', 'foo', sig47foo) + + out = runtmp.last_result.out + print(out) + ss = load_signatures(out) + ss = list(ss) + assert len(ss) == 1 + ss = ss[0] + assert 'Shewanella' not in ss.name + assert 'foo' in ss.name + assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + + def test_sig_flatten_1(runtmp): c = runtmp diff --git a/tests/test_cmd_signature_grep.py b/tests/test_cmd_signature_grep.py index 7979c98847..17dd5ee2dc 100644 --- a/tests/test_cmd_signature_grep.py +++ b/tests/test_cmd_signature_grep.py @@ -387,3 +387,27 @@ def test_sig_grep_8_count(runtmp): 2 matches: prot/protein.zip """.splitlines(): assert line.strip() in out + + +def test_sig_grep_identical_md5s(runtmp): + # test that we properly handle different signatures with identical md5s + sig47 = utils.get_test_data('47.fa.sig') + ss = load_signatures(sig47) + sig = list(ss)[0] + new_sig = sig.to_mutable() + new_sig.name = 'foo' + sig47foo = runtmp.output('foo.sig') + # this was only a problem when the signatures are stored in the same file + with open(sig47foo, 'wt') as fp: + sourmash.save_signatures([new_sig, sig], fp) + + runtmp.run_sourmash('sig', 'grep', '-i', 'foo', sig47foo) + + out = runtmp.last_result.out + ss = load_signatures(out) + ss = list(ss) + assert len(ss) == 1 + ss = ss[0] + assert 'Shewanella' not in ss.name + assert 'foo' in ss.name + assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' diff --git a/tests/test_lca.py b/tests/test_lca.py index 8977f94763..3ebfe14090 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -2971,7 +2971,7 @@ def test_lca_index_select_with_picklist(runtmp, lca_db_format): idx = sourmash.load_file_as_index(outdb) picklist_obj = SignaturePicklist.from_picklist_args(f"{picklist}:md5:md5") - picklist_obj.load(picklist_obj.pickfile, picklist_obj.column_name) + picklist_obj.load() idx = idx.select(picklist=picklist_obj) @@ -3002,7 +3002,7 @@ def test_lca_index_select_with_picklist_exclude(runtmp, lca_db_format): idx = sourmash.load_file_as_index(outdb) picklist_obj = SignaturePicklist.from_picklist_args(f"{picklist}:md5:md5:exclude") - picklist_obj.load(picklist_obj.pickfile, picklist_obj.column_name) + picklist_obj.load() idx = idx.select(picklist=picklist_obj) siglist = list(idx.signatures()) diff --git a/tests/test_picklist.py b/tests/test_picklist.py index aba2d32d95..dd331868f5 100644 --- a/tests/test_picklist.py +++ b/tests/test_picklist.py @@ -4,20 +4,126 @@ import pytest import sourmash +import copy + import sourmash_tst_utils as utils +from sourmash import picklist from sourmash.picklist import SignaturePicklist +from sourmash.index import LinearIndex, MultiIndex +from sourmash.index.sqlite_index import SqliteIndex def test_load_empty_picklist_fail(): empty = utils.get_test_data('picklist/empty.csv') - pl = SignaturePicklist('manifest') + pl = SignaturePicklist('manifest', pickfile=empty) with pytest.raises(ValueError): - pl.load(empty, 'foo', allow_empty=False) + pl.load(allow_empty=False) def test_load_empty_picklist_allow(): empty = utils.get_test_data('picklist/empty.csv') - pl = SignaturePicklist('manifest') - pl.load(empty, 'foo', allow_empty=True) + pl = SignaturePicklist('manifest', pickfile=empty) + pl.load(allow_empty=True) + + +def test_dup_md5_picked(runtmp): + # load a sig, duplicate, and see if a picklist gets the right one + sig47 = utils.get_test_data('47.fa.sig') + ss = sourmash.load_signatures(sig47) + sig = list(ss)[0] + + # save a manifest with one entry + xl = LinearIndex([sig]) + ml = MultiIndex.load([xl], [None], None) + + print(ml.manifest.rows) + assert len(ml.manifest) == 1 + + mf_csv = runtmp.output('select.csv') + ml.manifest.write_to_filename(mf_csv) + + # now make an index to select against, with an identical signature + # (but diff name) + new_sig = sig.to_mutable() + new_sig.name = 'foo' + xl = LinearIndex([sig, new_sig]) + ml2 = MultiIndex.load([xl], [None], None) + + assert len(ml2) == 2 + + # create a picklist... + pl = SignaturePicklist('manifest', pickfile=mf_csv) + print(pl.load()) + print('loaded:', len(pl.pickset)) + + # use in select + ml3 = ml2.select(picklist=pl) + print('picked:', len(ml3)) + + assert len(pl.pickset) == len(ml3) + + +def test_dup_md5_picked_mf_to_picklist(runtmp): + # load a sig, duplicate, and see if a picklist gets the right one + # uses an in memory picklist + sig47 = utils.get_test_data('47.fa.sig') + ss = sourmash.load_signatures(sig47) + sig = list(ss)[0] + + # save a manifest with one entry + xl = LinearIndex([sig]) + ml = MultiIndex.load([xl], [None], None) + + print(ml.manifest.rows) + assert len(ml.manifest) == 1 + + pl = ml.manifest.to_picklist() + + # now make an index to select against, with an identical signature + # (but diff name) + new_sig = sig.to_mutable() + new_sig.name = 'foo' + xl = LinearIndex([sig, new_sig]) + ml2 = MultiIndex.load([xl], [None], None) + + assert len(ml2) == 2 + + # use picklist in select + ml3 = ml2.select(picklist=pl) + print('picked:', len(ml3)) + + assert len(pl.pickset) == len(ml3) + + +def test_dup_md5_picked_mf_to_picklist_sqlite(runtmp): + # load a sig, duplicate, and see if a picklist gets the right one + # use a sqlite db with its own to_picklist behavior. + sig47 = utils.get_test_data('47.fa.sig') + ss = sourmash.load_signatures(sig47) + sig = list(ss)[0] + + # save a manifest with one entry + xl = SqliteIndex.create(':memory:') + xl.insert(sig) + + print(xl.manifest.rows) + assert len(xl.manifest) == 1 + + pl = xl.manifest.to_picklist() + + # now make an index to select against, with an identical signature + # (but diff name) + new_sig = sig.to_mutable() + new_sig.name = 'foo' + xl = LinearIndex([sig, new_sig]) + ml2 = MultiIndex.load([xl], [None], None) + + assert len(ml2) == 2 + + # use picklist in select + ml3 = ml2.select(picklist=pl) + print('picked:', len(ml3)) + + assert len(pl.pickset) == len(ml3) \ No newline at end of file