Skip to content

Commit

Permalink
MRG: fix duplicate md5 in picklist problem (#2747)
Browse files Browse the repository at this point in the history
This PR changes picklists using `manifest`, `prefetch`, `search`, and
`gather` coltypes to use a composite tuple of `(ident, md5short)` for
selection. This should have essentially zero false positives, unlike the
current behavior (which relies solely on md5).

The basic problem solved: when two signatures with different signature
metadata (name, etc) but the same md5sum were in a single index,
grepping/extracting via metadata (e.g. with a name) would return all
signatures with matching md5sum.

This PR fixes this behavior, and also cleans up `SignaturePicklist` a
fair bit.

The only potential downsides:
* the memory size of picklists will increase because it is storing
`ident` not just `md5`
* `name`/`ident` takes on more official meaning in sourmash (as in, they
should be meaningful or at least distinct)

Fixes #2617
Fixes #2593
Closes #2602
  • Loading branch information
ctb authored Sep 13, 2023
1 parent 551e32a commit 6df0360
Show file tree
Hide file tree
Showing 8 changed files with 261 additions and 63 deletions.
10 changes: 3 additions & 7 deletions src/sourmash/index/sqlite_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -885,13 +885,9 @@ def picklist(self):

def to_picklist(self):
"Convert this manifest to a picklist."
pickset = set()
for row in self.rows:
pickset.add(row['md5'])

picklist = SignaturePicklist('md5')
picklist.pickset = pickset
return picklist
pl = SignaturePicklist('manifest')
pl.pickset = { pl._get_value_for_manifest_row(row) for row in self.rows }
return pl

@classmethod
def _create_manifest_from_rows(cls, rows_iter, *, location=":memory:",
Expand Down
9 changes: 5 additions & 4 deletions src/sourmash/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from abc import abstractmethod
import itertools

from sourmash.picklist import SignaturePicklist
from sourmash import picklist


class BaseCollectionManifest:
Expand Down Expand Up @@ -339,7 +339,8 @@ def __contains__(self, ss):

def to_picklist(self):
"Convert this manifest to a picklist."
picklist = SignaturePicklist('md5')
picklist.pickset = set(self._md5_set)
pl = picklist.SignaturePicklist('manifest')

return picklist
pl.pickset = { pl._get_value_for_manifest_row(row) for row in self.rows }

return pl
134 changes: 90 additions & 44 deletions src/sourmash/picklist.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
"Picklist code for extracting subsets of signatures."
"""Picklist code for extracting subsets of signatures.
Picklists serve as a central inclusion/exclusion mechanism for sketches.
Each picklist object operates on a particular type of value, and uses
values specified by the user (if using an external picklist) or works with
the output of sourmash (manifests or search/prefetch/gather output).
Two key features of picklists is that they can be passed into Index.select
and operate efficiently on manifests, so when used with e.g. zipfiles,
only the selected sketches are loaded.
"""
import csv
import os
from enum import Enum
Expand All @@ -10,6 +20,7 @@
preprocess['name'] = lambda x: x
preprocess['md5'] = lambda x: x


# identifier matches/prefix foo - space delimited identifiers
preprocess['identprefix'] = lambda x: x.split(' ')[0].split('.')[0]
preprocess['ident'] = lambda x: x.split(' ')[0]
Expand All @@ -18,6 +29,18 @@
preprocess['md5prefix8'] = lambda x: x[:8]
preprocess['md5short'] = lambda x: x[:8]

# all meta-coltypes use the same preprocessing of tuple => (ident, md5short)
def combine_ident_md5(x):
"preprocess (name, md5) tup into (ident, md5short) tup"
name, md5 = x
ident = name.split(' ')[0]
md5 = md5[:8]
return (ident, md5)
preprocess['manifest'] = combine_ident_md5
preprocess['prefetch'] = combine_ident_md5
preprocess['gather'] = combine_ident_md5
preprocess['search'] = combine_ident_md5


class PickStyle(Enum):
INCLUDE = 1
Expand Down Expand Up @@ -45,10 +68,11 @@ class SignaturePicklist:
Identifiers are constructed by using the first space delimited word in
the signature name.
You can also use 'gather', 'prefetch', 'search' and 'manifest' as
You can also use 'gather', 'prefetch', 'search', and 'manifest' as
column types; these take the CSV output of 'gather', 'prefetch',
'search', and 'sig manifest' as picklists. 'column' must be left
blank in this case: e.g. use 'pickfile.csv::gather'.
'search', and 'manifest' as picklists. 'column' must be left
blank in this case: e.g. use 'pickfile.csv::gather'. These "meta-coltypes"
use composite selection on (ident, md5short) tuples.
"""
meta_coltypes = ('manifest', 'gather', 'prefetch', 'search')
supported_coltypes = ('md5', 'md5prefix8', 'md5short',
Expand All @@ -66,25 +90,16 @@ def __init__(self, coltype, *, pickfile=None, column_name=None,
self.orig_coltype = coltype
self.orig_colname = column_name

# if we're using gather or prefetch or manifest, set column_name
# if we're using gather, prefetch, manifest, or search, set column_name
# automatically (after checks).
if coltype in self.meta_coltypes:
if column_name:
raise ValueError(f"no column name allowed for coltype '{coltype}'")
if coltype == 'gather':
# for now, override => md5short in column md5
coltype = 'md5prefix8'
column_name = 'md5'
elif coltype == 'prefetch':
# for now, override => md5short in column match_md5
coltype = 'md5prefix8'
column_name = 'match_md5'
elif coltype == 'manifest' or coltype == 'search':
# for now, override => md5
coltype = 'md5'
column_name = 'md5'
else: # should never be reached!
assert 0

if coltype == 'prefetch':
column_name = '(match_name, match_md5)'
else:
column_name = '(name, md5)'

self.coltype = coltype
self.pickfile = pickfile
Expand All @@ -110,7 +125,7 @@ def from_picklist_args(cls, argstr):
elif pickstyle_str == 'exclude':
pickstyle = PickStyle.EXCLUDE
else:
raise ValueError(f"invalid picklist 'pickstyle' argument, '{pickstyle_str}': must be 'include' or 'exclude'")
raise ValueError(f"invalid picklist 'pickstyle' argument 4: '{pickstyle_str}' must be 'include' or 'exclude'")

if len(picklist) != 3:
raise ValueError(f"invalid picklist argument '{argstr}'")
Expand All @@ -122,14 +137,55 @@ def from_picklist_args(cls, argstr):
pickstyle=pickstyle)

def _get_sig_attribute(self, ss):
"for a given SourmashSignature, return attribute for this picklist."
"for a given SourmashSignature, return relevant picklist value."
coltype = self.coltype
if coltype in ('md5', 'md5prefix8', 'md5short'):
if coltype in self.meta_coltypes: # gather, prefetch, search, manifest
q = (ss.name, ss.md5sum())
elif coltype in ('md5', 'md5prefix8', 'md5short'):
q = ss.md5sum()
elif coltype in ('name', 'ident', 'identprefix'):
q = ss.name
else:
assert 0
raise ValueError(f"picklist get_sig_attribute {coltype} has unhandled branch")

return q

def _get_value_for_manifest_row(self, row):
"return the picklist value from a manifest row"
if self.coltype in self.meta_coltypes: # gather, prefetch, search, manifest
q = (row['name'], row['md5'])
else:
if self.coltype == 'md5':
colkey = 'md5'
elif self.coltype in ('md5prefix8', 'md5short'):
colkey = 'md5short'
elif self.coltype in ('name', 'ident', 'identprefix'):
colkey = 'name'
else:
raise ValueError(f"picklist get_value_for_row {colkey} has unhandled branch")

q = row.get(colkey)

assert q
q = self.preprocess_fn(q)

return q

def _get_value_for_csv_row(self, row):
"return the picklist value from a CSV pickfile row - supplied by user, typically"

# customize for each type of meta_coltypes
if self.coltype == 'manifest':
q = (row['name'], row['md5'])
elif self.coltype == 'prefetch':
q = (row['match_name'], row['match_md5'])
elif self.coltype in ('gather', 'search'):
q = (row['name'], row['md5'])
else:
q = row[self.column_name]

if q:
q = self.preprocess_fn(q)

return q

Expand All @@ -140,20 +196,24 @@ def init(self, values=[]):
self.pickset = set(values)
return self.pickset

def load(self, pickfile, column_name, *, allow_empty=False):
def load(self, *, allow_empty=False):
"load pickset, return num empty vals, and set of duplicate vals."
from . import sourmash_args

pickset = self.init()

pickfile = self.pickfile
coltype = self.coltype
column_name = self.column_name

if not os.path.exists(pickfile) or not os.path.isfile(pickfile):
raise ValueError(f"pickfile '{pickfile}' must exist and be a regular file")

n_empty_val = 0
dup_vals = set()

# CTB: not clear to me what a good "default" name would be for a
# picklist CSV inside a zip (default_csv_name). Maybe manifest?
# CTB note: for zipfiles, not clear to me what a good "default" name would be for a
# picklist CSV inside a zip (default_csv_name for FileInputCSV).
with sourmash_args.FileInputCSV(pickfile) as r:
self.pickfile = pickfile
if not r.fieldnames:
Expand All @@ -162,18 +222,15 @@ def load(self, pickfile, column_name, *, allow_empty=False):
else:
return 0, 0

if column_name not in r.fieldnames:
if not (column_name in r.fieldnames or coltype in self.meta_coltypes):
raise ValueError(f"column '{column_name}' not in pickfile '{pickfile}'")

for row in r:
# pick out values from column
col = row[column_name]
col = self._get_value_for_csv_row(row)
if not col:
n_empty_val += 1
continue

col = self.preprocess_fn(col)

# look for duplicate values or empty values
if col in pickset:
dup_vals.add(col)
Expand Down Expand Up @@ -210,17 +267,7 @@ def __contains__(self, ss):

def matches_manifest_row(self, row):
"does the given manifest row match this picklist?"
if self.coltype == 'md5':
colkey = 'md5'
elif self.coltype in ('md5prefix8', 'md5short'):
colkey = 'md5short'
elif self.coltype in ('name', 'ident', 'identprefix'):
colkey = 'name'
else:
assert 0

q = row[colkey]
q = self.preprocess_fn(q)
q = self._get_value_for_manifest_row(row)
self.n_queries += 1

if self.pickstyle == PickStyle.INCLUDE:
Expand All @@ -238,8 +285,7 @@ def matched_csv_row(self, row):
This is used for examining matches/nomatches to original picklist file.
"""
q = row[self.column_name]
q = self.preprocess_fn(q)
q = self._get_value_for_csv_row(row)
self.n_queries += 1

if q in self.found:
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def load_picklist(args):

notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'")

n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name)
n_empty_val, dup_vals = picklist.load()
except ValueError as exc:
error("ERROR: could not load picklist.")
error(str(exc))
Expand Down
27 changes: 26 additions & 1 deletion tests/test_cmd_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -2689,7 +2689,7 @@ def test_sig_extract_12_picklist_bad_pickstyle(runtmp):

err = runtmp.last_result.err
print(err)
assert "invalid picklist 'pickstyle' argument, 'XXX': must be 'include' or 'exclude'" in err
assert "invalid picklist 'pickstyle' argument 4: 'XXX' must be 'include' or 'exclude'" in err


def test_sig_extract_12_picklist_bad_colname(runtmp):
Expand Down Expand Up @@ -2764,6 +2764,31 @@ def test_sig_extract_11_pattern_exclude(runtmp):
assert 'shewanella' not in n.lower(), n


def test_sig_extract_identical_md5s(runtmp):
# test that we properly handle different signatures with identical md5s
sig47 = utils.get_test_data('47.fa.sig')
ss = load_signatures(sig47)
sig = list(ss)[0]
new_sig = sig.to_mutable()
new_sig.name = 'foo'
sig47foo = runtmp.output('foo.sig')
# this was only a problem when the signatures are stored in the same file
with open(sig47foo, 'wt') as fp:
sourmash.save_signatures([new_sig, sig], fp)

runtmp.run_sourmash('sig', 'extract', '--name', 'foo', sig47foo)

out = runtmp.last_result.out
print(out)
ss = load_signatures(out)
ss = list(ss)
assert len(ss) == 1
ss = ss[0]
assert 'Shewanella' not in ss.name
assert 'foo' in ss.name
assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261'


def test_sig_flatten_1(runtmp):
c = runtmp

Expand Down
24 changes: 24 additions & 0 deletions tests/test_cmd_signature_grep.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,3 +387,27 @@ def test_sig_grep_8_count(runtmp):
2 matches: prot/protein.zip
""".splitlines():
assert line.strip() in out


def test_sig_grep_identical_md5s(runtmp):
# test that we properly handle different signatures with identical md5s
sig47 = utils.get_test_data('47.fa.sig')
ss = load_signatures(sig47)
sig = list(ss)[0]
new_sig = sig.to_mutable()
new_sig.name = 'foo'
sig47foo = runtmp.output('foo.sig')
# this was only a problem when the signatures are stored in the same file
with open(sig47foo, 'wt') as fp:
sourmash.save_signatures([new_sig, sig], fp)

runtmp.run_sourmash('sig', 'grep', '-i', 'foo', sig47foo)

out = runtmp.last_result.out
ss = load_signatures(out)
ss = list(ss)
assert len(ss) == 1
ss = ss[0]
assert 'Shewanella' not in ss.name
assert 'foo' in ss.name
assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261'
4 changes: 2 additions & 2 deletions tests/test_lca.py
Original file line number Diff line number Diff line change
Expand Up @@ -2971,7 +2971,7 @@ def test_lca_index_select_with_picklist(runtmp, lca_db_format):

idx = sourmash.load_file_as_index(outdb)
picklist_obj = SignaturePicklist.from_picklist_args(f"{picklist}:md5:md5")
picklist_obj.load(picklist_obj.pickfile, picklist_obj.column_name)
picklist_obj.load()

idx = idx.select(picklist=picklist_obj)

Expand Down Expand Up @@ -3002,7 +3002,7 @@ def test_lca_index_select_with_picklist_exclude(runtmp, lca_db_format):

idx = sourmash.load_file_as_index(outdb)
picklist_obj = SignaturePicklist.from_picklist_args(f"{picklist}:md5:md5:exclude")
picklist_obj.load(picklist_obj.pickfile, picklist_obj.column_name)
picklist_obj.load()
idx = idx.select(picklist=picklist_obj)

siglist = list(idx.signatures())
Expand Down
Loading

0 comments on commit 6df0360

Please sign in to comment.