Skip to content

Commit

Permalink
🚧 add sequence support
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin committed Aug 30, 2024
1 parent eb5a01a commit f86323a
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 2 deletions.
105 changes: 103 additions & 2 deletions augur/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
from augur.errors import AugurError
from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, Metadata
from augur.io.print import print_err, print_debug
from augur.io.sequences import read_sequences
from augur.utils import first_line


Expand Down Expand Up @@ -116,6 +117,17 @@ class NamedFile:
table_name: str
"""Generated SQLite table name for this file, based on *name*."""

path: str


class NamedSequences(NamedFile):
def __init__(self, name: str, path: str):
self.name = name
self.path = path
self.table_name = f"sequences_{self.name}"

def __repr__(self):
return f"<NamedSequences {self.name}={self.path}>"

class NamedMetadata(Metadata, NamedFile):
def __init__(self, name: str, *args, **kwargs):
Expand All @@ -132,26 +144,30 @@ def register_parser(parent_subparsers):

input_group = parser.add_argument_group("inputs", "options related to input")
input_group.add_argument("--metadata", nargs="+", action="extend", metavar="NAME=FILE", help="Required. Metadata table names and file paths. Names are arbitrary monikers used solely for referring to the associated input file in other arguments and in output column names. Paths must be to seekable files, not unseekable streams. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
input_group.add_argument("--sequences", nargs="+", action="extend", metavar="NAME=FILE", help="Sequence files. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)

input_group.add_argument("--metadata-id-columns", default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, metavar="[TABLE=]COLUMN", help=f"Possible metadata column names containing identifiers, considered in the order given. Columns will be considered for all metadata tables by default. Table-specific column names may be given using the same names assigned in --metadata. Only one ID column will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_ID_COLUMNS))})" + SKIP_AUTO_DEFAULT_IN_HELP)
input_group.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, metavar="[TABLE=]CHARACTER", help=f"Possible field delimiters to use for reading metadata tables, considered in the order given. Delimiters will be considered for all metadata tables by default. Table-specific delimiters may be given using the same names assigned in --metadata. Only one delimiter will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_DELIMITERS))})" + SKIP_AUTO_DEFAULT_IN_HELP)

output_group = parser.add_argument_group("outputs", "options related to output")
output_group.add_argument('--output-metadata', metavar="FILE", help="Required. Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
output_group.add_argument('--output-sequences', metavar="FILE", help="Required. Merged sequences as FASTA. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
output_group.add_argument('--quiet', action="store_true", default=False, help="Suppress informational and warning messages normally written to stderr. (default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP)

return parser


def validate_arguments(args):
# These will make more sense when sequence support is added.
if not args.metadata:
if not any((args.metadata, args.sequences)):
raise AugurError("At least one input must be specified.")
if not args.output_metadata:
if not any((args.output_metadata, args.output_sequences)):
raise AugurError("At least one output must be specified.")

if args.output_metadata and not args.metadata:
raise AugurError("--output-metadata requires --metadata.")
if args.output_sequences and not args.sequences:
raise AugurError("--output-sequences requires --sequences.")


def run(args):
Expand All @@ -165,14 +181,33 @@ def run(args):

db = Database()

if args.metadata and args.sequences:
# FIXME: check order of inputs
# ERROR: Order of inputs differs between metadata (a,b) and sequences (b,a).

# FIXME: check that inputs match
# ERROR: Sequence file (c.fasta) does not have a corresponding metadata file.
...

if args.metadata:
metadata = get_metadata(args.metadata, args.metadata_id_columns, args.metadata_delimiters)
output_columns = get_output_columns(metadata)
load_metadata(db, metadata)

if args.sequences:
sequences = get_sequences(args.sequences)
load_sequences(db, sequences)

# FIXME: check that entries in input match
# WARNING: Sequence 'XXX' in a.tsv is missing from a.fasta. It will not be present in any output.
# WARNING: Sequence 'YYY' in b.fasta is missing from b.csv. It will not be present in any output.

if args.output_metadata:
merge_metadata(db, metadata, output_columns, args.output_metadata)

if args.output_sequences:
merge_sequences(db, sequences, args.output_sequences)


def get_metadata(
input_metadata: Sequence[str],
Expand Down Expand Up @@ -340,6 +375,72 @@ def merge_metadata(
db.cleanup()


def get_sequences(input_sequences):
# Validate arguments
sequences = parse_named_inputs(input_sequences)

return [NamedSequences(name, path) for name, path in sequences]


def load_sequences(db: Database, sequences: List[NamedSequences]):
for sequence_input in sequences:
ids = [seq.id for seq in read_sequences(sequence_input.path)]

# FIXME: error on duplicates within a single sequence file

db.run(f"CREATE TABLE {sequence_input.table_name} (id TEXT)")
values = ", ".join([f"('{seq_id}')" for seq_id in ids])
db.run(f"INSERT INTO {sequence_input.table_name} (id) VALUES {values}")


# FIXME: return a list of arguments and don't use shell
def cat(filepath: str):
cat = "cat"

if filepath.endswith(".gz"):
cat = "gzcat"
if filepath.endswith(".xz"):
cat = "xzcat"
if filepath.endswith(".zst"):
cat = "zstdcat"

return f"{cat} {filepath}"


def merge_sequences(
db: Database,
sequences: List[NamedSequences],
output_sequences: str,
):
# Confirm that seqkit is installed.
if which("seqkit") is None:
raise AugurError("'seqkit' is not installed! This is required to merge sequences.")

# Reversed because seqkit rmdup keeps the first entry but this command
# should keep the last entry.
# FIXME: don't use shell. just using it to get a sense of feasibility.
# FIXME: is seqkit overkill here? compare to ncov's drop_duplicate_sequences which is plain Python.
# https://github.com/nextstrain/ncov/blob/0769ac0429df8456ce70be2f74dc885d7b7fab12/scripts/sanitize_sequences.py#L127
cat_processes = (f"<({cat(filepath)})" for filepath in reversed([sequence_input.path for sequence_input in sequences]))
shell_cmd = f"cat {' '.join(cat_processes)} | seqkit rmdup"
print_debug(F"running shell command {shell_cmd!r}")
process = subprocess.Popen(shell_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE)

# FIXME: output only the sequences that are also present in metadata
# 1. before calling this function, create an index table mapping ID -> output (boolean)
# 2. create a file with list of IDs to output
# 3. use `seqkit grep` to filter the output of rmdup

# FIXME: handle `-` better
output = process.communicate()[0]
if output_sequences == "-":
sys.stdout.write(output.decode())
else:
with open(output_sequences, "w") as f:
f.write(output.decode())


# FIXME: do this for seqkit too
def sqlite3(*args, **kwargs):
"""
Internal helper for invoking ``sqlite3``, the SQLite CLI program.
Expand Down
38 changes: 38 additions & 0 deletions tests/functional/merge/cram/merge-sequences.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
SETUP

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"

Merge sequences without metadata

$ cat >x.fasta <<~~
> >seq1
> ATCG
> >seq2
> GCTA
> >seq3
> TCGA
> ~~

$ cat >y.fasta <<~~
> >seq3
> ATCG
> >seq4
> GCTA
> ~~

$ ${AUGUR} merge \
> --sequences x=x.fasta y=y.fasta \
> --output-sequences - > merged.fasta
[INFO]\x1b[0m 1 duplicated records removed (esc)

$ cat merged.fasta
>seq3
ATCG
>seq4
GCTA
>seq1
ATCG
>seq2
GCTA

FIXME: Merge both sequences and metadata

0 comments on commit f86323a

Please sign in to comment.