From 81eae3629194de0b07116ab4b1ec2385cc8bda70 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 16 Sep 2024 11:38:35 -0700 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20add=20sequence=20support?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- augur/merge.py | 65 +++++++++++++++++-- tests/functional/merge/cram/merge-sequences.t | 58 +++++++++++++++++ 2 files changed, 118 insertions(+), 5 deletions(-) create mode 100644 tests/functional/merge/cram/merge-sequences.t diff --git a/augur/merge.py b/augur/merge.py index bac333ccb..36fb8e6a7 100644 --- a/augur/merge.py +++ b/augur/merge.py @@ -92,24 +92,35 @@ def register_parser(parent_subparsers): input_group.add_argument("--metadata-id-columns", default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, metavar="[TABLE=]COLUMN", help=f"Possible metadata column names containing identifiers, considered in the order given. Columns will be considered for all metadata tables by default. Table-specific column names may be given using the same names assigned in --metadata. Only one ID column will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_ID_COLUMNS))})" + SKIP_AUTO_DEFAULT_IN_HELP) input_group.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, metavar="[TABLE=]CHARACTER", help=f"Possible field delimiters to use for reading metadata tables, considered in the order given. Delimiters will be considered for all metadata tables by default. Table-specific delimiters may be given using the same names assigned in --metadata. Only one delimiter will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_DELIMITERS))})" + SKIP_AUTO_DEFAULT_IN_HELP) + input_group.add_argument("--sequences", nargs="+", action="extend", metavar="FILE", help="Sequence files. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP) + output_group = parser.add_argument_group("outputs", "options related to output") output_group.add_argument('--output-metadata', metavar="FILE", help="Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP) output_group.add_argument('--source-columns', default="__source_metadata_{NAME}", metavar="TEMPLATE", help=f"Template with which to generate names for the columns (described above) identifying the source of each row's data. Must contain a literal placeholder, {{NAME}}, which stands in for the metadata table names assigned in --metadata.") output_group.add_argument('--no-source-columns', dest="source_columns", action="store_const", const=None, help=f"Suppress generated columns (described above) identifying the source of each row's data." + SKIP_AUTO_DEFAULT_IN_HELP) + + output_group.add_argument('--output-sequences', metavar="FILE", help="Merged sequences as FASTA. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP) + output_group.add_argument('--quiet', action="store_true", default=False, help="Suppress informational and warning messages normally written to stderr. (default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP) return parser def validate_arguments(args): - # These will make more sense when sequence support is added. - if not args.metadata: - raise AugurError("--metadata must be specified.") - if not args.output_metadata: - raise AugurError("--output-metadata must be specified.") + if not any((args.metadata, args.sequences)): + raise AugurError("Either --metadata or --sequences must be specified.") + if not any((args.output_metadata, args.output_sequences)): + raise AugurError("Either --output-metadata or --sequences must be specified.") + + if args.metadata and args.sequences: + raise AugurError("--metadata and --sequences cannot be used together.") + # Maybe allow both with a caveat? Example: + # > NOTE: Both metadata and sequences provided in the same call. There is no cross-checking and the order of files for each input is used independently. if args.output_metadata and not args.metadata: raise AugurError("--output-metadata requires --metadata.") + if args.output_sequences and not args.sequences: + raise AugurError("--output-sequences requires --sequences.") def run(args): @@ -119,6 +130,9 @@ def run(args): if args.metadata: merge_metadata(args) + if args.sequences: + merge_sequences(args) + def merge_metadata(args): global print_info @@ -361,6 +375,47 @@ def merge_metadata(args): print_info(f"WARNING: Skipped deletion of {db_path} due to error, but you may want to clean it up yourself (e.g. if it's large).") +# FIXME: return a list of arguments and don't use shell +def cat(filepath: str): + cat = "cat" + + if filepath.endswith(".gz"): + cat = "gzcat" + if filepath.endswith(".xz"): + cat = "xzcat" + if filepath.endswith(".zst"): + cat = "zstdcat" + + return f"{cat} {filepath}" + + +def merge_sequences(args): + # Confirm that seqkit is installed. + if which("seqkit") is None: + raise AugurError("'seqkit' is not installed! This is required to merge sequences.") + + # FIXME: apply --quiet to seqkit + + # Reversed because seqkit rmdup keeps the first entry but this command + # should keep the last entry. + # FIXME: don't use shell. just using it to get a sense of feasibility. + # FIXME: is seqkit overkill here? compare to ncov's drop_duplicate_sequences which is plain Python. + # https://github.com/nextstrain/ncov/blob/0769ac0429df8456ce70be2f74dc885d7b7fab12/scripts/sanitize_sequences.py#L127 + cat_processes = (f"<({cat(filepath)})" for filepath in reversed(args.sequences)) + shell_cmd = f"cat {' '.join(cat_processes)} | seqkit rmdup" + print_debug(F"running shell command {shell_cmd!r}") + process = subprocess.Popen(shell_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE) + + # FIXME: handle `-` better + output = process.communicate()[0] + if args.output_sequences == "-": + sys.stdout.write(output.decode()) + else: + with open(args.output_sequences, "w") as f: + f.write(output.decode()) + + +# FIXME: do this for seqkit too def sqlite3(*args, **kwargs): """ Internal helper for invoking ``sqlite3``, the SQLite CLI program. diff --git a/tests/functional/merge/cram/merge-sequences.t b/tests/functional/merge/cram/merge-sequences.t new file mode 100644 index 000000000..694371268 --- /dev/null +++ b/tests/functional/merge/cram/merge-sequences.t @@ -0,0 +1,58 @@ +SETUP + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + +Merge sequences without metadata + + $ cat >x.fasta <<~~ + > >seq1 + > ATCG + > >seq2 + > GCTA + > >seq3 + > TCGA + > ~~ + + $ cat >y.fasta <<~~ + > >seq3 + > ATCG + > >seq4 + > GCTA + > ~~ + + $ ${AUGUR} merge \ + > --sequences x.fasta y.fasta \ + > --output-sequences - > merged.fasta + [INFO]\x1b[0m 1 duplicated records removed (esc) + + $ cat merged.fasta + >seq3 + ATCG + >seq4 + GCTA + >seq1 + ATCG + >seq2 + GCTA + +Merging sequences with metadata is not supported. + + $ cat >x.tsv <<~~ + > strain a b c + > one X1a X1b X1c + > two X2a X2b X2c + > ~~ + + $ cat >y.tsv <<~~ + > strain b c f e d + > two Y2c Y2f Y2e Y2d + > three Y3f Y3e Y3d + > ~~ + + $ ${AUGUR} merge \ + > --metadata X=x.tsv Y=y.tsv \ + > --sequences x.fasta y.fasta \ + > --output-metadata merged.tsv \ + > --output-sequences merged.fasta + ERROR: --metadata and --sequences cannot be used together. + [2]