diff --git a/README.md b/README.md index c356412..2ec62d2 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,115 @@ # sc2ts Infer a succinct tree sequence from SARS-COV-2 variation data -** This is an early version not intended for production use!!** +**This is an early version not intended for production use!!** + +If you are interested in helping to develop sc2ts or would like to +work with the inferred ARGs, please get in touch. + +## Installation + +To run the downstream analysis utilities, install from pip using + +``` +python3 -m pip install sc2ts[analysis] +``` + +This installs matplotlib and some other heavyweight dependencies. + +For just running the inference tools, use + +``` +python3 -m pip install sc2ts +``` + +## Inference workflow + +### Command line inference + +Inference is intended to be run from the command-line primarily, +and most likely orchestrated via a shell script or Snakemake file, etc. + +The CLI is split into subcommands. Get help by running the CLI without arguments: + +``` +python3 -m sc2ts +``` + +### Import metadata to local database + +Metadata for all samples must be available, and provided in a tab-separated +file. We need to convert from a standard text file to a SQLite database +so that we can quickly search for strains collected on a given day, without +loading the entire set each time. + +``` +python3 -m sc2ts import-metadata data/metadata.tsv data/metadata.db +``` + +**TODO: Document required fields** + +### Import alignments + +To provide fast access to the individual alignments, we store them in a local +database file. These must be imported before inference can be performed. + +The basic approach is to use the ``import-alignments`` command, with a +path to an ``alignments.db`` file which we are creating, and one or more +FASTA files that we are importing into it. 
+ +```bash +python3 -m sc2ts import-alignments data/alignments.db data/alignments/*.fasta +``` + +By default the database file is updated each time, so this can be done +in stages. + +**TODO discuss the storage and time requirements for this step!** + + +### Run the inference + +The basic approach is to run the ``daily-extend`` command which runs the +basic extension operation day-by-day using the information +in the metadata DB. + +``` +python3 -m sc2ts daily-extend data/alignments.db data/metadata.db results/output-prefix +``` + +### Example run script + +Here is a script used to run the inference for the Long ARG +in the preprint: + +``` +#!/bin/bash +set -e + +precision=12 +mismatches=3 +max_submission_delay=30 +max_daily_samples=1000 +num_threads=40 + +datadir=data +run_id=upgma-mds-$max_daily_samples-md-$max_submission_delay-mm-$mismatches +resultsdir=results/$run_id +results_prefix=$resultsdir/$run_id- +logfile=logs/$run_id.log + +options="--num-threads $num_threads -vv -l $logfile " +options+="--max-submission-delay $max_submission_delay " +options+="--max-daily-samples $max_daily_samples " +options+="--precision $precision --num-mismatches $mismatches" + +mkdir -p $resultsdir + +alignments=$datadir/alignments2.db +metadata=$datadir/metadata.filtered.db +# NOTE: we can start from a given date also with the -b option +# basets="$results_prefix"2022-01-24.ts +# options+=" -b $basets" + +python3 -m sc2ts daily-extend $alignments $metadata $results_prefix $options +``` diff --git a/sc2ts/alignments.py b/sc2ts/alignments.py index c53f53f..b827e3f 100644 --- a/sc2ts/alignments.py +++ b/sc2ts/alignments.py @@ -93,7 +93,7 @@ def close(self): self.env.close() def __str__(self): - return str(self.env) + return f"AlignmentStore at {self.env.path()} contains {len(self)} alignments" @staticmethod def initialise(path): diff --git a/sc2ts/cli.py b/sc2ts/cli.py index 14e71f7..65d9baa 100644 --- a/sc2ts/cli.py +++ b/sc2ts/cli.py @@ -77,15 +77,18 @@ def 
setup_logging(verbosity, log_file=None): daiquiri.setup(level=log_level, outputs=outputs, set_excepthook=False) +# TODO add options to list keys, dump specific alignments etc @click.command() -# FIXME this isn't checking for existing! -@click.argument("store", type=click.Path(dir_okay=True, exists=False, file_okay=False)) +@click.argument("store", type=click.Path(exists=True, dir_okay=False)) @click.option("-v", "--verbose", count=True) @click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False)) -def init_alignment_store(store, verbose, log_file): +def info_alignments(store, verbose, log_file): + """ + Information about an alignment store + """ setup_logging(verbose, log_file) - # provenance = get_provenance_dict() - sc2ts.AlignmentStore.initialise(store) + with sc2ts.AlignmentStore(store) as alignment_store: + print(alignment_store) @click.command() @@ -96,6 +99,9 @@ def init_alignment_store(store, verbose, log_file): @click.option("-v", "--verbose", count=True) @click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False)) def import_alignments(store, fastas, initialise, no_progress, verbose, log_file): + """ + Import the alignments from all FASTAS into STORE. + """ setup_logging(verbose, log_file) if initialise: a = sc2ts.AlignmentStore.initialise(store) @@ -120,79 +126,25 @@ def import_metadata(metadata, db, verbose): sc2ts.MetadataDb.import_csv(metadata, db) -def add_provenance(ts, output_file): - # Record provenance here because this is where the arguments are provided. - provenance = get_provenance_dict() - tables = ts.dump_tables() - tables.provenances.add_row(json.dumps(provenance)) - tables.dump(output_file) - - @click.command() -@click.argument("output") -@click.option("-v", "--verbose", count=True) -def init(output, verbose): - """ - Creates the initial tree sequence containing the reference sequence. 
- """ - setup_logging(verbose) - ts = inference.initial_ts() - add_provenance(ts, output) - - -@click.command() -@click.argument("alignments", type=click.Path(exists=True, dir_okay=False)) @click.argument("metadata", type=click.Path(exists=True, dir_okay=False)) -@click.argument("base", type=click.Path(dir_okay=False)) -@click.argument("output", type=click.Path(dir_okay=False)) -@click.argument("date") -@click.option("--num-mismatches", default=None, type=float, help="num-mismatches") -@click.option( - "--max-submission-delay", - default=None, - type=int, - help=( - "The maximum number of days between the sample and its submission date " - "for it to be included in the inference" - ), -) -@click.option("--num-threads", default=0, type=int, help="Number of match threads") -@click.option("-p", "--precision", default=None, type=int, help="Match precision") -@click.option("--no-progress", default=False, type=bool, help="Don't show progress") @click.option("-v", "--verbose", count=True) @click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False)) -def extend( - alignments, - metadata, - base, - output, - date, - num_mismatches, - max_submission_delay, - num_threads, - precision, - no_progress, - verbose, - log_file, -): +def info_metadata(metadata, verbose, log_file): + """ + Information about a metadata DB + """ setup_logging(verbose, log_file) + with sc2ts.MetadataDb(metadata) as metadata_db: + print(metadata_db) - with contextlib.ExitStack() as exit_stack: - alignment_store = exit_stack.enter_context(sc2ts.AlignmentStore(alignments)) - metadata_db = exit_stack.enter_context(sc2ts.MetadataDb(metadata)) - base_ts = tskit.load(base) - ts = inference.extend( - alignment_store=alignment_store, - metadata_db=metadata_db, - date=date, - base_ts=base_ts, - num_mismatches=num_mismatches, - max_submission_delay=max_submission_delay, - precision=precision, - num_threads=num_threads, - show_progress=not no_progress, - ) - add_provenance(ts, output) + +def 
add_provenance(ts, output_file): + # Record provenance here because this is where the arguments are provided. + provenance = get_provenance_dict() + tables = ts.dump_tables() + tables.provenances.add_row(json.dumps(provenance)) + tables.dump(output_file) @click.command() @@ -249,6 +201,9 @@ def daily_extend( verbose, log_file, ): + """ + Sequentially extend the trees by adding samples in daily batches. + """ setup_logging(verbose, log_file) rng = random.Random(random_seed) if base is None: @@ -281,6 +236,9 @@ def daily_extend( @click.argument("ts_file") @click.option("-v", "--verbose", count=True) def validate(alignment_db, ts_file, verbose): + """ + Check that the specified trees correctly encode alignments for samples. + """ setup_logging(verbose) if ts_file.endswith(".tsz"): @@ -388,12 +346,11 @@ def cli(): pass -cli.add_command(init_alignment_store) cli.add_command(import_alignments) cli.add_command(import_metadata) +cli.add_command(info_alignments) +cli.add_command(info_metadata) -cli.add_command(init) -cli.add_command(extend) cli.add_command(daily_extend) cli.add_command(validate) cli.add_command(annotate_recombinants) diff --git a/sc2ts/metadata.py b/sc2ts/metadata.py index 7e88df8..4a48fe7 100644 --- a/sc2ts/metadata.py +++ b/sc2ts/metadata.py @@ -33,6 +33,7 @@ class MetadataDb: def __init__(self, path): uri = f"file:{path}" uri += "?mode=ro" + self.uri = uri self.conn = sqlite3.connect(uri, uri=True) self.conn.row_factory = dict_factory @@ -42,6 +43,15 @@ def __enter__(self): def __exit__(self, type, value, traceback): self.close() + def __str__(self): + return f"MetadataDb at {self.uri} contains {len(self)} sequences" + + def __len__(self): + sql = "SELECT COUNT(*) FROM samples" + with self.conn: + row = self.conn.execute(sql).fetchone() + return row["COUNT(*)"] + def close(self): self.conn.close()