Merge pull request #167 from jeromekelleher/cli-tidy-up

CLI tidyups
jeromekelleher · May 31, 2023 · 76cae00 · 76cae00
2 parents e47002b + 51ace6c
commit 76cae00
Show file tree

Hide file tree

Showing 4 changed files with 155 additions and 77 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,115 @@
 # sc2ts
 Infer a succinct tree sequence from SARS-COV-2 variation data
 
-** This is an early version not intended for production use!!**
+**This is an early version not intended for production use!!**
+
+If you are interested in helping to develop sc2ts or would like to 
+work with the inferred ARGS, please get in touch.
+
+## Installation
+
+To run the downstream analysis utilties, install from pip using
+
+```
+python3 -m pip install sc2ts[analysis]
+```
+
+This installs matplotlib and some other heavyweight dependencies.
+
+For just running the inference tools, use
+
+```
+python3 -m pip install sc2ts
+```
+
+## Inference workflow
+
+### Command line inference
+
+Inference is intended to be run from the command-line primarily,
+and most likely orchestrated via a shell script or Snakemake file, etc.
+
+The CLI is split into subcommands. Get help by running the CLI without arguments:
+
+```
+python3 -m sc2ts
+```
+
+### Import metadata to local database
+
+Metadata for all samples must be available, and provided in a tab-separated
+file. We need to convert from a standard text file to a SQLite database
+so that we can quickly search for strains collected on  a given day, without
+loading the entire set each time.
+
+```
+python3 -m sc2ts import-metadata data/metadata.tsv data/metadata.db
+```
+
+**TODO: Document required fields**
+
+### Import alignments
+
+To provide fast access to the individual alignments, we store them in a local
+database file. These must be imported before inference can be performed.
+
+The basic approach is to use the ``import-alignments`` command, with a
+path to a ``alignments.db`` file which we are creating, and one or more
+FASTA files that we are importing into it.
+
+```bash
+python3 -m sc2ts import-alignments data/alignments.db data/alignments/.fasta
+```
+
+By default the database file is updated each time, so this can be done
+in stages.
+
+**TODO discuss the storage and time requirements for this step!**
+
+
+### Run the inference
+
+The basic approach is to run the ``daily-extend`` command which runs the
+basic extension operation day-by-day using the information
+in the metadata DB.
+
+```
+python3 -m sc2ts daily-extend data/alignments.db data/metadata.db results/output-prefix
+```
+
+### Example run script
+
+Here is a script used to run the inference for the Long ARG
+in the preprint:
+
+```
+#!/bin/bash
+set -e
+
+precision=12
+mismatches=3
+max_submission_delay=30
+max_daily_samples=1000
+num_threads=40
+
+datadir=data
+run_id=upgma-mds-$max_daily_samples-md-$max_submission_delay-mm-$mismatches
+resultsdir=results/$run_id
+results_prefix=$resultsdir/$run_id-
+logfile=logs/$run_id.log
+
+options="--num-threads $num_threads -vv -l $logfile "
+options+="--max-submission-delay $max_submission_delay "
+options+="--max-daily-samples $max_daily_samples "
+options+="--precision $precision --num-mismatches $mismatches"
+
+mkdir -p $resultsdir
+
+alignments=$datadir/alignments2.db
+metadata=$datadir/metadata.filtered.db
+# NOTE: we can start from a given data also with the -b option
+# basets="$results_prefix"2022-01-24.ts
+# options+=" -b $basets"
+
+python3 -m sc2ts daily-extend $alignments $metadata $results_prefix $options
+```
diff --git a/sc2ts/alignments.py b/sc2ts/alignments.py
@@ -93,7 +93,7 @@ def close(self):
         self.env.close()
 
     def __str__(self):
-        return str(self.env)
+        return f"AlignmentStore at {self.env.path()} contains {len(self)} alignments"
 
     @staticmethod
     def initialise(path):

diff --git a/sc2ts/cli.py b/sc2ts/cli.py
@@ -77,15 +77,18 @@ def setup_logging(verbosity, log_file=None):
     daiquiri.setup(level=log_level, outputs=outputs, set_excepthook=False)
 
 
+# TODO add options to list keys, dump specific alignments etc
 @click.command()
-# FIXME this isn't checking for existing!
-@click.argument("store", type=click.Path(dir_okay=True, exists=False, file_okay=False))
+@click.argument("store", type=click.Path(exists=True, dir_okay=False))
 @click.option("-v", "--verbose", count=True)
 @click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
-def init_alignment_store(store, verbose, log_file):
+def info_alignments(store, verbose, log_file):
+    """
+    Information about an alignment store
+    """
     setup_logging(verbose, log_file)
-    # provenance = get_provenance_dict()
-    sc2ts.AlignmentStore.initialise(store)
+    with sc2ts.AlignmentStore(store) as alignment_store:
+        print(alignment_store)
 
 
 @click.command()
@@ -96,6 +99,9 @@ def init_alignment_store(store, verbose, log_file):
 @click.option("-v", "--verbose", count=True)
 @click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
 def import_alignments(store, fastas, initialise, no_progress, verbose, log_file):
+    """
+    Import the alignments from all FASTAS into STORE.
+    """
     setup_logging(verbose, log_file)
     if initialise:
         a = sc2ts.AlignmentStore.initialise(store)
@@ -120,79 +126,25 @@ def import_metadata(metadata, db, verbose):
     sc2ts.MetadataDb.import_csv(metadata, db)
 
 
-def add_provenance(ts, output_file):
-    # Record provenance here because this is where the arguments are provided.
-    provenance = get_provenance_dict()
-    tables = ts.dump_tables()
-    tables.provenances.add_row(json.dumps(provenance))
-    tables.dump(output_file)
-
-
 @click.command()
-@click.argument("output")
-@click.option("-v", "--verbose", count=True)
-def init(output, verbose):
-    """
-    Creates the initial tree sequence containing the reference sequence.
-    """
-    setup_logging(verbose)
-    ts = inference.initial_ts()
-    add_provenance(ts, output)
-
-
-@click.command()
-@click.argument("alignments", type=click.Path(exists=True, dir_okay=False))
 @click.argument("metadata", type=click.Path(exists=True, dir_okay=False))
-@click.argument("base", type=click.Path(dir_okay=False))
-@click.argument("output", type=click.Path(dir_okay=False))
-@click.argument("date")
-@click.option("--num-mismatches", default=None, type=float, help="num-mismatches")
-@click.option(
-    "--max-submission-delay",
-    default=None,
-    type=int,
-    help=(
-        "The maximum number of days between the sample and its submission date "
-        "for it to be included in the inference"
-    ),
-)
-@click.option("--num-threads", default=0, type=int, help="Number of match threads")
-@click.option("-p", "--precision", default=None, type=int, help="Match precision")
-@click.option("--no-progress", default=False, type=bool, help="Don't show progress")
 @click.option("-v", "--verbose", count=True)
 @click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
-def extend(
-    alignments,
-    metadata,
-    base,
-    output,
-    date,
-    num_mismatches,
-    max_submission_delay,
-    num_threads,
-    precision,
-    no_progress,
-    verbose,
-    log_file,
-):
+def info_metadata(metadata, verbose, log_file):
+    """
+    Information about a metadata DB
+    """
     setup_logging(verbose, log_file)
+    with sc2ts.MetadataDb(metadata) as metadata_db:
+        print(metadata_db)
 
-    with contextlib.ExitStack() as exit_stack:
-        alignment_store = exit_stack.enter_context(sc2ts.AlignmentStore(alignments))
-        metadata_db = exit_stack.enter_context(sc2ts.MetadataDb(metadata))
-        base_ts = tskit.load(base)
-        ts = inference.extend(
-            alignment_store=alignment_store,
-            metadata_db=metadata_db,
-            date=date,
-            base_ts=base_ts,
-            num_mismatches=num_mismatches,
-            max_submission_delay=max_submission_delay,
-            precision=precision,
-            num_threads=num_threads,
-            show_progress=not no_progress,
-        )
-        add_provenance(ts, output)
+
+def add_provenance(ts, output_file):
+    # Record provenance here because this is where the arguments are provided.
+    provenance = get_provenance_dict()
+    tables = ts.dump_tables()
+    tables.provenances.add_row(json.dumps(provenance))
+    tables.dump(output_file)
 
 
 @click.command()
@@ -249,6 +201,9 @@ def daily_extend(
     verbose,
     log_file,
 ):
+    """
+    Sequentially extend the trees by adding samples in daily batches.
+    """
     setup_logging(verbose, log_file)
     rng = random.Random(random_seed)
     if base is None:
@@ -281,6 +236,9 @@ def daily_extend(
 @click.argument("ts_file")
 @click.option("-v", "--verbose", count=True)
 def validate(alignment_db, ts_file, verbose):
+    """
+    Check that the specified trees correctly encode alignments for samples.
+    """
     setup_logging(verbose)
 
     if ts_file.endswith(".tsz"):
@@ -388,12 +346,11 @@ def cli():
     pass
 
 
-cli.add_command(init_alignment_store)
 cli.add_command(import_alignments)
 cli.add_command(import_metadata)
+cli.add_command(info_alignments)
+cli.add_command(info_metadata)
 
-cli.add_command(init)
-cli.add_command(extend)
 cli.add_command(daily_extend)
 cli.add_command(validate)
 cli.add_command(annotate_recombinants)
diff --git a/sc2ts/metadata.py b/sc2ts/metadata.py
@@ -33,6 +33,7 @@ class MetadataDb:
     def __init__(self, path):
         uri = f"file:{path}"
         uri += "?mode=ro"
+        self.uri = uri
         self.conn = sqlite3.connect(uri, uri=True)
         self.conn.row_factory = dict_factory
 
@@ -42,6 +43,15 @@ def __enter__(self):
     def __exit__(self, type, value, traceback):
         self.close()
 
+    def __str__(self):
+        return f"MetadataDb at {self.uri} contains {len(self)} sequences"
+
+    def __len__(self):
+        sql = "SELECT COUNT(*) FROM samples"
+        with self.conn:
+            row = self.conn.execute(sql).fetchone()
+            return row["COUNT(*)"]
+
     def close(self):
         self.conn.close()