CLI tidyups + README
Closes #109
jeromekelleher committed May 31, 2023
1 parent e47002b commit 51ace6c
Showing 4 changed files with 155 additions and 77 deletions.
113 changes: 112 additions & 1 deletion README.md
@@ -1,4 +1,115 @@
# sc2ts
Infer a succinct tree sequence from SARS-CoV-2 variation data

** This is an early version not intended for production use!!**
**This is an early version not intended for production use!!**

If you are interested in helping to develop sc2ts or would like to
work with the inferred ARGs, please get in touch.

## Installation

To run the downstream analysis utilities, install via pip using

```
python3 -m pip install 'sc2ts[analysis]'
```

This installs matplotlib and some other heavyweight dependencies.

For just running the inference tools, use

```
python3 -m pip install sc2ts
```

## Inference workflow

### Command line inference

Inference is primarily intended to be run from the command line, most likely
orchestrated via a shell script, Snakemake file, or similar (see the example
script below).

The CLI is split into subcommands. Get help by running the CLI without arguments:

```
python3 -m sc2ts
```

### Import metadata to local database

Metadata for all samples must be available, provided as a tab-separated
file. We convert this text file to a SQLite database so that we can quickly
search for strains collected on a given day without loading the entire set
each time.

```
python3 -m sc2ts import-metadata data/metadata.tsv data/metadata.db
```

**TODO: Document required fields**
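
As a purely illustrative sketch (the field names ``strain``, ``date``, and
``date_submitted`` are assumptions based on the CLI options such as
``--max-submission-delay``, not a documented schema), the input TSV might
look something like:

```
strain	date	date_submitted
hCoV-19/Example/1/2021	2021-01-15	2021-01-20
hCoV-19/Example/2/2021	2021-01-16	2021-01-18
```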

### Import alignments

To provide fast access to the individual alignments, we store them in a local
database file. These must be imported before inference can be performed.

The basic approach is to use the ``import-alignments`` command, with a
path to an ``alignments.db`` file which we are creating, and one or more
FASTA files to import into it.

```bash
python3 -m sc2ts import-alignments data/alignments.db data/alignments/*.fasta
```

By default the database file is updated each time, so this can be done
in stages.
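
For example, a minimal sketch of a staged import (the batch file paths here
are illustrative):

```bash
# Import alignments in batches; the database file is updated in place,
# so each run adds to what is already there.
for f in data/alignments/batch-*.fasta; do
    python3 -m sc2ts import-alignments data/alignments.db "$f"
done
```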

**TODO discuss the storage and time requirements for this step!**


### Run the inference

The basic approach is to run the ``daily-extend`` command, which performs
the extension operation day by day using the information in the
metadata DB.

```
python3 -m sc2ts daily-extend data/alignments.db data/metadata.db results/output-prefix
```

### Example run script

Here is a script used to run the inference for the Long ARG
in the preprint:

```bash
#!/bin/bash
set -e
precision=12
mismatches=3
max_submission_delay=30
max_daily_samples=1000
num_threads=40
datadir=data
run_id=upgma-mds-$max_daily_samples-md-$max_submission_delay-mm-$mismatches
resultsdir=results/$run_id
results_prefix=$resultsdir/$run_id-
logfile=logs/$run_id.log
options="--num-threads $num_threads -vv -l $logfile "
options+="--max-submission-delay $max_submission_delay "
options+="--max-daily-samples $max_daily_samples "
options+="--precision $precision --num-mismatches $mismatches"
mkdir -p $resultsdir logs
alignments=$datadir/alignments2.db
metadata=$datadir/metadata.filtered.db
# NOTE: we can also start from a given date with the -b option
# basets="$results_prefix"2022-01-24.ts
# options+=" -b $basets"
python3 -m sc2ts daily-extend $alignments $metadata $results_prefix $options
```
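
Each day's output is written as a tree sequence named with the results prefix
followed by the date (as in the commented ``-b`` example above). These are
standard [tskit](https://tskit.dev) tree sequences; a minimal sketch of
inspecting one, assuming the run above has completed:

```python
import tskit

# Path is illustrative: outputs follow the <results_prefix><date>.ts
# naming pattern used by the script above.
ts = tskit.load(
    "results/upgma-mds-1000-md-30-mm-3/upgma-mds-1000-md-30-mm-3-2022-01-24.ts"
)
print(ts.num_samples, "samples,", ts.num_trees, "trees")
```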
2 changes: 1 addition & 1 deletion sc2ts/alignments.py
@@ -93,7 +93,7 @@ def close(self):
self.env.close()

def __str__(self):
return str(self.env)
return f"AlignmentStore at {self.env.path()} contains {len(self)} alignments"

@staticmethod
def initialise(path):
107 changes: 32 additions & 75 deletions sc2ts/cli.py
Expand Up @@ -77,15 +77,18 @@ def setup_logging(verbosity, log_file=None):
daiquiri.setup(level=log_level, outputs=outputs, set_excepthook=False)


# TODO add options to list keys, dump specific alignments etc
@click.command()
# FIXME this isn't checking for existing!
@click.argument("store", type=click.Path(dir_okay=True, exists=False, file_okay=False))
@click.argument("store", type=click.Path(exists=True, dir_okay=False))
@click.option("-v", "--verbose", count=True)
@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
def init_alignment_store(store, verbose, log_file):
def info_alignments(store, verbose, log_file):
"""
Information about an alignment store
"""
setup_logging(verbose, log_file)
# provenance = get_provenance_dict()
sc2ts.AlignmentStore.initialise(store)
with sc2ts.AlignmentStore(store) as alignment_store:
print(alignment_store)


@click.command()
@@ -96,6 +99,9 @@ def init_alignment_store(store, verbose, log_file):
@click.option("-v", "--verbose", count=True)
@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
def import_alignments(store, fastas, initialise, no_progress, verbose, log_file):
"""
Import the alignments from all FASTAS into STORE.
"""
setup_logging(verbose, log_file)
if initialise:
a = sc2ts.AlignmentStore.initialise(store)
@@ -120,79 +126,25 @@ def import_metadata(metadata, db, verbose):
sc2ts.MetadataDb.import_csv(metadata, db)


def add_provenance(ts, output_file):
# Record provenance here because this is where the arguments are provided.
provenance = get_provenance_dict()
tables = ts.dump_tables()
tables.provenances.add_row(json.dumps(provenance))
tables.dump(output_file)


@click.command()
@click.argument("output")
@click.option("-v", "--verbose", count=True)
def init(output, verbose):
"""
Creates the initial tree sequence containing the reference sequence.
"""
setup_logging(verbose)
ts = inference.initial_ts()
add_provenance(ts, output)


@click.command()
@click.argument("alignments", type=click.Path(exists=True, dir_okay=False))
@click.argument("metadata", type=click.Path(exists=True, dir_okay=False))
@click.argument("base", type=click.Path(dir_okay=False))
@click.argument("output", type=click.Path(dir_okay=False))
@click.argument("date")
@click.option("--num-mismatches", default=None, type=float, help="num-mismatches")
@click.option(
"--max-submission-delay",
default=None,
type=int,
help=(
"The maximum number of days between the sample and its submission date "
"for it to be included in the inference"
),
)
@click.option("--num-threads", default=0, type=int, help="Number of match threads")
@click.option("-p", "--precision", default=None, type=int, help="Match precision")
@click.option("--no-progress", default=False, type=bool, help="Don't show progress")
@click.option("-v", "--verbose", count=True)
@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
def extend(
alignments,
metadata,
base,
output,
date,
num_mismatches,
max_submission_delay,
num_threads,
precision,
no_progress,
verbose,
log_file,
):
def info_metadata(metadata, verbose, log_file):
"""
Information about a metadata DB
"""
setup_logging(verbose, log_file)
with sc2ts.MetadataDb(metadata) as metadata_db:
print(metadata_db)

with contextlib.ExitStack() as exit_stack:
alignment_store = exit_stack.enter_context(sc2ts.AlignmentStore(alignments))
metadata_db = exit_stack.enter_context(sc2ts.MetadataDb(metadata))
base_ts = tskit.load(base)
ts = inference.extend(
alignment_store=alignment_store,
metadata_db=metadata_db,
date=date,
base_ts=base_ts,
num_mismatches=num_mismatches,
max_submission_delay=max_submission_delay,
precision=precision,
num_threads=num_threads,
show_progress=not no_progress,
)
add_provenance(ts, output)

def add_provenance(ts, output_file):
# Record provenance here because this is where the arguments are provided.
provenance = get_provenance_dict()
tables = ts.dump_tables()
tables.provenances.add_row(json.dumps(provenance))
tables.dump(output_file)


@click.command()
@@ -249,6 +201,9 @@ def daily_extend(
verbose,
log_file,
):
"""
Sequentially extend the trees by adding samples in daily batches.
"""
setup_logging(verbose, log_file)
rng = random.Random(random_seed)
if base is None:
@@ -281,6 +236,9 @@ def daily_extend(
@click.argument("ts_file")
@click.option("-v", "--verbose", count=True)
def validate(alignment_db, ts_file, verbose):
"""
Check that the specified trees correctly encode alignments for samples.
"""
setup_logging(verbose)

if ts_file.endswith(".tsz"):
@@ -388,12 +346,11 @@ def cli():
pass


cli.add_command(init_alignment_store)
cli.add_command(import_alignments)
cli.add_command(import_metadata)
cli.add_command(info_alignments)
cli.add_command(info_metadata)

cli.add_command(init)
cli.add_command(extend)
cli.add_command(daily_extend)
cli.add_command(validate)
cli.add_command(annotate_recombinants)
10 changes: 10 additions & 0 deletions sc2ts/metadata.py
@@ -33,6 +33,7 @@ class MetadataDb:
def __init__(self, path):
uri = f"file:{path}"
uri += "?mode=ro"
self.uri = uri
self.conn = sqlite3.connect(uri, uri=True)
self.conn.row_factory = dict_factory

@@ -42,6 +43,15 @@ def __enter__(self):
def __exit__(self, type, value, traceback):
self.close()

def __str__(self):
return f"MetadataDb at {self.uri} contains {len(self)} sequences"

def __len__(self):
sql = "SELECT COUNT(*) FROM samples"
with self.conn:
row = self.conn.execute(sql).fetchone()
return row["COUNT(*)"]

def close(self):
self.conn.close()

