Skip to content

Commit

Permalink
Merge pull request #17 from martinghunt/output_file_tidying
Browse files Browse the repository at this point in the history
Output file tidying
  • Loading branch information
martinghunt authored Oct 15, 2021
2 parents c2fa4a8 + 0bf6d24 commit e2a314a
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 25 deletions.
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,20 @@ Other options:
it already exists.



## Output files

The default files in the output directory are:

* `consensus.fa`: a FASTA file of the consensus sequence.
* `variants.vcf`: a VCF file of the identified variants between the consensus
sequence and the reference genome.
* `log.json`: contains logging information for the viridian workflow run.

If the option `--keep_bam` is used, then a sorted BAM file of the reads mapped
to the reference will also be present, called
`reference_mapped.bam` (and its index file `reference_mapped.bam.bai`).

## Configuration

Quality control thresholds are configured in `viridian_workflow/config.ini`:
Expand Down
2 changes: 1 addition & 1 deletion tests/one_sample_pipeline_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def test_complete_assembly_no_reads_map(test_data):
# TODO specify that it was the consensus file that's missing
if str(error) != str(
os.path.abspath(
os.path.join(outdir, "viridian/consensus.final_assembly.fa")
os.path.join(outdir, "Processing/viridian/consensus.final_assembly.fa")
)
):
raise error
Expand Down
93 changes: 72 additions & 21 deletions viridian_workflow/one_sample_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,23 @@
import argparse
import datetime
import json
import logging
import socket
import subprocess
import sys
import os

from viridian_workflow import minimap, qcovid, sample_reads, varifier
from viridian_workflow.utils import (
amplicons_json_to_bed_and_range,
check_file,
load_json,
run_process,
rm,
set_sample_name_in_vcf_file,
set_seq_name_in_fasta_file,
)
from viridian_workflow import __version__ as viridian_wf_version


def check_tech(tech):
Expand Down Expand Up @@ -52,6 +60,7 @@ def run_one_sample(
keep_bam=False,
target_sample_depth=1000,
sample_name="sample",
command_line_args=None,
):
check_tech(tech)
if tech == "ont":
Expand All @@ -63,8 +72,38 @@ def run_one_sample(
else:
raise NotImplementedError(f"tech not implemented: {tech}")

logging.info(f"Start running viridian_workflow, output dir: {outdir}")
os.mkdir(outdir)
amplicon_bed = os.path.join(outdir, "amplicons.bed")
json_log = os.path.join(outdir, "log.json")
# Make a dict of the command line options to go in the JSON output file.
# The tests don't use argparse (they use Mock), which means convert to dict
# doesn't work. Don't care about that case anyway in the final output, so
# just set to None
if isinstance(command_line_args, argparse.Namespace):
options_dict = {k: v for k, v in vars(command_line_args).items() if k != "func"}
else:
options_dict = None
start_time = datetime.datetime.now()
log_info = {
"run_summary": {
"command": " ".join(sys.argv),
"options": options_dict,
"cwd": os.getcwd(),
"version": viridian_wf_version,
"finished_running": False,
"start_time": start_time.replace(microsecond=0).isoformat(),
"end_time": None,
"hostname": socket.gethostname(),
},
"read_sampling": None,
"viridian": None,
}
with open(json_log, "w") as f:
json.dump(log_info, f, indent=2)

processing_dir = os.path.join(outdir, "Processing")
os.mkdir(processing_dir)
amplicon_bed = os.path.join(processing_dir, "amplicons.bed")
amplicons_start, amplicons_end = amplicons_json_to_bed_and_range(
amplicon_json, amplicon_bed
)
Expand All @@ -74,7 +113,8 @@ def run_one_sample(
)
else:
all_reads_bam = minimap.run_se(outdir, ref_genome, fq1, sample_name=sample_name)
sample_outprefix = os.path.join(outdir, "sample")
logging.info("Mapping and sampling reads")
sample_outprefix = os.path.join(processing_dir, "sample_reads")
sampler = sample_reads.sample_reads(
ref_genome,
all_reads_bam,
Expand All @@ -83,20 +123,22 @@ def run_one_sample(
target_depth=target_sample_depth,
)
bam = sampler.bam_out
logging.info(f"Running QC on all mapped reads in {bam}")
if paired:
bad_amplicons = qcovid.bin_amplicons(outdir, ref_genome, amplicon_bed, bam)
bad_amplicons = qcovid.bin_amplicons(processing_dir, ref_genome, amplicon_bed, bam)
else:
bad_amplicons = qcovid.bin_amplicons_se(outdir, ref_genome, amplicon_bed, bam)
bad_amplicons = qcovid.bin_amplicons_se(processing_dir, ref_genome, amplicon_bed, bam)

viridian_out = os.path.join(outdir, "viridian")
logging.info("Making initial unmasked consensus using Viridian")
viridian_out = os.path.join(processing_dir, "viridian")
assembly = run_viridian(
tech, viridian_out, ref_genome, amplicon_json, bam, bad_amplicons
)

varifier_out = os.path.join(outdir, "varifier")
logging.info("Mapping reads to consensus from Viridian")
if paired:
self_map = minimap.run(
outdir,
processing_dir,
assembly,
sampler.fq_out1,
sampler.fq_out2,
Expand All @@ -105,13 +147,16 @@ def run_one_sample(
)
else:
self_map = minimap.run_se(
outdir, assembly, sampler.fq_out, prefix="self_qc", sample_name=sample_name
processing_dir, assembly, sampler.fq_out, prefix="self_qc", sample_name=sample_name
)

masked_fasta = qcovid.self_qc(outdir, assembly, self_map)
final_masked_fasta = os.path.join(outdir, "consensus.masked.fa")
logging.info("Running QC on Viridian consensus to make masked FASTA")
masked_fasta = qcovid.self_qc(processing_dir, assembly, self_map)
final_masked_fasta = os.path.join(outdir, "consensus.fa")
set_seq_name_in_fasta_file(masked_fasta, final_masked_fasta, sample_name)

logging.info("Making VCF file of variants")
varifier_out = os.path.join(processing_dir, "varifier")
varifier_vcf = varifier.run(
varifier_out,
ref_genome,
Expand All @@ -124,18 +169,24 @@ def run_one_sample(
set_sample_name_in_vcf_file(varifier_vcf, final_vcf, sample_name)
check_file(final_vcf)

log_info["read_sampling"] = load_json(f"{sample_outprefix}.json")
log_info["viridian"] = load_json(os.path.join(viridian_out, "run_info.json"))

# clean up intermediate files
if not keep_intermediate:
if not keep_bam:
if keep_bam:
logging.info(f"Keeping BAM file {all_reads_bam} because --keep_bam option used")
else:
rm(all_reads_bam)
rm(all_reads_bam + ".bai")
rm(amplicon_bed)
rm(self_map)
rm(self_map + ".bai")
rm(bam)
rm(bam + ".bai")
if paired:
rm(sampler.fq_out1)
rm(sampler.fq_out2)
else:
rm(sampler.fq_out)
logging.info(f"Removing processing directory {processing_dir}")
subprocess.check_output(f"rm -rf {processing_dir}", shell=True)

logging.info(f"Writing JSON log file {json_log}")
end_time = datetime.datetime.now()
log_info["run_summary"]["end_time"] = end_time.replace(microsecond=0).isoformat()
log_info["run_summary"]["run_time"] = str(end_time - start_time)
log_info["run_summary"]["finished_running"] = True
with open(json_log, "w") as f:
json.dump(log_info, f, indent=2)
logging.info(f"Finished running viridian_workflow, output dir: {outdir}")
7 changes: 7 additions & 0 deletions viridian_workflow/tasks/run_one_sample.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import logging
import subprocess
from viridian_workflow import one_sample_pipeline


Expand All @@ -13,6 +15,10 @@ def run(options):
fq1 = options.reads1
fq2 = options.reads2

if options.force:
logging.info(f"--force option used, so deleting {options.outdir} if it exists")
subprocess.check_output(f"rm -rf {options.outdir}", shell=True)

one_sample_pipeline.run_one_sample(
options.tech,
options.outdir,
Expand All @@ -24,4 +30,5 @@ def run(options):
keep_bam=options.keep_bam,
target_sample_depth=options.target_sample_depth,
sample_name=options.sample_name,
command_line_args=options,
)
9 changes: 6 additions & 3 deletions viridian_workflow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ def rm(fn):
os.remove(os.path.abspath(fn))


def load_json(infile):
    """Parse the JSON file at `infile` and return the resulting object."""
    with open(infile) as f_in:
        contents = f_in.read()
    return json.loads(contents)


def run_process(cmd, ignore_error=False, stdout=None):
logging.info(f"Running: {cmd}")
stdout_fd = subprocess.PIPE
Expand Down Expand Up @@ -54,9 +59,7 @@ def amplicons_json_to_bed_and_range(infile, outfile):
various stages of the pipeline. Returns the 0-based inclusive coordinates
of the start of the first amplicon and end of the last amplicon, as
a tuple (start, end)"""
with open(infile) as f:
data = json.load(f)

data = load_json(infile)
start = float("inf")
end = -1

Expand Down

0 comments on commit e2a314a

Please sign in to comment.