Skip to content

Commit

Permalink
Merge pull request #16 from martinghunt/add_sample_name_option
Browse files Browse the repository at this point in the history
Add sample name option
  • Loading branch information
martinghunt authored Oct 15, 2021
2 parents 7607dbb + 9281047 commit c2fa4a8
Show file tree
Hide file tree
Showing 11 changed files with 100 additions and 13 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ The FASTA and JSON files in those commands can be found in the `data/`
directory of this repository.

Other options:
* `--sample_name MY_NAME`: use this to change the sample name
(default is "sample") that is put in the final FASTA file, BAM file, and
VCF file.
* `--keep_bam`: use this option to keep the BAM file of original input reads
mapped to the reference genome.
* `--force`: use with caution - it will overwrite the output directory if
Expand Down
6 changes: 6 additions & 0 deletions tests/data/utils/set_sample_name_in_vcf_file.expect.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
##fileformat=VCFv4.2
##INFO=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NEW_NAME
ref.1 1 . A G 61 . DP=42 GT 1/1
ref.1 5 . C T 62 . DP=43 GT 0/0
6 changes: 6 additions & 0 deletions tests/data/utils/set_sample_name_in_vcf_file.in.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
##fileformat=VCFv4.2
##INFO=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT OLD_NAME
ref.1 1 . A G 61 . DP=42 GT 1/1
ref.1 5 . C T 62 . DP=43 GT 0/0
2 changes: 2 additions & 0 deletions tests/data/utils/set_seq_name_in_fasta_file.expect.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>NEW_NAME
ACGT
2 changes: 2 additions & 0 deletions tests/data/utils/set_seq_name_in_fasta_file.in.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>OLD_NAME
ACGT
20 changes: 20 additions & 0 deletions tests/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,23 @@ def test_load_amplicons_bed_file():
]
infile = os.path.join(data_dir, "load_amplicons_bed_file.bed")
assert expect == utils.load_amplicons_bed_file(infile)


def test_set_sample_name_in_vcf_file():
infile = os.path.join(data_dir, "set_sample_name_in_vcf_file.in.vcf")
tmp_out = "tmp.set_sample_name_in_vcf_file.vcf"
subprocess.check_output(f"rm -f {tmp_out}", shell=True)
utils.set_sample_name_in_vcf_file(infile, tmp_out, "NEW_NAME")
expect = os.path.join(data_dir, "set_sample_name_in_vcf_file.expect.vcf")
assert filecmp.cmp(tmp_out, expect, shallow=False)
os.unlink(tmp_out)


def test_set_seq_name_in_fasta_file():
infile = os.path.join(data_dir, "set_seq_name_in_fasta_file.in.fasta")
tmp_out = "tmp.set_seq_name_in_fasta_file.fasta"
subprocess.check_output(f"rm -f {tmp_out}", shell=True)
utils.set_seq_name_in_fasta_file(infile, tmp_out, "NEW_NAME")
expect = os.path.join(data_dir, "set_seq_name_in_fasta_file.expect.fasta")
assert filecmp.cmp(tmp_out, expect, shallow=False)
os.unlink(tmp_out)
6 changes: 6 additions & 0 deletions viridian_workflow/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,12 @@ def main(args=None):
help="Target coverage for amplicon depth normalisation [%(default)s]",
metavar="INT",
)
subparser_run_one_sample.add_argument(
"--sample_name",
default="sample",
help="Name of sample to put in header of final FASTA, VCF, and BAM files [%(default)s]",
metavar="STRING",
)

subparser_run_one_sample.set_defaults(
func=viridian_workflow.tasks.run_one_sample.run
Expand Down
16 changes: 10 additions & 6 deletions viridian_workflow/minimap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,21 @@
from viridian_workflow.utils import run_process, check_file


def run_se(outdir, ref_genome, fq, prefix=None, threads=1):
def run_se(outdir, ref_genome, fq, prefix=None, threads=1, sample_name="sample"):
"""map single fastq ONT reads with minimap2
"""
bam = os.path.join(outdir, "reference_mapped.bam")
if prefix:
bam = os.path.join(outdir, f"{prefix}-reference_mapped.bam")

minimap_cmd = ["minimap2", "-t", str(threads), "-ax", "map-ont", ref_genome, fq]
# Note: was getting issues with escaping the tab characters in the -R
# option. In the end got it to work by giving Popen a string instead of
# a list and setting shell=True (using a list and shell=False failed using
# exactly the same minimap_cmd contents).
minimap_cmd = ["minimap2", f"-R '@RG\\tID:1\\tSM:{sample_name}'", "-t", str(threads), "-ax", "map-ont", ref_genome, fq]
sort_cmd = ["samtools", "sort", "-o", bam]

map_proc = subprocess.Popen(minimap_cmd, stdout=subprocess.PIPE)
map_proc = subprocess.Popen(" ".join(minimap_cmd), shell=True, stdout=subprocess.PIPE)
sort_proc = subprocess.Popen(sort_cmd, stdin=map_proc.stdout)
sort_proc.wait()
check_file(bam)
Expand All @@ -25,15 +29,15 @@ def run_se(outdir, ref_genome, fq, prefix=None, threads=1):
return bam


def run(outdir, ref_genome, fq1, fq2, prefix=None, threads=1):
def run(outdir, ref_genome, fq1, fq2, prefix=None, threads=1, sample_name="sample"):
bam = os.path.join(outdir, "reference_mapped.bam")
if prefix:
bam = os.path.join(outdir, f"{prefix}-reference_mapped.bam")

minimap_cmd = ["minimap2", "-t", str(threads), "-ax", "sr", ref_genome, fq1, fq2]
minimap_cmd = ["minimap2", f"-R '@RG\\tID:1\\tSM:{sample_name}'", "-t", str(threads), "-ax", "sr", ref_genome, fq1, fq2]
sort_cmd = ["samtools", "sort", "-o", bam]

map_proc = subprocess.Popen(minimap_cmd, stdout=subprocess.PIPE)
map_proc = subprocess.Popen(" ".join(minimap_cmd), shell=True, stdout=subprocess.PIPE)
sort_proc = subprocess.Popen(sort_cmd, stdin=map_proc.stdout)
sort_proc.wait()
check_file(bam)
Expand Down
31 changes: 24 additions & 7 deletions viridian_workflow/one_sample_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
check_file,
run_process,
rm,
set_sample_name_in_vcf_file,
set_seq_name_in_fasta_file,
)


Expand Down Expand Up @@ -49,6 +51,7 @@ def run_one_sample(
keep_intermediate=False,
keep_bam=False,
target_sample_depth=1000,
sample_name="sample",
):
check_tech(tech)
if tech == "ont":
Expand All @@ -66,9 +69,11 @@ def run_one_sample(
amplicon_json, amplicon_bed
)
if paired:
all_reads_bam = minimap.run(outdir, ref_genome, fq1, fq2)
all_reads_bam = minimap.run(
outdir, ref_genome, fq1, fq2, sample_name=sample_name
)
else:
all_reads_bam = minimap.run_se(outdir, ref_genome, fq1)
all_reads_bam = minimap.run_se(outdir, ref_genome, fq1, sample_name=sample_name)
sample_outprefix = os.path.join(outdir, "sample")
sampler = sample_reads.sample_reads(
ref_genome,
Expand All @@ -91,21 +96,33 @@ def run_one_sample(
varifier_out = os.path.join(outdir, "varifier")
if paired:
self_map = minimap.run(
outdir, assembly, sampler.fq_out1, sampler.fq_out2, prefix="self_qc"
outdir,
assembly,
sampler.fq_out1,
sampler.fq_out2,
prefix="self_qc",
sample_name=sample_name,
)
else:
self_map = minimap.run_se(outdir, assembly, sampler.fq_out, prefix="self_qc")
self_map = minimap.run_se(
outdir, assembly, sampler.fq_out, prefix="self_qc", sample_name=sample_name
)

masked_fasta = qcovid.self_qc(outdir, assembly, self_map)
final_masked_fasta = os.path.join(outdir, "consensus.masked.fa")
set_seq_name_in_fasta_file(masked_fasta, final_masked_fasta, sample_name)

vcf = varifier.run(
varifier_vcf = varifier.run(
varifier_out,
ref_genome,
masked_fasta,
final_masked_fasta,
min_coord=amplicons_start,
max_coord=amplicons_end,
)
check_file(vcf)
check_file(varifier_vcf)
final_vcf = os.path.join(outdir, "variants.vcf")
set_sample_name_in_vcf_file(varifier_vcf, final_vcf, sample_name)
check_file(final_vcf)

# clean up intermediate files
if not keep_intermediate:
Expand Down
1 change: 1 addition & 0 deletions viridian_workflow/tasks/run_one_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,5 @@ def run(options):
keep_intermediate=options.debug,
keep_bam=options.keep_bam,
target_sample_depth=options.target_sample_depth,
sample_name=options.sample_name,
)
20 changes: 20 additions & 0 deletions viridian_workflow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,23 @@ def load_single_seq_fasta(infile):
ref = list(d.values())[0]
ref.id = ref.id.split()[0]
return ref


def set_sample_name_in_vcf_file(infile, outfile, sample_name):
with open(infile) as f_in, open(outfile, "w") as f_out:
for line in f_in:
if line.startswith("#CHROM\tPOS"):
fields = line.rstrip().split("\t")
fields[-1] = sample_name
print(*fields, sep="\t", file=f_out)
else:
print(line, end="", file=f_out)


def set_seq_name_in_fasta_file(infile, outfile, new_name):
"""Changes name in FASTA file. Assumes that there is only one sequence
in the file, raises Exception if it finds >1 sequence"""
seq = load_single_seq_fasta(infile)
seq.id = new_name
with open(outfile, "w") as f:
print(seq, file=f)

0 comments on commit c2fa4a8

Please sign in to comment.