From 4b35e7b015b4b298fad1eecc4f21c3a0d7b27347 Mon Sep 17 00:00:00 2001 From: Stephen Nayfach Date: Sun, 20 Aug 2017 09:15:40 -0700 Subject: [PATCH] Alignment mode changes -switch back to global alignment in snps module; makes program consistent with versions <= 1.2.2 -genes module has always used local alignment, so no changes here -added option `-m` in run_midas.py snps|genes to choose between global/local read alignment -updated help text and examples to reflect these changes --- docs/cnvs.md | 1 + docs/snvs.md | 1 + midas/run/genes.py | 3 ++- midas/run/snps.py | 3 ++- scripts/run_midas.py | 29 ++++++++++++++++++++++------- 5 files changed, 28 insertions(+), 9 deletions(-) diff --git a/docs/cnvs.md b/docs/cnvs.md index 628a82e..3f15ab3 100644 --- a/docs/cnvs.md +++ b/docs/cnvs.md @@ -41,6 +41,7 @@ Read alignment options (if using --align): --interleaved FASTA/FASTQ file in -1 are paired and contain forward AND reverse reads -s {very-fast,fast,sensitive,very-sensitive} Alignment speed/sensitivity (very-sensitive) + -m {local,global} Global/local read alignment (local) -n MAX_READS # reads to use from input file(s) (use all) -t THREADS Number of threads to use (1) diff --git a/docs/snvs.md b/docs/snvs.md index e28286b..27b1ea6 100644 --- a/docs/snvs.md +++ b/docs/snvs.md @@ -41,6 +41,7 @@ Read alignment options (if using --align): --interleaved FASTA/FASTQ file in -1 are paired and contain forward AND reverse reads -s {very-fast,fast,sensitive,very-sensitive} Bowtie2 alignment speed/sensitivity (very-sensitive) + -m {local,global} Global/local read alignment (global) -n MAX_READS # reads to use from input file(s) (use all) -t THREADS Number of threads to use (1) diff --git a/midas/run/genes.py b/midas/run/genes.py index a277653..943fa67 100644 --- a/midas/run/genes.py +++ b/midas/run/genes.py @@ -120,7 +120,8 @@ def pangenome_align(args): command += '-x %s ' % '/'.join([args['outdir'], 'genes/temp/pangenomes']) # index if args['max_reads']: command += '-u %s ' % args['max_reads'] # max num of reads if args['trim']: command += '--trim3 %s ' % args['trim'] # trim 3' - command += '--%s-local ' % args['speed'] # speed/sensitivity + command += '--%s' % args['speed'] # alignment speed + command += '-local ' if args['mode'] == 'local' else ' ' # global/local alignment command += '--threads %s ' % args['threads'] # threads command += '-f ' if args['file_type'] == 'fasta' else '-q ' # input type if args['m2']: # -1 and -2 contain paired reads diff --git a/midas/run/snps.py b/midas/run/snps.py index bda3c71..d439945 100644 --- a/midas/run/snps.py +++ b/midas/run/snps.py @@ -102,7 +102,8 @@ def genome_align(args): command += '-x %s ' % '/'.join([args['outdir'], 'snps/temp/genomes']) # index if args['max_reads']: command += '-u %s ' % args['max_reads'] # max num of reads if args['trim']: command += '--trim3 %s ' % args['trim'] # trim 3' - command += '--%s-local ' % args['speed'] # speed/sensitivity + command += '--%s' % args['speed'] # alignment speed + command += '-local ' if args['mode'] == 'local' else ' ' # global/local alignment command += '--threads %s ' % args['threads'] command += '-f ' if args['file_type'] == 'fasta' else '-q ' # input type if args['m2']: # -1 and -2 contain paired reads diff --git a/scripts/run_midas.py b/scripts/run_midas.py index 2036bae..4ba0c27 100755 --- a/scripts/run_midas.py +++ b/scripts/run_midas.py @@ -210,7 +210,7 @@ def gene_arguments(): The pipeline can be broken down into the following steps: 1) build a database of pangenomes for abundance bacterial species - 2) map high-quality metagenomic reads to the database + 2) use local alignment to map high-quality metagenomic reads to the database 3) use mapped reads to quantify pangenome genes After completion, use 'merge_midas.py genes' to generate pangenome matrixes @@ -224,10 +224,13 @@ def gene_arguments(): 2) run entire pipeline for a specific species: run_midas.py genes /path/to/outdir --species_id Bacteroides_vulgatus_57955 -1 /path/to/reads_1.fq.gz -2 /path/to/reads_2.fq.gz -3) just align reads, use faster alignment, only use the first 10M reads, use 4 CPUs: +3) use global read alignment (default is local alignment with 75% minimum alignment coverage): +run_midas.py snps /path/to/outdir -1 /path/to/reads_1.fq.gz -2 /path/to/reads_2.fq.gz -m global + +4) just align reads, use faster alignment, only use the first 10M reads, use 4 CPUs: run_midas.py genes /path/to/outdir --align -1 /path/to/reads_1.fq.gz -2 /path/to/reads_2.fq.gz -s very-fast -n 10000000 -t 4 -4) just quantify genes, keep reads with >=95% alignment identity and reads with an average quality-score >=30: +5) just quantify genes, keep reads with >=95% alignment identity and reads with an average quality-score >=30: run_midas.py genes /path/to/outdir --call_genes --mapid 95 --readq 20 """) parser.add_argument('program', help=argparse.SUPPRESS) @@ -263,6 +266,9 @@ def gene_arguments(): align.add_argument('-s', type=str, dest='speed', default='very-sensitive', choices=['very-fast', 'fast', 'sensitive', 'very-sensitive'], help='Alignment speed/sensitivity (very-sensitive)') + align.add_argument('-m', type=str, dest='mode', default='local', + choices=['local', 'global'], + help='Global/local read alignment (local)') align.add_argument('-n', type=int, dest='max_reads', help='# reads to use from input file(s) (use all)') align.add_argument('-t', dest='threads', default=1, @@ -315,6 +321,7 @@ def print_gene_arguments(args): else: lines.append(" input reads (unpaired): %s" % args['m1']) lines.append(" alignment speed/sensitivity: %s" % args['speed']) + lines.append(" alignment mode: %s" % args['mode']) lines.append(" number of reads to use from input: %s" % (args['max_reads'] if args['max_reads'] else 'use all')) lines.append(" number of threads for database search: %s" % args['threads']) if args['cov']: @@ -337,7 +344,7 @@ def snp_arguments(): The pipeline can be broken down into the following steps: 1) build a database of genome sequences for abundant bacterial species (1 representative genome/species) - 2) map high-quality reads to the database + 2) use global alignment to map high-quality reads to the database 3) generate pileups and count variants at each genomic site After completion, use 'merge_midas.py snps' to perform multisample SNP calling @@ -351,10 +358,13 @@ def snp_arguments(): 2) run entire pipeline for a specific species: run_midas.py snps /path/to/outdir --species_id Bacteroides_vulgatus_57955 -1 /path/to/reads_1.fq.gz -2 /path/to/reads_2.fq.gz -3) just align reads, use faster alignment, only use the first 10M reads, use 4 CPUs: +3) use local read alignment and discard alignments that cover reads by < 75% (default is global alignment): +run_midas.py snps /path/to/outdir -1 /path/to/reads_1.fq.gz -2 /path/to/reads_2.fq.gz -m local --aln_cov 0.75 + +4) just align reads, use faster alignment, only use the first 10M reads, use 4 CPUs: run_midas.py snps /path/to/outdir --align -1 /path/to/reads_1.fq.gz -2 /path/to/reads_2.fq.gz -s very-fast -n 10000000 -t 4 -4) just count variants, keep reads with >=95% alignment identity and keep bases with quality-scores >=35: +5) just count variants, keep reads with >=95% alignment identity and keep bases with quality-scores >=35: run_midas.py snps /path/to/outdir --pileup --mapid 95 --baseq 35 """) parser.add_argument('program', help=argparse.SUPPRESS) @@ -391,7 +401,11 @@ def snp_arguments(): choices=['very-fast', 'fast', 'sensitive', 'very-sensitive'], help='Bowtie2 alignment speed/sensitivity (very-sensitive)') align.add_argument('-n', type=int, dest='max_reads', help='# reads to use from input file(s) (use all)') - align.add_argument('-t', dest='threads', default=1, help='Number of threads to use (1)') + align.add_argument('-m', type=str, dest='mode', default='global', + choices=['local', 'global'], + help='Global/local read alignment (global)') + align.add_argument('-t', dest='threads', default=1, + help='Number of threads to use (1)') snps = parser.add_argument_group('Pileup options (if using --pileup)') snps.add_argument('--mapid', type=float, metavar='FLOAT', default=94.0, help='Discard reads with alignment identity < MAPID (94.0)') @@ -448,6 +462,7 @@ def print_snp_arguments(args): else: lines.append(" input reads (unpaired): %s" % args['m1']) lines.append(" alignment speed/sensitivity: %s" % args['speed']) + lines.append(" alignment mode: %s" % args['mode']) lines.append(" number of reads to use from input: %s" % (args['max_reads'] if args['max_reads'] else 'use all')) lines.append(" number of threads for database search: %s" % args['threads']) if args['call']: