Merge pull request #129 from broadinstitute/dp-one-barcode

fixes for single-barcode demultiplexing
broadinstitute · Apr 29, 2015 · aeeecbf
2 parents 1ba2dc4 + 00e34df
commit aeeecbf
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 14 deletions.
diff --git a/broad_utils.py b/broad_utils.py
@@ -119,20 +119,23 @@ def main_get_all_names(args) :
 
 def make_barcodes_file(inFile, outFile):
     'Create input file for extract_barcodes'
-    header = ['barcode_name', 'library_name', 'barcode_sequence_1', 'barcode_sequence_2']
+    if any(row.get('barcode_2') for row in util.file.read_tabfile_dict(inFile)):
+        header = ['barcode_name', 'library_name', 'barcode_sequence_1', 'barcode_sequence_2']
+    else:
+        header = ['barcode_name', 'library_name', 'barcode_sequence_1']
     with open(outFile, 'wt') as outf:
         outf.write('\t'.join(header)+'\n')
         for row in util.file.read_tabfile_dict(inFile):
             out  = {'barcode_sequence_1':row['barcode_1'],
-                    'barcode_sequence_2':row['barcode_2'],
+                    'barcode_sequence_2':row.get('barcode_2',''),
                     'barcode_name':row['sample'],
                     'library_name':row['sample']}
             if row.get('library_id_per_sample'):
                 out['library_name'] += '.l' + row['library_id_per_sample']
             outf.write('\t'.join(out[h] for h in header)+'\n')
 def parser_make_barcodes_file(parser=argparse.ArgumentParser()):
     parser.add_argument('inFile',
-        help='''Input tab file w/header and 3-5 named columns (last two are optional):
+        help='''Input tab file w/header and 2-5 named columns (last three are optional):
                 sample, barcode_1, barcode_2, library_id_per_sample, run_id_per_library''')
     parser.add_argument('outFile', help='Output BARCODE_FILE file for Picard.')
     util.cmd.attach_main(parser, make_barcodes_file, split_args=True)
@@ -182,14 +185,17 @@ def main_extract_barcodes(args):
 
 def make_params_file(inFile, bamDir, outFile):
     'Create input file for illumina_basecalls'
-    header = ['BARCODE_1', 'BARCODE_2', 'OUTPUT', 'SAMPLE_ALIAS', 'LIBRARY_NAME']
+    if any(row.get('barcode_2') for row in util.file.read_tabfile_dict(inFile)):
+        header = ['OUTPUT', 'SAMPLE_ALIAS', 'LIBRARY_NAME', 'BARCODE_1', 'BARCODE_2']
+    else:
+        header = ['OUTPUT', 'SAMPLE_ALIAS', 'LIBRARY_NAME', 'BARCODE_1']
     with open(outFile, 'wt') as outf:
         outf.write('\t'.join(header)+'\n')
         rows = list(util.file.read_tabfile_dict(inFile))
         rows.append({'barcode_1':'N','barcode_2':'N','sample':'Unmatched'})
         for row in rows:
             out  = {'BARCODE_1':row['barcode_1'],
-                    'BARCODE_2':row['barcode_2'],
+                    'BARCODE_2':row.get('barcode_2',''),
                     'SAMPLE_ALIAS':row['sample'],
                     'LIBRARY_NAME':row['sample']}
             if row.get('library_id_per_sample'):

diff --git a/pipes/rules/demux.rules b/pipes/rules/demux.rules
@@ -126,7 +126,11 @@ rule illumina_basecalls:
             lane = get_one_lane_from_run(wildcards.flowcell, wildcards.lane, config['seqruns_demux'])
             dir = lane['bustard_dir']
             run_date = lane.get('seq_run_date')
-            shell("{config[binDir]}/broad_utils.py illumina_basecalls {dir} {input[1]} {wildcards.flowcell} {wildcards.lane} {input[0]} --include_non_pf_reads=false --run_start_date={run_date} --sequencing_center={params.center}")
+            opts = ''
+            for opt in ('read_structure',):
+                if lane.get(opt):
+                    opts += ' --%s=%s' % (opt, lane[opt])
+            shell("{config[binDir]}/broad_utils.py illumina_basecalls {dir} {input[1]} {wildcards.flowcell} {wildcards.lane} {input[0]} --include_non_pf_reads=false --run_start_date={run_date} --sequencing_center={params.center} {opts}")
 
 def demux_move_bams_inputs(wildcards):
     lane = get_one_lane_from_run(wildcards.flowcell, wildcards.lane, config.get('seqruns_demux',''))
@@ -143,15 +147,15 @@ rule move_bams_demux:
             shutil.move(input[0], output[0])
 
 rule bams_from_fastq:
-    input:  os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_R1_{idx}.fastq'),
-            os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_R2_{idx}.fastq')
-    output: os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_{idx}.bam')
+    input:  os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_L001_R1_001.fastq.gz'),
+            os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_L001_R2_001.fastq.gz')
+    output: os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}.bam')
     params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
-            logid="{sample}_{idx}",
+            logid="{sample}",
             center=config["seq_center"]
     run:
             makedirs(os.path.join(config['dataDir'], config['subdirs']['source']))
-            shell("{config[binDir]}/read_utils.py fastq_to_bam {input} {output} --sampleName {wildcards.sample} --picardOptions PLATFORM=illumina SEQUENCING_CENTER={params.center} LIBRARY_NAME={wildcards.sample}_{wildcards.idx} SORT_ORDER=queryname")
+            shell("{config[binDir]}/read_utils.py fastq_to_bam {input} {output} --sampleName {wildcards.sample} --picardOptions PLATFORM=illumina SEQUENCING_CENTER={params.center} LIBRARY_NAME={wildcards.sample} SORT_ORDER=queryname")
 
 ruleorder: move_bams_demux > bams_from_fastq
 
diff --git a/tools/picard.py b/tools/picard.py
@@ -157,8 +157,8 @@ class ExtractIlluminaBarcodesTool(PicardTools) :
     subtoolName = 'ExtractIlluminaBarcodes'
     jvmMemDefault = '8g'
     defaults = {'read_structure':'101T8B8B101T',
-        'max_mismatches':1, 'minimum_base_quality':15,
-        'num_processors':4}
+        'max_mismatches':0, 'minimum_base_quality':25,
+        'num_processors':8}
     option_list = ('read_structure', 'max_mismatches', 'minimum_base_quality',
         'min_mismatch_delta', 'max_no_calls', 'minimum_quality',
         'compress_outputs', 'num_processors')
@@ -190,7 +190,7 @@ class IlluminaBasecallsToSamTool(PicardTools) :
     defaults = {'read_structure':'101T8B8B101T', 'sequencing_center':'BI',
         'adapters_to_check': ('PAIRED_END', 'NEXTERA_V1', 'NEXTERA_V2'),
         'max_reads_in_ram_per_tile':100000, 'max_records_in_ram':100000,
-        'num_processors':4, 'force_gc':False}
+        'num_processors':8, 'force_gc':False}
     option_list = ('read_structure', 'sequencing_center', 'adapters_to_check',
         'platform', 'max_reads_in_ram_per_tile', 'max_records_in_ram', 'num_processors',
         'apply_eamss_filter', 'force_gc', 'first_tile', 'tile_limit',