diff --git a/broad_utils.py b/broad_utils.py index 4d033276e..abb9b0a76 100755 --- a/broad_utils.py +++ b/broad_utils.py @@ -119,12 +119,15 @@ def main_get_all_names(args) : def make_barcodes_file(inFile, outFile): 'Create input file for extract_barcodes' - header = ['barcode_name', 'library_name', 'barcode_sequence_1', 'barcode_sequence_2'] + if any(row.get('barcode_2') for row in util.file.read_tabfile_dict(inFile)): + header = ['barcode_name', 'library_name', 'barcode_sequence_1', 'barcode_sequence_2'] + else: + header = ['barcode_name', 'library_name', 'barcode_sequence_1'] with open(outFile, 'wt') as outf: outf.write('\t'.join(header)+'\n') for row in util.file.read_tabfile_dict(inFile): out = {'barcode_sequence_1':row['barcode_1'], - 'barcode_sequence_2':row['barcode_2'], + 'barcode_sequence_2':row.get('barcode_2',''), 'barcode_name':row['sample'], 'library_name':row['sample']} if row.get('library_id_per_sample'): @@ -132,7 +135,7 @@ def make_barcodes_file(inFile, outFile): outf.write('\t'.join(out[h] for h in header)+'\n') def parser_make_barcodes_file(parser=argparse.ArgumentParser()): parser.add_argument('inFile', - help='''Input tab file w/header and 3-5 named columns (last two are optional): + help='''Input tab file w/header and 2-5 named columns (last three are optional): sample, barcode_1, barcode_2, library_id_per_sample, run_id_per_library''') parser.add_argument('outFile', help='Output BARCODE_FILE file for Picard.') util.cmd.attach_main(parser, make_barcodes_file, split_args=True) @@ -182,14 +185,17 @@ def main_extract_barcodes(args): def make_params_file(inFile, bamDir, outFile): 'Create input file for illumina_basecalls' - header = ['BARCODE_1', 'BARCODE_2', 'OUTPUT', 'SAMPLE_ALIAS', 'LIBRARY_NAME'] + if any(row.get('barcode_2') for row in util.file.read_tabfile_dict(inFile)): + header = ['OUTPUT', 'SAMPLE_ALIAS', 'LIBRARY_NAME', 'BARCODE_1', 'BARCODE_2'] + else: + header = ['OUTPUT', 'SAMPLE_ALIAS', 'LIBRARY_NAME', 'BARCODE_1'] with open(outFile, 'wt') as outf: outf.write('\t'.join(header)+'\n') rows = list(util.file.read_tabfile_dict(inFile)) rows.append({'barcode_1':'N','barcode_2':'N','sample':'Unmatched'}) for row in rows: out = {'BARCODE_1':row['barcode_1'], - 'BARCODE_2':row['barcode_2'], + 'BARCODE_2':row.get('barcode_2',''), 'SAMPLE_ALIAS':row['sample'], 'LIBRARY_NAME':row['sample']} if row.get('library_id_per_sample'): diff --git a/pipes/rules/demux.rules b/pipes/rules/demux.rules index 29f6a356b..f6e126cf3 100644 --- a/pipes/rules/demux.rules +++ b/pipes/rules/demux.rules @@ -126,7 +126,11 @@ rule illumina_basecalls: lane = get_one_lane_from_run(wildcards.flowcell, wildcards.lane, config['seqruns_demux']) dir = lane['bustard_dir'] run_date = lane.get('seq_run_date') - shell("{config[binDir]}/broad_utils.py illumina_basecalls {dir} {input[1]} {wildcards.flowcell} {wildcards.lane} {input[0]} --include_non_pf_reads=false --run_start_date={run_date} --sequencing_center={params.center}") + opts = '' + for opt in ('read_structure',): + if lane.get(opt): + opts += ' --%s=%s' % (opt, lane[opt]) + shell("{config[binDir]}/broad_utils.py illumina_basecalls {dir} {input[1]} {wildcards.flowcell} {wildcards.lane} {input[0]} --include_non_pf_reads=false --run_start_date={run_date} --sequencing_center={params.center} {opts}") def demux_move_bams_inputs(wildcards): lane = get_one_lane_from_run(wildcards.flowcell, wildcards.lane, config.get('seqruns_demux','')) @@ -143,15 +147,15 @@ rule move_bams_demux: shutil.move(input[0], output[0]) rule bams_from_fastq: - input: os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_R1_{idx}.fastq'), - os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_R2_{idx}.fastq') - output: os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_{idx}.bam') + input: os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_L001_R1_001.fastq.gz'), + os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}_L001_R2_001.fastq.gz') + output: os.path.join(config['dataDir'],config['subdirs']['source'],'{sample}.bam') params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'), - logid="{sample}_{idx}", + logid="{sample}", center=config["seq_center"] run: makedirs(os.path.join(config['dataDir'], config['subdirs']['source'])) - shell("{config[binDir]}/read_utils.py fastq_to_bam {input} {output} --sampleName {wildcards.sample} --picardOptions PLATFORM=illumina SEQUENCING_CENTER={params.center} LIBRARY_NAME={wildcards.sample}_{wildcards.idx} SORT_ORDER=queryname") + shell("{config[binDir]}/read_utils.py fastq_to_bam {input} {output} --sampleName {wildcards.sample} --picardOptions PLATFORM=illumina SEQUENCING_CENTER={params.center} LIBRARY_NAME={wildcards.sample} SORT_ORDER=queryname") ruleorder: move_bams_demux > bams_from_fastq diff --git a/tools/picard.py b/tools/picard.py index b84354bad..6898a8b0e 100644 --- a/tools/picard.py +++ b/tools/picard.py @@ -157,8 +157,8 @@ class ExtractIlluminaBarcodesTool(PicardTools) : subtoolName = 'ExtractIlluminaBarcodes' jvmMemDefault = '8g' defaults = {'read_structure':'101T8B8B101T', - 'max_mismatches':1, 'minimum_base_quality':15, - 'num_processors':4} + 'max_mismatches':0, 'minimum_base_quality':25, + 'num_processors':8} option_list = ('read_structure', 'max_mismatches', 'minimum_base_quality', 'min_mismatch_delta', 'max_no_calls', 'minimum_quality', 'compress_outputs', 'num_processors') @@ -190,7 +190,7 @@ class IlluminaBasecallsToSamTool(PicardTools) : defaults = {'read_structure':'101T8B8B101T', 'sequencing_center':'BI', 'adapters_to_check': ('PAIRED_END', 'NEXTERA_V1', 'NEXTERA_V2'), 'max_reads_in_ram_per_tile':100000, 'max_records_in_ram':100000, - 'num_processors':4, 'force_gc':False} + 'num_processors':8, 'force_gc':False} option_list = ('read_structure', 'sequencing_center', 'adapters_to_check', 'platform', 'max_reads_in_ram_per_tile', 'max_records_in_ram', 'num_processors', 'apply_eamss_filter', 'force_gc', 'first_tile', 'tile_limit',