From f7478a9c1977004c2885a1df267ebb5450817e2b Mon Sep 17 00:00:00 2001 From: Ahsan Date: Thu, 20 Jul 2023 17:22:22 -0400 Subject: [PATCH] allelic depth with strand info, short ONT read mode, SNP quality filter for phasing --- NanoCaller | 13 ++++++--- nanocaller_src/generate_SNP_pileups.py | 38 ++++++++++++++++++++++-- nanocaller_src/indelCaller.py | 10 +++++-- nanocaller_src/snpCaller.py | 40 ++++++++++++++++++-------- 4 files changed, 80 insertions(+), 21 deletions(-) diff --git a/NanoCaller b/NanoCaller index 7c13c51..b461b1e 100755 --- a/NanoCaller +++ b/NanoCaller @@ -30,7 +30,8 @@ def run(args): 'threshold':threshold, 'snp_model':args.snp_model, 'cpu':args.cpu, \ 'vcf_path':args.output, 'prefix':args.prefix, 'sample':args.sample, \ 'seq':args.sequencing, 'supplementary':args.supplementary, \ - 'exclude_bed':args.exclude_bed, 'suppress_progress':args.suppress_progress_bar} + 'exclude_bed':args.exclude_bed, 'suppress_progress':args.suppress_progress_bar, + 'phase_qual_score':args.phase_qual_score} snp_vcf=snpCaller.call_manager(in_dict) print('\n%s: SNP calling completed. Time taken= %.4f\n' %(str(datetime.datetime.now()), time.time()-snp_time),flush=True) @@ -47,7 +48,8 @@ def run(args): 'seq':args.sequencing, 'del_t':args.del_threshold, 'ins_t':args.ins_threshold,\ 'impute_indel_phase':args.impute_indel_phase, 'supplementary':args.supplementary,\ 'exclude_bed':args.exclude_bed, 'win_size':args.win_size, 'small_win_size':args.small_win_size, - 'enable_whatshap':args.enable_whatshap, 'suppress_progress':args.suppress_progress_bar} + 'enable_whatshap':args.enable_whatshap, 'suppress_progress':args.suppress_progress_bar, + 'phase_qual_score':args.phase_qual_score} files=indelCaller.call_manager(in_dict) print('\n%s: %s completed. Time taken= %.4f\n' %(str(datetime.datetime.now()), 'Phasing' if args.mode=='snps' else 'Indel calling', time.time()-indel_phase_time),flush=True) @@ -62,6 +64,8 @@ if __name__ == '__main__': preset_dict={'ont':{'sequencing':'ont', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False, 'impute_indel_phase':False}, + 'short_ont':{'sequencing':'short_ont', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.3,0.7', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False, 'impute_indel_phase':False}, + 'ul_ont': {'sequencing':'ul_ont', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False, 'impute_indel_phase':False}, 'ul_ont_extreme':{'sequencing':'ul_ont_extreme', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False, 'impute_indel_phase':False}, @@ -89,7 +93,7 @@ if __name__ == '__main__': phase_group=parser.add_argument_group("Phasing") config_group.add_argument("--mode", help="NanoCaller mode to run. 'snps' mode quits NanoCaller without using WhatsHap for phasing. In this mode, if you want NanoCaller to phase SNPs and BAM files, use --phase argument additionally.", type=str, default='all', choices=['snps', 'indels', 'all']) - config_group.add_argument("--sequencing", help="Sequencing type. 'ont' works well for any type of ONT sequencing datasets. However, use 'ul_ont' if you have several ultra-long ONT reads up to 100kbp long, and 'ul_ont_extreme' if you have several ultra-long ONT reads up to 300kbp long. For PacBio CCS (HiFi) and CLR reads, use 'pacbio'.", type=str, default='ont', choices=['ont', 'ul_ont', 'ul_ont_extreme', 'pacbio']) + config_group.add_argument("--sequencing", help="Sequencing type. 'ont' works well for any type of ONT sequencing datasets. However, use 'ul_ont' if you have several ultra-long ONT reads up to 100kbp long, and 'ul_ont_extreme' if you have several ultra-long ONT reads up to 300kbp long. If you have short ONT reads, typically within 2-10kbp, use 'short_ont'. For PacBio CCS (HiFi) and CLR reads, use 'pacbio'.", type=str, default='ont', choices=['short_ont','ont', 'ul_ont', 'ul_ont_extreme', 'pacbio']) config_group.add_argument("--cpu", help="Number of CPUs to use.", type=int, default=1) @@ -141,8 +145,9 @@ if __name__ == '__main__': #phasing phase_group.add_argument('--phase', help='Phase SNPs and BAM files if snps mode is selected.', default=False, action='store_true') + phase_group.add_argument('--phase_qual_score', help='SNP QUAL score cutoff for phasing via Whatshap. SNPS with QUAL score lower than this will not be used for infering haplotype and will also not be phased.', type=float, default=10) - phase_group.add_argument("--enable_whatshap", help="Allow WhatsHap to change SNP genotypes when phasing using --distrust-genotypes and --include-homozygous flags (this is not the same as regenotyping), increasing the time needed for phasing. It has a negligible effect on SNP calling accuracy for Nanopore reads, but may make a small improvement for PacBio reads. By default WhatsHap will only phase SNP calls produced by NanoCaller, but not change their genotypes.", default=False, action='store_true') + phase_group.add_argument("--enable_whatshap", help="Allow WhatsHap to change SNP genotypes when phasing using --distrust-genotypes and --include-homozygous flags (this is not the same as regenotyping), increasing the time needed for phasing. It has a negligible effect on SNP calling accuracy for Nanopore reads, but may make a small improvement for PacBio reads or low coverage ONT samples. By default WhatsHap will only phase SNP calls produced by NanoCaller, but not change their genotypes.", default=False, action='store_true') args = parser.parse_args() diff --git a/nanocaller_src/generate_SNP_pileups.py b/nanocaller_src/generate_SNP_pileups.py index c757671..0420270 100644 --- a/nanocaller_src/generate_SNP_pileups.py +++ b/nanocaller_src/generate_SNP_pileups.py @@ -21,7 +21,23 @@ def get_cnd_pos(v_pos,cnd_pos, seq='ont'): ls_total_1=sorted(ls1_0+ls1_1+ls1_2+ls1_3+ls1_4) ls_total_2=sorted(ls2_0+ls2_1+ls2_2+ls2_3+ls2_4) + + elif seq=='short_ont': + ls=cnd_pos[abs(cnd_pos-v_pos)<50000] + + ls1_0= [p for p in ls if (p>=v_pos-2000) & (p=v_pos-5000) & (pv_pos) & (p<=v_pos+2000)][:5] + ls2_1= [p for p in ls if (p>v_pos+2000) & (p<=v_pos+5000)][:10] + ls2_2= [p for p in ls if (p>v_pos+5000)][:5] + + + ls_total_1=sorted(ls1_0+ls1_1+ls1_2) + ls_total_2=sorted(ls2_0+ls2_1+ls2_2) + elif seq=='ul_ont': ls=cnd_pos[abs(cnd_pos-v_pos)<100000] @@ -120,6 +136,12 @@ def ex_bed(tree, pos): ref_dict={j:s.upper() if s in 'AGTC' else '*' for j,s in zip(range(max(1,start-50000),end+50000+1),fastafile.fetch(chrom,max(1,start-50000)-1,end+50000)) } + strand_dict={} + + for pread in samfile.fetch(chrom, max(0, start-10), end + 10): + if pread.flag & 0x900==0: + strand_dict[pread.qname]=(pread.flag & 2320)//16 + pileup_dict={} nbr_sites=[] @@ -170,7 +192,9 @@ def ex_bed(tree, pos): pos_list=output.keys() current_depth=[] depth=0 - output_pos,output_ref,output_mat,output_dp,output_freq=[],[],[],[],[] + + + output_pos,output_ref,output_mat,output_dp,output_freq,output_fwd_dp, output_rev_dp=[],[],[],[],[],[],[] if pos_list: for v_pos in pos_list: @@ -183,6 +207,11 @@ def ex_bed(tree, pos): sample=list(pileup_dict[v_pos].keys()) + bases=np.eye(5)[np.array(list(pileup_dict[v_pos].values()))][:,:4] + strand_info=np.array([strand_dict[rname] for rname in pileup_dict[v_pos].keys()]).astype(bool) + fwd_bases=np.sum(bases[~strand_info],axis=0) + rev_bases=np.sum(bases[strand_info],axis=0) + if len(sample) > dct['maxcov']: sample=random.sample(sample,min(len(sample), dct['maxcov'])) @@ -229,6 +258,8 @@ def ex_bed(tree, pos): output_mat.append(data) output_dp.append(output[v_pos][0]) output_freq.append(output[v_pos][1]) + output_fwd_dp.append(fwd_bases) + output_rev_dp.append(rev_bases) current_depth.append(len(sample)) if len(output_pos)>0: @@ -241,5 +272,8 @@ def ex_bed(tree, pos): output_dp=np.array(output_dp) output_freq=np.array(output_freq) depth=np.mean(current_depth) + + output_fwd_dp=np.array(output_fwd_dp) + output_rev_dp=np.array(output_rev_dp) - return (output_pos,output_ref,output_mat,output_dp,output_freq,depth) \ No newline at end of file + return (output_pos,output_ref,output_mat,output_dp,output_freq,depth, output_fwd_dp, output_rev_dp) \ No newline at end of file diff --git a/nanocaller_src/indelCaller.py b/nanocaller_src/indelCaller.py index a4fd654..570cb8b 100644 --- a/nanocaller_src/indelCaller.py +++ b/nanocaller_src/indelCaller.py @@ -215,17 +215,21 @@ def phase_run(contig_dict, params, indel_dict, job_Q, counter_Q, phased_snp_file start, end = contig_dict['start'], contig_dict['end'] phase_dir = params['intermediate_phase_files_dir'] input_snp_vcf = params['snp_vcf'] + phase_qual_score=params['phase_qual_score'] input_contig_vcf = os.path.join(phase_dir, '%s.snps.unphased.vcf' %contig) + lowq_input_contig_vcf= os.path.join(phase_dir, '%s.snps.lowq.unphased.vcf.gz' %contig) raw_output_contig_vcf = os.path.join(phase_dir, '%s.snps.phased.raw.vcf' %contig) output_contig_vcf = os.path.join(phase_dir, '%s.snps.phased.vcf.gz' %contig) phased_bam = os.path.join(phase_dir, '%s.phased.bam' %contig) phased_snp_files_list.append(output_contig_vcf) + phased_snp_files_list.append(lowq_input_contig_vcf) enable_whatshap = '--distrust-genotypes --include-homozygous' if params['enable_whatshap'] else '' - + # extract region from VCF - run_cmd('bcftools view %s -r %s -o %s' %(input_snp_vcf, contig, input_contig_vcf)) + run_cmd('bcftools view %s -r %s -i "QUAL>=%.4f" -o %s' %(input_snp_vcf, contig, phase_qual_score,input_contig_vcf)) + run_cmd('bcftools view %s -r %s -i "QUAL<%.4f"|bgziptabix %s' %(input_snp_vcf, contig, phase_qual_score,lowq_input_contig_vcf)) #phase VCF run_cmd("whatshap phase %s %s -o %s -r %s --ignore-read-groups --chromosome %s %s" %(input_contig_vcf, params['sam_path'], raw_output_contig_vcf, params['fasta_path'], contig , enable_whatshap)) @@ -346,7 +350,7 @@ def call_manager(params): output_files['snps']=os.path.join(params['vcf_path'],'%s.snps.phased.vcf.gz' %params['prefix']) phased_snps_files='\n'.join(phased_snp_files_list) - cmd='bcftools concat -f - |bgziptabix %s' %(output_files['snps']) + cmd='bcftools concat -a -f - |bgziptabix %s' %(output_files['snps']) stream=Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE) stdout, stderr = stream.communicate(input=phased_snps_files.encode()) diff --git a/nanocaller_src/snpCaller.py b/nanocaller_src/snpCaller.py index ec94c9d..6851181 100644 --- a/nanocaller_src/snpCaller.py +++ b/nanocaller_src/snpCaller.py @@ -83,7 +83,7 @@ def caller(params, chunks_Q, counter_Q, snp_files): while not chunks_Q.empty(): chunk = chunks_Q.get(block=False) chrom=chunk['chrom'] - pos, test_ref, x_test, dp, freq, coverage = get_snp_testing_candidates(params, chunk) + pos, test_ref, x_test, dp, freq, coverage, fwd_dp, rev_dp = get_snp_testing_candidates(params, chunk) if len(pos)!=0: if chunk['ploidy']=='diploid': @@ -95,6 +95,9 @@ def caller(params, chunks_Q, counter_Q, snp_files): for batch in range(int(np.ceil(len(x_test)/batch_size))): batch_freq=freq[batch*batch_size:min((batch+1)*batch_size,len(freq))] batch_dp=dp[batch*batch_size:min((batch+1)*batch_size,len(dp))] + batch_fwd_dp=fwd_dp[batch*batch_size:min((batch+1)*batch_size,len(fwd_dp))] + batch_rev_dp=rev_dp[batch*batch_size:min((batch+1)*batch_size,len(rev_dp))] + batch_pos = pos[batch*batch_size:min((batch+1)*batch_size,len(pos))] batch_x = x_test[batch*batch_size:min((batch+1)*batch_size,len(x_test))] @@ -118,34 +121,42 @@ def caller(params, chunks_Q, counter_Q, snp_files): sort_probs=np.sort(batch_probs,axis=1) for j in range(len(batch_pred_GT)): - info_field='PR='+','.join("{:.4f}".format(x) for x in batch_probs[j,[0,3,1,2]]) + info_field='PR='+','.join("{:.4f}".format(x) for x in batch_probs[j,[0,3,1,2]])+ ";FQ={:.4f}".format(batch_freq[j]) + ref_dp=(batch_fwd_dp[j][batch_ref[j]], batch_rev_dp[j][batch_ref[j]]) + if batch_pred_GT[j]>=2: # if het pred1,pred2=batch_pred[j,-1],batch_pred[j,-2] if pred1==batch_ref[j]: - s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred2], min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS',info_field, '0/1', batch_dp[j], batch_freq[j]) - f.write(s) + alt_dp=(batch_fwd_dp[j][pred2], batch_rev_dp[j][pred2]) + + s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:VF:AD:ADF:ADR\t%s:%d:%.4f:%d,%d:%d,%d:%d,%d\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred2], min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS',info_field, '0/1', batch_dp[j], sum(alt_dp)/batch_dp[j], sum(ref_dp), sum(alt_dp), ref_dp[0], alt_dp[0], ref_dp[1], alt_dp[1]) + f.write(s) elif pred2==batch_ref[j] and batch_probs[j,pred2]>=0.5: - s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom,batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS',info_field, '1/0', batch_dp[j], batch_freq[j]) + alt_dp=(batch_fwd_dp[j][pred1], batch_rev_dp[j][pred1]) + s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:VF:AD:ADF:ADR\t%s:%d:%.4f:%d,%d:%d,%d:%d,%d\n' %(chrom,batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS',info_field, '0/1', batch_dp[j], sum(alt_dp)/batch_dp[j], sum(ref_dp), sum(alt_dp), ref_dp[0], alt_dp[0], ref_dp[1], alt_dp[1]) f.write(s) elif pred2!=batch_ref[j] and pred1!=batch_ref[j] and batch_probs[j,pred2]>=0.5: - s='%s\t%d\t.\t%s\t%s,%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %\ - (chrom,batch_pos[j],num_to_base_map[batch_ref[j]],num_to_base_map[pred1],num_to_base_map[pred2],min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS',info_field,'1/2', batch_dp[j], batch_freq[j]) + alt1_dp=(batch_fwd_dp[j][pred1], batch_rev_dp[j][pred1]) + alt2_dp=(batch_fwd_dp[j][pred2], batch_rev_dp[j][pred2]) + s='%s\t%d\t.\t%s\t%s,%s\t%.3f\t%s\t%s\tGT:DP:VF:AD:ADF:ADR\t%s:%d:%.4f,%.4f:%d,%d,%d:%d,%d,%d:%d,%d,%d\n' %\ + (chrom,batch_pos[j],num_to_base_map[batch_ref[j]],num_to_base_map[pred1],num_to_base_map[pred2],min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS',info_field,'1/2', batch_dp[j], sum(alt1_dp)/batch_dp[j],sum(alt2_dp)/batch_dp[j], sum(ref_dp), sum(alt1_dp), sum(alt2_dp), ref_dp[0], alt1_dp[0], alt2_dp[0], ref_dp[1], alt1_dp[1], alt2_dp[1]) f.write(s) elif batch_pred_GT[j]==1 and batch_ref[j]!=batch_pred[j,-1] and batch_probs[j,batch_pred[j,-1]]>=0.5: pred1=batch_pred[j,-1] - s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred1])),'PASS',info_field,'1/1', batch_dp[j], batch_freq[j]) + alt_dp=(batch_fwd_dp[j][pred1], batch_rev_dp[j][pred1]) + s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:VF:AD:ADF:ADR\t%s:%d:%.4f:%d,%d:%d,%d:%d,%d\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred1])),'PASS',info_field,'1/1', batch_dp[j], sum(alt_dp)/batch_dp[j], sum(ref_dp), sum(alt_dp), ref_dp[0], alt_dp[0], ref_dp[1], alt_dp[1]) f.write(s) else: if batch_pred_GT[j]==1 and batch_ref[j]==batch_pred[j,-1]: pred1=batch_pred[j,-1] - s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], '.', min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred1])),'REF',info_field,'./.', batch_dp[j], batch_freq[j]) + s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:VF:AD:ADF:ADR\t%s:%d:.:.:.:.\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], '.', min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred1])),'REF',info_field,'./.', batch_dp[j]) f.write(s) else: - s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], '.', 0,'LOW',info_field,'./.', batch_dp[j], batch_freq[j]) + s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:VF:AD:ADF:ADR\t%s:%d:.:.:.:.\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], '.', 0,'LOW',info_field,'./.', batch_dp[j]) f.write(s) @@ -170,7 +181,8 @@ def caller(params, chunks_Q, counter_Q, snp_files): for j in range(len(batch_pred)): pred=batch_pred[j] if pred!=batch_ref[j]: - s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred])),'PASS','1/1', batch_dp[j], batch_freq[j]) + info_field='PR='+','.join("{:.4f}".format(x) for x in batch_probs[j,[0,3,1,2]])+ ";FQ={:.4f}".format(batch_freq[j]) + s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:VF:AD:ADF:ADR\t%s:%d:%.4f:.:.:.\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred])),'PASS',info_field,'1/1', batch_dp[j], batch_freq[j]) f.write(s) f.flush() @@ -242,8 +254,12 @@ def call_manager(params): outfile.write(b'##FORMAT=\n') outfile.write(b'##FORMAT=\n') - outfile.write(b'##FORMAT=\n') + outfile.write(b'##FORMAT=\n') + outfile.write(b'##FORMAT=\n') + outfile.write(b'##FORMAT=\n') + outfile.write(b'##FORMAT=\n') outfile.write(b'##INFO=\n') + outfile.write(b'##INFO=\n') outfile.write(b'#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n' %bytes(params['sample'].encode('utf-8')))