Skip to content

Commit

Permalink
allelic depth with strand info, short ONT read mode, SNP quality filt…
Browse files Browse the repository at this point in the history
…er for phasing
  • Loading branch information
umahsn committed Jul 20, 2023
1 parent 4756d03 commit f7478a9
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 21 deletions.
13 changes: 9 additions & 4 deletions NanoCaller
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ def run(args):
'threshold':threshold, 'snp_model':args.snp_model, 'cpu':args.cpu, \
'vcf_path':args.output, 'prefix':args.prefix, 'sample':args.sample, \
'seq':args.sequencing, 'supplementary':args.supplementary, \
'exclude_bed':args.exclude_bed, 'suppress_progress':args.suppress_progress_bar}
'exclude_bed':args.exclude_bed, 'suppress_progress':args.suppress_progress_bar,
'phase_qual_score':args.phase_qual_score}

snp_vcf=snpCaller.call_manager(in_dict)
print('\n%s: SNP calling completed. Time taken= %.4f\n' %(str(datetime.datetime.now()), time.time()-snp_time),flush=True)
Expand All @@ -47,7 +48,8 @@ def run(args):
'seq':args.sequencing, 'del_t':args.del_threshold, 'ins_t':args.ins_threshold,\
'impute_indel_phase':args.impute_indel_phase, 'supplementary':args.supplementary,\
'exclude_bed':args.exclude_bed, 'win_size':args.win_size, 'small_win_size':args.small_win_size,
'enable_whatshap':args.enable_whatshap, 'suppress_progress':args.suppress_progress_bar}
'enable_whatshap':args.enable_whatshap, 'suppress_progress':args.suppress_progress_bar,
'phase_qual_score':args.phase_qual_score}

files=indelCaller.call_manager(in_dict)
print('\n%s: %s completed. Time taken= %.4f\n' %(str(datetime.datetime.now()), 'Phasing' if args.mode=='snps' else 'Indel calling', time.time()-indel_phase_time),flush=True)
Expand All @@ -62,6 +64,8 @@ if __name__ == '__main__':

preset_dict={'ont':{'sequencing':'ont', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False, 'impute_indel_phase':False},

'short_ont':{'sequencing':'short_ont', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.3,0.7', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False, 'impute_indel_phase':False},

'ul_ont': {'sequencing':'ul_ont', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False, 'impute_indel_phase':False},

'ul_ont_extreme':{'sequencing':'ul_ont_extreme', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False, 'impute_indel_phase':False},
Expand Down Expand Up @@ -89,7 +93,7 @@ if __name__ == '__main__':
phase_group=parser.add_argument_group("Phasing")

config_group.add_argument("--mode", help="NanoCaller mode to run. 'snps' mode quits NanoCaller without using WhatsHap for phasing. In this mode, if you want NanoCaller to phase SNPs and BAM files, use --phase argument additionally.", type=str, default='all', choices=['snps', 'indels', 'all'])
config_group.add_argument("--sequencing", help="Sequencing type. 'ont' works well for any type of ONT sequencing datasets. However, use 'ul_ont' if you have several ultra-long ONT reads up to 100kbp long, and 'ul_ont_extreme' if you have several ultra-long ONT reads up to 300kbp long. For PacBio CCS (HiFi) and CLR reads, use 'pacbio'.", type=str, default='ont', choices=['ont', 'ul_ont', 'ul_ont_extreme', 'pacbio'])
config_group.add_argument("--sequencing", help="Sequencing type. 'ont' works well for any type of ONT sequencing datasets. However, use 'ul_ont' if you have several ultra-long ONT reads up to 100kbp long, and 'ul_ont_extreme' if you have several ultra-long ONT reads up to 300kbp long. If you have short ONT reads, typically within 2-10kbp, use 'short_ont'. For PacBio CCS (HiFi) and CLR reads, use 'pacbio'.", type=str, default='ont', choices=['short_ont','ont', 'ul_ont', 'ul_ont_extreme', 'pacbio'])

config_group.add_argument("--cpu", help="Number of CPUs to use.", type=int, default=1)

Expand Down Expand Up @@ -141,8 +145,9 @@ if __name__ == '__main__':

#phasing
phase_group.add_argument('--phase', help='Phase SNPs and BAM files if snps mode is selected.', default=False, action='store_true')
phase_group.add_argument('--phase_qual_score', help='SNP QUAL score cutoff for phasing via Whatshap. SNPS with QUAL score lower than this will not be used for infering haplotype and will also not be phased.', type=float, default=10)

phase_group.add_argument("--enable_whatshap", help="Allow WhatsHap to change SNP genotypes when phasing using --distrust-genotypes and --include-homozygous flags (this is not the same as regenotyping), increasing the time needed for phasing. It has a negligible effect on SNP calling accuracy for Nanopore reads, but may make a small improvement for PacBio reads. By default WhatsHap will only phase SNP calls produced by NanoCaller, but not change their genotypes.", default=False, action='store_true')
phase_group.add_argument("--enable_whatshap", help="Allow WhatsHap to change SNP genotypes when phasing using --distrust-genotypes and --include-homozygous flags (this is not the same as regenotyping), increasing the time needed for phasing. It has a negligible effect on SNP calling accuracy for Nanopore reads, but may make a small improvement for PacBio reads or low coverage ONT samples. By default WhatsHap will only phase SNP calls produced by NanoCaller, but not change their genotypes.", default=False, action='store_true')

args = parser.parse_args()

Expand Down
38 changes: 36 additions & 2 deletions nanocaller_src/generate_SNP_pileups.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,23 @@ def get_cnd_pos(v_pos,cnd_pos, seq='ont'):

ls_total_1=sorted(ls1_0+ls1_1+ls1_2+ls1_3+ls1_4)
ls_total_2=sorted(ls2_0+ls2_1+ls2_2+ls2_3+ls2_4)

elif seq=='short_ont':
ls=cnd_pos[abs(cnd_pos-v_pos)<50000]

ls1_0= [p for p in ls if (p>=v_pos-2000) & (p<v_pos)][-5:]
ls1_1= [p for p in ls if (p>=v_pos-5000) & (p<v_pos-2000)][-10:]
ls1_2= [p for p in ls if (p<v_pos-5000)][-5:]


ls2_0= [p for p in ls if (p>v_pos) & (p<=v_pos+2000)][:5]
ls2_1= [p for p in ls if (p>v_pos+2000) & (p<=v_pos+5000)][:10]
ls2_2= [p for p in ls if (p>v_pos+5000)][:5]


ls_total_1=sorted(ls1_0+ls1_1+ls1_2)
ls_total_2=sorted(ls2_0+ls2_1+ls2_2)

elif seq=='ul_ont':
ls=cnd_pos[abs(cnd_pos-v_pos)<100000]

Expand Down Expand Up @@ -120,6 +136,12 @@ def ex_bed(tree, pos):

ref_dict={j:s.upper() if s in 'AGTC' else '*' for j,s in zip(range(max(1,start-50000),end+50000+1),fastafile.fetch(chrom,max(1,start-50000)-1,end+50000)) }

strand_dict={}

for pread in samfile.fetch(chrom, max(0, start-10), end + 10):
if pread.flag & 0x900==0:
strand_dict[pread.qname]=(pread.flag & 2320)//16

pileup_dict={}

nbr_sites=[]
Expand Down Expand Up @@ -170,7 +192,9 @@ def ex_bed(tree, pos):
pos_list=output.keys()
current_depth=[]
depth=0
output_pos,output_ref,output_mat,output_dp,output_freq=[],[],[],[],[]


output_pos,output_ref,output_mat,output_dp,output_freq,output_fwd_dp, output_rev_dp=[],[],[],[],[],[],[]
if pos_list:

for v_pos in pos_list:
Expand All @@ -183,6 +207,11 @@ def ex_bed(tree, pos):

sample=list(pileup_dict[v_pos].keys())

bases=np.eye(5)[np.array(list(pileup_dict[v_pos].values()))][:,:4]
strand_info=np.array([strand_dict[rname] for rname in pileup_dict[v_pos].keys()]).astype(bool)
fwd_bases=np.sum(bases[~strand_info],axis=0)
rev_bases=np.sum(bases[strand_info],axis=0)

if len(sample) > dct['maxcov']:
sample=random.sample(sample,min(len(sample), dct['maxcov']))

Expand Down Expand Up @@ -229,6 +258,8 @@ def ex_bed(tree, pos):
output_mat.append(data)
output_dp.append(output[v_pos][0])
output_freq.append(output[v_pos][1])
output_fwd_dp.append(fwd_bases)
output_rev_dp.append(rev_bases)
current_depth.append(len(sample))

if len(output_pos)>0:
Expand All @@ -241,5 +272,8 @@ def ex_bed(tree, pos):
output_dp=np.array(output_dp)
output_freq=np.array(output_freq)
depth=np.mean(current_depth)

output_fwd_dp=np.array(output_fwd_dp)
output_rev_dp=np.array(output_rev_dp)

return (output_pos,output_ref,output_mat,output_dp,output_freq,depth)
return (output_pos,output_ref,output_mat,output_dp,output_freq,depth, output_fwd_dp, output_rev_dp)
10 changes: 7 additions & 3 deletions nanocaller_src/indelCaller.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,17 +215,21 @@ def phase_run(contig_dict, params, indel_dict, job_Q, counter_Q, phased_snp_file
start, end = contig_dict['start'], contig_dict['end']
phase_dir = params['intermediate_phase_files_dir']
input_snp_vcf = params['snp_vcf']
phase_qual_score=params['phase_qual_score']
input_contig_vcf = os.path.join(phase_dir, '%s.snps.unphased.vcf' %contig)
lowq_input_contig_vcf= os.path.join(phase_dir, '%s.snps.lowq.unphased.vcf.gz' %contig)
raw_output_contig_vcf = os.path.join(phase_dir, '%s.snps.phased.raw.vcf' %contig)
output_contig_vcf = os.path.join(phase_dir, '%s.snps.phased.vcf.gz' %contig)
phased_bam = os.path.join(phase_dir, '%s.phased.bam' %contig)

phased_snp_files_list.append(output_contig_vcf)
phased_snp_files_list.append(lowq_input_contig_vcf)

enable_whatshap = '--distrust-genotypes --include-homozygous' if params['enable_whatshap'] else ''

# extract region from VCF
run_cmd('bcftools view %s -r %s -o %s' %(input_snp_vcf, contig, input_contig_vcf))
run_cmd('bcftools view %s -r %s -i "QUAL>=%.4f" -o %s' %(input_snp_vcf, contig, phase_qual_score,input_contig_vcf))
run_cmd('bcftools view %s -r %s -i "QUAL<%.4f"|bgziptabix %s' %(input_snp_vcf, contig, phase_qual_score,lowq_input_contig_vcf))

#phase VCF
run_cmd("whatshap phase %s %s -o %s -r %s --ignore-read-groups --chromosome %s %s" %(input_contig_vcf, params['sam_path'], raw_output_contig_vcf, params['fasta_path'], contig , enable_whatshap))
Expand Down Expand Up @@ -346,7 +350,7 @@ def call_manager(params):
output_files['snps']=os.path.join(params['vcf_path'],'%s.snps.phased.vcf.gz' %params['prefix'])

phased_snps_files='\n'.join(phased_snp_files_list)
cmd='bcftools concat -f - |bgziptabix %s' %(output_files['snps'])
cmd='bcftools concat -a -f - |bgziptabix %s' %(output_files['snps'])
stream=Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
stdout, stderr = stream.communicate(input=phased_snps_files.encode())

Expand Down
Loading

0 comments on commit f7478a9

Please sign in to comment.