From ff0137beaa2b4070f10223714287ca629ba63107 Mon Sep 17 00:00:00 2001 From: speleonut <37854406+speleonut@users.noreply.github.com> Date: Wed, 6 Dec 2023 11:22:00 +1030 Subject: [PATCH] Edit python filtering. Clean up old scripts --- .../ANNOVARv3_older.sh | 0 .../AnnovarGenomeSummaryCombo.v2_older.sh | 0 .../familyKeyMatchingAfterANNOVAR.py | 0 .../trioKeyMatchingAfterANNOVAR.sh | 0 .../vcfFamilyBasedFilterAndAnnotate.sh | 0 familyKeyMatchingAfterANNOVAR_hg38.py | 17 ++++++++++------- preConceptionTesting.py | 17 ++++++++++------- trioKeyMatchingAfterANNOVAR_hg38.py | 13 +++++++------ 8 files changed, 27 insertions(+), 20 deletions(-) rename ANNOVARv3_older.sh => archive/ANNOVARv3_older.sh (100%) mode change 100755 => 100644 rename AnnovarGenomeSummaryCombo.v2_older.sh => archive/AnnovarGenomeSummaryCombo.v2_older.sh (100%) mode change 100755 => 100644 rename familyKeyMatchingAfterANNOVAR.py => archive/familyKeyMatchingAfterANNOVAR.py (100%) mode change 100755 => 100644 rename trioKeyMatchingAfterANNOVAR.sh => archive/trioKeyMatchingAfterANNOVAR.sh (100%) rename vcfFamilyBasedFilterAndAnnotate.sh => archive/vcfFamilyBasedFilterAndAnnotate.sh (100%) mode change 100755 => 100644 diff --git a/ANNOVARv3_older.sh b/archive/ANNOVARv3_older.sh old mode 100755 new mode 100644 similarity index 100% rename from ANNOVARv3_older.sh rename to archive/ANNOVARv3_older.sh diff --git a/AnnovarGenomeSummaryCombo.v2_older.sh b/archive/AnnovarGenomeSummaryCombo.v2_older.sh old mode 100755 new mode 100644 similarity index 100% rename from AnnovarGenomeSummaryCombo.v2_older.sh rename to archive/AnnovarGenomeSummaryCombo.v2_older.sh diff --git a/familyKeyMatchingAfterANNOVAR.py b/archive/familyKeyMatchingAfterANNOVAR.py old mode 100755 new mode 100644 similarity index 100% rename from familyKeyMatchingAfterANNOVAR.py rename to archive/familyKeyMatchingAfterANNOVAR.py diff --git a/trioKeyMatchingAfterANNOVAR.sh b/archive/trioKeyMatchingAfterANNOVAR.sh similarity index 100% rename from 
trioKeyMatchingAfterANNOVAR.sh rename to archive/trioKeyMatchingAfterANNOVAR.sh diff --git a/vcfFamilyBasedFilterAndAnnotate.sh b/archive/vcfFamilyBasedFilterAndAnnotate.sh old mode 100755 new mode 100644 similarity index 100% rename from vcfFamilyBasedFilterAndAnnotate.sh rename to archive/vcfFamilyBasedFilterAndAnnotate.sh diff --git a/familyKeyMatchingAfterANNOVAR_hg38.py b/familyKeyMatchingAfterANNOVAR_hg38.py index 8783b54..0dc2793 100755 --- a/familyKeyMatchingAfterANNOVAR_hg38.py +++ b/familyKeyMatchingAfterANNOVAR_hg38.py @@ -8,7 +8,8 @@ def usage(): print( ''' # familyKeyMatchingAfterANNOVAR.py a script to filter affected family members for matched genotypes in a multisample ANNOVAR file -# also outputting a BestGeneCandidates file. +# for rare possibly disease causing alleles. Covers IBD, comp het, X-linked, autosomal dominant and clinVar flagged genotypes +# outputs various filtered tables for further analysis in excel. # # Usage familyKeyMatchingAfterANNOVAR.py -i ANNOVAR.table.txt -s sampleList.txt | [ -h | --help ] # @@ -22,19 +23,21 @@ def usage(): # Edit History (Name; Date; Description) # Ali Gardner; 21/01/2021; Tweak to use with hg38 (no UK10, Wellderley, Exac.ro.1.filtered), change Func.gene to Func.refGene # Thomas Litster; 27/04.2022; Added clinvar search (Will search for clinvar variants in all samples provided, even if variant is not shared) +# Mark Corbett; 06/12/2023; Add in phased genotypes ''' ) # Set initial values inputFile = '' sampleFile = '' +geneTerms = ['exonic', 'splicing', 'UTR5', 'ncRNA_exonic', 'ncRNA_splicing'] notGeneTerms = ['downstream', 'intergenic', 'intronic', 'ncRNA_exonic', 'ncRNA_intronic', 'ncRNA_splicing', 'ncRNA_UTR3', 'ncRNA_UTR5', 'upstream', 'UTR3', 'UTR5'] filterTerms = ['.', 'PASS'] ncSpliceTerms = ['splicing', 'intronic'] filter005 = ['esp6500siv2_all', '1000g2015aug_all'] -filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome'] +filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome', 'AF'] 
pathogenicFilter = ['Pathogenic', 'Likely_pathogenic'] -nullAlelles = ['0/0', '\./\.'] +nullAlelles = ['0/0', r'0\|0', '\./\.'] # Read command line arguments try: opts, args = getopt.getopt(sys.argv[1:],'hi:s:',['help']) @@ -75,7 +78,7 @@ def usage(): dfCore=coreTable for s in samples: # Maybe this loop could be an apply function? currentSampleList=ANNOVARtable[[s]] - homList=currentSampleList[currentSampleList[s].str.match('1/1')] + homList=currentSampleList[currentSampleList[s].str.match(pat = r'(1/1)|(1\|1)')] dfCore = pd.concat([dfCore,homList], axis=1, join='inner') # Add , sort='False' once Ubuntu is upgraded dfCore.to_csv("ibdAndXl."+inputFile, sep='\t') @@ -86,7 +89,7 @@ def usage(): dfCore=dfCore[(dfCore[filter0001].apply(pd.to_numeric, errors='coerce').fillna(0).lt(0.0001)).all(axis=1)] #BestGeneCandidates -bgc=dfCore[~dfCore['Func.refGene'].isin(notGeneTerms)] +bgc=dfCore[dfCore['Func.refGene'].isin(geneTerms)] bgc.to_csv("ibdAndXl.BestGeneCandidates."+inputFile, sep='\t') # Cadidates to test with spliceAI @@ -97,7 +100,7 @@ def usage(): dfCore=coreTable for s in samples: currentSampleList=ANNOVARtable[[s]] - homList=currentSampleList[currentSampleList[s].str.match('0/1')] + homList=currentSampleList[currentSampleList[s].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] dfCore = pd.concat([dfCore,homList], axis=1, join='inner') # Add , sort='False' once Ubuntu is upgraded dfCore.to_csv("het."+inputFile, sep='\t') @@ -106,7 +109,7 @@ def usage(): dfCore=dfCore[(dfCore[filter0001].apply(pd.to_numeric, errors='coerce').fillna(0).lt(0.0001)).all(axis=1)] #BestGeneCandidates -bgc=dfCore[~dfCore['Func.refGene'].isin(notGeneTerms)] +bgc=dfCore[dfCore['Func.refGene'].isin(geneTerms)] bgc.to_csv("het.BestGeneCandidates."+inputFile, sep='\t') # Find cadidates to test with spliceAI diff --git a/preConceptionTesting.py b/preConceptionTesting.py index 12da66f..f2ec446 100755 --- a/preConceptionTesting.py +++ b/preConceptionTesting.py @@ -21,6 +21,7 @@ def usage(): # Script created by Mark 
Corbett on 20/12/2019 # Contact: mark.corbett at adelaide.edu dot au # Edit History (Name; Date; Description) +# Mark Corbett; 06/12/2023; Add in phased genotypes and update ANNOVAR field names # ''' ) @@ -28,12 +29,14 @@ def usage(): # Set initial values inputFile = '' sampleFile = '' -notGeneTerms = ['downstream', 'intergenic', 'intronic', 'ncRNA_exonic', 'ncRNA_intronic', 'ncRNA_splicing', 'ncRNA_UTR3', 'ncRNA_UTR5', 'upstream', 'UTR3', 'UTR5', 'upstream;downstream'] +geneTerms = ['exonic', 'splicing', 'UTR5', 'ncRNA_exonic', 'ncRNA_splicing'] +notGeneTerms = ['downstream', 'intergenic', 'intronic', 'ncRNA_exonic', 'ncRNA_intronic', 'ncRNA_splicing', 'ncRNA_UTR3', 'ncRNA_UTR5', 'upstream', 'UTR3', 'UTR5'] filterTerms = ['.', 'PASS'] ncSpliceTerms = ['splicing', 'intronic'] -filter005 = ['esp6500siv2_all', '1000g2015aug_all', 'UK10K-AF-all'] -filter0001 = ['ExAC.r0.1.filtered', 'exac03', 'gnomad211_exome', 'gnomad211_genome'] -nullAlelles = ['0/0', '\./\.'] +filter005 = ['esp6500siv2_all', '1000g2015aug_all'] +filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome', 'AF'] +pathogenicFilter = ['Pathogenic', 'Likely_pathogenic'] +nullAlelles = ['0/0', r'0\|0', 
'\./\.'] # Read command line arguments try: opts, args = getopt.getopt(sys.argv[1:],'hi:m:f:',['help']) @@ -75,7 +78,7 @@ def bestGeneCandidatesFilter(df): ANNOVARtable=pd.read_csv(inputFile, sep='\t', index_col = num_cols) samples = [mumID, dadID] -hetList=ANNOVARtable[ANNOVARtable[samples[0]].str.match('0/1') & ANNOVARtable[samples[1]].str.match('0/1')] +hetList=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)') & ANNOVARtable[samples[1]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] hetList.to_csv("allSharedHetCalls."+inputFile, sep='\t') #Generic filters for most likely pathogenic @@ -83,9 +86,9 @@ def bestGeneCandidatesFilter(df): hetList.to_csv("allSharedHetCalls.BestGeneCandidates."+inputFile, sep='\t') # Compound het calls -mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match('0/1') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles))] +mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles))] mGenes=pd.unique(mNotfHets['Gene.refGene']) -fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match('0/1')] +fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] fGenes=pd.unique(fNotmHets['Gene.refGene']) seriesCHgenes=pd.Series(mGenes.tolist() + fGenes.tolist()) chGenes=seriesCHgenes[seriesCHgenes.duplicated()] diff --git a/trioKeyMatchingAfterANNOVAR_hg38.py b/trioKeyMatchingAfterANNOVAR_hg38.py index ea39f62..7795671 100755 --- a/trioKeyMatchingAfterANNOVAR_hg38.py +++ b/trioKeyMatchingAfterANNOVAR_hg38.py @@ -24,6 +24,7 @@ def usage(): # Contact: mark.corbett at adelaide.edu dot au # Edit History (Date; Name; Description) # 08/12/2021; Mark; Add gnomADv3 geneotypes AF column to the 0.0001 filter list. Fix Gene.refGene. Change best gene candidate filter to whitelist. 
+# 06/12/2023; Mark Corbett; Add in phased genotypes # ''' ) @@ -37,7 +38,7 @@ def usage(): filter005 = ['esp6500siv2_all', '1000g2015aug_all'] filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome', 'AF'] pathogenicFilter = ['Pathogenic', 'Likely_pathogenic'] -nullAlelles = ['0/0', '\./\.'] +nullAlelles = ['0/0', r'0\|0', '\./\.'] # Read command line arguments try: @@ -83,7 +84,7 @@ def bestGeneCandidatesFilter(df): samples = [mumID, dadID, childID] # de novo -dnList=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match('0/1')] +dnList=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] dnList.to_csv(childID+".dn."+inputFile, sep='\t') spliceCandidates=dnList[dnList['Func.refGene'].isin(ncSpliceTerms)] spliceCandidates.to_csv(childID+".dn.SpliceCandidates."+inputFile, sep='\t') @@ -91,7 +92,7 @@ def bestGeneCandidatesFilter(df): dnList.to_csv(childID+".dn.BestGeneCandidates."+inputFile, sep='\t') # AR, identical by descent and X-linked -homList=ANNOVARtable[~ANNOVARtable[samples[0]].str.match('1/1') & ~ANNOVARtable[samples[1]].str.match('1/1') & ANNOVARtable[samples[2]].str.match('1/1')] +homList=ANNOVARtable[~ANNOVARtable[samples[0]].str.match(pat = r'(1/1)|(1\|1)') & ~ANNOVARtable[samples[1]].str.match(pat = r'(1/1)|(1\|1)') & ANNOVARtable[samples[2]].str.match(pat = r'(1/1)|(1\|1)')] homList.to_csv(childID+".ibdAndXl."+inputFile, sep='\t') spliceCandidates=homList[homList['Func.refGene'].isin(ncSpliceTerms)] spliceCandidates.to_csv(childID+".ibdAndXl.SpliceCandidates."+inputFile, sep='\t') @@ -99,9 +100,9 @@ def bestGeneCandidatesFilter(df): homList.to_csv(childID+".ibdAndXl.BestGeneCandidates."+inputFile, sep='\t') # Compound het calls -mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match('0/1') & 
ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match('0/1')] +mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] mGenes=pd.unique(mNotfHets['Gene.refGene']) -fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match('0/1') & ANNOVARtable[samples[2]].str.match('0/1')] +fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)') & ANNOVARtable[samples[2]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] fGenes=pd.unique(fNotmHets['Gene.refGene']) seriesCHgenes=pd.Series(mGenes.tolist() + fGenes.tolist()) chGenes=seriesCHgenes[seriesCHgenes.duplicated()] @@ -123,7 +124,7 @@ def bestGeneCandidatesFilter(df): compHets.to_csv(childID+".ch.BestGeneCandidates."+inputFile, sep='\t') # AD -hetList=ANNOVARtable[ANNOVARtable[samples[2]].str.match('0/1')] +hetList=ANNOVARtable[ANNOVARtable[samples[2]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] #hetList.to_csv("allHets."+inputFile, sep='\t') #Not likely to be worth writing out spliceCandidates=hetList[hetList['Func.refGene'].isin(ncSpliceTerms)] spliceCandidates.to_csv(childID+".allHets.SpliceCandidates."+inputFile, sep='\t')