Edit python filtering. Clean up old scripts

speleonut · Dec 6, 2023 · ff0137b · ff0137b
1 parent fc591f9
commit ff0137b
Show file tree

Hide file tree

Showing 8 changed files with 27 additions and 20 deletions.
diff --git a/ANNOVARv3_older.sh → archive/ANNOVARv3_older.sh b/ANNOVARv3_older.sh → archive/ANNOVARv3_older.sh
diff --git a/AnnovarGenomeSummaryCombo.v2_older.sh → ...ive/AnnovarGenomeSummaryCombo.v2_older.sh b/AnnovarGenomeSummaryCombo.v2_older.sh → ...ive/AnnovarGenomeSummaryCombo.v2_older.sh
diff --git a/familyKeyMatchingAfterANNOVAR.py → archive/familyKeyMatchingAfterANNOVAR.py b/familyKeyMatchingAfterANNOVAR.py → archive/familyKeyMatchingAfterANNOVAR.py
diff --git a/trioKeyMatchingAfterANNOVAR.sh → archive/trioKeyMatchingAfterANNOVAR.sh b/trioKeyMatchingAfterANNOVAR.sh → archive/trioKeyMatchingAfterANNOVAR.sh
diff --git a/vcfFamilyBasedFilterAndAnnotate.sh → archive/vcfFamilyBasedFilterAndAnnotate.sh b/vcfFamilyBasedFilterAndAnnotate.sh → archive/vcfFamilyBasedFilterAndAnnotate.sh
diff --git a/familyKeyMatchingAfterANNOVAR_hg38.py b/familyKeyMatchingAfterANNOVAR_hg38.py
@@ -8,7 +8,8 @@ def usage():
     print(
 '''
 # familyKeyMatchingAfterANNOVAR.py a script to filter affected family members for matched genotypes in a multisample ANNOVAR file
-# also outputting a BestGeneCandidates file.  
+# for rare possibly disease causing alleles. Covers IBD, comp het, X-linked, autosomal dominant and clinVar flagged genotypes
+# outputs various filtered tables for further analysis in excel.
 #
 # Usage familyKeyMatchingAfterANNOVAR.py -i ANNOVAR.table.txt -s sampleList.txt | [ -h | --help ]
 #
@@ -22,19 +23,21 @@ def usage():
 # Edit History (Name; Date; Description)
 # Ali Gardner; 21/01/2021; Tweak to use with hg38 (no UK10, Wellderley, Exac.ro.1.filtered), change Func.gene to Func.refGene
 # Thomas Litster; 27/04.2022; Added clinvar search (Will search for clinvar variants in all samples provided, even if variant is not shared)
+# Mark Corbett; 06/12/2023; Add in phased genotypes
 '''
          )
 
 # Set initial values
 inputFile = ''
 sampleFile = ''
+geneTerms = ['exonic', 'splicing', 'UTR5', 'ncRNA_exonic', 'ncRNA_splicing']
 notGeneTerms = ['downstream', 'intergenic', 'intronic', 'ncRNA_exonic', 'ncRNA_intronic', 'ncRNA_splicing', 'ncRNA_UTR3', 'ncRNA_UTR5', 'upstream', 'UTR3', 'UTR5']
 filterTerms = ['.', 'PASS']
 ncSpliceTerms = ['splicing', 'intronic']
 filter005 = ['esp6500siv2_all', '1000g2015aug_all']
-filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome']
+filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome', 'AF']
 pathogenicFilter = ['Pathogenic', 'Likely_pathogenic']
-nullAlelles = ['0/0', '\./\.']
+nullAlelles = ['0/0', r'0\|0', r'\./\.']  # regex fragments (dots already escaped): '|' must be escaped too or it acts as alternation
 # Read command line arguments
 try:
     opts, args = getopt.getopt(sys.argv[1:],'hi:s:',['help'])
@@ -75,7 +78,7 @@ def usage():
 dfCore=coreTable
 for s in samples: # Maybe this loop could be an apply function?
     currentSampleList=ANNOVARtable[[s]]
-    homList=currentSampleList[currentSampleList[s].str.match('1/1')]
+    homList=currentSampleList[currentSampleList[s].str.match(pat = r'1[/|]1')]  # hom-alt, unphased 1/1 or phased 1|1; '|' inside [] is literal, a bare '|' is regex alternation
     dfCore = pd.concat([dfCore,homList], axis=1, join='inner') # Add , sort='False' once Ubuntu is upgraded
 
 dfCore.to_csv("ibdAndXl."+inputFile, sep='\t')
@@ -86,7 +89,7 @@ def usage():
 dfCore=dfCore[(dfCore[filter0001].apply(pd.to_numeric, errors='coerce').fillna(0).lt(0.0001)).all(axis=1)]
 
 #BestGeneCandidates
-bgc=dfCore[~dfCore['Func.refGene'].isin(notGeneTerms)]
+bgc=dfCore[dfCore['Func.refGene'].isin(geneTerms)]
 bgc.to_csv("ibdAndXl.BestGeneCandidates."+inputFile, sep='\t')
 
 # Cadidates to test with spliceAI
@@ -97,7 +100,7 @@ def usage():
 dfCore=coreTable 
 for s in samples:  
     currentSampleList=ANNOVARtable[[s]]
-    homList=currentSampleList[currentSampleList[s].str.match('0/1')]
+    homList=currentSampleList[currentSampleList[s].str.match(pat = r'0[/|]1|1\|0')]  # het: 0/1, 0|1 or phased 1|0; a bare '|' would be regex alternation and match any 0... or 1... call
     dfCore = pd.concat([dfCore,homList], axis=1, join='inner') # Add , sort='False' once Ubuntu is upgraded
 
 dfCore.to_csv("het."+inputFile, sep='\t')
@@ -106,7 +109,7 @@ def usage():
 dfCore=dfCore[(dfCore[filter0001].apply(pd.to_numeric, errors='coerce').fillna(0).lt(0.0001)).all(axis=1)]
 
 #BestGeneCandidates
-bgc=dfCore[~dfCore['Func.refGene'].isin(notGeneTerms)]
+bgc=dfCore[dfCore['Func.refGene'].isin(geneTerms)]
 bgc.to_csv("het.BestGeneCandidates."+inputFile, sep='\t')
 
 # Find cadidates to test with spliceAI

diff --git a/preConceptionTesting.py b/preConceptionTesting.py
@@ -21,19 +21,22 @@ def usage():
 # Script created by Mark Corbett on 20/12/2019
 # Contact: mark.corbett at adelaide.edu dot au
 # Edit History (Name; Date; Description)
+# Mark Corbett; 06/12/2023; Add in phased genotypes and update ANNOVAR field names
 #
 '''
          )
 
 # Set initial values
 inputFile = ''
 sampleFile = ''
-notGeneTerms = ['downstream', 'intergenic', 'intronic', 'ncRNA_exonic', 'ncRNA_intronic', 'ncRNA_splicing', 'ncRNA_UTR3', 'ncRNA_UTR5', 'upstream', 'UTR3', 'UTR5', 'upstream;downstream']
+geneTerms = ['exonic', 'splicing', 'UTR5', 'ncRNA_exonic', 'ncRNA_splicing']
+notGeneTerms = ['downstream', 'intergenic', 'intronic', 'ncRNA_exonic', 'ncRNA_intronic', 'ncRNA_splicing', 'ncRNA_UTR3', 'ncRNA_UTR5', 'upstream', 'UTR3', 'UTR5']
 filterTerms = ['.', 'PASS']
 ncSpliceTerms = ['splicing', 'intronic']
-filter005 = ['esp6500siv2_all', '1000g2015aug_all', 'UK10K-AF-all']
-filter0001 = ['ExAC.r0.1.filtered', 'exac03', 'gnomad211_exome', 'gnomad211_genome']
-nullAlelles = ['0/0', '\./\.']
+filter005 = ['esp6500siv2_all', '1000g2015aug_all']
+filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome', 'AF']
+pathogenicFilter = ['Pathogenic', 'Likely_pathogenic']
+nullAlelles = ['0/0', r'0\|0', r'\./\.']  # items are OR-joined into a regex for str.contains, so the phased '|' must be escaped
 # Read command line arguments
 try:
     opts, args = getopt.getopt(sys.argv[1:],'hi:m:f:',['help'])
@@ -75,17 +78,17 @@ def bestGeneCandidatesFilter(df):
 ANNOVARtable=pd.read_csv(inputFile, sep='\t', index_col = num_cols)
 samples = [mumID, dadID]
 
-hetList=ANNOVARtable[ANNOVARtable[samples[0]].str.match('0/1') & ANNOVARtable[samples[1]].str.match('0/1')]
+hetList=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = r'0[/|]1|1\|0') & ANNOVARtable[samples[1]].str.match(pat = r'0[/|]1|1\|0')]  # both parents het: 0/1, 0|1 or 1|0 (a bare '|' is regex alternation)
 hetList.to_csv("allSharedHetCalls."+inputFile, sep='\t')
 
 #Generic filters for most likely pathogenic
 hetList=bestGeneCandidatesFilter(df=hetList)
 hetList.to_csv("allSharedHetCalls.BestGeneCandidates."+inputFile, sep='\t')
 
 # Compound het calls
-mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match('0/1') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles))]
+mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = r'0[/|]1|1\|0') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles))]  # samples[0] het (0/1, 0|1, 1|0), samples[1] null genotype
 mGenes=pd.unique(mNotfHets['Gene.refGene'])
-fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match('0/1')]
+fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match(pat = r'0[/|]1|1\|0')]  # samples[0] null genotype, samples[1] het (0/1, 0|1, 1|0)
 fGenes=pd.unique(fNotmHets['Gene.refGene'])
 seriesCHgenes=pd.Series(mGenes.tolist() + fGenes.tolist())
 chGenes=seriesCHgenes[seriesCHgenes.duplicated()]

diff --git a/trioKeyMatchingAfterANNOVAR_hg38.py b/trioKeyMatchingAfterANNOVAR_hg38.py
@@ -24,6 +24,7 @@ def usage():
 # Contact: mark.corbett at adelaide.edu dot au
 # Edit History (Date; Name; Description)
 # 08/12/2021; Mark; Add gnomADv3 geneotypes AF column to the 0.0001 filter list. Fix Gene.refGene. Change best gene candidate filter to whitelist.
+# 06/12/2023; Mark Corbett; Add in phased genotypes
 #
 '''
          )
@@ -37,7 +38,7 @@ def usage():
 filter005 = ['esp6500siv2_all', '1000g2015aug_all']
 filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome', 'AF']
 pathogenicFilter = ['Pathogenic', 'Likely_pathogenic']
-nullAlelles = ['0/0', '\./\.']
+nullAlelles = ['0/0', r'0\|0', r'\./\.']  # items are OR-joined into a regex for str.contains, so the phased '|' must be escaped
 
 # Read command line arguments
 try:
@@ -83,25 +84,25 @@ def bestGeneCandidatesFilter(df):
 samples = [mumID, dadID, childID]
 
 # de novo
-dnList=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match('0/1')]
+dnList=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match(pat = r'0[/|]1|1\|0')]  # both parents null, child het (0/1, 0|1, 1|0)
 dnList.to_csv(childID+".dn."+inputFile, sep='\t')
 spliceCandidates=dnList[dnList['Func.refGene'].isin(ncSpliceTerms)]
 spliceCandidates.to_csv(childID+".dn.SpliceCandidates."+inputFile, sep='\t')
 dnList=bestGeneCandidatesFilter(df=dnList)
 dnList.to_csv(childID+".dn.BestGeneCandidates."+inputFile, sep='\t')
 
 # AR, identical by descent and X-linked 
-homList=ANNOVARtable[~ANNOVARtable[samples[0]].str.match('1/1') & ~ANNOVARtable[samples[1]].str.match('1/1') & ANNOVARtable[samples[2]].str.match('1/1')]
+homList=ANNOVARtable[~ANNOVARtable[samples[0]].str.match(pat = r'1[/|]1') & ~ANNOVARtable[samples[1]].str.match(pat = r'1[/|]1') & ANNOVARtable[samples[2]].str.match(pat = r'1[/|]1')]  # child hom-alt (1/1 or 1|1), neither parent hom-alt; '|' inside [] is literal
 homList.to_csv(childID+".ibdAndXl."+inputFile, sep='\t')
 spliceCandidates=homList[homList['Func.refGene'].isin(ncSpliceTerms)]
 spliceCandidates.to_csv(childID+".ibdAndXl.SpliceCandidates."+inputFile, sep='\t')
 homList=bestGeneCandidatesFilter(df=homList)
 homList.to_csv(childID+".ibdAndXl.BestGeneCandidates."+inputFile, sep='\t')
 
 # Compound het calls
-mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match('0/1') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match('0/1')]
+mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = r'0[/|]1|1\|0') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match(pat = r'0[/|]1|1\|0')]  # samples[0] and child het (0/1, 0|1, 1|0), samples[1] null
 mGenes=pd.unique(mNotfHets['Gene.refGene'])
-fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match('0/1') & ANNOVARtable[samples[2]].str.match('0/1')]
+fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match(pat = r'0[/|]1|1\|0') & ANNOVARtable[samples[2]].str.match(pat = r'0[/|]1|1\|0')]  # samples[1] and child het (0/1, 0|1, 1|0), samples[0] null
 fGenes=pd.unique(fNotmHets['Gene.refGene'])
 seriesCHgenes=pd.Series(mGenes.tolist() + fGenes.tolist())
 chGenes=seriesCHgenes[seriesCHgenes.duplicated()]
@@ -123,7 +124,7 @@ def bestGeneCandidatesFilter(df):
 compHets.to_csv(childID+".ch.BestGeneCandidates."+inputFile, sep='\t')
 
 # AD
-hetList=ANNOVARtable[ANNOVARtable[samples[2]].str.match('0/1')]
+hetList=ANNOVARtable[ANNOVARtable[samples[2]].str.match(pat = r'0[/|]1|1\|0')]  # child het: 0/1, 0|1 or 1|0 (a bare '|' is regex alternation)
 #hetList.to_csv("allHets."+inputFile, sep='\t') #Not likely to be worth writing out
 spliceCandidates=hetList[hetList['Func.refGene'].isin(ncSpliceTerms)]
 spliceCandidates.to_csv(childID+".allHets.SpliceCandidates."+inputFile, sep='\t')