From ff0137beaa2b4070f10223714287ca629ba63107 Mon Sep 17 00:00:00 2001
From: speleonut <37854406+speleonut@users.noreply.github.com>
Date: Wed, 6 Dec 2023 11:22:00 +1030
Subject: [PATCH] Edit python filtering. Clean up old scripts

---
 .../ANNOVARv3_older.sh                          |  0
 .../AnnovarGenomeSummaryCombo.v2_older.sh       |  0
 .../familyKeyMatchingAfterANNOVAR.py            |  0
 .../trioKeyMatchingAfterANNOVAR.sh              |  0
 .../vcfFamilyBasedFilterAndAnnotate.sh          |  0
 familyKeyMatchingAfterANNOVAR_hg38.py           | 17 ++++++++++-------
 preConceptionTesting.py                         | 17 ++++++++++-------
 trioKeyMatchingAfterANNOVAR_hg38.py             | 13 +++++++------
 8 files changed, 27 insertions(+), 20 deletions(-)
 rename ANNOVARv3_older.sh => archive/ANNOVARv3_older.sh (100%)
 mode change 100755 => 100644
 rename AnnovarGenomeSummaryCombo.v2_older.sh => archive/AnnovarGenomeSummaryCombo.v2_older.sh (100%)
 mode change 100755 => 100644
 rename familyKeyMatchingAfterANNOVAR.py => archive/familyKeyMatchingAfterANNOVAR.py (100%)
 mode change 100755 => 100644
 rename trioKeyMatchingAfterANNOVAR.sh => archive/trioKeyMatchingAfterANNOVAR.sh (100%)
 rename vcfFamilyBasedFilterAndAnnotate.sh => archive/vcfFamilyBasedFilterAndAnnotate.sh (100%)
 mode change 100755 => 100644

diff --git a/ANNOVARv3_older.sh b/archive/ANNOVARv3_older.sh
old mode 100755
new mode 100644
similarity index 100%
rename from ANNOVARv3_older.sh
rename to archive/ANNOVARv3_older.sh
diff --git a/AnnovarGenomeSummaryCombo.v2_older.sh b/archive/AnnovarGenomeSummaryCombo.v2_older.sh
old mode 100755
new mode 100644
similarity index 100%
rename from AnnovarGenomeSummaryCombo.v2_older.sh
rename to archive/AnnovarGenomeSummaryCombo.v2_older.sh
diff --git a/familyKeyMatchingAfterANNOVAR.py b/archive/familyKeyMatchingAfterANNOVAR.py
old mode 100755
new mode 100644
similarity index 100%
rename from familyKeyMatchingAfterANNOVAR.py
rename to archive/familyKeyMatchingAfterANNOVAR.py
diff --git a/trioKeyMatchingAfterANNOVAR.sh b/archive/trioKeyMatchingAfterANNOVAR.sh
similarity index 100%
rename from trioKeyMatchingAfterANNOVAR.sh
rename to archive/trioKeyMatchingAfterANNOVAR.sh
diff --git a/vcfFamilyBasedFilterAndAnnotate.sh b/archive/vcfFamilyBasedFilterAndAnnotate.sh
old mode 100755
new mode 100644
similarity index 100%
rename from vcfFamilyBasedFilterAndAnnotate.sh
rename to archive/vcfFamilyBasedFilterAndAnnotate.sh
diff --git a/familyKeyMatchingAfterANNOVAR_hg38.py b/familyKeyMatchingAfterANNOVAR_hg38.py
index 8783b54..0dc2793 100755
--- a/familyKeyMatchingAfterANNOVAR_hg38.py
+++ b/familyKeyMatchingAfterANNOVAR_hg38.py
@@ -8,7 +8,8 @@ def usage():
     print(
 '''
 # familyKeyMatchingAfterANNOVAR.py a script to filter affected family members for matched genotypes in a multisample ANNOVAR file
-# also outputting a BestGeneCandidates file.  
+# for rare possibly disease causing alleles. Covers IBD, comp het, X-linked, autosomal dominant and clinVar flagged genotypes
+# outputs various filtered tables for further analysis in excel.
 #
 # Usage familyKeyMatchingAfterANNOVAR.py -i ANNOVAR.table.txt -s sampleList.txt | [ -h | --help ]
 #
@@ -22,19 +23,21 @@ def usage():
 # Edit History (Name; Date; Description)
 # Ali Gardner; 21/01/2021; Tweak to use with hg38 (no UK10, Wellderley, Exac.ro.1.filtered), change Func.gene to Func.refGene
 # Thomas Litster; 27/04.2022; Added clinvar search (Will search for clinvar variants in all samples provided, even if variant is not shared)
+# Mark Corbett; 06/12/2023; Add in phased genotypes
 '''
          )
 
 # Set initial values
 inputFile = ''
 sampleFile = ''
+geneTerms = ['exonic', 'splicing', 'UTR5', 'ncRNA_exonic', 'ncRNA_splicing']
 notGeneTerms = ['downstream', 'intergenic', 'intronic', 'ncRNA_exonic', 'ncRNA_intronic', 'ncRNA_splicing', 'ncRNA_UTR3', 'ncRNA_UTR5', 'upstream', 'UTR3', 'UTR5']
 filterTerms = ['.', 'PASS']
 ncSpliceTerms = ['splicing', 'intronic']
 filter005 = ['esp6500siv2_all', '1000g2015aug_all']
-filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome']
+filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome', 'AF']
 pathogenicFilter = ['Pathogenic', 'Likely_pathogenic']
-nullAlelles = ['0/0', '\./\.']
+nullAlelles = ['0/0', r'0\|0', r'\./\.']  # regex fragments: the literal '|' of a phased 0|0 must be escaped
 # Read command line arguments
 try:
     opts, args = getopt.getopt(sys.argv[1:],'hi:s:',['help'])
@@ -75,7 +78,7 @@ def usage():
 dfCore=coreTable
 for s in samples: # Maybe this loop could be an apply function?
     currentSampleList=ANNOVARtable[[s]]
-    homList=currentSampleList[currentSampleList[s].str.match('1/1')]
+    homList=currentSampleList[currentSampleList[s].str.match(pat = r'1[/|]1')]  # unphased 1/1 or phased 1|1 ('|' is literal inside [])
     dfCore = pd.concat([dfCore,homList], axis=1, join='inner') # Add , sort='False' once Ubuntu is upgraded
 
 dfCore.to_csv("ibdAndXl."+inputFile, sep='\t')
@@ -86,7 +89,7 @@ def usage():
 dfCore=dfCore[(dfCore[filter0001].apply(pd.to_numeric, errors='coerce').fillna(0).lt(0.0001)).all(axis=1)]
 
 #BestGeneCandidates
-bgc=dfCore[~dfCore['Func.refGene'].isin(notGeneTerms)]
+bgc=dfCore[dfCore['Func.refGene'].isin(geneTerms)]
 bgc.to_csv("ibdAndXl.BestGeneCandidates."+inputFile, sep='\t')
 
 # Cadidates to test with spliceAI
@@ -97,7 +100,7 @@ def usage():
 dfCore=coreTable 
 for s in samples:  
     currentSampleList=ANNOVARtable[[s]]
-    homList=currentSampleList[currentSampleList[s].str.match('0/1')]
+    homList=currentSampleList[currentSampleList[s].str.match(pat = r'0[/|]1')]  # unphased 0/1 or phased 0|1 ('|' is literal inside [])
     dfCore = pd.concat([dfCore,homList], axis=1, join='inner') # Add , sort='False' once Ubuntu is upgraded
 
 dfCore.to_csv("het."+inputFile, sep='\t')
@@ -106,7 +109,7 @@ def usage():
 dfCore=dfCore[(dfCore[filter0001].apply(pd.to_numeric, errors='coerce').fillna(0).lt(0.0001)).all(axis=1)]
 
 #BestGeneCandidates
-bgc=dfCore[~dfCore['Func.refGene'].isin(notGeneTerms)]
+bgc=dfCore[dfCore['Func.refGene'].isin(geneTerms)]
 bgc.to_csv("het.BestGeneCandidates."+inputFile, sep='\t')
 
 # Find cadidates to test with spliceAI
diff --git a/preConceptionTesting.py b/preConceptionTesting.py
index 12da66f..f2ec446 100755
--- a/preConceptionTesting.py
+++ b/preConceptionTesting.py
@@ -21,6 +21,7 @@ def usage():
 # Script created by Mark Corbett on 20/12/2019
 # Contact: mark.corbett at adelaide.edu dot au
 # Edit History (Name; Date; Description)
+# Mark Corbett; 06/12/2023; Add in phased genotypes and update ANNOVAR field names
 #
 '''
          )
@@ -28,12 +29,14 @@ def usage():
 # Set initial values
 inputFile = ''
 sampleFile = ''
-notGeneTerms = ['downstream', 'intergenic', 'intronic', 'ncRNA_exonic', 'ncRNA_intronic', 'ncRNA_splicing', 'ncRNA_UTR3', 'ncRNA_UTR5', 'upstream', 'UTR3', 'UTR5', 'upstream;downstream']
+geneTerms = ['exonic', 'splicing', 'UTR5', 'ncRNA_exonic', 'ncRNA_splicing']
+notGeneTerms = ['downstream', 'intergenic', 'intronic', 'ncRNA_exonic', 'ncRNA_intronic', 'ncRNA_splicing', 'ncRNA_UTR3', 'ncRNA_UTR5', 'upstream', 'UTR3', 'UTR5']
 filterTerms = ['.', 'PASS']
 ncSpliceTerms = ['splicing', 'intronic']
-filter005 = ['esp6500siv2_all', '1000g2015aug_all', 'UK10K-AF-all']
-filter0001 = ['ExAC.r0.1.filtered', 'exac03', 'gnomad211_exome', 'gnomad211_genome']
-nullAlelles = ['0/0', '\./\.']
+filter005 = ['esp6500siv2_all', '1000g2015aug_all']
+filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome', 'AF']
+pathogenicFilter = ['Pathogenic', 'Likely_pathogenic']
+nullAlelles = ['0/0', r'0\|0', r'\./\.']  # regex fragments OR-joined with '|' below, so the literal pipe in 0|0 is escaped
 # Read command line arguments
 try:
     opts, args = getopt.getopt(sys.argv[1:],'hi:m:f:',['help'])
@@ -75,7 +78,7 @@ def bestGeneCandidatesFilter(df):
 ANNOVARtable=pd.read_csv(inputFile, sep='\t', index_col = num_cols)
 samples = [mumID, dadID]
 
-hetList=ANNOVARtable[ANNOVARtable[samples[0]].str.match('0/1') & ANNOVARtable[samples[1]].str.match('0/1')]
+hetList=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = r'0[/|]1') & ANNOVARtable[samples[1]].str.match(pat = r'0[/|]1')]  # both parents het: 0/1 or phased 0|1
 hetList.to_csv("allSharedHetCalls."+inputFile, sep='\t')
 
 #Generic filters for most likely pathogenic
@@ -83,9 +86,9 @@ def bestGeneCandidatesFilter(df):
 hetList.to_csv("allSharedHetCalls.BestGeneCandidates."+inputFile, sep='\t')
 
 # Compound het calls
-mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match('0/1') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles))]
+mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = r'0[/|]1') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles))]  # samples[0] het (0/1 or 0|1), samples[1] null
 mGenes=pd.unique(mNotfHets['Gene.refGene'])
-fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match('0/1')]
+fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match(pat = r'0[/|]1')]  # samples[0] null, samples[1] het (0/1 or 0|1)
 fGenes=pd.unique(fNotmHets['Gene.refGene'])
 seriesCHgenes=pd.Series(mGenes.tolist() + fGenes.tolist())
 chGenes=seriesCHgenes[seriesCHgenes.duplicated()]
diff --git a/trioKeyMatchingAfterANNOVAR_hg38.py b/trioKeyMatchingAfterANNOVAR_hg38.py
index ea39f62..7795671 100755
--- a/trioKeyMatchingAfterANNOVAR_hg38.py
+++ b/trioKeyMatchingAfterANNOVAR_hg38.py
@@ -24,6 +24,7 @@ def usage():
 # Contact: mark.corbett at adelaide.edu dot au
 # Edit History (Date; Name; Description)
 # 08/12/2021; Mark; Add gnomADv3 geneotypes AF column to the 0.0001 filter list. Fix Gene.refGene. Change best gene candidate filter to whitelist.
+# 06/12/2023; Mark Corbett; Add in phased genotypes
 #
 '''
          )
@@ -37,7 +38,7 @@ def usage():
 filter005 = ['esp6500siv2_all', '1000g2015aug_all']
 filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome', 'AF']
 pathogenicFilter = ['Pathogenic', 'Likely_pathogenic']
-nullAlelles = ['0/0', '\./\.']
+nullAlelles = ['0/0', r'0\|0', r'\./\.']  # regex fragments OR-joined with '|' below, so the literal pipe in 0|0 is escaped
 
 # Read command line arguments
 try:
@@ -83,7 +84,7 @@ def bestGeneCandidatesFilter(df):
 samples = [mumID, dadID, childID]
 
 # de novo
-dnList=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match('0/1')]
+dnList=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match(pat = r'0[/|]1')]  # samples[0] and samples[1] null, samples[2] het (0/1 or 0|1)
 dnList.to_csv(childID+".dn."+inputFile, sep='\t')
 spliceCandidates=dnList[dnList['Func.refGene'].isin(ncSpliceTerms)]
 spliceCandidates.to_csv(childID+".dn.SpliceCandidates."+inputFile, sep='\t')
@@ -91,7 +92,7 @@ def bestGeneCandidatesFilter(df):
 dnList.to_csv(childID+".dn.BestGeneCandidates."+inputFile, sep='\t')
 
 # AR, identical by descent and X-linked 
-homList=ANNOVARtable[~ANNOVARtable[samples[0]].str.match('1/1') & ~ANNOVARtable[samples[1]].str.match('1/1') & ANNOVARtable[samples[2]].str.match('1/1')]
+homList=ANNOVARtable[~ANNOVARtable[samples[0]].str.match(pat = r'1[/|]1') & ~ANNOVARtable[samples[1]].str.match(pat = r'1[/|]1') & ANNOVARtable[samples[2]].str.match(pat = r'1[/|]1')]  # samples[2] hom alt (1/1 or 1|1), samples[0] and samples[1] not
 homList.to_csv(childID+".ibdAndXl."+inputFile, sep='\t')
 spliceCandidates=homList[homList['Func.refGene'].isin(ncSpliceTerms)]
 spliceCandidates.to_csv(childID+".ibdAndXl.SpliceCandidates."+inputFile, sep='\t')
@@ -99,9 +100,9 @@ def bestGeneCandidatesFilter(df):
 homList.to_csv(childID+".ibdAndXl.BestGeneCandidates."+inputFile, sep='\t')
 
 # Compound het calls
-mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match('0/1') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match('0/1')]
+mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = r'0[/|]1') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match(pat = r'0[/|]1')]  # samples[0] and samples[2] het (0/1 or 0|1), samples[1] null
 mGenes=pd.unique(mNotfHets['Gene.refGene'])
-fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match('0/1') & ANNOVARtable[samples[2]].str.match('0/1')]
+fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match(pat = r'0[/|]1') & ANNOVARtable[samples[2]].str.match(pat = r'0[/|]1')]  # samples[1] and samples[2] het (0/1 or 0|1), samples[0] null
 fGenes=pd.unique(fNotmHets['Gene.refGene'])
 seriesCHgenes=pd.Series(mGenes.tolist() + fGenes.tolist())
 chGenes=seriesCHgenes[seriesCHgenes.duplicated()]
@@ -123,7 +124,7 @@ def bestGeneCandidatesFilter(df):
 compHets.to_csv(childID+".ch.BestGeneCandidates."+inputFile, sep='\t')
 
 # AD
-hetList=ANNOVARtable[ANNOVARtable[samples[2]].str.match('0/1')]
+hetList=ANNOVARtable[ANNOVARtable[samples[2]].str.match(pat = r'0[/|]1')]  # samples[2] het: unphased 0/1 or phased 0|1
 #hetList.to_csv("allHets."+inputFile, sep='\t') #Not likely to be worth writing out
 spliceCandidates=hetList[hetList['Func.refGene'].isin(ncSpliceTerms)]
 spliceCandidates.to_csv(childID+".allHets.SpliceCandidates."+inputFile, sep='\t')