Skip to content

Commit

Permalink
Edit python filtering. Clean up old scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
speleonut committed Dec 6, 2023
1 parent fc591f9 commit ff0137b
Show file tree
Hide file tree
Showing 8 changed files with 27 additions and 20 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
17 changes: 10 additions & 7 deletions familyKeyMatchingAfterANNOVAR_hg38.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ def usage():
print(
'''
# familyKeyMatchingAfterANNOVAR.py a script to filter affected family members for matched genotypes in a multisample ANNOVAR file
# also outputting a BestGeneCandidates file.
# for rare possibly disease causing alleles. Covers IBD, comp het, X-linked, autosomal dominant and clinVar flagged genotypes
# outputs various filtered tables for further analysis in excel.
#
# Usage familyKeyMatchingAfterANNOVAR.py -i ANNOVAR.table.txt -s sampleList.txt | [ -h | --help ]
#
Expand All @@ -22,19 +23,21 @@ def usage():
# Edit History (Name; Date; Description)
# Ali Gardner; 21/01/2021; Tweak to use with hg38 (no UK10, Wellderley, Exac.ro.1.filtered), change Func.gene to Func.refGene
# Thomas Litster; 27/04.2022; Added clinvar search (Will search for clinvar variants in all samples provided, even if variant is not shared)
# Mark Corbett; 06/12/2023; Add in phased genotypes
'''
)

# Set initial values
inputFile = ''
sampleFile = ''
geneTerms = ['exonic', 'splicing', 'UTR5', 'ncRNA_exonic', 'ncRNA_splicing']
notGeneTerms = ['downstream', 'intergenic', 'intronic', 'ncRNA_exonic', 'ncRNA_intronic', 'ncRNA_splicing', 'ncRNA_UTR3', 'ncRNA_UTR5', 'upstream', 'UTR3', 'UTR5']
filterTerms = ['.', 'PASS']
ncSpliceTerms = ['splicing', 'intronic']
filter005 = ['esp6500siv2_all', '1000g2015aug_all']
filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome']
filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome', 'AF']
pathogenicFilter = ['Pathogenic', 'Likely_pathogenic']
nullAlelles = ['0/0', '\./\.']
# Genotype strings treated as "variant not carried": homozygous reference or
# no-call, in unphased ('/') and phased ('|') notation.
# NOTE(review): the entries are written as regex fragments (see the escaped
# './.'), presumably OR-ed together with '|'.join(...) — confirm against the
# rest of this script. For that reason the phased separator must be escaped:
# a bare '0|0' would read as "0 or 0" and match any genotype containing a 0,
# including hets such as 0/1.
nullAlelles = [r'0/0', r'0\|0', r'\./\.', r'\.\|\.']
# Read command line arguments
try:
opts, args = getopt.getopt(sys.argv[1:],'hi:s:',['help'])
Expand Down Expand Up @@ -75,7 +78,7 @@ def usage():
dfCore=coreTable
for s in samples: # Maybe this loop could be an apply function?
currentSampleList=ANNOVARtable[[s]]
homList=currentSampleList[currentSampleList[s].str.match('1/1')]
# Keep rows where this sample is homozygous alt, unphased (1/1) or phased (1|1).
# The separator is placed in a character class so '|' is a literal character;
# written as '(1/1)|(1|1)' the inner pipe is regex alternation and the pattern
# matches every genotype that merely starts with 1 (e.g. 1|0 or 1/2).
homList=currentSampleList[currentSampleList[s].str.match(r'1[/|]1')]
dfCore = pd.concat([dfCore,homList], axis=1, join='inner') # Add , sort='False' once Ubuntu is upgraded

dfCore.to_csv("ibdAndXl."+inputFile, sep='\t')
Expand All @@ -86,7 +89,7 @@ def usage():
dfCore=dfCore[(dfCore[filter0001].apply(pd.to_numeric, errors='coerce').fillna(0).lt(0.0001)).all(axis=1)]

#BestGeneCandidates
bgc=dfCore[~dfCore['Func.refGene'].isin(notGeneTerms)]
bgc=dfCore[dfCore['Func.refGene'].isin(geneTerms)]
bgc.to_csv("ibdAndXl.BestGeneCandidates."+inputFile, sep='\t')

# Candidates to test with spliceAI
Expand All @@ -97,7 +100,7 @@ def usage():
dfCore=coreTable
for s in samples:
currentSampleList=ANNOVARtable[[s]]
homList=currentSampleList[currentSampleList[s].str.match('0/1')]
# Keep rows where this sample is heterozygous: unphased 0/1, or phased 0|1 / 1|0.
# (The variable keeps the name homList because the following concat uses it.)
# The pipe must be a literal: written as '(0/1)|(0|1)' the inner '|' is regex
# alternation, so the pattern matches any genotype starting with 0 or 1,
# including 0/0 and 1/1.
homList=currentSampleList[currentSampleList[s].str.match(r'0[/|]1|1\|0')]
dfCore = pd.concat([dfCore,homList], axis=1, join='inner') # Add , sort='False' once Ubuntu is upgraded

dfCore.to_csv("het."+inputFile, sep='\t')
Expand All @@ -106,7 +109,7 @@ def usage():
dfCore=dfCore[(dfCore[filter0001].apply(pd.to_numeric, errors='coerce').fillna(0).lt(0.0001)).all(axis=1)]

#BestGeneCandidates
bgc=dfCore[~dfCore['Func.refGene'].isin(notGeneTerms)]
bgc=dfCore[dfCore['Func.refGene'].isin(geneTerms)]
bgc.to_csv("het.BestGeneCandidates."+inputFile, sep='\t')

# Find candidates to test with spliceAI
Expand Down
17 changes: 10 additions & 7 deletions preConceptionTesting.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,22 @@ def usage():
# Script created by Mark Corbett on 20/12/2019
# Contact: mark.corbett at adelaide.edu dot au
# Edit History (Name; Date; Description)
# Mark Corbett; 06/12/2023; Add in phased genotypes and update ANNOVAR field names
#
'''
)

# Set initial values
inputFile = ''
sampleFile = ''
notGeneTerms = ['downstream', 'intergenic', 'intronic', 'ncRNA_exonic', 'ncRNA_intronic', 'ncRNA_splicing', 'ncRNA_UTR3', 'ncRNA_UTR5', 'upstream', 'UTR3', 'UTR5', 'upstream;downstream']
geneTerms = ['exonic', 'splicing', 'UTR5', 'ncRNA_exonic', 'ncRNA_splicing']
notGeneTerms = ['downstream', 'intergenic', 'intronic', 'ncRNA_exonic', 'ncRNA_intronic', 'ncRNA_splicing', 'ncRNA_UTR3', 'ncRNA_UTR5', 'upstream', 'UTR3', 'UTR5']
filterTerms = ['.', 'PASS']
ncSpliceTerms = ['splicing', 'intronic']
filter005 = ['esp6500siv2_all', '1000g2015aug_all', 'UK10K-AF-all']
filter0001 = ['ExAC.r0.1.filtered', 'exac03', 'gnomad211_exome', 'gnomad211_genome']
nullAlelles = ['0/0', '\./\.']
filter005 = ['esp6500siv2_all', '1000g2015aug_all']
filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome', 'AF']
pathogenicFilter = ['Pathogenic', 'Likely_pathogenic']
# Genotype strings treated as "variant not carried": homozygous reference or
# no-call, in unphased ('/') and phased ('|') notation.
# These are regex fragments that are OR-ed together with '|'.join(nullAlelles)
# in the compound-het filters, so the phased separator must be escaped: a bare
# '0|0' would read as "0 or 0" and match any genotype containing a 0.
# (The previous version also had a '.' where a ',' was needed, which is a
# SyntaxError.)
nullAlelles = [r'0/0', r'0\|0', r'\./\.', r'\.\|\.']
# Read command line arguments
try:
opts, args = getopt.getopt(sys.argv[1:],'hi:m:f:',['help'])
Expand Down Expand Up @@ -75,17 +78,17 @@ def bestGeneCandidatesFilter(df):
ANNOVARtable=pd.read_csv(inputFile, sep='\t', index_col = num_cols)
samples = [mumID, dadID]

hetList=ANNOVARtable[ANNOVARtable[samples[0]].str.match('0/1') & ANNOVARtable[samples[1]].str.match('0/1')]
# Rows where both samples (samples[0] and samples[1]) are heterozygous.
# Accepts unphased 0/1 and phased 0|1 / 1|0. The pipe is kept literal (character
# class / backslash escape); written as '(0/1)|(0|1)' the inner '|' is regex
# alternation and the pattern matches any genotype starting with 0 or 1.
hetList=ANNOVARtable[
    ANNOVARtable[samples[0]].str.match(r'0[/|]1|1\|0')
    & ANNOVARtable[samples[1]].str.match(r'0[/|]1|1\|0')
]
hetList.to_csv("allSharedHetCalls."+inputFile, sep='\t')

#Generic filters for most likely pathogenic
hetList=bestGeneCandidatesFilter(df=hetList)
hetList.to_csv("allSharedHetCalls.BestGeneCandidates."+inputFile, sep='\t')

# Compound het calls
mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match('0/1') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles))]
# Rows where samples[0] is heterozygous and samples[1] has a null genotype
# (any of the nullAlelles patterns).
# Het pattern accepts unphased 0/1 and phased 0|1 / 1|0; the pipe is kept
# literal because an unescaped '(0|1)' matches any genotype starting with 0 or 1.
mNotfHets=ANNOVARtable[
    ANNOVARtable[samples[0]].str.match(r'0[/|]1|1\|0')
    & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles))
]
mGenes=pd.unique(mNotfHets['Gene.refGene'])
fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match('0/1')]
# Rows where samples[0] has a null genotype (any of the nullAlelles patterns)
# and samples[1] is heterozygous.
# Het pattern accepts unphased 0/1 and phased 0|1 / 1|0; the pipe is kept
# literal because an unescaped '(0|1)' matches any genotype starting with 0 or 1.
fNotmHets=ANNOVARtable[
    ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles))
    & ANNOVARtable[samples[1]].str.match(r'0[/|]1|1\|0')
]
fGenes=pd.unique(fNotmHets['Gene.refGene'])
seriesCHgenes=pd.Series(mGenes.tolist() + fGenes.tolist())
chGenes=seriesCHgenes[seriesCHgenes.duplicated()]
Expand Down
13 changes: 7 additions & 6 deletions trioKeyMatchingAfterANNOVAR_hg38.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def usage():
# Contact: mark.corbett at adelaide.edu dot au
# Edit History (Date; Name; Description)
# 08/12/2021; Mark; Add gnomADv3 genotypes AF column to the 0.0001 filter list. Fix Gene.refGene. Change best gene candidate filter to whitelist.
# Mark Corbett; 06/12/2023; Add in phased genotypes
#
'''
)
Expand All @@ -37,7 +38,7 @@ def usage():
filter005 = ['esp6500siv2_all', '1000g2015aug_all']
filter0001 = ['exac03', 'gnomad211_exome', 'gnomad211_genome', 'AF']
pathogenicFilter = ['Pathogenic', 'Likely_pathogenic']
nullAlelles = ['0/0', '\./\.']
# Genotype strings treated as "variant not carried": homozygous reference or
# no-call, in unphased ('/') and phased ('|') notation.
# These are regex fragments that are OR-ed together with '|'.join(nullAlelles)
# in the de novo and compound-het filters, so the phased separator must be
# escaped: a bare '0|0' would read as "0 or 0" and match any genotype
# containing a 0. (The previous version also had a '.' where a ',' was needed,
# which is a SyntaxError.)
nullAlelles = [r'0/0', r'0\|0', r'\./\.', r'\.\|\.']

# Read command line arguments
try:
Expand Down Expand Up @@ -83,25 +84,25 @@ def bestGeneCandidatesFilter(df):
samples = [mumID, dadID, childID]

# de novo
dnList=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match('0/1')]
# De novo candidates: samples[0] and samples[1] (the parents) both have a null
# genotype and samples[2] (the child) is heterozygous.
# Het pattern accepts unphased 0/1 and phased 0|1 / 1|0; the pipe is kept
# literal because an unescaped '(0|1)' matches any genotype starting with 0 or 1.
dnList=ANNOVARtable[
    ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles))
    & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles))
    & ANNOVARtable[samples[2]].str.match(r'0[/|]1|1\|0')
]
dnList.to_csv(childID+".dn."+inputFile, sep='\t')
spliceCandidates=dnList[dnList['Func.refGene'].isin(ncSpliceTerms)]
spliceCandidates.to_csv(childID+".dn.SpliceCandidates."+inputFile, sep='\t')
dnList=bestGeneCandidatesFilter(df=dnList)
dnList.to_csv(childID+".dn.BestGeneCandidates."+inputFile, sep='\t')

# AR, identical by descent and X-linked
homList=ANNOVARtable[~ANNOVARtable[samples[0]].str.match('1/1') & ~ANNOVARtable[samples[1]].str.match('1/1') & ANNOVARtable[samples[2]].str.match('1/1')]
# Rows where samples[2] (the child) is homozygous alt and neither samples[0]
# nor samples[1] (the parents) is.
# Accepts unphased 1/1 and phased 1|1. The separator sits in a character class
# so '|' is a literal; written as '(1/1)|(1|1)' the inner pipe is regex
# alternation and the pattern matches every genotype starting with 1.
homList=ANNOVARtable[
    ~ANNOVARtable[samples[0]].str.match(r'1[/|]1')
    & ~ANNOVARtable[samples[1]].str.match(r'1[/|]1')
    & ANNOVARtable[samples[2]].str.match(r'1[/|]1')
]
homList.to_csv(childID+".ibdAndXl."+inputFile, sep='\t')
spliceCandidates=homList[homList['Func.refGene'].isin(ncSpliceTerms)]
spliceCandidates.to_csv(childID+".ibdAndXl.SpliceCandidates."+inputFile, sep='\t')
homList=bestGeneCandidatesFilter(df=homList)
homList.to_csv(childID+".ibdAndXl.BestGeneCandidates."+inputFile, sep='\t')

# Compound het calls
mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match('0/1') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match('0/1')]
# Rows where samples[0] and samples[2] are heterozygous and samples[1] has a
# null genotype (any of the nullAlelles patterns).
# Het pattern accepts unphased 0/1 and phased 0|1 / 1|0; the pipe is kept
# literal because an unescaped '(0|1)' matches any genotype starting with 0 or 1.
mNotfHets=ANNOVARtable[
    ANNOVARtable[samples[0]].str.match(r'0[/|]1|1\|0')
    & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles))
    & ANNOVARtable[samples[2]].str.match(r'0[/|]1|1\|0')
]
mGenes=pd.unique(mNotfHets['Gene.refGene'])
fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match('0/1') & ANNOVARtable[samples[2]].str.match('0/1')]
# Rows where samples[1] and samples[2] are heterozygous and samples[0] has a
# null genotype (any of the nullAlelles patterns).
# Het pattern accepts unphased 0/1 and phased 0|1 / 1|0; the pipe is kept
# literal because an unescaped '(0|1)' matches any genotype starting with 0 or 1.
fNotmHets=ANNOVARtable[
    ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles))
    & ANNOVARtable[samples[1]].str.match(r'0[/|]1|1\|0')
    & ANNOVARtable[samples[2]].str.match(r'0[/|]1|1\|0')
]
fGenes=pd.unique(fNotmHets['Gene.refGene'])
seriesCHgenes=pd.Series(mGenes.tolist() + fGenes.tolist())
chGenes=seriesCHgenes[seriesCHgenes.duplicated()]
Expand All @@ -123,7 +124,7 @@ def bestGeneCandidatesFilter(df):
compHets.to_csv(childID+".ch.BestGeneCandidates."+inputFile, sep='\t')

# AD
hetList=ANNOVARtable[ANNOVARtable[samples[2]].str.match('0/1')]
# All rows where samples[2] (the child) is heterozygous.
# Accepts unphased 0/1 and phased 0|1 / 1|0; the pipe is kept literal because
# an unescaped '(0|1)' matches any genotype starting with 0 or 1 (including
# 0/0 and 1/1).
hetList=ANNOVARtable[ANNOVARtable[samples[2]].str.match(r'0[/|]1|1\|0')]
#hetList.to_csv("allHets."+inputFile, sep='\t') #Not likely to be worth writing out
spliceCandidates=hetList[hetList['Func.refGene'].isin(ncSpliceTerms)]
spliceCandidates.to_csv(childID+".allHets.SpliceCandidates."+inputFile, sep='\t')
Expand Down

0 comments on commit ff0137b

Please sign in to comment.