Correct regex pattern-match error for genotypes: escape the literal `|` in the patterns and add `1|0` to the het patterns

speleonut · Dec 6, 2023 · 07f0f11
1 parent ff0137b
commit 07f0f11
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 10 deletions.
diff --git a/familyKeyMatchingAfterANNOVAR_hg38.py b/familyKeyMatchingAfterANNOVAR_hg38.py
@@ -78,7 +78,7 @@ def usage():
 dfCore=coreTable
 for s in samples: # Maybe this loop could be an apply function?
     currentSampleList=ANNOVARtable[[s]]
-    homList=currentSampleList[currentSampleList[s].str.match(pat = '(1/1)|(1|1)')]
+    homList=currentSampleList[currentSampleList[s].str.match(pat = r'(1/1)|(1\|1)')] # raw string keeps \| as an escaped (literal) pipe: matches values starting with 1/1 or 1|1
     dfCore = pd.concat([dfCore,homList], axis=1, join='inner') # Add , sort='False' once Ubuntu is upgraded
 
 dfCore.to_csv("ibdAndXl."+inputFile, sep='\t')
@@ -100,7 +100,7 @@ def usage():
 dfCore=coreTable 
 for s in samples:  
     currentSampleList=ANNOVARtable[[s]]
-    homList=currentSampleList[currentSampleList[s].str.match(pat = '(0/1)|(0|1)')]
+    homList=currentSampleList[currentSampleList[s].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] # raw string keeps \| as an escaped (literal) pipe: matches values starting with 0/1, 0|1 or 1|0
     dfCore = pd.concat([dfCore,homList], axis=1, join='inner') # Add , sort='False' once Ubuntu is upgraded
 
 dfCore.to_csv("het."+inputFile, sep='\t')

diff --git a/preConceptionTesting.py b/preConceptionTesting.py
@@ -78,17 +78,17 @@ def bestGeneCandidatesFilter(df):
 ANNOVARtable=pd.read_csv(inputFile, sep='\t', index_col = num_cols)
 samples = [mumID, dadID]
 
-hetList=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = '(0/1)|(0|1)') & ANNOVARtable[samples[1]].str.match(pat = '(0/1)|(0|1)')]
+hetList=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)') & ANNOVARtable[samples[1]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] # raw strings keep \| as a literal pipe: both samples must start with 0/1, 0|1 or 1|0
 hetList.to_csv("allSharedHetCalls."+inputFile, sep='\t')
 
 #Generic filters for most likely pathogenic
 hetList=bestGeneCandidatesFilter(df=hetList)
 hetList.to_csv("allSharedHetCalls.BestGeneCandidates."+inputFile, sep='\t')
 
 # Compound het calls
-mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = '(0/1)|(0|1)') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles))]
+mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles))] # raw string keeps \| as a literal pipe; NOTE(review): the joined nullAlelles is also used as a regex - confirm its entries escape | and .
 mGenes=pd.unique(mNotfHets['Gene.refGene'])
-fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match(pat = '(0/1)|(0|1)')]
+fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] # raw string keeps \| as a literal pipe; NOTE(review): the joined nullAlelles is also used as a regex - confirm its entries escape | and .
 fGenes=pd.unique(fNotmHets['Gene.refGene'])
 seriesCHgenes=pd.Series(mGenes.tolist() + fGenes.tolist())
 chGenes=seriesCHgenes[seriesCHgenes.duplicated()]

diff --git a/trioKeyMatchingAfterANNOVAR_hg38.py b/trioKeyMatchingAfterANNOVAR_hg38.py
@@ -84,25 +84,25 @@ def bestGeneCandidatesFilter(df):
 samples = [mumID, dadID, childID]
 
 # de novo
-dnList=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match(pat = '(0/1)|(0|1)')]
+dnList=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] # raw string keeps \| as a literal pipe; NOTE(review): the joined nullAlelles is also used as a regex - confirm its entries escape | and .
 dnList.to_csv(childID+".dn."+inputFile, sep='\t')
 spliceCandidates=dnList[dnList['Func.refGene'].isin(ncSpliceTerms)]
 spliceCandidates.to_csv(childID+".dn.SpliceCandidates."+inputFile, sep='\t')
 dnList=bestGeneCandidatesFilter(df=dnList)
 dnList.to_csv(childID+".dn.BestGeneCandidates."+inputFile, sep='\t')
 
 # AR, identical by descent and X-linked 
-homList=ANNOVARtable[~ANNOVARtable[samples[0]].str.match(pat = '(1/1)|(1|1)') & ~ANNOVARtable[samples[1]].str.match(pat = '(1/1)|(1|1)') & ANNOVARtable[samples[2]].str.match(pat = '(1/1)|(1|1)')]
+homList=ANNOVARtable[~ANNOVARtable[samples[0]].str.match(pat = r'(1/1)|(1\|1)') & ~ANNOVARtable[samples[1]].str.match(pat = r'(1/1)|(1\|1)') & ANNOVARtable[samples[2]].str.match(pat = r'(1/1)|(1\|1)')] # raw strings keep \| as a literal pipe: samples[2] starts with 1/1 or 1|1, samples[0] and samples[1] do not
 homList.to_csv(childID+".ibdAndXl."+inputFile, sep='\t')
 spliceCandidates=homList[homList['Func.refGene'].isin(ncSpliceTerms)]
 spliceCandidates.to_csv(childID+".ibdAndXl.SpliceCandidates."+inputFile, sep='\t')
 homList=bestGeneCandidatesFilter(df=homList)
 homList.to_csv(childID+".ibdAndXl.BestGeneCandidates."+inputFile, sep='\t')
 
 # Compound het calls
-mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = '(0/1)|(0|1)') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match(pat = '(0/1)|(0|1)')]
+mNotfHets=ANNOVARtable[ANNOVARtable[samples[0]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)') & ANNOVARtable[samples[1]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[2]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] # raw strings keep \| as a literal pipe; NOTE(review): the joined nullAlelles is also used as a regex - confirm its entries escape | and .
 mGenes=pd.unique(mNotfHets['Gene.refGene'])
-fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match(pat = '(0/1)|(0|1)') & ANNOVARtable[samples[2]].str.match(pat = '(0/1)|(0|1)')]
+fNotmHets=ANNOVARtable[ANNOVARtable[samples[0]].str.contains('|'.join(nullAlelles)) & ANNOVARtable[samples[1]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)') & ANNOVARtable[samples[2]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] # raw strings keep \| as a literal pipe; NOTE(review): the joined nullAlelles is also used as a regex - confirm its entries escape | and .
 fGenes=pd.unique(fNotmHets['Gene.refGene'])
 seriesCHgenes=pd.Series(mGenes.tolist() + fGenes.tolist())
 chGenes=seriesCHgenes[seriesCHgenes.duplicated()]
@@ -124,7 +124,7 @@ def bestGeneCandidatesFilter(df):
 compHets.to_csv(childID+".ch.BestGeneCandidates."+inputFile, sep='\t')
 
 # AD
-hetList=ANNOVARtable[ANNOVARtable[samples[2]].str.match(pat = '(0/1)|(0|1)')]
+hetList=ANNOVARtable[ANNOVARtable[samples[2]].str.match(pat = r'(0/1)|(0\|1)|(1\|0)')] # raw string keeps \| as an escaped (literal) pipe: matches values starting with 0/1, 0|1 or 1|0
 #hetList.to_csv("allHets."+inputFile, sep='\t') #Not likely to be worth writing out
 spliceCandidates=hetList[hetList['Func.refGene'].isin(ncSpliceTerms)]
 spliceCandidates.to_csv(childID+".allHets.SpliceCandidates."+inputFile, sep='\t')