fastQ file bug fixed

OmarOakheart · Jul 30, 2020 · f74e10f · f74e10f
1 parent 38876c5
commit f74e10f
Show file tree

Hide file tree

Showing 16 changed files with 100 additions and 61 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,4 @@ example/reference/SaCe_Chr2.fasta.bwt
 example/reference/SaCe_Chr2.fasta.ann
 example/reference/SaCe_Chr2.fasta.amb
 example/reference/SaCe_Chr2.fasta-ht-13-2.2.ngm
+Thumbs.db
diff --git a/bin/__pycache__/nPhasePipelineFunctions_LOCAL.cpython-38.pyc b/bin/__pycache__/nPhasePipelineFunctions_LOCAL.cpython-38.pyc
diff --git a/bin/__pycache__/nPhase_LOCAL.cpython-38.pyc b/bin/__pycache__/nPhase_LOCAL.cpython-38.pyc
diff --git a/bin/nPhase.py b/bin/nPhase.py
@@ -348,7 +348,7 @@ def keepUsefulSplitReadsChr(usefulSplitReadProfiles,chimericReadIDict,mergedClus
                 chrmSNPs[position.split(":")[0]][position]=info
             i=0
             for chrm, posInfo in chrmSNPs.items():
-                usefulSplitReads[splitRead+"_"+str(i)]=posInfo
+                usefulSplitReads[splitRead+"_VCSTTP_"+str(i)]=posInfo
                 i+=1
     return usefulSplitReads
 
@@ -459,17 +459,17 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref
     chimericNames=set()
 
     for readName, sequence in SNPAssignments.items():
-        if len(readName[len(strainName)+1:].split("_"))>1:
-            chimericNames.add("_".join(readName.split("_")[:-1]))
+        if len(readName.split("_VCSTTP_"))>1:
+            chimericNames.add(readName.split("_VCSTTP_")[0])
 
     print("Split reads identified.")
 
     #Applying the context depth info to all of the reads
     chimericReadIDict={}
     for readName, sequence in SNPAssignments.items():
-        if len(readName[len(strainName)+1:].split("_"))>1 or readName in chimericNames:
-            if len(readName[len(strainName)+1:].split("_"))>1:
-                readName="_".join(readName.split("_")[:-1])
+        if len(readName.split("_VCSTTP_"))>1 or readName in chimericNames:
+            if len(readName.split("_VCSTTP_"))>1:
+                readName="_VCSTTP_".join(readName.split("_VCSTTP_")[:-1])
             allBaseDict={}
             if len(sequence)>=10:
                 sequence=set(sequence)
@@ -528,6 +528,8 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref
 
     newChimericDict=keepUsefulSplitReadsChr(usefulSplitReadProfiles,chimericReadIDict,mergedClusterEdges)
 
+    #print(newChimericDict["chimeric-CCN_ACA_BMB_0"])
+
     cleanAllReadIDict={}
     cleanAllReadIDict.update(readIDict)
     cleanAllReadIDict.update(newChimericDict)
@@ -545,14 +547,16 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref
 
     #Create sets of fastQ read names, to be turned into FASTQ files by another script
 
-    st=len(strainName)
-
     clusterReadText=""
 
     for clusterName, clusterInfo in cleanFullClusters.items():
         for readName in clusterInfo["names"]:
+            x=readName
             readName=readName.replace("chimeric-","")
-            readName=readName[:st+1]+readName[st+1:].split("_")[0]
+            readPart=readName.split("_VCSTTP_")
+            if len(readPart)>1:
+                readPart=readPart[:-1]
+            readName="_VCSTTP_".join(readPart)
             clusterReadText+="\t".join([clusterName,readName])+"\n"
 
     clusterReadFile=open(outFolder+"/"+strainName+"_"+str(minOvl)+"_"+str(minSim)+"_"+str(maxID)+"_"+str(minLen)+"_clusterReadNames.tsv","w")
@@ -577,5 +581,3 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref
     return "Phasing over"
 
 
-
-
diff --git a/bin/nPhasePipelineFunctions.py b/bin/nPhasePipelineFunctions.py
@@ -37,10 +37,13 @@ def longReadMapping(strainName,longReads,reference,outputFolder,flag,longReadPla
     if p.stderr!="" or p.stdout !="":
         outputLog+="STDERR:\n\n"+p.stderr+"\n\nSTDOUT:\n\n"+p.stdout+"\n\n"
 
-    #Cleaning up
-    os.remove(samFile)
-    os.remove(passSamFile)
-    os.remove(sortedHeaderSamFile)
+    try:
+        #Cleaning up
+        os.remove(samFile)
+        os.remove(passSamFile)
+        os.remove(sortedHeaderSamFile)
+    except:
+        return outputLog,"Long reads not mapped to reference"
 
     return outputLog,"Long reads successfully mapped to reference"
 
@@ -64,7 +67,7 @@ def shortReadMapping(strainName,R1,R2,reference,outputFolder):
     outputLog+="COMMAND: "+" ".join(["samtools","view","-bT",reference,"-q","1",samFile,"-o",bamFile])+"\n\n"
     if p.stderr!="" or p.stdout !="":
         outputLog+="STDERR:\n\n"+p.stderr+"\n\nSTDOUT:\n\n"+p.stdout+"\n\n"
-    
+
     p=subprocess.run(["samtools", "sort", bamFile, "-o", sortedBamFile],stderr=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True)
     outputLog+="COMMAND: "+" ".join(["samtools", "sort", bamFile, "-o", sortedBamFile])+"\n\n"
     if p.stderr!="" or p.stdout !="":
@@ -125,7 +128,7 @@ def getShortReadPositions(hetVCFName,shortReadPosFileName):
                     pass
                 elif len(alt)<len(ref): #DEL
                     #for curPos in range(int(line[1])+1,int(line[1])+len(ref)):
-                    #	hetVCFPositions.add(chr+":"+str(curPos))
+                    #   hetVCFPositions.add(chr+":"+str(curPos))
                     pass
 
     #Writing the short read het positions to a file
@@ -206,8 +209,8 @@ def assignLongReadToSNPs(samPath,bedPath,referencePath,minQ,minMQ,minAln,outputP
             b=False
             while b==False:
                 if line[0]+"_"+str(i) not in readDict:
-                    readDict[line[0]+"_"+str(i)]=[]
-                    line[0]=line[0]+"_"+str(i)
+                    readDict[line[0]+"_VCSTTP_"+str(i)]=[] #Very Clean Solution To This Problem
+                    line[0]=line[0]+"_VCSTTP_"+str(i)
                     b=True
                 i+=1
 
@@ -360,7 +363,7 @@ def longReadValidation(longReadFilePath,minCov,minRatio,minTrioCov,validatedSNPA
     #vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv#
     ###########################################################################################
 
-    #Only keeping long reads that have unique combinations of SNPs 
+    #Only keeping long reads that have unique combinations of SNPs
     uniqueLongReadSets=set()
     uniqueLongReadIDs=set()
 
@@ -412,7 +415,7 @@ def generateLongReadFastQFiles(haplotigReadNameFilePath,longReadFastQFilePath,ou
     haplotigReadNameFile.close()
 
     gzipBool=True
-    
+
     longReadData={}
 
     with gzip.open(longReadFastQFilePath, 'r') as fh:
@@ -444,13 +447,13 @@ def generateLongReadFastQFiles(haplotigReadNameFilePath,longReadFastQFilePath,ou
     return "Successfully generated phased FastQ files"
 
 def loadFile(filePath):
-	openFile=open(filePath,"r")
-	fileContents=[]
-	for line in openFile:
-		line=line.strip("\n").split("\t")
-		fileContents.append(line)
-	openFile.close()
-	return fileContents
+        openFile=open(filePath,"r")
+        fileContents=[]
+        for line in openFile:
+                line=line.strip("\n").split("\t")
+                fileContents.append(line)
+        openFile.close()
+        return fileContents
 
 def simplifyDataVis(dataVisPath,simpleOutPath,distance):
     fileContents=loadFile(dataVisPath)
@@ -493,7 +496,7 @@ def generateDataVis(dataVisPath,outPath):
     tbl.columns = ["contigName", "startPos", "endPos", "chr", "yValue"]
 
     #g=(ggplot(tbl,aes(y=tbl["yValue"],yend=tbl["yValue"],x=tbl["startPos"],xend=tbl["endPos"],color='factor(yValue)'))+geom_segment(size=1.5)+theme(legend_position="none")+theme(panel_grid_minor=element_blank())+facet_wrap("~chr",scales = "free")+theme(axis_title_y=element_blank(),axis_text_y=element_blank(),axis_ticks_major_y=element_blank())+xlab("Distance from start of genome (bp)")+theme(subplots_adjust={'hspace': 0.7}))
-    
+
     g=(ggplot(tbl,aes(y=tbl["yValue"],yend=tbl["yValue"],x=tbl["startPos"],xend=tbl["endPos"],color='factor(yValue)'))+geom_segment(size=1.5)+theme(legend_position="none")+theme(panel_grid_minor=element_blank())+theme(axis_title_y=element_blank(),axis_text_y=element_blank(),axis_ticks_major_y=element_blank())+xlab("Distance from start of genome (bp)")+theme(subplots_adjust={'hspace': 0.7}))
 
     ggsave(g,filename=outputSVG,width=18, height=10)

diff --git a/build/lib/bin/nPhase.py b/build/lib/bin/nPhase.py
@@ -348,7 +348,7 @@ def keepUsefulSplitReadsChr(usefulSplitReadProfiles,chimericReadIDict,mergedClus
                 chrmSNPs[position.split(":")[0]][position]=info
             i=0
             for chrm, posInfo in chrmSNPs.items():
-                usefulSplitReads[splitRead+"_"+str(i)]=posInfo
+                usefulSplitReads[splitRead+"_VCSTTP_"+str(i)]=posInfo
                 i+=1
     return usefulSplitReads
 
@@ -459,17 +459,17 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref
     chimericNames=set()
 
     for readName, sequence in SNPAssignments.items():
-        if len(readName[len(strainName)+1:].split("_"))>1:
-            chimericNames.add("_".join(readName.split("_")[:-1]))
+        if len(readName.split("_VCSTTP_"))>1:
+            chimericNames.add(readName.split("_VCSTTP_")[0])
 
     print("Split reads identified.")
 
     #Applying the context depth info to all of the reads
     chimericReadIDict={}
     for readName, sequence in SNPAssignments.items():
-        if len(readName[len(strainName)+1:].split("_"))>1 or readName in chimericNames:
-            if len(readName[len(strainName)+1:].split("_"))>1:
-                readName="_".join(readName.split("_")[:-1])
+        if len(readName.split("_VCSTTP_"))>1 or readName in chimericNames:
+            if len(readName.split("_VCSTTP_"))>1:
+                readName="_VCSTTP_".join(readName.split("_VCSTTP_")[:-1])
             allBaseDict={}
             if len(sequence)>=10:
                 sequence=set(sequence)
@@ -528,6 +528,8 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref
 
     newChimericDict=keepUsefulSplitReadsChr(usefulSplitReadProfiles,chimericReadIDict,mergedClusterEdges)
 
+    #print(newChimericDict["chimeric-CCN_ACA_BMB_0"])
+
     cleanAllReadIDict={}
     cleanAllReadIDict.update(readIDict)
     cleanAllReadIDict.update(newChimericDict)
@@ -545,14 +547,16 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref
 
     #Create sets of fastQ read names, to be turned into FASTQ files by another script
 
-    st=len(strainName)
-
     clusterReadText=""
 
     for clusterName, clusterInfo in cleanFullClusters.items():
         for readName in clusterInfo["names"]:
+            x=readName
             readName=readName.replace("chimeric-","")
-            readName=readName[:st+1]+readName[st+1:].split("_")[0]
+            readPart=readName.split("_VCSTTP_")
+            if len(readPart)>1:
+                readPart=readPart[:-1]
+            readName="_VCSTTP_".join(readPart)
             clusterReadText+="\t".join([clusterName,readName])+"\n"
 
     clusterReadFile=open(outFolder+"/"+strainName+"_"+str(minOvl)+"_"+str(minSim)+"_"+str(maxID)+"_"+str(minLen)+"_clusterReadNames.tsv","w")
@@ -577,5 +581,3 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref
     return "Phasing over"
 
 
-
-
diff --git a/build/lib/bin/nPhasePipelineFunctions.py b/build/lib/bin/nPhasePipelineFunctions.py
@@ -37,10 +37,13 @@ def longReadMapping(strainName,longReads,reference,outputFolder,flag,longReadPla
     if p.stderr!="" or p.stdout !="":
         outputLog+="STDERR:\n\n"+p.stderr+"\n\nSTDOUT:\n\n"+p.stdout+"\n\n"
 
-    #Cleaning up
-    os.remove(samFile)
-    os.remove(passSamFile)
-    os.remove(sortedHeaderSamFile)
+    try:
+        #Cleaning up
+        os.remove(samFile)
+        os.remove(passSamFile)
+        os.remove(sortedHeaderSamFile)
+    except:
+        return outputLog,"Long reads not mapped to reference"
 
     return outputLog,"Long reads successfully mapped to reference"
 
@@ -64,7 +67,7 @@ def shortReadMapping(strainName,R1,R2,reference,outputFolder):
     outputLog+="COMMAND: "+" ".join(["samtools","view","-bT",reference,"-q","1",samFile,"-o",bamFile])+"\n\n"
     if p.stderr!="" or p.stdout !="":
         outputLog+="STDERR:\n\n"+p.stderr+"\n\nSTDOUT:\n\n"+p.stdout+"\n\n"
-    
+
     p=subprocess.run(["samtools", "sort", bamFile, "-o", sortedBamFile],stderr=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True)
     outputLog+="COMMAND: "+" ".join(["samtools", "sort", bamFile, "-o", sortedBamFile])+"\n\n"
     if p.stderr!="" or p.stdout !="":
@@ -125,7 +128,7 @@ def getShortReadPositions(hetVCFName,shortReadPosFileName):
                     pass
                 elif len(alt)<len(ref): #DEL
                     #for curPos in range(int(line[1])+1,int(line[1])+len(ref)):
-                    #	hetVCFPositions.add(chr+":"+str(curPos))
+                    #   hetVCFPositions.add(chr+":"+str(curPos))
                     pass
 
     #Writing the short read het positions to a file
@@ -206,8 +209,8 @@ def assignLongReadToSNPs(samPath,bedPath,referencePath,minQ,minMQ,minAln,outputP
             b=False
             while b==False:
                 if line[0]+"_"+str(i) not in readDict:
-                    readDict[line[0]+"_"+str(i)]=[]
-                    line[0]=line[0]+"_"+str(i)
+                    readDict[line[0]+"_VCSTTP_"+str(i)]=[] #Very Clean Solution To This Problem
+                    line[0]=line[0]+"_VCSTTP_"+str(i)
                     b=True
                 i+=1
 
@@ -360,7 +363,7 @@ def longReadValidation(longReadFilePath,minCov,minRatio,minTrioCov,validatedSNPA
     #vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv#
     ###########################################################################################
 
-    #Only keeping long reads that have unique combinations of SNPs 
+    #Only keeping long reads that have unique combinations of SNPs
     uniqueLongReadSets=set()
     uniqueLongReadIDs=set()
 
@@ -412,7 +415,7 @@ def generateLongReadFastQFiles(haplotigReadNameFilePath,longReadFastQFilePath,ou
     haplotigReadNameFile.close()
 
     gzipBool=True
-    
+
     longReadData={}
 
     with gzip.open(longReadFastQFilePath, 'r') as fh:
@@ -444,13 +447,13 @@ def generateLongReadFastQFiles(haplotigReadNameFilePath,longReadFastQFilePath,ou
     return "Successfully generated phased FastQ files"
 
 def loadFile(filePath):
-	openFile=open(filePath,"r")
-	fileContents=[]
-	for line in openFile:
-		line=line.strip("\n").split("\t")
-		fileContents.append(line)
-	openFile.close()
-	return fileContents
+        openFile=open(filePath,"r")
+        fileContents=[]
+        for line in openFile:
+                line=line.strip("\n").split("\t")
+                fileContents.append(line)
+        openFile.close()
+        return fileContents
 
 def simplifyDataVis(dataVisPath,simpleOutPath,distance):
     fileContents=loadFile(dataVisPath)
@@ -493,7 +496,7 @@ def generateDataVis(dataVisPath,outPath):
     tbl.columns = ["contigName", "startPos", "endPos", "chr", "yValue"]
 
     #g=(ggplot(tbl,aes(y=tbl["yValue"],yend=tbl["yValue"],x=tbl["startPos"],xend=tbl["endPos"],color='factor(yValue)'))+geom_segment(size=1.5)+theme(legend_position="none")+theme(panel_grid_minor=element_blank())+facet_wrap("~chr",scales = "free")+theme(axis_title_y=element_blank(),axis_text_y=element_blank(),axis_ticks_major_y=element_blank())+xlab("Distance from start of genome (bp)")+theme(subplots_adjust={'hspace': 0.7}))
-    
+
     g=(ggplot(tbl,aes(y=tbl["yValue"],yend=tbl["yValue"],x=tbl["startPos"],xend=tbl["endPos"],color='factor(yValue)'))+geom_segment(size=1.5)+theme(legend_position="none")+theme(panel_grid_minor=element_blank())+theme(axis_title_y=element_blank(),axis_text_y=element_blank(),axis_ticks_major_y=element_blank())+xlab("Distance from start of genome (bp)")+theme(subplots_adjust={'hspace': 0.7}))
 
     ggsave(g,filename=outputSVG,width=18, height=10)

diff --git a/dist/nPhase-1.0.7-py3-none-any.whl b/dist/nPhase-1.0.7-py3-none-any.whl
diff --git a/dist/nPhase-1.0.7.tar.gz b/dist/nPhase-1.0.7.tar.gz
diff --git a/dist/nPhase-1.0.9-py3-none-any.whl b/dist/nPhase-1.0.9-py3-none-any.whl
diff --git a/dist/nPhase-1.0.9.tar.gz b/dist/nPhase-1.0.9.tar.gz
diff --git a/example/shortReads/SaCe_Chr2_SR_R1.fastq.gz b/example/shortReads/SaCe_Chr2_SR_R1.fastq.gz
diff --git a/example/shortReads/SaCe_Chr2_SR_R2.fastq.gz b/example/shortReads/SaCe_Chr2_SR_R2.fastq.gz
diff --git a/example/shortReads/pythonSolution.py b/example/shortReads/pythonSolution.py
@@ -0,0 +1,27 @@
+import sys, operator
+
+inFile = open(sys.argv[1],'r')
+
+header = ''
+data = ''
+
+allData = []
+
+i=1
+
+for line in inFile:
+    if i%4==1:
+        if data != '':
+            allData.append((header,data))
+        header = line.strip()[1:]
+        data = line
+    else:
+        data += line
+    i+=1
+
+allData.append((header,data))
+allData.sort(key = operator.itemgetter(0))
+
+for item in allData:
+    print (item[1].strip())
+
diff --git a/nphase/meta.yaml b/nphase/meta.yaml
@@ -1,13 +1,13 @@
 {% set name = "nPhase" %}
-{% set version = "1.0.6" %}
+{% set version = "1.0.9" %}
 
 package:
   name: "{{ name|lower }}"
   version: "{{ version }}"
 
 source:
   url: "https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz"
-  sha256: cd11c1743b25e96cba92bfd98866b46a7da8ebc27450d0ab384b4f9e4e626c67
+  sha256: b2d8515e23e3538eebe4cdecfaa94638551119689399862164b89834e5d0cc4c
 
 build:
   number: 0
@@ -35,6 +35,7 @@ test:
     - bin
   commands:
     - nphase --help
+
 about:
   home: "https://https://github.com/nPhasePipeline/nPhase"
   license: "GNU General Public v3 (GPLv3)"

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="nPhase",
-    version="1.0.7",
+    version="1.0.9",
     author="Omar Abou Saada",
     author_email="[email protected]",
     description="nPhase is a command line ploidy agnostic phasing pipeline and algorithm which phases samples of any ploidy with sequence alignment of long and short read data to a reference sequence.",