Skip to content

Commit

Permalink
fastQ file bug fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
OmarOakheart committed Jul 30, 2020
1 parent 38876c5 commit f74e10f
Show file tree
Hide file tree
Showing 16 changed files with 100 additions and 61 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ example/reference/SaCe_Chr2.fasta.bwt
example/reference/SaCe_Chr2.fasta.ann
example/reference/SaCe_Chr2.fasta.amb
example/reference/SaCe_Chr2.fasta-ht-13-2.2.ngm
Thumbs.db
Binary file not shown.
Binary file added bin/__pycache__/nPhase_LOCAL.cpython-38.pyc
Binary file not shown.
24 changes: 13 additions & 11 deletions bin/nPhase.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def keepUsefulSplitReadsChr(usefulSplitReadProfiles,chimericReadIDict,mergedClus
chrmSNPs[position.split(":")[0]][position]=info
i=0
for chrm, posInfo in chrmSNPs.items():
usefulSplitReads[splitRead+"_"+str(i)]=posInfo
usefulSplitReads[splitRead+"_VCSTTP_"+str(i)]=posInfo
i+=1
return usefulSplitReads

Expand Down Expand Up @@ -459,17 +459,17 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref
chimericNames=set()

for readName, sequence in SNPAssignments.items():
if len(readName[len(strainName)+1:].split("_"))>1:
chimericNames.add("_".join(readName.split("_")[:-1]))
if len(readName.split("_VCSTTP_"))>1:
chimericNames.add(readName.split("_VCSTTP_")[0])

print("Split reads identified.")

#Applying the context depth info to all of the reads
chimericReadIDict={}
for readName, sequence in SNPAssignments.items():
if len(readName[len(strainName)+1:].split("_"))>1 or readName in chimericNames:
if len(readName[len(strainName)+1:].split("_"))>1:
readName="_".join(readName.split("_")[:-1])
if len(readName.split("_VCSTTP_"))>1 or readName in chimericNames:
if len(readName.split("_VCSTTP_"))>1:
readName="_VCSTTP_".join(readName.split("_VCSTTP_")[:-1])
allBaseDict={}
if len(sequence)>=10:
sequence=set(sequence)
Expand Down Expand Up @@ -528,6 +528,8 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref

newChimericDict=keepUsefulSplitReadsChr(usefulSplitReadProfiles,chimericReadIDict,mergedClusterEdges)

#print(newChimericDict["chimeric-CCN_ACA_BMB_0"])

cleanAllReadIDict={}
cleanAllReadIDict.update(readIDict)
cleanAllReadIDict.update(newChimericDict)
Expand All @@ -545,14 +547,16 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref

#Create sets of fastQ read names, to be turned into FASTQ files by another script

st=len(strainName)

clusterReadText=""

for clusterName, clusterInfo in cleanFullClusters.items():
for readName in clusterInfo["names"]:
x=readName
readName=readName.replace("chimeric-","")
readName=readName[:st+1]+readName[st+1:].split("_")[0]
readPart=readName.split("_VCSTTP_")
if len(readPart)>1:
readPart=readPart[:-1]
readName="_VCSTTP_".join(readPart)
clusterReadText+="\t".join([clusterName,readName])+"\n"

clusterReadFile=open(outFolder+"/"+strainName+"_"+str(minOvl)+"_"+str(minSim)+"_"+str(maxID)+"_"+str(minLen)+"_clusterReadNames.tsv","w")
Expand All @@ -577,5 +581,3 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref
return "Phasing over"




39 changes: 21 additions & 18 deletions bin/nPhasePipelineFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,13 @@ def longReadMapping(strainName,longReads,reference,outputFolder,flag,longReadPla
if p.stderr!="" or p.stdout !="":
outputLog+="STDERR:\n\n"+p.stderr+"\n\nSTDOUT:\n\n"+p.stdout+"\n\n"

#Cleaning up
os.remove(samFile)
os.remove(passSamFile)
os.remove(sortedHeaderSamFile)
try:
#Cleaning up
os.remove(samFile)
os.remove(passSamFile)
os.remove(sortedHeaderSamFile)
except:
return outputLog,"Long reads not mapped to reference"

return outputLog,"Long reads successfully mapped to reference"

Expand All @@ -64,7 +67,7 @@ def shortReadMapping(strainName,R1,R2,reference,outputFolder):
outputLog+="COMMAND: "+" ".join(["samtools","view","-bT",reference,"-q","1",samFile,"-o",bamFile])+"\n\n"
if p.stderr!="" or p.stdout !="":
outputLog+="STDERR:\n\n"+p.stderr+"\n\nSTDOUT:\n\n"+p.stdout+"\n\n"

p=subprocess.run(["samtools", "sort", bamFile, "-o", sortedBamFile],stderr=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True)
outputLog+="COMMAND: "+" ".join(["samtools", "sort", bamFile, "-o", sortedBamFile])+"\n\n"
if p.stderr!="" or p.stdout !="":
Expand Down Expand Up @@ -125,7 +128,7 @@ def getShortReadPositions(hetVCFName,shortReadPosFileName):
pass
elif len(alt)<len(ref): #DEL
#for curPos in range(int(line[1])+1,int(line[1])+len(ref)):
# hetVCFPositions.add(chr+":"+str(curPos))
# hetVCFPositions.add(chr+":"+str(curPos))
pass

#Writing the short read het positions to a file
Expand Down Expand Up @@ -206,8 +209,8 @@ def assignLongReadToSNPs(samPath,bedPath,referencePath,minQ,minMQ,minAln,outputP
b=False
while b==False:
if line[0]+"_"+str(i) not in readDict:
readDict[line[0]+"_"+str(i)]=[]
line[0]=line[0]+"_"+str(i)
readDict[line[0]+"_VCSTTP_"+str(i)]=[] #Very Clean Solution To This Problem
line[0]=line[0]+"_VCSTTP_"+str(i)
b=True
i+=1

Expand Down Expand Up @@ -360,7 +363,7 @@ def longReadValidation(longReadFilePath,minCov,minRatio,minTrioCov,validatedSNPA
#vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv#
###########################################################################################

#Only keeping long reads that have unique combinations of SNPs
#Only keeping long reads that have unique combinations of SNPs
uniqueLongReadSets=set()
uniqueLongReadIDs=set()

Expand Down Expand Up @@ -412,7 +415,7 @@ def generateLongReadFastQFiles(haplotigReadNameFilePath,longReadFastQFilePath,ou
haplotigReadNameFile.close()

gzipBool=True

longReadData={}

with gzip.open(longReadFastQFilePath, 'r') as fh:
Expand Down Expand Up @@ -444,13 +447,13 @@ def generateLongReadFastQFiles(haplotigReadNameFilePath,longReadFastQFilePath,ou
return "Successfully generated phased FastQ files"

def loadFile(filePath):
openFile=open(filePath,"r")
fileContents=[]
for line in openFile:
line=line.strip("\n").split("\t")
fileContents.append(line)
openFile.close()
return fileContents
openFile=open(filePath,"r")
fileContents=[]
for line in openFile:
line=line.strip("\n").split("\t")
fileContents.append(line)
openFile.close()
return fileContents

def simplifyDataVis(dataVisPath,simpleOutPath,distance):
fileContents=loadFile(dataVisPath)
Expand Down Expand Up @@ -493,7 +496,7 @@ def generateDataVis(dataVisPath,outPath):
tbl.columns = ["contigName", "startPos", "endPos", "chr", "yValue"]

#g=(ggplot(tbl,aes(y=tbl["yValue"],yend=tbl["yValue"],x=tbl["startPos"],xend=tbl["endPos"],color='factor(yValue)'))+geom_segment(size=1.5)+theme(legend_position="none")+theme(panel_grid_minor=element_blank())+facet_wrap("~chr",scales = "free")+theme(axis_title_y=element_blank(),axis_text_y=element_blank(),axis_ticks_major_y=element_blank())+xlab("Distance from start of genome (bp)")+theme(subplots_adjust={'hspace': 0.7}))

g=(ggplot(tbl,aes(y=tbl["yValue"],yend=tbl["yValue"],x=tbl["startPos"],xend=tbl["endPos"],color='factor(yValue)'))+geom_segment(size=1.5)+theme(legend_position="none")+theme(panel_grid_minor=element_blank())+theme(axis_title_y=element_blank(),axis_text_y=element_blank(),axis_ticks_major_y=element_blank())+xlab("Distance from start of genome (bp)")+theme(subplots_adjust={'hspace': 0.7}))

ggsave(g,filename=outputSVG,width=18, height=10)
Expand Down
24 changes: 13 additions & 11 deletions build/lib/bin/nPhase.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def keepUsefulSplitReadsChr(usefulSplitReadProfiles,chimericReadIDict,mergedClus
chrmSNPs[position.split(":")[0]][position]=info
i=0
for chrm, posInfo in chrmSNPs.items():
usefulSplitReads[splitRead+"_"+str(i)]=posInfo
usefulSplitReads[splitRead+"_VCSTTP_"+str(i)]=posInfo
i+=1
return usefulSplitReads

Expand Down Expand Up @@ -459,17 +459,17 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref
chimericNames=set()

for readName, sequence in SNPAssignments.items():
if len(readName[len(strainName)+1:].split("_"))>1:
chimericNames.add("_".join(readName.split("_")[:-1]))
if len(readName.split("_VCSTTP_"))>1:
chimericNames.add(readName.split("_VCSTTP_")[0])

print("Split reads identified.")

#Applying the context depth info to all of the reads
chimericReadIDict={}
for readName, sequence in SNPAssignments.items():
if len(readName[len(strainName)+1:].split("_"))>1 or readName in chimericNames:
if len(readName[len(strainName)+1:].split("_"))>1:
readName="_".join(readName.split("_")[:-1])
if len(readName.split("_VCSTTP_"))>1 or readName in chimericNames:
if len(readName.split("_VCSTTP_"))>1:
readName="_VCSTTP_".join(readName.split("_VCSTTP_")[:-1])
allBaseDict={}
if len(sequence)>=10:
sequence=set(sequence)
Expand Down Expand Up @@ -528,6 +528,8 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref

newChimericDict=keepUsefulSplitReadsChr(usefulSplitReadProfiles,chimericReadIDict,mergedClusterEdges)

#print(newChimericDict["chimeric-CCN_ACA_BMB_0"])

cleanAllReadIDict={}
cleanAllReadIDict.update(readIDict)
cleanAllReadIDict.update(newChimericDict)
Expand All @@ -545,14 +547,16 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref

#Create sets of fastQ read names, to be turned into FASTQ files by another script

st=len(strainName)

clusterReadText=""

for clusterName, clusterInfo in cleanFullClusters.items():
for readName in clusterInfo["names"]:
x=readName
readName=readName.replace("chimeric-","")
readName=readName[:st+1]+readName[st+1:].split("_")[0]
readPart=readName.split("_VCSTTP_")
if len(readPart)>1:
readPart=readPart[:-1]
readName="_VCSTTP_".join(readPart)
clusterReadText+="\t".join([clusterName,readName])+"\n"

clusterReadFile=open(outFolder+"/"+strainName+"_"+str(minOvl)+"_"+str(minSim)+"_"+str(maxID)+"_"+str(minLen)+"_clusterReadNames.tsv","w")
Expand All @@ -577,5 +581,3 @@ def nPhase(longReadSNPAssignments,strainName,contextDepthsFilePath,outFolder,ref
return "Phasing over"




39 changes: 21 additions & 18 deletions build/lib/bin/nPhasePipelineFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,13 @@ def longReadMapping(strainName,longReads,reference,outputFolder,flag,longReadPla
if p.stderr!="" or p.stdout !="":
outputLog+="STDERR:\n\n"+p.stderr+"\n\nSTDOUT:\n\n"+p.stdout+"\n\n"

#Cleaning up
os.remove(samFile)
os.remove(passSamFile)
os.remove(sortedHeaderSamFile)
try:
#Cleaning up
os.remove(samFile)
os.remove(passSamFile)
os.remove(sortedHeaderSamFile)
except:
return outputLog,"Long reads not mapped to reference"

return outputLog,"Long reads successfully mapped to reference"

Expand All @@ -64,7 +67,7 @@ def shortReadMapping(strainName,R1,R2,reference,outputFolder):
outputLog+="COMMAND: "+" ".join(["samtools","view","-bT",reference,"-q","1",samFile,"-o",bamFile])+"\n\n"
if p.stderr!="" or p.stdout !="":
outputLog+="STDERR:\n\n"+p.stderr+"\n\nSTDOUT:\n\n"+p.stdout+"\n\n"

p=subprocess.run(["samtools", "sort", bamFile, "-o", sortedBamFile],stderr=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True)
outputLog+="COMMAND: "+" ".join(["samtools", "sort", bamFile, "-o", sortedBamFile])+"\n\n"
if p.stderr!="" or p.stdout !="":
Expand Down Expand Up @@ -125,7 +128,7 @@ def getShortReadPositions(hetVCFName,shortReadPosFileName):
pass
elif len(alt)<len(ref): #DEL
#for curPos in range(int(line[1])+1,int(line[1])+len(ref)):
# hetVCFPositions.add(chr+":"+str(curPos))
# hetVCFPositions.add(chr+":"+str(curPos))
pass

#Writing the short read het positions to a file
Expand Down Expand Up @@ -206,8 +209,8 @@ def assignLongReadToSNPs(samPath,bedPath,referencePath,minQ,minMQ,minAln,outputP
b=False
while b==False:
if line[0]+"_"+str(i) not in readDict:
readDict[line[0]+"_"+str(i)]=[]
line[0]=line[0]+"_"+str(i)
readDict[line[0]+"_VCSTTP_"+str(i)]=[] #Very Clean Solution To This Problem
line[0]=line[0]+"_VCSTTP_"+str(i)
b=True
i+=1

Expand Down Expand Up @@ -360,7 +363,7 @@ def longReadValidation(longReadFilePath,minCov,minRatio,minTrioCov,validatedSNPA
#vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv#
###########################################################################################

#Only keeping long reads that have unique combinations of SNPs
#Only keeping long reads that have unique combinations of SNPs
uniqueLongReadSets=set()
uniqueLongReadIDs=set()

Expand Down Expand Up @@ -412,7 +415,7 @@ def generateLongReadFastQFiles(haplotigReadNameFilePath,longReadFastQFilePath,ou
haplotigReadNameFile.close()

gzipBool=True

longReadData={}

with gzip.open(longReadFastQFilePath, 'r') as fh:
Expand Down Expand Up @@ -444,13 +447,13 @@ def generateLongReadFastQFiles(haplotigReadNameFilePath,longReadFastQFilePath,ou
return "Successfully generated phased FastQ files"

def loadFile(filePath):
openFile=open(filePath,"r")
fileContents=[]
for line in openFile:
line=line.strip("\n").split("\t")
fileContents.append(line)
openFile.close()
return fileContents
openFile=open(filePath,"r")
fileContents=[]
for line in openFile:
line=line.strip("\n").split("\t")
fileContents.append(line)
openFile.close()
return fileContents

def simplifyDataVis(dataVisPath,simpleOutPath,distance):
fileContents=loadFile(dataVisPath)
Expand Down Expand Up @@ -493,7 +496,7 @@ def generateDataVis(dataVisPath,outPath):
tbl.columns = ["contigName", "startPos", "endPos", "chr", "yValue"]

#g=(ggplot(tbl,aes(y=tbl["yValue"],yend=tbl["yValue"],x=tbl["startPos"],xend=tbl["endPos"],color='factor(yValue)'))+geom_segment(size=1.5)+theme(legend_position="none")+theme(panel_grid_minor=element_blank())+facet_wrap("~chr",scales = "free")+theme(axis_title_y=element_blank(),axis_text_y=element_blank(),axis_ticks_major_y=element_blank())+xlab("Distance from start of genome (bp)")+theme(subplots_adjust={'hspace': 0.7}))

g=(ggplot(tbl,aes(y=tbl["yValue"],yend=tbl["yValue"],x=tbl["startPos"],xend=tbl["endPos"],color='factor(yValue)'))+geom_segment(size=1.5)+theme(legend_position="none")+theme(panel_grid_minor=element_blank())+theme(axis_title_y=element_blank(),axis_text_y=element_blank(),axis_ticks_major_y=element_blank())+xlab("Distance from start of genome (bp)")+theme(subplots_adjust={'hspace': 0.7}))

ggsave(g,filename=outputSVG,width=18, height=10)
Expand Down
Binary file removed dist/nPhase-1.0.7-py3-none-any.whl
Binary file not shown.
Binary file removed dist/nPhase-1.0.7.tar.gz
Binary file not shown.
Binary file added dist/nPhase-1.0.9-py3-none-any.whl
Binary file not shown.
Binary file added dist/nPhase-1.0.9.tar.gz
Binary file not shown.
Binary file modified example/shortReads/SaCe_Chr2_SR_R1.fastq.gz
Binary file not shown.
Binary file modified example/shortReads/SaCe_Chr2_SR_R2.fastq.gz
Binary file not shown.
27 changes: 27 additions & 0 deletions example/shortReads/pythonSolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import sys, operator

inFile = open(sys.argv[1],'r')

header = ''
data = ''

allData = []

i=1

for line in inFile:
if i%4==1:
if data != '':
allData.append((header,data))
header = line.strip()[1:]
data = line
else:
data += line
i+=1

allData.append((header,data))
allData.sort(key = operator.itemgetter(0))

for item in allData:
print (item[1].strip())

5 changes: 3 additions & 2 deletions nphase/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
{% set name = "nPhase" %}
{% set version = "1.0.6" %}
{% set version = "1.0.9" %}

package:
name: "{{ name|lower }}"
version: "{{ version }}"

source:
url: "https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz"
sha256: cd11c1743b25e96cba92bfd98866b46a7da8ebc27450d0ab384b4f9e4e626c67
sha256: b2d8515e23e3538eebe4cdecfaa94638551119689399862164b89834e5d0cc4c

build:
number: 0
Expand Down Expand Up @@ -35,6 +35,7 @@ test:
- bin
commands:
- nphase --help

about:
home: "https://https://github.com/nPhasePipeline/nPhase"
license: "GNU General Public v3 (GPLv3)"
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="nPhase",
version="1.0.7",
version="1.0.9",
author="Omar Abou Saada",
author_email="[email protected]",
description="nPhase is a command line ploidy agnostic phasing pipeline and algorithm which phases samples of any ploidy with sequence alignment of long and short read data to a reference sequence.",
Expand Down

0 comments on commit f74e10f

Please sign in to comment.