Skip to content

Commit

Permalink
Merge pull request #125 from broadinstitute/dp-cmapper
Browse files Browse the repository at this point in the history
CoordMapper tweaks
  • Loading branch information
dpark01 committed Apr 13, 2015
2 parents 2da9d08 + a44687a commit 423caca
Showing 1 changed file with 55 additions and 44 deletions.
99 changes: 55 additions & 44 deletions interhost.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
try:
from itertools import zip_longest
except ImportError :
from itertools import izip_longest as zip_longest, izip as zip
from itertools import izip_longest as zip_longest
import tools.muscle, tools.snpeff, tools.mafft
import util.cmd, util.file, util.vcf
from collections import OrderedDict, Sequence
Expand Down Expand Up @@ -44,7 +44,7 @@ def __init__(self, fastaA, fastaB, alignerTool = tools.muscle.MuscleTool) :
self.BtoA = OrderedDict() # {chrB : [chrA, mapperAB], chrD : [chrC, mapperCD], ...}

self._align(fastaA, fastaB, alignerTool())

def mapAtoB(self, fromChrom, fromPos = None, side = 0) :
""" Map (chrom, coordinate) from genome A to genome B.
If fromPos is None, map only the chromosome name
Expand Down Expand Up @@ -78,33 +78,35 @@ def mapBtoA(self, fromChrom, fromPos = None, side = 0) :
return (toChrom, toPos)

def _align(self, fastaA, fastaB, aligner) :
alignInFileName = util.file.mkstempfname('.fasta')
alignOutFileName = util.file.mkstempfname('.fasta')
with util.file.open_or_gzopen(fastaA) as infA :
with util.file.open_or_gzopen(fastaB) as infB :
numSeqs = 0
for recA, recB in zip_longest(SeqIO.parse(infA, 'fasta'),
SeqIO.parse(infB, 'fasta')) :
assert (recA != None and recB != None), 'CoordMapper '\
'input files must have same number of sequences.'
numSeqs += 1
chrA = recA.id
chrB = recB.id
with open(alignInFileName, 'wt') as alignInFile :
SeqIO.write(recA, alignInFile, 'fasta')
SeqIO.write(recB, alignInFile, 'fasta')
aligner.execute(alignInFileName, alignOutFileName)
with open(alignOutFileName) as alignOutFile :
seqParser = SeqIO.parse(alignOutFile, 'fasta')
seqs = [seqRec.seq for seqRec in seqParser]
mapper = CoordMapper2Seqs(seqs[0], seqs[1])
self.AtoB[chrA] = [chrB, mapper]
self.BtoA[chrB] = [chrA, mapper]
assert numSeqs > 0, 'CoordMapper: no input sequences.'
assert len(self.AtoB) == len(self.BtoA) == numSeqs, \
'CoordMapper: duplicate sequence name.'
os.unlink(alignInFileName)
os.unlink(alignOutFileName)
# transpose
per_chr_fastas = transposeChromosomeFiles([fastaA, fastaB])
if not per_chr_fastas:
raise Exception('no input sequences')
# align
alignOutFileNames = []
for alignInFileName in per_chr_fastas:
alignOutFileName = util.file.mkstempfname('.fasta')
aligner.execute(alignInFileName, alignOutFileName)
alignOutFileNames.append(alignOutFileName)
os.unlink(alignInFileName)
# read in
self._load_alignments(alignOutFileNames)
# clean up
for f in alignOutFileNames:
os.unlink(f)

def _load_alignments(self, aligned_files, a_idx=0, b_idx=1) :
assert a_idx>=0 and b_idx>=0
for alignOutFileName in aligned_files:
with open(alignOutFileName, 'rt') as alignOutFile :
seqs = list(SeqIO.parse(alignOutFile, 'fasta'))
assert a_idx<len(seqs) and b_idx<len(seqs)
mapper = CoordMapper2Seqs(seqs[a_idx].seq, seqs[b_idx].seq)
self.AtoB[seqs[a_idx].id] = [seqs[b_idx].id, mapper]
self.BtoA[seqs[b_idx].id] = [seqs[a_idx].id, mapper]
assert len(self.AtoB) == len(self.BtoA) == len(aligned_files), \
'duplicate sequence names'


class CoordMapper2Seqs(object) :
""" Map 1-based coordinates between two aligned sequences.
Expand Down Expand Up @@ -315,7 +317,7 @@ def multichr_mafft(args):
# passes input files in this way, and the MAFFT tool expects lists,
# but in this case we are creating the input file ourselves
tools.mafft.MafftTool().execute(
inFastas = [filePath],
inFastas = [os.path.abspath(filePath)],
outFile = absoluteOutDirectory + "/{}_{}.fasta".format(prefix, idx),
localpair = args.localpair,
globalpair = args.globalpair,
Expand Down Expand Up @@ -374,26 +376,35 @@ def make_vcf(a, ref_idx, chrom):
vars.append(m+1)
yield row+vars

def transposeChromosomeFiles(inputFilenamesList, outputDirectory, outputFilePrefix, inputFormat="fasta", outputFormat="fasta"):
def transposeChromosomeFiles(inputFilenamesList):
''' Input: a list of FASTA files representing a genome for each sample.
Each file contains the same number of sequences (chromosomes, segments,
etc) in the same order.
Output: a list of FASTA files representing all samples for each
chromosome/segment for input to a multiple sequence aligner.
The number of FASTA files corresponds to the number of chromosomes
in the genome. Each file contains the same number of samples
in the same order. Each output file is a tempfile.
'''
outputFilenames = []

# open files
inputFilesList = [open(x, 'rU') for x in inputFilenamesList]
# open all files
inputFilesList = [util.file.open_or_gzopen(x, 'rU') for x in inputFilenamesList]
# get BioPython iterators for each of the FASTA files specified in the input
fastaFiles = [SeqIO.parse(x, inputFormat) for x in inputFilesList]
fastaFiles = [SeqIO.parse(x, 'fasta') for x in inputFilesList]

# for each interleaved record
for idx, chrRecordList in enumerate( zip(*fastaFiles) ):
outputFilename = util.file.mkstempfname('__transposed_{}_'.format(chrRecordList[0]))

fileObj = open(outputFilename, "w")
for record in chrRecordList:
# write the corresonding record to a new FASTA file
countWritten = SeqIO.write(record, fileObj, outputFormat)
fileObj.close()
outputFilenames.append(os.path.abspath(outputFilename))
for idx, chrRecordList in enumerate( zip_longest(*fastaFiles) ):
if any(rec==None for rec in chrRecordList):
raise Exception("input files must all have the same number of sequences")

outputFilename = util.file.mkstempfname('.fasta')
outputFilenames.append(outputFilename)
with open(outputFilename, "w") as outf:
# write the corresonding records to a new FASTA file
countWritten = SeqIO.write(chrRecordList, outf, 'fasta')

#close input files
# close all input files
[x.close() for x in inputFilesList]

return outputFilenames
Expand Down

0 comments on commit 423caca

Please sign in to comment.