diff --git a/interhost.py b/interhost.py index 0962ad2e3..4a375ed47 100755 --- a/interhost.py +++ b/interhost.py @@ -13,7 +13,7 @@ try: from itertools import zip_longest except ImportError : - from itertools import izip_longest as zip_longest, izip as zip + from itertools import izip_longest as zip_longest import tools.muscle, tools.snpeff, tools.mafft import util.cmd, util.file, util.vcf from collections import OrderedDict, Sequence @@ -44,7 +44,7 @@ def __init__(self, fastaA, fastaB, alignerTool = tools.muscle.MuscleTool) : self.BtoA = OrderedDict() # {chrB : [chrA, mapperAB], chrD : [chrC, mapperCD], ...} self._align(fastaA, fastaB, alignerTool()) - + def mapAtoB(self, fromChrom, fromPos = None, side = 0) : """ Map (chrom, coordinate) from genome A to genome B. If fromPos is None, map only the chromosome name @@ -78,33 +78,35 @@ def mapBtoA(self, fromChrom, fromPos = None, side = 0) : return (toChrom, toPos) def _align(self, fastaA, fastaB, aligner) : - alignInFileName = util.file.mkstempfname('.fasta') - alignOutFileName = util.file.mkstempfname('.fasta') - with util.file.open_or_gzopen(fastaA) as infA : - with util.file.open_or_gzopen(fastaB) as infB : - numSeqs = 0 - for recA, recB in zip_longest(SeqIO.parse(infA, 'fasta'), - SeqIO.parse(infB, 'fasta')) : - assert (recA != None and recB != None), 'CoordMapper '\ - 'input files must have same number of sequences.' - numSeqs += 1 - chrA = recA.id - chrB = recB.id - with open(alignInFileName, 'wt') as alignInFile : - SeqIO.write(recA, alignInFile, 'fasta') - SeqIO.write(recB, alignInFile, 'fasta') - aligner.execute(alignInFileName, alignOutFileName) - with open(alignOutFileName) as alignOutFile : - seqParser = SeqIO.parse(alignOutFile, 'fasta') - seqs = [seqRec.seq for seqRec in seqParser] - mapper = CoordMapper2Seqs(seqs[0], seqs[1]) - self.AtoB[chrA] = [chrB, mapper] - self.BtoA[chrB] = [chrA, mapper] - assert numSeqs > 0, 'CoordMapper: no input sequences.' - assert len(self.AtoB) == len(self.BtoA) == numSeqs, \ - 'CoordMapper: duplicate sequence name.' - os.unlink(alignInFileName) - os.unlink(alignOutFileName) + # transpose + per_chr_fastas = transposeChromosomeFiles([fastaA, fastaB]) + if not per_chr_fastas: + raise Exception('no input sequences') + # align + alignOutFileNames = [] + for alignInFileName in per_chr_fastas: + alignOutFileName = util.file.mkstempfname('.fasta') + aligner.execute(alignInFileName, alignOutFileName) + alignOutFileNames.append(alignOutFileName) + os.unlink(alignInFileName) + # read in + self._load_alignments(alignOutFileNames) + # clean up + for f in alignOutFileNames: + os.unlink(f) + + def _load_alignments(self, aligned_files, a_idx=0, b_idx=1) : + assert a_idx>=0 and b_idx>=0 + for alignOutFileName in aligned_files: + with open(alignOutFileName, 'rt') as alignOutFile : + seqs = list(SeqIO.parse(alignOutFile, 'fasta')) + assert a_idx