Merge pull request #125 from broadinstitute/dp-cmapper

CoordMapper tweaks
broadinstitute · Apr 13, 2015 · 423caca · 423caca
2 parents 2da9d08 + a44687a
commit 423caca
Showing 1 changed file with 55 additions and 44 deletions.
diff --git a/interhost.py b/interhost.py
@@ -13,7 +13,7 @@
 try:
     from itertools import zip_longest
 except ImportError :
-    from itertools import izip_longest as zip_longest, izip as zip
+    from itertools import izip_longest as zip_longest
 import tools.muscle, tools.snpeff, tools.mafft
 import util.cmd, util.file, util.vcf
 from collections import OrderedDict, Sequence
@@ -44,7 +44,7 @@ def __init__(self, fastaA, fastaB, alignerTool = tools.muscle.MuscleTool) :
         self.BtoA = OrderedDict() # {chrB : [chrA, mapperAB], chrD : [chrC, mapperCD], ...}
 
         self._align(fastaA, fastaB, alignerTool())
-    
+
     def mapAtoB(self, fromChrom, fromPos = None, side = 0) :
         """ Map (chrom, coordinate) from genome A to genome B.
             If fromPos is None, map only the chromosome name
@@ -78,33 +78,35 @@ def mapBtoA(self, fromChrom, fromPos = None, side = 0) :
         return (toChrom, toPos)
 
     def _align(self, fastaA, fastaB, aligner) :
-        alignInFileName = util.file.mkstempfname('.fasta')
-        alignOutFileName = util.file.mkstempfname('.fasta')
-        with util.file.open_or_gzopen(fastaA) as infA :
-            with util.file.open_or_gzopen(fastaB) as infB :
-                numSeqs = 0
-                for recA, recB in zip_longest(SeqIO.parse(infA, 'fasta'),
-                                               SeqIO.parse(infB, 'fasta')) :
-                    assert (recA != None and recB != None), 'CoordMapper '\
-                            'input files must have same number of sequences.'
-                    numSeqs += 1
-                    chrA = recA.id
-                    chrB = recB.id
-                    with open(alignInFileName, 'wt') as alignInFile :
-                        SeqIO.write(recA, alignInFile, 'fasta')
-                        SeqIO.write(recB, alignInFile, 'fasta')
-                    aligner.execute(alignInFileName, alignOutFileName)
-                    with open(alignOutFileName) as alignOutFile :
-                        seqParser = SeqIO.parse(alignOutFile, 'fasta')
-                        seqs = [seqRec.seq for seqRec in seqParser]
-                        mapper = CoordMapper2Seqs(seqs[0], seqs[1])
-                        self.AtoB[chrA] = [chrB, mapper]
-                        self.BtoA[chrB] = [chrA, mapper]
-        assert numSeqs > 0, 'CoordMapper: no input sequences.'
-        assert len(self.AtoB) == len(self.BtoA) == numSeqs, \
-               'CoordMapper: duplicate sequence name.'
-        os.unlink(alignInFileName)
-        os.unlink(alignOutFileName)
+        # transpose
+        per_chr_fastas = transposeChromosomeFiles([fastaA, fastaB])
+        if not per_chr_fastas:
+            raise Exception('no input sequences')
+        # align
+        alignOutFileNames = []
+        for alignInFileName in per_chr_fastas:
+            alignOutFileName = util.file.mkstempfname('.fasta')
+            aligner.execute(alignInFileName, alignOutFileName)
+            alignOutFileNames.append(alignOutFileName)
+            os.unlink(alignInFileName)
+        # read in
+        self._load_alignments(alignOutFileNames)
+        # clean up
+        for f in alignOutFileNames:
+            os.unlink(f)
+
+    def _load_alignments(self, aligned_files, a_idx=0, b_idx=1) :
+        assert a_idx>=0 and b_idx>=0
+        for alignOutFileName in aligned_files:
+            with open(alignOutFileName, 'rt') as alignOutFile :
+                seqs = list(SeqIO.parse(alignOutFile, 'fasta'))
+                assert a_idx<len(seqs) and b_idx<len(seqs)
+                mapper = CoordMapper2Seqs(seqs[a_idx].seq, seqs[b_idx].seq)
+                self.AtoB[seqs[a_idx].id] = [seqs[b_idx].id, mapper]
+                self.BtoA[seqs[b_idx].id] = [seqs[a_idx].id, mapper]
+        assert len(self.AtoB) == len(self.BtoA) == len(aligned_files), \
+               'duplicate sequence names'
+
 
 class CoordMapper2Seqs(object) :
     """ Map 1-based coordinates between two aligned sequences.
@@ -315,7 +317,7 @@ def multichr_mafft(args):
         # passes input files in this way, and the MAFFT tool expects lists,
         # but in this case we are creating the input file ourselves
         tools.mafft.MafftTool().execute(
-                    inFastas          = [filePath], 
+                    inFastas          = [os.path.abspath(filePath)],
                     outFile           = absoluteOutDirectory + "/{}_{}.fasta".format(prefix, idx), 
                     localpair         = args.localpair, 
                     globalpair        = args.globalpair, 
@@ -374,26 +376,35 @@ def make_vcf(a, ref_idx, chrom):
                             vars.append(m+1)
             yield row+vars
 
-def transposeChromosomeFiles(inputFilenamesList, outputDirectory, outputFilePrefix, inputFormat="fasta", outputFormat="fasta"):
+def transposeChromosomeFiles(inputFilenamesList):
+    ''' Input:  a list of FASTA files representing a genome for each sample.
+                Each file contains the same number of sequences (chromosomes, segments,
+                etc) in the same order.
+        Output: a list of FASTA files representing all samples for each
+                chromosome/segment for input to a multiple sequence aligner.
+                The number of FASTA files corresponds to the number of chromosomes
+                in the genome.  Each file contains the same number of samples
+                in the same order.  Each output file is a tempfile.
+    '''
     outputFilenames = []
 
-    # open files
-    inputFilesList = [open(x, 'rU') for x in inputFilenamesList]
+    # open all files
+    inputFilesList = [util.file.open_or_gzopen(x, 'rU') for x in inputFilenamesList]
     # get BioPython iterators for each of the FASTA files specified in the input
-    fastaFiles = [SeqIO.parse(x, inputFormat) for x in inputFilesList]
+    fastaFiles = [SeqIO.parse(x, 'fasta') for x in inputFilesList]
 
     # for each interleaved record
-    for idx, chrRecordList in enumerate( zip(*fastaFiles) ):
-        outputFilename = util.file.mkstempfname('__transposed_{}_'.format(chrRecordList[0]))
-
-        fileObj = open(outputFilename, "w")
-        for record in chrRecordList:
-            # write the corresonding record to a new FASTA file
-            countWritten = SeqIO.write(record, fileObj, outputFormat)
-        fileObj.close()
-        outputFilenames.append(os.path.abspath(outputFilename))
+    for idx, chrRecordList in enumerate( zip_longest(*fastaFiles) ):
+        if any(rec==None for rec in chrRecordList):
+            raise Exception("input files must all have the same number of sequences")
+
+        outputFilename = util.file.mkstempfname('.fasta')
+        outputFilenames.append(outputFilename)
+        with open(outputFilename, "w") as outf:
+            # write the corresonding records to a new FASTA file
+            countWritten = SeqIO.write(chrRecordList, outf, 'fasta')
 
-    #close input files
+    # close all input files
     [x.close() for x in inputFilesList]
 
     return outputFilenames