# CallHap_HapCallr.py

#!/bin/python
# CallHap_HapCallr V. 1.01.00
#
# A program for determining full-genome haplotype frequencies in pooled DNA 
# samples based on SNP calls and limited known haplotypes.  
# Takes as input a pair of VCF files describing haplotype identity and SNP 
# frequency, as generated by CallHap VCF_Filt
# 
# Import necessary modules
import numpy as np
from argparse import ArgumentParser
import time
import sys
import random
import os
from multiprocessing import Pool
from Modules.VCF_parser import *
from Modules.CorrHaps import *
from Modules.CallHap_LeastSquares import *
from Modules.General import *
from Modules.IO import *
from Modules.parallel import *

progVersion = "V1.01.00"

def MakeHaps(inSnpSets, inOldHaps, inInitialFreqs, InitialHaps):
    """Build candidate haplotype sets by adding one new haplotype per set.

    For every usable source haplotype (frequency indicator > 0 and the
    target SNP may be changed in it) a new haplotype is created by copying
    the source and flipping (1 - value) every SNP listed in inSnpSets.  Each
    new haplotype is appended to its own haplotype set.

    Arguments:
        inSnpSets: sequence of SNP indices to flip; inSnpSets[0] is the
            target SNP used to decide which haplotypes are usable.
        inOldHaps: list of per-haplotype arrays with 0/1 entries.  NOTE: this
            list is extended in place and returned as the first haplotype
            set, so pass a copy if the original must stay untouched.
        inInitialFreqs: list of per-haplotype presence indicators; only
            haplotypes whose entry is > 0 are used as sources.  Copied, not
            modified.
        InitialHaps: number of initially known haplotypes; passed on to
            ValidSnpsFromPhylogeny and used as the lower index bound for
            dropping a source haplotype (see below).

    Returns:
        A list of haplotype sets (lists of arrays).  Element 0 is inOldHaps
        itself, extended with the haplotype derived from the last usable
        source; the following elements are copies of the input set, each
        extended with the haplotype derived from one of the other usable
        sources, in order.  If there is no usable source the result is
        [inOldHaps] unchanged.

    Raises:
        Exception: if more than 1000 source haplotypes are usable.
    """
    targetSnp = inSnpSets[0]
    numOldHaps = len(inOldHaps)

    # Figure out what the less common identity for the target SNP is in the
    # current haplotype set (ties count 0 as the rare allele).
    snpIDs = [hap[targetSnp] for hap in inOldHaps]
    if snpIDs.count(0) > snpIDs.count(1):
        rareAllele = 1
    else:
        rareAllele = 0

    if any(allele == rareAllele for allele in snpIDs):
        # The SNP is variable among the current haplotypes: only haplotypes
        # in which the phylogeny allows this SNP to change can be used as
        # sources for new haplotypes.
        legalSnpsByHap = ValidSnpsFromPhylogeny(inOldHaps, InitialHaps)
        usableHaps = [targetSnp in legalSnpsByHap[hap]
                      for hap in range(numOldHaps)]
    else:
        # The SNP is not in any current haplotype: every haplotype can be
        # used as a source.
        usableHaps = [True] * numOldHaps

    # Candidate sets.  Set 0 aliases the caller's list on purpose (see the
    # docstring); the frequency list is a private copy.
    possibleFreqs = [inInitialFreqs[:]]
    possibleHaps = [inOldHaps]

    # Indices of the haplotypes that may act as sources.  Every index kept
    # here is < numOldHaps (usableHaps has exactly numOldHaps entries), so
    # no separate handling of "indices beyond the input set" is needed.
    baseFreq = [idx for idx, freq in enumerate(possibleFreqs[0])
                if freq > 0 and usableHaps[idx]]

    # Safety limit kept from the original implementation.
    if len(baseFreq) > 1000:
        raise Exception(
            "Too many iterations in MakeHaps with baseFreq = %s" %
            len(baseFreq)
            )

    lastPos = len(baseFreq) - 1
    for pos, srcIdx in enumerate(baseFreq):
        if pos == lastPos:
            # The last source extends the original set itself ...
            targetFreqs = possibleFreqs[0]
            targetHaps = possibleHaps[0]
        else:
            # ... every other source extends a fresh copy of it.
            possibleFreqs.append(list(possibleFreqs[0]))
            possibleHaps.append([np.copy(hap) for hap in possibleHaps[0]])
            targetFreqs = possibleFreqs[-1]
            targetHaps = possibleHaps[-1]

        # New haplotype = source haplotype with every SNP of the set flipped.
        newHap = np.copy(possibleHaps[0][srcIdx])
        for snp in inSnpSets:
            newHap[snp] = 1 - newHap[snp]
        targetFreqs.append(1)
        targetHaps.append(newHap)

        # Drop a non-initial source whose indicator truncates to zero.
        # Sources always have an indicator > 0, so this only triggers for
        # fractional values in (0, 1).
        # NOTE(review): as in the original, the entry is removed from the
        # first set even when the new haplotype went into a copy -- confirm
        # that this is intended.
        if (int(possibleFreqs[0][srcIdx]) == 0
                and srcIdx >= InitialHaps):
            possibleFreqs[0].pop(srcIdx)
            possibleHaps[0].pop(srcIdx)

    return possibleHaps

def CallHapMain(OrderNumber, o, resume=False):
    """Run the haplotype search for one SNP ordering and return its solutions.

    Loads the known haplotypes (o.knownHaps) and the pool SNP frequencies
    (o.inFreqs), builds a SNP ordering, and then, for each iteration, walks
    that ordering: for every SNP with a non-zero residual, MakeHaps proposes
    extended haplotype sets and only the sets with the lowest RSS are kept.
    At the end of each iteration the surviving sets are ranked with
    AIC_from_RSS and the best ones seed the next iteration.

    Arguments:
        OrderNumber: index of this ordering; printed as OrderNumber + 1 and
            used in the save-file name "<o.outPrefix>_save<OrderNumber>.tmp".
        o: options object.  Attributes read here: knownHaps, inFreqs,
            numRand, ordered, resume, outPrefix, numIterations,
            numProcesses, highResidual, saveFreq and dropFinal.
        resume: not read anywhere in the body; o.resume is what decides
            whether a saved state is loaded.

    The name poolSizes is read but never assigned in this function
    (presumably a module-level global set up by the __main__ block -- TODO
    confirm).

    Returns:
        outputList: one [decimal haplotype identifiers, average(SLSqs)]
        entry per reported solution.  It is either built during the last
        iteration or re-read from a save file whose first line is "Outputs".
        If the iteration loop never reaches o.numIterations - 1 the function
        falls off the end and returns None.
    """

    print("Starting Random Order %s/%s" % (str(OrderNumber + 1), 
                                           str(o.numRand)))
    # Load haplotypes
    KnownHaps, KnownNames = toNP_array(o.knownHaps, "GT")
    # Invert haplotypes so that ref allele is 1
    KnownHaps = invertArray(KnownHaps)
    # Find unique haplotypes
    inHapArray, UniqueNames = UniqueHaps(KnownHaps, KnownNames)
    # Count number of unique haplotypes
    numHapsInitial = len(UniqueNames)
    # Count number of SNPs
    numSNPs = inHapArray.shape[0]
    # Add "dummy" SNP to ensure haplotype frequencies sum correctly
    inHapArray = ExtendHaps(inHapArray)
    # Store input haplotypes in bestArray
    bestArray = np.copy(inHapArray)       
    # Load SNPs
    SnpFreqs, poolNames = toNP_array(o.inFreqs, "RF")
    # Add "dummy" SNP to ensure haplotype frequencies sum correctly
    SnpFreqs = ExtendHaps(SnpFreqs)
    # Count number of pools present
    numPools = len(poolNames)
    
    # Count number of haplotypes again to save initial number of known 
    # haplotypes for later
    # May not be needed in random method
    numHapsInitial1 = len(UniqueNames)
    # Set baseNumHapSets to keep track of source haplotype set for each created
    # haplotype set
    baseNumHapSets = 1
    # Convert haplotypes and SNPs arrays to decimal format to prevent rounding
    # errors
    bestArray = npToDecNp(bestArray)
    SnpFreqs = npToDecNp(SnpFreqs)
    
    # Find base SLSq
    # Save base RSS
    baseSLSq = []
    # Save base haplotype frequencies
    baseFreqs = []
    # save base residuals
    baseResiduals = [[]]
    # Calculate RSS for each pool
    for poolIter in xrange(numPools):
        tmpSol = Find_Freqs(bestArray, SnpFreqs[:,poolIter], poolSizes[poolIter])
        baseSLSq.append(tmpSol[1])
        baseFreqs.append(tmpSol[0])
        baseResiduals[0].append(
            np.array([[x] for x in list(residuals(tmpSol[0][0],bestArray, 
                          SnpFreqs[:,poolIter],poolSizes[poolIter]))])
            )
    # Calculate total per SNP RSS values for all SNPs; method for deterministic
    # ordering
    baseSnpResids = [sum([baseResiduals[0][pool][xSnp] 
        for pool in xrange(numPools)]) for xSnp in xrange(numSNPs)]
    # Find overall SNP frequency in SSLs; method for deterministic ordering
    # (this is a boolean per row: True where the row sum of bestArray is
    # below the number of haplotype columns)
    snpFreqsTotal = np.sum(bestArray, axis=1) < bestArray.shape[1]

    # Create SNP ordering: sorted by summed pool frequency when o.ordered is
    # set, otherwise shuffled and then sorted on the boolean flag above
    if o.ordered:
        snpFreqsTotal = np.sum(SnpFreqs, axis=1)
        snpCombins3 = [[x] for x in sorted(range(numSNPs), key = lambda x: snpFreqsTotal[x], reverse=True)]
    else:
        snpCombins3 = [[x] for x in range(numSNPs)]
        random.shuffle(snpCombins3)
        # sorted() is stable, so the shuffled order is kept within the True
        # group and within the False group
        snpCombins3 = [y for y in sorted(snpCombins3, 
        key = lambda x: snpFreqsTotal[x[0]], reverse = True)]
    
    #Find base average RSS value
    baseRSS = sum(baseSLSq)/len(baseSLSq)
    
    # Mark each known haplotype that has a positive frequency in any pool
    fullFreqs = [[0 for x in xrange(numHapsInitial)]]
    for testIter in xrange(numHapsInitial):
        for testIter2 in xrange(numPools):
            if baseFreqs[testIter2][0,testIter] > 0:
                fullFreqs[0][testIter] = 1
    # Break up haplotypes array into a list of arrays
    potHapSets = [[np.copy(bestArray[:,x]) for x in xrange(numHapsInitial)]]
    numHaps = [numHapsInitial]
    # AIC and RSS are used somewhat interchangeably as variable names
    # in this program at the moment, and I don't have the time to clean it up 
    # right now.  It will be cleaned up in the future
    bestAIC = [baseRSS]
    usedSnps = 0
    iterationsStartPoint=0
    targetUsedSNPs = 0
    if o.resume:
        print("Restarting order %s" % OrderNumber)
        # Try resuming this order from its save file.
        # If the file cannot be opened (IOError), start from scratch.
        try:
            print("Opening file %s_save%s.tmp" % (o.outPrefix, OrderNumber))
            restartInput = open("%s_save%s.tmp" % (o.outPrefix, OrderNumber),"rb")
            line = restartInput.readline().strip()
            outputList = []
            # A file starting with "Outputs" holds the final results of a
            # finished order: read them back and return immediately
            if line == "Outputs":
                print("Order %s does not need restarting" % str(OrderNumber + 1))
                line = restartInput.readline().strip()
                while "#" not in line:
                    linebins = line.split()
                    outputList.append([])
                    outputList[-1].append([int(x) for x in linebins[:-1]])
                    outputList[-1].append(float(linebins[-1]))
                    line = restartInput.readline().strip()
                restartInput.close()
                print("Outputs from order %s/%s:\n%s" % (str(OrderNumber + 1),str(o.numRand),str(outputList)))
                
                return(outputList)
            # Otherwise the file is a checkpoint made of named sections; list
            # sections end at a line containing "#"
            while line != "":
                if line=="Order":
                    print("Loading SNPs for Order %s" % str(OrderNumber))
                    line = restartInput.readline().strip().split()
                    snpCombins3 = [[int(x)] for x in line]
                elif line == "Iter":
                    print("Loading iteration number for order %s" % str(OrderNumber))
                    line = restartInput.readline().strip()
                    iterationsStartPoint= int(line)
                elif line=="Used":
                    print("Loading used SNPs for order %s" % str(OrderNumber))
                    line = restartInput.readline().strip()
                    targetUsedSNPs = int(line)
                elif line=="potHapSets":
                    print("Loading pot hap sets for order %s" % str(OrderNumber))
                    line = restartInput.readline().strip()
                    potHapSets = []
                    while "#" not in line:
                        print(line)
                        potHapSets.append([DecHapToNPHap(int(x)) for x in line.split()])
                        line = restartInput.readline().strip()
                elif line=="fullFreqs":
                    print("loading fullFreqs for order %s" % str(OrderNumber))
                    fullFreqs = []
                    line = restartInput.readline().strip()
                    while "#" not in line:
                        fullFreqs.append([int(x) for x in line.split()])
                        line = restartInput.readline().strip()
                elif line=="AIC":
                    print("Loading AICs for order %s" % str(OrderNumber))
                    line=restartInput.readline().strip()
                    bestAIC = []
                    while "#" not in line:
                        bestAIC.append(float(line))
                        line=restartInput.readline().strip()
                else:
                    pass
                line = restartInput.readline().strip()
            numHaps = [len(x) for x in potHapSets]
            restartInput.close()
        except IOError:
            print("Order %s hasn't started yet" % (str(OrderNumber + 1)))     
    print("SNP Order %s/%s: \n%s" % (str(OrderNumber + 1), 
                                     str(o.numRand), snpCombins3))
    # Start adding SNPs
    # In the case of multiple iterations:
    for iteration in xrange(iterationsStartPoint, o.numIterations):
        # Legacy line from when I was grouping SNPs based on correlation, 
        # or residual, or frequency
        for combin in snpCombins3:
            # When resuming, skip the SNPs that were already processed before
            # the checkpoint was written
            if usedSnps < targetUsedSNPs:
                usedSnps += 1
            else:
                # Keep track of where in the list of SNPs I am so the user knows 
                # something's happening
                usedSnps += 1
                # Test if this SNP combination has any non-zero residuals
                # (a residual counts as non-zero if 20 * residual rounds to
                # something other than 0)
                useCombin = False
                for hapSetIter in xrange(baseNumHapSets):
                    for population in xrange(numPools):
                        if abs(round(
                                20*baseResiduals[hapSetIter][population][combin[0]]
                                )) > 0:
                            useCombin = True
                # If this SNP combination has non-zero residuals:
                if useCombin:
                    newPotHapSets = []
                    potHapSetsAIC = []
                    newFullFreqs = []
                    # Find options for adding this SNP set:
                    currentHapSet = 0
                    snpRes = []
                    for hapSet in potHapSets:
                        # NOTE(review): fullFreqs[0] is passed for every
                        # haplotype set, not fullFreqs[currentHapSet] --
                        # confirm this is intended
                        newPotHaps = MakeHaps(combin, copy(hapSet), 
                                              fullFreqs[0], numHapsInitial)
                        SLSqs = []
                        Freqs = []
                        testAICList = []
                        maxRSSList = []
                        srcHap = []
                        # Find the average SLSq for each pot hap set
                        newPotHaps2 = []
                        if o.ordered:
                            tmpSols = easy_parallizeLS(newPotHaps, o.numProcesses, SnpFreqs, poolSizes)
                        else:
                            # Serial path: concatenate each candidate set and
                            # solve it with massFindFreqs, dropping None
                            # results
                            intermediate = []
                            for solverIter1 in xrange(len(newPotHaps)):
                                intermediate.append(
                                    easyConcat(newPotHaps[solverIter1])
                                    )
                            cleanedIntermediate = [x for x in intermediate 
                                                   if not x is None]                
                            func = partial(massFindFreqs, inSnpFreqs=SnpFreqs, 
                                           p=poolSizes)
                            result = []
                            for solverIter in xrange(len(cleanedIntermediate)):
                                result.append(func(cleanedIntermediate[solverIter]))
                            tmpSols = [x for x in result if not x is None]
                        
                        # Determine which solutions (and thus haplotypes) produce 
                        # an improvement in RSS value
                        testAICList = [x for x in xrange(len(tmpSols)) 
                                       if tmpSols[x][2] <= bestAIC[currentHapSet]]
                        # Keep track of the source haplotype set for these solutions
                        srcHap = [currentHapSet for x in xrange(len(testAICList))]
                        # Calculate per SNP residuals to test if improvement was 
                        # enough to keep this SNPs solutions
                        newResiduals = []
                        changedResids = []
                        SnpResiduals = []
                        solIter = 0
                        for sol in tmpSols:
                            newFullFreqs.append(
                                [0 for x in xrange(len(sol[1][0][0]))]
                                )
                            for testIter in xrange(len(newFullFreqs[-1])):
                                for testIter2 in xrange(numPools):
                                    if sol[1][testIter2][0,testIter] > 0:
                                        newFullFreqs[-1][testIter] = 1
                                    newResiduals.append(
                                      np.array([[x] 
                                      for x in list(residuals(sol[1][testIter2][0],
                                      np.concatenate([np.transpose(y[np.newaxis]) 
                                      for y in newPotHaps[solIter]], axis=1), 
                                      SnpFreqs[:,testIter2],poolSizes[testIter2]))])
                                      )
                            # Calculate per SNP RSS values
                            # NOTE(review): newResiduals keeps growing across
                            # solutions and haplotypes, but only its first
                            # numPools entries are indexed here -- confirm
                            SnpResiduals.append(
                                [sum([newResiduals[poolIter][x]**2 
                                for poolIter in xrange(numPools)])/numPools 
                                for x in xrange(numSNPs)]
                                )
                            solIter += 1
                        snpRes1 = [SnpResiduals[x][combin[0]] 
                                   for x in xrange(len(tmpSols))]
                        if len(testAICList) > 0:
                            # Filter to only the best solutions out of all proposed
                            # solutions based on this haplotype set
                            # Sort solutions better than starting RSS by RSS value, 
                            # from lowest to highest
                            testIndex = sorted(testAICList, 
                                               key=lambda x: tmpSols[x][2])
                            # If no best solution for this SNP exists yet, the
                            # best RSS from this haplotype set is the threshold
                            if len(potHapSetsAIC) == 0:
                                testFreq = tmpSols[testIndex[0]][2]
                            # If the best RSS from this solution is worse than the 
                            # best RSS so far proposed, use the best RSS so far 
                            # proposed
                            elif tmpSols[testIndex[0]][2] >= min(potHapSetsAIC):
                                testFreq = min(potHapSetsAIC)
                            # Otherwise, use the best RSS value from this SNP
                            else:
                                testFreq = tmpSols[testIndex[0]][2]
                            # If this RSS value represents an improvement, sort and
                            # save solutions
                            if testFreq < bestAIC[currentHapSet]:
                                iter1 = 0
                                minAICIndex = []
                                continueLoop = True
                                # Save all solutions (and thus potential haplotype
                                # sets) that represent an improvement in RSS value
                                while (iter1 < len(testIndex) and 
                                       tmpSols[testIndex[iter1]][2] <= testFreq):
                                    newPotHapSets.append(
                                        copy(newPotHaps[testIndex[iter1]])
                                        )
                                    potHapSetsAIC.append(
                                        tmpSols[testIndex[iter1]][2]
                                        )
                                    snpRes.append(snpRes1[testIndex[iter1]])
                                    iter1 += 1
                            else:
                                minAICIndex = []
                        # Next haplotype set
                        currentHapSet += 1
                    # Check if the ending residual values for a SNP are too high
                    continueCheck = [False if snpRes[x] >= o.highResidual else True 
                                     for x in xrange(len(snpRes))]
                    # Sort potential haplotype sets by RSS value
                    bestAICIdx = sorted(range(len(newPotHapSets)), 
                                        key=lambda x: potHapSetsAIC[x])
                    # Filter solutions based on RSS values, keeping only the lowest 
                    # RSS values
                    if len(bestAICIdx) > 0 and True in continueCheck:
                        bestFreq = potHapSetsAIC[bestAICIdx[0]]
                        potHapSets = []
                        bestAIC = []
                        iter1 = 0
                        minCtr = 0
                        newSourceHap = []
                        potHapSetsMaxRSS = []
                        while  iter1 < len(bestAICIdx):
                            if (potHapSetsAIC[bestAICIdx[iter1]] == bestFreq and 
                                    snpRes[bestAICIdx[iter1]] < o.highResidual):
                                minCtr += 1
                                potHapSets.append(
                                    copy(newPotHapSets[bestAICIdx[iter1]])
                                    )
                                bestAIC.append(potHapSetsAIC[bestAICIdx[iter1]])
                            iter1 += 1
                        # NOTE(review): newFullFreqs has one entry per solved
                        # candidate, while potHapSets only holds the kept
                        # ones, so the two lists are not index-aligned --
                        # confirm
                        fullFreqs = newFullFreqs[:]
                        
                        bestRSS = bestFreq
                        numHaps = [len(x) for x in potHapSets]
            if o.saveFreq > 0:
                if usedSnps % o.saveFreq == 0:
                    # Save a checkpoint after this SNP so the order can be
                    # resumed later
                    saveFile = open("%s_save%s.tmp" % (o.outPrefix, OrderNumber),"wb")
                    saveFile.write("Order\n%s\n" % "\t".join([str(x[0]) for x in snpCombins3]))
                    saveFile.write("Iter\n%s\n" % iteration)
                    saveFile.write("Used\n%s\n" % usedSnps)
                    saveFile.write("# Potential Haplotype Sets\n")
                    saveFile.write("potHapSets\n")
                    for potSaveIter in xrange(len(potHapSets)):
                        tmpOutput = []
                        for hapSaveIter in xrange(len(potHapSets[potSaveIter])):
                            # Encode each haplotype as an integer: its 0/1
                            # entries read as a binary number behind a
                            # leading "1"
                            tmpOutput.append(int("1"+"".join([str(int(x)) 
                                        for x in potHapSets[potSaveIter][hapSaveIter]]),2))
                        saveFile.write("%s\n" % "\t".join([str(x) for x in tmpOutput]))
                    saveFile.write("# Done\n")
                    saveFile.write("fullFreqs\n")
                    for potSaveIter in xrange(len(potHapSets)):
                        saveFile.write("%s\n" % "\t".join([str(x) for x in fullFreqs[potSaveIter]]))
                    saveFile.write("# Done\n")
                    saveFile.write("AIC\n%s\n" % "\n".join([str(x) for x in bestAIC])) 
                    saveFile.write("# Done")
                    saveFile.close()
        # Filter any solutions that made it through all SNPs. to only those 
        # with the lowest AIC (this time, really is AIC value)  
        SLSqs = []
        Freqs = []
        finFullFreqs = []
        SolutionHapSets = []
        SolutionAICs = []
        # Remove unused haplotypes from each potential final hap set
        intermediate = []
        for solverIter1 in xrange(len(potHapSets)):
            intermediate.append(easyConcat(potHapSets[solverIter1]))
        cleanedIntermediate = [x for x in intermediate if not x is None]                
        func = partial(massFindFreqs, inSnpFreqs=SnpFreqs, p=poolSizes)
        result = []
        for solverIter in xrange(len(cleanedIntermediate)):
            result.append(func(cleanedIntermediate[solverIter]))
        tmpSols = [x for x in result if not x is None]        
        
        # Mark, per solution, the haplotypes with positive frequency in any
        # pool
        for sol in tmpSols:
            finFullFreqs.append([0 for x in xrange(len(sol[1][0][0]))])
            for testIter in xrange(len(finFullFreqs[-1])):
                for testIter2 in xrange(numPools):
                    if sol[1][testIter2][0,testIter] > 0:
                        finFullFreqs[-1][testIter] = 1
        # Calculate AIC values for each solution
        SolutionAICs = [AIC_from_RSS(tmpSols[x][2], 
                                     sum(finFullFreqs[x]), numSNPs) 
                        for x in xrange(len(tmpSols))]
        # Create solution haplotype sets with only haplotypes present in 
        # initial haplotypes or with frequency in pools
        # Known haplotypes should be a subset of haplotypes with frequency in 
        # the final solution, but this is just in case they aren't
        SolutionHapSets = [[np.copy(potHapSets[x][y]) 
                           for y in xrange(len(finFullFreqs[x])) 
                           if finFullFreqs[x][y] > 0 or y < numHapsInitial ] 
                           for x in xrange(len(tmpSols))]
        # If SNPs are being removed permanently after the final iteration:
        if o.dropFinal == True and iteration == o.numIterations - 1:
            # Figure out which SNPs to remove for each proposed solution
            newResiduals = []
            snpsToRemove = []
            solIter = 0
            for sol in tmpSols:
                # NOTE(review): newFullFreqs is only assigned inside the SNP
                # loop above (when a SNP had non-zero residuals), and
                # newResiduals is not reset per solution while only its first
                # numPools entries are read below -- confirm
                for testIter in xrange(len(newFullFreqs)):
                    for testIter2 in xrange(numPools):
                        newResiduals.append(
                            np.array([[x] for x in list(
                                residuals(sol[1][testIter2][0],
                                   np.concatenate([np.transpose(y[np.newaxis])
                                       for y in potHapSets[solIter]], axis=1), 
                                   SnpFreqs[:,testIter2],poolSizes[testIter2])
                                )])
                            )
                SnpResiduals = [sum([newResiduals[poolIter][x]**2 
                                for poolIter in xrange(numPools)]) 
                                for x in xrange(numSNPs)]
                snpsToRemove.append([])
                for snpRemovalIter in xrange(numSNPs):
                    if SnpResiduals[snpRemovalIter] >= o.highResidual:
                        snpsToRemove[-1].append(snpRemovalIter)
                solIter += 1
        else:
            snpsToRemove = [[] for x in xrange(len(tmpSols))]
            
        
        # Figure out which solution(s) has (have) the lowest AIC
        AIC_test_idx = sorted(range(len(SolutionAICs)), 
                              key = lambda x: SolutionAICs[x])
        finIndex = 0
        testFreq = SolutionAICs[AIC_test_idx[0]]
        iter1 = 0
        minAICIndex = []
        continueLoop = True
        # Figure out how many solutions to output: count the solutions tied
        # for the lowest AIC
        while  iter1 < len(AIC_test_idx) and continueLoop:
            if SolutionAICs[AIC_test_idx[iter1]] == testFreq:
                finIndex += 1
            else:
                continueLoop = False
            iter1 += 1
        
        # Start resetting base haplotype residuals
        baseResiduals = []        
        
        # Output solutions
        newPotHapSets = []
        bestAIC = []
        if iteration == o.numIterations - 1:
            outputList = []
        for outputIdx in xrange(finIndex):
            if iteration == o.numIterations - 1:
                outputList.append([])
            # Create final haplotypes array
            finSolution = np.concatenate(
                [SolutionHapSets[
                    AIC_test_idx[outputIdx]][x][np.newaxis].transpose() 
                    for x in xrange(len(
                        SolutionHapSets[AIC_test_idx[outputIdx]]
                        ))]
                , axis=1
                )
            # Remove any SNPs that need removing
            # NOTE(review): snpsToRemove is indexed with outputIdx while the
            # solution itself is picked with AIC_test_idx[outputIdx] --
            # confirm
            finSolution = np.delete(finSolution, snpsToRemove[outputIdx], 0)
            # Find (or make) haplotype names
            myHapNames = []
            newHapNumber = 1
            for haplotypeIter in xrange(finSolution.shape[1]):
                if haplotypeIter >= len(UniqueNames):
                    # For new haplotypes, build a new haplotype name, keeping 
                    # track of iteration and new haplotype number
                    myHapNames.append(
                        "NewHap_%s.%s" % (str(iteration).zfill(2), 
                                          str(newHapNumber).zfill(2)))
                    newHapNumber += 1
                else:
                    # For known haplotypes, use the original haplotype name
                    myHapNames.append(UniqueNames[haplotypeIter])
            # Redo uniqueness of haplotypes in case removing a SNP merged two 
            # haplotypes
            finSolution, finNames = UniqueHaps(finSolution, myHapNames)
            # remove SNPs from SNP frequencies
            finSNPs = np.delete(SnpFreqs,snpsToRemove[outputIdx],0)
            
            # Create decimal haplotype identifiers
            myDecHaps = []
            for haplotypeIter in xrange(finSolution.shape[1]):
                myDecHaps.append(int("1"+"".join([str(int(x)) 
                    for x in finSolution[:, haplotypeIter]]),2))
            if iteration == o.numIterations - 1:
                outputList[-1].append(myDecHaps)

            SLSqs = []
            Freqs = []
            predSnpFreqs = []
            newResiduals = []
            
            for poolIter in xrange(numPools):
                tmpSol = Find_Freqs(finSolution, finSNPs[:,poolIter], 
                                    poolSizes[poolIter])
                SLSqs.append(tmpSol[1])
                Freqs.append(tmpSol[0])
                # Calculate residuals for this pool
                newResiduals.append(
                    np.array([[x] for x in list(residuals(tmpSol[0][0], 
                                                          finSolution, 
                                                          finSNPs[:,poolIter], 
                                                          poolSizes[poolIter]))])
                    )
                # Calculate predicted SNP frequencies                
                predSnpFreqs = np.sum(finSolution * tmpSol[0][0], 
                                      axis = 1)/poolSizes[poolIter]
            if iteration == o.numIterations - 1:
                outputList[-1].append(average(SLSqs))
            baseResiduals.append(newResiduals[:])
            # Calculate per SNP RSS values for VCF output
            SnpResiduals = [float(sum([newResiduals[poolIter][x]**2 
                for poolIter in xrange(numPools)])[0]) 
                for x in xrange(numSNPs-len(snpsToRemove[outputIdx]))]
            bestAIC.append(sum(SLSqs)/len(SLSqs))
            # Save this haplotype set for the next iteration
            newPotHapSets.append([np.copy(finSolution[:,x]) 
                for x in xrange(finSolution.shape[1])])
        # Setup for next iteration
        # (myHapNames holds the names built for the last solution handled in
        # the loop above)
        usedSnps = 0
        numHapsInitial = len(myHapNames) # may need some fixing
        UniqueNames = myHapNames[:] # may need some fixing
        numHaps = [numHapsInitial for x in xrange(len(newPotHapSets))]
        outPrefix = "%s_Iteration%s" % (o.outPrefix, iteration + 2)
        potHapSets = newPotHapSets[:]
        fullFreqs = [[1 for x in xrange(len(potHapSets[y]))] 
            for y in xrange(len(potHapSets))]
        # Go on to the next iteration
        if iteration == o.numIterations - 1:
            print("Finished Random Order %s/%s" % (str(OrderNumber + 1), 
                                                   str(o.numRand)))
                                                   
            # Overwrite the checkpoint file with the final outputs so that a
            # later resume returns them directly
            saveFile = open("%s_save%s.tmp" % (o.outPrefix, OrderNumber),"wb")
            saveFile.write("Outputs\n")
            for xIter in xrange(len(outputList)):
                saveFile.write("%s\t%s\n" % ("\t".join([str(x) for x in outputList[xIter][0]]), str(outputList[xIter][1])))
            saveFile.write("# Done")
            saveFile.close()
            print("Outputs from order %s/%s:\n%s" % (str(OrderNumber + 1),str(o.numRand),str(outputList)))
            return(outputList)
        
if __name__ == "__main__":
    # Command line entry point.  Builds the option parser, answers a version
    # request, and parses the remaining options into ``o``.
    # Load options
    parser = ArgumentParser()
    # Known haplotypes (VCF, GT field) -> o.knownHaps
    parser.add_argument(
        '-i','--inputHaps', 
        action="store", 
        dest="knownHaps", 
        help = "A VCF-formatted file containing the known haplotypes encoded \
                in the GT field.  GT must be present in the FORMAT field, and \
                ploidy must be 1.  ", 
        required=True
        )
    # Pool sizes file -> o.poolSizesFile
    parser.add_argument(
        "-p", "--poolSizes", 
        action="store", 
        dest="poolSizesFile", 
        help="A file detailing the number of individuals in each pooled library.  ", 
        required=True
        )
    # Pool SNP frequencies (VCF, RF field) -> o.inFreqs
    parser.add_argument(
        '-f','--inputFreqs', 
        action="store", 
        dest="inFreqs", 
        help="A VCF-formatted file containing the input pool frequencies \
              encoded in the RF field.  RF must be present in the FORMAT \
              field.  ", 
        required=True
        )
    # Output file name prefix -> o.outPrefix
    parser.add_argument(
        '-o','--outPrefix', 
        action="store", 
        dest="outPrefix", 
        required=True, 
        help="A prefix for output file names.  "
        )
    # Version flag -> o.v
    parser.add_argument(
        "-v", "--version", 
        action="store_true", 
        dest="v", 
        help="Displays the version number and exits."
        )
    # Worker process count -> o.numProcesses (None leaves the choice to Pool)
    parser.add_argument(
        '-t', '--processes', 
        type=int, 
        action="store", 
        dest="numProcesses", 
        default=None, 
        help="The number of processes to use.  Should not be more than the \
              number of cores on your CPU.  Defaults to using the number of \
              cores on your CPU.  "
        )
    # Iterations per random ordering -> o.numIterations
    parser.add_argument(
        '-l','--numIterations', 
        type=int, 
        action="store", 
        dest="numIterations", 
        default=1, 
        help="Number of iterations to run within each random ordering."
        )
    # Residual cutoff for delaying a SNP -> o.highResidual
    parser.add_argument(
        '-r','--highResidual', 
        type=float, 
        action="store", 
        dest="highResidual", 
        default=100, 
        help="Cutoff value for delaying processing of a SNP until after all \
              other SNPs have been processed"
        )
    # Drop delayed SNPs that do not improve the solution -> o.dropFinal
    parser.add_argument(
        '--dropFinal', 
        action="store_true", 
        dest="dropFinal", 
        help="If after delaying processing on a SNP, the solution isn't \
              improved by keeping it, drop the SNP.  If absent, the SNP will \
              be processed as normal at the end.  "
        )
    # Optional genpop output -> o.genpopOutput
    parser.add_argument(
        '--genpop', 
        action="store_true", 
        dest="genpopOutput", 
        help="Output a genpop file of the resulting haplotype frequencies.  "
        )
    # Optional Structure output -> o.strOutput
    parser.add_argument(
        '--structure', 
        action="store_true", 
        dest="strOutput", 
        help="Output a Structure formatted file of the resulting haplotype \
              frequencies.  "
        )
    # Number of random SNP orderings -> o.numRand
    parser.add_argument(
        '--numRandom', 
        type=int, 
        action="store", 
        dest="numRand", 
        help="The number of random orders to use for haplotype creation.  \
              More orders will yield more accurate results, but will also \
              take longer.  ", 
        default=1
        )
    # Number of best distinct RSS values to write outputs for -> o.topNum
    parser.add_argument(
        '--numTopRSS', 
        type=int, 
        action="store", 
        dest="topNum",  
        default=3, 
        help="The number of top RSS values you want to output files for.  \
             Increasing the size of this number may lead to a large number of \
             outputs.  "
        )
    # NOTE: the destination name is inverted relative to the flag; o.findHaps 
    # is True when the user asked NOT to search for new haplotypes.
    parser.add_argument(
        '--noSearch', 
        action="store_true", 
        dest="findHaps", 
        help="Use if you don't want to find new haplotypes."
        )
    # Resume from saved state -> o.resume
    parser.add_argument(
        '--restart', 
        action="store_true", 
        dest="resume", 
        help="Restart the program from last saved points."
        )
    # Save interval -> o.saveFreq (0 means never save)
    parser.add_argument(
        '--saveFrequency', 
        action="store", 
        type=int,
        dest="saveFreq", 
        help="how often to save; default is not to save",
        default=0
        )
    # Keep temporary save files -> o.keepTmp
    parser.add_argument(
        '--keepTemps', 
        action="store_true", 
        dest="keepTmp", 
        help="Do not delete temporary files after finishing"
        )
    # Deterministic SNP ordering -> o.ordered
    parser.add_argument('--deterministic', '-d', 
                        dest="ordered",
                        action="store_true", 
                        help="Use deterministic SNP ordering.  Ignores --numRandom.  ")
    # Answer a version request before the full parse: four options above are 
    # required, so parse_args() would otherwise reject a bare "-v" with a 
    # "required argument" error before o.v could ever be inspected.
    if "-v" in sys.argv[1:] or "--version" in sys.argv[1:]:
        print(progVersion)
        exit()
    o = parser.parse_args()
    # Deterministic ordering only ever needs a single pass
    if o.ordered:
        o.numRand = 1
    # version output (still reached for forms the check above does not match, 
    # e.g. an abbreviated long option, when the required options are given)
    if o.v:
        print(progVersion)
        exit()
    
    # Print initialization text
    print("Running CallHap on %s at %s:" % (time.strftime("%d/%m/%Y"),
                                            time.strftime("%H:%M:%S")))
    # Reconstructed command line; it is also passed to the VCF writers below,
    # so it must name this script rather than CallHap_VCF_Filt.
    CommandStr = "python CallHap_HapCallr.py %s" % " ".join(sys.argv[1:])
    print("Command = %s" % CommandStr)
    
    # Generate poolSize related numbers:
    # The pool size is read from the second whitespace-separated column of 
    # each line of the pool sizes file.
    poolSizes = []
    with open(o.poolSizesFile,"rb") as inPoolSizes:
        for line in inPoolSizes:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            poolSizes.append(int(fields[1]))
    # Set initial output prefix
    outPrefix = "%s" % (o.outPrefix)
    
    # "--noSearch" path.  o.findHaps is set by the --noSearch flag, so this 
    # branch runs when the user does NOT want new haplotypes: it fits the known 
    # haplotypes to each pool, writes the "base" output files, and exits.
    if o.findHaps:
        print("Outputing initial solution")
        print("Loading haplotypes")
        # Load haplotypes
        KnownHaps, KnownNames = toNP_array(o.knownHaps, "GT")
        # Invert haplotypes so that ref allele is 1
        KnownHaps = invertArray(KnownHaps)
        # Find unique haplotypes
        inHapArray, UniqueNames = UniqueHaps(KnownHaps, KnownNames)
        # Count number of unique haplotypes
        numHapsInitial = len(UniqueNames)
        # Count number of SNPs (taken before the dummy SNP row is added)
        numSNPs = inHapArray.shape[0]
        # Add "dummy" SNP to ensure haplotype frequencies sum correctly
        inHapArray = ExtendHaps(inHapArray)
        # Store input haplotypes in bestArray
        bestArray = np.copy(inHapArray)       
        print("Loading Frequencies")
        # Load SNPs
        SnpFreqs, poolNames = toNP_array(o.inFreqs, "RF")
        # Add "dummy" SNP to ensure haplotype frequencies sum correctly
        SnpFreqs = ExtendHaps(SnpFreqs)
        # Count number of pools present
        numPools = len(poolNames)
        
        # Count number of haplotypes again to save initial number of known 
        # haplotypes for later
        # May not be needed in random method
        # NOTE(review): numHapsInitial1 and baseNumHapSets are not read again 
        # before this branch exits.
        numHapsInitial1 = len(UniqueNames)
        # Set baseNumHapSets to keep track of source haplotype set for each created
        # haplotype set
        baseNumHapSets = 1
        # Convert haplotypes and SNPs arrays to decimal format to prevent rounding
        # errors
        bestArray = npToDecNp(bestArray)
        SnpFreqs = npToDecNp(SnpFreqs)
        
        # Find base SLSq
        # NOTE(review): baseSLSq, baseFreqs and baseResiduals are filled here 
        # but not read again before exit(); the per-pool fit is repeated in 
        # the output loop below.
        # Save base RSS
        baseSLSq = []
        # Save base haplotype frequencies
        baseFreqs = []
        # save base residuals
        baseResiduals = [[]]
        # Calculate RSS for each pool
        for poolIter in xrange(numPools):
            tmpSol = Find_Freqs(bestArray, SnpFreqs[:,poolIter], poolSizes[poolIter])
            baseSLSq.append(tmpSol[1])
            baseFreqs.append(tmpSol[0])
            # Residuals are stored as a column (one single-element row per value)
            baseResiduals[0].append(
                np.array([[x] for x in list(residuals(tmpSol[0][0],bestArray, 
                              SnpFreqs[:,poolIter],poolSizes[poolIter]))])
                )
    
        # Write nexus files for the input and the unique haplotypes
        # NOTE(review): inHapArray already carries the dummy SNP row here, 
        # whereas the search path further down passes the unextended array; 
        # confirm NexusWriter only uses the first numSNPs rows.
        NexusWriter(KnownNames, KnownHaps, numSNPs, o.outPrefix, 
                "INITIAL", o.knownHaps)
        NexusWriter(UniqueNames, inHapArray, numSNPs, o.outPrefix, 
                "Unique", o.knownHaps)
        # Build an integer identifier for each haplotype column: its values 
        # are read as binary digits behind a leading "1" (presumably so that 
        # leading zeros are not lost)
        myDecHaps = []
        for haplotypeIter in xrange(bestArray.shape[1]):
            myDecHaps.append(int("1"+"".join([str(int(x)) 
                for x in bestArray[:, haplotypeIter]]),2))
        # Create final haplotypes array
        finSolution = bestArray
        # No new haplotypes in this path, so the names are the known unique ones
        myHapNames = UniqueNames
        print("Outputing base solution ")
        # If requested, generate a structure formatted file
        if o.strOutput:
            outFile = open("%s_base.str" % (outPrefix), 'wb')
        # Generate the haplotype frequencies file
        outFile2 = open("%s_base_freqs.csv" % (outPrefix), 'wb')
        outFile2.write("Population,")
        # Finish writing first line of haplotype frequencies file
        outFile2.write(",".join(myHapNames))
        outFile2.write(",RSS")
        # Write decimal names of haplotypes
        outFile2.write(
            "\n,%s" % ",".join([str(x) for x in myDecHaps])
            )
        # Create genpop output, if requested
        if o.genpopOutput:
            genpopOut = open("%s_base.genpop" % (outPrefix), 'wb')
            genpopOut.write(
                ",%s" % (",".join(["cp." + str(x) 
                                   for x in myDecHaps]))
                )
        # Per-pool accumulators (SLSqs and Freqs are collected but not written 
        # out directly in this branch)
        SLSqs = []
        Freqs = []
        predSnpFreqs = []
        # Create regression output
        regressionOutput = open(
            "%s_base_Regression.csv" % (outPrefix), 'wb'
            )
        regressionOutput.write(
            "Pool,SNP,Observed Frequency,Predicted Frequency\n"
            )
        # Create predicted frequencies VCF output, reusing the header and 
        # per-line information of the input frequencies VCF

        
        tmpVCF = vcfReader(o.inFreqs)

        output3 = vcfWriter(
            "%s_base_PredFreqs.vcf" % (outPrefix), 
            source="CallHaps_HapCallr_%s" % progVersion, 
            commandLine=CommandStr, 
            baseHead=tmpVCF.headInfo, 
            FormatBlock=tmpVCF.headInfo["FORMAT"])
        output3.writeHeader(poolNames)
        output3.setFormat("RF")
        
        output3.importLinesInfo(
            tmpVCF.getData("chrom", lineTarget="a"),
            tmpVCF.getData("pos", lineTarget="a"), 
            tmpVCF.getData("ref", lineTarget="a"), 
            tmpVCF.getData("alt", lineTarget="a"), 
            tmpVCF.getData("qual", lineTarget="a")
            )
        newResiduals = []
        print("Finding haplotype frequencies...")
        for poolIter in xrange(numPools):
            # Fit haplotype frequencies for this pool; tmpSol[0][0] is written 
            # out as the frequencies and tmpSol[1] as the RSS
            tmpSol = Find_Freqs(bestArray, SnpFreqs[:,poolIter], poolSizes[poolIter])
            SLSqs.append(tmpSol[1])
            Freqs.append(tmpSol[0])
            # Write haplotype frequencies and RSS values for this pool
            outFile2.write(
                "\n%s,%s" % (poolNames[poolIter], 
                             ",".join([str(x) for x in tmpSol[0][0]]))
                )
            outFile2.write(",%s" % tmpSol[1])
            # Write genpop file text for this pool, if requested
            if o.genpopOutput:
                genpopOut.write(
                    "\n%s,%s" % (poolNames[poolIter],
                                 ",".join([str(x) for x in tmpSol[0][0]]))
                    )
            # Write structure file text for this pool, if requested
            if o.strOutput:
                outputProt(UniqueNames, tmpSol[0], finSolution, poolSizes[poolIter], 
                           poolNames, poolIter, outFile)
            # Calculate residuals for this pool
            newResiduals.append(
                np.array([[x] for x in list(residuals(tmpSol[0][0],
                                                      finSolution, 
                                                      SnpFreqs[:,poolIter],
                                                      poolSizes[poolIter]))])
                )
            # Calculate predicted SNP frequencies: haplotype matrix weighted 
            # by the fitted values, summed per SNP, scaled by the pool size
            predSnpFreqs = np.sum(
                finSolution * tmpSol[0][0], axis = 1
                )/poolSizes[poolIter]
            # Write regression file lines for this pool, one row per SNP: 
            # pool name, SNP index, observed frequency, predicted frequency
            regOutLines = zip(
                [poolNames[poolIter] 
                    for x in xrange(len(predSnpFreqs))],
                [str(y) for y in xrange(len(predSnpFreqs))], 
                [str(z) for z in list(SnpFreqs[:,poolIter])], 
                [str(w) for w in list(predSnpFreqs)]
                )
            regressionOutput.write(
                "\n".join([",".join(regOutLines[x]) 
                          for x in xrange(len(regOutLines))])
                )
            regressionOutput.write("\n") # add a new line between pools
            # Add predicted SNP frequencies to VCF output
            output3.importSampleValues(list(predSnpFreqs), poolNames[poolIter])
        # Calculate per SNP RSS values for VCF output: squared residuals 
        # summed over pools, for the first numSNPs rows only
        SnpResiduals = [float(sum([newResiduals[poolIter][x]**2 
                                  for poolIter in xrange(numPools)])[0]) 
                        for x in xrange(numSNPs)]
        output3.importInfo("RSS",SnpResiduals)
        output3.writeSamples()
        # Close output files
        output3.close()
        regressionOutput.close()
        outFile2.close()
        if o.strOutput:
            outFile.close()
        if o.genpopOutput:
            genpopOut.close()
        # The base solution is all that was requested; skip the search below
        exit()


    # Run the haplotype search.  Deterministic mode runs a single ordering 
    # in this process; otherwise each random ordering becomes one task in a 
    # process pool.
    if o.ordered:
        result = [CallHapMain(0, o=o, resume=False)]
    else:
        # Only a restart passes resume explicitly; a fresh run leaves it at 
        # whatever CallHapMain defaults to.
        if o.resume:
            func = partial(CallHapMain, o=o, resume=True)
        else:
            func = partial(CallHapMain, o=o)
        pool = Pool(processes=o.numProcesses, maxtasksperchild=10)
        result = pool.map(func, range(o.numRand))
        # not optimal but safe
        pool.close()
        pool.join()
    # Drop orderings that returned nothing
    cleaned = [x for x in result if x is not None]

    ## Get initial haplotypes / SNP frequencies
    # The inputs are loaded again here, after the search has finished, to 
    # provide names, SNP counts and pool frequencies for the output files.
    # Load haplotypes
    KnownHaps, KnownNames = toNP_array(o.knownHaps, "GT")
    # Invert haplotypes so that ref allele is 1
    KnownHaps = invertArray(KnownHaps)
    # Find unique haplotypes
    inHapArray, UniqueNames = UniqueHaps(KnownHaps, KnownNames)
    # Count number of unique haplotypes
    numHapsInitial = len(UniqueNames)
    # Count number of SNPs
    numSNPs = inHapArray.shape[0]
    # Note: the haplotype array is not extended with a "dummy" SNP here; 
    # only the SNP frequencies below are.
    
    # Write out starting nexus files for comparison to endpoints
    NexusWriter(KnownNames, KnownHaps, numSNPs, o.outPrefix, 
                "INITIAL", o.knownHaps)
    NexusWriter(UniqueNames, inHapArray, numSNPs, o.outPrefix, 
                "Unique", o.knownHaps)    
    
    # Load SNPs
    finSNPs, poolNames = toNP_array(o.inFreqs, "RF")
    # Add "dummy" SNP to ensure haplotype frequencies sum correctly
    finSNPs = ExtendHaps(finSNPs)
    # Count number of pools present
    numPools = len(poolNames)
    # Convert the SNP frequency array to decimal format to prevent rounding 
    # errors
    finSNPs = npToDecNp(finSNPs)
    
    # Raw per-ordering results: one CSV row per solution giving the ordering 
    # index, the solution index within that ordering, the solution's RSS and 
    # its haplotype identifiers.
    rawOutput = open("%s_RAW.csv" % outPrefix, 'wb')
    rawOutput.write("Ordering,Solution,RSS, Haplotypes")
    for orderIdx, orderSolutions in enumerate(cleaned):
        for solIdx, solution in enumerate(orderSolutions):
            # solution[0] holds the haplotype identifiers, solution[1] the RSS
            hapField = ",".join([str(x) for x in solution[0]])
            rowFields = (str(orderIdx), str(solIdx), str(solution[1]), 
                         hapField)
            rawOutput.write("\n%s,%s,%s,%s" % rowFields)
    rawOutput.close()
    
    # Summary of how often each haplotype appears: for every ordering, the 
    # fraction of its solutions containing the haplotype, averaged over all 
    # orderings.
    print("Creating summary outputs")
    summaryOutput = open("%s_summary.csv" % outPrefix, 'wb')
    summaryOutput.write("Haplotype,Frequency")
    haplotypeCounter = {}
    for orderSolutions in cleaned:
        solsInOrder = len(orderSolutions)
        # Count occurrences of each haplotype within this ordering
        tmpCounter = {}
        for solution in orderSolutions:
            for hap in solution[0]:
                tmpCounter[hap] = tmpCounter.get(hap, 0.) + 1.
        # Fold this ordering's per-solution fractions into the running totals
        for hap, hapCount in tmpCounter.items():
            haplotypeCounter[hap] = (haplotypeCounter.get(hap, 0.) 
                                     + hapCount/solsInOrder)
    numOrders = len(cleaned)
    for hap, hapTotal in haplotypeCounter.items():
        summaryOutput.write("\n%s,%s" % (str(hap), str(hapTotal /numOrders)))
    summaryOutput.close()
    
    ## Collapse the solutions from all orderings into unique topologies
    print("Find unique topologies")
    # Parallel lists, one entry per unique topology: the haplotype list as 
    # first seen, the same haplotypes as a set, the RSS, and the occurrence 
    # weight.
    UniqueTopologies = []
    # Sets are compared so that the same haplotypes in a different order are 
    # not treated as a different topology
    UniqueTopoSets = []
    UniqueTopoRSSs = []
    countTopoOccurances = []
    for orderSolutions in cleaned:
        numSols = len(orderSolutions)
        for solution in orderSolutions:
            topoSet = set(solution[0])
            if topoSet in UniqueTopoSets:
                # Seen before: add this ordering's share to its weight
                countTopoOccurances[UniqueTopoSets.index(topoSet)] += 1./numSols
            else:
                UniqueTopologies.append(solution[0])
                UniqueTopoSets.append(topoSet)
                UniqueTopoRSSs.append(solution[1])
                countTopoOccurances.append(1./numSols)
    topoCountsOutput = open("%s_topologies.csv" % outPrefix, 'wb')
    topoCountsOutput.write("RSS,Occurrences,Haplotypes")
    
    # One row per unique topology
    for topoRSS, topoCount, topoHaps in zip(UniqueTopoRSSs, 
                                            countTopoOccurances, 
                                            UniqueTopologies):
        topoCountsOutput.write(
            "\n%s,%s,%s" % (topoRSS, topoCount, 
                            ",".join([str(x) for x in topoHaps]))
            )
    topoCountsOutput.close()
    
    ## Sort unique solutions by RSS value
    # Sort a list of pointers by RSS values they point to
    # This is a list of indexes to UniqueTopologies and UniqueTopoRSSs
    print("Sort by RSS")
    RssPointers = sorted(range(len(UniqueTopoRSSs)), 
                         key=lambda x: UniqueTopoRSSs[x])
    ## Count how many of the sorted solutions carry one of the o.topNum 
    ## lowest distinct RSS values
    # whichBest is the number of distinct RSS values accepted so far and 
    # currPointer the number of solutions accepted so far.  The scan stops 
    # BEFORE consuming the first solution of distinct value o.topNum + 1, so 
    # no correction is needed afterwards; subtracting one unconditionally 
    # would drop a valid solution whenever fewer than o.topNum distinct RSS 
    # values exist.  An empty solution list simply selects nothing.
    whichBest = 0
    bestRSS = None
    currPointer = 0
    for rssPtr in RssPointers:
        thisRSS = UniqueTopoRSSs[rssPtr]
        if currPointer == 0 or thisRSS > bestRSS:
            # This solution starts a new distinct RSS value
            if whichBest >= o.topNum:
                break
            whichBest += 1
            bestRSS = thisRSS
        currPointer += 1
    ## Pull out the haplotype sets with one of the top RSS values
    ## For each haplotype set:
    finTopos = []
    finDecHaps = []
    print("Extract solutions from best RSS values")
    for rssPtr in RssPointers[:currPointer]:
        topology = UniqueTopologies[rssPtr]
        # Convert haplotype set to list of numpy arrays
        finTopos.append([DecHapToNPHap(decHap) for decHap in topology])
        # Keep the decimal identifiers alongside, in the same order
        finDecHaps.append(list(topology))
    

    ## For each converted haplotype set:
        ## Find best solution for this haplotype set
        ## Output this solution as all requested outputs
    # Output files are numbered by the position of the solution in finTopos.
    print("Find haplotype frequencies and output files")
    for outTopoPtr in xrange(len(finTopos)):
        # Create final haplotypes array: one column per haplotype
        finSolution = np.concatenate(
            [finTopos[outTopoPtr][x][np.newaxis].transpose() 
            for x in xrange(len(finTopos[outTopoPtr]))], axis=1
            )
        # Find (or make) haplotype names
        myHapNames = []
        print("Outputing solution %s/%s" % (str(outTopoPtr + 1), 
                                            str(len(finTopos))))
        print("Finding haplotype names...")
        # Names are assigned by column position: the first len(UniqueNames) 
        # columns take the known names, any further columns are new haplotypes
        for haplotypeIter in xrange(finSolution.shape[1]):
            if haplotypeIter >= len(UniqueNames):
                # For new haplotypes, build a name from the decimal 
                # haplotype identifier
                myHapNames.append("NewHap_%s" % (
                    str(finDecHaps[outTopoPtr][haplotypeIter]))
                    )
            else:
                # For known haplotypes, use the original haplotype name
                myHapNames.append(UniqueNames[haplotypeIter])
        # If requested, generate a structure formatted file
        if o.strOutput:
            outFile = open("%s_%s.str" % (outPrefix, outTopoPtr), 'wb')
        # Generate the haplotype frequencies file
        outFile2 = open("%s_%s_freqs.csv" % (outPrefix, outTopoPtr), 'wb')
        outFile2.write("Population,")
        # Finish writing first line of haplotype frequencies file
        outFile2.write(",".join(myHapNames))
        outFile2.write(",RSS")
        # Write decimal names of haplotypes
        outFile2.write(
            "\n,%s" % ",".join([str(x) for x in finDecHaps[outTopoPtr]])
            )
        # Create genpop output, if requested
        if o.genpopOutput:
            genpopOut = open("%s_%s.genpop" % (outPrefix, outTopoPtr), 'wb')
            genpopOut.write(
                ",%s" % (",".join(["cp." + str(x) 
                                   for x in finDecHaps[outTopoPtr]]))
                )
        SLSqs = []
        Freqs = []
        predSnpFreqs = []
        # Create regression output
        regressionOutput = open(
            "%s_%s_Regression.csv" % (outPrefix, outTopoPtr), 'wb'
            )
        regressionOutput.write(
            "Pool,SNP,Observed Frequency,Predicted Frequency\n"
            )
        # Create predicted frequencies VCF output, reusing the header and 
        # per-line information of the input frequencies VCF
        tmpVCF = vcfReader(o.inFreqs)

        output3 = vcfWriter(
            "%s_%s_PredFreqs.vcf" % (outPrefix, outTopoPtr), 
            source="CallHaps_HapCallr_%s" % progVersion, 
            commandLine=CommandStr, 
            baseHead=tmpVCF.headInfo, 
            FormatBlock=tmpVCF.headInfo["FORMAT"])
            
        output3.writeHeader(poolNames)
        output3.setFormat("RF")
        
        output3.importLinesInfo(
            tmpVCF.getData("chrom", lineTarget="a"),
            tmpVCF.getData("pos", lineTarget="a"), 
            tmpVCF.getData("ref", lineTarget="a"), 
            tmpVCF.getData("alt", lineTarget="a"), 
            tmpVCF.getData("qual", lineTarget="a")
            )
        newResiduals = []
        print("Finding haplotype frequencies...")
        for poolIter in xrange(numPools):
            # Fit haplotype frequencies for this pool; tmpSol[0][0] is written 
            # out as the frequencies and tmpSol[1] as the RSS
            tmpSol = Find_Freqs(finSolution, finSNPs[:,poolIter], poolSizes[poolIter])
            SLSqs.append(tmpSol[1])
            Freqs.append(tmpSol[0])
            # Write haplotype frequencies and RSS values for this pool
            outFile2.write(
                "\n%s,%s" % (poolNames[poolIter], 
                             ",".join([str(x) for x in tmpSol[0][0]]))
                )
            outFile2.write(",%s" % tmpSol[1])
            # Write genpop file text for this pool, if requested
            if o.genpopOutput:
                genpopOut.write(
                    "\n%s,%s" % (poolNames[poolIter],
                                 ",".join([str(x) for x in tmpSol[0][0]]))
                    )
            # Write structure file text for this pool, if requested
            # NOTE(review): this passes UniqueNames, not myHapNames; confirm 
            # outputProt does not need names for the new haplotypes.
            if o.strOutput:
                outputProt(UniqueNames, tmpSol[0], finSolution, poolSizes[poolIter], 
                           poolNames, poolIter, outFile)
            # Calculate residuals for this pool
            newResiduals.append(
                np.array([[x] for x in list(residuals(tmpSol[0][0],
                                                      finSolution, 
                                                      finSNPs[:,poolIter],
                                                      poolSizes[poolIter]))])
                )
            # Calculate predicted SNP frequencies: haplotype matrix weighted 
            # by the fitted values, summed per SNP, scaled by the pool size
            predSnpFreqs = np.sum(
                finSolution * tmpSol[0][0], axis = 1
                )/poolSizes[poolIter]
            # Write regression file lines for this pool, one row per SNP: 
            # pool name, SNP index, observed frequency, predicted frequency
            regOutLines = zip(
                [poolNames[poolIter] 
                    for x in xrange(len(predSnpFreqs))],
                [str(y) for y in xrange(len(predSnpFreqs))], 
                [str(z) for z in list(finSNPs[:,poolIter])], 
                [str(w) for w in list(predSnpFreqs)]
                )
            regressionOutput.write(
                "\n".join([",".join(regOutLines[x]) 
                          for x in xrange(len(regOutLines))])
                )
            regressionOutput.write("\n") # add a new line between pools
            # Add predicted SNP frequencies to VCF output
            output3.importSampleValues(list(predSnpFreqs), poolNames[poolIter])
        # Calculate per SNP RSS values for VCF output: squared residuals 
        # summed over pools, for the first numSNPs rows only
        SnpResiduals = [float(sum([newResiduals[poolIter][x]**2 
                                  for poolIter in xrange(numPools)])[0]) 
                        for x in xrange(numSNPs)]
        output3.importInfo("RSS",SnpResiduals)
        output3.writeSamples()
        # Close output files
        output3.close()
        regressionOutput.close()
        outFile2.close()
        if o.strOutput:
            outFile.close()
        if o.genpopOutput:
            genpopOut.close()
        # Write Nexus file for this solution
        # This allows for network phylogeny construction
        NexusWriter(myHapNames, finSolution, numSNPs, outPrefix, 
                    outTopoPtr, o.knownHaps)
    # Delete any remaining temporary files.  This runs once, after every 
    # solution has been written, instead of being repeated for each solution 
    # (and it now also runs when there were no solutions to write).
    if not o.keepTmp:
        for deletionIter in xrange(o.numRand):
            try:
                os.remove("%s_save%s.tmp" % (o.outPrefix, deletionIter))
            except OSError:
                # The save file was never written or is already gone
                pass