# plotPieChartsforAbundance.py

#!/usr/bin/env python3

import matplotlib.pyplot as plt
import sys
import csv
import pandas as pd
import numpy as np
import pickle
from getDisplayName import *


# Exactly six arguments must follow the script name.
if not len(sys.argv) == 7:
    raise Exception('Incorrect call to the script.')


# Positional arguments, in order:
#   1. outputDirectory        - where the charts and kallisto.out are written
#   2. variantDBfilename      - pickled, pre-processed variant definitions
#   3. variantFreqFilename    - sample-specific variant frequencies
#   4. kallistoFilename       - sample-specific kallisto output
#   5. k2_majorCovidFilename  - sample-specific bracken output (majorCovid DB)
#   6. freyjaOutputFile       - sample-specific Freyja output
# Disabled inputs: k2_allCovidFilename (used to be argument 5) and
# lcsFile (used to be argument 8).
(outputDirectory,
 variantDBfilename,
 variantFreqFilename,
 kallistoFilename,
 k2_majorCovidFilename,
 freyjaOutputFile) = sys.argv[1:7]


# Import the pre-processed variant definitions from file.
# Only the first pickled object is read. The objects that follow it in the
# file (uniqueMutationLabels, var2mut, mut2var, importantVars, pos2gene,
# gene2pos, sigMutationMatrix) are skipped for efficiency.
with open(variantDBfilename, 'rb') as dbHandle:
    uniqueVarNames = pickle.load(dbHandle)


########################################################################
# Generate a full-size pie chart for the current sample depicting the prevalence of variants
# Only display variants that are >= x% abundant
# Less frequent variants will be cumulated under 'other' category
def drawPieChart(names2percentages, outfilename, title='', minPlotThreshold=5):
    """Draw a pie chart of variant abundances and save it to a file.

    Input names are mapped through getDisplayName() and abundances that
    share a display name are added together. Entries whose display name is
    'Other' are dropped at that stage; the 'Other' wedge is instead computed
    as whatever is left of 100 after the plotted wedges.

    Args:
        names2percentages: dict mapping a variant name to its abundance.
            Values are treated as percentages (the remainder is taken
            against 100).
        outfilename: path handed to plt.savefig().
        title: text shown above the chart.
        minPlotThreshold: minimum combined abundance, in %, a variant needs
            to get its own wedge. Less abundant variants end up in 'Other'.
    """
    # Remainders at or below this value (in %) are not worth an extra wedge
    minOtherThreshold = 0.1

    # Lookup the display name (e.g. WHO label), cumulate minor subvariants
    names2pct_combined = {}
    for name, freq in names2percentages.items():
        dname = getDisplayName(name)
        if dname == 'Other':
            continue
        if dname in names2pct_combined:
            names2pct_combined[dname] += freq
        else:
            names2pct_combined[dname] = freq

    # Eliminate infrequent variants and cast as two parallel lists to plot
    names2plot = [name for name, pct in names2pct_combined.items()
                  if pct >= minPlotThreshold]
    percentages2plot = [names2pct_combined[name] for name in names2plot]

    # Cumulate everything that was not plotted under the 'Other' category
    other_pct = 100 - np.sum(percentages2plot)
    if other_pct > minOtherThreshold:
        names2plot.append('Other')
        percentages2plot.append(other_pct)

    colors2plot = [getColor(name) for name in names2plot]
    # Every wedge is pulled out of the centre by the same offset
    explosionArray = np.full(len(percentages2plot), 0.07)

    plt.rcParams.update({'font.size': 12})
    plt.pie(percentages2plot, labels=names2plot, autopct='%1.1f%%', shadow=False,
            explode=explosionArray, colors=colors2plot)
    plt.axis('equal')
    plt.title(title)
    plt.savefig(outfilename, dpi=300)
    # Close the figure so that the next chart starts from a clean canvas
    plt.close()


########################################################
# Linear deconvolution results
# Row i of the space-delimited frequency file belongs to uniqueVarNames[i];
# the frequency is read from the second column. Frequencies of variants
# that share a display name are added together.
names2percentages = {}
with open(variantFreqFilename, 'r') as infile:
    for counter, row in enumerate(csv.reader(infile, delimiter=" ")):
        cFreq = float(row[1])
        dname = getDisplayName(uniqueVarNames[counter])
        names2percentages[dname] = (names2percentages[dname] + cFreq
                                    if dname in names2percentages else cFreq)

deconvolutionChart = outputDirectory+'/pieChart_deconvolution.png'
drawPieChart(names2percentages, deconvolutionChart,
             title='Abundance of variants\n by linear regression')


########################################################
# kallisto results
# Parse the tab-delimited file written by kallisto. The part of the first
# column before the first '_' is taken as the pango name, and the value in
# the fourth column is collected per display name.
kallistoHits = {}
with open(kallistoFilename, 'r') as infile:
    rows = csv.reader(infile, delimiter="\t")
    next(rows)  # the first line is a header
    for fields in rows:
        dname = getDisplayName(fields[0].split('_')[0])
        kallistoHits.setdefault(dname, []).append(float(fields[3]))


# Collapse the per-name lists: entries that share a display name are summed
for varWHOname, hitList in kallistoHits.items():
    kallistoHits[varWHOname] = np.sum(hitList)

# Express every total as a percentage of the grand total
totalNumReads = sum(kallistoHits.values())
names2percentages = {
    varWHOname: 100.0 * hits / totalNumReads
    for varWHOname, hits in kallistoHits.items()
}

drawPieChart(names2percentages, outputDirectory+'/pieChart_kallisto.png',
             title='Abundance of variants by kallisto')


# Save the percentages as "<name><TAB><pct with one decimal>", one per line
with open(outputDirectory + '/kallisto.out', 'w') as outfile:
    outfile.writelines('%s\t%.1f\n' % (name, pct)
                       for name, pct in names2percentages.items())


########################################################
# Process the results of kraken2+bracken approach
# Read the tsv file generated by bracken
def importBrackenOutput(brackenFilename, tax_level):
    """Read the abundances of one taxonomic level from a bracken output file.

    The file is parsed as tab-delimited text. For every row, the first
    column is the percentage, the fourth column is the taxonomic level and
    the sixth column is the name that is passed to getDisplayName().

    Args:
        brackenFilename: path of the tab-delimited file to read.
        tax_level: only rows whose fourth column equals this value are kept.

    Returns:
        dict mapping a display name (whitespace-stripped) to its percentage.
        Rows that resolve to the same display name are added together.
        The dict is empty if the file is empty, which is what bracken
        produces when no read is variant specific.
    """
    brackenHits = {}
    with open(brackenFilename, 'r') as infile:
        for row in csv.reader(infile, delimiter="\t"):
            # Filter BEFORE parsing, so that header lines, blank lines,
            # truncated rows and rows of other taxonomic levels are skipped
            # instead of raising on float() or on a missing column.
            if len(row) < 6 or row[3] != tax_level:
                continue

            varDispName = getDisplayName(row[5]).strip()
            pctHits = float(row[0])

            # Cumulate rather than overwrite: several rows may map to the
            # same display name.
            brackenHits[varDispName] = brackenHits.get(varDispName, 0.0) + pctHits

    return brackenHits


# Disabled: chart based on the allCovid DB
#brackenHits = importBrackenOutput(k2_allCovidFilename, 'P')
#drawPieChart(brackenHits, outputDirectory+'/pieChart_k2_allCovid.png',
#             title='Abundance of variants by\n kraken2+bracken, using allCovid DB')

# Chart based on the majorCovid DB, using the rows of taxonomic level 'C'
brackenHits = importBrackenOutput(k2_majorCovidFilename, 'C')
majorCovidChart = outputDirectory+'/pieChart_k2_majorCovid.png'
drawPieChart(
    brackenHits,
    majorCovidChart,
    title='Abundance of variants by\n kraken2+bracken, using majorCovid DB',
)


########################################################
# Freyja abundance estimates
# NOTE: freyja_raw is only referenced by the disabled Option A below.
freyja_raw = pd.read_table(freyjaOutputFile, index_col=0)

# Option A (disabled): summary reported by Freyja with WHO names
# var_pct = eval( pd.Series(freyja_raw.loc['summarized'][0])[0] )

# Option B: detailed subvariant breakdown
(lineages, abundances, freyja_names) = import_freyja_demix(freyjaOutputFile)
var_pct = tuple(zip(lineages, abundances))

# Each abundance is multiplied by 100 before plotting. If a lineage shows
# up more than once, the last occurrence wins.
freyjaHits = {lineage: 100*abundance for (lineage, abundance) in var_pct}

drawPieChart(
    freyjaHits,
    outputDirectory+'/pieChart_freyja.png',
    title='Abundance of variants by Freyja',
)


# #######################################################
# Process the abundance estimates by LCS
# with open(lcsFile, 'r') as infile:
#     reader = csv.reader(infile, delimiter="\t")
#     next(reader)  # Skip the header line of lcs.out  
#     lcsHits = {}
#     for row in reader:
#         pangoName = row[1].split('_')[-1]
#         dname = getDisplayName(pangoName)
#         proportion = float(row[2])*100
#         if dname in lcsHits:
#             lcsHits[dname].append(proportion)
#         else:
#             lcsHits[dname] = [proportion]

# # Loop through the imported LCS data.
# # For duplicates, sum the proportions
# for varWHOname in lcsHits:
#     lcsHits[varWHOname] = np.sum(lcsHits[varWHOname])

# drawPieChart(lcsHits, outputDirectory+'/pieChart_lcs.png',
#              title='Abundance of variants by LCS')