Skip to content

Commit

Permalink
Merge pull request #38 from hexavier/master
Browse files Browse the repository at this point in the history
Incorporate crosstalks analysis into mim-tRNAseq
  • Loading branch information
drewjbeh authored Feb 8, 2023
2 parents 236466e + a2fa767 commit ac333e0
Show file tree
Hide file tree
Showing 4 changed files with 151 additions and 11 deletions.
73 changes: 73 additions & 0 deletions mimseq/crosstalks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 13 11:01:53 2021
@author: Xavier Hernandez-Alias
"""
import pandas as pd
import numpy as np
from os import listdir
from shutil import rmtree
from os.path import isdir,isfile,join
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
import multiprocessing
from functools import partial

# Define function for parallelization
def analyze_1sample(s,dirpath,thres):
outdf = pd.DataFrame(columns=["sample","ref","var1","var2","pval","odds_ratio","values"])
n=0
ref_files = [f for f in listdir(join(dirpath, s)) if isfile(join(dirpath, s, f))]
for f in ref_files:
# Load table
ref = f[:-7].replace(".","/")
ref = "-".join(ref.split("-")[:-1]) if not "chr" in ref and not "/" in ref else ref
readsdf = pd.read_csv(join(dirpath, s, f),sep="\t",compression="gzip",index_col="READ",dtype="category")
pairs = []
for v1 in readsdf.columns:
for v2 in readsdf.columns:
if v1!=v2 and set([v1,v2]) not in pairs:
pairs.append(set([v1,v2]))
reads1 = readsdf[v1].dropna()
reads2 = readsdf[v2].dropna()
# Do test only if at least % positions are modified
if reads1.shape[0]>0 and reads2.shape[0]>0:
if sum(reads1!="0")/reads1.shape[0]>thres and sum(reads2!="0")/reads2.shape[0]>thres:
# Keep only reads that contain both v1 and v2
tempdf = readsdf[[v1,v2]].dropna(how="any")
# Build contingency table, var1 in rows and var 2 in columns
counts = (tempdf!="0").value_counts()
if counts.shape[0]==4:
cont_tab = np.array([[counts.loc[True,True], counts.loc[True,False]],
[counts.loc[False,True], counts.loc[False,False]]])
#p = chi2_contingency(cont_tab)[1]
oddsr, p = fisher_exact(cont_tab)
outdf.loc[n] = [s,ref,v1,v2,p,oddsr,counts]
n += 1
# Remove temporary files
rmtree(join(dirpath, s))
return outdf

def crosstalks_wrapper(dirpath, thres, threads):
# Multiprocessed function
pool = multiprocessing.Pool(threads)
frozen_fun = partial(analyze_1sample, dirpath=dirpath, thres=thres)
samples = [f for f in listdir(dirpath) if isdir(join(dirpath, f))]
dfs = pool.map(func=frozen_fun, iterable=samples, chunksize=1)
pool.close()
pool.join()
# Concatenate tables
outdf = pd.concat(dfs, ignore_index=True)
# Correct multiple comparisons
outdf["pval_corrected"] = multipletests(outdf["pval"].values,method="fdr_bh")[1]
# Include canonical positions
posinfo = pd.read_csv("/".join(dirpath.split("/")[:-1])+"/mods/mismatchTable.csv", sep="\t",
index_col=["isodecoder","pos"],usecols=["isodecoder","pos","canon_pos"],
dtype="category")
posinfo = posinfo[~posinfo.index.duplicated(keep='first')]
outdf["canon_var1"] = ["Charged" if s[1]=="Charged" else posinfo.loc[(s[0],s[1]),"canon_pos"] if s[0] in posinfo.index.get_level_values(0) else "NA" for s in outdf[["ref","var1"]].to_numpy()]
outdf["canon_var2"] = ["Charged" if s[1]=="Charged" else posinfo.loc[(s[0],s[1]),"canon_pos"] if s[0] in posinfo.index.get_level_values(0) else "NA" for s in outdf[["ref","var2"]].to_numpy()]
# Save table
outdf.to_csv(dirpath+"/crosstalks.tsv",sep="\t",index=False)

19 changes: 14 additions & 5 deletions mimseq/mimseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .tRNAmap import mainAlign
from .getCoverage import getCoverage, plotCoverage
from .mmQuant import generateModsTable, plotCCA
from .crosstalks import crosstalks_wrapper
from .ssAlign import structureParser, modContext
from .splitClusters import splitIsodecoder, unsplitClustersCov, getIsodecoderSizes, getDeconvSizes, writeDeconvTranscripts, writeIsodecoderTranscripts, writeSplitInfo, writeIsodecoderInfo
import sys, os, subprocess, logging, datetime, copy
Expand Down Expand Up @@ -49,7 +50,7 @@ def restrictedFloat2(x):
raise argparse.ArgumentTypeError('{} not a real number'.format(x))

def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, posttrans, control_cond, threads, max_multi, snp_tolerance, \
keep_temp, cca, double_cca, min_cov, mismatches, remap, remap_mismatches, misinc_thresh, mito_trnas, plastid_trnas, pretrnas, local_mod, p_adj, sample_data):
keep_temp, cca, double_cca, min_cov, mismatches, remap, remap_mismatches, misinc_thresh, mito_trnas, plastid_trnas, pretrnas, local_mod, p_adj, crosstalks, sample_data):

# Main wrapper
# Integrity check for output folder argument...
Expand Down Expand Up @@ -129,7 +130,7 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po

# if remap and snp_tolerance are enabled, skip further analyses, find new mods, and redo alignment and coverage
if remap and (snp_tolerance or not mismatches == 0.0):
new_mods, new_Inosines, filtered_cov, filter_warning, unsplitCluster_lookup,readRef_unsplit_newNames = generateModsTable(coverageData, out, name, threads, min_cov, mismatch_dict, insert_dict, del_dict, cluster_dict, cca, remap, misinc_thresh, mod_lists, Inosine_lists, tRNA_dict, Inosine_clusters, unique_isodecoderMMs_new, splitBool_new, isodecoder_sizes, unsplitCluster_lookup, cluster)
new_mods, new_Inosines, filtered_cov, filter_warning, unsplitCluster_lookup,readRef_unsplit_newNames = generateModsTable(coverageData, out, name, threads, min_cov, mismatch_dict, insert_dict, del_dict, cluster_dict, cca, remap, misinc_thresh, mod_lists, Inosine_lists, tRNA_dict, Inosine_clusters, unique_isodecoderMMs_new, splitBool_new, isodecoder_sizes, unsplitCluster_lookup, cluster, crosstalks)
Inosine_clusters, snp_tolerance, newtRNA_dict, new_mod_lists, new_inosine_lists = newModsParser(out, name, new_mods, new_Inosines, mod_lists, Inosine_lists, tRNA_dict, cluster, remap, snp_tolerance)
map_round = 2
genome_index_path, genome_index_name, snp_index_path, snp_index_name = generateGSNAPIndices(species, name, out, map_round, snp_tolerance, cluster)
Expand All @@ -152,9 +153,15 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po
filtered_cov = list()
if snp_tolerance or not mismatches == 0.0:
if 'newtRNA_dict' in locals():
new_mods, new_Inosines, filtered_cov, filter_warning, unsplitCluster_lookup,readRef_unsplit_newNames = generateModsTable(coverageData, out, name, threads, min_cov, mismatch_dict, insert_dict, del_dict, cluster_dict, cca, remap, misinc_thresh, new_mod_lists, Inosine_lists, newtRNA_dict, Inosine_clusters, unique_isodecoderMMs_new, splitBool_new, isodecoder_sizes, unsplitCluster_lookup, cluster)
new_mods, new_Inosines, filtered_cov, filter_warning, unsplitCluster_lookup,readRef_unsplit_newNames = generateModsTable(coverageData, out, name, threads, min_cov, mismatch_dict, insert_dict, del_dict, cluster_dict, cca, remap, misinc_thresh, new_mod_lists, Inosine_lists, newtRNA_dict, Inosine_clusters, unique_isodecoderMMs_new, splitBool_new, isodecoder_sizes, unsplitCluster_lookup, cluster, crosstalks)
else:
new_mods, new_Inosines, filtered_cov, filter_warning, unsplitCluster_lookup, readRef_unsplit_newNames = generateModsTable(coverageData, out, name, threads, min_cov, mismatch_dict, insert_dict, del_dict, cluster_dict, cca, remap, misinc_thresh, mod_lists, Inosine_lists, tRNA_dict, Inosine_clusters, unique_isodecoderMMs_new, splitBool_new, isodecoder_sizes, unsplitCluster_lookup, cluster)
new_mods, new_Inosines, filtered_cov, filter_warning, unsplitCluster_lookup, readRef_unsplit_newNames = generateModsTable(coverageData, out, name, threads, min_cov, mismatch_dict, insert_dict, del_dict, cluster_dict, cca, remap, misinc_thresh, mod_lists, Inosine_lists, tRNA_dict, Inosine_clusters, unique_isodecoderMMs_new, splitBool_new, isodecoder_sizes, unsplitCluster_lookup, cluster, crosstalks)

if crosstalks:
# Crosstalks analysis
log.info("Analyzing crosstalks between pairs of modifications and modification-charging...")
crosstalks_wrapper(out + "single_read_data", misinc_thresh, threads)

else:
log.info("*** Misincorporation analysis not possible; either --snp-tolerance must be enabled, or --max-mismatches must not be 0! ***\n")

Expand Down Expand Up @@ -267,6 +274,8 @@ def main():
help = "Adjusted p-value threshold for DESeq2 pairwise condition differential epxression dot plots. \
tRNAs with DESeq2 adjusted p-values equal to or below this value will be displayed as green or orange triangles for up- or down-regulated tRNAs, respectively. \
Default p-adj <= 0.05")
options.add_argument('--crosstalks', required = False, dest = 'crosstalks', action = "store_true",\
help = "Enable analysis of crosstalks between pairs of modifications and modification-charging. Full details of this method in: https://doi.org/10.1093/nar/gkac1185")

align = parser.add_argument_group("GSNAP alignment options")
align.add_argument('--max-mismatches', metavar = 'allowed mismatches', required = False, dest = 'mismatches', type = float, \
Expand Down Expand Up @@ -408,7 +417,7 @@ def main():
mimseq(args.trnas, args.trnaout, args.name, args.species, args.out, args.cluster, args.cluster_id, args.cov_diff, \
args.posttrans, args.control_cond, args.threads, args.max_multi, args.snp_tolerance, \
args.keep_temp, args.cca, args.double_cca, args.min_cov, args.mismatches, args.remap, args.remap_mismatches, \
args.misinc_thresh, args.mito, args.plastid, args.pretrnas, args.local_mod, args.p_adj, args.sampledata)
args.misinc_thresh, args.mito, args.plastid, args.pretrnas, args.local_mod, args.p_adj, args.crosstalks, args.sampledata)

if __name__ == '__main__':
main()
65 changes: 61 additions & 4 deletions mimseq/mmQuant.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from __future__ import absolute_import
import os, logging
import re
import gzip
import pysam
from itertools import groupby, combinations as comb
from operator import itemgetter, le
Expand Down Expand Up @@ -112,9 +113,9 @@ def unknownMods(inputs, knownTable, cluster_dict, modTable, misinc_thresh, cov_t

return(new_mods_cluster, new_inosines_cluster)

def bamMods_mp(out_dir, min_cov, info, mismatch_dict, insert_dict, del_dict, cluster_dict, cca, tRNA_struct, tRNA_ungap2canon,remap, misinc_thresh, knownTable, tRNA_dict, unique_isodecoderMMs, splitBool, isodecoder_sizes, threads, inputs):
def bamMods_mp(out_dir, min_cov, info, mismatch_dict, insert_dict, del_dict, cluster_dict, cca, tRNA_struct, tRNA_ungap2canon,remap, misinc_thresh, knownTable, tRNA_dict, unique_isodecoderMMs, splitBool, isodecoder_sizes, crosstalks, threads, inputs):
# modification counting and table generation, and CCA analysis

# output structures
modTable = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
stopTable = defaultdict(lambda: defaultdict(int))
Expand All @@ -130,6 +131,21 @@ def bamMods_mp(out_dir, min_cov, info, mismatch_dict, insert_dict, del_dict, clu
cca_dict = defaultdict(lambda: defaultdict(int))
dinuc_dict = defaultdict(int)

# Open files for single read analysis
if crosstalks and not remap:
sample_name = inputs.split("/")[-1].split(".")[0]
if not os.path.exists(out_dir + "single_read_data/" + sample_name):
os.mkdir(out_dir + "single_read_data/" + sample_name)
srfiles = dict()
for r in isodecoder_sizes.keys():
shortname = r.split("/")[0] if "/" in r else "-".join(r.split("-")[:-1]) if not "chr" in r else r
srfiles[shortname] = gzip.open(out_dir + "single_read_data/" + sample_name +"/"+ r.replace("/",".") +".tsv.gz", "wt")
if cca:
cols = ["READ"]+[str(s) for s in tRNA_struct.loc[shortname].index]+["Charged"]
else:
cols = ["READ"]+[str(s) for s in tRNA_struct.loc[shortname].index]
srfiles[shortname].write("\t".join(cols)+"\n")

# process mods by looping through alignments in bam file
bam_file = pysam.AlignmentFile(inputs, "rb")
log.info('Analysing {}...'.format(inputs))
Expand Down Expand Up @@ -241,6 +257,41 @@ def bamMods_mp(out_dir, min_cov, info, mismatch_dict, insert_dict, del_dict, clu
elif ref_pos <= ref_length - 3:
dinuc = "Absent"
cca_dict[reference][dinuc] += 1

############################
# Single-read mods and CCA #
############################

if crosstalks and not remap:
# Modifications
shortname = "-".join(reference.split("-")[:-1]) if not "chr" in reference else reference
seqtemp = []
for n in tRNA_struct.loc[shortname].index:
if (n<(offset+1)) or (n>(aln_end)):
seqtemp.append("NA")
elif n not in temp.keys():
seqtemp.append("0")
else:
b = temp[n]
if b!="N":
seqtemp.append(b)
else:
seqtemp.append("NA")
if cca:
# Record charging status
if aln_end==ref_length:
chrg = "1"
elif aln_end==(ref_length-1):
chrg = "0"
else:
chrg = "NA"
add = [read.query_name]+seqtemp+[chrg]
else:
add = [read.query_name]+seqtemp

# Add line
srfiles[shortname].write("\t".join(add)+"\n")


# filter readRef_match_dict to those clusters with more than 10% false reads (i.e. do not match parent) or those with no True reads (usually low count clusters)
readRef_match_dict = {k:v for k, v in readRef_match_dict.items() if ((v[True]) and (v[False]/v[True] >= 0.1)) or (not v[True])}
Expand Down Expand Up @@ -413,6 +464,10 @@ def bamMods_mp(out_dir, min_cov, info, mismatch_dict, insert_dict, del_dict, clu
for dinuc, count in data.items():
if dinuc.upper() in ["CA", "CC", "C", "ABSENT"]:
CCAvsCC_counts.write(cluster + "\t" + dinuc + "\t" + inputs + "\t" + condition + "\t" + str(count) + "\n")
if crosstalks:
# Close single-read analysis files
for r in srfiles.keys():
srfiles[r].close()

log.info('Analysis complete for {}...'.format(inputs))

Expand Down Expand Up @@ -577,7 +632,7 @@ def addNA(tRNA_struct, data_type, cluster_dict, name, table):

return(table)

def generateModsTable(sampleGroups, out_dir, name, threads, min_cov, mismatch_dict, insert_dict, del_dict, cluster_dict, cca, remap, misinc_thresh, knownTable, Inosine_lists, tRNA_dict, Inosine_clusters, unique_isodecoderMMs, splitBool, isodecoder_sizes, unsplitCluster_lookup, clustering):
def generateModsTable(sampleGroups, out_dir, name, threads, min_cov, mismatch_dict, insert_dict, del_dict, cluster_dict, cca, remap, misinc_thresh, knownTable, Inosine_lists, tRNA_dict, Inosine_clusters, unique_isodecoderMMs, splitBool, isodecoder_sizes, unsplitCluster_lookup, clustering, crosstalks):
# Wrapper function to call countMods_mp with multiprocessing

if cca:
Expand All @@ -589,6 +644,7 @@ def generateModsTable(sampleGroups, out_dir, name, threads, min_cov, mismatch_di
try:
os.mkdir(out_dir + "CCAanalysis")
os.mkdir(out_dir + "mods")
if crosstalks: os.mkdir(out_dir + "single_read_data")
except FileExistsError:
log.warning("Rewriting over old mods and CCA files...")

Expand All @@ -600,6 +656,7 @@ def generateModsTable(sampleGroups, out_dir, name, threads, min_cov, mismatch_di

try:
os.mkdir(out_dir + "mods")
if crosstalks: os.mkdir(out_dir + "single_read_data")
except FileExistsError:
log.warning("Rewriting over old mods files...")

Expand Down Expand Up @@ -634,7 +691,7 @@ def generateModsTable(sampleGroups, out_dir, name, threads, min_cov, mismatch_di
pool = MyPool(multi)
# to avoid assigning too many threads, divide available threads by number of processes
threadsForMP = int(threads/multi)
func = partial(bamMods_mp, out_dir, min_cov, baminfo, mismatch_dict, insert_dict, del_dict, cluster_dict, cca, tRNA_struct_df, tRNA_ungap2canon, remap, misinc_thresh, knownTable, tRNA_dict, unique_isodecoderMMs, splitBool, isodecoder_sizes, threadsForMP)
func = partial(bamMods_mp, out_dir, min_cov, baminfo, mismatch_dict, insert_dict, del_dict, cluster_dict, cca, tRNA_struct_df, tRNA_ungap2canon, remap, misinc_thresh, knownTable, tRNA_dict, unique_isodecoderMMs, splitBool, isodecoder_sizes, crosstalks, threadsForMP)
new_mods, new_Inosines, readRef_match_dict = zip(*pool.map(func, bamlist))
pool.close()
pool.join()
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def package_files(directory):
"pysam",
"seaborn",
"matplotlib",
"pybedtools"],
"pybedtools",
"statsmodels"],
classifiers=[
"Development Status :: 4 - Beta",
"Environment :: Console",
Expand All @@ -57,4 +58,4 @@ def package_files(directory):
"Programming Language :: Python :: 3",
"Topic :: Scientific/Engineering :: Bio-Informatics"
],
zip_safe=False)
zip_safe=False)

0 comments on commit ac333e0

Please sign in to comment.