Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Dev #61

Open
wants to merge 8 commits into
base: dev
Choose a base branch
from
5 changes: 4 additions & 1 deletion cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,10 @@ def _ksd(families, sequences, outdir, tmpdir, nthreads, to_stop, cds, pairwise,
prefix = os.path.basename(families)
outfile = os.path.join(outdir, "{}.ks.tsv".format(prefix))
logging.info("Saving to {}".format(outfile))
ksdb.df.to_csv(outfile)
ksdb.df.to_csv(outfile, sep="\t")
nan_outfile = os.path.join(outdir, "{}.ks.nan.tsv".format(prefix))
logging.info("Saving to {}".format(nan_outfile))
ksdb.nan_df.to_csv(nan_outfile, sep="\t")
logging.info("Making plots")
df = apply_filters(ksdb.df, [("dS", 1e-4, 5.), ("S", 10, 1e6)])
fig = default_plot(df, title=prefix, rwidth=0.8, bins=50)
Expand Down
75 changes: 69 additions & 6 deletions wgd/codeml.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,57 @@ def _strip_gaps(aln):
new_aln += aln[:,j:j+1]
return new_aln

def _strip_aln(aln, max_gap_portion=0):
"""
A column is considered as a gap position if more than a fraction `max_gap_portion` (a value
between 0 and 1) of the positions in this column are gaps. Default is 0 to remove all gaps.
If the alignment is backtranslated into CDS, it does not have to consider reading frame anymore
"""
new_aln = aln[:,0:0]
num_seq = len(aln)
for j in range(aln.get_alignment_length()):
num_gap = aln[:,j].count("-")
per_gap = num_gap / num_seq
if per_gap > max_gap_portion:
continue
else:
new_aln += aln[:,j:j+1]
return new_aln

def _get_pairwise_stat(aln):
    """
    Compute alignment statistics for a pairwise alignment.

    :param aln: a pairwise alignment (exactly two sequences)
    :return: a single-row DataFrame indexed by the sorted, ``__``-joined
        pair of sequence IDs, with columns AlignmentIdentity,
        AlignmentLength, AlignmentLengthStripped and AlignmentCoverage
    :raises ValueError: when `aln` does not contain exactly two sequences
    """
    if len(aln) != 2:
        raise ValueError("This is not a pairwise alignment")
    saln = _strip_aln(aln)
    id1, id2 = sorted([aln[0].id, aln[1].id])
    pid = '__'.join([id1, id2])
    saln_len = saln.get_alignment_length()
    aln_len = aln.get_alignment_length()
    # a fully-gapped pair strips down to nothing; report zero identity
    # (and zero coverage) instead of dividing by zero
    if saln_len == 0:
        identity = 0
    else:
        identity = (saln_len - _hamming_distance(saln[0].seq, saln[1].seq)) / saln_len
    coverage = saln_len / aln_len if aln_len else 0.0
    stat = [{
        "pair": pid,
        "AlignmentIdentity": identity,
        "AlignmentLength": aln_len,
        "AlignmentLengthStripped": saln_len,
        "AlignmentCoverage": coverage}]
    return pd.DataFrame.from_dict(stat).set_index("pair")

def _hamming_distance(s1, s2):
"""
Return the Hamming distance between equal-length sequences

:param s1: string 1
:param s2: string 2
:return: the Hamming distances between s1 and s2
"""
if len(s1) != len(s2):
raise ValueError("Undefined for sequences of unequal length")
return sum(el1 != el2 for el1, el2 in zip(s1, s2))

def _all_pairs(aln):
pairs = []
for i in range(len(aln)-1):
Expand Down Expand Up @@ -203,15 +254,22 @@ def run_codeml(self, **kwargs):
def run_codeml(self, **kwargs):
    """
    Run codeml on the full alignment. This will exclude all gap-containing
    columns, which may lead to a significant loss of data.

    :return: a ``(results, no_results)`` tuple: a DataFrame of codeml
        results merged with per-pair alignment statistics and a list of
        pairs without results (``(None, all_pairs)`` when the stripped
        alignment is empty)
    """
    stripped_aln = _strip_aln(self.aln)
    if stripped_aln.get_alignment_length() == 0:
        logging.warning("Stripped alignment length == 0 for {}".format(self.prefix))
        return None, _all_pairs(self.aln)
    parentdir = os.path.abspath(os.curdir)  # where we are currently
    os.chdir(self.tmp)  # go to tmpdir
    try:
        self.write_ctrl()
        _write_aln_codeml(self.aln, self.aln_file)
        codeml_df = _run_codeml(self.exe, self.control_file, self.out_file, **kwargs)
        # collect alignment statistics for every pair in the family
        each_pair_df = []
        for i in range(len(self.aln) - 1):
            for j in range(i + 1, len(self.aln)):
                pair = MultipleSeqAlignment([self.aln[i], self.aln[j]])
                each_pair_df.append(_get_pairwise_stat(pair))
        pair_stat_df = pd.concat(each_pair_df)
        # NOTE(review): pair_stat_df carries "pair" as its index; merging
        # on="pair" assumes _run_codeml's frame exposes it too — confirm
        results = pd.merge(pair_stat_df, codeml_df, on="pair")
    finally:
        # always return to the original working directory, even when
        # codeml or the merge fails
        os.chdir(parentdir)
    return results, []

Expand All @@ -226,14 +284,19 @@ def run_codeml_pairwise(self, **kwargs):
for i in range(len(self.aln)-1):
for j in range(i+1, len(self.aln)):
pair = MultipleSeqAlignment([self.aln[i], self.aln[j]])
stripped_pair = _strip_gaps(pair)
if stripped_pair.get_alignment_length() == 0:
pair_stat_df = _get_pairwise_stat(pair)
stripped_pair = _strip_aln(pair)
if stripped_pair.get_alignment_length() == 0: # should be min len
no_results.append([p.id for p in pair])
else:
self.write_ctrl()
_write_aln_codeml(stripped_pair, self.aln_file)
results.append(_run_codeml(self.exe, self.control_file,
self.out_file, **kwargs) )
codeml_df = _run_codeml(self.exe, self.control_file,
self.out_file, **kwargs)
merged_df = pd.merge(pair_stat_df, codeml_df, on="pair")
results.append(merged_df)
#results.append(_run_codeml(self.exe, self.control_file,
# self.out_file, **kwargs) )
if len(no_results) > 0:
logging.warning("Alignment length of 0 for {} pairs in {}".format(
len(no_results), self.prefix))
Expand Down
92 changes: 73 additions & 19 deletions wgd/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,23 @@ def _strip_gaps(aln):
new_aln += aln[:,j:j+1]
return new_aln

def _strip_aln(aln, max_gap_portion=0):
"""
A column is considered as a gap position if more than a fraction `max_gap_portion` (a value
between 0 and 1) of the positions in this column are gaps. Default is 0 to remove all gaps.
If the alignment is backtranslated into CDS, it does not have to consider reading frame anymore
"""
new_aln = aln[:,0:0]
num_seq = len(aln)
for j in range(aln.get_alignment_length()):
num_gap = aln[:,j].count("-")
per_gap = num_gap / num_seq
if per_gap > max_gap_portion:
continue
else:
new_aln += aln[:,j:j+1]
return new_aln

def _pal2nal(pro_aln, cds_seqs):
aln = {}
for i, s in enumerate(pro_aln):
Expand All @@ -58,7 +75,7 @@ def _pal2nal(pro_aln, cds_seqs):
cds_aln += cds_seq[k:k+3]
k += 3
aln[s.id] = cds_aln
return MultipleSeqAlignment([SeqRecord(v, id=k) for k, v in aln.items()])
return MultipleSeqAlignment([SeqRecord(v, id=k, description="", name="") for k, v in aln.items()])

def _log_process(o, program=""):
logging.debug("{} stderr: {}".format(program.upper(), o.stderr.decode()))
Expand Down Expand Up @@ -135,13 +152,19 @@ def make_diamond_db(self):

def run_diamond(self, seqs, eval=1e-10):
    """
    Run ``diamond blastp`` of `seqs` against this object's protein database
    and keep the filtered hits.

    :param seqs: sequence-data object providing ``prefix``, ``pro_fasta``
        and ``cds_seqs`` — presumably the same type as ``self``; confirm
        against caller
    :param eval: maximum e-value for a hit to be retained. NOTE(review):
        the name shadows the ``eval`` builtin, kept for interface
        compatibility.
    :return: DataFrame of diamond hits with self-hits and hits above the
        e-value cutoff removed (also stored in ``self.dmd_hits``)
    """
    self.make_diamond_db()
    run = "_".join([self.prefix, seqs.prefix + ".diamond.tsv"])
    tmpfile = os.path.join(self.tmp_path, run)
    cmd = ["diamond", "blastp", "-d", self.pro_db, "-q",
           seqs.pro_fasta, "-o", tmpfile]
    out = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
    logging.debug(out.stderr.decode())
    df = pd.read_csv(tmpfile, sep="\t", header=None)
    # write a copy of the diamond output with the original gene IDs
    # restored, for the user-facing output directory
    outdf = df.copy()
    outdf[0] = list(map(lambda x: seqs.cds_seqs[x].id, outdf[0]))
    outdf[1] = list(map(lambda x: self.cds_seqs[x].id, outdf[1]))
    outfile = os.path.join(self.out_path, run)
    outdf.to_csv(outfile, sep="\t", header=False, index=False)
    # drop self-hits, then apply the e-value cutoff (column 10 is the
    # e-value in diamond's default tabular output)
    df = df.loc[df[0] != df[1]]
    self.dmd_hits[seqs.prefix] = df = df.loc[df[10] <= eval]
    return df
Expand Down Expand Up @@ -173,12 +196,11 @@ def get_mcl_graph(self, *args):

def write_paranome(self, fname=None):
    """
    Write the paranome (MCL gene families) to a file.

    Each line has the form ``GF00001: gene1, gene2, ...`` with the
    original gene IDs. No header line is written: `read_gene_families`
    parses this file with ``sep=": "`` and ``header=None``, so a header
    would be misread as a family record.

    :param fname: output path; defaults to ``<out_path>/<prefix>.mcl.tsv``
    :return: the path of the written file
    """
    if not fname:
        fname = os.path.join(self.out_path, "{}.mcl.tsv".format(self.prefix))
    with open(fname, "w") as f:
        for i, (k, v) in enumerate(sorted(self.mcl.items())):
            # we report original gene IDs, not the internal safe IDs
            f.write("GF{:0>5}: ".format(i + 1))
            f.write(", ".join([self.cds_seqs[x].id for x in v]))
            f.write("\n")
    return fname
Expand Down Expand Up @@ -233,7 +255,7 @@ def _rename(family, ids):
return [ids[x] for x in family]

def read_gene_families(fname):
return pd.read_csv(fname, sep="\t", index_col=0).apply(
return pd.read_csv(fname, sep=": ", index_col=0, engine='python', header=None).apply(
lambda y: [x.split(", ") for x in y if x != ""])

def merge_seqs(seqs):
Expand Down Expand Up @@ -298,6 +320,7 @@ def __init__(self, gfid, cds, pro, tmp_path,
self.no_codeml_results = None
self.tree = None
self.out = os.path.join(self.tmp_path, "{}.csv".format(gfid))
self.nan_out = os.path.join(self.tmp_path, "{}.nan.csv".format(gfid))

# config
self.aligner = aligner # mafft | prank | muscle
Expand All @@ -320,12 +343,12 @@ def get_ks(self):
if self.codeml_results is not None:
self.get_tree()
self.compile_dataframe()
self.combine_results()
#self.combine_results()

def combine_results(self):
if self.no_codeml_results is None:
return
self.codeml_results = pd.concat([self.codeml_results, self.no_codeml_results])
#def combine_results(self):
# if self.no_codeml_results is None:
# return
# self.codeml_results = pd.concat([self.codeml_results, self.no_codeml_results])

def nan_result(self, pairs):
"""
Expand Down Expand Up @@ -368,8 +391,10 @@ def run_mafft(self, options="--auto"):

def get_codon_alignment(self):
    """
    Back-translate the protein alignment into a codon (CDS) alignment.

    The unstripped codon alignment is written to ``self.cds_alnf`` in
    fasta format before optional gap stripping, so the file always holds
    the full alignment. When ``self.strip_gaps`` is set, ``self.cds_aln``
    is replaced by a gap-stripped copy (``_strip_aln`` with its default
    threshold, i.e. all gap-containing columns removed).
    """
    self.cds_aln = _pal2nal(self.pro_aln, self.cds_seqs)
    with open(self.cds_alnf, 'w') as f:
        f.write(self.cds_aln.format('fasta'))
    if self.strip_gaps:
        self.cds_aln = _strip_aln(self.cds_aln)

def run_codeml(self):
codeml = Codeml(self.cds_aln, exe="codeml", tmp=self.tmp_path, prefix=self.id)
Expand Down Expand Up @@ -415,25 +440,54 @@ def compile_dataframe(self):
for j in range(i+1, len(l)):
gj = l[j].name
pair = "__".join(sorted([gi, gj]))
if pair not in self.codeml_results.index:
continue
node = self.tree.common_ancestor(l[i], l[j])
length = self.cds_aln.get_alignment_length()
d[pair] = {"node": node.name, "family": self.id, "alnlen": length}
dist = self.tree.distance(l[i], l[j])
outlier = 1 if self.codeml_results.loc[pair]['dS'] > 5 else 0
d[pair] = {"Node": node.name, "Family": self.id, "Outlier": outlier, "Distance": dist}
df = pd.DataFrame.from_dict(d, orient="index")
self.codeml_results = self.codeml_results.join(df)
self.node_weights()

def node_weights(self):
    """
    Add per-pair node weights to ``self.codeml_results``.

    The returned frame keeps the same number of rows.
    ``WeightOutliersIncluded`` is 1 / (number of pairs sharing the same
    (Family, Node) combination) over all rows; ``WeightsOutliersExcluded``
    is the same quantity computed after dropping rows with ``Outlier`` == 1,
    with pairs that only occur among outliers getting weight 0.
    """
    results = self.codeml_results
    all_pairs = results.copy()
    counts = all_pairs.groupby(["Family", "Node"])["dS"].transform('count')
    results["WeightOutliersIncluded"] = 1 / counts
    kept = all_pairs[all_pairs["Outlier"] == 0]
    kept_counts = kept.groupby(["Family", "Node"])["dS"].transform('count')
    excluded_weights = 1 / kept_counts
    results["WeightsOutliersExcluded"] = excluded_weights.reindex(results.index).fillna(0)

def _get_ks(family):
family.get_ks()
family.codeml_results.to_csv(family.out)
outflag = False
if family.codeml_results is not None:
family.codeml_results.to_csv(family.out)
outflag = True
if family.no_codeml_results is not None:
family.no_codeml_results.to_csv(family.nan_out)
outflag = True
if not outflag:
raise Exception("{} has empty results".format(family.id))


class KsDistributionBuilder:
    """
    Build a Ks distribution by running ``_get_ks`` for a collection of
    gene families in parallel and concatenating the per-family result
    files.
    """
    def __init__(self, gene_families, n_threads=4):
        """
        :param gene_families: iterable of gene-family objects, each
            exposing ``out`` and ``nan_out`` result-file paths
        :param n_threads: number of parallel joblib jobs
        """
        self.families = gene_families
        self.df = None       # concatenated codeml results, set by get_distribution
        self.nan_df = None   # concatenated NaN/no-result rows, if any were produced
        self.n_threads = n_threads

    def get_distribution(self):
        """
        Run Ks estimation for every family and gather the results.

        Families whose result file was never written are skipped;
        ``self.nan_df`` stays None when no family produced a NaN output
        file.
        """
        Parallel(n_jobs=self.n_threads)(
            delayed(_get_ks)(family) for family in self.families)
        self.df = pd.concat([pd.read_csv(x.out, index_col=0)
                             for x in self.families if os.path.exists(x.out)])
        self.df.index.name = None
        nan = [pd.read_csv(x.nan_out, index_col=0) for x in self.families
               if os.path.exists(x.nan_out)]
        if nan:
            self.nan_df = pd.concat(nan)
            self.nan_df.index.name = None