Skip to content

Commit

Permalink
Add sample selection, coloring based on intensities to the protein co…
Browse files Browse the repository at this point in the history
…verage plot
  • Loading branch information
henninggaertner committed Nov 27, 2024
1 parent 4e981c2 commit 1006c0c
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 48 deletions.
191 changes: 144 additions & 47 deletions protzilla/data_analysis/protein_coverage.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,24 @@
from colour import color_scale
from docutils.nodes import title
from tqdm import tqdm
from protzilla.constants.paths import EXTERNAL_DATA_PATH
from protzilla.disk_operator import PickleOperator
from dataclasses import dataclass
import pandas as pd
from numpy import log2
import plotly.graph_objects as go

@dataclass(unsafe_hash=True)
class PeptideMatch:

peptide_sequence: str
start_location_on_protein: int
end_location_on_protein: int
intensity: float = 0.0
sample: str = ""



def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]:
"""
Builds a dictionary of all kmers in a list of protein sequences. The kmers are generated by sliding a window of size
Expand All @@ -16,9 +31,10 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic
kmer_dict = PickleOperator.read(kmer_dict_path)
return kmer_dict
kmer_dict = {}
for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary", unit_scale=True, unit="protein"):
for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary",
unit_scale=True, unit="protein"):
for i in range(len(protein_sequence) - k + 1):
kmer = protein_sequence[i : i + k]
kmer = protein_sequence[i: i + k]
if kmer in kmer_dict:
kmer_dict[kmer].append((protein_id, i))
else:
Expand All @@ -28,15 +44,16 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic


def match_peptide_to_protein_ids(
peptide_sequence: str, protein_kmer_dictionary : dict[str, list[tuple[str, int]]], protein_dictionary: dict[str, str]
peptide_sequence: str, protein_kmer_dictionary: dict[str, list[tuple[str, int]]],
protein_dictionary: dict[str, str]
) -> list[tuple[str, int, int]]:
"""
Matches a peptide sequence to a dictionary of kmers in protein sequences.
Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein sequence.
"""
# convert the peptide sequence to a list of kmers
k = 5
kmers = [peptide_sequence[i : i + k] for i in range(len(peptide_sequence) - k + 1)]
kmers = [peptide_sequence[i: i + k] for i in range(len(peptide_sequence) - k + 1)]
# for now, we only do exact matches, so only the first and last kmer of the peptide are needed
first_kmer = kmers[0]
last_kmer = kmers[-1]
Expand All @@ -58,90 +75,170 @@ def match_peptide_to_protein_ids(
subsequence = protein_sequence[start_first_kmer:end_last_kmer]
if subsequence != peptide_sequence:
continue
assert len(subsequence) == len(peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}"
assert len(subsequence) == len(
peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}"
assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}"
assert peptide_sequence in protein_sequence, f"Peptide not in protein sequence: {peptide_sequence} not in {protein_sequence}"
hits.append((protein_id, start_first_kmer, end_last_kmer))
return hits



def plot_protein_coverage(
fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str
fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str, samples: list[str] = None
) -> None:
"""
Plots the coverage of a protein sequence by peptides.
"""
# generate the protein kmer dictionary
if samples is None:
samples = []
reduced_peptide_df = peptide_df[peptide_df["Sample"].isin(samples) & peptide_df['Intensity'] > 0]

protein_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"]))
kmer_dict = build_kmer_dictionary(protein_dict, k=5)

protein_sequence = protein_dict[protein_id]
protein_sequence_length = len(protein_sequence)
peptide_matches = []
coverage = [0] * protein_sequence_length

for peptide_sequence in tqdm(peptide_df['Sequence'].unique(), desc="Matching peptides to protein", unit_scale=True, unit="peptide"):
protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict)
for sample, peptide_sequence, intensity in tqdm(
reduced_peptide_df[['Sample', 'Sequence', 'Intensity']].drop_duplicates().itertuples(index=False),
desc="Matching peptides to protein", unit_scale=True,
unit="peptide"):
protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence,
protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict)
for prot_id, start_location_on_protein, end_location_on_protein in protein_hits:
# we are only interested in the provided protein id, everything else is discarded
if prot_id != protein_id:
continue
# add the peptide sequence and location to the peptide matches pertaining to the protein id
peptide_matches.append((peptide_sequence, start_location_on_protein, end_location_on_protein))
peptide_matches.append(
PeptideMatch(peptide_sequence=peptide_sequence,
start_location_on_protein=start_location_on_protein,
end_location_on_protein=end_location_on_protein,
intensity=log2(intensity),
sample=sample)
)
# update the coverage of the protein sequence
coverage[start_location_on_protein:end_location_on_protein] = \
[coverage + 1 for coverage in coverage[start_location_on_protein:end_location_on_protein]]

if len(peptide_matches) == 0:
raise ValueError(f"No peptides matched for protein {protein_id}")
# now that all the matches have been determined with their start and end on the protein sequence, we need to find
# an optimal solution for the location of their rectangles in the plot without overlap while minimizing the
# required number of rows (vertical space)
rows_of_peptide_matches = distribute_to_rows(peptide_matches)

x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in range(1, protein_sequence_length + 1)]
fig = go.Figure(data=go.Bar(x=x_labels, y=[1] * protein_sequence_length,
name="Protein Sequence", marker=dict(color="lightgray")), )
for row_index, row in enumerate(rows_of_peptide_matches):
for start_location_on_protein, end_location_on_protein, peptide_sequence in row:
print(f"Peptide: {peptide_sequence}, start: {start_location_on_protein}, end: {end_location_on_protein}")
# add a rectangle for each peptide
rows = distribute_to_rows(peptide_matches)

x_labels = [f"{amino_acid_index} ({amino_acid})" for amino_acid_index, amino_acid in enumerate(protein_sequence)]
max_coverage_in_sequence = max(coverage)
max_intensity = max([peptide_match.intensity for peptide_match in peptide_matches])
normalized_coverage = [value / max_coverage_in_sequence for value in coverage]
hover_text = [f"Position: {i}<br>Amino acid: {amino_acid}<br>Coverage: {coverage}"
for i, (coverage, amino_acid) in
enumerate(zip(normalized_coverage, protein_sequence))]

fig = go.Figure(data=go.Bar(x=x_labels,
y=normalized_coverage,
marker=dict(
color=normalized_coverage,
colorscale='Bluered',
colorbar=dict(title="Coverage<br>(normalized)"),
line=dict(width=0)
),
hovertext=hover_text,
hoverinfo="text",
showlegend=False,
name="Coverage (normalized to max in Protein Sequence)"))

# Keep track of the current row position across all samples
current_row = 1

# Iterate through samples and their rows in order
for sample_idx, (sample, sample_rows) in enumerate(rows.items()):
# Add a sample label
fig.add_annotation(
x=-0.1, # slightly to the left of the plot
y=current_row + len(sample_rows) / 2,
text=sample,
showarrow=False,
textangle=-90,
xref='paper',
yref='y'
)

for row_index, row in enumerate(sample_rows):
for peptide_match in row:
# add a rectangle for each peptide
x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein
y0, y1 = current_row, current_row + 1
normalized_intensity = (peptide_match.intensity) / max_intensity
fig.add_shape(
type="rect",
# the coordinates need to be a bit offset, as otherwise the rectangles would start in the middle of the bars
x0=x0 - 0.5, y0=y0,
x1=x1 - 0.5, y1=y1,
fillcolor=f"rgba({int(255 * (1 - normalized_intensity))}, 0, {int(255 * normalized_intensity)}, 0.5)",
opacity=0.5,
layer="above",
)
# add invisible plotly object to the rectangle to show the peptide sequence when hovered over
fig.add_trace(go.Scatter(
x=x_labels[peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1],
y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence),
text=[f"Sample: {sample}<br>Peptide: {peptide_match.peptide_sequence}<br>"
f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})<br>"
f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence),
mode="markers",
marker=dict(opacity=0, size=30), # make marker invisible
hoverinfo="text",
hovertemplate="%{text}<extra></extra>",
showlegend=False,
))

# Move to next row
current_row += 1

# Add a horizontal line separator between samples (except after the last sample)
if sample_idx < len(rows) - 1:
fig.add_shape(
type="rect",
x0=start_location_on_protein,
y0=row_index+1 + 0.1,
x1=end_location_on_protein,
y1=row_index+ 2 - 0.1,
fillcolor="blue",
opacity=0.5,
layer="above",
type="line",
x0=-0.5,
y0=current_row,
x1=len(protein_sequence) - 0.5,
y1=current_row,
line=dict(
color="Gray",
width=2,
dash="solid",
)
)
# add invisible plotly object to the rectangle to show the peptide sequence when hovered over
fig.add_trace(go.Scatter(
x=[x_labels[(start_location_on_protein + end_location_on_protein) // 2]],
y=[row_index + 1.5],
text=[f"{peptide_sequence}<br>({start_location_on_protein}-{end_location_on_protein})"],
mode="markers",
marker=dict(opacity=0, size=20), # make marker invisible
hoverinfo="text",
hovertemplate="%{text}<extra></extra>",
showlegend=False,
))

fig.update_layout(
title=f"Protein Coverage of {protein_id}",
xaxis_title="Protein Sequence",
yaxis=dict(visible=False, range=[0, 1+len(rows_of_peptide_matches)+1], title=""),
yaxis=dict(visible=False, range=[0, 1 + current_row], title=""),
showlegend=False,
bargap=0
)

return dict(plots=[fig])


def distribute_to_rows(coverage):
coverage.sort(key=lambda x: x[1])
rows = []
for peptide_sequence, start_location, end_location in coverage:
def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[PeptideMatch]]]:
"""
Distributes peptide matches to rows such that no two peptides overlap in the same row.
Greedy algorithm (see Interval Scheduling Problem).
"""
peptide_matches.sort(key=lambda peptide_match: peptide_match.start_location_on_protein)
rows = {sample : [] for sample in set([peptide_match.sample for peptide_match in peptide_matches])}
for peptide_match in peptide_matches:
# find the first row that does not overlap with the current peptide
for row in rows:
if row[-1][1] <= start_location:
row.append((start_location, end_location, peptide_sequence))
for row in rows[peptide_match.sample]:
if row[-1].end_location_on_protein < peptide_match.start_location_on_protein:
row.append((peptide_match))
break
else:
rows.append([(start_location, end_location, peptide_sequence)])
rows[peptide_match.sample].append([(peptide_match)])
return rows
1 change: 1 addition & 0 deletions protzilla/methods/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,6 +792,7 @@ class PlotProteinCoverage(PlotStep):
"protein_id",
"fasta_df",
"peptide_df",
"samples"
]
output_keys = []

Expand Down
10 changes: 9 additions & 1 deletion ui/runs/forms/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -1103,6 +1103,11 @@ class PlotProteinCoverageForm(MethodForm):
choices=[],
label="Protein ID",
)
samples = CustomMultipleChoiceField(
choices=[],
label="Samples",
)


def fill_form(self, run: Run) -> None:
self.fields["peptide_df_instance"].choices = fill_helper.get_choices(
Expand All @@ -1124,9 +1129,12 @@ def fill_form(self, run: Run) -> None:
Step, "fasta_df", fasta_df_instance_id
)["Protein ID"].unique()
peptide_df_protein_ids = peptide_df["Protein ID"].unique()
common_protein_ids = list(set(fasta_protein_ids) & set(peptide_df_protein_ids))
common_protein_ids = sorted(list(set(fasta_protein_ids) & set(peptide_df_protein_ids)))
self.fields["protein_id"].choices = fill_helper.to_choices(common_protein_ids)

peptide_df_samples = peptide_df["Sample"].unique()
self.fields["samples"].choices = fill_helper.to_choices(peptide_df_samples)


class FLEXIQuantLFForm(MethodForm):
is_dynamic = True
Expand Down

0 comments on commit 1006c0c

Please sign in to comment.