diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
index 35bc0516..6e4fb04c 100644
--- a/protzilla/data_analysis/protein_coverage.py
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -1,9 +1,24 @@
+from colour import color_scale
from docutils.nodes import title
from tqdm import tqdm
from protzilla.constants.paths import EXTERNAL_DATA_PATH
from protzilla.disk_operator import PickleOperator
+from dataclasses import dataclass
import pandas as pd
+from numpy import log2
import plotly.graph_objects as go
+
+@dataclass(unsafe_hash=True)
+class PeptideMatch:
+
+ peptide_sequence: str
+ start_location_on_protein: int
+ end_location_on_protein: int
+ intensity: float = 0.0
+ sample: str = ""
+
+
+
def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]:
"""
Builds a dictionary of all kmers in a list of protein sequences. The kmers are generated by sliding a window of size
@@ -16,9 +31,10 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic
kmer_dict = PickleOperator.read(kmer_dict_path)
return kmer_dict
kmer_dict = {}
- for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary", unit_scale=True, unit="protein"):
+ for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary",
+ unit_scale=True, unit="protein"):
for i in range(len(protein_sequence) - k + 1):
- kmer = protein_sequence[i : i + k]
+ kmer = protein_sequence[i: i + k]
if kmer in kmer_dict:
kmer_dict[kmer].append((protein_id, i))
else:
@@ -28,7 +44,8 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic
def match_peptide_to_protein_ids(
- peptide_sequence: str, protein_kmer_dictionary : dict[str, list[tuple[str, int]]], protein_dictionary: dict[str, str]
+ peptide_sequence: str, protein_kmer_dictionary: dict[str, list[tuple[str, int]]],
+ protein_dictionary: dict[str, str]
) -> list[tuple[str, int, int]]:
"""
Matches a peptide sequence to a dictionary of kmers in protein sequences.
@@ -36,7 +53,7 @@ def match_peptide_to_protein_ids(
"""
# convert the peptide sequence to a list of kmers
k = 5
- kmers = [peptide_sequence[i : i + k] for i in range(len(peptide_sequence) - k + 1)]
+ kmers = [peptide_sequence[i: i + k] for i in range(len(peptide_sequence) - k + 1)]
# for now, we only do exact matches, so only the first and last kmer of the peptide are needed
first_kmer = kmers[0]
last_kmer = kmers[-1]
@@ -58,90 +75,170 @@ def match_peptide_to_protein_ids(
subsequence = protein_sequence[start_first_kmer:end_last_kmer]
if subsequence != peptide_sequence:
continue
- assert len(subsequence) == len(peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}"
+ assert len(subsequence) == len(
+ peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}"
assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}"
assert peptide_sequence in protein_sequence, f"Peptide not in protein sequence: {peptide_sequence} not in {protein_sequence}"
hits.append((protein_id, start_first_kmer, end_last_kmer))
return hits
-
def plot_protein_coverage(
- fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str
+ fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str, samples: list[str] = None
) -> None:
"""
Plots the coverage of a protein sequence by peptides.
"""
# generate the protein kmer dictionary
+ if samples is None:
+ samples = []
+ reduced_peptide_df = peptide_df[peptide_df["Sample"].isin(samples) & peptide_df['Intensity'] > 0]
+
protein_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"]))
kmer_dict = build_kmer_dictionary(protein_dict, k=5)
protein_sequence = protein_dict[protein_id]
protein_sequence_length = len(protein_sequence)
peptide_matches = []
+ coverage = [0] * protein_sequence_length
- for peptide_sequence in tqdm(peptide_df['Sequence'].unique(), desc="Matching peptides to protein", unit_scale=True, unit="peptide"):
- protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict)
+ for sample, peptide_sequence, intensity in tqdm(
+ reduced_peptide_df[['Sample', 'Sequence', 'Intensity']].drop_duplicates().itertuples(index=False),
+ desc="Matching peptides to protein", unit_scale=True,
+ unit="peptide"):
+ protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence,
+ protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict)
for prot_id, start_location_on_protein, end_location_on_protein in protein_hits:
# we are only interested in the provided protein id, everything else is discarded
if prot_id != protein_id:
continue
# add the peptide sequence and location to the peptide matches pertaining to the protein id
- peptide_matches.append((peptide_sequence, start_location_on_protein, end_location_on_protein))
+ peptide_matches.append(
+ PeptideMatch(peptide_sequence=peptide_sequence,
+ start_location_on_protein=start_location_on_protein,
+ end_location_on_protein=end_location_on_protein,
+ intensity=log2(intensity),
+ sample=sample)
+ )
+ # update the coverage of the protein sequence
+ coverage[start_location_on_protein:end_location_on_protein] = \
+ [coverage + 1 for coverage in coverage[start_location_on_protein:end_location_on_protein]]
+ if len(peptide_matches) == 0:
+ raise ValueError(f"No peptides matched for protein {protein_id}")
# now that all the matches have been determined with their start and end on the protein sequence, we need to find
# an optimal solution for the location of their rectangles in the plot without overlap while minimizing the
# required number of rows (vertical space)
- rows_of_peptide_matches = distribute_to_rows(peptide_matches)
-
- x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in range(1, protein_sequence_length + 1)]
- fig = go.Figure(data=go.Bar(x=x_labels, y=[1] * protein_sequence_length,
- name="Protein Sequence", marker=dict(color="lightgray")), )
- for row_index, row in enumerate(rows_of_peptide_matches):
- for start_location_on_protein, end_location_on_protein, peptide_sequence in row:
- print(f"Peptide: {peptide_sequence}, start: {start_location_on_protein}, end: {end_location_on_protein}")
- # add a rectangle for each peptide
+ rows = distribute_to_rows(peptide_matches)
+
+ x_labels = [f"{amino_acid_index} ({amino_acid})" for amino_acid_index, amino_acid in enumerate(protein_sequence)]
+ max_coverage_in_sequence = max(coverage)
+ max_intensity = max([peptide_match.intensity for peptide_match in peptide_matches])
+ normalized_coverage = [value / max_coverage_in_sequence for value in coverage]
+ hover_text = [f"Position: {i}
Amino acid: {amino_acid}
Coverage: {coverage}"
+ for i, (coverage, amino_acid) in
+ enumerate(zip(normalized_coverage, protein_sequence))]
+
+ fig = go.Figure(data=go.Bar(x=x_labels,
+ y=normalized_coverage,
+ marker=dict(
+ color=normalized_coverage,
+ colorscale='Bluered',
+ colorbar=dict(title="Coverage
(normalized)"),
+ line=dict(width=0)
+ ),
+ hovertext=hover_text,
+ hoverinfo="text",
+ showlegend=False,
+ name="Coverage (normalized to max in Protein Sequence)"))
+
+ # Keep track of the current row position across all samples
+ current_row = 1
+
+ # Iterate through samples and their rows in order
+ for sample_idx, (sample, sample_rows) in enumerate(rows.items()):
+ # Add a sample label
+ fig.add_annotation(
+ x=-0.1, # slightly to the left of the plot
+ y=current_row + len(sample_rows) / 2,
+ text=sample,
+ showarrow=False,
+ textangle=-90,
+ xref='paper',
+ yref='y'
+ )
+
+ for row_index, row in enumerate(sample_rows):
+ for peptide_match in row:
+ # add a rectangle for each peptide
+ x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein
+ y0, y1 = current_row, current_row + 1
+ normalized_intensity = (peptide_match.intensity) / max_intensity
+ fig.add_shape(
+ type="rect",
+ # the coordinates need to be a bit offset, as otherwise the rectangles would start in the middle of the bars
+ x0=x0 - 0.5, y0=y0,
+ x1=x1 - 0.5, y1=y1,
+ fillcolor=f"rgba({int(255 * (1 - normalized_intensity))}, 0, {int(255 * normalized_intensity)}, 0.5)",
+ opacity=0.5,
+ layer="above",
+ )
+ # add invisible plotly object to the rectangle to show the peptide sequence when hovered over
+ fig.add_trace(go.Scatter(
+ x=x_labels[peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1],
+ y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence),
+ text=[f"Sample: {sample}
Peptide: {peptide_match.peptide_sequence}
"
+ f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})
"
+ f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence),
+ mode="markers",
+ marker=dict(opacity=0, size=30), # make marker invisible
+ hoverinfo="text",
+ hovertemplate="%{text}",
+ showlegend=False,
+ ))
+
+ # Move to next row
+ current_row += 1
+
+ # Add a horizontal line separator between samples (except after the last sample)
+ if sample_idx < len(rows) - 1:
fig.add_shape(
- type="rect",
- x0=start_location_on_protein,
- y0=row_index+1 + 0.1,
- x1=end_location_on_protein,
- y1=row_index+ 2 - 0.1,
- fillcolor="blue",
- opacity=0.5,
- layer="above",
+ type="line",
+ x0=-0.5,
+ y0=current_row,
+ x1=len(protein_sequence) - 0.5,
+ y1=current_row,
+ line=dict(
+ color="Gray",
+ width=2,
+ dash="solid",
+ )
)
- # add invisible plotly object to the rectangle to show the peptide sequence when hovered over
- fig.add_trace(go.Scatter(
- x=[x_labels[(start_location_on_protein + end_location_on_protein) // 2]],
- y=[row_index + 1.5],
- text=[f"{peptide_sequence}
({start_location_on_protein}-{end_location_on_protein})"],
- mode="markers",
- marker=dict(opacity=0, size=20), # make marker invisible
- hoverinfo="text",
- hovertemplate="%{text}",
- showlegend=False,
- ))
fig.update_layout(
title=f"Protein Coverage of {protein_id}",
xaxis_title="Protein Sequence",
- yaxis=dict(visible=False, range=[0, 1+len(rows_of_peptide_matches)+1], title=""),
+ yaxis=dict(visible=False, range=[0, 1 + current_row], title=""),
showlegend=False,
+ bargap=0
)
return dict(plots=[fig])
-def distribute_to_rows(coverage):
- coverage.sort(key=lambda x: x[1])
- rows = []
- for peptide_sequence, start_location, end_location in coverage:
+def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[PeptideMatch]]]:
+ """
+ Distributes peptide matches to rows such that no two peptides overlap in the same row.
+ Greedy algorithm (see Interval Scheduling Problem).
+ """
+ peptide_matches.sort(key=lambda peptide_match: peptide_match.start_location_on_protein)
+ rows = {sample : [] for sample in set([peptide_match.sample for peptide_match in peptide_matches])}
+ for peptide_match in peptide_matches:
# find the first row that does not overlap with the current peptide
- for row in rows:
- if row[-1][1] <= start_location:
- row.append((start_location, end_location, peptide_sequence))
+ for row in rows[peptide_match.sample]:
+ if row[-1].end_location_on_protein < peptide_match.start_location_on_protein:
+ row.append((peptide_match))
break
else:
- rows.append([(start_location, end_location, peptide_sequence)])
+ rows[peptide_match.sample].append([(peptide_match)])
return rows
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index ab8bdc27..765b3047 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -792,6 +792,7 @@ class PlotProteinCoverage(PlotStep):
"protein_id",
"fasta_df",
"peptide_df",
+ "samples"
]
output_keys = []
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index be9ea58f..38c8668e 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1103,6 +1103,11 @@ class PlotProteinCoverageForm(MethodForm):
choices=[],
label="Protein ID",
)
+ samples = CustomMultipleChoiceField(
+ choices=[],
+ label="Samples",
+ )
+
def fill_form(self, run: Run) -> None:
self.fields["peptide_df_instance"].choices = fill_helper.get_choices(
@@ -1124,9 +1129,12 @@ def fill_form(self, run: Run) -> None:
Step, "fasta_df", fasta_df_instance_id
)["Protein ID"].unique()
peptide_df_protein_ids = peptide_df["Protein ID"].unique()
- common_protein_ids = list(set(fasta_protein_ids) & set(peptide_df_protein_ids))
+ common_protein_ids = sorted(list(set(fasta_protein_ids) & set(peptide_df_protein_ids)))
self.fields["protein_id"].choices = fill_helper.to_choices(common_protein_ids)
+ peptide_df_samples = peptide_df["Sample"].unique()
+ self.fields["samples"].choices = fill_helper.to_choices(peptide_df_samples)
+
class FLEXIQuantLFForm(MethodForm):
is_dynamic = True