diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py index 35bc0516..6e4fb04c 100644 --- a/protzilla/data_analysis/protein_coverage.py +++ b/protzilla/data_analysis/protein_coverage.py @@ -1,9 +1,24 @@ +from colour import color_scale from docutils.nodes import title from tqdm import tqdm from protzilla.constants.paths import EXTERNAL_DATA_PATH from protzilla.disk_operator import PickleOperator +from dataclasses import dataclass import pandas as pd +from numpy import log2 import plotly.graph_objects as go + +@dataclass(unsafe_hash=True) +class PeptideMatch: + + peptide_sequence: str + start_location_on_protein: int + end_location_on_protein: int + intensity: float = 0.0 + sample: str = "" + + + def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]: """ Builds a dictionary of all kmers in a list of protein sequences. The kmers are generated by sliding a window of size @@ -16,9 +31,10 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic kmer_dict = PickleOperator.read(kmer_dict_path) return kmer_dict kmer_dict = {} - for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary", unit_scale=True, unit="protein"): + for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary", + unit_scale=True, unit="protein"): for i in range(len(protein_sequence) - k + 1): - kmer = protein_sequence[i : i + k] + kmer = protein_sequence[i: i + k] if kmer in kmer_dict: kmer_dict[kmer].append((protein_id, i)) else: @@ -28,7 +44,8 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic def match_peptide_to_protein_ids( - peptide_sequence: str, protein_kmer_dictionary : dict[str, list[tuple[str, int]]], protein_dictionary: dict[str, str] + peptide_sequence: str, protein_kmer_dictionary: dict[str, list[tuple[str, int]]], + protein_dictionary: dict[str, str] ) -> list[tuple[str, int, int]]: """ Matches a peptide sequence to a dictionary of kmers in protein sequences. @@ -36,7 +53,7 @@ def match_peptide_to_protein_ids( """ # convert the peptide sequence to a list of kmers k = 5 - kmers = [peptide_sequence[i : i + k] for i in range(len(peptide_sequence) - k + 1)] + kmers = [peptide_sequence[i: i + k] for i in range(len(peptide_sequence) - k + 1)] # for now, we only do exact matches, so only the first and last kmer of the peptide are needed first_kmer = kmers[0] last_kmer = kmers[-1] @@ -58,90 +75,170 @@ def match_peptide_to_protein_ids( subsequence = protein_sequence[start_first_kmer:end_last_kmer] if subsequence != peptide_sequence: continue - assert len(subsequence) == len(peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}" + assert len(subsequence) == len( + peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}" assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}" assert peptide_sequence in protein_sequence, f"Peptide not in protein sequence: {peptide_sequence} not in {protein_sequence}" hits.append((protein_id, start_first_kmer, end_last_kmer)) return hits - def plot_protein_coverage( - fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str + fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str, samples: list[str] = None ) -> None: """ Plots the coverage of a protein sequence by peptides. """ # generate the protein kmer dictionary + if samples is None: + samples = [] + reduced_peptide_df = peptide_df[peptide_df["Sample"].isin(samples) & peptide_df['Intensity'] > 0] + protein_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"])) kmer_dict = build_kmer_dictionary(protein_dict, k=5) protein_sequence = protein_dict[protein_id] protein_sequence_length = len(protein_sequence) peptide_matches = [] + coverage = [0] * protein_sequence_length - for peptide_sequence in tqdm(peptide_df['Sequence'].unique(), desc="Matching peptides to protein", unit_scale=True, unit="peptide"): - protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict) + for sample, peptide_sequence, intensity in tqdm( + reduced_peptide_df[['Sample', 'Sequence', 'Intensity']].drop_duplicates().itertuples(index=False), + desc="Matching peptides to protein", unit_scale=True, + unit="peptide"): + protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, + protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict) for prot_id, start_location_on_protein, end_location_on_protein in protein_hits: # we are only interested in the provided protein id, everything else is discarded if prot_id != protein_id: continue # add the peptide sequence and location to the peptide matches pertaining to the protein id - peptide_matches.append((peptide_sequence, start_location_on_protein, end_location_on_protein)) + peptide_matches.append( + PeptideMatch(peptide_sequence=peptide_sequence, + start_location_on_protein=start_location_on_protein, + end_location_on_protein=end_location_on_protein, + intensity=log2(intensity), + sample=sample) + ) + # update the coverage of the protein sequence + coverage[start_location_on_protein:end_location_on_protein] = \ + [coverage + 1 for coverage in coverage[start_location_on_protein:end_location_on_protein]] + if len(peptide_matches) == 0: + raise ValueError(f"No peptides matched for protein {protein_id}") # now that all the matches have been determined with their start and end on the protein sequence, we need to find # an optimal solution for the location of their rectangles in the plot without overlap while minimizing the # required number of rows (vertical space) - rows_of_peptide_matches = distribute_to_rows(peptide_matches) - - x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in range(1, protein_sequence_length + 1)] - fig = go.Figure(data=go.Bar(x=x_labels, y=[1] * protein_sequence_length, - name="Protein Sequence", marker=dict(color="lightgray")), ) - for row_index, row in enumerate(rows_of_peptide_matches): - for start_location_on_protein, end_location_on_protein, peptide_sequence in row: - print(f"Peptide: {peptide_sequence}, start: {start_location_on_protein}, end: {end_location_on_protein}") - # add a rectangle for each peptide + rows = distribute_to_rows(peptide_matches) + + x_labels = [f"{amino_acid_index} ({amino_acid})" for amino_acid_index, amino_acid in enumerate(protein_sequence)] + max_coverage_in_sequence = max(coverage) + max_intensity = max([peptide_match.intensity for peptide_match in peptide_matches]) + normalized_coverage = [value / max_coverage_in_sequence for value in coverage] + hover_text = [f"Position: {i}
Amino acid: {amino_acid}
Coverage: {coverage}" + for i, (coverage, amino_acid) in + enumerate(zip(normalized_coverage, protein_sequence))] + + fig = go.Figure(data=go.Bar(x=x_labels, + y=normalized_coverage, + marker=dict( + color=normalized_coverage, + colorscale='Bluered', + colorbar=dict(title="Coverage
(normalized)"), + line=dict(width=0) + ), + hovertext=hover_text, + hoverinfo="text", + showlegend=False, + name="Coverage (normalized to max in Protein Sequence)")) + + # Keep track of the current row position across all samples + current_row = 1 + + # Iterate through samples and their rows in order + for sample_idx, (sample, sample_rows) in enumerate(rows.items()): + # Add a sample label + fig.add_annotation( + x=-0.1, # slightly to the left of the plot + y=current_row + len(sample_rows) / 2, + text=sample, + showarrow=False, + textangle=-90, + xref='paper', + yref='y' + ) + + for row_index, row in enumerate(sample_rows): + for peptide_match in row: + # add a rectangle for each peptide + x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein + y0, y1 = current_row, current_row + 1 + normalized_intensity = (peptide_match.intensity) / max_intensity + fig.add_shape( + type="rect", + # the coordinates need to be a bit offset, as otherwise the rectangles would start in the middle of the bars + x0=x0 - 0.5, y0=y0, + x1=x1 - 0.5, y1=y1, + fillcolor=f"rgba({int(255 * (1 - normalized_intensity))}, 0, {int(255 * normalized_intensity)}, 0.5)", + opacity=0.5, + layer="above", + ) + # add invisible plotly object to the rectangle to show the peptide sequence when hovered over + fig.add_trace(go.Scatter( + x=x_labels[peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1], + y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence), + text=[f"Sample: {sample}
Peptide: {peptide_match.peptide_sequence}
" + f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})
" + f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence), + mode="markers", + marker=dict(opacity=0, size=30), # make marker invisible + hoverinfo="text", + hovertemplate="%{text}", + showlegend=False, + )) + + # Move to next row + current_row += 1 + + # Add a horizontal line separator between samples (except after the last sample) + if sample_idx < len(rows) - 1: fig.add_shape( - type="rect", - x0=start_location_on_protein, - y0=row_index+1 + 0.1, - x1=end_location_on_protein, - y1=row_index+ 2 - 0.1, - fillcolor="blue", - opacity=0.5, - layer="above", + type="line", + x0=-0.5, + y0=current_row, + x1=len(protein_sequence) - 0.5, + y1=current_row, + line=dict( + color="Gray", + width=2, + dash="solid", + ) ) - # add invisible plotly object to the rectangle to show the peptide sequence when hovered over - fig.add_trace(go.Scatter( - x=[x_labels[(start_location_on_protein + end_location_on_protein) // 2]], - y=[row_index + 1.5], - text=[f"{peptide_sequence}
({start_location_on_protein}-{end_location_on_protein})"], - mode="markers", - marker=dict(opacity=0, size=20), # make marker invisible - hoverinfo="text", - hovertemplate="%{text}", - showlegend=False, - )) fig.update_layout( title=f"Protein Coverage of {protein_id}", xaxis_title="Protein Sequence", - yaxis=dict(visible=False, range=[0, 1+len(rows_of_peptide_matches)+1], title=""), + yaxis=dict(visible=False, range=[0, 1 + current_row], title=""), showlegend=False, + bargap=0 ) return dict(plots=[fig]) -def distribute_to_rows(coverage): - coverage.sort(key=lambda x: x[1]) - rows = [] - for peptide_sequence, start_location, end_location in coverage: +def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[PeptideMatch]]]: + """ + Distributes peptide matches to rows such that no two peptides overlap in the same row. + Greedy algorithm (see Interval Scheduling Problem). + """ + peptide_matches.sort(key=lambda peptide_match: peptide_match.start_location_on_protein) + rows = {sample : [] for sample in set([peptide_match.sample for peptide_match in peptide_matches])} + for peptide_match in peptide_matches: # find the first row that does not overlap with the current peptide - for row in rows: - if row[-1][1] <= start_location: - row.append((start_location, end_location, peptide_sequence)) + for row in rows[peptide_match.sample]: + if row[-1].end_location_on_protein < peptide_match.start_location_on_protein: + row.append((peptide_match)) break else: - rows.append([(start_location, end_location, peptide_sequence)]) + rows[peptide_match.sample].append([(peptide_match)]) return rows diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index ab8bdc27..765b3047 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -792,6 +792,7 @@ class PlotProteinCoverage(PlotStep): "protein_id", "fasta_df", "peptide_df", + "samples" ] output_keys = [] diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index be9ea58f..38c8668e 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1103,6 +1103,11 @@ class PlotProteinCoverageForm(MethodForm): choices=[], label="Protein ID", ) + samples = CustomMultipleChoiceField( + choices=[], + label="Samples", + ) + def fill_form(self, run: Run) -> None: self.fields["peptide_df_instance"].choices = fill_helper.get_choices( @@ -1124,9 +1129,12 @@ def fill_form(self, run: Run) -> None: Step, "fasta_df", fasta_df_instance_id )["Protein ID"].unique() peptide_df_protein_ids = peptide_df["Protein ID"].unique() - common_protein_ids = list(set(fasta_protein_ids) & set(peptide_df_protein_ids)) + common_protein_ids = sorted(list(set(fasta_protein_ids) & set(peptide_df_protein_ids))) self.fields["protein_id"].choices = fill_helper.to_choices(common_protein_ids) + peptide_df_samples = peptide_df["Sample"].unique() + self.fields["samples"].choices = fill_helper.to_choices(peptide_df_samples) + class FLEXIQuantLFForm(MethodForm): is_dynamic = True