-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add fasta import, primitive peptide - protein sequence alignment, preliminary coverage bar plot
- Loading branch information
1 parent
a2d0bf6
commit 51dff94
Showing
8 changed files
with
407 additions
and
83 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
from typing import List | ||
|
||
import pandas as pd | ||
import plotly.graph_objects as go | ||
|
||
|
||
def align_peptide_to_protein_sequence(
    peptide_sequence: str, protein_sequence: str
) -> List[int]:
    """
    Find every position at which a peptide occurs in a protein sequence.

    NAIVE APPROACH: only exact substring matches are reported (no gaps and
    no mismatches). Overlapping occurrences are all included.

    Args:
        peptide_sequence: The peptide to search for.
        protein_sequence: The protein sequence to search in.

    Returns:
        The 0-based start indices of all occurrences of the peptide in the
        protein sequence, in ascending order. The list is empty if either
        sequence is empty, if the peptide is longer than the protein, or if
        the peptide does not occur at all.
    """
    if not peptide_sequence or not protein_sequence:
        return []

    indices = []
    start = protein_sequence.find(peptide_sequence)
    while start != -1:
        indices.append(start)
        # Resume one position after the last hit (not after the whole match)
        # so that overlapping occurrences are found as well.
        start = protein_sequence.find(peptide_sequence, start + 1)
    return indices
|
||
|
||
def plot_protein_coverage(
    fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str
) -> dict:
    """
    Plots the coverage of a protein sequence by peptides.

    For every unique peptide assigned to the protein, each exact occurrence
    in the protein sequence increments the coverage count of the amino acid
    positions it spans.

    Args:
        fasta_df: DataFrame with the columns "Protein ID" and
            "Protein Sequence".
        peptide_df: DataFrame with the columns "Protein ID" and "Sequence".
        protein_id: The value of "Protein ID" selecting the protein to plot.

    Returns:
        A dict with the single key "plots" holding a list that contains the
        coverage bar chart figure.

    Raises:
        IndexError: If protein_id does not occur in fasta_df.
    """
    # Retrieve the relevant peptides from the peptide df
    relevant_peptides = peptide_df[peptide_df["Protein ID"] == protein_id]

    protein_rows = fasta_df[fasta_df["Protein ID"] == protein_id]
    if protein_rows.empty:
        # Same exception type that `.values[0]` would raise on an empty
        # selection, but with a message that names the actual problem.
        raise IndexError(f"Protein ID {protein_id!r} not found in fasta_df.")
    protein_sequence = protein_rows["Protein Sequence"].values[0]

    coverage = [0] * len(protein_sequence)
    # Missing sequences (NaN) cannot be aligned, so they are skipped.
    for peptide_sequence in relevant_peptides["Sequence"].dropna().unique():
        starts = align_peptide_to_protein_sequence(peptide_sequence, protein_sequence)
        for start in starts:
            for position in range(start, start + len(peptide_sequence)):
                coverage[position] += 1

    # Generate labels for amino acid positions: 1-based index plus residue
    x_labels = [
        f"{position} ({residue})"
        for position, residue in enumerate(protein_sequence, start=1)
    ]

    # Create the bar chart
    fig = go.Figure()

    # Add bars for coverage values
    fig.add_trace(
        go.Bar(x=x_labels, y=coverage, name="Coverage", marker=dict(color="skyblue"))
    )

    # Customize layout
    fig.update_layout(
        title="Protein Coverage",
        xaxis_title="Amino Acid",
        yaxis_title="Coverage Value",
        xaxis=dict(tickmode="linear"),
        bargap=0.1,  # Adjust space between bars if needed
    )
    return dict(plots=[fig])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
""" | ||
This module contains the code to parse a fasta file containing protein sequences and their ids. | ||
""" | ||
import logging | ||
|
||
import pandas as pd | ||
from Bio import SeqIO | ||
|
||
|
||
def parse_fasta_id(fasta_id: str) -> str:
    """
    Parse the fasta id to get the protein name from the fasta id string.

    The id is split on "|" and the second field is returned.

    Args:
        fasta_id: The id string of a fasta record.

    Returns:
        The second "|"-separated field of the id. An empty string is
        returned (and a warning is logged) if the id has fewer than two
        fields or if the second field is shorter than two characters.
    """
    fields = fasta_id.split("|")
    if len(fields) < 2:
        # Without this guard, indexing the second field raises IndexError
        # for ids that contain no "|" separator at all.
        logging.warning("Fasta id has no '|'-separated metadata: %s", fasta_id)
        return ""

    metadata = fields[1]
    if len(metadata) < 2:
        logging.warning("Metadata too short: %s", metadata)
        return ""
    return metadata
|
||
|
||
def fasta_import(file_path: str) -> dict:
    """
    Import a fasta file and collect the protein sequences and their protein ids.

    Args:
        file_path: Path of the fasta file to read.

    Returns:
        A dict with the single key "fasta_df" holding a DataFrame with the
        columns "Protein ID" (the parsed record id) and "Protein Sequence"
        (the record sequence as a string), one row per fasta record.
    """
    protein_ids = []
    protein_sequences = []
    # The records are consumed inside the `with` block so the file handle is
    # closed once parsing is done, even if a record fails to parse.
    with open(file_path) as fasta_file:
        for fasta_record in SeqIO.parse(fasta_file, "fasta"):
            protein_ids.append(parse_fasta_id(fasta_record.id))
            protein_sequences.append(str(fasta_record.seq))

    fasta_sequences = pd.DataFrame(
        {"Protein ID": protein_ids, "Protein Sequence": protein_sequences}
    )
    return {"fasta_df": fasta_sequences}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.