Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add filtering of invalid intensity values in log transformation, add message to warn user #557

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 42 additions & 6 deletions protzilla/data_preprocessing/transformation.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import logging

import numpy as np
import pandas as pd

from protzilla.data_preprocessing.plots import create_box_plots, create_histograms
from protzilla.utilities import default_intensity_column


def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10") -> dict:
def by_log(
protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10"
) -> dict:
"""
This function log-transforms intensity
DataFrames. Supports log-transformation to the base
of 2 or 10.
This function log-transforms intensity, while ignoring and dropping negative or 0 intensity values.
Supports log-transformation to the base of 2 or 10.

:param protein_df: a protein data frame in long format
:type protein_df: pd.DataFrame
Expand All @@ -22,11 +25,42 @@ def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="
long format with the transformed data and an empty dict.
:rtype: Tuple[pandas DataFrame, dict]
"""
msg = []
intensity_name = default_intensity_column(protein_df)
transformed_df = protein_df.copy()
transformed_peptide_df = peptide_df.copy() if peptide_df is not None else None
zero_intensity_index = transformed_df[transformed_df[intensity_name] <= 0].index
untransformable_data_df = transformed_df.loc[zero_intensity_index]
transformed_df.drop(zero_intensity_index, inplace=True)
transformed_df.reset_index(drop=True, inplace=True)

if transformed_peptide_df is not None:
zero_intensity_peptide_index = transformed_peptide_df[
transformed_peptide_df["Intensity"] <= 0
].index
untransformable_peptide_data_df = transformed_peptide_df.loc[
zero_intensity_peptide_index
]
transformed_peptide_df.drop(zero_intensity_peptide_index, inplace=True)
transformed_peptide_df.reset_index(drop=True, inplace=True)
if not untransformable_peptide_data_df.empty:
msg.append(
dict(
msg=f"Warning: {len(untransformable_peptide_data_df)} data points of peptide data with zero or negative intensity values were found and will be dropped. "
f"Please adapt your preprocessing pipeline if this is unexpected.",
level=logging.WARNING,
)
)

if not untransformable_data_df.empty:
msg.append(
dict(
msg=f"Warning: {len(untransformable_data_df)} data points of {len(untransformable_data_df['Protein ID'])} distinct protein groups with zero or negative intensity values were found and will be dropped. "
f"Please adapt your preprocessing pipeline if this is unexpected.",
level=logging.WARNING,
)
)

# TODO 41 drop data when intensity is 0 and return them in dict
if log_base == "log2":
transformed_df[intensity_name] = np.log2(transformed_df[intensity_name])
if transformed_peptide_df is not None:
Expand All @@ -41,7 +75,9 @@ def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="
)
else:
raise ValueError("Unknown log_base. Known log methods are 'log2' and 'log10'.")
return dict(protein_df=transformed_df, peptide_df=transformed_peptide_df)
return dict(
protein_df=transformed_df, peptide_df=transformed_peptide_df, messages=msg
)


def by_log_plot(method_inputs, method_outputs, graph_type, group_by):
Expand Down
50 changes: 44 additions & 6 deletions tests/protzilla/data_preprocessing/test_transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def log2_transformation_df():
["Sample2", "Protein2", "Gene2", np.nan],
["Sample2", "Protein3", "Gene3", 4],
["Sample2", "Protein4", "Gene4", 4],
["Sample3", "Protein1", "Gene1", 8],
["Sample3", "Protein2", "Gene2", 8],
["Sample3", "Protein1", "Gene1", 0],
["Sample3", "Protein2", "Gene2", 0],
["Sample3", "Protein3", "Gene3", 8],
["Sample3", "Protein4", "Gene4", 8],
["Sample4", "Protein1", "Gene1", 1024],
Expand All @@ -42,8 +42,6 @@ def log2_transformation_expected_df():
["Sample2", "Protein2", "Gene2", np.nan],
["Sample2", "Protein3", "Gene3", 2.0],
["Sample2", "Protein4", "Gene4", 2.0],
["Sample3", "Protein1", "Gene1", 3.0],
["Sample3", "Protein2", "Gene2", 3.0],
["Sample3", "Protein3", "Gene3", 3.0],
["Sample3", "Protein4", "Gene4", 3.0],
["Sample4", "Protein1", "Gene1", 10.0],
Expand Down Expand Up @@ -250,10 +248,50 @@ def test_by_log_without_peptide_df(log2_transformation_df, log_base):


def test_log_by_0_transformation():
    """
    A zero intensity has no finite logarithm, so by_log is expected to drop
    the row instead of transforming it, and to report the drop to the user.
    """
    df = pd.DataFrame(
        data=(["Sample1", "Protein1", "Gene1", 0.0],),
        columns=["Sample", "Protein ID", "Gene", "Intensity"],
    )

    # Call once and keep the result; a second, unasserted call adds nothing.
    method_outputs = by_log(df, None, log_base="log2")

    assert method_outputs["protein_df"].empty, "The protein DataFrame should be empty."
    # No peptide frame was passed in, so none must be returned.
    assert method_outputs["peptide_df"] is None
    # Dropping data silently would hide a preprocessing problem from the user.
    assert method_outputs[
        "messages"
    ], "A warning about the dropped zero-intensity row should be returned."


def test_log2_transformation_with_negative_values(log2_transformation_df, peptides_df):
    """
    by_log must drop every non-positive intensity (negative and zero) from both
    the protein and the peptide frame, keep all other rows, and emit one
    warning per affected frame.
    """
    # ignore_index=True gives the appended row a fresh, unique index label.
    # Without it the new row reuses label 0, and a label-based drop of the
    # negative row would also remove the valid first row of the fixture.
    log2_transformation_df = pd.concat(
        [
            log2_transformation_df,
            pd.DataFrame(
                [["Sample5", "Protein5", "Gene5", -2]],
                columns=log2_transformation_df.columns,
            ),
        ],
        ignore_index=True,
    )
    peptides_df = pd.concat(
        [
            peptides_df,
            pd.DataFrame(
                [["Sample5", "Protein1", "Peptide5", -2, 0.037779]],
                columns=["Sample", "Protein ID", "Sequence", "Intensity", "PEP"],
            ),
        ],
        ignore_index=True,
    )

    # Rows that should survive: everything that is not <= 0. NaN compares
    # False against 0, so missing intensities are counted as kept.
    expected_protein_rows = int((~(log2_transformation_df["Intensity"] <= 0)).sum())
    expected_peptide_rows = int((~(peptides_df["Intensity"] <= 0)).sum())

    method_inputs = {
        "protein_df": log2_transformation_df,
        "peptide_df": peptides_df,
        "log_base": "log2",
    }
    method_outputs = by_log(**method_inputs)

    result_df = method_outputs["protein_df"]
    result_peptide_df = method_outputs["peptide_df"]

    # Only the untransformable rows may be removed, nothing else.
    assert (
        len(result_df) == expected_protein_rows
    ), "Only rows with zero or negative intensity should be dropped from the protein DataFrame"
    assert (
        len(result_peptide_df) == expected_peptide_rows
    ), "Only rows with zero or negative intensity should be dropped from the peptide DataFrame"

    # The log of a positive number can itself be negative (inputs below 1), so
    # the surviving values are not inspected by sign; infinities and row counts
    # are checked instead. A zero that slipped through would show up as -inf.
    assert not np.isinf(
        result_df["Intensity"]
    ).any(), "Zero intensities were not removed from the protein DataFrame"
    assert not np.isinf(
        result_peptide_df["Intensity"]
    ).any(), "Zero intensities were not removed from the peptide DataFrame"

    # One warning for the peptide frame and one for the protein frame.
    assert (
        len(method_outputs["messages"]) == 2
    ), "A warning should be returned for each DataFrame that lost rows"
Loading