Revert "Merge pull request #322 from cschlaffner/306-dia-nn-support-f…
Browse files Browse the repository at this point in the history
…or-input-data"

This reverts commit 3ab5436, reversing
changes made to af789f7.
henninggaertner committed Nov 24, 2023
1 parent 3ab5436 commit 5bd8231
Showing 10 changed files with 38 additions and 291 deletions.
10 changes: 0 additions & 10 deletions protzilla/constants/location_mapping.py
@@ -36,21 +36,11 @@
"ms_data_import",
"ms_fragger_import",
): ms_data_import.ms_fragger_import,
(
"importing",
"ms_data_import",
"diann_import",
): ms_data_import.diann_import,
(
"importing",
"metadata_import",
"metadata_import_method",
): metadata_import.metadata_import_method,
(
"importing",
"metadata_import",
"metadata_import_method_diann",
): metadata_import.metadata_import_method_diann,
(
"importing",
"metadata_import",
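The deleted lines above come from protzilla's location mapping: a plain dict whose keys are (section, step, method) tuples and whose values are the callables implementing each method, with the two DIA-NN entries now removed. A minimal sketch of how such a dispatch table resolves a selected method; the method_map and run_method names and the exact import path are assumptions for illustration, not protzilla's actual API:

    from protzilla.importing import metadata_import, ms_data_import

    # trimmed-down dispatch table mirroring location_mapping.py after the revert
    method_map = {
        ("importing", "ms_data_import", "ms_fragger_import"): ms_data_import.ms_fragger_import,
        ("importing", "metadata_import", "metadata_import_method"): metadata_import.metadata_import_method,
    }

    def run_method(section, step, method, df, **parameters):
        # look up the callable registered for this workflow location and apply it
        return method_map[(section, step, method)](df, **parameters)
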
33 changes: 0 additions & 33 deletions protzilla/constants/workflow_meta.json
@@ -27,22 +27,6 @@
}
}
},
"diann_import": {
"name": "DIA-NN",
"description": "DIA-NN Data Import",
"parameters": {
"file_path": {
"name": "DIA-NN intensities file:",
"type": "file",
"default": null
},
"map_to_uniprot": {
"name": "Map to Uniprot IDs using Biomart (online)",
"type": "boolean",
"default": false
}
}
},
"ms_fragger_import": {
"name": "MS Fragger",
"description": "MS Fragger Data Import",
@@ -97,23 +81,6 @@
}
}
},
"metadata_import_method_diann": {
"name": "Metadata Import DIA-NN",
"description": "Import Metadata for run relationships of DIA-NN",
"parameters": {
"file_path": {
"name": "Run-Relationship Metadata file:",
"type": "file",
"default": null
},
"groupby_sample": {
"name": "Group Replicate Runs by Sample using Median",
"type": "boolean",
"default": false
}

}
},
"metadata_column_assignment": {
"name": "Metadata Column Assignment",
"description": "Assign columns to metadata categories, repeatable for each category",
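Each removed workflow_meta.json block declared the UI parameters for one DIA-NN method; the keys under "parameters" line up with the keyword arguments of the Python functions deleted further down in this commit. A hedged sketch of the calls those entries used to drive (the file paths are placeholders, and both functions no longer exist after this revert):

    from protzilla.importing import metadata_import, ms_data_import

    # "file_path" and "map_to_uniprot" mirrored the removed diann_import parameter block
    intensity_df, import_messages = ms_data_import.diann_import(
        None, file_path="path/to/diann_report.tsv", map_to_uniprot=False
    )

    # "file_path" and "groupby_sample" mirrored the removed metadata_import_method_diann block
    intensity_df, metadata_out = metadata_import.metadata_import_method_diann(
        intensity_df, file_path="path/to/run_relationship_metadata.xlsx", groupby_sample=False
    )
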
10 changes: 7 additions & 3 deletions protzilla/data_preprocessing/imputation.py
@@ -14,9 +14,9 @@

def by_knn(
intensity_df: pd.DataFrame,
number_of_neighbours: int = 5,
number_of_neighbours=5,
**kwargs # quantile, default is median
) -> (pd.DataFrame, dict):
) -> tuple[pd.DataFrame, dict]:
"""
A function to perform value imputation based on KNN
(k-nearest neighbors). Imputes missing values for each
@@ -60,7 +60,7 @@ def by_knn(

def by_simple_imputer(
intensity_df: pd.DataFrame,
strategy: str = "mean",
strategy="mean",
) -> tuple[pd.DataFrame, dict]:
"""
A function to perform protein-wise imputations
@@ -249,6 +249,9 @@ def by_normal_distribution_sampling(
distribution used for imputation is scaled compared to dataset.
Default: 1 (no scaling)
:type down_shift: float
:param round_values: whether to round the imputed values to the nearest integer
Default: False
:type round_values: bool
:return: returns an imputed dataframe in typical protzilla long format\
and an empty dict
    :rtype: tuple[pd.DataFrame, dict]
@@ -259,6 +262,7 @@
transformed_df = long_to_wide(intensity_df)
# iterate over all protein groups
for protein_grp in transformed_df.columns:

number_of_nans = transformed_df[protein_grp].isnull().sum()

        # don't impute values if there are not enough values (> 1) to sample from
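The added docstring lines document the round_values flag of by_normal_distribution_sampling; the idea behind the method is to fill a protein group's missing intensities by sampling from a normal distribution whose mean is shifted below the observed values. A small self-contained sketch of that idea for a single protein column; the shift and scaling semantics are assumptions for illustration, not protzilla's exact implementation:

    import numpy as np
    import pandas as pd

    def impute_normal_downshift(values: pd.Series, down_shift: float,
                                round_values: bool = False) -> pd.Series:
        observed = values.dropna()
        if len(observed) <= 1:  # mirrors the "not enough values (> 1) to sample from" guard
            return values
        # sample from a normal distribution centred below the observed mean
        mean = observed.mean() - down_shift * observed.std()
        sampled = np.random.normal(mean, observed.std(), size=values.isna().sum())
        if round_values:
            sampled = np.round(sampled)  # optionally round imputed values to the nearest integer
        result = values.copy()
        result[values.isna()] = sampled
        return result
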
129 changes: 27 additions & 102 deletions protzilla/importing/metadata_import.py
@@ -2,66 +2,41 @@

import pandas as pd
from django.contrib import messages
from pandas import DataFrame

from protzilla.constants.paths import PROJECT_PATH
from protzilla.utilities import random_string


def file_importer(file_path: str) -> tuple[pd.DataFrame, str]:
"""
    Imports a file based on its file extension and returns a pandas DataFrame together with a status message;
    an empty DataFrame is returned if the file format is not supported or the file is empty.
"""
try:
if file_path.endswith(".csv"):
meta_df = pd.read_csv(
file_path,
sep=",",
low_memory=False,
na_values=[""],
keep_default_na=True,
skipinitialspace=True,
)
elif file_path.endswith(".xlsx"):
meta_df = pd.read_excel(file_path)
elif file_path.endswith(".psv"):
meta_df = pd.read_csv(file_path, sep="|", low_memory=False)
elif file_path.endswith(".tsv"):
meta_df = pd.read_csv(file_path, sep="\t", low_memory=False)
elif file_path == "":
return (
pd.DataFrame(),
"The file upload is empty. Please select a metadata file.",
)
else:
return (
pd.DataFrame(),
"File format not supported. \
Supported file formats are csv, xlsx, psv or tsv",
)
msg = "Metadata file successfully imported."
return meta_df, msg
except pd.errors.EmptyDataError:
msg = "The file is empty."
return pd.DataFrame(), msg


def metadata_import_method(
df: pd.DataFrame, file_path: str, feature_orientation: str
) -> tuple[pd.DataFrame, dict]:
"""
Imports a metadata file and returns the intensity dataframe and a dict with a message if the file import failed,
and the metadata dataframe if the import was successful.
returns: (DataFrame, dict)
"""
meta_df, msg = file_importer(file_path)
if meta_df.empty:
def metadata_import_method(df, file_path, feature_orientation):
if file_path.endswith(".csv"):
meta_df = pd.read_csv(
file_path,
sep=",",
low_memory=False,
na_values=[""],
keep_default_na=True,
skipinitialspace=True,
)
elif file_path.endswith(".xlsx"):
meta_df = pd.read_excel(file_path)
elif file_path.endswith(".psv"):
meta_df = pd.read_csv(file_path, sep="|", low_memory=False)
elif file_path.endswith(".tsv"):
meta_df = pd.read_csv(file_path, sep="\t", low_memory=False)
elif file_path == "":
msg = "The file upload is empty. Please select a metadata file."
return df, dict(
meta_df=None,
messages=[dict(level=messages.ERROR, msg=msg)],
)
else:
msg = "File format not supported. \
Supported file formats are csv, xlsx, psv or tsv"
return df, dict(
metadata=None,
meta_df=None,
messages=[dict(level=messages.ERROR, msg=msg)],
)

# always return metadata in the same orientation (features as columns)
# as the dtype get lost when transposing, we save the df to disk after
# changing the format and read it again as "Columns"-oriented
@@ -80,56 +55,6 @@ def metadata_import_method(
f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_"
):
os.remove(file_path)
if "replicate" in meta_df.columns:
# this indicates a DIANN metadata file with replicate information, we now want to calculate the median across
# all MS runs for a sample then instead of having intensities for each MS run in our dataframe, we
# have intensities for each sample
# note that up until now, "Sample" in the intensity df referred to the ms run
res = pd.merge(
df,
meta_df[["MS run", "sample name"]],
left_on="Sample",
right_on="MS run",
how="left",
)
res.groupby(["Protein ID", "sample name"], as_index=False).median()

return df, {"metadata": meta_df, "messages": [dict(level=messages.INFO, msg=msg)]}


def metadata_import_method_diann(
df: DataFrame, file_path: str, groupby_sample: bool = False
) -> (DataFrame, dict):
"""
This method imports a metadata file with run relationship information and returns the intensity dataframe and the
metadata dataframe. If the import fails, it returns the unchanged dataframe and a dict with a message about the
error.
"""
meta_df, msg = file_importer(file_path)
if meta_df.empty:
return df, dict(
metadata=None,
messages=[dict(level=messages.ERROR, msg=msg)],
)

if file_path.startswith(
f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_"
):
os.remove(file_path)

if groupby_sample:
# we want to take the median of all MS runs (column "Sample" in the intensity df) for each Sample
# (column "sample name" in the metadata df)
res = pd.merge(
df,
meta_df[["MS run", "sample name"]],
left_on="Sample",
right_on="MS run",
how="left",
)
res = res.groupby(["Protein ID", "sample name"], as_index=False).median()
res.rename(columns={"sample name": "Sample"}, inplace=True)
return res, {"metadata": meta_df}

return df, {"metadata": meta_df}

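The comments in the removed metadata_import_method branch (and in metadata_import_method_diann) describe collapsing replicate MS runs into one intensity per sample by merging the run-relationship metadata onto the intensity dataframe and taking the median. A small standalone sketch of that grouping, using the column names from the diff ("Sample", "Protein ID", "MS run", "sample name"); the toy data is made up for illustration:

    import pandas as pd

    # toy intensity df: one row per (MS run, protein group), "Sample" holds the run name
    df = pd.DataFrame({
        "Sample": ["run_1", "run_2", "run_1", "run_2"],
        "Protein ID": ["P12345", "P12345", "Q67890", "Q67890"],
        "Intensity": [10.0, 14.0, 3.0, 5.0],
    })
    # toy run-relationship metadata: maps each MS run to its biological sample
    meta_df = pd.DataFrame({"MS run": ["run_1", "run_2"], "sample name": ["sample_A", "sample_A"]})

    res = pd.merge(df, meta_df[["MS run", "sample name"]],
                   left_on="Sample", right_on="MS run", how="left")
    res = res.groupby(["Protein ID", "sample name"], as_index=False).median(numeric_only=True)
    res = res.rename(columns={"sample name": "Sample"})
    print(res)  # one median intensity per (protein group, sample)
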
37 changes: 4 additions & 33 deletions protzilla/importing/ms_data_import.py
@@ -9,9 +9,7 @@
from protzilla.data_integration.database_query import biomart_query


def max_quant_import(
_: pd.DataFrame, file_path: str, intensity_name: str, map_to_uniprot=False
) -> (pd.DataFrame, dict):
def max_quant_import(_, file_path, intensity_name, map_to_uniprot=False):
assert intensity_name in ["Intensity", "iBAQ", "LFQ intensity"]
if not Path(file_path).is_file():
msg = "The file upload is empty. Please provide a Max Quant file."
@@ -38,9 +36,7 @@ def max_quant_import(
return transform_and_clean(intensity_df, intensity_name, map_to_uniprot)


def ms_fragger_import(
_: pd.DataFrame, file_path: str, intensity_name: str, map_to_uniprot=False
) -> (pd.DataFrame, dict):
def ms_fragger_import(_, file_path, intensity_name, map_to_uniprot=False):
assert intensity_name in [
"Intensity",
"MaxLFQ Total Intensity",
@@ -85,33 +81,7 @@ def ms_fragger_import(
return transform_and_clean(intensity_df, intensity_name, map_to_uniprot)


def diann_import(_, file_path, map_to_uniprot=False) -> (pd.DataFrame, dict):
if not Path(file_path).is_file():
msg = "The file upload is empty. Please provide a DIA-NN MS file."
return None, dict(messages=[dict(level=messages.ERROR, msg=msg)])

df = pd.read_csv(
file_path,
sep="\t",
low_memory=False,
na_values=["", 0],
keep_default_na=True,
)
df = df.drop(
columns=["Protein.Group", "Protein.Names", "Genes", "First.Protein.Description"]
)
# rename column names of samples, removing file path and ".raw" if present
intensity_df = df.rename(columns=lambda x: re.sub(r"(.*[/\\])|(.raw)", r"", x))
intensity_df = intensity_df.rename(columns={"Protein.Ids": "Protein ID"})

intensity_name = "Intensity"

return transform_and_clean(intensity_df, intensity_name, map_to_uniprot)


def transform_and_clean(
df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool
) -> (pd.DataFrame, dict):
def transform_and_clean(df, intensity_name, map_to_uniprot):
"""
Transforms a dataframe that is read from a file in wide format into long format,
removing contaminant groups, and processing protein ids, removing invalid ones
@@ -127,6 +97,7 @@ def transform_and_clean(
:rtype: tuple[pd.DataFrame, list[str], list[str]]
"""
assert "Protein ID" in df.columns

contaminant_groups_mask = df["Protein ID"].map(
lambda group: any(id_.startswith("CON__") for id_ in group.split(";"))
)
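The removed diann_import strips run-file paths and the ".raw" suffix from DIA-NN sample columns, and transform_and_clean flags contaminant protein groups whose IDs start with "CON__". A short sketch of those two clean-up steps on made-up column names and IDs; the regex and the contaminant lambda are copied from the diff, the surrounding code and the removal of flagged rows are illustrative assumptions:

    import re
    import pandas as pd

    df = pd.DataFrame({
        "Protein.Ids": ["P12345", "CON__P99999;P11111"],
        r"C:\runs\exp1\sample_A.raw": [1.0, 2.0],
        "/data/exp1/sample_B.raw": [3.0, 4.0],
    })

    # drop directory prefixes and the ".raw" suffix from sample column names (regex from diann_import)
    df = df.rename(columns=lambda x: re.sub(r"(.*[/\\])|(.raw)", r"", x))
    df = df.rename(columns={"Protein.Ids": "Protein ID"})

    # mask protein groups that contain a contaminant entry, as in transform_and_clean
    contaminant_mask = df["Protein ID"].map(
        lambda group: any(id_.startswith("CON__") for id_ in group.split(";"))
    )
    df = df[~contaminant_mask]
    print(df.columns.tolist())  # ['Protein ID', 'sample_A', 'sample_B']
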
1 change: 0 additions & 1 deletion requirements.txt
@@ -33,4 +33,3 @@ protgraph @ git+https://github.com/antonneubauer/ProtGraph@master
joblib==1.2.0
networkx==3.1
beautifulsoup4==4.12.2
openpyxl==3.1.2
6 changes: 0 additions & 6 deletions tests/diann_intensities.tsv

This file was deleted.

Binary file removed tests/diann_run_relationship_metadata.xlsx
Binary file not shown.
15 changes: 0 additions & 15 deletions tests/protzilla/importing/test_metadata_import.py
@@ -23,21 +23,6 @@ def test_metadata_import():
rmtree(RUNS_PATH / name)


def test_metadata_import_diann():
name = "test_run" + random_string()
run = Run.create(name)
run.step_index += 1
run.calculate_and_next(
metadata_import.metadata_import_method_diann,
file_path=f"{PROJECT_PATH}/tests/diann_run_relationship_metadata.xlsx",
)
test_metadata = pd.read_excel(
f"{PROJECT_PATH}/tests/diann_run_relationship_metadata.xlsx"
)
pd.testing.assert_frame_equal(test_metadata, run.metadata)
rmtree(RUNS_PATH / name)


def test_metadata_orientation():
name1 = "test_run" + random_string()
name2 = "test_run" + random_string()