
Commit 3ab5436

Merge pull request #322 from cschlaffner/306-dia-nn-support-for-input-data

306 dia nn support for input data
RogerAK authored Nov 23, 2023
2 parents af789f7 + 3565c1a commit 3ab5436
Showing 10 changed files with 291 additions and 38 deletions.
10 changes: 10 additions & 0 deletions protzilla/constants/location_mapping.py
@@ -36,11 +36,21 @@
"ms_data_import",
"ms_fragger_import",
): ms_data_import.ms_fragger_import,
(
"importing",
"ms_data_import",
"diann_import",
): ms_data_import.diann_import,
(
"importing",
"metadata_import",
"metadata_import_method",
): metadata_import.metadata_import_method,
(
"importing",
"metadata_import",
"metadata_import_method_diann",
): metadata_import.metadata_import_method_diann,
(
"importing",
"metadata_import",
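The new tuples register the DIA-NN callables in the same (section, step, method) → function mapping used by the existing importers, so a step can be dispatched by key lookup alone. A minimal sketch of that lookup pattern — the `location_map` name and `run_step` helper below are illustrative, not the repository's actual runner:

```python
# Hypothetical sketch of dispatch over a (section, step, method) -> callable
# mapping in the style of location_mapping.py; names here are illustrative.
from protzilla.importing import metadata_import, ms_data_import

location_map = {
    ("importing", "ms_data_import", "diann_import"): ms_data_import.diann_import,
    (
        "importing",
        "metadata_import",
        "metadata_import_method_diann",
    ): metadata_import.metadata_import_method_diann,
}

def run_step(section: str, step: str, method: str, df, **params):
    # Resolve the registered callable and apply it to the current dataframe.
    func = location_map[(section, step, method)]
    return func(df, **params)

# e.g. run_step("importing", "ms_data_import", "diann_import", None,
#               file_path="report.tsv", map_to_uniprot=False)
```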
33 changes: 33 additions & 0 deletions protzilla/constants/workflow_meta.json
@@ -27,6 +27,22 @@
}
}
},
"diann_import": {
"name": "DIA-NN",
"description": "DIA-NN Data Import",
"parameters": {
"file_path": {
"name": "DIA-NN intensities file:",
"type": "file",
"default": null
},
"map_to_uniprot": {
"name": "Map to Uniprot IDs using Biomart (online)",
"type": "boolean",
"default": false
}
}
},
"ms_fragger_import": {
"name": "MS Fragger",
"description": "MS Fragger Data Import",
@@ -81,6 +97,23 @@
}
}
},
"metadata_import_method_diann": {
"name": "Metadata Import DIA-NN",
"description": "Import Metadata for run relationships of DIA-NN",
"parameters": {
"file_path": {
"name": "Run-Relationship Metadata file:",
"type": "file",
"default": null
},
"groupby_sample": {
"name": "Group Replicate Runs by Sample using Median",
"type": "boolean",
"default": false
}

}
},
"metadata_column_assignment": {
"name": "Metadata Column Assignment",
"description": "Assign columns to metadata categories, repeatable for each category",
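Each entry under "parameters" declares the display name, input type, and default that the front end renders for a method. A hedged sketch of resolving those defaults from the JSON — the nesting assumed below (section → step → method) follows the tuples in location_mapping.py and may differ from the file's exact layout:

```python
import json

# Illustrative only: read the declared defaults for the new DIA-NN import step.
with open("protzilla/constants/workflow_meta.json") as f:
    meta = json.load(f)

# Assumed nesting: section -> step -> method -> "parameters".
params = meta["importing"]["ms_data_import"]["diann_import"]["parameters"]
defaults = {name: spec["default"] for name, spec in params.items()}
print(defaults)  # expected: {'file_path': None, 'map_to_uniprot': False}
```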
10 changes: 3 additions & 7 deletions protzilla/data_preprocessing/imputation.py
@@ -14,9 +14,9 @@

def by_knn(
intensity_df: pd.DataFrame,
number_of_neighbours=5,
number_of_neighbours: int = 5,
**kwargs # quantile, default is median
) -> tuple[pd.DataFrame, dict]:
) -> (pd.DataFrame, dict):
"""
A function to perform value imputation based on KNN
(k-nearest neighbors). Imputes missing values for each
@@ -60,7 +60,7 @@ def by_knn(

def by_simple_imputer(
intensity_df: pd.DataFrame,
strategy="mean",
strategy: str = "mean",
) -> tuple[pd.DataFrame, dict]:
"""
A function to perform protein-wise imputations
@@ -249,9 +249,6 @@ def by_normal_distribution_sampling(
distribution used for imputation is scaled compared to dataset.
Default: 1 (no scaling)
:type down_shift: float
:param round_values: whether to round the imputed values to the nearest integer
Default: False
:type round_values: bool
:return: returns an imputed dataframe in typical protzilla long format\
and an empty dict
:rtype: pd.DataFrame, dict
@@ -262,7 +259,6 @@
transformed_df = long_to_wide(intensity_df)
# iterate over all protein groups
for protein_grp in transformed_df.columns:

number_of_nans = transformed_df[protein_grp].isnull().sum()

# don't impute values if there are not enough values (> 1) to sample from
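by_knn imputes each missing value from the k nearest neighbours after pivoting the long protzilla frame to wide format (samples as rows, protein groups as columns). A self-contained sketch of the same idea using scikit-learn's KNNImputer on an already-wide frame — an illustration of the technique, not the repository function:

```python
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

# Wide format: one row per sample, one column per protein group.
wide = pd.DataFrame(
    {"P1": [10.0, np.nan, 12.0], "P2": [5.0, 6.0, np.nan]},
    index=["run1", "run2", "run3"],
)

# number_of_neighbours=5 is the function's default; 2 suffices for this toy frame.
imputer = KNNImputer(n_neighbors=2)
imputed = pd.DataFrame(
    imputer.fit_transform(wide), columns=wide.columns, index=wide.index
)
```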
129 changes: 102 additions & 27 deletions protzilla/importing/metadata_import.py
@@ -2,41 +2,66 @@

import pandas as pd
from django.contrib import messages
from pandas import DataFrame

from protzilla.constants.paths import PROJECT_PATH
from protzilla.utilities import random_string


def metadata_import_method(df, file_path, feature_orientation):
if file_path.endswith(".csv"):
meta_df = pd.read_csv(
file_path,
sep=",",
low_memory=False,
na_values=[""],
keep_default_na=True,
skipinitialspace=True,
)
elif file_path.endswith(".xlsx"):
meta_df = pd.read_excel(file_path)
elif file_path.endswith(".psv"):
meta_df = pd.read_csv(file_path, sep="|", low_memory=False)
elif file_path.endswith(".tsv"):
meta_df = pd.read_csv(file_path, sep="\t", low_memory=False)
elif file_path == "":
msg = "The file upload is empty. Please select a metadata file."
return df, dict(
meta_df=None,
messages=[dict(level=messages.ERROR, msg=msg)],
)
else:
msg = "File format not supported. \
Supported file formats are csv, xlsx, psv or tsv"
def file_importer(file_path: str) -> tuple[pd.DataFrame, str]:
"""
Imports a file based on its file extension and returns a pandas DataFrame together with a
status message; an empty DataFrame is returned if the file format is not supported or the
file is empty.
"""
try:
if file_path.endswith(".csv"):
meta_df = pd.read_csv(
file_path,
sep=",",
low_memory=False,
na_values=[""],
keep_default_na=True,
skipinitialspace=True,
)
elif file_path.endswith(".xlsx"):
meta_df = pd.read_excel(file_path)
elif file_path.endswith(".psv"):
meta_df = pd.read_csv(file_path, sep="|", low_memory=False)
elif file_path.endswith(".tsv"):
meta_df = pd.read_csv(file_path, sep="\t", low_memory=False)
elif file_path == "":
return (
pd.DataFrame(),
"The file upload is empty. Please select a metadata file.",
)
else:
return (
pd.DataFrame(),
"File format not supported. \
Supported file formats are csv, xlsx, psv or tsv",
)
msg = "Metadata file successfully imported."
return meta_df, msg
except pd.errors.EmptyDataError:
msg = "The file is empty."
return pd.DataFrame(), msg


def metadata_import_method(
df: pd.DataFrame, file_path: str, feature_orientation: str
) -> tuple[pd.DataFrame, dict]:
"""
Imports a metadata file. Returns the unchanged intensity dataframe and a dict containing an
error message if the import failed, or the metadata dataframe if it succeeded.
returns: (DataFrame, dict)
"""
meta_df, msg = file_importer(file_path)
if meta_df.empty:
return df, dict(
meta_df=None,
metadata=None,
messages=[dict(level=messages.ERROR, msg=msg)],
)

# always return metadata in the same orientation (features as columns)
# as the dtypes get lost when transposing, we save the df to disk after
# changing the format and read it again as "Columns"-oriented
@@ -55,6 +80,56 @@ def metadata_import_method(df, file_path, feature_orientation):
f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_"
):
os.remove(file_path)
if "replicate" in meta_df.columns:
# This indicates a DIA-NN metadata file with replicate information. We calculate the
# median across all MS runs of a sample, so that the intensity df holds one intensity
# per sample instead of one per MS run. Note that up to this point, "Sample" in the
# intensity df referred to the MS run.
res = pd.merge(
df,
meta_df[["MS run", "sample name"]],
left_on="Sample",
right_on="MS run",
how="left",
)
res = res.groupby(["Protein ID", "sample name"], as_index=False).median()
res.rename(columns={"sample name": "Sample"}, inplace=True)
return res, {"metadata": meta_df, "messages": [dict(level=messages.INFO, msg=msg)]}

return df, {"metadata": meta_df, "messages": [dict(level=messages.INFO, msg=msg)]}


def metadata_import_method_diann(
df: DataFrame, file_path: str, groupby_sample: bool = False
) -> (DataFrame, dict):
"""
This method imports a metadata file with run relationship information and returns the intensity dataframe and the
metadata dataframe. If the import fails, it returns the unchanged dataframe and a dict with a message about the
error.
"""
meta_df, msg = file_importer(file_path)
if meta_df.empty:
return df, dict(
metadata=None,
messages=[dict(level=messages.ERROR, msg=msg)],
)

if file_path.startswith(
f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_"
):
os.remove(file_path)

if groupby_sample:
# we want to take the median of all MS runs (column "Sample" in the intensity df) for each Sample
# (column "sample name" in the metadata df)
res = pd.merge(
df,
meta_df[["MS run", "sample name"]],
left_on="Sample",
right_on="MS run",
how="left",
)
res = res.groupby(["Protein ID", "sample name"], as_index=False).median()
res.rename(columns={"sample name": "Sample"}, inplace=True)
return res, {"metadata": meta_df}

return df, {"metadata": meta_df}

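The replicate handling in metadata_import_method_diann merges the intensity table with the run-relationship metadata on the MS run, takes the per-(protein, sample) median, and renames "sample name" back to "Sample". A toy example of that merge-and-median — column names follow the diff, the data is invented, and numeric_only=True is added here because newer pandas versions refuse to take a median over string columns:

```python
import pandas as pd

# Long-format intensities: "Sample" still refers to the MS run at this point.
df = pd.DataFrame({
    "Sample": ["LM07061", "LM07062", "LM07063"],
    "Protein ID": ["A0A087WWU8"] * 3,
    "Intensity": [329042.0, 367477.0, 381325.0],
})
# Run-relationship metadata: the first two replicate runs belong to one sample.
meta_df = pd.DataFrame({
    "MS run": ["LM07061", "LM07062", "LM07063"],
    "sample name": ["HeLa_1", "HeLa_1", "HeLa_2"],
})

res = pd.merge(df, meta_df[["MS run", "sample name"]],
               left_on="Sample", right_on="MS run", how="left")
res = res.groupby(["Protein ID", "sample name"], as_index=False).median(numeric_only=True)
res = res.rename(columns={"sample name": "Sample"})
# HeLa_1 now carries the median of runs LM07061 and LM07062 (348259.5).
```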
37 changes: 33 additions & 4 deletions protzilla/importing/ms_data_import.py
@@ -9,7 +9,9 @@
from protzilla.data_integration.database_query import biomart_query


def max_quant_import(_, file_path, intensity_name, map_to_uniprot=False):
def max_quant_import(
_: pd.DataFrame, file_path: str, intensity_name: str, map_to_uniprot=False
) -> (pd.DataFrame, dict):
assert intensity_name in ["Intensity", "iBAQ", "LFQ intensity"]
if not Path(file_path).is_file():
msg = "The file upload is empty. Please provide a Max Quant file."
Expand All @@ -36,7 +38,9 @@ def max_quant_import(_, file_path, intensity_name, map_to_uniprot=False):
return transform_and_clean(intensity_df, intensity_name, map_to_uniprot)


def ms_fragger_import(_, file_path, intensity_name, map_to_uniprot=False):
def ms_fragger_import(
_: pd.DataFrame, file_path: str, intensity_name: str, map_to_uniprot=False
) -> (pd.DataFrame, dict):
assert intensity_name in [
"Intensity",
"MaxLFQ Total Intensity",
@@ -81,7 +85,33 @@ def ms_fragger_import(_, file_path, intensity_name, map_to_uniprot=False):
return transform_and_clean(intensity_df, intensity_name, map_to_uniprot)


def transform_and_clean(df, intensity_name, map_to_uniprot):
def diann_import(_, file_path, map_to_uniprot=False) -> (pd.DataFrame, dict):
if not Path(file_path).is_file():
msg = "The file upload is empty. Please provide a DIA-NN MS file."
return None, dict(messages=[dict(level=messages.ERROR, msg=msg)])

df = pd.read_csv(
file_path,
sep="\t",
low_memory=False,
na_values=["", 0],
keep_default_na=True,
)
df = df.drop(
columns=["Protein.Group", "Protein.Names", "Genes", "First.Protein.Description"]
)
# rename column names of samples, removing file path and ".raw" if present
intensity_df = df.rename(columns=lambda x: re.sub(r"(.*[/\\])|(.raw)", r"", x))
intensity_df = intensity_df.rename(columns={"Protein.Ids": "Protein ID"})

intensity_name = "Intensity"

return transform_and_clean(intensity_df, intensity_name, map_to_uniprot)


def transform_and_clean(
df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool
) -> (pd.DataFrame, dict):
"""
Transforms a dataframe that is read from a file in wide format into long format,
removing contaminant groups, and processing protein ids, removing invalid ones
@@ -97,7 +127,6 @@ def transform_and_clean(df, intensity_name, map_to_uniprot):
:rtype: tuple[pd.DataFrame, dict]
"""
assert "Protein ID" in df.columns

contaminant_groups_mask = df["Protein ID"].map(
lambda group: any(id_.startswith("CON__") for id_ in group.split(";"))
)
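The column rename in diann_import strips everything up to the last path separator (Windows or POSIX) and a trailing ".raw" in one substitution; note the unescaped dot in `(.raw)` matches any character before "raw". A quick standalone check against the headers used in tests/diann_intensities.tsv:

```python
import re

cols = [
    r"D:\MPL\Gereon\20230418 Hela CM 24h prodi stork\LM07061.raw",
    "/home/sampleuser/data/LM07062.raw",
    "Protein.Ids",  # non-run columns are left untouched
]
cleaned = [re.sub(r"(.*[/\\])|(.raw)", r"", c) for c in cols]
print(cleaned)  # ['LM07061', 'LM07062', 'Protein.Ids']
```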
1 change: 1 addition & 0 deletions requirements.txt
@@ -33,3 +33,4 @@ protgraph @ git+https://github.com/antonneubauer/ProtGraph@master
joblib==1.2.0
networkx==3.1
beautifulsoup4==4.12.2
openpyxl==3.1.2
6 changes: 6 additions & 0 deletions tests/diann_intensities.tsv
@@ -0,0 +1,6 @@
Protein.Group Protein.Ids Protein.Names Genes First.Protein.Description D:\MPL\Gereon\20230418 Hela CM 24h prodi stork\LM07061.raw /home/sampleuser/data/LM07062.raw D:\MPL\Gereon\20230418 Hela CM 24h prodi stork\LM07063.raw
A0A087WWU8 A0A2R2Y2Q3;J3KN67;A0A087WWU8;A0A494C0P6 A0A087WWU8_HUMAN TPM3 Tropomyosin alpha-3 chain 329042.0 367477.0 381325.0
A0A0B4J2A2;P0DN37 A0A0B4J2A2;P0DN37 PAL4C_HUMAN;PAL4G_HUMAN PPIAL4C;PPIAL4G Peptidyl-prolyl cis-trans isomerase A-like 4C 138322.0 572539.0 96522.7
A0A0G2JPD3;A0A140T8W8;A0A140T8Y4;A0A1W2PPF8;A0A1W2PR61;Q5SPM2 A0A0G2JPD3;A0A140T8Y4;A0A1W2PR61;Q5SPM2;A0A140T8W8;A0A1W2PPF8 A0A0G2JPD3_HUMAN;A0A140T8W8_HUMAN;A0A140T8Y4_HUMAN;A0A1W2PPF8_HUMAN;A0A1W2PR61_HUMAN;Q5SPM2_HUMAN HLA-A HLA class I histocompatibility antigen, A alpha chain
A0A0U1RQV3 A0A0U1RQV3 A0A0U1RQV3_HUMAN EFEMP1 EGF-containing fibulin-like extracellular matrix protein 1 (Fragment) 122984.0 59042.7 72372.5
A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;Q53Z42 A0A140T913;A0A140T9I0;A0A140T9X5;A0A1W2PRT9;Q53Z42;A0A140T933;A0A140T955;A0A1W2PPQ2 A0A140T913_HUMAN;A0A140T933_HUMAN;A0A140T955_HUMAN;A0A140T9I0_HUMAN;A0A140T9X5_HUMAN;A0A1W2PPQ2_HUMAN;A0A1W2PRT9_HUMAN;Q53Z42_HUMAN HLA-A HLA class I histocompatibility antigen, A alpha chain 36317.0 27456.7
Binary file added tests/diann_run_relationship_metadata.xlsx
15 changes: 15 additions & 0 deletions tests/protzilla/importing/test_metadata_import.py
@@ -23,6 +23,21 @@ def test_metadata_import():
rmtree(RUNS_PATH / name)


def test_metadata_import_diann():
name = "test_run" + random_string()
run = Run.create(name)
run.step_index += 1
run.calculate_and_next(
metadata_import.metadata_import_method_diann,
file_path=f"{PROJECT_PATH}/tests/diann_run_relationship_metadata.xlsx",
)
test_metadata = pd.read_excel(
f"{PROJECT_PATH}/tests/diann_run_relationship_metadata.xlsx"
)
pd.testing.assert_frame_equal(test_metadata, run.metadata)
rmtree(RUNS_PATH / name)


def test_metadata_orientation():
name1 = "test_run" + random_string()
name2 = "test_run" + random_string()