
Commit 3ab5436

Merge pull request #322 from cschlaffner/306-dia-nn-support-for-input-data

306 dia nn support for input data
RogerAK authored Nov 23, 2023
2 parents af789f7 + 3565c1a commit 3ab5436
Showing 10 changed files with 291 additions and 38 deletions.
10 changes: 10 additions & 0 deletions protzilla/constants/location_mapping.py
@@ -36,11 +36,21 @@
"ms_data_import",
"ms_fragger_import",
): ms_data_import.ms_fragger_import,
(
"importing",
"ms_data_import",
"diann_import",
): ms_data_import.diann_import,
(
"importing",
"metadata_import",
"metadata_import_method",
): metadata_import.metadata_import_method,
(
"importing",
"metadata_import",
"metadata_import_method_diann",
): metadata_import.metadata_import_method_diann,
(
"importing",
"metadata_import",
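The new tuples register the DIA-NN callables in the same (section, step, method) → function mapping used by the existing importers, so a step can be dispatched by key lookup alone. A minimal sketch of that lookup pattern — the `location_map` name and `run_step` helper below are illustrative, not the repository's actual runner:

```python
# Hypothetical sketch of dispatch over a (section, step, method) -> callable
# mapping in the style of location_mapping.py; names here are illustrative.
from protzilla.importing import metadata_import, ms_data_import

location_map = {
    ("importing", "ms_data_import", "diann_import"): ms_data_import.diann_import,
    (
        "importing",
        "metadata_import",
        "metadata_import_method_diann",
    ): metadata_import.metadata_import_method_diann,
}

def run_step(section: str, step: str, method: str, df, **params):
    # Resolve the registered callable and apply it to the current dataframe.
    func = location_map[(section, step, method)]
    return func(df, **params)

# e.g. run_step("importing", "ms_data_import", "diann_import", None,
#               file_path="report.tsv", map_to_uniprot=False)
```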
33 changes: 33 additions & 0 deletions protzilla/constants/workflow_meta.json
@@ -27,6 +27,22 @@
}
}
},
"diann_import": {
"name": "DIA-NN",
"description": "DIA-NN Data Import",
"parameters": {
"file_path": {
"name": "DIA-NN intensities file:",
"type": "file",
"default": null
},
"map_to_uniprot": {
"name": "Map to Uniprot IDs using Biomart (online)",
"type": "boolean",
"default": false
}
}
},
"ms_fragger_import": {
"name": "MS Fragger",
"description": "MS Fragger Data Import",
@@ -81,6 +97,23 @@
}
}
},
"metadata_import_method_diann": {
"name": "Metadata Import DIA-NN",
"description": "Import Metadata for run relationships of DIA-NN",
"parameters": {
"file_path": {
"name": "Run-Relationship Metadata file:",
"type": "file",
"default": null
},
"groupby_sample": {
"name": "Group Replicate Runs by Sample using Median",
"type": "boolean",
"default": false
}

}
},
"metadata_column_assignment": {
"name": "Metadata Column Assignment",
"description": "Assign columns to metadata categories, repeatable for each category",
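Each entry under "parameters" declares the display name, input type, and default that the front end renders for a method. A hedged sketch of resolving those defaults from the JSON — the nesting assumed below (section → step → method) follows the tuples in location_mapping.py and may differ from the file's exact layout:

```python
import json

# Illustrative only: read the declared defaults for the new DIA-NN import step.
with open("protzilla/constants/workflow_meta.json") as f:
    meta = json.load(f)

# Assumed nesting: section -> step -> method -> "parameters".
params = meta["importing"]["ms_data_import"]["diann_import"]["parameters"]
defaults = {name: spec["default"] for name, spec in params.items()}
print(defaults)  # expected: {'file_path': None, 'map_to_uniprot': False}
```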
10 changes: 3 additions & 7 deletions protzilla/data_preprocessing/imputation.py
@@ -14,9 +14,9 @@

def by_knn(
intensity_df: pd.DataFrame,
number_of_neighbours=5,
number_of_neighbours: int = 5,
**kwargs # quantile, default is median
) -> tuple[pd.DataFrame, dict]:
) -> (pd.DataFrame, dict):
"""
A function to perform value imputation based on KNN
(k-nearest neighbors). Imputes missing values for each
@@ -60,7 +60,7 @@ def by_knn(

def by_simple_imputer(
intensity_df: pd.DataFrame,
strategy="mean",
strategy: str = "mean",
) -> tuple[pd.DataFrame, dict]:
"""
A function to perform protein-wise imputations
@@ -249,9 +249,6 @@ def by_normal_distribution_sampling(
distribution used for imputation is scaled compared to dataset.
Default: 1 (no scaling)
:type down_shift: float
:param round_values: whether to round the imputed values to the nearest integer
Default: False
:type round_values: bool
:return: returns an imputed dataframe in typical protzilla long format\
and an empty dict
:rtype: pd.DataFrame, dict
@@ -262,7 +259,6 @@
transformed_df = long_to_wide(intensity_df)
# iterate over all protein groups
for protein_grp in transformed_df.columns:

number_of_nans = transformed_df[protein_grp].isnull().sum()

# don't impute values if there are not enough values (> 1) to sample from
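by_knn imputes each missing value from the k nearest neighbours after pivoting the long protzilla frame to wide format (samples as rows, protein groups as columns). A self-contained sketch of the same idea using scikit-learn's KNNImputer on an already-wide frame — an illustration of the technique, not the repository function:

```python
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

# Wide format: one row per sample, one column per protein group.
wide = pd.DataFrame(
    {"P1": [10.0, np.nan, 12.0], "P2": [5.0, 6.0, np.nan]},
    index=["run1", "run2", "run3"],
)

# number_of_neighbours=5 is the function's default; 2 suffices for this toy frame.
imputer = KNNImputer(n_neighbors=2)
imputed = pd.DataFrame(
    imputer.fit_transform(wide), columns=wide.columns, index=wide.index
)
```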
129 changes: 102 additions & 27 deletions protzilla/importing/metadata_import.py
@@ -2,41 +2,66 @@

import pandas as pd
from django.contrib import messages
from pandas import DataFrame

from protzilla.constants.paths import PROJECT_PATH
from protzilla.utilities import random_string


def metadata_import_method(df, file_path, feature_orientation):
if file_path.endswith(".csv"):
meta_df = pd.read_csv(
file_path,
sep=",",
low_memory=False,
na_values=[""],
keep_default_na=True,
skipinitialspace=True,
)
elif file_path.endswith(".xlsx"):
meta_df = pd.read_excel(file_path)
elif file_path.endswith(".psv"):
meta_df = pd.read_csv(file_path, sep="|", low_memory=False)
elif file_path.endswith(".tsv"):
meta_df = pd.read_csv(file_path, sep="\t", low_memory=False)
elif file_path == "":
msg = "The file upload is empty. Please select a metadata file."
return df, dict(
meta_df=None,
messages=[dict(level=messages.ERROR, msg=msg)],
)
else:
msg = "File format not supported. \
Supported file formats are csv, xlsx, psv or tsv"
def file_importer(file_path: str) -> tuple[pd.DataFrame, str]:
"""
Imports a file based on its file extension and returns a pandas DataFrame together with a
status message; an empty DataFrame is returned if the file format is not supported or the
file is empty.
"""
try:
if file_path.endswith(".csv"):
meta_df = pd.read_csv(
file_path,
sep=",",
low_memory=False,
na_values=[""],
keep_default_na=True,
skipinitialspace=True,
)
elif file_path.endswith(".xlsx"):
meta_df = pd.read_excel(file_path)
elif file_path.endswith(".psv"):
meta_df = pd.read_csv(file_path, sep="|", low_memory=False)
elif file_path.endswith(".tsv"):
meta_df = pd.read_csv(file_path, sep="\t", low_memory=False)
elif file_path == "":
return (
pd.DataFrame(),
"The file upload is empty. Please select a metadata file.",
)
else:
return (
pd.DataFrame(),
"File format not supported. \
Supported file formats are csv, xlsx, psv or tsv",
)
msg = "Metadata file successfully imported."
return meta_df, msg
except pd.errors.EmptyDataError:
msg = "The file is empty."
return pd.DataFrame(), msg


def metadata_import_method(
df: pd.DataFrame, file_path: str, feature_orientation: str
) -> tuple[pd.DataFrame, dict]:
"""
Imports a metadata file. Returns the unchanged intensity dataframe and a dict containing an
error message if the import failed, or the metadata dataframe if it succeeded.
returns: (DataFrame, dict)
"""
meta_df, msg = file_importer(file_path)
if meta_df.empty:
return df, dict(
meta_df=None,
metadata=None,
messages=[dict(level=messages.ERROR, msg=msg)],
)

# always return metadata in the same orientation (features as columns)
# as the dtypes get lost when transposing, we save the df to disk after
# changing the format and read it again as "Columns"-oriented
@@ -55,6 +80,56 @@ def metadata_import_method(df, file_path, feature_orientation):
f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_"
):
os.remove(file_path)
if "replicate" in meta_df.columns:
# This indicates a DIA-NN metadata file with replicate information. We calculate the
# median across all MS runs of a sample, so that the intensity df holds one intensity
# per sample instead of one per MS run. Note that up to this point, "Sample" in the
# intensity df referred to the MS run.
res = pd.merge(
df,
meta_df[["MS run", "sample name"]],
left_on="Sample",
right_on="MS run",
how="left",
)
res = res.groupby(["Protein ID", "sample name"], as_index=False).median()
res.rename(columns={"sample name": "Sample"}, inplace=True)
return res, {"metadata": meta_df, "messages": [dict(level=messages.INFO, msg=msg)]}

return df, {"metadata": meta_df, "messages": [dict(level=messages.INFO, msg=msg)]}


def metadata_import_method_diann(
df: DataFrame, file_path: str, groupby_sample: bool = False
) -> (DataFrame, dict):
"""
This method imports a metadata file with run relationship information and returns the intensity dataframe and the
metadata dataframe. If the import fails, it returns the unchanged dataframe and a dict with a message about the
error.
"""
meta_df, msg = file_importer(file_path)
if meta_df.empty:
return df, dict(
metadata=None,
messages=[dict(level=messages.ERROR, msg=msg)],
)

if file_path.startswith(
f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_"
):
os.remove(file_path)

if groupby_sample:
# we want to take the median of all MS runs (column "Sample" in the intensity df) for each Sample
# (column "sample name" in the metadata df)
res = pd.merge(
df,
meta_df[["MS run", "sample name"]],
left_on="Sample",
right_on="MS run",
how="left",
)
res = res.groupby(["Protein ID", "sample name"], as_index=False).median()
res.rename(columns={"sample name": "Sample"}, inplace=True)
return res, {"metadata": meta_df}

return df, {"metadata": meta_df}

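The replicate handling in metadata_import_method_diann merges the intensity table with the run-relationship metadata on the MS run, takes the per-(protein, sample) median, and renames "sample name" back to "Sample". A toy example of that merge-and-median — column names follow the diff, the data is invented, and numeric_only=True is added here because newer pandas versions refuse to take a median over string columns:

```python
import pandas as pd

# Long-format intensities: "Sample" still refers to the MS run at this point.
df = pd.DataFrame({
    "Sample": ["LM07061", "LM07062", "LM07063"],
    "Protein ID": ["A0A087WWU8"] * 3,
    "Intensity": [329042.0, 367477.0, 381325.0],
})
# Run-relationship metadata: the first two replicate runs belong to one sample.
meta_df = pd.DataFrame({
    "MS run": ["LM07061", "LM07062", "LM07063"],
    "sample name": ["HeLa_1", "HeLa_1", "HeLa_2"],
})

res = pd.merge(df, meta_df[["MS run", "sample name"]],
               left_on="Sample", right_on="MS run", how="left")
res = res.groupby(["Protein ID", "sample name"], as_index=False).median(numeric_only=True)
res = res.rename(columns={"sample name": "Sample"})
# HeLa_1 now carries the median of runs LM07061 and LM07062 (348259.5).
```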
37 changes: 33 additions & 4 deletions protzilla/importing/ms_data_import.py
@@ -9,7 +9,9 @@
from protzilla.data_integration.database_query import biomart_query


def max_quant_import(_, file_path, intensity_name, map_to_uniprot=False):
def max_quant_import(
_: pd.DataFrame, file_path: str, intensity_name: str, map_to_uniprot=False
) -> (pd.DataFrame, dict):
assert intensity_name in ["Intensity", "iBAQ", "LFQ intensity"]
if not Path(file_path).is_file():
msg = "The file upload is empty. Please provide a Max Quant file."
Expand All @@ -36,7 +38,9 @@ def max_quant_import(_, file_path, intensity_name, map_to_uniprot=False):
return transform_and_clean(intensity_df, intensity_name, map_to_uniprot)


def ms_fragger_import(_, file_path, intensity_name, map_to_uniprot=False):
def ms_fragger_import(
_: pd.DataFrame, file_path: str, intensity_name: str, map_to_uniprot=False
) -> (pd.DataFrame, dict):
assert intensity_name in [
"Intensity",
"MaxLFQ Total Intensity",
@@ -81,7 +85,33 @@ def ms_fragger_import(_, file_path, intensity_name, map_to_uniprot=False):
return transform_and_clean(intensity_df, intensity_name, map_to_uniprot)


def transform_and_clean(df, intensity_name, map_to_uniprot):
def diann_import(_, file_path, map_to_uniprot=False) -> (pd.DataFrame, dict):
if not Path(file_path).is_file():
msg = "The file upload is empty. Please provide a DIA-NN MS file."
return None, dict(messages=[dict(level=messages.ERROR, msg=msg)])

df = pd.read_csv(
file_path,
sep="\t",
low_memory=False,
na_values=["", 0],
keep_default_na=True,
)
df = df.drop(
columns=["Protein.Group", "Protein.Names", "Genes", "First.Protein.Description"]
)
# rename column names of samples, removing file path and ".raw" if present
intensity_df = df.rename(columns=lambda x: re.sub(r"(.*[/\\])|(.raw)", r"", x))
intensity_df = intensity_df.rename(columns={"Protein.Ids": "Protein ID"})

intensity_name = "Intensity"

return transform_and_clean(intensity_df, intensity_name, map_to_uniprot)


def transform_and_clean(
df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool
) -> (pd.DataFrame, dict):
"""
Transforms a dataframe that is read from a file in wide format into long format,
removing contaminant groups, and processing protein ids, removing invalid ones
@@ -97,7 +127,6 @@ def transform_and_clean(df, intensity_name, map_to_uniprot):
:rtype: tuple[pd.DataFrame, dict]
"""
assert "Protein ID" in df.columns

contaminant_groups_mask = df["Protein ID"].map(
lambda group: any(id_.startswith("CON__") for id_ in group.split(";"))
)
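The column rename in diann_import strips everything up to the last path separator (Windows or POSIX) and a trailing ".raw" in one substitution; note the unescaped dot in `(.raw)` matches any character before "raw". A quick standalone check against the headers used in tests/diann_intensities.tsv:

```python
import re

cols = [
    r"D:\MPL\Gereon\20230418 Hela CM 24h prodi stork\LM07061.raw",
    "/home/sampleuser/data/LM07062.raw",
    "Protein.Ids",  # non-run columns are left untouched
]
cleaned = [re.sub(r"(.*[/\\])|(.raw)", r"", c) for c in cols]
print(cleaned)  # ['LM07061', 'LM07062', 'Protein.Ids']
```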
1 change: 1 addition & 0 deletions requirements.txt
@@ -33,3 +33,4 @@ protgraph @ git+https://github.com/antonneubauer/ProtGraph@master
joblib==1.2.0
networkx==3.1
beautifulsoup4==4.12.2
openpyxl==3.1.2
6 changes: 6 additions & 0 deletions tests/diann_intensities.tsv
@@ -0,0 +1,6 @@
Protein.Group Protein.Ids Protein.Names Genes First.Protein.Description D:\MPL\Gereon\20230418 Hela CM 24h prodi stork\LM07061.raw /home/sampleuser/data/LM07062.raw D:\MPL\Gereon\20230418 Hela CM 24h prodi stork\LM07063.raw
A0A087WWU8 A0A2R2Y2Q3;J3KN67;A0A087WWU8;A0A494C0P6 A0A087WWU8_HUMAN TPM3 Tropomyosin alpha-3 chain 329042.0 367477.0 381325.0
A0A0B4J2A2;P0DN37 A0A0B4J2A2;P0DN37 PAL4C_HUMAN;PAL4G_HUMAN PPIAL4C;PPIAL4G Peptidyl-prolyl cis-trans isomerase A-like 4C 138322.0 572539.0 96522.7
A0A0G2JPD3;A0A140T8W8;A0A140T8Y4;A0A1W2PPF8;A0A1W2PR61;Q5SPM2 A0A0G2JPD3;A0A140T8Y4;A0A1W2PR61;Q5SPM2;A0A140T8W8;A0A1W2PPF8 A0A0G2JPD3_HUMAN;A0A140T8W8_HUMAN;A0A140T8Y4_HUMAN;A0A1W2PPF8_HUMAN;A0A1W2PR61_HUMAN;Q5SPM2_HUMAN HLA-A HLA class I histocompatibility antigen, A alpha chain
A0A0U1RQV3 A0A0U1RQV3 A0A0U1RQV3_HUMAN EFEMP1 EGF-containing fibulin-like extracellular matrix protein 1 (Fragment) 122984.0 59042.7 72372.5
A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;Q53Z42 A0A140T913;A0A140T9I0;A0A140T9X5;A0A1W2PRT9;Q53Z42;A0A140T933;A0A140T955;A0A1W2PPQ2 A0A140T913_HUMAN;A0A140T933_HUMAN;A0A140T955_HUMAN;A0A140T9I0_HUMAN;A0A140T9X5_HUMAN;A0A1W2PPQ2_HUMAN;A0A1W2PRT9_HUMAN;Q53Z42_HUMAN HLA-A HLA class I histocompatibility antigen, A alpha chain 36317.0 27456.7
Binary file added tests/diann_run_relationship_metadata.xlsx
15 changes: 15 additions & 0 deletions tests/protzilla/importing/test_metadata_import.py
@@ -23,6 +23,21 @@ def test_metadata_import():
rmtree(RUNS_PATH / name)


def test_metadata_import_diann():
name = "test_run" + random_string()
run = Run.create(name)
run.step_index += 1
run.calculate_and_next(
metadata_import.metadata_import_method_diann,
file_path=f"{PROJECT_PATH}/tests/diann_run_relationship_metadata.xlsx",
)
test_metadata = pd.read_excel(
f"{PROJECT_PATH}/tests/diann_run_relationship_metadata.xlsx"
)
pd.testing.assert_frame_equal(test_metadata, run.metadata)
rmtree(RUNS_PATH / name)


def test_metadata_orientation():
name1 = "test_run" + random_string()
name2 = "test_run" + random_string()