Revert "Merge pull request #322 from cschlaffner/306-dia-nn-support-f…
Browse files Browse the repository at this point in the history
…or-input-data"

This reverts commit 3ab5436, reversing
changes made to af789f7.
henninggaertner committed Nov 24, 2023
1 parent 3ab5436 commit 5bd8231
Showing 10 changed files with 38 additions and 291 deletions.
10 changes: 0 additions & 10 deletions protzilla/constants/location_mapping.py
@@ -36,21 +36,11 @@
"ms_data_import",
"ms_fragger_import",
): ms_data_import.ms_fragger_import,
(
"importing",
"ms_data_import",
"diann_import",
): ms_data_import.diann_import,
(
"importing",
"metadata_import",
"metadata_import_method",
): metadata_import.metadata_import_method,
(
"importing",
"metadata_import",
"metadata_import_method_diann",
): metadata_import.metadata_import_method_diann,
(
"importing",
"metadata_import",
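The deleted lines above come from protzilla's location mapping: a plain dict whose keys are (section, step, method) tuples and whose values are the callables implementing each method, with the two DIA-NN entries now removed. A minimal sketch of how such a dispatch table resolves a selected method; the method_map and run_method names and the exact import path are assumptions for illustration, not protzilla's actual API:

    from protzilla.importing import metadata_import, ms_data_import

    # trimmed-down dispatch table mirroring location_mapping.py after the revert
    method_map = {
        ("importing", "ms_data_import", "ms_fragger_import"): ms_data_import.ms_fragger_import,
        ("importing", "metadata_import", "metadata_import_method"): metadata_import.metadata_import_method,
    }

    def run_method(section, step, method, df, **parameters):
        # look up the callable registered for this workflow location and apply it
        return method_map[(section, step, method)](df, **parameters)
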
33 changes: 0 additions & 33 deletions protzilla/constants/workflow_meta.json
@@ -27,22 +27,6 @@
}
}
},
"diann_import": {
"name": "DIA-NN",
"description": "DIA-NN Data Import",
"parameters": {
"file_path": {
"name": "DIA-NN intensities file:",
"type": "file",
"default": null
},
"map_to_uniprot": {
"name": "Map to Uniprot IDs using Biomart (online)",
"type": "boolean",
"default": false
}
}
},
"ms_fragger_import": {
"name": "MS Fragger",
"description": "MS Fragger Data Import",
@@ -97,23 +81,6 @@
}
}
},
"metadata_import_method_diann": {
"name": "Metadata Import DIA-NN",
"description": "Import Metadata for run relationships of DIA-NN",
"parameters": {
"file_path": {
"name": "Run-Relationship Metadata file:",
"type": "file",
"default": null
},
"groupby_sample": {
"name": "Group Replicate Runs by Sample using Median",
"type": "boolean",
"default": false
}

}
},
"metadata_column_assignment": {
"name": "Metadata Column Assignment",
"description": "Assign columns to metadata categories, repeatable for each category",
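Each removed workflow_meta.json block declared the UI parameters for one DIA-NN method; the keys under "parameters" line up with the keyword arguments of the Python functions deleted further down in this commit. A hedged sketch of the calls those entries used to drive (the file paths are placeholders, and both functions no longer exist after this revert):

    from protzilla.importing import metadata_import, ms_data_import

    # "file_path" and "map_to_uniprot" mirrored the removed diann_import parameter block
    intensity_df, import_messages = ms_data_import.diann_import(
        None, file_path="path/to/diann_report.tsv", map_to_uniprot=False
    )

    # "file_path" and "groupby_sample" mirrored the removed metadata_import_method_diann block
    intensity_df, metadata_out = metadata_import.metadata_import_method_diann(
        intensity_df, file_path="path/to/run_relationship_metadata.xlsx", groupby_sample=False
    )
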
10 changes: 7 additions & 3 deletions protzilla/data_preprocessing/imputation.py
@@ -14,9 +14,9 @@

def by_knn(
intensity_df: pd.DataFrame,
number_of_neighbours: int = 5,
number_of_neighbours=5,
**kwargs # quantile, default is median
) -> (pd.DataFrame, dict):
) -> tuple[pd.DataFrame, dict]:
"""
A function to perform value imputation based on KNN
(k-nearest neighbors). Imputes missing values for each
@@ -60,7 +60,7 @@ def by_knn(

def by_simple_imputer(
intensity_df: pd.DataFrame,
strategy: str = "mean",
strategy="mean",
) -> tuple[pd.DataFrame, dict]:
"""
A function to perform protein-wise imputations
@@ -249,6 +249,9 @@ def by_normal_distribution_sampling(
distribution used for imputation is scaled compared to dataset.
Default: 1 (no scaling)
:type down_shift: float
:param round_values: whether to round the imputed values to the nearest integer
Default: False
:type round_values: bool
:return: returns an imputed dataframe in typical protzilla long format\
and an empty dict
    :rtype: tuple[pd.DataFrame, dict]
@@ -259,6 +262,7 @@
transformed_df = long_to_wide(intensity_df)
# iterate over all protein groups
for protein_grp in transformed_df.columns:

number_of_nans = transformed_df[protein_grp].isnull().sum()

        # don't impute values if there are not enough values (> 1) to sample from
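The added docstring lines document the round_values flag of by_normal_distribution_sampling; the idea behind the method is to fill a protein group's missing intensities by sampling from a normal distribution whose mean is shifted below the observed values. A small self-contained sketch of that idea for a single protein column; the shift and scaling semantics are assumptions for illustration, not protzilla's exact implementation:

    import numpy as np
    import pandas as pd

    def impute_normal_downshift(values: pd.Series, down_shift: float,
                                round_values: bool = False) -> pd.Series:
        observed = values.dropna()
        if len(observed) <= 1:  # mirrors the "not enough values (> 1) to sample from" guard
            return values
        # sample from a normal distribution centred below the observed mean
        mean = observed.mean() - down_shift * observed.std()
        sampled = np.random.normal(mean, observed.std(), size=values.isna().sum())
        if round_values:
            sampled = np.round(sampled)  # optionally round imputed values to the nearest integer
        result = values.copy()
        result[values.isna()] = sampled
        return result
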
129 changes: 27 additions & 102 deletions protzilla/importing/metadata_import.py
@@ -2,66 +2,41 @@

import pandas as pd
from django.contrib import messages
from pandas import DataFrame

from protzilla.constants.paths import PROJECT_PATH
from protzilla.utilities import random_string


def file_importer(file_path: str) -> tuple[pd.DataFrame, str]:
"""
    Imports a file based on its file extension and returns a pandas DataFrame together with a status message;
    an empty DataFrame is returned if the file format is not supported or the file is empty.
"""
try:
if file_path.endswith(".csv"):
meta_df = pd.read_csv(
file_path,
sep=",",
low_memory=False,
na_values=[""],
keep_default_na=True,
skipinitialspace=True,
)
elif file_path.endswith(".xlsx"):
meta_df = pd.read_excel(file_path)
elif file_path.endswith(".psv"):
meta_df = pd.read_csv(file_path, sep="|", low_memory=False)
elif file_path.endswith(".tsv"):
meta_df = pd.read_csv(file_path, sep="\t", low_memory=False)
elif file_path == "":
return (
pd.DataFrame(),
"The file upload is empty. Please select a metadata file.",
)
else:
return (
pd.DataFrame(),
"File format not supported. \
Supported file formats are csv, xlsx, psv or tsv",
)
msg = "Metadata file successfully imported."
return meta_df, msg
except pd.errors.EmptyDataError:
msg = "The file is empty."
return pd.DataFrame(), msg


def metadata_import_method(
df: pd.DataFrame, file_path: str, feature_orientation: str
) -> tuple[pd.DataFrame, dict]:
"""
Imports a metadata file and returns the intensity dataframe and a dict with a message if the file import failed,
and the metadata dataframe if the import was successful.
returns: (DataFrame, dict)
"""
meta_df, msg = file_importer(file_path)
if meta_df.empty:
def metadata_import_method(df, file_path, feature_orientation):
if file_path.endswith(".csv"):
meta_df = pd.read_csv(
file_path,
sep=",",
low_memory=False,
na_values=[""],
keep_default_na=True,
skipinitialspace=True,
)
elif file_path.endswith(".xlsx"):
meta_df = pd.read_excel(file_path)
elif file_path.endswith(".psv"):
meta_df = pd.read_csv(file_path, sep="|", low_memory=False)
elif file_path.endswith(".tsv"):
meta_df = pd.read_csv(file_path, sep="\t", low_memory=False)
elif file_path == "":
msg = "The file upload is empty. Please select a metadata file."
return df, dict(
meta_df=None,
messages=[dict(level=messages.ERROR, msg=msg)],
)
else:
msg = "File format not supported. \
Supported file formats are csv, xlsx, psv or tsv"
return df, dict(
metadata=None,
meta_df=None,
messages=[dict(level=messages.ERROR, msg=msg)],
)

# always return metadata in the same orientation (features as columns)
# as the dtype get lost when transposing, we save the df to disk after
# changing the format and read it again as "Columns"-oriented
@@ -80,56 +55,6 @@ def metadata_import_method(
f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_"
):
os.remove(file_path)
if "replicate" in meta_df.columns:
# this indicates a DIANN metadata file with replicate information, we now want to calculate the median across
# all MS runs for a sample then instead of having intensities for each MS run in our dataframe, we
# have intensities for each sample
# note that up until now, "Sample" in the intensity df referred to the ms run
res = pd.merge(
df,
meta_df[["MS run", "sample name"]],
left_on="Sample",
right_on="MS run",
how="left",
)
res.groupby(["Protein ID", "sample name"], as_index=False).median()

return df, {"metadata": meta_df, "messages": [dict(level=messages.INFO, msg=msg)]}


def metadata_import_method_diann(
df: DataFrame, file_path: str, groupby_sample: bool = False
) -> (DataFrame, dict):
"""
This method imports a metadata file with run relationship information and returns the intensity dataframe and the
metadata dataframe. If the import fails, it returns the unchanged dataframe and a dict with a message about the
error.
"""
meta_df, msg = file_importer(file_path)
if meta_df.empty:
return df, dict(
metadata=None,
messages=[dict(level=messages.ERROR, msg=msg)],
)

if file_path.startswith(
f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_"
):
os.remove(file_path)

if groupby_sample:
# we want to take the median of all MS runs (column "Sample" in the intensity df) for each Sample
# (column "sample name" in the metadata df)
res = pd.merge(
df,
meta_df[["MS run", "sample name"]],
left_on="Sample",
right_on="MS run",
how="left",
)
res = res.groupby(["Protein ID", "sample name"], as_index=False).median()
res.rename(columns={"sample name": "Sample"}, inplace=True)
return res, {"metadata": meta_df}

return df, {"metadata": meta_df}

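The comments in the removed metadata_import_method branch (and in metadata_import_method_diann) describe collapsing replicate MS runs into one intensity per sample by merging the run-relationship metadata onto the intensity dataframe and taking the median. A small standalone sketch of that grouping, using the column names from the diff ("Sample", "Protein ID", "MS run", "sample name"); the toy data is made up for illustration:

    import pandas as pd

    # toy intensity df: one row per (MS run, protein group), "Sample" holds the run name
    df = pd.DataFrame({
        "Sample": ["run_1", "run_2", "run_1", "run_2"],
        "Protein ID": ["P12345", "P12345", "Q67890", "Q67890"],
        "Intensity": [10.0, 14.0, 3.0, 5.0],
    })
    # toy run-relationship metadata: maps each MS run to its biological sample
    meta_df = pd.DataFrame({"MS run": ["run_1", "run_2"], "sample name": ["sample_A", "sample_A"]})

    res = pd.merge(df, meta_df[["MS run", "sample name"]],
                   left_on="Sample", right_on="MS run", how="left")
    res = res.groupby(["Protein ID", "sample name"], as_index=False).median(numeric_only=True)
    res = res.rename(columns={"sample name": "Sample"})
    print(res)  # one median intensity per (protein group, sample)
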
37 changes: 4 additions & 33 deletions protzilla/importing/ms_data_import.py
@@ -9,9 +9,7 @@
from protzilla.data_integration.database_query import biomart_query


def max_quant_import(
_: pd.DataFrame, file_path: str, intensity_name: str, map_to_uniprot=False
) -> (pd.DataFrame, dict):
def max_quant_import(_, file_path, intensity_name, map_to_uniprot=False):
assert intensity_name in ["Intensity", "iBAQ", "LFQ intensity"]
if not Path(file_path).is_file():
msg = "The file upload is empty. Please provide a Max Quant file."
@@ -38,9 +36,7 @@ def max_quant_import(
return transform_and_clean(intensity_df, intensity_name, map_to_uniprot)


def ms_fragger_import(
_: pd.DataFrame, file_path: str, intensity_name: str, map_to_uniprot=False
) -> (pd.DataFrame, dict):
def ms_fragger_import(_, file_path, intensity_name, map_to_uniprot=False):
assert intensity_name in [
"Intensity",
"MaxLFQ Total Intensity",
@@ -85,33 +81,7 @@ def ms_fragger_import(
return transform_and_clean(intensity_df, intensity_name, map_to_uniprot)


def diann_import(_, file_path, map_to_uniprot=False) -> (pd.DataFrame, dict):
if not Path(file_path).is_file():
msg = "The file upload is empty. Please provide a DIA-NN MS file."
return None, dict(messages=[dict(level=messages.ERROR, msg=msg)])

df = pd.read_csv(
file_path,
sep="\t",
low_memory=False,
na_values=["", 0],
keep_default_na=True,
)
df = df.drop(
columns=["Protein.Group", "Protein.Names", "Genes", "First.Protein.Description"]
)
# rename column names of samples, removing file path and ".raw" if present
intensity_df = df.rename(columns=lambda x: re.sub(r"(.*[/\\])|(.raw)", r"", x))
intensity_df = intensity_df.rename(columns={"Protein.Ids": "Protein ID"})

intensity_name = "Intensity"

return transform_and_clean(intensity_df, intensity_name, map_to_uniprot)


def transform_and_clean(
df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool
) -> (pd.DataFrame, dict):
def transform_and_clean(df, intensity_name, map_to_uniprot):
"""
Transforms a dataframe that is read from a file in wide format into long format,
removing contaminant groups, and processing protein ids, removing invalid ones
@@ -127,6 +97,7 @@ def transform_and_clean(
:rtype: tuple[pd.DataFrame, list[str], list[str]]
"""
assert "Protein ID" in df.columns

contaminant_groups_mask = df["Protein ID"].map(
lambda group: any(id_.startswith("CON__") for id_ in group.split(";"))
)
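The removed diann_import strips run-file paths and the ".raw" suffix from DIA-NN sample columns, and transform_and_clean flags contaminant protein groups whose IDs start with "CON__". A short sketch of those two clean-up steps on made-up column names and IDs; the regex and the contaminant lambda are copied from the diff, the surrounding code and the removal of flagged rows are illustrative assumptions:

    import re
    import pandas as pd

    df = pd.DataFrame({
        "Protein.Ids": ["P12345", "CON__P99999;P11111"],
        r"C:\runs\exp1\sample_A.raw": [1.0, 2.0],
        "/data/exp1/sample_B.raw": [3.0, 4.0],
    })

    # drop directory prefixes and the ".raw" suffix from sample column names (regex from diann_import)
    df = df.rename(columns=lambda x: re.sub(r"(.*[/\\])|(.raw)", r"", x))
    df = df.rename(columns={"Protein.Ids": "Protein ID"})

    # mask protein groups that contain a contaminant entry, as in transform_and_clean
    contaminant_mask = df["Protein ID"].map(
        lambda group: any(id_.startswith("CON__") for id_ in group.split(";"))
    )
    df = df[~contaminant_mask]
    print(df.columns.tolist())  # ['Protein ID', 'sample_A', 'sample_B']
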
1 change: 0 additions & 1 deletion requirements.txt
@@ -33,4 +33,3 @@ protgraph @ git+https://github.com/antonneubauer/ProtGraph@master
joblib==1.2.0
networkx==3.1
beautifulsoup4==4.12.2
openpyxl==3.1.2
6 changes: 0 additions & 6 deletions tests/diann_intensities.tsv

This file was deleted.

Binary file removed tests/diann_run_relationship_metadata.xlsx
Binary file not shown.
15 changes: 0 additions & 15 deletions tests/protzilla/importing/test_metadata_import.py
@@ -23,21 +23,6 @@ def test_metadata_import():
rmtree(RUNS_PATH / name)


def test_metadata_import_diann():
name = "test_run" + random_string()
run = Run.create(name)
run.step_index += 1
run.calculate_and_next(
metadata_import.metadata_import_method_diann,
file_path=f"{PROJECT_PATH}/tests/diann_run_relationship_metadata.xlsx",
)
test_metadata = pd.read_excel(
f"{PROJECT_PATH}/tests/diann_run_relationship_metadata.xlsx"
)
pd.testing.assert_frame_equal(test_metadata, run.metadata)
rmtree(RUNS_PATH / name)


def test_metadata_orientation():
name1 = "test_run" + random_string()
name2 = "test_run" + random_string()