From 8a49a7bf9b12fbb39f0e46fae81e71eef04b4e21 Mon Sep 17 00:00:00 2001
From: "janni.roebbecke" <janni.roebbecke@student.hpi.uni-potsdam.de>
Date: Wed, 5 Jun 2024 17:06:57 +0200
Subject: [PATCH] implement requested changes: make peptide_df input optional
 for all preprocessing steps; remove peptide_df input from preprocessing steps
 where it is not used

---
 .../data_preprocessing/filter_proteins.py     |  4 ++--
 .../data_preprocessing/filter_samples.py      |  6 +++---
 protzilla/data_preprocessing/imputation.py    |  6 ------
 protzilla/data_preprocessing/normalisation.py |  6 ++----
 .../data_preprocessing/outlier_detection.py   |  6 +++---
 .../data_preprocessing/transformation.py      |  2 +-
 protzilla/methods/data_preprocessing.py       | 20 +++++++++----------
 7 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/protzilla/data_preprocessing/filter_proteins.py b/protzilla/data_preprocessing/filter_proteins.py
index a597c8c7..5e0bb7b8 100644
--- a/protzilla/data_preprocessing/filter_proteins.py
+++ b/protzilla/data_preprocessing/filter_proteins.py
@@ -5,8 +5,8 @@
 
 
 def by_samples_missing(
-    protein_df: pd.DataFrame = None,
-    peptide_df: pd.DataFrame = None,
+    protein_df: pd.DataFrame | None,
+    peptide_df: pd.DataFrame | None,
     percentage: float = 0.5,
 ) -> dict:
     """
diff --git a/protzilla/data_preprocessing/filter_samples.py b/protzilla/data_preprocessing/filter_samples.py
index da1fa1aa..626aabb4 100644
--- a/protzilla/data_preprocessing/filter_samples.py
+++ b/protzilla/data_preprocessing/filter_samples.py
@@ -6,7 +6,7 @@
 
 def by_protein_intensity_sum(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
+    peptide_df: pd.DataFrame | None,
     deviation_threshold: float,
 ) -> dict:
     """
@@ -47,7 +47,7 @@ def by_protein_intensity_sum(
 
 def by_protein_count(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
+    peptide_df: pd.DataFrame | None,
     deviation_threshold: float,
 ) -> dict:
     """
@@ -93,7 +93,7 @@ def by_protein_count(
 
 def by_proteins_missing(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
+    peptide_df: pd.DataFrame | None,
     percentage: float,
 ) -> dict:
     """
diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py
index d4f6fdd1..48af773b 100644
--- a/protzilla/data_preprocessing/imputation.py
+++ b/protzilla/data_preprocessing/imputation.py
@@ -57,7 +57,6 @@ def flag_invalid_values(df: pd.DataFrame, messages: list) -> dict:
 
 def by_knn(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
     number_of_neighbours: int = 5,
     **kwargs,  # quantile, default is median
 ) -> dict:
@@ -105,7 +104,6 @@ def by_knn(
 
 def by_simple_imputer(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
     strategy: str = "mean",
 ) -> dict:
     """
@@ -146,7 +144,6 @@ def by_simple_imputer(
 
 def by_min_per_sample(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
     shrinking_value: float = 1,
 ) -> dict:
     """
@@ -189,7 +186,6 @@ def by_min_per_sample(
 
 def by_min_per_protein(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
     shrinking_value: float = 1,
 ) -> dict:
     """
@@ -233,7 +229,6 @@ def by_min_per_protein(
 
 def by_min_per_dataset(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
     shrinking_value: float = 1,
 ) -> dict:
     """
@@ -264,7 +259,6 @@ def by_min_per_dataset(
 
 def by_normal_distribution_sampling(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
     strategy: str = "perProtein",
     down_shift: float = 0,
     scaling_factor: float = 1,
diff --git a/protzilla/data_preprocessing/normalisation.py b/protzilla/data_preprocessing/normalisation.py
index 5b7929a5..e2be755b 100644
--- a/protzilla/data_preprocessing/normalisation.py
+++ b/protzilla/data_preprocessing/normalisation.py
@@ -8,7 +8,7 @@
 from protzilla.utilities import default_intensity_column
 
 
-def by_z_score(protein_df: pd.DataFrame, peptide_df: pd.DataFrame) -> dict:
+def by_z_score(protein_df: pd.DataFrame) -> dict:
     """
     A function to run the sklearn StandardScaler class on your dataframe.
     Normalises the data on the level of each sample.
@@ -49,7 +49,6 @@ def by_z_score(protein_df: pd.DataFrame, peptide_df: pd.DataFrame) -> dict:
 
 def by_median(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
     percentile=0.5,  # quartile, default is median
 ) -> dict:
     """
@@ -114,7 +113,7 @@ def by_median(
     )
 
 
-def by_totalsum(protein_df: pd.DataFrame, peptide_df: pd.DataFrame) -> dict:
+def by_totalsum(protein_df: pd.DataFrame) -> dict:
     """
     A function to perform normalisation using the total sum
     of sample intensities on your dataframe.
@@ -172,7 +171,6 @@ def by_totalsum(protein_df: pd.DataFrame, peptide_df: pd.DataFrame) -> dict:
 
 def by_reference_protein(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
     reference_protein: str,
 ) -> dict:
     """
diff --git a/protzilla/data_preprocessing/outlier_detection.py b/protzilla/data_preprocessing/outlier_detection.py
index d279a118..be7b8eb8 100644
--- a/protzilla/data_preprocessing/outlier_detection.py
+++ b/protzilla/data_preprocessing/outlier_detection.py
@@ -15,7 +15,7 @@
 
 def by_isolation_forest(
         protein_df: pd.DataFrame,
-        peptide_df: pd.DataFrame,
+        peptide_df: pd.DataFrame | None,
         n_estimators: int = 100,
         n_jobs: int = -1,
 ) -> dict:
@@ -85,7 +85,7 @@ def by_isolation_forest(
 
 def by_local_outlier_factor(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
+    peptide_df: pd.DataFrame | None,
     number_of_neighbors: int = 20,
     n_jobs: int = -1,
 ) -> dict:
@@ -148,7 +148,7 @@ def by_local_outlier_factor(
 
 def by_pca(
     protein_df: pd.DataFrame,
-    peptide_df: pd.DataFrame,
+    peptide_df: pd.DataFrame | None,
     threshold: int = 2,
     number_of_components: int = 3,
 ) -> dict:
diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py
index f331195a..221b01ab 100644
--- a/protzilla/data_preprocessing/transformation.py
+++ b/protzilla/data_preprocessing/transformation.py
@@ -5,7 +5,7 @@
 from protzilla.utilities import default_intensity_column
 
 
-def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame, log_base="log10") -> dict:
+def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10") -> dict:
     """
     This function log-transforms intensity
     DataFrames. Supports log-transformation to the base
diff --git a/protzilla/methods/data_preprocessing.py b/protzilla/methods/data_preprocessing.py
index 85d9b059..0565eaf0 100644
--- a/protzilla/methods/data_preprocessing.py
+++ b/protzilla/methods/data_preprocessing.py
@@ -182,7 +182,7 @@ class NormalisationByZScore(DataPreprocessingStep):
     operation = "normalisation"
     method_description = "Normalise data by Z-Score"
 
-    plot_input_names = ["protein_df", "peptide_df"]
+    plot_input_names = ["protein_df"]
 
     def method(self, inputs):
         return normalisation.by_z_score(**inputs)
@@ -196,7 +196,7 @@ class NormalisationByTotalSum(DataPreprocessingStep):
     operation = "normalisation"
     method_description = "Normalise data by total sum"
 
-    plot_input_names = ["protein_df", "peptide_df"]
+    plot_input_names = ["protein_df"]
 
     def method(self, inputs):
         return normalisation.by_totalsum(**inputs)
@@ -210,7 +210,7 @@ class NormalisationByMedian(DataPreprocessingStep):
     operation = "normalisation"
     method_description = "Normalise data by median"
 
-    input_keys = ["protein_df", "peptide_df", "percentile"]
+    input_keys = ["protein_df", "percentile"]
 
     def method(self, inputs):
         return normalisation.by_median(**inputs)
@@ -224,7 +224,7 @@ class NormalisationByReferenceProtein(DataPreprocessingStep):
     operation = "normalisation"
     method_description = "Normalise data by reference protein"
 
-    input_keys = ["protein_df", "peptide_df", "reference_protein"]
+    input_keys = ["protein_df", "reference_protein"]
 
     def method(self, inputs):
         return normalisation.by_reference_protein(**inputs)
@@ -238,7 +238,7 @@ class ImputationByMinPerDataset(DataPreprocessingStep):
     operation = "imputation"
     method_description = "Impute missing values by the minimum per dataset"
 
-    input_keys = ["protein_df", "peptide_df", "shrinking_value"]
+    input_keys = ["protein_df", "shrinking_value"]
 
     def method(self, inputs):
         return imputation.by_min_per_dataset(**inputs)
@@ -252,7 +252,7 @@ class ImputationByMinPerProtein(DataPreprocessingStep):
     operation = "imputation"
     method_description = "Impute missing values by the minimum per protein"
 
-    input_keys = ["protein_df", "peptide_df", "shrinking_value"]
+    input_keys = ["protein_df", "shrinking_value"]
 
     def method(self, inputs):
         return imputation.by_min_per_protein(**inputs)
@@ -266,7 +266,7 @@ class ImputationByMinPerSample(DataPreprocessingStep):
     operation = "imputation"
     method_description = "Impute missing values by the minimum per sample"
 
-    input_keys = ["protein_df", "peptide_df", "shrinking_value"]
+    input_keys = ["protein_df", "shrinking_value"]
 
     def method(self, inputs):
         return imputation.by_min_per_protein(**inputs)
@@ -283,7 +283,7 @@ class SimpleImputationPerProtein(DataPreprocessingStep):
         "sklearn.SimpleImputer class"
     )
 
-    input_keys = ["protein_df", "peptide_df", "strategy"]
+    input_keys = ["protein_df", "strategy"]
 
     def method(self, inputs):
         return imputation.by_simple_imputer(**inputs)
@@ -301,7 +301,7 @@ class ImputationByKNN(DataPreprocessingStep):
         "the features that neither is missing are close."
     )
 
-    input_keys = ["protein_df", "peptide_df", "number_of_neighbours"]
+    input_keys = ["protein_df", "number_of_neighbours"]
 
     def method(self, inputs):
         return imputation.by_knn(**inputs)
@@ -315,7 +315,7 @@ class ImputationByNormalDistributionSampling(DataPreprocessingStep):
     operation = "imputation"
     method_description = "Imputation methods include normal distribution sampling per protein or per dataset"
 
-    input_keys = ["protein_df", "peptide_df", "strategy", "down_shift", "scaling_factor"]
+    input_keys = ["protein_df", "strategy", "down_shift", "scaling_factor"]
 
     def method(self, inputs):
         return imputation.by_normal_distribution_sampling(**inputs)