From 8a49a7bf9b12fbb39f0e46fae81e71eef04b4e21 Mon Sep 17 00:00:00 2001 From: "janni.roebbecke" Date: Wed, 5 Jun 2024 17:06:57 +0200 Subject: [PATCH] implement requested changes: make the peptide_df input optional in the preprocessing steps that keep it, and remove the peptide_df input from preprocessing steps where it is not used --- .../data_preprocessing/filter_proteins.py | 4 ++-- .../data_preprocessing/filter_samples.py | 6 +++--- protzilla/data_preprocessing/imputation.py | 6 ------ protzilla/data_preprocessing/normalisation.py | 6 ++---- .../data_preprocessing/outlier_detection.py | 6 +++--- .../data_preprocessing/transformation.py | 2 +- protzilla/methods/data_preprocessing.py | 20 +++++++++---------- 7 files changed, 21 insertions(+), 29 deletions(-) diff --git a/protzilla/data_preprocessing/filter_proteins.py b/protzilla/data_preprocessing/filter_proteins.py index a597c8c7..5e0bb7b8 100644 --- a/protzilla/data_preprocessing/filter_proteins.py +++ b/protzilla/data_preprocessing/filter_proteins.py @@ -5,8 +5,8 @@ def by_samples_missing( - protein_df: pd.DataFrame = None, - peptide_df: pd.DataFrame = None, + protein_df: pd.DataFrame | None, + peptide_df: pd.DataFrame | None, percentage: float = 0.5, ) -> dict: """ diff --git a/protzilla/data_preprocessing/filter_samples.py b/protzilla/data_preprocessing/filter_samples.py index da1fa1aa..626aabb4 100644 --- a/protzilla/data_preprocessing/filter_samples.py +++ b/protzilla/data_preprocessing/filter_samples.py @@ -6,7 +6,7 @@ def by_protein_intensity_sum( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, + peptide_df: pd.DataFrame | None, deviation_threshold: float, ) -> dict: """ @@ -47,7 +47,7 @@ def by_protein_intensity_sum( def by_protein_count( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, + peptide_df: pd.DataFrame | None, deviation_threshold: float, ) -> dict: """ @@ -93,7 +93,7 @@ def by_protein_count( def by_proteins_missing( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, + peptide_df: pd.DataFrame | None, 
percentage: float, ) -> dict: """ diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index d4f6fdd1..48af773b 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -57,7 +57,6 @@ def flag_invalid_values(df: pd.DataFrame, messages: list) -> dict: def by_knn( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, number_of_neighbours: int = 5, **kwargs, # quantile, default is median ) -> dict: @@ -105,7 +104,6 @@ def by_knn( def by_simple_imputer( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, strategy: str = "mean", ) -> dict: """ @@ -146,7 +144,6 @@ def by_simple_imputer( def by_min_per_sample( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, shrinking_value: float = 1, ) -> dict: """ @@ -189,7 +186,6 @@ def by_min_per_sample( def by_min_per_protein( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, shrinking_value: float = 1, ) -> dict: """ @@ -233,7 +229,6 @@ def by_min_per_protein( def by_min_per_dataset( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, shrinking_value: float = 1, ) -> dict: """ @@ -264,7 +259,6 @@ def by_min_per_dataset( def by_normal_distribution_sampling( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, strategy: str = "perProtein", down_shift: float = 0, scaling_factor: float = 1, diff --git a/protzilla/data_preprocessing/normalisation.py b/protzilla/data_preprocessing/normalisation.py index 5b7929a5..e2be755b 100644 --- a/protzilla/data_preprocessing/normalisation.py +++ b/protzilla/data_preprocessing/normalisation.py @@ -8,7 +8,7 @@ from protzilla.utilities import default_intensity_column -def by_z_score(protein_df: pd.DataFrame, peptide_df: pd.DataFrame) -> dict: +def by_z_score(protein_df: pd.DataFrame) -> dict: """ A function to run the sklearn StandardScaler class on your dataframe. Normalises the data on the level of each sample. 
@@ -49,7 +49,6 @@ def by_z_score(protein_df: pd.DataFrame, peptide_df: pd.DataFrame) -> dict: def by_median( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, percentile=0.5, # quartile, default is median ) -> dict: """ @@ -114,7 +113,7 @@ def by_median( ) -def by_totalsum(protein_df: pd.DataFrame, peptide_df: pd.DataFrame) -> dict: +def by_totalsum(protein_df: pd.DataFrame) -> dict: """ A function to perform normalisation using the total sum of sample intensities on your dataframe. @@ -172,7 +171,6 @@ def by_totalsum(protein_df: pd.DataFrame, peptide_df: pd.DataFrame) -> dict: def by_reference_protein( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, reference_protein: str, ) -> dict: """ diff --git a/protzilla/data_preprocessing/outlier_detection.py b/protzilla/data_preprocessing/outlier_detection.py index d279a118..be7b8eb8 100644 --- a/protzilla/data_preprocessing/outlier_detection.py +++ b/protzilla/data_preprocessing/outlier_detection.py @@ -15,7 +15,7 @@ def by_isolation_forest( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, + peptide_df: pd.DataFrame | None, n_estimators: int = 100, n_jobs: int = -1, ) -> dict: @@ -85,7 +85,7 @@ def by_isolation_forest( def by_local_outlier_factor( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, + peptide_df: pd.DataFrame | None, number_of_neighbors: int = 20, n_jobs: int = -1, ) -> dict: @@ -148,7 +148,7 @@ def by_local_outlier_factor( def by_pca( protein_df: pd.DataFrame, - peptide_df: pd.DataFrame, + peptide_df: pd.DataFrame | None, threshold: int = 2, number_of_components: int = 3, ) -> dict: diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py index f331195a..221b01ab 100644 --- a/protzilla/data_preprocessing/transformation.py +++ b/protzilla/data_preprocessing/transformation.py @@ -5,7 +5,7 @@ from protzilla.utilities import default_intensity_column -def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame, log_base="log10") -> dict: 
+def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10") -> dict: """ This function log-transforms intensity DataFrames. Supports log-transformation to the base diff --git a/protzilla/methods/data_preprocessing.py b/protzilla/methods/data_preprocessing.py index 85d9b059..0565eaf0 100644 --- a/protzilla/methods/data_preprocessing.py +++ b/protzilla/methods/data_preprocessing.py @@ -182,7 +182,7 @@ class NormalisationByZScore(DataPreprocessingStep): operation = "normalisation" method_description = "Normalise data by Z-Score" - plot_input_names = ["protein_df", "peptide_df"] + plot_input_names = ["protein_df"] def method(self, inputs): return normalisation.by_z_score(**inputs) @@ -196,7 +196,7 @@ class NormalisationByTotalSum(DataPreprocessingStep): operation = "normalisation" method_description = "Normalise data by total sum" - plot_input_names = ["protein_df", "peptide_df"] + plot_input_names = ["protein_df"] def method(self, inputs): return normalisation.by_totalsum(**inputs) @@ -210,7 +210,7 @@ class NormalisationByMedian(DataPreprocessingStep): operation = "normalisation" method_description = "Normalise data by median" - input_keys = ["protein_df", "peptide_df", "percentile"] + input_keys = ["protein_df", "percentile"] def method(self, inputs): return normalisation.by_median(**inputs) @@ -224,7 +224,7 @@ class NormalisationByReferenceProtein(DataPreprocessingStep): operation = "normalisation" method_description = "Normalise data by reference protein" - input_keys = ["protein_df", "peptide_df", "reference_protein"] + input_keys = ["protein_df", "reference_protein"] def method(self, inputs): return normalisation.by_reference_protein(**inputs) @@ -238,7 +238,7 @@ class ImputationByMinPerDataset(DataPreprocessingStep): operation = "imputation" method_description = "Impute missing values by the minimum per dataset" - input_keys = ["protein_df", "peptide_df", "shrinking_value"] + input_keys = ["protein_df", "shrinking_value"] def 
method(self, inputs): return imputation.by_min_per_dataset(**inputs) @@ -252,7 +252,7 @@ class ImputationByMinPerProtein(DataPreprocessingStep): operation = "imputation" method_description = "Impute missing values by the minimum per protein" - input_keys = ["protein_df", "peptide_df", "shrinking_value"] + input_keys = ["protein_df", "shrinking_value"] def method(self, inputs): return imputation.by_min_per_protein(**inputs) @@ -266,7 +266,7 @@ class ImputationByMinPerSample(DataPreprocessingStep): operation = "imputation" method_description = "Impute missing values by the minimum per sample" - input_keys = ["protein_df", "peptide_df", "shrinking_value"] + input_keys = ["protein_df", "shrinking_value"] def method(self, inputs): return imputation.by_min_per_protein(**inputs) @@ -283,7 +283,7 @@ class SimpleImputationPerProtein(DataPreprocessingStep): "sklearn.SimpleImputer class" ) - input_keys = ["protein_df", "peptide_df", "strategy"] + input_keys = ["protein_df", "strategy"] def method(self, inputs): return imputation.by_simple_imputer(**inputs) @@ -301,7 +301,7 @@ class ImputationByKNN(DataPreprocessingStep): "the features that neither is missing are close." ) - input_keys = ["protein_df", "peptide_df", "number_of_neighbours"] + input_keys = ["protein_df", "number_of_neighbours"] def method(self, inputs): return imputation.by_knn(**inputs) @@ -315,7 +315,7 @@ class ImputationByNormalDistributionSampling(DataPreprocessingStep): operation = "imputation" method_description = "Imputation methods include normal distribution sampling per protein or per dataset" - input_keys = ["protein_df", "peptide_df", "strategy", "down_shift", "scaling_factor"] + input_keys = ["protein_df", "strategy", "down_shift", "scaling_factor"] def method(self, inputs): return imputation.by_normal_distribution_sampling(**inputs)