Skip to content

Commit

Permalink
implement requested changes: make peptide_df input optional for all preprocessing steps, remove peptide_df input in preprocessing steps where it is not used
Browse files Browse the repository at this point in the history
  • Loading branch information
janni.roebbecke committed Jun 5, 2024
1 parent c13c6b5 commit 8a49a7b
Show file tree
Hide file tree
Showing 7 changed files with 21 additions and 29 deletions.
4 changes: 2 additions & 2 deletions protzilla/data_preprocessing/filter_proteins.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@


def by_samples_missing(
protein_df: pd.DataFrame = None,
peptide_df: pd.DataFrame = None,
protein_df: pd.DataFrame | None,
peptide_df: pd.DataFrame | None,
percentage: float = 0.5,
) -> dict:
"""
Expand Down
6 changes: 3 additions & 3 deletions protzilla/data_preprocessing/filter_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def by_protein_intensity_sum(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
peptide_df: pd.DataFrame | None,
deviation_threshold: float,
) -> dict:
"""
Expand Down Expand Up @@ -47,7 +47,7 @@ def by_protein_intensity_sum(

def by_protein_count(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
peptide_df: pd.DataFrame | None,
deviation_threshold: float,
) -> dict:
"""
Expand Down Expand Up @@ -93,7 +93,7 @@ def by_protein_count(

def by_proteins_missing(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
peptide_df: pd.DataFrame | None,
percentage: float,
) -> dict:
"""
Expand Down
6 changes: 0 additions & 6 deletions protzilla/data_preprocessing/imputation.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ def flag_invalid_values(df: pd.DataFrame, messages: list) -> dict:

def by_knn(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
number_of_neighbours: int = 5,
**kwargs, # quantile, default is median
) -> dict:
Expand Down Expand Up @@ -105,7 +104,6 @@ def by_knn(

def by_simple_imputer(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
strategy: str = "mean",
) -> dict:
"""
Expand Down Expand Up @@ -146,7 +144,6 @@ def by_simple_imputer(

def by_min_per_sample(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
shrinking_value: float = 1,
) -> dict:
"""
Expand Down Expand Up @@ -189,7 +186,6 @@ def by_min_per_sample(

def by_min_per_protein(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
shrinking_value: float = 1,
) -> dict:
"""
Expand Down Expand Up @@ -233,7 +229,6 @@ def by_min_per_protein(

def by_min_per_dataset(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
shrinking_value: float = 1,
) -> dict:
"""
Expand Down Expand Up @@ -264,7 +259,6 @@ def by_min_per_dataset(

def by_normal_distribution_sampling(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
strategy: str = "perProtein",
down_shift: float = 0,
scaling_factor: float = 1,
Expand Down
6 changes: 2 additions & 4 deletions protzilla/data_preprocessing/normalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from protzilla.utilities import default_intensity_column


def by_z_score(protein_df: pd.DataFrame, peptide_df: pd.DataFrame) -> dict:
def by_z_score(protein_df: pd.DataFrame) -> dict:
"""
A function to run the sklearn StandardScaler class on your dataframe.
Normalises the data on the level of each sample.
Expand Down Expand Up @@ -49,7 +49,6 @@ def by_z_score(protein_df: pd.DataFrame, peptide_df: pd.DataFrame) -> dict:

def by_median(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
percentile=0.5, # quartile, default is median
) -> dict:
"""
Expand Down Expand Up @@ -114,7 +113,7 @@ def by_median(
)


def by_totalsum(protein_df: pd.DataFrame, peptide_df: pd.DataFrame) -> dict:
def by_totalsum(protein_df: pd.DataFrame) -> dict:
"""
A function to perform normalisation using the total sum
of sample intensities on your dataframe.
Expand Down Expand Up @@ -172,7 +171,6 @@ def by_totalsum(protein_df: pd.DataFrame, peptide_df: pd.DataFrame) -> dict:

def by_reference_protein(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
reference_protein: str,
) -> dict:
"""
Expand Down
6 changes: 3 additions & 3 deletions protzilla/data_preprocessing/outlier_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

def by_isolation_forest(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
peptide_df: pd.DataFrame | None,
n_estimators: int = 100,
n_jobs: int = -1,
) -> dict:
Expand Down Expand Up @@ -85,7 +85,7 @@ def by_isolation_forest(

def by_local_outlier_factor(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
peptide_df: pd.DataFrame | None,
number_of_neighbors: int = 20,
n_jobs: int = -1,
) -> dict:
Expand Down Expand Up @@ -148,7 +148,7 @@ def by_local_outlier_factor(

def by_pca(
protein_df: pd.DataFrame,
peptide_df: pd.DataFrame,
peptide_df: pd.DataFrame | None,
threshold: int = 2,
number_of_components: int = 3,
) -> dict:
Expand Down
2 changes: 1 addition & 1 deletion protzilla/data_preprocessing/transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from protzilla.utilities import default_intensity_column


def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame, log_base="log10") -> dict:
def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10") -> dict:
"""
This function log-transforms intensity
DataFrames. Supports log-transformation to the base
Expand Down
20 changes: 10 additions & 10 deletions protzilla/methods/data_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ class NormalisationByZScore(DataPreprocessingStep):
operation = "normalisation"
method_description = "Normalise data by Z-Score"

plot_input_names = ["protein_df", "peptide_df"]
plot_input_names = ["protein_df"]

def method(self, inputs):
    """
    Run the Z-score normalisation for this step.

    :param inputs: mapping of keyword arguments, forwarded unchanged
        to ``normalisation.by_z_score``
    :return: the result of ``normalisation.by_z_score``
    """
    outputs = normalisation.by_z_score(**inputs)
    return outputs
Expand All @@ -196,7 +196,7 @@ class NormalisationByTotalSum(DataPreprocessingStep):
operation = "normalisation"
method_description = "Normalise data by total sum"

plot_input_names = ["protein_df", "peptide_df"]
plot_input_names = ["protein_df"]

def method(self, inputs):
    """
    Run the total-sum normalisation for this step.

    :param inputs: mapping of keyword arguments, forwarded unchanged
        to ``normalisation.by_totalsum``
    :return: the result of ``normalisation.by_totalsum``
    """
    outputs = normalisation.by_totalsum(**inputs)
    return outputs
Expand All @@ -210,7 +210,7 @@ class NormalisationByMedian(DataPreprocessingStep):
operation = "normalisation"
method_description = "Normalise data by median"

input_keys = ["protein_df", "peptide_df", "percentile"]
input_keys = ["protein_df", "percentile"]

def method(self, inputs):
    """
    Run the median normalisation for this step.

    :param inputs: mapping of keyword arguments, forwarded unchanged
        to ``normalisation.by_median``
    :return: the result of ``normalisation.by_median``
    """
    outputs = normalisation.by_median(**inputs)
    return outputs
Expand All @@ -224,7 +224,7 @@ class NormalisationByReferenceProtein(DataPreprocessingStep):
operation = "normalisation"
method_description = "Normalise data by reference protein"

input_keys = ["protein_df", "peptide_df", "reference_protein"]
input_keys = ["protein_df", "reference_protein"]

def method(self, inputs):
    """
    Run the reference-protein normalisation for this step.

    :param inputs: mapping of keyword arguments, forwarded unchanged
        to ``normalisation.by_reference_protein``
    :return: the result of ``normalisation.by_reference_protein``
    """
    outputs = normalisation.by_reference_protein(**inputs)
    return outputs
Expand All @@ -238,7 +238,7 @@ class ImputationByMinPerDataset(DataPreprocessingStep):
operation = "imputation"
method_description = "Impute missing values by the minimum per dataset"

input_keys = ["protein_df", "peptide_df", "shrinking_value"]
input_keys = ["protein_df", "shrinking_value"]

def method(self, inputs):
    """
    Run the minimum-per-dataset imputation for this step.

    :param inputs: mapping of keyword arguments, forwarded unchanged
        to ``imputation.by_min_per_dataset``
    :return: the result of ``imputation.by_min_per_dataset``
    """
    outputs = imputation.by_min_per_dataset(**inputs)
    return outputs
Expand All @@ -252,7 +252,7 @@ class ImputationByMinPerProtein(DataPreprocessingStep):
operation = "imputation"
method_description = "Impute missing values by the minimum per protein"

input_keys = ["protein_df", "peptide_df", "shrinking_value"]
input_keys = ["protein_df", "shrinking_value"]

def method(self, inputs):
    """
    Run the minimum-per-protein imputation for this step.

    :param inputs: mapping of keyword arguments, forwarded unchanged
        to ``imputation.by_min_per_protein``
    :return: the result of ``imputation.by_min_per_protein``
    """
    outputs = imputation.by_min_per_protein(**inputs)
    return outputs
Expand All @@ -266,7 +266,7 @@ class ImputationByMinPerSample(DataPreprocessingStep):
operation = "imputation"
method_description = "Impute missing values by the minimum per sample"

input_keys = ["protein_df", "peptide_df", "shrinking_value"]
input_keys = ["protein_df", "shrinking_value"]

def method(self, inputs):
    """
    Run the minimum-per-sample imputation for this step.

    :param inputs: mapping of keyword arguments, forwarded unchanged
        to ``imputation.by_min_per_sample``
    :return: the result of ``imputation.by_min_per_sample``
    """
    # This step imputes per sample, so it must dispatch to the per-sample
    # implementation rather than to ``by_min_per_protein``.
    return imputation.by_min_per_sample(**inputs)
Expand All @@ -283,7 +283,7 @@ class SimpleImputationPerProtein(DataPreprocessingStep):
"sklearn.SimpleImputer class"
)

input_keys = ["protein_df", "peptide_df", "strategy"]
input_keys = ["protein_df", "strategy"]

def method(self, inputs):
    """
    Run the simple-imputer imputation for this step.

    :param inputs: mapping of keyword arguments, forwarded unchanged
        to ``imputation.by_simple_imputer``
    :return: the result of ``imputation.by_simple_imputer``
    """
    outputs = imputation.by_simple_imputer(**inputs)
    return outputs
Expand All @@ -301,7 +301,7 @@ class ImputationByKNN(DataPreprocessingStep):
"the features that neither is missing are close."
)

input_keys = ["protein_df", "peptide_df", "number_of_neighbours"]
input_keys = ["protein_df", "number_of_neighbours"]

def method(self, inputs):
    """
    Run the k-nearest-neighbours imputation for this step.

    :param inputs: mapping of keyword arguments, forwarded unchanged
        to ``imputation.by_knn``
    :return: the result of ``imputation.by_knn``
    """
    outputs = imputation.by_knn(**inputs)
    return outputs
Expand All @@ -315,7 +315,7 @@ class ImputationByNormalDistributionSampling(DataPreprocessingStep):
operation = "imputation"
method_description = "Imputation methods include normal distribution sampling per protein or per dataset"

input_keys = ["protein_df", "peptide_df", "strategy", "down_shift", "scaling_factor"]
input_keys = ["protein_df", "strategy", "down_shift", "scaling_factor"]

def method(self, inputs):
    """
    Run the normal-distribution-sampling imputation for this step.

    :param inputs: mapping of keyword arguments, forwarded unchanged
        to ``imputation.by_normal_distribution_sampling``
    :return: the result of ``imputation.by_normal_distribution_sampling``
    """
    outputs = imputation.by_normal_distribution_sampling(**inputs)
    return outputs
Expand Down

0 comments on commit 8a49a7b

Please sign in to comment.