From 4590cb54dd22de4af8de42fcff4209b6fa287054 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Thu, 9 Nov 2023 15:42:23 +0100 Subject: [PATCH 01/24] Implement simple per Protein ND-Sampling Imputation method --- protzilla/constants/location_mapping.py | 5 ++ protzilla/constants/workflow_meta.json | 63 +++++++++++++++++++++ protzilla/data_preprocessing/imputation.py | 65 ++++++++++++++++++++++ 3 files changed, 133 insertions(+) diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py index aa59e2cd1..95c229e1b 100644 --- a/protzilla/constants/location_mapping.py +++ b/protzilla/constants/location_mapping.py @@ -313,6 +313,11 @@ "imputation", "min_value_per_dataset", ): imputation.by_min_per_dataset_plot, + ( + "data_preprocessing", + "imputation", + "normal_distribution_sampling_per_protein", + ): imputation.by_normal_distribution_sampling, ( "data_preprocessing", "outlier_detection", diff --git a/protzilla/constants/workflow_meta.json b/protzilla/constants/workflow_meta.json index 14ba36f09..e7a8f0fad 100644 --- a/protzilla/constants/workflow_meta.json +++ b/protzilla/constants/workflow_meta.json @@ -677,6 +677,69 @@ } } ] + }, + "normal_distribution_sampling_per_protein": { + "name": "Normal Distribution Sampling", + "description": "Imputation methods include normal distribution sampling per Protein or over Dataset", + "parameters": { + "strategy": { + "name": "Strategy:", + "type": "categorical", + "categories": [ + "perProtein", + "perDataset" + ], + "default": "perProtein" + }, + "down_shift": { + "name": "Downshift:", + "type": "numeric", + "min": -10, + "max": 10, + "default": -1 + }, + "scaling_factor": { + "name": "Scaling Factor:", + "type": "numeric", + "min": 0, + "max": 1, + "default": 1 + } + }, + "graphs": [ + { + "graph_type": { + "name": "Graph type:", + "type": "categorical", + "categories": [ + "Boxplot", + "Histogram" + ], + "default": "Boxplot" + }, + "group_by": { + "name": "Group by:", + "type": 
"categorical", + "categories": [ + "None", + "Sample", + "Protein ID" + ], + "default": "None" + } + }, + { + "graph_type_quantities": { + "name": "Graph type Imputed Values:", + "type": "categorical", + "categories": [ + "Bar chart", + "Pie chart" + ], + "default": "Pie chart" + } + } + ] } }, "filter_peptides": { diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 14409a2f7..0a19bad6a 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -221,6 +221,71 @@ def by_min_per_dataset( return intensity_df_copy, dict() +def by_normal_distribution_sampling( + intensity_df: pd.DataFrame, + strategy="perProtein", + down_shift=0, + scaling_factor=1, +) -> tuple[pd.DataFrame, dict]: + """ + A function to perform imputation via sampling of a normal distribution + defined by the existing datapoints and user-defined parameters for down- + shifting and scaling. Imputes missing values for each protein taking into + account data from each protein. + + :param intensity_df: the dataframe that should be filtered in\ + long format + :type intensity_df: pandas DataFrame + :param strategy: which strategy to use for definition of the normal\ + distribution to be sampled. Can be "perProtein", "perDataset" or "most_frequent"\ + :type strategy: str + :param down_shift: a factor defining how many dataset standard deviations\ + to shift the mean of the normal distribution used for imputation.\ + Default: 0 (no shift) + :type down_shift: float + :param scaling_factor: a factor determining how the variance of the normal\ + distribution used for imputation is scaled compared to dataset. 
+ Default: 1 (no scaling) + :type down_shift: float + :return: returns an imputed dataframe in typical protzilla long format\ + and an empty dict + :rtype: pd.DataFrame, int + """ + assert strategy in ["perProtein", "perDataset"] + + transformed_df = long_to_wide(intensity_df) + transformed_df.dropna(axis=1, how="all", inplace=True) + + # protein_means = transformed_df.mean(axis=1) + # protein_std = transformed_df.std(axis=1) + # scaled_protein_means = protein_means + down_shift * protein_std + # scaled_protein_std = protein_std * scaling_factor + + # Iterate over proteins to impute minimal value + if strategy == "perProtein": + for column in transformed_df.columns: + # determine mean (loc) + protein_mean = transformed_df[column].mean() + protein_std = transformed_df[column].std() + scaled_protein_mean = max(0, protein_mean + down_shift * protein_std) + scaled_protein_std = protein_std * scaling_factor + # determine standard deviation (scale) + value_to_be_imputed = abs( + np.random.normal( + loc=scaled_protein_mean, + scale=scaled_protein_std, + ) + ) + transformed_df[column].fillna(value_to_be_imputed, inplace=True) + else: + pass + # determine mean of normal distribution of dataset + # TODO + + imputed_df = wide_to_long(transformed_df, intensity_df) + return imputed_df, dict() + + def by_knn_plot( df, result_df, current_out, graph_type, graph_type_quantities, group_by ): From efb0a0464636b3cd7f1c8e5e7d451a39e838de9f Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Mon, 13 Nov 2023 16:30:52 +0100 Subject: [PATCH 02/24] generalize python mapping name for ND sampling imputation, append missing location mapping --- protzilla/constants/location_mapping.py | 7 ++++++- protzilla/constants/workflow_meta.json | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py index 95c229e1b..91e4b0e91 100644 --- a/protzilla/constants/location_mapping.py +++ 
b/protzilla/constants/location_mapping.py @@ -127,6 +127,11 @@ "imputation", "min_value_per_dataset", ): imputation.by_min_per_dataset, + ( + "data_preprocessing", + "imputation", + "normal_distribution_sampling", + ): imputation.by_normal_distribution_sampling, ( "data_preprocessing", "filter_peptides", @@ -316,7 +321,7 @@ ( "data_preprocessing", "imputation", - "normal_distribution_sampling_per_protein", + "normal_distribution_sampling", ): imputation.by_normal_distribution_sampling, ( "data_preprocessing", diff --git a/protzilla/constants/workflow_meta.json b/protzilla/constants/workflow_meta.json index e7a8f0fad..1f76ffe63 100644 --- a/protzilla/constants/workflow_meta.json +++ b/protzilla/constants/workflow_meta.json @@ -678,7 +678,7 @@ } ] }, - "normal_distribution_sampling_per_protein": { + "normal_distribution_sampling": { "name": "Normal Distribution Sampling", "description": "Imputation methods include normal distribution sampling per Protein or over Dataset", "parameters": { From 7ca6f60a1f2ef934a117f9a5830ee6ea74da7518 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Mon, 13 Nov 2023 16:33:16 +0100 Subject: [PATCH 03/24] ND-sampling imputation now uses log-transformed intensities for sampling --- protzilla/data_preprocessing/imputation.py | 23 +++++++++++----------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 0a19bad6a..28060471e 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -256,32 +256,31 @@ def by_normal_distribution_sampling( transformed_df = long_to_wide(intensity_df) transformed_df.dropna(axis=1, how="all", inplace=True) - # protein_means = transformed_df.mean(axis=1) - # protein_std = transformed_df.std(axis=1) - # scaled_protein_means = protein_means + down_shift * protein_std - # scaled_protein_std = protein_std * scaling_factor - - # Iterate over proteins to impute 
minimal value + # TODO: sample the normal distribution for each missing value instead of sampling once for all missing values if strategy == "perProtein": for column in transformed_df.columns: - # determine mean (loc) - protein_mean = transformed_df[column].mean() - protein_std = transformed_df[column].std() + # determine mean and standard deviation of log-transformed protein intensties + protein_mean = np.log10(transformed_df[column]).mean() + protein_std = np.log10(transformed_df[column]).std() + # calculate mean and standard deviation of normal distribution to be sampled scaled_protein_mean = max(0, protein_mean + down_shift * protein_std) scaled_protein_std = protein_std * scaling_factor - # determine standard deviation (scale) - value_to_be_imputed = abs( + # sample from normal distribution and transform back to linear scale + log_value_to_be_imputed = abs( np.random.normal( loc=scaled_protein_mean, scale=scaled_protein_std, ) ) + value_to_be_imputed = 10**log_value_to_be_imputed + # impute missing values for current protein group transformed_df[column].fillna(value_to_be_imputed, inplace=True) else: pass # determine mean of normal distribution of dataset - # TODO + # TODO: implement perDataset strategy + # Turn the wide format into the long format and return imputed dataframe imputed_df = wide_to_long(transformed_df, intensity_df) return imputed_df, dict() From 812449dc306cc332d133caab870aec80c804c010 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Mon, 13 Nov 2023 16:40:42 +0100 Subject: [PATCH 04/24] ND sampling imputation now samples for each missing value instead of sampling once for entire protein group --- protzilla/data_preprocessing/imputation.py | 25 +++++++++++----------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 28060471e..c3705a2a6 100644 --- a/protzilla/data_preprocessing/imputation.py +++ 
b/protzilla/data_preprocessing/imputation.py @@ -256,25 +256,26 @@ def by_normal_distribution_sampling( transformed_df = long_to_wide(intensity_df) transformed_df.dropna(axis=1, how="all", inplace=True) - # TODO: sample the normal distribution for each missing value instead of sampling once for all missing values if strategy == "perProtein": for column in transformed_df.columns: - # determine mean and standard deviation of log-transformed protein intensties + # determine mean and standard deviation of log-transformed protein intensities protein_mean = np.log10(transformed_df[column]).mean() protein_std = np.log10(transformed_df[column]).std() # calculate mean and standard deviation of normal distribution to be sampled scaled_protein_mean = max(0, protein_mean + down_shift * protein_std) scaled_protein_std = protein_std * scaling_factor - # sample from normal distribution and transform back to linear scale - log_value_to_be_imputed = abs( - np.random.normal( - loc=scaled_protein_mean, - scale=scaled_protein_std, - ) - ) - value_to_be_imputed = 10**log_value_to_be_imputed - # impute missing values for current protein group - transformed_df[column].fillna(value_to_be_imputed, inplace=True) + # iterate over all values of current protein group + for index, value in transformed_df[column].iteritems(): + # if value is NaN, sample from normal distribution and impute value + if np.isnan(value): + log_value_to_be_imputed = abs( + np.random.normal( + loc=scaled_protein_mean, + scale=scaled_protein_std, + ) + ) + value_to_be_imputed = 10**log_value_to_be_imputed + transformed_df[column].loc[index] = value_to_be_imputed else: pass # determine mean of normal distribution of dataset From 9afddfca8f0fee17be77742b0c71ba0a6acd7bd1 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Tue, 14 Nov 2023 14:29:06 +0100 Subject: [PATCH 05/24] fix location mapping for normal distrubiton sampling imputation --- protzilla/constants/location_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py index 91e4b0e91..9f35637b9 100644 --- a/protzilla/constants/location_mapping.py +++ b/protzilla/constants/location_mapping.py @@ -322,7 +322,7 @@ "data_preprocessing", "imputation", "normal_distribution_sampling", - ): imputation.by_normal_distribution_sampling, + ): imputation.by_normal_distribution_sampling_plot, ( "data_preprocessing", "outlier_detection", From a92941f4a23de6a5af39995e5c76cd9106ff69ab Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Tue, 14 Nov 2023 17:46:37 +0100 Subject: [PATCH 06/24] implement fast perProtein and perDataset normal dist. sampling imputation method --- protzilla/data_preprocessing/imputation.py | 115 ++++++++++++++++----- 1 file changed, 89 insertions(+), 26 deletions(-) diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index c3705a2a6..0ce54398b 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -232,6 +232,8 @@ def by_normal_distribution_sampling( defined by the existing datapoints and user-defined parameters for down- shifting and scaling. Imputes missing values for each protein taking into account data from each protein. + The downshifted normal distribution that will be sampled for imputation has a lower + limit for the mean of at 0, meaning that if the downshifted mean were to be negative, it will be set at 0. 
:param intensity_df: the dataframe that should be filtered in\ long format @@ -253,37 +255,79 @@ def by_normal_distribution_sampling( """ assert strategy in ["perProtein", "perDataset"] - transformed_df = long_to_wide(intensity_df) - transformed_df.dropna(axis=1, how="all", inplace=True) - if strategy == "perProtein": - for column in transformed_df.columns: + transformed_df = long_to_wide(intensity_df) + # iterate over all protein groups + for protein_grp in transformed_df.columns: + # determine number of missing values + number_of_nans = transformed_df[protein_grp].isnull().sum() + + # get indices of NaN values in current protein group + location_of_nans = transformed_df[protein_grp].isnull() + indices_of_nans = location_of_nans[location_of_nans].index + # determine mean and standard deviation of log-transformed protein intensities - protein_mean = np.log10(transformed_df[column]).mean() - protein_std = np.log10(transformed_df[column]).std() + protein_grp_mean = np.log10(transformed_df[protein_grp]).mean(skipna=True) + protein_grp_std = np.log10(transformed_df[protein_grp]).std(skipna=True) + # calculate mean and standard deviation of normal distribution to be sampled - scaled_protein_mean = max(0, protein_mean + down_shift * protein_std) - scaled_protein_std = protein_std * scaling_factor - # iterate over all values of current protein group - for index, value in transformed_df[column].iteritems(): - # if value is NaN, sample from normal distribution and impute value - if np.isnan(value): - log_value_to_be_imputed = abs( - np.random.normal( - loc=scaled_protein_mean, - scale=scaled_protein_std, - ) - ) - value_to_be_imputed = 10**log_value_to_be_imputed - transformed_df[column].loc[index] = value_to_be_imputed + sampling_mean = max(0, protein_grp_mean + down_shift * protein_grp_std) + sampling_std = protein_grp_std * scaling_factor + + # calculate log-transformed values to be imputed + log_impute_values = abs( + np.random.normal( + loc=sampling_mean, + 
scale=sampling_std, + size=number_of_nans, + ) + ) + + # transform log-transformed values to be imputed back to normal scale and round to nearest integer + impute_values = np.round(10**log_impute_values, decimals=0) + + # zip indices of NaN values with values to be imputed together as a Series, such that fillna can be used + impute_value_series = pd.Series(impute_values, index=indices_of_nans) + transformed_df[protein_grp].fillna(impute_value_series, inplace=True) + + imputed_df = wide_to_long(transformed_df, intensity_df) + return imputed_df, dict() + else: - pass - # determine mean of normal distribution of dataset - # TODO: implement perDataset strategy + # deep copy the dataframe + intensity_type = intensity_df.columns[3] + + # get indices of NaN values in current protein group + location_of_nans = intensity_df[intensity_type].isnull() + indices_of_nans = location_of_nans[location_of_nans].index + # calculate the mean and standard deviation of the entire dataset + dataset_mean = np.log10(intensity_df[intensity_type]).mean() + dataset_std = np.log10(intensity_df[intensity_type]).std() + + # calculate mean and standard deviation of normal distribution to be sampled + scaled_dataset_mean = max(0, dataset_mean + down_shift * dataset_std) + scaled_dataset_std = dataset_std * scaling_factor + + # get number of NaN values in dataset + number_of_nans = intensity_df[intensity_type].isnull().sum() + + # calculate log-transformed values to be imputed + log_impute_values = abs( + np.random.normal( + loc=scaled_dataset_mean, + scale=scaled_dataset_std, + size=number_of_nans, + ) + ) - # Turn the wide format into the long format and return imputed dataframe - imputed_df = wide_to_long(transformed_df, intensity_df) - return imputed_df, dict() + # transform log-transformed values to be imputed back to normal scale and round to nearest integer + impute_values = np.round(10**log_impute_values, decimals=0) + + # zip indices of NaN values with values to be imputed together as a 
Series, such that fillna can be used + impute_value_series = pd.Series(impute_values, index=indices_of_nans) + intensity_df[intensity_type].fillna(impute_value_series, inplace=True) + + return intensity_df, dict() def by_knn_plot( @@ -294,6 +338,14 @@ def by_knn_plot( ) +def by_normal_distribution_sampling_plot( + df, result_df, current_out, graph_type, graph_type_quantities, group_by +): + return _build_box_hist_plot( + df, result_df, graph_type, graph_type_quantities, group_by + ) + + def by_simple_imputer_plot( df, result_df, current_out, graph_type, graph_type_quantities, group_by ): @@ -380,3 +432,14 @@ def _build_box_hist_plot( heading="Number of Imputed Values", ) return [fig1, fig2] + + +def xf(): + # load df from intensity_df.csv + df = pd.read_csv("intensity_df.csv") + # impute missing values + for i in range(1): + result_df, _ = by_normal_distribution_sampling( + df, strategy="perProtein", down_shift=0, scaling_factor=1 + ) + print("done") From 50af6a757e19ace2867cdf55c8872a30b0313ddc Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Tue, 14 Nov 2023 17:54:27 +0100 Subject: [PATCH 07/24] removed unnecessary testing function for code --- protzilla/data_preprocessing/imputation.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 0ce54398b..5e1c84f57 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -432,14 +432,3 @@ def _build_box_hist_plot( heading="Number of Imputed Values", ) return [fig1, fig2] - - -def xf(): - # load df from intensity_df.csv - df = pd.read_csv("intensity_df.csv") - # impute missing values - for i in range(1): - result_df, _ = by_normal_distribution_sampling( - df, strategy="perProtein", down_shift=0, scaling_factor=1 - ) - print("done") From dbd99da193d77039afd784ea0a7f3e5fe49f548b Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Tue, 14 Nov 2023 17:55:54 +0100 
Subject: [PATCH 08/24] comment fix --- protzilla/data_preprocessing/imputation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 5e1c84f57..9cd5e5356 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -294,7 +294,7 @@ def by_normal_distribution_sampling( return imputed_df, dict() else: - # deep copy the dataframe + # determine column for protein intensities intensity_type = intensity_df.columns[3] # get indices of NaN values in current protein group From 19491ef13f6172e4d9ffbde16550fc47cecaf35b Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Wed, 15 Nov 2023 11:03:17 +0100 Subject: [PATCH 09/24] ND-sampling imputation: complete tests, remove rounding and limiting sampel ND, do not impute if there are insufficient values in dataframe --- protzilla/data_preprocessing/imputation.py | 54 +++++++++++-------- .../data_preprocessing/test_imputation.py | 45 ++++++++++++++++ 2 files changed, 77 insertions(+), 22 deletions(-) diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 9cd5e5356..a0834c3c9 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -226,29 +226,33 @@ def by_normal_distribution_sampling( strategy="perProtein", down_shift=0, scaling_factor=1, + round_values=False, ) -> tuple[pd.DataFrame, dict]: """ A function to perform imputation via sampling of a normal distribution defined by the existing datapoints and user-defined parameters for down- shifting and scaling. Imputes missing values for each protein taking into - account data from each protein. - The downshifted normal distribution that will be sampled for imputation has a lower - limit for the mean of at 0, meaning that if the downshifted mean were to be negative, it will be set at 0. 
- - :param intensity_df: the dataframe that should be filtered in\ + account data from each protein or the whole dataset. The data is log- + transformed before sampling from the normal distribution and transformed + back afterwards, meaning only values > 0 are imputed. + Will not impute if insufficient data is available for sampling. + :param intensity_df: the dataframe that should be filtered in long format :type intensity_df: pandas DataFrame - :param strategy: which strategy to use for definition of the normal\ - distribution to be sampled. Can be "perProtein", "perDataset" or "most_frequent"\ + :param strategy: which strategy to use for definition of the normal + distribution to be sampled. Can be "perProtein", "perDataset" or "most_frequent" :type strategy: str - :param down_shift: a factor defining how many dataset standard deviations\ - to shift the mean of the normal distribution used for imputation.\ + :param down_shift: a factor defining how many dataset standard deviations + to shift the mean of the normal distribution used for imputation. Default: 0 (no shift) :type down_shift: float - :param scaling_factor: a factor determining how the variance of the normal\ + :param scaling_factor: a factor determining how the variance of the normal distribution used for imputation is scaled compared to dataset. 
Default: 1 (no scaling) :type down_shift: float + :param round_values: whether to round the imputed values to the nearest integer + Default: False + :type round_values: bool :return: returns an imputed dataframe in typical protzilla long format\ and an empty dict :rtype: pd.DataFrame, int @@ -262,6 +266,10 @@ def by_normal_distribution_sampling( # determine number of missing values number_of_nans = transformed_df[protein_grp].isnull().sum() + # don't impute values if there not enough values (> 1) to sample from + if number_of_nans > len(transformed_df[protein_grp]) - 2: + continue + # get indices of NaN values in current protein group location_of_nans = transformed_df[protein_grp].isnull() indices_of_nans = location_of_nans[location_of_nans].index @@ -271,20 +279,18 @@ def by_normal_distribution_sampling( protein_grp_std = np.log10(transformed_df[protein_grp]).std(skipna=True) # calculate mean and standard deviation of normal distribution to be sampled - sampling_mean = max(0, protein_grp_mean + down_shift * protein_grp_std) + sampling_mean = protein_grp_mean + down_shift * protein_grp_std sampling_std = protein_grp_std * scaling_factor # calculate log-transformed values to be imputed - log_impute_values = abs( - np.random.normal( - loc=sampling_mean, - scale=sampling_std, - size=number_of_nans, - ) + log_impute_values = np.random.normal( + loc=sampling_mean, + scale=sampling_std, + size=number_of_nans, ) # transform log-transformed values to be imputed back to normal scale and round to nearest integer - impute_values = np.round(10**log_impute_values, decimals=0) + impute_values = 10**log_impute_values # zip indices of NaN values with values to be imputed together as a Series, such that fillna can be used impute_value_series = pd.Series(impute_values, index=indices_of_nans) @@ -297,9 +303,16 @@ def by_normal_distribution_sampling( # determine column for protein intensities intensity_type = intensity_df.columns[3] + # get number of NaN values in dataset + 
number_of_nans = intensity_df[intensity_type].isnull().sum() + + # throw error if dataset is basically empty, something went wrong + assert number_of_nans <= len(intensity_df[intensity_type]) - 2 + # get indices of NaN values in current protein group location_of_nans = intensity_df[intensity_type].isnull() indices_of_nans = location_of_nans[location_of_nans].index + # calculate the mean and standard deviation of the entire dataset dataset_mean = np.log10(intensity_df[intensity_type]).mean() dataset_std = np.log10(intensity_df[intensity_type]).std() @@ -308,9 +321,6 @@ def by_normal_distribution_sampling( scaled_dataset_mean = max(0, dataset_mean + down_shift * dataset_std) scaled_dataset_std = dataset_std * scaling_factor - # get number of NaN values in dataset - number_of_nans = intensity_df[intensity_type].isnull().sum() - # calculate log-transformed values to be imputed log_impute_values = abs( np.random.normal( @@ -321,7 +331,7 @@ def by_normal_distribution_sampling( ) # transform log-transformed values to be imputed back to normal scale and round to nearest integer - impute_values = np.round(10**log_impute_values, decimals=0) + impute_values = 10**log_impute_values # zip indices of NaN values with values to be imputed together as a Series, such that fillna can be used impute_value_series = pd.Series(impute_values, index=indices_of_nans) diff --git a/tests/protzilla/data_preprocessing/test_imputation.py b/tests/protzilla/data_preprocessing/test_imputation.py index 44b66bb13..40caba91a 100644 --- a/tests/protzilla/data_preprocessing/test_imputation.py +++ b/tests/protzilla/data_preprocessing/test_imputation.py @@ -9,6 +9,8 @@ by_min_per_protein_plot, by_min_per_sample, by_min_per_sample_plot, + by_normal_distribution_sampling, + by_normal_distribution_sampling_plot, by_simple_imputer, by_simple_imputer_plot, np, @@ -18,6 +20,11 @@ from tests.protzilla.data_preprocessing import test_plots_data_preprocessing +def protein_group_intensities(dataframe, 
protein_group_name): + # small helper function for tests + return dataframe[dataframe["Protein ID"] == protein_group_name]["Intensity"] + + @pytest.fixture def input_imputation_df(): test_intensity_list = ( @@ -266,6 +273,44 @@ def test_imputation_knn(show_figures, input_imputation_df, assertion_df_knn): {assertion_df} but is\n {result_df}" +@pytest.mark.order(2) +@pytest.mark.dependency(depends=["test_build_box_hist_plot"]) +def test_imputation_normal_distribution_sampling(show_figures, input_imputation_df): + # perform imputation on test data frame + result_df_perProtein = by_normal_distribution_sampling( + input_imputation_df, strategy="perProtein", down_shift=-10 + )[0] + result_df_perDataset = by_normal_distribution_sampling( + input_imputation_df, strategy="perDataset", down_shift=-10 + )[0] + + fig1, fig2 = by_normal_distribution_sampling_plot( + input_imputation_df, result_df_perProtein, {}, "Boxplot", "Bar chart", "Sample" + ) + if show_figures: + fig1.show() + fig2.show() + + assert ( + result_df_perProtein["Intensity"].min() >= 0 + ), f"Imputation by normal distribution sampling should not have negative values!" + assert ( + result_df_perDataset["Intensity"].min() >= 0 + ), f"Imputation by normal distribution sampling should not have negative values!" + + assert ( + False == protein_group_intensities(result_df_perProtein, "Protein1").hasnans + ) and ( + False == protein_group_intensities(result_df_perProtein, "Protein3").hasnans + ), f"Imputation by normal distribution sampling should not have NaN values!" + assert protein_group_intensities( + result_df_perProtein, "Protein2" + ).hasnans, f"This protein group should have NaN values! Not enough data points to sample from!" + assert ( + False == result_df_perDataset["Intensity"].hasnans + ), f"Imputation by normal distribution sampling per Dataset should not have NaN values!" 
+ + def test_number_of_imputed_values(input_imputation_df, assertion_df_knn): count = number_of_imputed_values(input_imputation_df, assertion_df_knn) assert ( From abbb68fc19ad0586e63f4e12eac278250f62c668 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20G=C3=A4rtner?= <104069093+henninggaertner@users.noreply.github.com> Date: Thu, 16 Nov 2023 17:07:58 +0100 Subject: [PATCH 10/24] Update issue.md template to automatically assign PROTzilla project --- .github/ISSUE_TEMPLATE/issue.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/issue.md b/.github/ISSUE_TEMPLATE/issue.md index 387d4a3ee..356273db8 100644 --- a/.github/ISSUE_TEMPLATE/issue.md +++ b/.github/ISSUE_TEMPLATE/issue.md @@ -3,7 +3,7 @@ name: Issue Template about: Standard Issue Template title: labels: -project: PROTzilla 2 +project: PROTzilla --- From cfe54281244bda20ba2d1171c6c3e81519e75665 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20G=C3=A4rtner?= <104069093+henninggaertner@users.noreply.github.com> Date: Fri, 17 Nov 2023 11:23:47 +0100 Subject: [PATCH 11/24] Update todo_issue.md to automatically assign correct project --- .github/ISSUE_TEMPLATE/todo_issue.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/todo_issue.md b/.github/ISSUE_TEMPLATE/todo_issue.md index d8c7fbb93..c20221c3b 100644 --- a/.github/ISSUE_TEMPLATE/todo_issue.md +++ b/.github/ISSUE_TEMPLATE/todo_issue.md @@ -3,7 +3,7 @@ name: TODO about: TODO Issue Template title: '# TODO ' labels: todo -project: 'PROTzilla2' +project: 'PROTzilla' --- From 7b2fb4f832ea0cb4542b18c2336b174b24c36262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20G=C3=A4rtner?= <104069093+henninggaertner@users.noreply.github.com> Date: Fri, 17 Nov 2023 12:50:03 +0100 Subject: [PATCH 12/24] Update issue.md --- .github/ISSUE_TEMPLATE/issue.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/issue.md b/.github/ISSUE_TEMPLATE/issue.md index 
356273db8..677a8ce72 100644 --- a/.github/ISSUE_TEMPLATE/issue.md +++ b/.github/ISSUE_TEMPLATE/issue.md @@ -3,7 +3,7 @@ name: Issue Template about: Standard Issue Template title: labels: -project: PROTzilla +project: "PROTzilla" --- From 5d65d0f20705cfe7507a51cd8f2727ff68f8202d Mon Sep 17 00:00:00 2001 From: "janni.roebbecke" Date: Fri, 17 Nov 2023 13:30:08 +0100 Subject: [PATCH 13/24] minor, purely stylistic changes --- protzilla/data_preprocessing/imputation.py | 24 +++++----------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index a0834c3c9..160112cfb 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -226,7 +226,6 @@ def by_normal_distribution_sampling( strategy="perProtein", down_shift=0, scaling_factor=1, - round_values=False, ) -> tuple[pd.DataFrame, dict]: """ A function to perform imputation via sampling of a normal distribution @@ -263,22 +262,18 @@ def by_normal_distribution_sampling( transformed_df = long_to_wide(intensity_df) # iterate over all protein groups for protein_grp in transformed_df.columns: - # determine number of missing values + number_of_nans = transformed_df[protein_grp].isnull().sum() # don't impute values if there not enough values (> 1) to sample from if number_of_nans > len(transformed_df[protein_grp]) - 2: continue - # get indices of NaN values in current protein group location_of_nans = transformed_df[protein_grp].isnull() indices_of_nans = location_of_nans[location_of_nans].index - # determine mean and standard deviation of log-transformed protein intensities protein_grp_mean = np.log10(transformed_df[protein_grp]).mean(skipna=True) protein_grp_std = np.log10(transformed_df[protein_grp]).std(skipna=True) - - # calculate mean and standard deviation of normal distribution to be sampled sampling_mean = protein_grp_mean + down_shift * protein_grp_std sampling_std = 
protein_grp_std * scaling_factor @@ -288,7 +283,6 @@ def by_normal_distribution_sampling( scale=sampling_std, size=number_of_nans, ) - # transform log-transformed values to be imputed back to normal scale and round to nearest integer impute_values = 10**log_impute_values @@ -303,33 +297,25 @@ def by_normal_distribution_sampling( # determine column for protein intensities intensity_type = intensity_df.columns[3] - # get number of NaN values in dataset number_of_nans = intensity_df[intensity_type].isnull().sum() - - # throw error if dataset is basically empty, something went wrong assert number_of_nans <= len(intensity_df[intensity_type]) - 2 - # get indices of NaN values in current protein group location_of_nans = intensity_df[intensity_type].isnull() indices_of_nans = location_of_nans[location_of_nans].index - # calculate the mean and standard deviation of the entire dataset dataset_mean = np.log10(intensity_df[intensity_type]).mean() dataset_std = np.log10(intensity_df[intensity_type]).std() - - # calculate mean and standard deviation of normal distribution to be sampled - scaled_dataset_mean = max(0, dataset_mean + down_shift * dataset_std) - scaled_dataset_std = dataset_std * scaling_factor + sampling_mean = max(0, dataset_mean + down_shift * dataset_std) + sampling_std = dataset_std * scaling_factor # calculate log-transformed values to be imputed log_impute_values = abs( np.random.normal( - loc=scaled_dataset_mean, - scale=scaled_dataset_std, + loc=sampling_mean, + scale=sampling_std, size=number_of_nans, ) ) - # transform log-transformed values to be imputed back to normal scale and round to nearest integer impute_values = 10**log_impute_values From 17ff747252c376f256bafcc2b46325a4e79f8bcf Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Fri, 17 Nov 2023 14:02:15 +0100 Subject: [PATCH 14/24] Preliminary, somewhat expandable metadata column renaming method --- protzilla/constants/location_mapping.py | 5 +++ protzilla/constants/workflow_meta.json | 50 
++++++++++++++++++++----- protzilla/importing/metadata_import.py | 32 ++++++++++++++++ protzilla/run_helper.py | 4 +- 4 files changed, 81 insertions(+), 10 deletions(-) diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py index aa59e2cd1..bbc82d783 100644 --- a/protzilla/constants/location_mapping.py +++ b/protzilla/constants/location_mapping.py @@ -41,6 +41,11 @@ "metadata_import", "metadata_import_method", ): metadata_import.metadata_import_method, + ( + "importing", + "metadata_import", + "metadata_column_assignment", + ): metadata_import.metadata_column_assignment, ("importing", "peptide_import", "peptide_import"): peptide_import.peptide_import, ( "data_preprocessing", diff --git a/protzilla/constants/workflow_meta.json b/protzilla/constants/workflow_meta.json index 14ba36f09..1c8df8aa3 100644 --- a/protzilla/constants/workflow_meta.json +++ b/protzilla/constants/workflow_meta.json @@ -80,6 +80,38 @@ "default": "Columns (samples in rows, features in columns)" } } + }, + "metadata_column_assignment": { + "name": "Metadata Column Assignment", + "description": "Assign columns to metadata categories", + "parameters": { + "metadata_df": { + "type": "empty", + "name": "metadata_df", + "default": null + }, + "metadata_sample_column": { + "name": "Sample Column", + "fill": "metadata_columns", + "type": "categorical", + "categories": [], + "default": "Sample" + }, + "metadata_group_column": { + "name": "Group Column", + "fill": "metadata_columns", + "type": "categorical", + "categories": [], + "default": "Group" + }, + "metadata_batch_column": { + "name": "Batch Column", + "fill": "metadata_columns", + "type": "categorical", + "categories": [], + "default": "Batch" + } + } } }, "peptide_import": { @@ -744,7 +776,7 @@ "grouping": { "name": "Grouping:", "type": "categorical", - "fill": "metadata_columns", + "fill": "metadata_non_sample_columns", "fill_dynamic": [ "selected_groups" ], @@ -810,7 +842,7 @@ "grouping": { "name": 
"Grouping", "type": "categorical", - "fill": "metadata_columns", + "fill": "metadata_non_sample_columns", "fill_dynamic": [ "group1", "group2" @@ -876,7 +908,7 @@ "grouping": { "name": "Grouping", "type": "categorical", - "fill": "metadata_columns", + "fill": "metadata_non_sample_columns", "fill_dynamic": [ "group1", "group2" @@ -1109,7 +1141,7 @@ "labels_column": { "name": "Choose labels column from metadata", "type": "categorical", - "fill": "metadata_columns", + "fill": "metadata_non_sample_columns", "fill_dynamic": [ "positive_label" ], @@ -1245,7 +1277,7 @@ "labels_column": { "name": "Choose labels column from metadata", "type": "categorical", - "fill": "metadata_columns", + "fill": "metadata_non_sample_columns", "fill_dynamic": [ "positive_label" ], @@ -1381,7 +1413,7 @@ "labels_column": { "name": "Choose labels column from metadata", "type": "categorical", - "fill": "metadata_columns", + "fill": "metadata_non_sample_columns", "fill_dynamic": [ "positive_label" ], @@ -1502,7 +1534,7 @@ "labels_column": { "name": "Choose labels column from metadata", "type": "categorical", - "fill": "metadata_columns", + "fill": "metadata_non_sample_columns", "fill_dynamic": [ "positive_label" ], @@ -1708,7 +1740,7 @@ "labels_column": { "name": "Choose labels column from metadata", "type": "categorical", - "fill": "metadata_columns", + "fill": "metadata_non_sample_columns", "categories": [], "default": null }, @@ -2457,7 +2489,7 @@ "grouping": { "name": "Grouping from metadata", "type": "categorical", - "fill": "metadata_columns", + "fill": "metadata_non_sample_columns", "fill_dynamic": [ "group1", "group2" diff --git a/protzilla/importing/metadata_import.py b/protzilla/importing/metadata_import.py index 7b4ae1b12..ed596e882 100644 --- a/protzilla/importing/metadata_import.py +++ b/protzilla/importing/metadata_import.py @@ -57,3 +57,35 @@ def metadata_import_method(df, file_path, feature_orientation): os.remove(file_path) return df, {"metadata": meta_df} + + +def 
metadata_column_assignment( + df, + metadata_df, + metadata_sample_column, + metadata_group_column, + metadata_batch_column, +): + assert ( + metadata_sample_column in metadata_df.columns + ), f"Sample column {metadata_sample_column} not found in metadata file." + assert ( + metadata_group_column in metadata_df.columns + ), f"Group column {metadata_group_column} not found in metadata file." + assert ( + metadata_batch_column in metadata_df.columns + ), f"Batch column {metadata_batch_column} not found in metadata file." + # assert that all columns parameters are unique + assert ( + len({metadata_sample_column, metadata_group_column, metadata_batch_column}) == 3 + ), "The columns parameters must be unique." + + rename_dict = { + metadata_sample_column: "Sample", + metadata_group_column: "Group", + metadata_batch_column: "Batch", + } + # rename given in metadata_sample_column column to "Sample" if it is called otherwise + renamed_metadata_df = metadata_df.rename(columns=rename_dict, inplace=True) + + return df, dict() diff --git a/protzilla/run_helper.py b/protzilla/run_helper.py index ac27d0b34..ff1f6c3a7 100644 --- a/protzilla/run_helper.py +++ b/protzilla/run_helper.py @@ -24,11 +24,13 @@ def insert_special_params(param_dict, run): param_dict["outputs"].sort() if "fill" in param_dict: - if param_dict["fill"] == "metadata_columns": + if param_dict["fill"] == "metadata_non_sample_columns": # Sample not needed for anova and t-test param_dict["categories"] = run.metadata.columns[ run.metadata.columns != "Sample" ].unique() + elif param_dict["fill"] == "metadata_columns": + param_dict["categories"] = run.metadata.columns.unique() elif param_dict["fill"] == "metadata_column_data": # per default fill with second column data since it is selected in dropdown param_dict["categories"] = run.metadata.iloc[:, 1].unique() From ddb8d04d3b6f42d759d2126e9f9be7ca3e61f74b Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Fri, 17 Nov 2023 17:31:29 +0100 Subject: [PATCH 15/24] 
Column assignment: write tests, optimize variable names and add docstring, add messages for special cases --- protzilla/constants/workflow_meta.json | 26 ++++----- protzilla/importing/metadata_import.py | 57 +++++++++++-------- protzilla/run_helper.py | 18 +++++- .../importing/test_metadata_import.py | 44 ++++++++++++++ 4 files changed, 102 insertions(+), 43 deletions(-) diff --git a/protzilla/constants/workflow_meta.json b/protzilla/constants/workflow_meta.json index 1c8df8aa3..fd685f864 100644 --- a/protzilla/constants/workflow_meta.json +++ b/protzilla/constants/workflow_meta.json @@ -83,34 +83,28 @@ }, "metadata_column_assignment": { "name": "Metadata Column Assignment", - "description": "Assign columns to metadata categories", + "description": "Assign columns to metadata categories, repeatable for each category", "parameters": { "metadata_df": { "type": "empty", "name": "metadata_df", "default": null }, - "metadata_sample_column": { - "name": "Sample Column", - "fill": "metadata_columns", + "metadata_required_column": { + "name": "Missing, but required Metadata Columns", + "fill": "metadata_required_columns", "type": "categorical", "categories": [], - "default": "Sample" - }, - "metadata_group_column": { - "name": "Group Column", - "fill": "metadata_columns", - "type": "categorical", - "categories": [], - "default": "Group" + "default": null }, - "metadata_batch_column": { - "name": "Batch Column", - "fill": "metadata_columns", + "metadata_unknown_column": { + "name": "Existing, but unknown Metadata Columns", + "fill": "metadata_unknown_columns", "type": "categorical", "categories": [], - "default": "Batch" + "default": null } + } } }, diff --git a/protzilla/importing/metadata_import.py b/protzilla/importing/metadata_import.py index ed596e882..6a82a1bd7 100644 --- a/protzilla/importing/metadata_import.py +++ b/protzilla/importing/metadata_import.py @@ -60,32 +60,39 @@ def metadata_import_method(df, file_path, feature_orientation): def 
metadata_column_assignment( - df, - metadata_df, - metadata_sample_column, - metadata_group_column, - metadata_batch_column, + df: pd.DataFrame, + metadata_df: pd.DataFrame, + metadata_required_column: str = None, + metadata_unknown_column: str = None, ): - assert ( - metadata_sample_column in metadata_df.columns - ), f"Sample column {metadata_sample_column} not found in metadata file." - assert ( - metadata_group_column in metadata_df.columns - ), f"Group column {metadata_group_column} not found in metadata file." - assert ( - metadata_batch_column in metadata_df.columns - ), f"Batch column {metadata_batch_column} not found in metadata file." - # assert that all columns parameters are unique - assert ( - len({metadata_sample_column, metadata_group_column, metadata_batch_column}) == 3 - ), "The columns parameters must be unique." + """ + This function renames a column in the metadata dataframe to the required column name. - rename_dict = { - metadata_sample_column: "Sample", - metadata_group_column: "Group", - metadata_batch_column: "Batch", - } - # rename given in metadata_sample_column column to "Sample" if it is called otherwise - renamed_metadata_df = metadata_df.rename(columns=rename_dict, inplace=True) + :param df: this is passed for consistency, but not used + :type df: pandas DataFrame + :param metadata_df: the metadata dataframe to be changed + :type metadata_df: float + :param metadata_required_column: the name of the column in the dataframe that is used for the metadata assignment + :type metadata_df: str + :param metadata_unknown_column: the name of the column in the metadata dataframe that is renamed to the + required column name + :type metadata_unknown_column: str + :return: returns the unchanged dataframe and a dict with messages, potentially empty if no messages + :rtype: pd.DataFrame, dict + """ + + # TODO add info box in UI explaining that no option for unknown columns means all columns are named correctly + # check if required column already in 
metadata, if so give error message + if metadata_required_column is None or metadata_unknown_column is None: + msg = f"You can proceed, as there is nothing that needs to be changed." + return df, dict(messages=[dict(level=messages.INFO, msg=msg)]) + if metadata_required_column in metadata_df.columns: + msg = f"Metadata already contains column '{metadata_required_column}'. \ + Please rename the column or select another column." + return df, dict(messages=[dict(level=messages.ERROR, msg=msg)]) + # rename given in metadata_sample_column column to "Sample" if it is called otherwise + renamed_metadata_df = metadata_df.rename( + columns={metadata_unknown_column: metadata_required_column}, inplace=True + ) return df, dict() diff --git a/protzilla/run_helper.py b/protzilla/run_helper.py index ff1f6c3a7..6ffc695aa 100644 --- a/protzilla/run_helper.py +++ b/protzilla/run_helper.py @@ -29,8 +29,22 @@ def insert_special_params(param_dict, run): param_dict["categories"] = run.metadata.columns[ run.metadata.columns != "Sample" ].unique() - elif param_dict["fill"] == "metadata_columns": - param_dict["categories"] = run.metadata.columns.unique() + elif param_dict["fill"] == "metadata_unknown_columns": + # give selection of existing columns without ["Sample", "Group", "Batch"] + # as they are already named correctly for our purposes + param_dict["categories"] = run.metadata.columns[ + ~run.metadata.columns.isin(["Sample", "Group", "Batch"]) + ].unique() + + elif param_dict["fill"] == "metadata_required_columns": + # TODO add other possible metadata columns + # exclude columns that are already in metadata and known to be required + param_dict["categories"] = [ + col + for col in ["Sample", "Group", "Batch"] + if col not in run.metadata.columns + ] + elif param_dict["fill"] == "metadata_column_data": # per default fill with second column data since it is selected in dropdown param_dict["categories"] = run.metadata.iloc[:, 1].unique() diff --git 
a/tests/protzilla/importing/test_metadata_import.py b/tests/protzilla/importing/test_metadata_import.py index 7ed1d1785..5f29ce38f 100644 --- a/tests/protzilla/importing/test_metadata_import.py +++ b/tests/protzilla/importing/test_metadata_import.py @@ -1,6 +1,7 @@ from shutil import rmtree import pandas as pd +from django.contrib import messages from protzilla.constants.paths import PROJECT_PATH, RUNS_PATH from protzilla.importing import metadata_import @@ -42,3 +43,46 @@ def test_metadata_orientation(): pd.testing.assert_frame_equal(run1.metadata, run2.metadata) rmtree(RUNS_PATH / name1) rmtree(RUNS_PATH / name2) + + +def test_metadata_column_assignment(): + name = "test_run" + random_string() + run = Run.create(name) + run.step_index += 1 + run.calculate_and_next( + metadata_import.metadata_import_method, + file_path=f"{PROJECT_PATH}/tests/metadata_cut_columns.csv", + feature_orientation="Columns (samples in rows, features in columns)", + ) + # this is a workaround because the metadata is not passed properly using calculate_and_next, + # TODO but it works in the UI, it would be better to fix this + metadata_import.metadata_column_assignment( + df=run.df, + metadata_df=run.metadata, + metadata_required_column="Sample_renamed", + metadata_unknown_column="Sample", + ) + assert run.metadata.columns[0] == "Sample_renamed" + metadata_import.metadata_column_assignment( + df=run.df, + metadata_df=run.metadata, + metadata_required_column="Sample", + metadata_unknown_column="Sample_renamed", + ) + assert run.metadata.columns[0] == "Sample" + df, out = metadata_import.metadata_column_assignment( + df=run.df, + metadata_df=run.metadata, + metadata_required_column="Group", + metadata_unknown_column="Sample", + ) + assert out["messages"][0]["level"] == messages.ERROR + assert out["messages"][0]["msg"] + df_new, out_new = metadata_import.metadata_column_assignment( + df=run.df, + metadata_df=run.metadata, + metadata_required_column="", + metadata_unknown_column="", + ) + + 
rmtree(RUNS_PATH / name) From b99ec9bdcf447dad4cd90d6fc52d3fc1f857f062 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Wed, 22 Nov 2023 14:43:09 +0100 Subject: [PATCH 16/24] Add openpyxl (optional import), needed for reading .xlsx files --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 2d41a18ff..65108ffb1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,3 +33,4 @@ protgraph @ git+https://github.com/antonneubauer/ProtGraph@master joblib==1.2.0 networkx==3.1 beautifulsoup4==4.12.2 +openpyxl==3.1.2 From 7c5cc301f0f31fbfb8059a542ec238edf2866be0 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Wed, 22 Nov 2023 14:44:06 +0100 Subject: [PATCH 17/24] Add DIA-NN MS and metadata support, add mappings, and UI hooks in workflow_meta.json --- protzilla/constants/location_mapping.py | 10 ++++ protzilla/constants/workflow_meta.json | 33 ++++++++++++ protzilla/importing/metadata_import.py | 71 +++++++++++++++++++++---- protzilla/importing/ms_data_import.py | 25 +++++++++ 4 files changed, 130 insertions(+), 9 deletions(-) diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py index 9f35637b9..2edfb9278 100644 --- a/protzilla/constants/location_mapping.py +++ b/protzilla/constants/location_mapping.py @@ -31,6 +31,11 @@ "ms_data_import", "max_quant_import", ): ms_data_import.max_quant_import, + ( + "importing", + "ms_data_import", + "diann_import", + ): ms_data_import.diann_import, ( "importing", "ms_data_import", @@ -41,6 +46,11 @@ "metadata_import", "metadata_import_method", ): metadata_import.metadata_import_method, + ( + "importing", + "metadata_import", + "metadata_import_method_diann", + ): metadata_import.metadata_import_method_diann, ("importing", "peptide_import", "peptide_import"): peptide_import.peptide_import, ( "data_preprocessing", diff --git a/protzilla/constants/workflow_meta.json b/protzilla/constants/workflow_meta.json index 
1f76ffe63..a45b5e70f 100644 --- a/protzilla/constants/workflow_meta.json +++ b/protzilla/constants/workflow_meta.json @@ -27,6 +27,22 @@ } } }, + "diann_import": { + "name": "DIA-NN", + "description": "DIA-NN Data Import", + "parameters": { + "file_path": { + "name": "DIA-NN intensities file:", + "type": "file", + "default": null + }, + "map_to_uniprot": { + "name": "Map to Uniprot IDs using Biomart (online)", + "type": "boolean", + "default": false + } + } + }, "ms_fragger_import": { "name": "MS Fragger", "description": "MS Fragger Data Import", @@ -80,6 +96,23 @@ "default": "Columns (samples in rows, features in columns)" } } + }, + "metadata_import_method_diann": { + "name": "Metadata Import DIA-NN", + "description": "Import Metadata for run relationships of DIA-NN", + "parameters": { + "file_path": { + "name": "Run-Relationship Metadata file:", + "type": "file", + "default": null + }, + "groupby_sample": { + "name": "Group Replicate Runs by Sample using Median", + "type": "boolean", + "default": false + } + + } } }, "peptide_import": { diff --git a/protzilla/importing/metadata_import.py b/protzilla/importing/metadata_import.py index 7b4ae1b12..bb49fc105 100644 --- a/protzilla/importing/metadata_import.py +++ b/protzilla/importing/metadata_import.py @@ -2,12 +2,17 @@ import pandas as pd from django.contrib import messages +from pandas import DataFrame from protzilla.constants.paths import PROJECT_PATH from protzilla.utilities import random_string -def metadata_import_method(df, file_path, feature_orientation): +def file_importer(file_path: str) -> (DataFrame, str): + """ + Imports a file based on its file extension and returns a pandas DataFrame or None if the file format is not + supported / the file doesn't exist. 
+ """ if file_path.endswith(".csv"): meta_df = pd.read_csv( file_path, @@ -24,19 +29,23 @@ def metadata_import_method(df, file_path, feature_orientation): elif file_path.endswith(".tsv"): meta_df = pd.read_csv(file_path, sep="\t", low_memory=False) elif file_path == "": - msg = "The file upload is empty. Please select a metadata file." - return df, dict( - meta_df=None, - messages=[dict(level=messages.ERROR, msg=msg)], - ) + return None, "The file upload is empty. Please select a metadata file." else: - msg = "File format not supported. \ - Supported file formats are csv, xlsx, psv or tsv" + return ( + None, + "File format not supported. \ + Supported file formats are csv, xlsx, psv or tsv", + ) + return meta_df, dict() + + +def metadata_import_method(df: pd.DataFrame, file_path: str, feature_orientation: str): + meta_df, msg = file_importer(file_path) + if meta_df is None: return df, dict( meta_df=None, messages=[dict(level=messages.ERROR, msg=msg)], ) - # always return metadata in the same orientation (features as columns) # as the dtype get lost when transposing, we save the df to disk after # changing the format and read it again as "Columns"-oriented @@ -55,5 +64,49 @@ def metadata_import_method(df, file_path, feature_orientation): f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_" ): os.remove(file_path) + if "replicate" in meta_df.columns: + # this indicates a DIANN metadata file with replicate information, we now want to calculate the median across + # all MS runs for a sample then instead of having intensities for each MS run in our dataframe, we + # have intensities for each sample + # note that up until now, "Sample" in the intensity df referred to the ms run + res = pd.merge( + df, + meta_df[["MS run", "sample name"]], + left_on="Sample", + right_on="MS run", + how="left", + ) + res.groupby(["Protein ID", "sample name"], as_index=False).median() + + return df, {"metadata": meta_df} + + +def metadata_import_method_diann(df, file_path, 
groupby_sample=False): + meta_df, msg = file_importer(file_path) + if meta_df is None: + return df, dict( + meta_df=None, + messages=[dict(level=messages.ERROR, msg=msg)], + ) + if file_path.startswith( + f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_" + ): + os.remove(file_path) + + # this indicates a DIANN metadata file with replicate information, we now want to calculate the median across + # all MS runs for a sample then instead of having intensities for each MS run in our dataframe, we + # have intensities for each sample + # note that up until now, "Sample" in the intensity df referred to the ms run + if groupby_sample: + res = pd.merge( + df, + meta_df[["MS run", "sample name"]], + left_on="Sample", + right_on="MS run", + how="left", + ) + res = res.groupby(["Protein ID", "sample name"], as_index=False).median() + res.rename(columns={"sample name": "Sample"}, inplace=True) + return res, {"metadata": meta_df} return df, {"metadata": meta_df} diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py index 98a58128a..06f651c8f 100644 --- a/protzilla/importing/ms_data_import.py +++ b/protzilla/importing/ms_data_import.py @@ -81,6 +81,31 @@ def ms_fragger_import(_, file_path, intensity_name, map_to_uniprot=False): return transform_and_clean(intensity_df, intensity_name, map_to_uniprot) +def diann_import(_, file_path, map_to_uniprot=False): + if not Path(file_path).is_file(): + msg = "The file upload is empty. Please provide a DIA-NN MS file." 
+ return None, dict(messages=[dict(level=messages.ERROR, msg=msg)]) + + df = pd.read_csv( + file_path, + sep="\t", + low_memory=False, + na_values=["", 0], + keep_default_na=True, + ) + df = df.drop( + columns=["Protein.Group", "Protein.Names", "Genes", "First.Protein.Description"] + ) + # rename column names of samples, removing file path and ".raw" + intensity_df = df.rename(columns=lambda x: re.sub(r"(.*[/\\])|(.raw)", r"", x)) + intensity_df = intensity_df.rename(columns={"Protein.Ids": "Protein ID"}) + + # placeholder intensity name for following cleanup + intensity_name = "Intensity" + + return transform_and_clean(intensity_df, intensity_name, map_to_uniprot) + + def transform_and_clean(df, intensity_name, map_to_uniprot): """ Transforms a dataframe that is read from a file in wide format into long format, From b93e6dd08fce3f820bda0e4126593c0a20744177 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Wed, 22 Nov 2023 15:32:37 +0100 Subject: [PATCH 18/24] Add tests for DIA-NN file support --- tests/diann_intensities.tsv | 6 ++ tests/diann_run_relationship_metadata.xlsx | Bin 0 -> 10855 bytes .../importing/test_metadata_import.py | 15 +++ .../importing/test_ms_data_import.py | 88 ++++++++++++++++++ 4 files changed, 109 insertions(+) create mode 100644 tests/diann_intensities.tsv create mode 100644 tests/diann_run_relationship_metadata.xlsx diff --git a/tests/diann_intensities.tsv b/tests/diann_intensities.tsv new file mode 100644 index 000000000..92e023553 --- /dev/null +++ b/tests/diann_intensities.tsv @@ -0,0 +1,6 @@ +Protein.Group Protein.Ids Protein.Names Genes First.Protein.Description D:\MPL\Gereon\20230418 Hela CM 24h prodi stork\LM07061.raw /home/sampleuser/data/LM07062.raw D:\MPL\Gereon\20230418 Hela CM 24h prodi stork\LM07063.raw +A0A087WWU8 A0A2R2Y2Q3;J3KN67;A0A087WWU8;A0A494C0P6 A0A087WWU8_HUMAN TPM3 Tropomyosin alpha-3 chain 329042.0 367477.0 381325.0 +A0A0B4J2A2;P0DN37 A0A0B4J2A2;P0DN37 PAL4C_HUMAN;PAL4G_HUMAN PPIAL4C;PPIAL4G Peptidyl-prolyl 
cis-trans isomerase A-like 4C 138322.0 572539.0 96522.7 +A0A0G2JPD3;A0A140T8W8;A0A140T8Y4;A0A1W2PPF8;A0A1W2PR61;Q5SPM2 A0A0G2JPD3;A0A140T8Y4;A0A1W2PR61;Q5SPM2;A0A140T8W8;A0A1W2PPF8 A0A0G2JPD3_HUMAN;A0A140T8W8_HUMAN;A0A140T8Y4_HUMAN;A0A1W2PPF8_HUMAN;A0A1W2PR61_HUMAN;Q5SPM2_HUMAN HLA-A HLA class I histocompatibility antigen, A alpha chain +A0A0U1RQV3 A0A0U1RQV3 A0A0U1RQV3_HUMAN EFEMP1 EGF-containing fibulin-like extracellular matrix protein 1 (Fragment) 122984.0 59042.7 72372.5 +A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;Q53Z42 A0A140T913;A0A140T9I0;A0A140T9X5;A0A1W2PRT9;Q53Z42;A0A140T933;A0A140T955;A0A1W2PPQ2 A0A140T913_HUMAN;A0A140T933_HUMAN;A0A140T955_HUMAN;A0A140T9I0_HUMAN;A0A140T9X5_HUMAN;A0A1W2PPQ2_HUMAN;A0A1W2PRT9_HUMAN;Q53Z42_HUMAN HLA-A HLA class I histocompatibility antigen, A alpha chain 36317.0 27456.7 diff --git a/tests/diann_run_relationship_metadata.xlsx b/tests/diann_run_relationship_metadata.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..6a51ecb35a4c8c9c2bc17dee751e5671c4d8de04 GIT binary patch literal 10855 zcmeHtg;yNu^7h~mBse6<;4XtCcyK4k;1(dblfgZ>ySqzpcL@?)f(3#Ef&>jN!GDwN z-Y>iCet*HeJ?C_vIX%yFdaCQGs<+-!l!b%G1t0>D0RR9wK$y#S+y({!fJXoTZ~(}# z+M>2L4#qYPdMd7V#`d}_F4k6Lx$v-b*#KDR@BerF7h9lAWzeP@gxRjXBPiCP7LpTG zQi4p}Laa@-g_P_Pt3lPU@r3Tilh#B@wHmpLTFGi$o92X{uza3IQ$%RZisedEfl42> zD54y4BycO0CGMMFrWR2f9ri4n5dGi)ajG6ku4(=?iv`6!1vUP&xI1^Oy451QZBU&HL*YHnRoHf-<2e6 z$ylpsaMZiu^PPweSIt(Ksc~+$SdT(INwGiK;~Xd`4`CkCGTxKFK)G`vH1;^NWy(Jx z@EPAC%Vdi_a&7lVwhzQKUjlq-`Hpo;Y+nw$g>rPN#zk;Qa2<~kInB)-l|BzhayB4{ zso*_BgZDw(6NQ+~ore=&mM&l{W#kG2R}zmfi0+^|yT6A6DE>{R^{ODsQ>gFBLGPkM zWvT};wz6ks`FZ}IJpYR|`Io<57W-162ZR=QAaxToa51wKjVU7QEG*SZuHyApdhv06 zL>?8m^7=;w z%#4Wg_wvB1My~9gxbZaa*~G$abirUAxxL90yaD^yrk^W32CT@>E}yC@oAH=ez0QR2 zQn>0FS-n3IOY6kB^{P|LVju{zsAEMME&TK>#qWvbS;dEA(HUX!;-z0OE!jR1=l zUqEBRyz0!idDa*;7H$MQkf~f;$^5;SA+mWZPX=VE6innE*grnPZ#dSmwrAV3gZZE8G7^rq`m$-N2ZmHzPS;tTVv%DO=^u_3j%bY3c9=vE 
z3&WQd!(ZttH7xn7nHIgDQQ<#h9&%ZN?G@EbI5q~%)bpaUkP@+Mi6R&r1m}H+ztQkNYLN@% z>-{9+Z5?7cd~};aE)SATp_f+!J);wsjC!8+b;PFyP~TyF^W!Dr|X&aQv~xs_Q^&TNcAYA!s802dT0b48?P|FEsMD!{{Y{!CkN0IwDR?BxAYCd^`x?F0X zSDPCoCC|^Uvri|g?>a*Y`rD6y@Uk2U2yl}iFiDoTJPHWEi)zS{p3|ixm^BB~c=}`r z{`J6xt_O_cVMpiH+~vgV;^YGiAr9;}nXf!C~oWXKAm7%@TaJ zW;nU8;L|#e_4!;s4;mo0CkRE&ai<$DOK?{c8@YFc@Gr1WOh%6OrqI5@<|z@`@x#0U z&aRSO>t9XMB_iNKsLjsdmy3uRLd=k~p2ER)91KcC5*VP;iHIm0;Q;gKAlB=>?>0uH z+vL(uBg!6mpA5eS~&e9{S2pDoa2_7s2& z1J&st`RuQ1{ZDR#ftH?7mjCXf4E$2I7g~Cu-UWf2)12_IK0B~d?5XZPMjouAU!Wo9 z@H$;4VrbFT_$12;XB+5xFxu~Q@eE@X4(F_ku{abP-VMjzx|-2eiD>dNFjFGJQ>JoF$QvEx3K?6G6aq7!*%O# zul$ww%6;n^8_81XO*ZbEMHkKyvwU5F6DLnlr1{ZZ0TWx(wFJQf;tbX?=GFHX@i%v6$t*(4oD%{XU=4o?cHOStXy!P+%aK4bNZz z(fkg++TCp7%IS4|wV{*p2a)7ev5G=MRn zI@SHAj_;cVD!;8?-D@Sx@Z@hbxg_M(;~+UDDVP*2B=m#>GJM>N3ywnO){K z7eOtw;T2fb2{Ko*s0CtSMzy8k8`=SmSv|?9kOmiVm&Z2j<@dtUvIY?x0SA;Rjy!^i zg2u{yA2Log?v;~4Cisyx<|{-DR~j;35642k==(kHmT=ZMX|;kxyUN@fzp08;`<%-~ z>M`hc*6&L(Gs~+xDyL+&`He_=F zm@VWY6cZh5@}jYCFD-(|0U_Aodm!RqK$4_}514+*ziH{#TJPlRShfk%(4CrwmhZ(@CLxbx_iq5{!X2?LNlu$j|(0y+A|b!*r!sI`=3 zdb?cn?sZ6Jt5^WqlZY$V4!Xb#+8zV`)NrK>w%Gd6%3uEQ>lB}KVB-tImDKxi3KQ6P z$0fvmDvnI~nYIyat)UTE5K(a9Z0^)am?y~}Km0DS$9;LZ@_T#vaAnlNjJokaS>X8j z^YHO8qS~t*>xGPEugcqxXN~U0pIDttcWUhSO8ttW!fnykN~7k%n*2L8Td~AbA%w;Z8PWVO2t?| zBgMm=@H(j&E0{U%SW3;JCKJp#YUiA&)% z%ZGAQ8=FPHO@Tsoihn}z4z}q@(XkW9e}ZU#g5JT&!u|ve%1me(KAU*=DMo#oBzh^A zU-c{K71I~SL7f zAVy+Z+rm(OgPsu6SNhqne2z?x0$=vask(JX@+GI5-=DILDJK=8~Zk> z$FBS=3$)!^$_m1)eZO8=$Z+GYOPlHC)5Vk4FPN#tYg%%J)0g3LkAS#o)d9zptT}@L zn%FZXUxtN8{Kf>jvtUVi*8m9fr0qaLMjO%8H>#z$L%)R9Z1?xSUNL8c@S|=aH1SII zb$}WcR>4$zScRMKKt6|XhCe5Ti<`EA)4iw5{Y7YsMBRuZibASDx0iFEe!A>1%ayGH z=Kd;6r9=QI#@fLj6z1Y*v(>m1a#Jp>G)Qv?_D6+hE8N*)dgdG2On!akyn)+gvkfR+ z^z0>-B|rX(^lxFOivY7j9(t#b>}Pl6SJ-hdGqyHn{dN8oarQJv!-%+WI|;4?(d?aX zo~=bue_0u~id&*G%7`bfZ`e~(<6w_%Bf^2D<~ak?1F-`AHxP6vs!uzGV zDYNkViAnsvB;zv{fXfLfV}HEzeXZ#3$xcqWhn^tg?f$M!;VWB=2zof(vU*81bx!qK zPmM@$z!rF)gna(kAd`)cbk%+Gf({$o%j}Z+c9Woy6@NmFMr;F7Q`9E=p1oO4-$P@< 
z{&>5~Vv-EmPs_bP53X3J|7`bGtFvLU2*!-lH=(d$j(0b~CfWd9@oFZgRsBG982`&h zJ>A-7l9H%^*=!Z>sz4xG%ZfN^r7-;qot~f!lOogI*=$@r?VefL@2?aaPc0S2q#;d# zECKZv&du|9M%`PHIA75Ggxh23$CaYdOZULaavBs|8(QJ}>@NUsVZl_=tAiiV6h0!? zG=uF0%c%aSeBpCfNEzAyV$V;|)eq4i0-w9Q?j4kCxM9qm z;UZ(r6mY*g+&&5F6!1RXy6n-bX|89GwLzj*R^Fe9(!0OCCHSJ(d3U&T#I-$5d(_?A zd2>0O)Omlzdsel&h{xLDd3?MbN3iU9wy6+BGQU7fl6*)WQhiwQ)$o8O%)kfZ`Upla zqZcI78!LphsJOKMjn{21svFdyG0waQQ8QI!N~J_K!lGfg6QMKj zN=fSQ;h|lo+)~_0b4Y8*#~90f7MU`@#U@9#`YQNyM3wP3^ib@OLhi0f<<4r+aFWti z(cq0)0rpk7s*uxt%u2uDDQZ!OJ~f!7PV{x~aJMm;|NQaj7wgKNV#^r;4?tFx{hH*`* z+xt9L7P2+j97f0yBj7X_p~!DPFBdz|>qUb>E*|6TCr#iH^_fphK$KXqhHbGCrJ?)Q zgrW&y-6A!fHu%bTjHJ3PJf$5-;NV*oW<1Z|Fjqes!okOT@-AV%l~CCsypPbm@rAv{ zM+1)G&E^Sp3jG9W6^pGTEM&pa4ck%Ku zLuzPmoQFuAVeAnRqO^~$<9XBQj5o!6YEdqLOqOnDnTzl3V$1A5xvJW~n)z{mU1Boz z>P4mTm}Sa|R^QqAnXpAln}^R3&B5_jrb1iw1_yck{xHIaRuN}U__m2~BvA2!QVVUf z1T-K8M4izEx(uH0pLNrOHa7#$jdqneZriC9EcBUQTwuAWA(Myj-r)wArY;NV5Ar+P^%^27-t}<<3r#-{qxNuE?(G;ei|FCwH;1)QlpcNd*7}0VRV%n9-_;EF~*fwK=cMQ??)Bs0ex|J0xpQ^gOQgVj(P z;6$i4d|RcnILZF+vW2kreM`n@sJ}t)Y19F@uxM?#|JQ|LN%a^N541Q)L;(QM z|46^~4z5CkwkEJ~b`mV?+26W3Ke|aEPKVn&x&o z#`T3#2Ng@IjW8FQdkA$X#bZ|~wj*&$Znyjv6B(L0!4ps}yE;PLF1&a1csjjAJ$Vh8 zj&$k@N%@iZ)t#4DXG1e;sqk5FifO)=ek#jGCQ)y{zG95nr9GA6mh#vud{fahIRJ2sZdRiR${|w%5Q9H--jf2^wi*fy3Oy$(Aj)laJiD0+G??r)Rgqzp# zh|k^0qzsD%MC(WN{jSb#D|{EF&Ei;)%D>pw&8F_%kUiD4(+-M$UVv&g>?60^j8ni? 
zRN#|u(eP!iQVH*-BA!jL#asteqZVs7++%0xfiwSvm#fPJE`tkm+(?*6i^>JNL8C0} zOONywb0Yqb^bWvP$RpN^VDV_~ip-}Z9pYu`cdfO5VzS##AF~)LmrR6=Ny@FA(St3u zlcXEj5;3|or)BDH^xPv(b2{Gry3*$uW|>Wq3}{0MR`oC?ns2aZ)UMlr%;`ZI5`CQ* z%+W{|ttTYR&*9arcQ@_iTO+N9W>0W$yrY@IXpOgR*9({t$h=hy#&<0B=m$DdQ0|N* zNr6Z8^XxXs{9tYUS~HB9ozLGmoIL6s+b&?g;@44W+w5Nip}Z&)obICj zqrvvjSe{4f$}iT3L%?@%t*`Bp^UCwtFMqIrA|PA)mcvGw;e?>}lhqUt&)5N?9Q+5h@j z^$J?S{&QtLZ*9ti4OJfvRA;z12604xBMp``Lk{aUM2q!0gt@hhfO81~-o%ey2>aiT}Sxzx7 z(7nA{S{2g^tg0Igh0J~sP<1@pH03QANz{}>2SPF{0;HgOCxxoX0D=u_G24^laQB!( z4*nEljZ#Lw!W|!@P z+D-&&N6J5J__LMuXLsU{-q!CP#jnnme^gg!Hwah!0`5-K#Xas*0Gg;W7e$lmCfw_* zdBlaJ=nLFi_gZ4(_Zu5_;~US@uKC{)nP9{)O}D@XtQ!)TRUNv|Gj|GCbMGwTqA>GE z8%(iuq{7L|%Y^htYv7?)2Pje9683oYey)y0c^CcS+h^$zSJ2>P7}3YF5)12NJ)}BT z`T7HYbK%hZ=!avL!`9~EfP(0t@eXsT>wsnvXJr$PZ6O0Om zd3~cTw5&{t4hy)aq+i#}AWdlK<5gW9efIvFxSpq#X4dM*yzhB?`xFw5m-e)1@-}B* ztxEBQrAK-!bEmXnbj6*I!i1k=ZF{RM$ zQov)5y7ZcjslLVl9Ic%1ymW*gz-$FcTd7VK$7!jm^*uDl6`i)KF;1nbjJTlVp}xg$ zx~56aBtCMZrJ;z&5}EqqD{7WQ=7re`{cGrIJPT*`Hk$gM8&hFu!#X3Prm*w{QU)C} zKfC2*-~VvBS;Zsi3wrCSrWxp3Ku=nmS;dQ7Y*N0qI`59`>XIxPE-56wm`Ag(i~|>b zVqxD~1V$|5t0OjD-f%&4{9<1m#i9*!jdPm$-m+oUW1sEnG(o(0tW}qiVD_C|JB?>k z9V?>(@1`5P>taBwc3(xm?l$AXK8z!t(t&bAJVo>r&FOm7+$_pzM)T9j`MNk65a zI_{`sTyj>Zh+B*!c5c@&sF_|i#PEDmTr>Ixzup7Z^PN27tCts|ZqaGEY<44g=L<&S zV@xH$RdqX^zx%Fz2vn6JQY>KJm-kv|IMrjJUWV)UTj4Sv`Vz`ek%G zQ{K>OGO@Qt0gfPVGwA9D=#rIk10Cl0r$&jUtqp{r5y}wScE^IY-HmJw6(P2E_N<1s z5aT}wJ^t5ihX$|kSUuSu5blR%gig^x_q4(UHqiy2p-f`5bZ!sS!6h}TyJ)diPxmRO z(uB(u!+oC1jda=24@%D`2sG3!RViZg>0m$97teHvRi+v9OU9hYp?oq!pyu@v4cj>{ zTnU6&8(7!F>{{9>kChA%FCmkX*29Vdg|u?^%UvBO^vJ1ANgU?8&^ECCPL06fi=jTr2wXkxNyL^9sZx zyVR$ZAB6T$(KIlXg2jSh2{xF$J?5E|5(+d)i)xdNh(zcH&E4+5wBPP}5xgNBu=l;k z*^FO&8B8Fgx>9PZBG<^nE0`S?l#q<-C|KQm<%j3-1vkVE^@{o_uThjMLxYy4 zMv+gAD!%ZcvhbqM(C{K1j#8+P8l2g2wa`iGC)dGH+JI;_#^=Rm@p=Af#>(t{&&k9u zx7IkE0RFzTT`I&q%}%6g2N0e|U^|u(L>wuLbjnYGOe@Pyc@&;PZRt2sXgCM{wWA(W zBRtDL);YHY59$y6AVJgI5}gt5eb 
zoVKYK5gs=|+nd81Gr2cvg{@;2zFBuoH_(>vAC3$I%M7iG|9x2QKQHb-*MAwEQKfe%L;enUr~73qUPhlk+5@3;Pj0ssz=eu4i#yRQ#<9&W<@ zW-3Dae-H6z(fhmHeaP~#r~aFT6uP+w{VNYU><ONc#6w|Ca{&POKk{hd9{>OV literal 0 HcmV?d00001 diff --git a/tests/protzilla/importing/test_metadata_import.py b/tests/protzilla/importing/test_metadata_import.py index 7ed1d1785..eef797f72 100644 --- a/tests/protzilla/importing/test_metadata_import.py +++ b/tests/protzilla/importing/test_metadata_import.py @@ -22,6 +22,21 @@ def test_metadata_import(): rmtree(RUNS_PATH / name) +def test_metadata_import_diann(): + name = "test_run" + random_string() + run = Run.create(name) + run.step_index += 1 + run.calculate_and_next( + metadata_import.metadata_import_method_diann, + file_path=f"{PROJECT_PATH}/tests/diann_run_relationship_metadata.xlsx", + ) + test_metadata = pd.read_excel( + f"{PROJECT_PATH}/tests/diann_run_relationship_metadata.xlsx" + ) + pd.testing.assert_frame_equal(test_metadata, run.metadata) + rmtree(RUNS_PATH / name) + + def test_metadata_orientation(): name1 = "test_run" + random_string() name2 = "test_run" + random_string() diff --git a/tests/protzilla/importing/test_ms_data_import.py b/tests/protzilla/importing/test_ms_data_import.py index 0cf0ac6a1..3640cc14c 100644 --- a/tests/protzilla/importing/test_ms_data_import.py +++ b/tests/protzilla/importing/test_ms_data_import.py @@ -113,6 +113,80 @@ def ms_fragger_import_intensity_df(intensity_name): return ms_fragger_df +def diann_import_intensity_df(): + diann_intensity_df = { + "Sample": { + 0: "LM07061", + 1: "LM07061", + 2: "LM07061", + 3: "LM07061", + 4: "LM07061", + 5: "LM07062", + 6: "LM07062", + 7: "LM07062", + 8: "LM07062", + 9: "LM07062", + 10: "LM07063", + 11: "LM07063", + 12: "LM07063", + 13: "LM07063", + 14: "LM07063", + }, + "Protein ID": { + 0: "A0A087WWU8;A0A2R2Y2Q3;A0A494C0P6;J3KN67", + 1: "A0A0B4J2A2;P0DN37", + 2: "A0A0G2JPD3;A0A140T8W8;A0A140T8Y4;A0A1W2PPF8;A0A1W2PR61;Q5SPM2", + 3: "A0A0U1RQV3", + 4: 
"A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;Q53Z42", + 5: "A0A087WWU8;A0A2R2Y2Q3;A0A494C0P6;J3KN67", + 6: "A0A0B4J2A2;P0DN37", + 7: "A0A0G2JPD3;A0A140T8W8;A0A140T8Y4;A0A1W2PPF8;A0A1W2PR61;Q5SPM2", + 8: "A0A0U1RQV3", + 9: "A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;Q53Z42", + 10: "A0A087WWU8;A0A2R2Y2Q3;A0A494C0P6;J3KN67", + 11: "A0A0B4J2A2;P0DN37", + 12: "A0A0G2JPD3;A0A140T8W8;A0A140T8Y4;A0A1W2PPF8;A0A1W2PR61;Q5SPM2", + 13: "A0A0U1RQV3", + 14: "A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;Q53Z42", + }, + "Gene": { + 0: np.nan, + 1: np.nan, + 2: np.nan, + 3: np.nan, + 4: np.nan, + 5: np.nan, + 6: np.nan, + 7: np.nan, + 8: np.nan, + 9: np.nan, + 10: np.nan, + 11: np.nan, + 12: np.nan, + 13: np.nan, + 14: np.nan, + }, + "Intensity": { + 0: 329042.0, + 1: 138322.0, + 2: np.nan, + 3: 122984.0, + 4: 36317.0, + 5: 367477.0, + 6: 572539.0, + 7: np.nan, + 8: 59042.7, + 9: np.nan, + 10: 381325.0, + 11: 96522.7, + 12: np.nan, + 13: 72372.5, + 14: 27456.7, + }, + } + return pd.DataFrame(data=diann_intensity_df) + + @pytest.mark.parametrize( "intensity_name", [ @@ -143,6 +217,20 @@ def test_ms_fragger_import(intensity_name): pd.testing.assert_frame_equal(test_intensity_df, intensity_df) +def test_diann_import(): + test_intensity_df, _ = ms_data_import.diann_import( + _=None, + file_path=f"{PROJECT_PATH}/tests/diann_intensities.tsv", + ) + + intensity_df = diann_import_intensity_df() + + # we do not care about the genes column, it is never used (and replaced by nan) + intensity_df = intensity_df.drop(columns=["Gene"]) + test_intensity_df = test_intensity_df.drop(columns=["Gene"]) + pd.testing.assert_frame_equal(test_intensity_df, intensity_df) + + def test_filter_rev_con(): intensity_df, other = ms_data_import.max_quant_import( _=None, From 1a814a0b3c4a9c05bc82c6d883aa3e5837fe4003 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Wed, 22 Nov 2023 18:20:20 +0100 Subject: [PATCH 
19/24] reformatting, exception handling on file reading, added missing import, static typing --- protzilla/data_preprocessing/imputation.py | 10 +-- protzilla/importing/metadata_import.py | 82 +++++++++++++--------- protzilla/importing/ms_data_import.py | 18 +++-- 3 files changed, 64 insertions(+), 46 deletions(-) diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 160112cfb..d2e28c7bd 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -14,9 +14,9 @@ def by_knn( intensity_df: pd.DataFrame, - number_of_neighbours=5, + number_of_neighbours: int = 5, **kwargs # quantile, default is median -) -> tuple[pd.DataFrame, dict]: +) -> (pd.DataFrame, dict): """ A function to perform value imputation based on KNN (k-nearest neighbors). Imputes missing values for each @@ -60,7 +60,7 @@ def by_knn( def by_simple_imputer( intensity_df: pd.DataFrame, - strategy="mean", + strategy: str = "mean", ) -> tuple[pd.DataFrame, dict]: """ A function to perform protein-wise imputations @@ -249,9 +249,6 @@ def by_normal_distribution_sampling( distribution used for imputation is scaled compared to dataset. 
Default: 1 (no scaling) :type down_shift: float - :param round_values: whether to round the imputed values to the nearest integer - Default: False - :type round_values: bool :return: returns an imputed dataframe in typical protzilla long format\ and an empty dict :rtype: pd.DataFrame, int @@ -262,7 +259,6 @@ def by_normal_distribution_sampling( transformed_df = long_to_wide(intensity_df) # iterate over all protein groups for protein_grp in transformed_df.columns: - number_of_nans = transformed_df[protein_grp].isnull().sum() # don't impute values if there not enough values (> 1) to sample from diff --git a/protzilla/importing/metadata_import.py b/protzilla/importing/metadata_import.py index bb49fc105..ed565a50d 100644 --- a/protzilla/importing/metadata_import.py +++ b/protzilla/importing/metadata_import.py @@ -8,42 +8,58 @@ from protzilla.utilities import random_string -def file_importer(file_path: str) -> (DataFrame, str): +def file_importer(file_path: str) -> tuple[pd.DataFrame, str]: """ Imports a file based on its file extension and returns a pandas DataFrame or None if the file format is not supported / the file doesn't exist. """ - if file_path.endswith(".csv"): - meta_df = pd.read_csv( - file_path, - sep=",", - low_memory=False, - na_values=[""], - keep_default_na=True, - skipinitialspace=True, - ) - elif file_path.endswith(".xlsx"): - meta_df = pd.read_excel(file_path) - elif file_path.endswith(".psv"): - meta_df = pd.read_csv(file_path, sep="|", low_memory=False) - elif file_path.endswith(".tsv"): - meta_df = pd.read_csv(file_path, sep="\t", low_memory=False) - elif file_path == "": - return None, "The file upload is empty. Please select a metadata file." - else: - return ( - None, - "File format not supported. 
\ - Supported file formats are csv, xlsx, psv or tsv", - ) - return meta_df, dict() + try: + if file_path.endswith(".csv"): + meta_df = pd.read_csv( + file_path, + sep=",", + low_memory=False, + na_values=[""], + keep_default_na=True, + skipinitialspace=True, + ) + elif file_path.endswith(".xlsx"): + meta_df = pd.read_excel(file_path) + elif file_path.endswith(".psv"): + meta_df = pd.read_csv(file_path, sep="|", low_memory=False) + elif file_path.endswith(".tsv"): + meta_df = pd.read_csv(file_path, sep="\t", low_memory=False) + elif file_path == "": + return ( + pd.DataFrame(), + "The file upload is empty. Please select a metadata file.", + ) + else: + return ( + pd.DataFrame(), + "File format not supported. \ + Supported file formats are csv, xlsx, psv or tsv", + ) + msg = "Metadata file successfully imported." + return meta_df, msg + except pd.errors.EmptyDataError: + msg = "The file is empty." + return pd.DataFrame(), msg + +def metadata_import_method( + df: pd.DataFrame, file_path: str, feature_orientation: str +) -> tuple[pd.DataFrame, dict]: + """ + Imports a metadata file and returns the intensity dataframe and a dict with a message if the file import failed, + and the metadata dataframe if the import was successful. 
-def metadata_import_method(df: pd.DataFrame, file_path: str, feature_orientation: str): + returns: (DataFrame, dict) + """ meta_df, msg = file_importer(file_path) - if meta_df is None: + if meta_df.empty: return df, dict( - meta_df=None, + metadata=None, messages=[dict(level=messages.ERROR, msg=msg)], ) # always return metadata in the same orientation (features as columns) @@ -78,14 +94,16 @@ def metadata_import_method(df: pd.DataFrame, file_path: str, feature_orientation ) res.groupby(["Protein ID", "sample name"], as_index=False).median() - return df, {"metadata": meta_df} + return df, {"metadata": meta_df, "messages": [dict(level=messages.INFO, msg=msg)]} -def metadata_import_method_diann(df, file_path, groupby_sample=False): +def metadata_import_method_diann( + df: DataFrame, file_path: str, groupby_sample: bool = False +) -> (DataFrame, dict): meta_df, msg = file_importer(file_path) - if meta_df is None: + if meta_df.empty: return df, dict( - meta_df=None, + metadata=None, messages=[dict(level=messages.ERROR, msg=msg)], ) if file_path.startswith( diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py index 06f651c8f..59f20717f 100644 --- a/protzilla/importing/ms_data_import.py +++ b/protzilla/importing/ms_data_import.py @@ -9,7 +9,9 @@ from protzilla.data_integration.database_query import biomart_query -def max_quant_import(_, file_path, intensity_name, map_to_uniprot=False): +def max_quant_import( + _: pd.DataFrame, file_path: str, intensity_name: str, map_to_uniprot=False +) -> (pd.DataFrame, dict): assert intensity_name in ["Intensity", "iBAQ", "LFQ intensity"] if not Path(file_path).is_file(): msg = "The file upload is empty. Please provide a Max Quant file." 
@@ -36,7 +38,9 @@ def max_quant_import(_, file_path, intensity_name, map_to_uniprot=False): return transform_and_clean(intensity_df, intensity_name, map_to_uniprot) -def ms_fragger_import(_, file_path, intensity_name, map_to_uniprot=False): +def ms_fragger_import( + _: pd.DataFrame, file_path: str, intensity_name: str, map_to_uniprot=False +) -> (pd.DataFrame, dict): assert intensity_name in [ "Intensity", "MaxLFQ Total Intensity", @@ -81,7 +85,7 @@ def ms_fragger_import(_, file_path, intensity_name, map_to_uniprot=False): return transform_and_clean(intensity_df, intensity_name, map_to_uniprot) -def diann_import(_, file_path, map_to_uniprot=False): +def diann_import(_, file_path, map_to_uniprot=False) -> (pd.DataFrame, dict): if not Path(file_path).is_file(): msg = "The file upload is empty. Please provide a DIA-NN MS file." return None, dict(messages=[dict(level=messages.ERROR, msg=msg)]) @@ -96,17 +100,18 @@ def diann_import(_, file_path, map_to_uniprot=False): df = df.drop( columns=["Protein.Group", "Protein.Names", "Genes", "First.Protein.Description"] ) - # rename column names of samples, removing file path and ".raw" + # rename column names of samples, removing file path and ".raw" if present intensity_df = df.rename(columns=lambda x: re.sub(r"(.*[/\\])|(.raw)", r"", x)) intensity_df = intensity_df.rename(columns={"Protein.Ids": "Protein ID"}) - # placeholder intensity name for following cleanup intensity_name = "Intensity" return transform_and_clean(intensity_df, intensity_name, map_to_uniprot) -def transform_and_clean(df, intensity_name, map_to_uniprot): +def transform_and_clean( + df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool +) -> (pd.DataFrame, dict): """ Transforms a dataframe that is read from a file in wide format into long format, removing contaminant groups, and processing protein ids, removing invalid ones @@ -122,7 +127,6 @@ def transform_and_clean(df, intensity_name, map_to_uniprot): :rtype: tuple[pd.DataFrame, list[str], list[str]] 
""" assert "Protein ID" in df.columns - contaminant_groups_mask = df["Protein ID"].map( lambda group: any(id_.startswith("CON__") for id_ in group.split(";")) ) From 597ef8bb8490baee3ae65c8e773922df5335dd60 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Wed, 22 Nov 2023 18:44:18 +0100 Subject: [PATCH 20/24] add docstring for metadata import method for DIA-NN --- protzilla/importing/metadata_import.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/protzilla/importing/metadata_import.py b/protzilla/importing/metadata_import.py index 2b5fe16a4..aacade101 100644 --- a/protzilla/importing/metadata_import.py +++ b/protzilla/importing/metadata_import.py @@ -100,22 +100,26 @@ def metadata_import_method( def metadata_import_method_diann( df: DataFrame, file_path: str, groupby_sample: bool = False ) -> (DataFrame, dict): + """ + This method imports a metadata file with run relationship information and returns the intensity dataframe and the + metadata dataframe. If the import fails, it returns the unchanged dataframe and a dict with a message about the + error. 
+ """ meta_df, msg = file_importer(file_path) if meta_df.empty: return df, dict( metadata=None, messages=[dict(level=messages.ERROR, msg=msg)], ) + if file_path.startswith( f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_" ): os.remove(file_path) - # this indicates a DIANN metadata file with replicate information, we now want to calculate the median across - # all MS runs for a sample then instead of having intensities for each MS run in our dataframe, we - # have intensities for each sample - # note that up until now, "Sample" in the intensity df referred to the ms run if groupby_sample: + # we want to take the median of all MS runs (column "Sample" in the intensity df) for each Sample + # (column "sample name" in the metadata df) res = pd.merge( df, meta_df[["MS run", "sample name"]], From 3565c1a3aeca384d61d064609dae181e044cbeaf Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Thu, 23 Nov 2023 11:40:22 +0100 Subject: [PATCH 21/24] added missing location mapping --- protzilla/constants/location_mapping.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py index a7b32a0d3..9971d04cc 100644 --- a/protzilla/constants/location_mapping.py +++ b/protzilla/constants/location_mapping.py @@ -36,6 +36,11 @@ "ms_data_import", "ms_fragger_import", ): ms_data_import.ms_fragger_import, + ( + "importing", + "ms_data_import", + "diann_import", + ): ms_data_import.diann_import, ( "importing", "metadata_import", From 5bd8231b20bc0064da349f3b40961b556779a175 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Fri, 24 Nov 2023 12:03:30 +0100 Subject: [PATCH 22/24] Revert "Merge pull request #322 from cschlaffner/306-dia-nn-support-for-input-data" This reverts commit 3ab5436ae7f9bf6856682f76239d895e63e38f51, reversing changes made to af789f790ecb78f0e4d49389932a35f35b63214b. 
--- protzilla/constants/location_mapping.py | 10 -- protzilla/constants/workflow_meta.json | 33 ----- protzilla/data_preprocessing/imputation.py | 10 +- protzilla/importing/metadata_import.py | 129 ++++-------------- protzilla/importing/ms_data_import.py | 37 +---- requirements.txt | 1 - tests/diann_intensities.tsv | 6 - tests/diann_run_relationship_metadata.xlsx | Bin 10855 -> 0 bytes .../importing/test_metadata_import.py | 15 -- .../importing/test_ms_data_import.py | 88 ------------ 10 files changed, 38 insertions(+), 291 deletions(-) delete mode 100644 tests/diann_intensities.tsv delete mode 100644 tests/diann_run_relationship_metadata.xlsx diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py index 9971d04cc..d9c075e66 100644 --- a/protzilla/constants/location_mapping.py +++ b/protzilla/constants/location_mapping.py @@ -36,21 +36,11 @@ "ms_data_import", "ms_fragger_import", ): ms_data_import.ms_fragger_import, - ( - "importing", - "ms_data_import", - "diann_import", - ): ms_data_import.diann_import, ( "importing", "metadata_import", "metadata_import_method", ): metadata_import.metadata_import_method, - ( - "importing", - "metadata_import", - "metadata_import_method_diann", - ): metadata_import.metadata_import_method_diann, ( "importing", "metadata_import", diff --git a/protzilla/constants/workflow_meta.json b/protzilla/constants/workflow_meta.json index 9c0d6dc16..aca0c83b9 100644 --- a/protzilla/constants/workflow_meta.json +++ b/protzilla/constants/workflow_meta.json @@ -27,22 +27,6 @@ } } }, - "diann_import": { - "name": "DIA-NN", - "description": "DIA-NN Data Import", - "parameters": { - "file_path": { - "name": "DIA-NN intensities file:", - "type": "file", - "default": null - }, - "map_to_uniprot": { - "name": "Map to Uniprot IDs using Biomart (online)", - "type": "boolean", - "default": false - } - } - }, "ms_fragger_import": { "name": "MS Fragger", "description": "MS Fragger Data Import", @@ -97,23 +81,6 @@ } } }, 
- "metadata_import_method_diann": { - "name": "Metadata Import DIA-NN", - "description": "Import Metadata for run relationships of DIA-NN", - "parameters": { - "file_path": { - "name": "Run-Relationship Metadata file:", - "type": "file", - "default": null - }, - "groupby_sample": { - "name": "Group Replicate Runs by Sample using Median", - "type": "boolean", - "default": false - } - - } - }, "metadata_column_assignment": { "name": "Metadata Column Assignment", "description": "Assign columns to metadata categories, repeatable for each category", diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index d2e28c7bd..160112cfb 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -14,9 +14,9 @@ def by_knn( intensity_df: pd.DataFrame, - number_of_neighbours: int = 5, + number_of_neighbours=5, **kwargs # quantile, default is median -) -> (pd.DataFrame, dict): +) -> tuple[pd.DataFrame, dict]: """ A function to perform value imputation based on KNN (k-nearest neighbors). Imputes missing values for each @@ -60,7 +60,7 @@ def by_knn( def by_simple_imputer( intensity_df: pd.DataFrame, - strategy: str = "mean", + strategy="mean", ) -> tuple[pd.DataFrame, dict]: """ A function to perform protein-wise imputations @@ -249,6 +249,9 @@ def by_normal_distribution_sampling( distribution used for imputation is scaled compared to dataset. 
Default: 1 (no scaling) :type down_shift: float + :param round_values: whether to round the imputed values to the nearest integer + Default: False + :type round_values: bool :return: returns an imputed dataframe in typical protzilla long format\ and an empty dict :rtype: pd.DataFrame, int @@ -259,6 +262,7 @@ def by_normal_distribution_sampling( transformed_df = long_to_wide(intensity_df) # iterate over all protein groups for protein_grp in transformed_df.columns: + number_of_nans = transformed_df[protein_grp].isnull().sum() # don't impute values if there not enough values (> 1) to sample from diff --git a/protzilla/importing/metadata_import.py b/protzilla/importing/metadata_import.py index aacade101..6a82a1bd7 100644 --- a/protzilla/importing/metadata_import.py +++ b/protzilla/importing/metadata_import.py @@ -2,66 +2,41 @@ import pandas as pd from django.contrib import messages -from pandas import DataFrame from protzilla.constants.paths import PROJECT_PATH from protzilla.utilities import random_string -def file_importer(file_path: str) -> tuple[pd.DataFrame, str]: - """ - Imports a file based on its file extension and returns a pandas DataFrame or None if the file format is not - supported / the file doesn't exist. - """ - try: - if file_path.endswith(".csv"): - meta_df = pd.read_csv( - file_path, - sep=",", - low_memory=False, - na_values=[""], - keep_default_na=True, - skipinitialspace=True, - ) - elif file_path.endswith(".xlsx"): - meta_df = pd.read_excel(file_path) - elif file_path.endswith(".psv"): - meta_df = pd.read_csv(file_path, sep="|", low_memory=False) - elif file_path.endswith(".tsv"): - meta_df = pd.read_csv(file_path, sep="\t", low_memory=False) - elif file_path == "": - return ( - pd.DataFrame(), - "The file upload is empty. Please select a metadata file.", - ) - else: - return ( - pd.DataFrame(), - "File format not supported. \ - Supported file formats are csv, xlsx, psv or tsv", - ) - msg = "Metadata file successfully imported." 
- return meta_df, msg - except pd.errors.EmptyDataError: - msg = "The file is empty." - return pd.DataFrame(), msg - - -def metadata_import_method( - df: pd.DataFrame, file_path: str, feature_orientation: str -) -> tuple[pd.DataFrame, dict]: - """ - Imports a metadata file and returns the intensity dataframe and a dict with a message if the file import failed, - and the metadata dataframe if the import was successful. - - returns: (DataFrame, dict) - """ - meta_df, msg = file_importer(file_path) - if meta_df.empty: +def metadata_import_method(df, file_path, feature_orientation): + if file_path.endswith(".csv"): + meta_df = pd.read_csv( + file_path, + sep=",", + low_memory=False, + na_values=[""], + keep_default_na=True, + skipinitialspace=True, + ) + elif file_path.endswith(".xlsx"): + meta_df = pd.read_excel(file_path) + elif file_path.endswith(".psv"): + meta_df = pd.read_csv(file_path, sep="|", low_memory=False) + elif file_path.endswith(".tsv"): + meta_df = pd.read_csv(file_path, sep="\t", low_memory=False) + elif file_path == "": + msg = "The file upload is empty. Please select a metadata file." + return df, dict( + meta_df=None, + messages=[dict(level=messages.ERROR, msg=msg)], + ) + else: + msg = "File format not supported. 
\ + Supported file formats are csv, xlsx, psv or tsv" return df, dict( - metadata=None, + meta_df=None, messages=[dict(level=messages.ERROR, msg=msg)], ) + # always return metadata in the same orientation (features as columns) # as the dtype get lost when transposing, we save the df to disk after # changing the format and read it again as "Columns"-oriented @@ -80,56 +55,6 @@ def metadata_import_method( f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_" ): os.remove(file_path) - if "replicate" in meta_df.columns: - # this indicates a DIANN metadata file with replicate information, we now want to calculate the median across - # all MS runs for a sample then instead of having intensities for each MS run in our dataframe, we - # have intensities for each sample - # note that up until now, "Sample" in the intensity df referred to the ms run - res = pd.merge( - df, - meta_df[["MS run", "sample name"]], - left_on="Sample", - right_on="MS run", - how="left", - ) - res.groupby(["Protein ID", "sample name"], as_index=False).median() - - return df, {"metadata": meta_df, "messages": [dict(level=messages.INFO, msg=msg)]} - - -def metadata_import_method_diann( - df: DataFrame, file_path: str, groupby_sample: bool = False -) -> (DataFrame, dict): - """ - This method imports a metadata file with run relationship information and returns the intensity dataframe and the - metadata dataframe. If the import fails, it returns the unchanged dataframe and a dict with a message about the - error. 
- """ - meta_df, msg = file_importer(file_path) - if meta_df.empty: - return df, dict( - metadata=None, - messages=[dict(level=messages.ERROR, msg=msg)], - ) - - if file_path.startswith( - f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_" - ): - os.remove(file_path) - - if groupby_sample: - # we want to take the median of all MS runs (column "Sample" in the intensity df) for each Sample - # (column "sample name" in the metadata df) - res = pd.merge( - df, - meta_df[["MS run", "sample name"]], - left_on="Sample", - right_on="MS run", - how="left", - ) - res = res.groupby(["Protein ID", "sample name"], as_index=False).median() - res.rename(columns={"sample name": "Sample"}, inplace=True) - return res, {"metadata": meta_df} return df, {"metadata": meta_df} diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py index 59f20717f..98a58128a 100644 --- a/protzilla/importing/ms_data_import.py +++ b/protzilla/importing/ms_data_import.py @@ -9,9 +9,7 @@ from protzilla.data_integration.database_query import biomart_query -def max_quant_import( - _: pd.DataFrame, file_path: str, intensity_name: str, map_to_uniprot=False -) -> (pd.DataFrame, dict): +def max_quant_import(_, file_path, intensity_name, map_to_uniprot=False): assert intensity_name in ["Intensity", "iBAQ", "LFQ intensity"] if not Path(file_path).is_file(): msg = "The file upload is empty. Please provide a Max Quant file." 
@@ -38,9 +36,7 @@ def max_quant_import( return transform_and_clean(intensity_df, intensity_name, map_to_uniprot) -def ms_fragger_import( - _: pd.DataFrame, file_path: str, intensity_name: str, map_to_uniprot=False -) -> (pd.DataFrame, dict): +def ms_fragger_import(_, file_path, intensity_name, map_to_uniprot=False): assert intensity_name in [ "Intensity", "MaxLFQ Total Intensity", @@ -85,33 +81,7 @@ def ms_fragger_import( return transform_and_clean(intensity_df, intensity_name, map_to_uniprot) -def diann_import(_, file_path, map_to_uniprot=False) -> (pd.DataFrame, dict): - if not Path(file_path).is_file(): - msg = "The file upload is empty. Please provide a DIA-NN MS file." - return None, dict(messages=[dict(level=messages.ERROR, msg=msg)]) - - df = pd.read_csv( - file_path, - sep="\t", - low_memory=False, - na_values=["", 0], - keep_default_na=True, - ) - df = df.drop( - columns=["Protein.Group", "Protein.Names", "Genes", "First.Protein.Description"] - ) - # rename column names of samples, removing file path and ".raw" if present - intensity_df = df.rename(columns=lambda x: re.sub(r"(.*[/\\])|(.raw)", r"", x)) - intensity_df = intensity_df.rename(columns={"Protein.Ids": "Protein ID"}) - - intensity_name = "Intensity" - - return transform_and_clean(intensity_df, intensity_name, map_to_uniprot) - - -def transform_and_clean( - df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool -) -> (pd.DataFrame, dict): +def transform_and_clean(df, intensity_name, map_to_uniprot): """ Transforms a dataframe that is read from a file in wide format into long format, removing contaminant groups, and processing protein ids, removing invalid ones @@ -127,6 +97,7 @@ def transform_and_clean( :rtype: tuple[pd.DataFrame, list[str], list[str]] """ assert "Protein ID" in df.columns + contaminant_groups_mask = df["Protein ID"].map( lambda group: any(id_.startswith("CON__") for id_ in group.split(";")) ) diff --git a/requirements.txt b/requirements.txt index 65108ffb1..2d41a18ff 100644 
--- a/requirements.txt +++ b/requirements.txt @@ -33,4 +33,3 @@ protgraph @ git+https://github.com/antonneubauer/ProtGraph@master joblib==1.2.0 networkx==3.1 beautifulsoup4==4.12.2 -openpyxl==3.1.2 diff --git a/tests/diann_intensities.tsv b/tests/diann_intensities.tsv deleted file mode 100644 index 92e023553..000000000 --- a/tests/diann_intensities.tsv +++ /dev/null @@ -1,6 +0,0 @@ -Protein.Group Protein.Ids Protein.Names Genes First.Protein.Description D:\MPL\Gereon\20230418 Hela CM 24h prodi stork\LM07061.raw /home/sampleuser/data/LM07062.raw D:\MPL\Gereon\20230418 Hela CM 24h prodi stork\LM07063.raw -A0A087WWU8 A0A2R2Y2Q3;J3KN67;A0A087WWU8;A0A494C0P6 A0A087WWU8_HUMAN TPM3 Tropomyosin alpha-3 chain 329042.0 367477.0 381325.0 -A0A0B4J2A2;P0DN37 A0A0B4J2A2;P0DN37 PAL4C_HUMAN;PAL4G_HUMAN PPIAL4C;PPIAL4G Peptidyl-prolyl cis-trans isomerase A-like 4C 138322.0 572539.0 96522.7 -A0A0G2JPD3;A0A140T8W8;A0A140T8Y4;A0A1W2PPF8;A0A1W2PR61;Q5SPM2 A0A0G2JPD3;A0A140T8Y4;A0A1W2PR61;Q5SPM2;A0A140T8W8;A0A1W2PPF8 A0A0G2JPD3_HUMAN;A0A140T8W8_HUMAN;A0A140T8Y4_HUMAN;A0A1W2PPF8_HUMAN;A0A1W2PR61_HUMAN;Q5SPM2_HUMAN HLA-A HLA class I histocompatibility antigen, A alpha chain -A0A0U1RQV3 A0A0U1RQV3 A0A0U1RQV3_HUMAN EFEMP1 EGF-containing fibulin-like extracellular matrix protein 1 (Fragment) 122984.0 59042.7 72372.5 -A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;Q53Z42 A0A140T913;A0A140T9I0;A0A140T9X5;A0A1W2PRT9;Q53Z42;A0A140T933;A0A140T955;A0A1W2PPQ2 A0A140T913_HUMAN;A0A140T933_HUMAN;A0A140T955_HUMAN;A0A140T9I0_HUMAN;A0A140T9X5_HUMAN;A0A1W2PPQ2_HUMAN;A0A1W2PRT9_HUMAN;Q53Z42_HUMAN HLA-A HLA class I histocompatibility antigen, A alpha chain 36317.0 27456.7 diff --git a/tests/diann_run_relationship_metadata.xlsx b/tests/diann_run_relationship_metadata.xlsx deleted file mode 100644 index 6a51ecb35a4c8c9c2bc17dee751e5671c4d8de04..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10855 
zcmeHtg;yNu^7h~mBse6<;4XtCcyK4k;1(dblfgZ>ySqzpcL@?)f(3#Ef&>jN!GDwN z-Y>iCet*HeJ?C_vIX%yFdaCQGs<+-!l!b%G1t0>D0RR9wK$y#S+y({!fJXoTZ~(}# z+M>2L4#qYPdMd7V#`d}_F4k6Lx$v-b*#KDR@BerF7h9lAWzeP@gxRjXBPiCP7LpTG zQi4p}Laa@-g_P_Pt3lPU@r3Tilh#B@wHmpLTFGi$o92X{uza3IQ$%RZisedEfl42> zD54y4BycO0CGMMFrWR2f9ri4n5dGi)ajG6ku4(=?iv`6!1vUP&xI1^Oy451QZBU&HL*YHnRoHf-<2e6 z$ylpsaMZiu^PPweSIt(Ksc~+$SdT(INwGiK;~Xd`4`CkCGTxKFK)G`vH1;^NWy(Jx z@EPAC%Vdi_a&7lVwhzQKUjlq-`Hpo;Y+nw$g>rPN#zk;Qa2<~kInB)-l|BzhayB4{ zso*_BgZDw(6NQ+~ore=&mM&l{W#kG2R}zmfi0+^|yT6A6DE>{R^{ODsQ>gFBLGPkM zWvT};wz6ks`FZ}IJpYR|`Io<57W-162ZR=QAaxToa51wKjVU7QEG*SZuHyApdhv06 zL>?8m^7=;w z%#4Wg_wvB1My~9gxbZaa*~G$abirUAxxL90yaD^yrk^W32CT@>E}yC@oAH=ez0QR2 zQn>0FS-n3IOY6kB^{P|LVju{zsAEMME&TK>#qWvbS;dEA(HUX!;-z0OE!jR1=l zUqEBRyz0!idDa*;7H$MQkf~f;$^5;SA+mWZPX=VE6innE*grnPZ#dSmwrAV3gZZE8G7^rq`m$-N2ZmHzPS;tTVv%DO=^u_3j%bY3c9=vE z3&WQd!(ZttH7xn7nHIgDQQ<#h9&%ZN?G@EbI5q~%)bpaUkP@+Mi6R&r1m}H+ztQkNYLN@% z>-{9+Z5?7cd~};aE)SATp_f+!J);wsjC!8+b;PFyP~TyF^W!Dr|X&aQv~xs_Q^&TNcAYA!s802dT0b48?P|FEsMD!{{Y{!CkN0IwDR?BxAYCd^`x?F0X zSDPCoCC|^Uvri|g?>a*Y`rD6y@Uk2U2yl}iFiDoTJPHWEi)zS{p3|ixm^BB~c=}`r z{`J6xt_O_cVMpiH+~vgV;^YGiAr9;}nXf!C~oWXKAm7%@TaJ zW;nU8;L|#e_4!;s4;mo0CkRE&ai<$DOK?{c8@YFc@Gr1WOh%6OrqI5@<|z@`@x#0U z&aRSO>t9XMB_iNKsLjsdmy3uRLd=k~p2ER)91KcC5*VP;iHIm0;Q;gKAlB=>?>0uH z+vL(uBg!6mpA5eS~&e9{S2pDoa2_7s2& z1J&st`RuQ1{ZDR#ftH?7mjCXf4E$2I7g~Cu-UWf2)12_IK0B~d?5XZPMjouAU!Wo9 z@H$;4VrbFT_$12;XB+5xFxu~Q@eE@X4(F_ku{abP-VMjzx|-2eiD>dNFjFGJQ>JoF$QvEx3K?6G6aq7!*%O# zul$ww%6;n^8_81XO*ZbEMHkKyvwU5F6DLnlr1{ZZ0TWx(wFJQf;tbX?=GFHX@i%v6$t*(4oD%{XU=4o?cHOStXy!P+%aK4bNZz z(fkg++TCp7%IS4|wV{*p2a)7ev5G=MRn zI@SHAj_;cVD!;8?-D@Sx@Z@hbxg_M(;~+UDDVP*2B=m#>GJM>N3ywnO){K z7eOtw;T2fb2{Ko*s0CtSMzy8k8`=SmSv|?9kOmiVm&Z2j<@dtUvIY?x0SA;Rjy!^i zg2u{yA2Log?v;~4Cisyx<|{-DR~j;35642k==(kHmT=ZMX|;kxyUN@fzp08;`<%-~ z>M`hc*6&L(Gs~+xDyL+&`He_=F zm@VWY6cZh5@}jYCFD-(|0U_Aodm!RqK$4_}514+*ziH{#TJPlRShfk%(4CrwmhZ(@CLxbx_iq5{!X2?LNlu$j|(0y+A|b!*r!sI`=3 
zdb?cn?sZ6Jt5^WqlZY$V4!Xb#+8zV`)NrK>w%Gd6%3uEQ>lB}KVB-tImDKxi3KQ6P z$0fvmDvnI~nYIyat)UTE5K(a9Z0^)am?y~}Km0DS$9;LZ@_T#vaAnlNjJokaS>X8j z^YHO8qS~t*>xGPEugcqxXN~U0pIDttcWUhSO8ttW!fnykN~7k%n*2L8Td~AbA%w;Z8PWVO2t?| zBgMm=@H(j&E0{U%SW3;JCKJp#YUiA&)% z%ZGAQ8=FPHO@Tsoihn}z4z}q@(XkW9e}ZU#g5JT&!u|ve%1me(KAU*=DMo#oBzh^A zU-c{K71I~SL7f zAVy+Z+rm(OgPsu6SNhqne2z?x0$=vask(JX@+GI5-=DILDJK=8~Zk> z$FBS=3$)!^$_m1)eZO8=$Z+GYOPlHC)5Vk4FPN#tYg%%J)0g3LkAS#o)d9zptT}@L zn%FZXUxtN8{Kf>jvtUVi*8m9fr0qaLMjO%8H>#z$L%)R9Z1?xSUNL8c@S|=aH1SII zb$}WcR>4$zScRMKKt6|XhCe5Ti<`EA)4iw5{Y7YsMBRuZibASDx0iFEe!A>1%ayGH z=Kd;6r9=QI#@fLj6z1Y*v(>m1a#Jp>G)Qv?_D6+hE8N*)dgdG2On!akyn)+gvkfR+ z^z0>-B|rX(^lxFOivY7j9(t#b>}Pl6SJ-hdGqyHn{dN8oarQJv!-%+WI|;4?(d?aX zo~=bue_0u~id&*G%7`bfZ`e~(<6w_%Bf^2D<~ak?1F-`AHxP6vs!uzGV zDYNkViAnsvB;zv{fXfLfV}HEzeXZ#3$xcqWhn^tg?f$M!;VWB=2zof(vU*81bx!qK zPmM@$z!rF)gna(kAd`)cbk%+Gf({$o%j}Z+c9Woy6@NmFMr;F7Q`9E=p1oO4-$P@< z{&>5~Vv-EmPs_bP53X3J|7`bGtFvLU2*!-lH=(d$j(0b~CfWd9@oFZgRsBG982`&h zJ>A-7l9H%^*=!Z>sz4xG%ZfN^r7-;qot~f!lOogI*=$@r?VefL@2?aaPc0S2q#;d# zECKZv&du|9M%`PHIA75Ggxh23$CaYdOZULaavBs|8(QJ}>@NUsVZl_=tAiiV6h0!? 
zG=uF0%c%aSeBpCfNEzAyV$V;|)eq4i0-w9Q?j4kCxM9qm z;UZ(r6mY*g+&&5F6!1RXy6n-bX|89GwLzj*R^Fe9(!0OCCHSJ(d3U&T#I-$5d(_?A zd2>0O)Omlzdsel&h{xLDd3?MbN3iU9wy6+BGQU7fl6*)WQhiwQ)$o8O%)kfZ`Upla zqZcI78!LphsJOKMjn{21svFdyG0waQQ8QI!N~J_K!lGfg6QMKj zN=fSQ;h|lo+)~_0b4Y8*#~90f7MU`@#U@9#`YQNyM3wP3^ib@OLhi0f<<4r+aFWti z(cq0)0rpk7s*uxt%u2uDDQZ!OJ~f!7PV{x~aJMm;|NQaj7wgKNV#^r;4?tFx{hH*`* z+xt9L7P2+j97f0yBj7X_p~!DPFBdz|>qUb>E*|6TCr#iH^_fphK$KXqhHbGCrJ?)Q zgrW&y-6A!fHu%bTjHJ3PJf$5-;NV*oW<1Z|Fjqes!okOT@-AV%l~CCsypPbm@rAv{ zM+1)G&E^Sp3jG9W6^pGTEM&pa4ck%Ku zLuzPmoQFuAVeAnRqO^~$<9XBQj5o!6YEdqLOqOnDnTzl3V$1A5xvJW~n)z{mU1Boz z>P4mTm}Sa|R^QqAnXpAln}^R3&B5_jrb1iw1_yck{xHIaRuN}U__m2~BvA2!QVVUf z1T-K8M4izEx(uH0pLNrOHa7#$jdqneZriC9EcBUQTwuAWA(Myj-r)wArY;NV5Ar+P^%^27-t}<<3r#-{qxNuE?(G;ei|FCwH;1)QlpcNd*7}0VRV%n9-_;EF~*fwK=cMQ??)Bs0ex|J0xpQ^gOQgVj(P z;6$i4d|RcnILZF+vW2kreM`n@sJ}t)Y19F@uxM?#|JQ|LN%a^N541Q)L;(QM z|46^~4z5CkwkEJ~b`mV?+26W3Ke|aEPKVn&x&o z#`T3#2Ng@IjW8FQdkA$X#bZ|~wj*&$Znyjv6B(L0!4ps}yE;PLF1&a1csjjAJ$Vh8 zj&$k@N%@iZ)t#4DXG1e;sqk5FifO)=ek#jGCQ)y{zG95nr9GA6mh#vud{fahIRJ2sZdRiR${|w%5Q9H--jf2^wi*fy3Oy$(Aj)laJiD0+G??r)Rgqzp# zh|k^0qzsD%MC(WN{jSb#D|{EF&Ei;)%D>pw&8F_%kUiD4(+-M$UVv&g>?60^j8ni? 
zRN#|u(eP!iQVH*-BA!jL#asteqZVs7++%0xfiwSvm#fPJE`tkm+(?*6i^>JNL8C0} zOONywb0Yqb^bWvP$RpN^VDV_~ip-}Z9pYu`cdfO5VzS##AF~)LmrR6=Ny@FA(St3u zlcXEj5;3|or)BDH^xPv(b2{Gry3*$uW|>Wq3}{0MR`oC?ns2aZ)UMlr%;`ZI5`CQ* z%+W{|ttTYR&*9arcQ@_iTO+N9W>0W$yrY@IXpOgR*9({t$h=hy#&<0B=m$DdQ0|N* zNr6Z8^XxXs{9tYUS~HB9ozLGmoIL6s+b&?g;@44W+w5Nip}Z&)obICj zqrvvjSe{4f$}iT3L%?@%t*`Bp^UCwtFMqIrA|PA)mcvGw;e?>}lhqUt&)5N?9Q+5h@j z^$J?S{&QtLZ*9ti4OJfvRA;z12604xBMp``Lk{aUM2q!0gt@hhfO81~-o%ey2>aiT}Sxzx7 z(7nA{S{2g^tg0Igh0J~sP<1@pH03QANz{}>2SPF{0;HgOCxxoX0D=u_G24^laQB!( z4*nEljZ#Lw!W|!@P z+D-&&N6J5J__LMuXLsU{-q!CP#jnnme^gg!Hwah!0`5-K#Xas*0Gg;W7e$lmCfw_* zdBlaJ=nLFi_gZ4(_Zu5_;~US@uKC{)nP9{)O}D@XtQ!)TRUNv|Gj|GCbMGwTqA>GE z8%(iuq{7L|%Y^htYv7?)2Pje9683oYey)y0c^CcS+h^$zSJ2>P7}3YF5)12NJ)}BT z`T7HYbK%hZ=!avL!`9~EfP(0t@eXsT>wsnvXJr$PZ6O0Om zd3~cTw5&{t4hy)aq+i#}AWdlK<5gW9efIvFxSpq#X4dM*yzhB?`xFw5m-e)1@-}B* ztxEBQrAK-!bEmXnbj6*I!i1k=ZF{RM$ zQov)5y7ZcjslLVl9Ic%1ymW*gz-$FcTd7VK$7!jm^*uDl6`i)KF;1nbjJTlVp}xg$ zx~56aBtCMZrJ;z&5}EqqD{7WQ=7re`{cGrIJPT*`Hk$gM8&hFu!#X3Prm*w{QU)C} zKfC2*-~VvBS;Zsi3wrCSrWxp3Ku=nmS;dQ7Y*N0qI`59`>XIxPE-56wm`Ag(i~|>b zVqxD~1V$|5t0OjD-f%&4{9<1m#i9*!jdPm$-m+oUW1sEnG(o(0tW}qiVD_C|JB?>k z9V?>(@1`5P>taBwc3(xm?l$AXK8z!t(t&bAJVo>r&FOm7+$_pzM)T9j`MNk65a zI_{`sTyj>Zh+B*!c5c@&sF_|i#PEDmTr>Ixzup7Z^PN27tCts|ZqaGEY<44g=L<&S zV@xH$RdqX^zx%Fz2vn6JQY>KJm-kv|IMrjJUWV)UTj4Sv`Vz`ek%G zQ{K>OGO@Qt0gfPVGwA9D=#rIk10Cl0r$&jUtqp{r5y}wScE^IY-HmJw6(P2E_N<1s z5aT}wJ^t5ihX$|kSUuSu5blR%gig^x_q4(UHqiy2p-f`5bZ!sS!6h}TyJ)diPxmRO z(uB(u!+oC1jda=24@%D`2sG3!RViZg>0m$97teHvRi+v9OU9hYp?oq!pyu@v4cj>{ zTnU6&8(7!F>{{9>kChA%FCmkX*29Vdg|u?^%UvBO^vJ1ANgU?8&^ECCPL06fi=jTr2wXkxNyL^9sZx zyVR$ZAB6T$(KIlXg2jSh2{xF$J?5E|5(+d)i)xdNh(zcH&E4+5wBPP}5xgNBu=l;k z*^FO&8B8Fgx>9PZBG<^nE0`S?l#q<-C|KQm<%j3-1vkVE^@{o_uThjMLxYy4 zMv+gAD!%ZcvhbqM(C{K1j#8+P8l2g2wa`iGC)dGH+JI;_#^=Rm@p=Af#>(t{&&k9u zx7IkE0RFzTT`I&q%}%6g2N0e|U^|u(L>wuLbjnYGOe@Pyc@&;PZRt2sXgCM{wWA(W zBRtDL);YHY59$y6AVJgI5}gt5eb 
zoVKYK5gs=|+nd81Gr2cvg{@;2zFBuoH_(>vAC3$I%M7iG|9x2QKQHb-*MAwEQKfe%L;enUr~73qUPhlk+5@3;Pj0ssz=eu4i#yRQ#<9&W<@ zW-3Dae-H6z(fhmHeaP~#r~aFT6uP+w{VNYU><ONc#6w|Ca{&POKk{hd9{>OV diff --git a/tests/protzilla/importing/test_metadata_import.py b/tests/protzilla/importing/test_metadata_import.py index 7e6c6e30e..5f29ce38f 100644 --- a/tests/protzilla/importing/test_metadata_import.py +++ b/tests/protzilla/importing/test_metadata_import.py @@ -23,21 +23,6 @@ def test_metadata_import(): rmtree(RUNS_PATH / name) -def test_metadata_import_diann(): - name = "test_run" + random_string() - run = Run.create(name) - run.step_index += 1 - run.calculate_and_next( - metadata_import.metadata_import_method_diann, - file_path=f"{PROJECT_PATH}/tests/diann_run_relationship_metadata.xlsx", - ) - test_metadata = pd.read_excel( - f"{PROJECT_PATH}/tests/diann_run_relationship_metadata.xlsx" - ) - pd.testing.assert_frame_equal(test_metadata, run.metadata) - rmtree(RUNS_PATH / name) - - def test_metadata_orientation(): name1 = "test_run" + random_string() name2 = "test_run" + random_string() diff --git a/tests/protzilla/importing/test_ms_data_import.py b/tests/protzilla/importing/test_ms_data_import.py index 3640cc14c..0cf0ac6a1 100644 --- a/tests/protzilla/importing/test_ms_data_import.py +++ b/tests/protzilla/importing/test_ms_data_import.py @@ -113,80 +113,6 @@ def ms_fragger_import_intensity_df(intensity_name): return ms_fragger_df -def diann_import_intensity_df(): - diann_intensity_df = { - "Sample": { - 0: "LM07061", - 1: "LM07061", - 2: "LM07061", - 3: "LM07061", - 4: "LM07061", - 5: "LM07062", - 6: "LM07062", - 7: "LM07062", - 8: "LM07062", - 9: "LM07062", - 10: "LM07063", - 11: "LM07063", - 12: "LM07063", - 13: "LM07063", - 14: "LM07063", - }, - "Protein ID": { - 0: "A0A087WWU8;A0A2R2Y2Q3;A0A494C0P6;J3KN67", - 1: "A0A0B4J2A2;P0DN37", - 2: "A0A0G2JPD3;A0A140T8W8;A0A140T8Y4;A0A1W2PPF8;A0A1W2PR61;Q5SPM2", - 3: "A0A0U1RQV3", - 4: 
"A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;Q53Z42", - 5: "A0A087WWU8;A0A2R2Y2Q3;A0A494C0P6;J3KN67", - 6: "A0A0B4J2A2;P0DN37", - 7: "A0A0G2JPD3;A0A140T8W8;A0A140T8Y4;A0A1W2PPF8;A0A1W2PR61;Q5SPM2", - 8: "A0A0U1RQV3", - 9: "A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;Q53Z42", - 10: "A0A087WWU8;A0A2R2Y2Q3;A0A494C0P6;J3KN67", - 11: "A0A0B4J2A2;P0DN37", - 12: "A0A0G2JPD3;A0A140T8W8;A0A140T8Y4;A0A1W2PPF8;A0A1W2PR61;Q5SPM2", - 13: "A0A0U1RQV3", - 14: "A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;Q53Z42", - }, - "Gene": { - 0: np.nan, - 1: np.nan, - 2: np.nan, - 3: np.nan, - 4: np.nan, - 5: np.nan, - 6: np.nan, - 7: np.nan, - 8: np.nan, - 9: np.nan, - 10: np.nan, - 11: np.nan, - 12: np.nan, - 13: np.nan, - 14: np.nan, - }, - "Intensity": { - 0: 329042.0, - 1: 138322.0, - 2: np.nan, - 3: 122984.0, - 4: 36317.0, - 5: 367477.0, - 6: 572539.0, - 7: np.nan, - 8: 59042.7, - 9: np.nan, - 10: 381325.0, - 11: 96522.7, - 12: np.nan, - 13: 72372.5, - 14: 27456.7, - }, - } - return pd.DataFrame(data=diann_intensity_df) - - @pytest.mark.parametrize( "intensity_name", [ @@ -217,20 +143,6 @@ def test_ms_fragger_import(intensity_name): pd.testing.assert_frame_equal(test_intensity_df, intensity_df) -def test_diann_import(): - test_intensity_df, _ = ms_data_import.diann_import( - _=None, - file_path=f"{PROJECT_PATH}/tests/diann_intensities.tsv", - ) - - intensity_df = diann_import_intensity_df() - - # we do not care about the genes column, it is never used (and replaced by nan) - intensity_df = intensity_df.drop(columns=["Gene"]) - test_intensity_df = test_intensity_df.drop(columns=["Gene"]) - pd.testing.assert_frame_equal(test_intensity_df, intensity_df) - - def test_filter_rev_con(): intensity_df, other = ms_data_import.max_quant_import( _=None, From 67c3a6f20a6d1a51f3fe16f7f6ee6a7cf7020222 Mon Sep 17 00:00:00 2001 From: Lilly Zintl <73845790+lill28@users.noreply.github.com> Date: Sat, 25 
Nov 2023 09:56:31 +0100 Subject: [PATCH 23/24] Docstrings (#289) * correct docstring syntax of data_analysis package * Fix docstring syntax in data_integration package * Fix docstring syntax in utilities package * Fix more docstring syntax * format code * add docstrings for database_integration.py * add returns for database_download.py * add docstrings for database_query.py * add docstrings for upload_handler.py * docstrings runs/views part1 * docstrings 2 runs/views.py * docstrings for runs/fields.py * docstrings for clustergram copy * docstrings for data_analysis/dimension_reduction.py * docstrings for data_analysis/classification.py * add __init__.py * Fix run docstring * Add docs build instructions and packages * update docs build instructions * Implement review suggestions * remove \ in data_preprocessing docstrings * remove more \ in docstrings * adopt PR suggestions --------- Co-authored-by: Fynn Co-authored-by: BelanaZ <66524915+BelanaZ@users.noreply.github.com> Co-authored-by: Sara Grau Co-authored-by: antonneubauer --- docs/Makefile | 20 ++ docs/build_docs.md | 8 + docs/make.bat | 35 +++ docs/source/conf.py | 27 +++ docs/source/index.rst | 20 ++ protzilla/constants/location_mapping.py | 17 +- protzilla/data_analysis/classification.py | 63 ++++- protzilla/data_analysis/clustering.py | 42 ++-- .../differential_expression_anova.py | 6 +- .../differential_expression_linear_model.py | 4 +- .../differential_expression_t_test.py | 29 ++- .../data_analysis/dimension_reduction.py | 59 +++-- protzilla/data_analysis/model_evaluation.py | 2 +- .../data_analysis/model_evaluation_plots.py | 6 +- protzilla/data_analysis/plots.py | 30 +-- protzilla/data_analysis/protein_graphs.py | 103 ++++++-- protzilla/data_integration/__init__.py | 0 .../data_integration/database_download.py | 18 +- .../data_integration/database_integration.py | 29 +++ protzilla/data_integration/database_query.py | 44 +++- protzilla/data_integration/di_plots.py | 4 + 
.../data_integration/enrichment_analysis.py | 8 + .../enrichment_analysis_gsea.py | 4 + .../enrichment_analysis_helper.py | 5 + .../data_preprocessing/filter_proteins.py | 16 +- .../data_preprocessing/filter_samples.py | 16 +- protzilla/data_preprocessing/imputation.py | 81 ++++--- protzilla/data_preprocessing/normalisation.py | 41 ++-- .../data_preprocessing/outlier_detection.py | 39 +-- .../data_preprocessing/peptide_filter.py | 6 +- protzilla/data_preprocessing/plots.py | 69 +++--- .../data_preprocessing/transformation.py | 11 +- protzilla/history.py | 10 +- protzilla/importing/__init__.py | 0 protzilla/run.py | 64 +++-- protzilla/utilities/clustergram.py | 13 + protzilla/utilities/dunn_score.py | 1 + protzilla/utilities/transform_dfs.py | 26 +- protzilla/utilities/utilities.py | 1 + requirements.txt | 2 + tests/protzilla/data_analysis/__init__.py | 0 tests/protzilla/data_integration/__init__.py | 0 tests/protzilla/importing/__init__.py | 0 tests/ui/__init__.py | 0 ui/main/upload_handler.py | 6 +- ui/runs/fields.py | 126 +++++++++- ui/runs/utilities/__init__.py | 0 ui/runs/views.py | 229 +++++++++++++++++- 48 files changed, 1048 insertions(+), 292 deletions(-) create mode 100644 docs/Makefile create mode 100644 docs/build_docs.md create mode 100644 docs/make.bat create mode 100644 docs/source/conf.py create mode 100644 docs/source/index.rst create mode 100644 protzilla/data_integration/__init__.py create mode 100644 protzilla/importing/__init__.py create mode 100644 tests/protzilla/data_analysis/__init__.py create mode 100644 tests/protzilla/data_integration/__init__.py create mode 100644 tests/protzilla/importing/__init__.py create mode 100644 tests/ui/__init__.py create mode 100644 ui/runs/utilities/__init__.py diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..d0c3cbf10 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and 
also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/build_docs.md b/docs/build_docs.md new file mode 100644 index 000000000..42d402fc4 --- /dev/null +++ b/docs/build_docs.md @@ -0,0 +1,8 @@ +## Build docs with Sphinx ## +- Docs are built with sphinx-autoapi +- after installing the packages in requirements.txt, all necessary dependencies for building the docs should be installed (sphinx==7.2.6, sphinx-autoapi==3.0.0, requests==2.31.0) +- to build the docs open the docs\ folder in a terminal and run "make html" to create the html documentation + - in case the error "Could not import extension sphinx.builders.linkcheck" occurs, try reinstalling python requests (pip install requests==2.31.0) + - warnings might occur, they usually do not prevent the successful build of the docs +- To open the docs open the index.html in the docs\build\html folder +- when adding docstrings to the code they should follow the correct syntax (https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html), in order to be formatted correctly in the generated documentation \ No newline at end of file diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 000000000..dc1312ab0 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 000000000..53667bb5e --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,27 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "PROTzilla" +copyright = "2023, BP22/23" +author = "BP22/23" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["sphinx.ext.napoleon", "autoapi.extension"] +autoapi_dirs = ["../../"] + +templates_path = ["_templates"] +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "alabaster" +html_static_path = ["_static"] diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 000000000..3140cb153 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,20 @@ +.. PROTzilla documentation master file, created by + sphinx-quickstart on Wed Sep 27 18:24:09 2023. 
+ You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to PROTzilla's documentation! +===================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py index d9c075e66..03ace1f0e 100644 --- a/protzilla/constants/location_mapping.py +++ b/protzilla/constants/location_mapping.py @@ -20,11 +20,9 @@ ) from ..importing import metadata_import, ms_data_import, peptide_import -""" -In this data structure, a method is associated with a location. The location is -determined by the section, step, and method keys found in the workflow_meta -file that correspond to the method. -""" +# In this data structure, a method is associated with a location. The location is +# determined by the section, step, and method keys found in the workflow_meta +# file that correspond to the method. method_map = { ( "importing", @@ -247,11 +245,10 @@ # reversed mapping of method callable and location location_map = {v: k for k, v in method_map.items()} -""" -In this data structure, a plot for a given method is associated with a -location. The location is determined by the section, step, and method keys -found in the workflow_meta file that correspond to the method. -""" + +# In this data structure, a plot for a given method is associated with a +# location. The location is determined by the section, step, and method keys +# found in the workflow_meta file that correspond to the method. 
plot_map = { ( "data_preprocessing", diff --git a/protzilla/data_analysis/classification.py b/protzilla/data_analysis/classification.py index 3a2ac6772..157331930 100644 --- a/protzilla/data_analysis/classification.py +++ b/protzilla/data_analysis/classification.py @@ -107,33 +107,33 @@ def random_forest( :param metadata_df: A separate dataframe containing additional metadata information. :type metadata_df: pd.DataFrame :param labels_column: The column name in the `metadata_df` dataframe that contains - the target variable (labels) for classification. + the target variable (labels) for classification. :type labels_column: str :param train_test_split: The proportion of data to be used for testing. Default is - 0.2 (80-20 train-test split). + 0.2 (80-20 train-test split). :type train_test_split: int, optional :param n_estimators: The number of decision trees to be used in the random forest. :type n_estimators: int, optional :param criterion: The impurity measure used for tree construction. :type criterion: str, optional :param max_depth: The maximum depth of the decision trees. If not specified (None), - the trees will expand until all leaves are pure or contain minimum samples per leaf. + the trees will expand until all leaves are pure or contain minimum samples per leaf. :type max_depth: int or None, optional :param bootstrap: Whether bootstrap samples should be used when building trees. :type bootstrap: bool, optional :param random_state: The random seed for reproducibility. - :type random_state: int, optional + :type random_state: int :param model_selection: The model selection method for hyperparameter tuning. - :type model_selection: str, optional + :type model_selection: str :param validation_strategy: The strategy for model validation. - :type validation_strategy: str, optional + :type validation_strategy: str :param scoring: The scoring metric(s) used to evaluate the model's performance - during validation. 
- :type scoring: list[str], optional + during validation. + :type scoring: list[str] :param **kwargs: Additional keyword arguments to be passed to the function. :return: A RandomForestClassifier instance, a dataframe consisting of the model's - training parameters and the validation score, along with four dataframes containing - the respective test and training samples and labels. + training parameters and the validation score, along with four dataframes + containing the respective test and training samples and labels. :rtype: dict """ @@ -215,6 +215,49 @@ def svm( scoring: list[str] = ["accuracy"], **kwargs, ): + """ + Perform classification using the support vector machine classifier from sklearn. + + :param input_df: The dataframe that should be classified in wide or long format + :type input_df: pd.DataFrame + :param metadata_df: A separate dataframe containing additional metadata information. + :type metadata_df: pd.DataFrame + :param labels_column: The column name in the `metadata_df` dataframe that contains + the target variable (labels) for classification. + :type labels_column: str + :param C: Regularization parameter + :type C: float + :param kernel: Specifies the kernel type. + :type kernel: str, optional + :param gamma: Kernel coefficient (default: 'scale', relevant for 'rbf', 'poly', and + 'sigmoid'). + :type gamma: str + :param coef0: Independent term in the kernel function (relevant for 'poly' and + 'sigmoid'). + :type coef0: float + :param probability: Whether to enable probability estimates + :type probability: bool, optional + :param tol: Tolerance for stopping criterion + :type tol: float + :param class_weight: Weights associated with classes + :type class_weight: float + :param max_iter: Maximum number of iterations (default: -1, indicating no limit). + :type max_iter: int + :param random_state: The random seed for reproducibility. + :type random_state: int + :param model_selection: The model selection method for hyperparameter tuning. 
+ :type model_selection: str + :param validation_strategy: The strategy for model validation. + :type validation_strategy: str + :param scoring: The scoring metric(s) used to evaluate the model's performance + during validation. + :type scoring: list[str] + :param **kwargs: Additional keyword arguments to be passed to the function. + :return: A dict containing: a SVC instance, a dataframe consisting of the model's + training parameters and the validation score, along with four dataframes + containing the respective test and training samples and labels. + :rtype: dict + """ # TODO 216 add warning to user that data should be to shuffled, give that is being sorted at the beginning! input_df_wide = long_to_wide(input_df) if is_long_format(input_df) else input_df diff --git a/protzilla/data_analysis/clustering.py b/protzilla/data_analysis/clustering.py index f234b664a..acfa64fe7 100644 --- a/protzilla/data_analysis/clustering.py +++ b/protzilla/data_analysis/clustering.py @@ -30,8 +30,8 @@ def k_means( **kwargs, ): """ - A method that uses k-means to partition a number of samples in k clusters. The \ - function returns a dataframe with the corresponding cluster of each sample and \ + A method that uses k-means to partition a number of samples in k clusters. The + function returns a dataframe with the corresponding cluster of each sample and another dataframe with the coordinates of the cluster centers. :param input_df: The dataframe that should be clustered in wide or long format @@ -39,7 +39,7 @@ def k_means( :param metadata_df: A separate dataframe containing additional metadata information. :type metadata_df: pd.DataFrame :param labels_column: The column name in the `metadata_df` dataframe that contains - the true labels of the data + the true labels of the data :type labels_column: str :param positive_label: The positive label for clustering. 
:type positive_label: str @@ -47,23 +47,23 @@ def k_means( :type model_selection: str :param scoring: The scoring metric(s) used for model evaluation. :type scoring: list[str] - :param n_clusters: the number of clusters to form as well as the number of \ - centroids to generate. + :param n_clusters: the number of clusters to form as well as the number of + centroids to generate. :type n_clusters: int :param random_state: Determines random number generation for centroid initialization :type random_state: int - :param init_centroid_strategy: method for centroid initialization. Possible methods\ - are: k-means++ and random + :param init_centroid_strategy: method for centroid initialization. Possible methods + are: k-means++ and random :type init_centroid_strategy: str - :param n_init: Number of times the k-means algorithm is run with different centroid\ - seeds. + :param n_init: Number of times the k-means algorithm is run with different centroid + seeds. :type n_init: int - :param max_iter: Maximum number of iterations of the k-means algorithm for a single\ - run. + :param max_iter: Maximum number of iterations of the k-means algorithm for a single + run. :type max_iter: int - :param tolerance: Relative tolerance with regards to Frobenius norm of the \ - difference in the cluster centers of two consecutive iterations to declare\ - convergence. + :param tolerance: Relative tolerance with regards to Frobenius norm of the + difference in the cluster centers of two consecutive iterations to declare + convergence. :type tolerance: float :returns: A dictionary containing the following elements: - model: The trained Gaussian Mixture Model. @@ -171,9 +171,10 @@ def expectation_maximisation( :param metadata_df: A separate dataframe containing additional metadata information. 
:type metadata_df: pd.DataFrame :param labels_column: The column name in the `metadata_df` dataframe that contains - the true labels of the data + the true labels of the data :type labels_column: str :param positive_label: The positive label for clustering. + :type positive_label: str :param model_selection: The model selection method for hyperparameter tuning. :type model_selection: str :param scoring: The scoring metric(s) used for model evaluation. @@ -183,17 +184,17 @@ def expectation_maximisation( :param covariance_type: The covariance type for the Gaussian Mixture Model. :type covariance_type: str, optional :param reg_covar: Non-negative regularization added to the diagonal of covariance - matrices. + matrices. :type reg_covar: float :param init_params: The method used to initialize the weights, the means and - the precisions. + the precisions. :type init_params: str :param max_iter: The number of EM iterations to perform. :type max_iter: int, optional :param random_state: The random seed for reproducibility. :type random_state: int :param **kwargs: Additional keyword arguments to be passed to the - `perform_clustering` function. + `perform_clustering` function. :returns: A dictionary containing the following elements: - model: The trained Gaussian Mixture Model. - model_evaluation_df: dataframe consisting of the model's parameters and the @@ -275,9 +276,10 @@ def hierarchical_agglomerative_clustering( :param metadata_df: A separate dataframe containing additional metadata information. :type metadata_df: pd.DataFrame :param labels_column: The column name in the `metadata_df` dataframe that contains - the true labels of the data + the true labels of the data :type labels_column: str :param positive_label: The positive label for clustering. + :type positive_label: str :param model_selection: The model selection method for hyperparameter tuning. :type model_selection: str :param scoring: The scoring metric(s) used for model evaluation. 
@@ -287,7 +289,7 @@ def hierarchical_agglomerative_clustering( :param metric: Metric used to compute the linkage. :type metric: str :param linkage: Which linkage criterion to use. The linkage criterion determines - which distance to use between sets of observation + which distance to use between sets of observation :type linkage: str :returns: A dictionary containing the following elements: - model: The trained Gaussian Mixture Model. diff --git a/protzilla/data_analysis/differential_expression_anova.py b/protzilla/data_analysis/differential_expression_anova.py index 31c0cd045..54adcceb2 100644 --- a/protzilla/data_analysis/differential_expression_anova.py +++ b/protzilla/data_analysis/differential_expression_anova.py @@ -40,9 +40,9 @@ def anova( :rtype: pandas DataFrame, dict :return: a dataframe in typical protzilla long format - with the differentially expressed proteins and a dict, containing - the corrected p-values and the log2 fold change, the alpha used - and the corrected alpha, as well as filtered out proteins. + with the differentially expressed proteins and a dict, containing + the corrected p-values and the log2 fold change, the alpha used + and the corrected alpha, as well as filtered out proteins. 
""" # Check if the grouping variable is present in the metadata_df assert grouping in metadata_df.columns, f"{grouping} not found in metadata_df" diff --git a/protzilla/data_analysis/differential_expression_linear_model.py b/protzilla/data_analysis/differential_expression_linear_model.py index 1a0b3440c..9988eaed5 100644 --- a/protzilla/data_analysis/differential_expression_linear_model.py +++ b/protzilla/data_analysis/differential_expression_linear_model.py @@ -42,8 +42,8 @@ def linear_model( :type fc_threshold: float :return: a dataframe in typical protzilla long format with the differentially expressed - proteins and a dict, containing the corrected p-values and the log2 fold change (coefficients), the alpha used - and the corrected alpha, as well as filtered out proteins. + proteins and a dict, containing the corrected p-values and the log2 fold change (coefficients), the alpha used + and the corrected alpha, as well as filtered out proteins. :rtype: Tuple[pandas DataFrame, dict] """ assert grouping in metadata_df.columns diff --git a/protzilla/data_analysis/differential_expression_t_test.py b/protzilla/data_analysis/differential_expression_t_test.py index 22e7eb6f1..7fe3ffc96 100644 --- a/protzilla/data_analysis/differential_expression_t_test.py +++ b/protzilla/data_analysis/differential_expression_t_test.py @@ -45,20 +45,25 @@ def t_test( :type multiple_testing_correction_method: str :param alpha: the alpha value for the t-test :type alpha: float + :param fc_threshold: threshold for the abs(log_2(fold_change)) (vertical line in a volcano plot). + Only proteins with a larger abs(log_2(fold_change)) than the fc_threshold are seen as differentially expressed + :type fc-threshold: float + :param log_base: in case the data was previously log transformed this parameter contains the base (e.g. 2 if the data was log_2 transformed). 
+ If the data was not log transformed the parameter should be "" + :type log_base: int/str :return: a dict containing - a df corrected_p_values, containing the p_values after application of multiple testing correction, - a df log2_fold_change, containing the log2 fold changes per protein, - a float fc_threshold, containing the absolute threshold for the log fold change, above which a protein is considered differentially expressed, - a float corrected_alpha, containing the alpha value after application of multiple testing correction (depending on the selected multiple testing correction method corrected_alpha may be equal to alpha), - a df filtered_proteins, containing the filtered out proteins (proteins where the mean of a group was 0), - a df fold_change_df, containing the fold_changes per protein, - a df t_statistic_df, containing the t-statistic per protein, - a df de_proteins_df in typical protzilla long format containing the differentially expressed proteins; - corrected_p_value, log2_fold_change, fold_change and t_statistic per protein, - a df significant_proteins_df, containing the proteins where the p-values are smaller than alpha (if fc_threshold = 0, the significant proteins equal the differentially expressed ones) - corrected_p_value, log2_fold_change, fold_change and t_statistic per protein, - + a df corrected_p_values, containing the p_values after application of multiple testing correction, + a df log2_fold_change, containing the log2 fold changes per protein, + a float fc_threshold, containing the absolute threshold for the log fold change, above which a protein is considered differentially expressed, + a float corrected_alpha, containing the alpha value after application of multiple testing correction (depending on the selected multiple testing correction method corrected_alpha may be equal to alpha), + a df filtered_proteins, containing the filtered out proteins (proteins where the mean of a group was 0), + a df fold_change_df, containing the
fold_changes per protein, + a df t_statistic_df, containing the t-statistic per protein, + a df de_proteins_df in typical protzilla long format containing the differentially expressed proteins; + corrected_p_value, log2_fold_change, fold_change and t_statistic per protein, + a df significant_proteins_df, containing the proteins where the p-values are smaller than alpha (if fc_threshold = 0, the significant proteins equal the differentially expressed ones) + corrected_p_value, log2_fold_change, fold_change and t_statistic per protein, :rtype: dict """ assert grouping in metadata_df.columns diff --git a/protzilla/data_analysis/dimension_reduction.py b/protzilla/data_analysis/dimension_reduction.py index d38eab6bc..6ec487c65 100644 --- a/protzilla/data_analysis/dimension_reduction.py +++ b/protzilla/data_analysis/dimension_reduction.py @@ -16,10 +16,10 @@ def t_sne( method: str = "barnes_hut", ): """ - A function that uses t-SNE to reduce the dimension of a dataframe and returns a \ + A function that uses t-SNE to reduce the dimension of a dataframe and returns a dataframe in wide format with the entered number of components. - Please note that this function is a simplified version of t-SNE, and it only \ - enables you to adjust the most significant parameters that affect the output. \ + Please note that this function is a simplified version of t-SNE, and it only + enables you to adjust the most significant parameters that affect the output. You can find the default values for the non-adjustable parameters here: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html @@ -29,22 +29,27 @@ def t_sne( :type n_components: int :param perplexity: the perplexity is related to the number of nearest neighbors :type perplexity: float - :param metric: The metric to use when calculating distance between instances in a \ - feature array. 
Possible metrics are: euclidean, manhattan, cosine and haversine + :param metric: The metric to use when calculating distance between instances in a + feature array. Possible metrics are: euclidean, manhattan, cosine and haversine :type metric: str :param random_state: determines the random number generator. :type random_state: int :param n_iter: maximum number of iterations for the optimization :type n_iter: int - :param n_iter_without_progress: Maximum number of iterations without progress \ - before we abort the optimization, used after 250 initial iterations with early \ - exaggeration. Note that progress is only checked every 50 iterations so this \ - value is rounded to the next multiple of 50. + :param n_iter_without_progress: Maximum number of iterations without progress + before we abort the optimization, used after 250 initial iterations with early + exaggeration. Note that progress is only checked every 50 iterations so this + value is rounded to the next multiple of 50. :type n_iter_without_progress: int - :param method: the method exact will run on the slower, but exact, algorithm in \ - O(N^2) time. However, the exact method cannot scale to millions of examples. \ - Barnes-Hut approximation will run faster, but not exact, in O(NlogN) time. + :param method: the method 'exact' will run on the slower, but exact, algorithm in + O(N^2) time. However, the 'exact' method cannot scale to millions of examples. + Barnes-Hut approximation will run faster, but not exact, in O(NlogN) time. :type method: str + :return: a dictionary with a single key, "embedded_data", which contains a new + DataFrame in wide format. This DataFrame consists of the t-SNE embedded data + with two columns, "Component1" and "Component2", and shares the same index as + the input_df. 
+ :rtype: dict """ intensity_df_wide = long_to_wide(input_df) if is_long_format(input_df) else input_df try: @@ -108,10 +113,10 @@ def umap( transform_seed: int = 42, ): """ - A function that uses UMAP to reduce the dimension of a dataframe and returns a \ + A function that uses UMAP to reduce the dimension of a dataframe and returns a dataframe in wide format with the entered number of components. - Please note that this function is a simplified version of UMAP, and it only \ - enables you to adjust the most significant parameters that affect the output. \ + Please note that this function is a simplified version of UMAP, and it only + enables you to adjust the most significant parameters that affect the output. You can find the default values for the non-adjustable parameters here: https://umap-learn.readthedocs.io/en/latest/api.html @@ -119,19 +124,27 @@ def umap( :type input_df: pd.DataFrame :param n_components: The dimension of the space to embed into. :type n_components: int - :param n_neighbors: The size of local neighborhood in terms of number of \ - neighboring sample points + :param n_neighbors: The size of local neighborhood in terms of number of + neighboring sample points :type n_neighbors: float - :param min_dist: the effective minimum distance between embedded points. Smaller \ - values will result in a more clustered/clumped embedding where nearby points on \ - the manifold are drawn closer together, while larger values will result on a more \ - even dispersal of points. + :param min_dist: the effective minimum distance between embedded points. Smaller + values will result in a more clustered/clumped embedding where nearby points on + the manifold are drawn closer together, while larger values will result on a more + even dispersal of points. :type min_dist: float - :param metric: The metric to use when calculating distance between instances in a \ - feature array. 
+ :param metric: The metric to use when calculating distance between instances in a + feature array. :type metric: str :param random_state: determines the random number generator. :type random_state: int + :param transform_seed: Random seed used for the stochastic aspects of the transform + operation. + :type transform_seed: int + :return: a dictionary with a single key, "embedded_data", which contains a new + DataFrame in wide format. This DataFrame consists of the UMAP embedded data + with two columns, "Component1" and "Component2", and shares the same index as + the input_df. + :rtype: dict """ # umap import is slow, so it should only get imported when needed diff --git a/protzilla/data_analysis/model_evaluation.py b/protzilla/data_analysis/model_evaluation.py index 0fba7d783..5a1f045c5 100644 --- a/protzilla/data_analysis/model_evaluation.py +++ b/protzilla/data_analysis/model_evaluation.py @@ -18,7 +18,7 @@ def evaluate_classification_model(model, input_test_df, labels_test_df, scoring) :param labels_test_df: The true labels of the testing data as a DataFrame. :type labels_test_df: pd.DataFrame :param scoring: The scoring metric to be used for evaluation. It can be a string - representing a predefined metric e.g. accuracy, precision, recall, matthews_corrcoef + representing a predefined metric e.g. accuracy, precision, recall, matthews_corrcoef :type scoring: str or callable :return: A dataframe with the metric name and its corresponding score. :rtype: dict diff --git a/protzilla/data_analysis/model_evaluation_plots.py b/protzilla/data_analysis/model_evaluation_plots.py index f1167b210..2d2079d92 100644 --- a/protzilla/data_analysis/model_evaluation_plots.py +++ b/protzilla/data_analysis/model_evaluation_plots.py @@ -8,6 +8,7 @@ def precision_recall_curve_plot(model, input_test_df, labels_test_df, title=None): """ Calculate and plot the precision-recall curve for a classification model. + :param model: The trained classification model instance to be evaluated. 
:type model: BaseEstimator :param input_test_df: The input features of the testing data as a DataFrame. @@ -15,7 +16,7 @@ def precision_recall_curve_plot(model, input_test_df, labels_test_df, title=None :param labels_test_df: The true labels of the testing data as a DataFrame. :type labels_test_df: pd.DataFrame :param title: The title of the precision-recall curve plot. This is an optional - parameter. + parameter. :type title: str, optional :return: Base64 encoded image of the plot :rtype: bytes @@ -34,6 +35,7 @@ def precision_recall_curve_plot(model, input_test_df, labels_test_df, title=None def roc_curve_plot(model, input_test_df, labels_test_df, title=None): """ Calculate and plot the roc curve for a classification model. + :param model: The trained classification model instance to be evaluated. :type model: BaseEstimator :param input_test_df: The input features of the testing data as a DataFrame. @@ -41,7 +43,7 @@ def roc_curve_plot(model, input_test_df, labels_test_df, title=None): :param labels_test_df: The true labels of the testing data as a DataFrame. :type labels_test_df: pd.DataFrame :param title: The title of the precision-recall curve plot. This is an optional - parameter. + parameter. :type title: str, optional :return: Base64 encoded image of the plot :rtype: bytes diff --git a/protzilla/data_analysis/plots.py b/protzilla/data_analysis/plots.py index c158a3fbf..00facc486 100644 --- a/protzilla/data_analysis/plots.py +++ b/protzilla/data_analysis/plots.py @@ -13,14 +13,17 @@ def scatter_plot( color_df: pd.DataFrame | None = None, ): """ - Function to create a scatter plot from data. - - :param input_df: the dataframe that should be plotted. It should have either 2 \ - or 3 dimension + Function to create a scatter plot from data. + + :param input_df: the dataframe that should be plotted. 
It should have either 2 + or 3 dimensions :type input_df: pd.Dataframe - :param color_df: the Dataframe with one column according to which the marks should \ - be colored. This is an optional parameter + :param color_df: the Dataframe with one column according to which the marks should + be colored. This is an optional parameter :type color_df: pd.Dataframe + + :return: returns a list with a plotly figure or a list with a dictionary if an error occurs + :rtype: list[plotly figure]/dict """ intensity_df_wide = long_to_wide(input_df) if is_long_format(input_df) else input_df try: @@ -83,6 +86,9 @@ def create_volcano_plot( :type alpha: float :param proteins_of_interest: the proteins that should be annotated in the plot :type proteins_of_interest: list or None + + :return: returns a list with a plotly figure + :rtype: [plotly figure] """ plot_df = p_values.join(log2_fc.set_index("Protein ID"), on="Protein ID") @@ -149,11 +155,6 @@ def clustergram_plot( input_df: pd.DataFrame, sample_group_df: pd.DataFrame | None, flip_axes: str ): """ - - :param grouping: the column name of the grouping variable in the - metadata_df - :type grouping: str - Creates a clustergram plot from a dataframe in protzilla wide format. The rows or columns of the clustergram are ordered according to the clustering resulting from the dendrogram. Optionally, a colorbar representing the different groups present @@ -170,8 +171,9 @@ def clustergram_plot( :param flip_axes: If "yes", the rows and columns of the clustergram will be swapped. If "no", the default orientation is used. 
:type flip_axes: str - :return: returns a list with a figure or a list with a dictionary if an error occurs - :rtype: [go.Figure] + + :return: returns a list with a plotly figure or a list with a dictionary if an error occurs + :rtype: list[plotly figure]/dict """ try: assert isinstance(input_df, pd.DataFrame) and not input_df.empty @@ -188,7 +190,7 @@ def clustergram_plot( ) # In the clustergram each row represents a sample that can pertain to a # group. In the following code the necessary data structures are created - # to assign eachgroup to a unique color. + # to assign each group to a unique color. sample_group_dict = dict( zip(sample_group_df.index, sample_group_df[sample_group_df.columns[0]]) ) diff --git a/protzilla/data_analysis/protein_graphs.py b/protzilla/data_analysis/protein_graphs.py index 5aefe71f0..999a0dc93 100644 --- a/protzilla/data_analysis/protein_graphs.py +++ b/protzilla/data_analysis/protein_graphs.py @@ -129,8 +129,8 @@ def peptides_to_isoform( :type allowed_mismatches: int :return: dict of path to graph - either the modified graph or the original graph if - the modification failed, the protein id, list of matched peptides, list of unmatched - peptides, messages passed to the frontend + the modification failed, the protein id, list of matched peptides, list of unmatched + peptides, messages passed to the frontend :rtype: dict[str, str, list, list, list] """ @@ -224,11 +224,69 @@ def peptides_to_isoform( ) +def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: + """ + Creates a Protein-Variation-Graph for a given UniProt Protein ID using ProtGraph. + Included features are just `Variation`, digestion is skipped. + The Graph is saved in .graphml-Format. 
+ + This is designed, so it can be used for peptides_to_isoform but works independently + as well + + ProtGraph: https://github.com/mpc-bioinformatics/ProtGraph/ + + :param protein_id: UniProt Protein-ID + :type protein_id: str + :param run_name: name of the run this is executed from. Used for saving the protein + file, graph + :type run_name: str + :param queue_size: Queue Size for ProtGraph, This is yet to be merged by ProtGraph + :type queue_size: int + + :return: dict(graph_path, messages) + """ + + logger.info(f"Creating graph for protein {protein_id}") + run_path = RUNS_PATH / run_name + path_to_protein_file, filtered_blocks, request = _get_protein_file( + protein_id, run_path + ) + + path_to_protein_file = Path(path_to_protein_file) + if not path_to_protein_file.exists() and request.status_code != 200: + msg = f"error while downloading protein file for {protein_id}. Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. Tip: check if the ID is correct" + logger.error(msg) + return dict( + graph_path=None, + filtered_blocks=filtered_blocks, + messages=[dict(level=messages.ERROR, msg=msg, trace=request.__dict__)], + ) + + output_folder_path = run_path / "graphs" + output_csv = output_folder_path / f"{protein_id}.csv" + graph_path = output_folder_path / f"{protein_id}.graphml" + cmd_str = f"protgraph -egraphml {path_to_protein_file} \ + --export_output_folder={output_folder_path} \ + --output_csv={output_csv} \ + -ft VARIANT \ + -d skip" + + subprocess.run(cmd_str, shell=True) + + msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}" + logger.info(msg) + return dict( + graph_path=str(graph_path), + filtered_blocks=filtered_blocks, + messages=[dict(level=messages.INFO, msg=msg)], + ) + + def _create_graph_index( protein_graph: nx.DiGraph, seq_len: int ) -> tuple[list | None, str, dict | None]: """ - create a mapping from the position in the protein (using the longest path) to + Create a mapping from the 
position in the protein (using the longest path) to node(s) in the graph For information about _longest_path() please see the docstring of that function. @@ -314,7 +372,7 @@ def _longest_paths(protein_graph: nx.DiGraph, start_node: str): n4 longest_paths: {n1: 0, n2: 3, n3: 5, n4: 5, n5: 6, __end__: 8} - :param protein_graph: Protein-Graph as created by ProtGraph \ + :param protein_graph: Protein-Graph as created by ProtGraph (-> _create_protein_variation_graph) :type protein_graph: nx.DiGraph :param start_node: Source of protein_graph @@ -442,13 +500,14 @@ def _create_reference_sequence_index( protein_path: str, k: int = 5 ) -> tuple[dict, str, int]: """ - Create mapping from kmer of reference_sequence of protein to starting position(s) \ + Create mapping from kmer of reference_sequence of protein to starting position(s) of kmer in reference_sequence :param protein_path: Path to protein file from UniProt (.txt) :type protein_path: str :param k: length of kmers :type k: int + :return: index {kmer: [starting positions]}, reference sequence, length of reference sequence :rtype: tuple(dict, str, int) @@ -554,8 +613,9 @@ def _potential_peptide_matches( :type peptides: list :param ref_index: mapping from kmer to match-positions on reference sequence :type ref_index: dict(kmer: [starting position]} + :return: dict(peptide: [match start on reference sequence]), - list(peptides without match) + list(peptides without match) :rtype: dict, list """ @@ -604,11 +664,11 @@ def _create_contigs_dict(node_start_end: dict): peptide(s) that is responsible for the match. 
:param node_start_end: dict of peptide to dict of start index of peptide match to - dict of node to tuple of start and end positions of matches within the node + dict of node to tuple of start and end positions of matches within the node :type node_start_end: dict[str, dict[int, dict[str, tuple[int, int]]]] :return: dict of node to list of triple of start position, end position and - peptide(s) responsible for match + peptide(s) responsible for match """ node_match_data = defaultdict(lambda: {"match_locations": []}) @@ -666,22 +726,22 @@ def _match_potential_matches( :param potential_peptide_matches: dict of peptide to list of starting positions :type potential_peptide_matches: dict[str, list[int]] :param graph_index: list of lists, each list contains the nodes and AAs at that - given index along the longest path through the graph + given index along the longest path through the graph :type graph_index: list[list[tuple[str, str]]] :param peptide_mismatches: list of peptides that did not match to the reference - sequence + sequence :type peptide_mismatches: list[str] :param allowed_mismatches: number of mismatches allowed for a peptide to be - considered a match + considered a match :type allowed_mismatches: int :param graph: protein variation graph, as created by ProtGraph - (-> _create_protein_variation_graph) + (-> _create_protein_variation_graph) :type graph: networkx.DiGraph :param longest_paths: length of longest path through the graph to each node :type longest_paths: dict[str, int] :return: dict of peptide to dict of start index of peptide match to dict of node to - tuple of start and end position of match in this node + tuple of start and end position of match in this node :rtype: dict[str, dict[int, dict[str, tuple[int, int]]]] """ @@ -713,7 +773,7 @@ def _match_on_graph( :param allowed_mismatches: number of mismatches allowed per start position :type allowed_mismatches: int :param graph: protein variation graph, as created by ProtGraph - (-> 
_create_protein_variation_graph) + (-> _create_protein_variation_graph) :type graph: networkx.DiGraph :param current_node: current node in the graph, starting with the node of the match start @@ -721,12 +781,13 @@ def _match_on_graph( :param left_over_peptide: peptide that still needs to be matched to the graph :type left_over_peptide: str :param node_match_data: dict of node to tuple of start position, end position + :type node_match_data: dict :param current_index: index of the amino acid in the current node that is being - matched to the peptide + matched to the peptide :type current_index: int :return: tuple of bool, dict of node to tuple of start position, end position, - number of mismatches + number of mismatches :rtype: tuple[bool, dict[str, tuple[int, int]], int] """ @@ -829,12 +890,16 @@ def _modify_graph(graph, contig_positions): will end up without `match`-attribute. :param graph: Protein Graph to be modified - :type: nx.DiGraph + :type graph: nx.DiGraph :param contig_positions: Dict from current_node to contig-positions {current_node: [(start, end)]}. 
- :type: dict(list[tuple]) + :type contig_positions: dict(list[tuple]) + :param longest_paths: mapping from current_node to the longest path to current_node + (-> _longest_paths()) + :type longest_paths: dict + :return: modified protein graph, with contigs & not-matched AAs as nodes, indicated - by current_node attribute `matched` + by current_node attribute `matched` """ def _node_length(node): diff --git a/protzilla/data_integration/__init__.py b/protzilla/data_integration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/protzilla/data_integration/database_download.py b/protzilla/data_integration/database_download.py index f8ee4830e..147c09b84 100644 --- a/protzilla/data_integration/database_download.py +++ b/protzilla/data_integration/database_download.py @@ -33,10 +33,15 @@ def get_batch(batch_url, session): def download_uniprot_paged(name): """ - downloads basic info on all human proteins from the uniprot paged rest api. + Downloads basic info on all human proteins from the uniprot paged rest api. this will take very long due to limitations in the api, therefore stream should be used. code taken from https://www.uniprot.org/help/api_queries including get_next_link and get_batch - parameter name: str = name the database will be saved as + + :param name: name the database will be saved as + :type name: str + + :return: the number of proteins that were downloaded + :rtype: int """ retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) @@ -60,9 +65,14 @@ def download_uniprot_paged(name): def download_uniprot_stream(name): """ - downloads basic info on all human proteins from the streamed uniprot rest api. + Downloads basic info on all human proteins from the streamed uniprot rest api. can fail due to unstable internet connection or problems with the api. 
- parameter name: str = name the database will be saved as + + :param name: name the database will be saved as + :type name: str + + :return: nothing + :rtype: NoneType """ with requests.get( "https://rest.uniprot.org/uniprotkb/stream", diff --git a/protzilla/data_integration/database_integration.py b/protzilla/data_integration/database_integration.py index f461b2e0d..5ff2104ee 100644 --- a/protzilla/data_integration/database_integration.py +++ b/protzilla/data_integration/database_integration.py @@ -6,6 +6,19 @@ def add_uniprot_data(dataframe, database_name=None, fields=None): + """ + Extend a protein dataframe with information from UniProt for each protein. + + :param dataframe: the protein dataframe to be extended + :type dataframe: pd.DataFrame + :param database_name: name of the database file that will be queried + :type database_name: str + :param fields: the fields of the database that will be added to the dataframe + :type fields: list[str] + + :return: the extended dataframe, and a message if applicable + :rtype: dict + """ if not fields: msg = "No fields that should be added specified." return dict( @@ -68,6 +81,22 @@ def add_uniprot_data(dataframe, database_name=None, fields=None): def gene_mapping(dataframe, database_names, use_biomart=False): + """ + Maps the protein ID groups to HGNC gene symbols, filtering out ones that are not + found. + + :param dataframe: the protein dataframe of which the protein ID groups will be + mapped.
+ :type dataframe: pd.DataFrame + :param database_names: names of the database files that will be queried + :type database_names: list[str] | str + :param use_biomart: should biomart be used to map ids that could not be mapped with + databases + :type use_biomart: bool + + :return: the gene mapping, consisting of group_to_genes, gene_to_groups and filtered + :rtype: dict + """ try: groups = dataframe["Protein ID"].unique().tolist() except KeyError: diff --git a/protzilla/data_integration/database_query.py b/protzilla/data_integration/database_query.py index 5f34f638e..2758505f4 100644 --- a/protzilla/data_integration/database_query.py +++ b/protzilla/data_integration/database_query.py @@ -10,6 +10,19 @@ def biomart_query(queries, filter_name, attributes, use_grch37=False): + """ + Construct an XML query for BioMart, send it, decode the result and return it as an + iterator. + + :param queries: what entities to look for with the filter + :type queries: list[str] + :param filter_name: the name of the BioMart category the queries will be searched in + :type filter_name: str + :param attributes: what BioMart categories to return for each found entity + :type attributes: Iterable[str] + :param use_grch37: if truthy, use the outdated GRCh37 biomart endpoint + :type use_grch37: bool + """ if not queries: return @@ -86,12 +99,17 @@ def uniprot_to_genes(uniprot_ids, databases, use_biomart): First uses all uniprot databases that contain genes, then uses biomart to map proteins that have not been found with uniprot if biomart is enabled. 
- :param uniprot_ids: cleaned uniprot IDs, not containing isoforms or other modifications + :param uniprot_ids: cleaned uniprot IDs, not containing isoforms or other + modifications :type uniprot_ids: list[str] :param databases: names of uniprot databases that should be used for mapping :type databases: list[str] - :param use_biomart: should biomart be used to map ids that could not be mapped with databases - :return: a dict that maps uniprot ids to genes and a list of uniprot ids that were not found + :param use_biomart: if true, biomart should be used to map ids that could not be + mapped with databases + :type use_biomart: bool + + :return: a dict that maps uniprot ids to genes and a list of uniprot ids that were + not found :rtype: tuple[dict[str, str], list[str]] """ @@ -125,7 +143,7 @@ def merge_dict(gene_mapping, new_gene_mapping): if not ids_to_search: logger.info( - "All proteins mapped using uniprot, no biomart mapping will be performed." + "All proteins mapped using uniprot, no biomart mapping will be performed." # noqa E501 ) return out_dict, [] if not use_biomart: @@ -151,6 +169,24 @@ def merge_dict(gene_mapping, new_gene_mapping): def uniprot_groups_to_genes(uniprot_groups, databases, use_biomart): + """ + Maps uniprot ID groups to hgnc gene symbols. Also returns groups that could not be + mapped. Merges the mappings per group and creates a reverse mapping, from genes to + groups. 
+ + :param uniprot_groups: groups of UniProt IDs, as found in a protein dataframe, may + contain isoforms and modifications + :type uniprot_groups: list[str] + :param databases: names of uniprot databases that should be used for mapping + :type databases: list[str] + :param use_biomart: should biomart be used to map ids that could not be mapped with + databases + :type use_biomart: bool + + :return: a dict that maps genes to groups, one that maps groups to genes and a list + of uniprot ids that were not found + :rtype: tuple[dict[str, list[str]], dict[str, list[str]], list[str]] + """ proteins = set() for group in uniprot_groups: for protein in group.split(";"): diff --git a/protzilla/data_integration/di_plots.py b/protzilla/data_integration/di_plots.py index f8ae20bf7..c253d604d 100644 --- a/protzilla/data_integration/di_plots.py +++ b/protzilla/data_integration/di_plots.py @@ -42,6 +42,7 @@ def GO_enrichment_bar_plot( :type colors: list, optional :param figsize: Size of the plot, defaults to None and is calculated dynamically if not provided. :type figsize: tuple, optional + :return: Base64 encoded image of the plot :rtype: bytes """ @@ -162,6 +163,7 @@ def GO_enrichment_dot_plot( :type dot_size: int :param figsize: Size of the plot, defaults to None and is calculated dynamically if not provided. :type figsize: tuple, optional + :return: Base64 encoded image of the plot :rtype: bytes """ @@ -276,6 +278,7 @@ def gsea_dot_plot( :type remove_library_names: bool :param figsize: Size of the plot, defaults to None and is calculated dynamically if not provided. :type figsize: tuple, optional + :return: Base64 encoded image of the plot :rtype: bytes """ @@ -350,6 +353,7 @@ def gsea_enrichment_plot( :type neg_pheno_label: str, optional :param figsize: Size of the plot, defaults to None and is calculated dynamically if not provided. 
:type figsize: tuple, optional + :return: Base64 encoded image of the plot :rtype: bytes """ diff --git a/protzilla/data_integration/enrichment_analysis.py b/protzilla/data_integration/enrichment_analysis.py index 2671db6e0..f4e336348 100644 --- a/protzilla/data_integration/enrichment_analysis.py +++ b/protzilla/data_integration/enrichment_analysis.py @@ -33,10 +33,12 @@ def get_functional_enrichment_with_delay(protein_list, **string_params): This method performs online functional enrichment analysis using the STRING DB API via the restring package. It adds a delay between calls to the API to avoid exceeding the rate limit. + :param protein_list: list of protein IDs to perform enrichment analysis for :type protein_list: list :param string_params: parameters for the restring package :type string_params: dict + :return: dataframe with functional enrichment results :rtype: pandas.DataFrame """ @@ -62,6 +64,7 @@ def merge_up_down_regulated_dfs_restring(up_df, down_df): :type up_df: pandas.DataFrame :param down_df: dataframe with enrichment results for downregulated proteins :type down_df: pandas.DataFrame + :return: merged dataframe :rtype: pandas.DataFrame """ @@ -146,6 +149,7 @@ def GO_analysis_with_STRING( - both: functional enrichment info is retrieved for upregulated and downregulated proteins separately, but the terms are aggregated for the result dataframe :type direction: str + :return: dictionary with enrichment dataframe :rtype: dict """ @@ -306,6 +310,7 @@ def merge_up_down_regulated_dfs_gseapy(up_enriched, down_enriched): :type up_enriched: pandas.DataFrame :param down_enriched: dataframe with enrichment results for downregulated proteins :type down_enriched: pandas.DataFrame + :return: merged dataframe :rtype: pandas.DataFrame """ @@ -368,6 +373,7 @@ def gseapy_enrichment( :type background: list or None :param offline: whether to run the enrichment offline :type offline: bool + :return: enrichment results, filtered groups, error message if occurred {level, 
msg, trace(optional)} :rtype: tuple[pandas.DataFrame, list, dict] """ @@ -502,6 +508,7 @@ def GO_analysis_with_Enrichr( :type background_number: int or None :param background_biomart: name of biomart dataset to use as background :type background_biomart: str or None + :return: dictionary with results and filtered groups :rtype: dict """ @@ -706,6 +713,7 @@ def GO_analysis_offline( - both: functional enrichment info is retrieved for upregulated and downregulated proteins separately, but the terms are aggregated for the resulting dataframe :type direction: str + :return: dictionary with results dataframe :rtype: dict """ diff --git a/protzilla/data_integration/enrichment_analysis_gsea.py b/protzilla/data_integration/enrichment_analysis_gsea.py index 73be1e2a8..62cdb2ada 100644 --- a/protzilla/data_integration/enrichment_analysis_gsea.py +++ b/protzilla/data_integration/enrichment_analysis_gsea.py @@ -35,6 +35,7 @@ def create_ranked_df( :type group_to_genes: dict :param filtered_groups: list of protein groups that were filtered out :type filtered_groups: list + :return: ranked dataframe of genes :rtype: pd.DataFrame """ @@ -133,6 +134,7 @@ def gsea_preranked( :type seed: int :param threads: Number of threads :type threads: int + :return: dictionary with results dataframe, ranking, enrichment detail dataframe per enriched gene set and messages :rtype: dict """ @@ -241,6 +243,7 @@ def create_genes_intensity_wide_df( :type group_to_genes: dict :param filtered_groups: list of protein IDs that could not be mapped to gene symbols :type filtered_groups: list + :return: dataframe with genes in rows and samples in columns with intensity values :rtype: pd.DataFrame """ @@ -348,6 +351,7 @@ def gsea( :type seed: int :param threads: Number of threads to use :type threads: int + :return: dict with enriched dataframe, ranking, enrichment detail dataframe per enriched gene set and messages :rtype: dict """ diff --git a/protzilla/data_integration/enrichment_analysis_helper.py 
b/protzilla/data_integration/enrichment_analysis_helper.py index fb40b72a9..8919332c2 100644 --- a/protzilla/data_integration/enrichment_analysis_helper.py +++ b/protzilla/data_integration/enrichment_analysis_helper.py @@ -25,8 +25,10 @@ def read_protein_or_gene_sets_file(path): - .json: {Set_name: [Protein1, Protein2, ...], Set_name2: [Protein2, Protein3, ...]} Empty strings are removed from the list of proteins or genes. + :param path: path to file :type path: str + :return: dict with protein or gene sets, a path to a gmt file or error message :rtype: dict """ @@ -74,8 +76,10 @@ def read_background_file(path): Reads a file of background proteins or genes. Accepts .csv and .txt files with one protein or gene per line. Empty strings are removed from the list of proteins or genes. + :param path: path to file :type path: str or None + :return: list of background proteins or genes or error message :rtype: list """ @@ -110,6 +114,7 @@ def map_to_STRING_ids(proteins_list, organism): :type proteins_list: list :param organism: organism NCBI identifier :type organism: str + :return: list of STRING IDs or None if no IDs could be found :rtype: list or None """ diff --git a/protzilla/data_preprocessing/filter_proteins.py b/protzilla/data_preprocessing/filter_proteins.py index b1b047832..eea81e6a8 100644 --- a/protzilla/data_preprocessing/filter_proteins.py +++ b/protzilla/data_preprocessing/filter_proteins.py @@ -7,15 +7,17 @@ def by_samples_missing(intensity_df, percentage): """ This function filters proteins based on its amount of nan values. If the percentage of existing values is below a threshold (percentage), the protein is filtered out. - :param df: the intensity dataframe that should be filtered\ - in long format + + :param df: the intensity dataframe that should be filtered + in long format :type df: pd.DataFrame - :param percentage: float ranging from 0 to 1. 
Defining the\ - relative share of samples the proteins should be present in inorder to be kept.\ + :param percentage: float ranging from 0 to 1. Defining the + relative share of samples the proteins should be present in order to be kept. :type percentage: float - :return: returns the filtered df as a Dataframe and a dict with a listof Protein IDs\ - that were discarded and a list of Protein IDs\ - that were kept + + :return: returns the filtered df as a Dataframe and a dict with a list of Protein IDs + that were discarded and a list of Protein IDs + that were kept :rtype: Tuple[pandas DataFrame, dict] """ diff --git a/protzilla/data_preprocessing/filter_samples.py b/protzilla/data_preprocessing/filter_samples.py index fbcdd50bc..6e518af8a 100644 --- a/protzilla/data_preprocessing/filter_samples.py +++ b/protzilla/data_preprocessing/filter_samples.py @@ -48,15 +48,17 @@ def by_proteins_missing(intensity_df: pd.DataFrame, percentage): """ This function filters samples based on the amount of nan values. If the percentage of existing values is below a threshold (percentage), the sample is filtered out. - :param df: the intensity dataframe that should be filtered\ - in long format + + :param df: the intensity dataframe that should be filtered + in long format :type df: pd.DataFrame - :param percentage: float ranging from 0 to 1. Defining the\ - relative share of proteins that were detected in the sample in inorder to be kept.\ + :param percentage: float ranging from 0 to 1. Defining the + relative share of proteins that were detected in the sample in order to be kept.
:type percentage: float - :return: returns the filtered df as a Dataframe and a dict with a list of Protein IDs\ - that were discarded and a list of Protein IDs\ - that were kept + + :return: returns the filtered df as a Dataframe and a dict with a list of Protein IDs + that were discarded and a list of Protein IDs + that were kept :rtype: Tuple[pandas DataFrame, dict] """ intensity_name = intensity_df.columns.values.tolist()[3] diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 160112cfb..11c743c02 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -29,17 +29,18 @@ def by_knn( Implements an instance of the sklearn.impute KNNImputer class. https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html - :param intensity_df: the dataframe that should be filtered in\ - long format + + :param intensity_df: the dataframe that should be filtered in + long format :type intensity_df: pandas DataFrame - :param number_of_neighbours: number of neighbouring samples used for\ - imputation. Default: 5 + :param number_of_neighbours: number of neighbouring samples used for + imputation. Default: 5 :type number_of_neighbours: int - :param **kwargs: additional keyword arguments passed to\ + :param **kwargs: additional keyword arguments passed to KNNImputer.fit_transform :type kwargs: dict - :return: returns an imputed dataframe in typical protzilla long format\ - and an empty dict + :return: returns an imputed dataframe in typical protzilla long format + and an empty dict :rtype: pd.DataFrame """ @@ -74,14 +75,15 @@ def by_simple_imputer( no data will be imputed. This function automatically filters out such proteins from the DataFrame beforehand. 
- :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be filtered in + long format :type intensity_df: pandas DataFrame - :param strategy: Defines the imputation strategy. Can be "mean",\ - "median" or "most_frequent" (for mode). + :param strategy: Defines the imputation strategy. Can be "mean", + "median" or "most_frequent" (for mode). :type strategy: str - :return: returns an imputed dataframe in typical protzilla long format\ - and an empty dict + + :return: returns an imputed dataframe in typical protzilla long format + and an empty dict :rtype: pd.DataFrame, int """ assert strategy in ["mean", "median", "most_frequent"] @@ -117,16 +119,17 @@ def by_min_per_sample( If not wanted, make sure to filter 0 intensity samples in the filtering step. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be filtered in + long format :type intensity_df: pandas DataFrame - :param shrinking_value: a factor to alter the minimum value\ - used for imputation. With a shrinking factor of 0.1 for\ - example, a tenth of the minimum value found will be used for\ - imputation. Default: 1 (no shrinking) + :param shrinking_value: a factor to alter the minimum value + used for imputation. With a shrinking factor of 0.1 for + example, a tenth of the minimum value found will be used for + imputation. Default: 1 (no shrinking) :type shrinking_value: float - :return: returns an imputed dataframe in typical protzilla long format\ - and an empty dict + + :return: returns an imputed dataframe in typical protzilla long format + and an empty dict :rtype: pd.DataFrame, dict """ intensity_df_copy = intensity_df.copy(deep=True) @@ -156,16 +159,17 @@ def by_min_per_protein( take a fraction of that minimum value for imputation. CAVE: All proteins without any values will be filtered out. 
- :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be filtered in + long format :type intensity_df: pandas DataFrame - :param shrinking_value: a factor to alter the minimum value\ - used for imputation. With a shrinking factor of 0.1 for\ - example, a tenth of the minimum value found will be used for\ - imputation. Default: 1 (no shrinking) + :param shrinking_value: a factor to alter the minimum value + used for imputation. With a shrinking factor of 0.1 for + example, a tenth of the minimum value found will be used for + imputation. Default: 1 (no shrinking) :type shrinking_value: float - :return: returns an imputed dataframe in typical protzilla long format\ - and an empty dict + + :return: returns an imputed dataframe in typical protzilla long format + and an empty dict :rtype: pd.DataFrame, dict """ transformed_df = long_to_wide(intensity_df) @@ -200,16 +204,17 @@ def by_min_per_dataset( the dataframe. The user can also assign a shrinking factor to take a fraction of that minimum value for imputation. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be filtered in + long format :type intensity_df: pandas DataFrame - :param shrinking_value: a factor to alter the minimum value\ - used for imputation. With a shrinking factor of 0.1 for\ - example, a tenth of the minimum value found will be used for\ - imputation. Default: 1 (no shrinking) + :param shrinking_value: a factor to alter the minimum value + used for imputation. With a shrinking factor of 0.1 for + example, a tenth of the minimum value found will be used for + imputation. 
Default: 1 (no shrinking) :type shrinking_value: float - :return: returns an imputed dataframe in typical protzilla long format\ - and an empty dict + + :return: returns an imputed dataframe in typical protzilla long format + and an empty dict :rtype: pd.DataFrame, dict """ intensity_df_copy = intensity_df.copy(deep=True) @@ -262,7 +267,6 @@ def by_normal_distribution_sampling( transformed_df = long_to_wide(intensity_df) # iterate over all protein groups for protein_grp in transformed_df.columns: - number_of_nans = transformed_df[protein_grp].isnull().sum() # don't impute values if there not enough values (> 1) to sample from @@ -391,7 +395,6 @@ def _build_box_hist_plot( 2. a graph summarising the amount of filtered proteins. - """ if graph_type == "Boxplot": fig1 = create_box_plots( diff --git a/protzilla/data_preprocessing/normalisation.py b/protzilla/data_preprocessing/normalisation.py index 11c82f0b5..85d32433f 100644 --- a/protzilla/data_preprocessing/normalisation.py +++ b/protzilla/data_preprocessing/normalisation.py @@ -14,11 +14,12 @@ def by_z_score(intensity_df: pd.DataFrame) -> tuple[pd.DataFrame, dict]: Scales the data to zero mean and unit variance. This is often also called z-score normalisation/transformation. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be filtered in + long format :type intensity_df: pd.DataFrame - :return: returns a scaled dataframe in typical protzilla long format and an empty\ - dictionary + + :return: returns a scaled dataframe in typical protzilla long format and an empty + dictionary :rtype: Tuple[pandas DataFrame, dict] """ @@ -55,14 +56,15 @@ def by_median( Divides each intensity by the chosen intensity quartile of the respective sample. By default, the median (50%-quartile) is used. 
- :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be filtered in + long format :type intensity_df: pandas DataFrame - :param percentile: the chosen quartile of the sample intensities for\ - normalisation + :param percentile: the chosen quartile of the sample intensities for + normalisation :type percentile: float - :return: returns a scaled dataframe in typical protzilla long format\ - and a dict, containing all zeroed samples due to quantile being 0 + + :return: returns a scaled dataframe in typical protzilla long format + and a dict, containing all zeroed samples due to quantile being 0 :rtype: Tuple[pandas DataFrame, dict] """ @@ -116,11 +118,12 @@ def by_totalsum(intensity_df: pd.DataFrame) -> tuple[pd.DataFrame, dict]: Normalises the data on the level of each sample. Divides each intensity by the total sum of sample intensities. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be filtered in + long format :type intensity_df: pandas DataFrame - :return: returns a scaled dataframe in typical protzilla long format\ - and a dict, containing all zeroed samples due to sum being 0 + + :return: returns a scaled dataframe in typical protzilla long format + and a dict, containing all zeroed samples due to sum being 0 :rtype: Tuple[pandas DataFrame, dict] """ @@ -178,13 +181,13 @@ def by_reference_protein( protein in each sample. Samples where this value is zero will be removed and returned separately. 
- :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be filtered in + long format :type intensity_df: pandas DataFrame :param reference_protein: Protein ID of the protein to normalise by - type reference_protein_id: str - :return: returns a scaled dataframe in typical protzilla long format \ - and dict with a list of the indices of the dropped samples + type reference_protein_id: str + :return: returns a scaled dataframe in typical protzilla long format + and dict with a list of the indices of the dropped samples :rtype: Tuple[pandas DataFrame, dict] """ scaled_df = pd.DataFrame() diff --git a/protzilla/data_preprocessing/outlier_detection.py b/protzilla/data_preprocessing/outlier_detection.py index bde4ea5dd..f88807eb3 100644 --- a/protzilla/data_preprocessing/outlier_detection.py +++ b/protzilla/data_preprocessing/outlier_detection.py @@ -21,16 +21,17 @@ def by_isolation_forest( isolation forest approach. :param intensity_df: a dataframe in typical protzilla long format - on which the outlier detection is performed + on which the outlier detection is performed :type intensity_df: pandas DataFrame :param n_estimators: the number of estimators used by the algorithm, - default: 100 + default: 100 :type n_estimators: integer :param n_jobs: Number kernels used by algorithm, default: - all kernels (-1) + all kernels (-1) :type n_jobs: integer - :return: returns a Dataframe containing all samples that are not outliers and a\ - dict with list of outlier sample names + + :return: returns a Dataframe containing all samples that are not outliers and a + dict with list of outlier sample names :rtype: Tuple[pandas DataFrame, dict] """ try: @@ -85,16 +86,17 @@ def by_local_outlier_factor( https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html :param intensity_df: a dataframe in typical protzilla long format - on which the outlier detection is performed + on which the 
outlier detection is performed :type intensity_df: pandas DataFrame :param number_of_neighbors: number of neighbors used by the - algorithm, default: 20 + algorithm, default: 20 :type number_of_neighbors: int :param n_jobs: Number kernels used by algorithm, default: - all kernels (-1) + all kernels (-1) :type n_jobs: int - :return: returns a Dataframe containing all samples that are not outliers and a\ - dict with list of outlier sample names + + :return: returns a Dataframe containing all samples that are not outliers and a + dict with list of outlier sample names :rtype: Tuple[pandas DataFrame, dict] """ try: @@ -141,20 +143,21 @@ def by_pca( https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html :param intensity_df: a dataframe in typical protzilla long format - on which the outlier detection is performed + on which the outlier detection is performed :type intensity_df: pandas DataFrame :param threshold: distance from the median in - number of standard deviations to be included, - default: 2 + number of standard deviations to be included, + default: 2 :type threshold: float :param number_of_components: number of principal components - used in the PCA. Allowed: 2 or 3. Default: 3 + used in the PCA. Allowed: 2 or 3. Default: 3 :type number_of_components: integer (2 or 3) + :return: returns a Dataframe containing all samples that are not outliers. 
- A dict with list of inlier sample names, a DataFrame that contains the projection \ - of the intensity_df on first principal components, a list that contains the \ - explained variation for each component and an int, the number of components \ - the calculations were executed with + A dict with list of inlier sample names, a DataFrame that contains the projection + of the intensity_df on first principal components, a list that contains the + explained variation for each component and an int, the number of components + the calculations were executed with :rtype: Tuple[pandas DataFrame, dict] """ try: diff --git a/protzilla/data_preprocessing/peptide_filter.py b/protzilla/data_preprocessing/peptide_filter.py index 4be6adde2..ce1833aef 100644 --- a/protzilla/data_preprocessing/peptide_filter.py +++ b/protzilla/data_preprocessing/peptide_filter.py @@ -14,12 +14,12 @@ def by_pep_value( :type intensity_df: pd.Dataframe :param peptide_df: the pandas dataframe containing the peptide information :type peptide_df: pd.Dataframe - :param threshold: peptides with a PEP-value below this threshold will be filtered\ - out + :param threshold: peptides with a PEP-value below this threshold will be filtered + out :type threshold: float :return: intensity-df, piped through, dict with peptide_df without the peptides - below the threshold and a list with filtered-out peptides (Sequences) + below the threshold and a list with filtered-out peptides (Sequences) :rtype: Tuple[pd.Dataframe, dict(pd.Dataframe, list)] """ diff --git a/protzilla/data_preprocessing/plots.py b/protzilla/data_preprocessing/plots.py index c8add29b2..a4cfcb411 100644 --- a/protzilla/data_preprocessing/plots.py +++ b/protzilla/data_preprocessing/plots.py @@ -123,11 +123,11 @@ def create_box_plots( (for example before and after filtering/normalisation) and creates a visualisation for each one. 
- :param dataframe_a: First dataframe in protzilla long format for\ - first boxplot + :param dataframe_a: First dataframe in protzilla long format for + first boxplot :type dataframe_a: pd.DataFrame - :param dataframe_b: Second dataframe in protzilla long format\ - for second boxplot + :param dataframe_b: Second dataframe in protzilla long format + for second boxplot :type dataframe_b: pd.DataFrame :param name_a: Name of first boxplot :type name_a: str @@ -139,12 +139,13 @@ def create_box_plots( :type y_title: str :param x_title: Optional x-axis title for graphs. :type x_title: str - :param group_by: Optional argument to create a grouped boxplot\ - graph. Arguments can be either "Sample" to group by sample or\ - "Protein ID" to group by protein. Leave "None" to get ungrouped\ - conventional graphs. If set the function will ignore the\ - graph_type argument. Default is "None". + :param group_by: Optional argument to create a grouped boxplot + graph. Arguments can be either "Sample" to group by sample or + "Protein ID" to group by protein. Leave "None" to get ungrouped + conventional graphs. If set the function will ignore the + graph_type argument. Default is "None". :type group_by: str + :return: returns a boxplot of the data :rtype: Figure (plotly object) """ @@ -223,11 +224,11 @@ def create_histograms( (for example before and after filtering/normalisation) and creates a visualisation for each one. 
- :param dataframe_a: First dataframe in protzilla long format for\ - first histogram + :param dataframe_a: First dataframe in protzilla long format for + first histogram :type dataframe_a: pd.DataFrame - :param dataframe_b: Second dataframe in protzilla long format\ - for second histogram + :param dataframe_b: Second dataframe in protzilla long format + for second histogram :type dataframe_b: pd.DataFrame :param name_a: Name of first histogram :type name_a: str @@ -239,6 +240,7 @@ def create_histograms( :type y_title: str :param x_title: Optional x axis title for graphs. :type x_title: str + :return: returns a pie or bar chart of the data :rtype: Figure (plotly object) """ @@ -288,16 +290,17 @@ def create_anomaly_score_bar_plot( This function creates a graph visualising the outlier and non-outlier samples using the anomaly score. - :param anomaly_df: pandas Dataframe that contains the anomaly score for each\ - sample, including outliers and on-outliers samples + :param anomaly_df: pandas Dataframe that contains the anomaly score for each + sample, including outliers and on-outliers samples :type anomaly_df: pd.DataFrame :param colour_outlier: hex code for colour depicting the outliers. - Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE outlier colour + Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE outlier colour :type colour_outlier: str :param colour_non_outlier: hex code for colour depicting the - non-outliers. Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE - non-outlier colour + non-outliers. Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE + non-outlier colour :type colour_non_outlier: str + :return: returns a plotly Figure object :rtype: Figure (plotly object) """ @@ -349,19 +352,20 @@ def create_pca_2d_scatter_plot( and non-outlier points by showing the principal components. It returns a ploty Figure object. 
- :param pca_df: a DataFrame that contains the projection of\ - the intensity_df on first principal components + :param pca_df: a DataFrame that contains the projection of + the intensity_df on first principal components :type pca_df: pd.DataFrame - :param explained_variance_ratio: a list that contains the\ - explained variation for each component + :param explained_variance_ratio: a list that contains the + explained variation for each component :type explained_variance_ratio: list :param colour_outlier: hex code for colour depicting the outliers. - Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE outlier colour + Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE outlier colour :type colour_outlier: str :param colour_non_outlier: hex code for colour depicting the - non-outliers. Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE - non-outlier colour + non-outliers. Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE + non-outlier colour :type colour_non_outlier: str + :return: returns a plotly Figure object :rtype: Figure (plotly object) """ @@ -403,19 +407,20 @@ def create_pca_3d_scatter_plot( and non-outlier points by showing the principal components. It returns a ploty Figure object. - :param pca_df: a DataFrame that contains the projection of\ - the intensity_df on first principal components + :param pca_df: a DataFrame that contains the projection of + the intensity_df on first principal components :type pca_df: pd.DataFrame - :param explained_variance_ratio: a list that contains the\ - explained variation for each component + :param explained_variance_ratio: a list that contains the + explained variation for each component :type explained_variance_ratio: list :param colour_outlier: hex code for colour depicting the outliers. 
- Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE outlier colour + Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE outlier colour :type colour_outlier: str :param colour_non_outlier: hex code for colour depicting the - non-outliers. Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE - non-outlier colour + non-outliers. Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE + non-outlier colour :type colour_non_outlier: str + :return: returns a plotly Figure object :rtype: Figure (plotly object) """ diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py index 4b546a0ca..5e8e16ba7 100644 --- a/protzilla/data_preprocessing/transformation.py +++ b/protzilla/data_preprocessing/transformation.py @@ -12,11 +12,12 @@ def by_log(intensity_df: pd.DataFrame, log_base="log10"): :param intensity_df: a protein data frame in long format :type intensity_df: pd.DataFrame - :param log_base: String of the used log method "log10" (base 10)\ - or "log2" (base 2). Default: "log10" - :type log_base: Str - :return: returns a pandas DataFrame in typical protzilla\ - long format with the transformed data and an empty dict. + :param log_base: String of the used log method "log10" (base 10) + or "log2" (base 2). Default: "log10" + :type log_base: str + + :return: returns a pandas DataFrame in typical protzilla + long format with the transformed data and an empty dict. :rtype: Tuple[pandas DataFrame, dict] """ intensity_name = intensity_df.columns.values.tolist()[3] diff --git a/protzilla/history.py b/protzilla/history.py index 8de6f99fd..4af01a468 100644 --- a/protzilla/history.py +++ b/protzilla/history.py @@ -15,13 +15,17 @@ class History: This class has the responsibility to save what methods were previously executed in a Run. Each Run has one History. It is responsible for saving dataframes to disk. 
- :ivar steps is a list of the steps that have been executed, represented by + + :param steps: is a list of the steps that have been executed, represented by ExecutedStep instances. - :ivar df_mode determines if the dataframe of a completed step that is added to the + :type steps: list[ExecutedStep] + :param df_mode: determines if the dataframe of a completed step that is added to the history is saved to disk and not held im memory ("disk" mode), held in memory but not saved to disk ("memory" mode) or both ("disk_memory" mode). - :ivar run_name is the name of the run a history instance belongs to. It is used to + :type df_mode: str + :param run_name: is the name of the run a history instance belongs to. It is used to save things at the correct disk location. + :type run_name: str """ @classmethod diff --git a/protzilla/importing/__init__.py b/protzilla/importing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/protzilla/run.py b/protzilla/run.py index 97e6054bd..ba5a35af5 100644 --- a/protzilla/run.py +++ b/protzilla/run.py @@ -22,24 +22,52 @@ class Run: """ - :ivar run_path: the path to this runs' dir - :ivar workflow_config - :ivar run_name - :ivar history - :ivar step_index - :ivar workflow_meta - - :ivar section - :ivar step - :ivar method - :ivar df: dataframe that will be used as input for the next data preprocessing step, not used in data analysis - :ivar result_df - :ivar current_out - :ivar current_parameters: calculation parameters that were used to calculate for each method - :ivar current_plot_parameters: plot parameters that were used to generate plots for each method, not used in data analysis - :ivar calculated_method: method that was last used to calculate - :ivar plots - :ivar plotted_for_parameters: calculation parameters that were used to generate the results that were used to generate plots, not used in data analysis + A class to represent a complete data analysis run in protzilla. 
+ + :param run_path: the path to this runs' dir + :type run_path: str + :param workflow_config: Contains the contents of the workflow .json + that was selected for this run at first. It is always updated when + the workflow gets changed throughout the run (e.g. change of a parameter). + :type workflow_config: dict + :param run_name: name of the run + :type run_name: str + :param history: an instance of the history class to access the history of this run + :type history: protzilla.History + :param step_index: index of the current step over all steps in the workflow + :type step_index: int + :param workflow_meta: contains contents of the workflow meta file that contains all + methods and parameters that exist in protzilla + :type workflow_meta: dict + + :param section: current section + :type section: str + :param step: current step + :type step: str + :param method: current method + :type method: str + :param df: dataframe that will be used as input for the next data preprocessing step + (Not used in data analysis! Due to the more flexible dataflow during analysis + the input dataframe for an analysis step needs to be selectable in the frontend and is an + input parameter for each new step) + :type df: pandas.DataFrame + :param result_df: contains the modified intensity dataframe after a step + :type result_df: pandas.DataFrame + :param current_out: contains other outputs from the current step + :type current_out: dict + :param current_parameters: calculation parameters that were used to calculate the current step + (e.g. to update workflow_config correctly) + :type current_parameters: dict + :param current_plot_parameters: plot parameters that were used to generate plots for the + current step (Not used in data analysis! 
A plot is its own step in that section + to allow for more flexibility) + :type current_plot_parameters: dict + :param calculated_method: method that was used to calculate the current step + :type calculated_method: str + :param plots: contains the plots generated in the current step + :type plots: list[Figure] + :param plotted_for_parameters: calculation parameters that were used to generate the results that were used to generate current plots, not used in data analysis + :type plotted_for_parameters: dict """ @classmethod diff --git a/protzilla/utilities/clustergram.py b/protzilla/utilities/clustergram.py index 178aa048c..edfdd47d6 100644 --- a/protzilla/utilities/clustergram.py +++ b/protzilla/utilities/clustergram.py @@ -65,6 +65,19 @@ def Clustergram( height=800, width=1000, ): + """ + This is an adapted version of Plotly's clustergram found in the Dash Bio package. + In this adaptation, we've made it possible to display a vertical or horizontal + colorbar, as well as a legend for the color bar. To achieve this, we've made changes + to the "row_colors" parameter and introduced a new parameter called + "row_colors_to_label_dict." + + The "row_colors" parameter now consists of a list with a length equal to the number + of samples, where each element represents a specific color corresponding to that + sample. The "row_colors_to_label_dict" is a dictionary that associates each color + in the "row_colors" list with a specific group. This mapping is used to create the + color bar legend. 
+ """ if color_threshold is None: color_threshold = dict(row=0, col=0) diff --git a/protzilla/utilities/dunn_score.py b/protzilla/utilities/dunn_score.py index 1ba0924c8..8862901a7 100644 --- a/protzilla/utilities/dunn_score.py +++ b/protzilla/utilities/dunn_score.py @@ -34,6 +34,7 @@ def dunn_score(X, labels=None): :type X: pd.DataFrame :param labels: the predicted labels/classes by the clustering algorithm :type labels: pd.DataFrame + :returns: the dunn index for the clusters found for a given data set X :rtype: float """ diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py index bea8fffab..df36dab49 100644 --- a/protzilla/utilities/transform_dfs.py +++ b/protzilla/utilities/transform_dfs.py @@ -3,17 +3,18 @@ def long_to_wide(intensity_df: pd.DataFrame): """ - This function transforms the dataframe to a wide format that - can be more easily handled by packages such as sklearn. - Each sample gets one row with all observations as columns. + This function transforms the dataframe to a wide format that + can be more easily handled by packages such as sklearn. + Each sample gets one row with all observations as columns. - :param intensity_df: the dataframe that should be changed in\ + :param intensity_df: the dataframe that should be transformed into long format :type intensity_df: pd.DataFrame - :return: returns dataframe in wide format suitable for use by\ + + :return: returns dataframe in wide format suitable for use by packages such as sklearn - :rtype: pd.DataFrame - """ + :rtype: pd.DataFrame + """ values_name = intensity_df.columns[3] return pd.pivot( intensity_df, index="Sample", columns="Protein ID", values=values_name @@ -25,12 +26,13 @@ def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): This functions transforms the dataframe from a wide format to the typical protzilla long format. 
- :param wide_df: the dataframe in wide format that\ - should be changed + :param wide_df: the dataframe in wide format that + should be changed :type wide_df: pd.DataFrame - :param original_long_df: the original long protzilla format\ - dataframe, that was the source of the wide format dataframe + :param original_long_df: the original long protzilla format + dataframe, that was the source of the wide format dataframe :type orginal_long_df: pd.DataFrame + :return: returns dataframe in typical protzilla long format :rtype: pd.DataFrame """ @@ -63,8 +65,10 @@ def is_intensity_df(df: pd.DataFrame): Checks if the dataframe is an intensity dataframe. An intensity dataframe should have the columns "Sample", "Protein ID" and and intensity column. + :param df: the dataframe that should be checked :type df: pd.DataFrame + :return: returns True if the dataframe is an intensity dataframe :rtype: bool """ diff --git a/protzilla/utilities/utilities.py b/protzilla/utilities/utilities.py index c1bc418fd..a188dfdc8 100644 --- a/protzilla/utilities/utilities.py +++ b/protzilla/utilities/utilities.py @@ -40,6 +40,7 @@ def fig_to_base64(fig): :param fig: matplotlib figure :type fig: matplotlib.figure.Figure + :return: base64 encoded image :rtype: bytes """ diff --git a/requirements.txt b/requirements.txt index 2d41a18ff..da2023280 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,3 +33,5 @@ protgraph @ git+https://github.com/antonneubauer/ProtGraph@master joblib==1.2.0 networkx==3.1 beautifulsoup4==4.12.2 +sphinx==7.2.6 +sphinx-autoapi==3.0.0 diff --git a/tests/protzilla/data_analysis/__init__.py b/tests/protzilla/data_analysis/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/protzilla/data_integration/__init__.py b/tests/protzilla/data_integration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/protzilla/importing/__init__.py b/tests/protzilla/importing/__init__.py new file mode 100644 index 000000000..e69de29bb 
diff --git a/tests/ui/__init__.py b/tests/ui/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ui/main/upload_handler.py b/ui/main/upload_handler.py index 4d412f050..a49286581 100644 --- a/ui/main/upload_handler.py +++ b/ui/main/upload_handler.py @@ -9,6 +9,10 @@ # copied from TemporaryFileUploadHandler class CustomFileUploadHandler(FileUploadHandler): + """ + The same as Django's TemporaryFileUploadHandler, except for writing to a CustomUploadedFile. + """ + def new_file(self, *args, **kwargs): """ Create the file object to append to as data is coming in. @@ -39,7 +43,7 @@ def upload_interrupted(self): # copied from TemporaryUploadedFile class CustomUploadedFile(UploadedFile): """ - A file uploaded to a temporary location (i.e. stream-to-disk). + The same as Django's TemporaryUploadedFile, except for passing different arguments when creating the file """ def __init__(self, name, content_type, size, charset, content_type_extra=None): diff --git a/ui/runs/fields.py b/ui/runs/fields.py index 66d3b53cb..05125c186 100644 --- a/ui/runs/fields.py +++ b/ui/runs/fields.py @@ -13,6 +13,22 @@ def make_current_fields(run, section, step, method): + """ + Wrapper method that generates the fields for the current method + based on the data in the workflow_meta.json file. + + :param run: The current run object + :type run: Run + :param section: The current section + :type section: str + :param step: The current step + :type step: str + :param method: The current method + :type method: str + + :return: A list of fields for the current method + :rtype: list + """ if not step: return [] parameters = get_parameters(run, section, step, method) @@ -28,10 +44,26 @@ def make_current_fields(run, section, step, method): def make_parameter_input(key, param_dict, all_parameters_dict, disabled): - # In this method param_dict refers to the dictionary that contains all - # meta information about a specific parameter e.g. type, default value. 
The - # all_parameters_dict refers to the dictionary that contains all parameters for - # a method with its corresponding meta information + """ + Generates the html for a single parameter input field. The + type of the input field is determined by the type of the parameter as specified + in the workflow_meta.json. + May be called recursively by make_dynamic_fields if the parameter is a dynamic parameter. + + :param key: The name of the parameter, matches the key in the workflow_meta.json + :type key: str + :param param_dict: The dictionary containing all meta information about the parameter + e.g. type, default value + :type param_dict: dict + :param all_parameters_dict: The dictionary containing all parameters for the current method + with corresponding meta information + :type all_parameters_dict: dict + :param disabled: Should the input field be disabled + :type disabled: bool + + :return: The html for the input field + :rtype: str + """ if param_dict["type"] == "numeric": param_dict["multiple"] = param_dict.get("multiple", False) template = "runs/field_number.html" @@ -71,6 +103,21 @@ def make_parameter_input(key, param_dict, all_parameters_dict, disabled): def make_dynamic_fields(param_dict, selected_category, all_parameters_dict, disabled): + """ + Generates the html for the dynamic fields of a "categorical_dynamic" type parameter. + This is used to dynamically add fields based on the selected_category. + + :param param_dict: The dictionary containing all meta information about the parameter + e.g. 
type, default value + :type param_dict: dict + :param selected_category: The currently selected category of the field described by param_dict + :type selected_category: str + :param all_parameters_dict: The dictionary containing all parameters for the current method + with corresponding meta information + :type all_parameters_dict: dict + :param disabled: Should the fields be disabled + :type disabled: bool + """ dynamic_fields = [] if selected_category in param_dict["dynamic_parameters"]: dynamic_parameters_list = param_dict["dynamic_parameters"][selected_category] @@ -85,6 +132,19 @@ def make_dynamic_fields(param_dict, selected_category, all_parameters_dict, disa def make_sidebar(request, run, run_name): + """ + Renders the sidebar of the run detail page. + + :param request: The current request + :type request: HttpRequest + :param run: The current run object + :type run: Run + :param run_name: The name of the current run + :type run_name: str + + :return: The html for the sidebar + :rtype: str + """ csrf_token = request.META["CSRF_COOKIE"] template = "runs/sidebar.html" return render_to_string( @@ -100,6 +160,23 @@ def make_sidebar(request, run, run_name): def make_plot_fields(run, section, step, method): + """ + Generates the html for the plot fields of the current method. + This is only used when a plot is a part of a step and not its own step + as is the case for the data preprocessing section. + + :param run: The current run object + :type run: Run + :param section: The current section + :type section: str + :param step: The current step + :type step: str + :param method: The current method + :type method: str + + :return: The html for the plot fields + :rtype: str + """ if not step: return plots = run.workflow_meta[section][step][method].get("graphs", []) @@ -115,6 +192,21 @@ def make_plot_fields(run, section, step, method): def make_method_dropdown(run, section, step, method): + """ + Generates the html for the method dropdown of the current step. 
+ + :param run: The current run object + :type run: Run + :param section: The current section + :type section: str + :param step: The current step + :type step: str + :param method: The current method + :type method: str + + :return: The html for the method dropdown + :rtype: str + """ if not step: return "" methods = run.workflow_meta[section][step].keys() @@ -133,6 +225,16 @@ def make_method_dropdown(run, section, step, method): def make_displayed_history(run): + """ + Generates the html for the displayed history that is displayed at the + top of the current run. + + :param run: The current run object + :type run: Run + + :return: The html for the displayed history + :rtype: str + """ displayed_history = [] for i, history_step in enumerate(run.history.steps): fields = [] @@ -215,6 +317,22 @@ def make_displayed_history(run): def make_name_field(allow_next, form, run, end_of_run): + """ + Generates the html for the field that allows to name the output of the + current method. + + :param allow_next: Whether the next button should be enabled + :type allow_next: bool + :param form: The form that the field belongs to + :type form: Form + :param run: The current run object + :type run: Run + :param end_of_run: Whether the current step is the last step of the run + :type end_of_run: bool + + :return: The html for the name field + :rtype: str + """ if end_of_run: return "" default = get_workflow_default_param_value( diff --git a/ui/runs/utilities/__init__.py b/ui/runs/utilities/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ui/runs/views.py b/ui/runs/views.py index 7ec7393d3..0140f361a 100644 --- a/ui/runs/views.py +++ b/ui/runs/views.py @@ -43,6 +43,15 @@ def index(request): + """ + Renders the main index page of the PROTzilla application. 
+ + :param request: the request object + :type request: HttpRequest + + :return: the rendered index page + :rtype: HttpResponse + """ return render( request, "runs/index.html", @@ -54,6 +63,20 @@ def index(request): def detail(request, run_name): + """ + Renders the details page of a specific run. + For rendering a context dict is created that contains all the dynamic information + that is needed to display the page. This wraps other methods that provide subparts + for the page e.g. make_displayed_history() to show the history. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered details page + :rtype: HttpResponse + """ if run_name not in active_runs: active_runs[run_name] = Run.continue_existing(run_name) run = active_runs[run_name] @@ -120,7 +143,19 @@ def detail(request, run_name): def change_method(request, run_name): - # TODO 92 extract into a seperate method like try_reactivate_run + """ + Changes the method during a step of a run. + This is called when the user selects a new method in the first dropdown of a step. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a JSON response object containing the new fields for the selected method + :rtype: JsonResponse + """ + # TODO 92 extract into a separate method like try_reactivate_run try: if run_name not in active_runs: active_runs[run_name] = Run.continue_existing(run_name) @@ -153,6 +188,21 @@ def change_method(request, run_name): def change_dynamic_fields(request, run_name): + """ + Renders fields that depend on the value of another field e.g. a dropdown, the value + being the dynamic_trigger_value below. The field is specified by its key and part of + the request. 
+ + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a JSON response object containing the new fields depending on the value of + the dynamic trigger + :rtype: JsonResponse + """ + try: if run_name not in active_runs: active_runs[run_name] = Run.continue_existing(run_name) @@ -178,6 +228,22 @@ def change_dynamic_fields(request, run_name): def change_field(request, run_name): + """ + Changes the value of one or multiple fields during a method of a run depending on a + selected value in another field. The field that triggers this method is identified by + the post_id variable. + In contrast to change_dynamic_fields, this method changes the value of the field itself + instead of rendering new fields. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a JSON response object containing the updated fields depending on the value of the + dynamic trigger field + :rtype: JsonResponse + """ try: if run_name not in active_runs: active_runs[run_name] = Run.continue_existing(run_name) @@ -310,6 +376,15 @@ def change_field(request, run_name): def create(request): + """ + Creates a new run. The user is then redirected to the detail page of the run. + + :param request: the request object + :type request: HttpRequest + + :return: the rendered details page of the new run + :rtype: HttpResponse + """ run_name = request.POST["run_name"] run = Run.create( run_name, @@ -321,24 +396,67 @@ def create(request): def continue_(request): + """ + Continues an existing run. The user is redirected to the detail page of the run and + can resume working on the run. 
+ + :param request: the request object + :type request: HttpRequest + + :return: the rendered details page of the run + :rtype: HttpResponse + """ run_name = request.POST["run_name"] active_runs[run_name] = Run.continue_existing(run_name) return HttpResponseRedirect(reverse("runs:detail", args=(run_name,))) def next_(request, run_name): + """ + Skips to and renders the next step/method of the run. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run with the next step/method + :rtype: HttpResponse + """ run = active_runs[run_name] run.next_step(request.POST["name"]) return HttpResponseRedirect(reverse("runs:detail", args=(run_name,))) def back(request, run_name): + """ + Goes back to and renders the previous step/method of the run. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run with the previous step/method + :rtype: HttpResponse + """ run = active_runs[run_name] run.back_step() return HttpResponseRedirect(reverse("runs:detail", args=(run_name,))) def add(request, run_name): + """ + Adds a new method to the run. The method is added as the next step. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run, new method visible in sidebar + :rtype: HttpResponse + """ run = active_runs[run_name] post = dict(request.POST) @@ -352,6 +470,17 @@ def add(request, run_name): def delete_step(request, run_name): + """ + Deletes a step/method from the run. 
+ + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run, deleted method no longer visible in sidebar + :rtype: HttpResponse + """ run = active_runs[run_name] post = dict(request.POST) @@ -363,6 +492,17 @@ def delete_step(request, run_name): def export_workflow(request, run_name): + """ + Exports the workflow of the run as a JSON file so that it can be reused and shared. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run + :rtype: HttpResponse + """ run = active_runs[run_name] post = dict(request.POST) del post["csrfmiddlewaretoken"] @@ -375,6 +515,18 @@ def export_workflow(request, run_name): def calculate(request, run_name): + """ + Performs the current method's calculation during the run. Django messages are used to + display additional information, warnings and errors to the user. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run + :rtype: HttpResponse + """ run = active_runs[run_name] parameters = parameters_from_post(request.POST) del parameters["chosen_method"] @@ -406,6 +558,20 @@ def calculate(request, run_name): def plot(request, run_name): + """ + Creates a plot from the current step/method of the run. + This is only called by the plot button in the data preprocessing section aka when a plot is + simultaneously a step on its own. + Django messages are used to display additional information, warnings and errors to the user. 
+ + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run, now with the plot + :rtype: HttpResponse + """ run = active_runs[run_name] section, step, method = run.current_run_location() parameters = parameters_from_post(request.POST) @@ -439,17 +605,54 @@ def plot(request, run_name): def add_name(request, run_name): + """ + Adds a name to the results of a calculated method of the run. The name can be used + to identify the results and use them later. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run + :rtype: HttpResponse + """ run = active_runs[run_name] run.name_step(int(request.POST["index"]), request.POST["name"]) return HttpResponseRedirect(reverse("runs:detail", args=(run_name,))) def results_exist(request, run_name): + """ + Checks if the results of the run exist. This is used to determine if the Next button + should be enabled or not. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a JSON response with a boolean value + :rtype: JsonResponse + """ run = active_runs[run_name] return JsonResponse(dict(results_exist=run.result_df is not None)) def all_button_parameters(request, run_name): + """ + Returns all parameters that are needed to render the buttons as enabled or disabled + in the run detail page. + See ui/runs/templates/runs/form_buttons.html for detailed documentation. 
+ + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a JSON response with the parameters + :rtype: JsonResponse + """ run = active_runs[run_name] d = dict() d["current_plot_parameters"] = run.current_plot_parameters.get(run.method, {}) @@ -468,12 +671,36 @@ def all_button_parameters(request, run_name): def outputs_of_step(request, run_name): + """ + Returns the output keys of a named step of the run. This is used to determine which + parameters can be used as input for future steps. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a JSON response with the output keys + :rtype: JsonResponse + """ run = active_runs[run_name] step_name = request.POST["step_name"] return JsonResponse(run.history.output_keys_of_named_step(step_name), safe=False) def download_plots(request, run_name): + """ + Downloads all plots of the current method in the run. If multiple plots are created, + they are zipped together. The format of the plots is specified in the request. 
+ + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a FileResponse with the plots + :rtype: FileResponse + """ run = active_runs[run_name] format_ = request.GET["format"] exported = run.export_plots(format_=format_) From ce239600a2257b32b1a95171acf7993834be9e48 Mon Sep 17 00:00:00 2001 From: Anton Neubauer Date: Sat, 25 Nov 2023 11:44:25 +0100 Subject: [PATCH 24/24] Migrations (#288) * migrated & tracking unused databases * clarify why db is tracked --- .gitignore | 2 +- db.sqlite3 | 0 ui/db.sqlite3 | Bin 0 -> 131072 bytes 3 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 db.sqlite3 create mode 100644 ui/db.sqlite3 diff --git a/.gitignore b/.gitignore index c3b764762..6d9a540ce 100644 --- a/.gitignore +++ b/.gitignore @@ -70,7 +70,7 @@ coverage.xml # Django stuff: *.log local_settings.py -db.sqlite3 +#db.sqlite3 #tracked to remove migrations warning. The database is not actually in use db.sqlite3-journal # Flask stuff: diff --git a/db.sqlite3 b/db.sqlite3 new file mode 100644 index 000000000..e69de29bb diff --git a/ui/db.sqlite3 b/ui/db.sqlite3 new file mode 100644 index 0000000000000000000000000000000000000000..86540294d07a740673f3442dc12a9a1f645417ba GIT binary patch literal 131072 zcmeI5TWlNIdB-{8kQ617M^}#;TNXvHcFoFK@lMg(w5#%@R9{JGc>&z$dk^FP0HnVIj*nAhICTxsjk?MAbvwWDdzgh!G*FGr&u zkLNV~&;OdA^YleF6ZBWIeLv{yY0rz>-}P~0_P^Xl>Hay$9Q#241V8`;KmY_l00ck) z1V8`;KmY`O@&sn5!{l^o{0$HLD*H4mv&G2YMSd;vO87_NFNE)fGog1w?}UCKG&%nD z@keBY4+ww&2!H?xfB*=900@9UAA$6g&ojTY-qKsGN~7LNBoaxrQm?ctS~ZnOrB@P# zl~g{O%$-jq&Zp8Zt`$<5RN~U4&olehy0+K8MfNFGtG4x~y4Rx3cBR>Bt97lWt2OPu zTGi`2?OU#%*4EO=+{+W>>7wZ=sqXETwYIMAHXHXJsJnWzR_W?;^^ngc*D@Cvd3fFQ zkTA6DG#h)nN0N~%gX*9j)0NaXd%il1XRR zlCOox&xcGudBe|Lty&>Ijb=;L%4Jn=HXBW~Mop*f=q*=7sX`)~T^}cBo2Ijz+SKpt zRhqh5YSi0$z1@DWtG85Y`IM`Ry7!u7E}O}| z93&rRTPHGhA3ZjW-L`v05~*Y%bzzh|m~EX%+db$ttx`Rt%jFBHL^g9NKu*lgn=m`C 
z)z^;FYx4O_dM$s+@AE8OT(@K76p3%Gakqz8<$Q%xH@x}mS|OL-^!YprX)4z$b#jqb zi6^y8^MPn*t<LKr@S8W|#P!DAM)l6Td# zu2puL8jpdrrpCyrpMAo^zR&)O{R8_F`&~-l0|Fob0w4eaAOHd&00JNY0w4eaAn>FJ z%*gX6CG)`{fFjgb%#a^6a8WT90CkOv=i9 zP=o}`2O{|M|8t>V^00qmf69K971?kcD!7+t-uK6H$73gK91tQq6orqp+YiGq{DJ&9pE?@#OpHl?!QpN#>XaDV)SMNwYJwde$T=X(6ymgAGnR7pl4%95b0C&Y6I2M z%Qf^2xCoIrORp_R%jprJ!n(WdNwIdF3y|e%e#FpYWpGIQi zJD;a_45-(RaE;bpZpwvPd!C~g2!bypg;?{hyC;=)M8%5@)zkzb5ljHIXlJ|F-BAOHd&00JNY0w4eaAOHf#iNKUBElGXB zo4FZTic5}Ijnds|+>;tHmlKn+bVjmAVP;Rr(uysjyjYve6=A~Y+BRnBXi;^wA5fTLY0vhApU4|18 zkr)$4w!#%bq0bqh7*%9xRT>ON1iJeThgRgkFQ_^K48AEj_=40IO^|2g;IiZh9eBk~ z!>4)sKQbW)m*@ciDz6X5Irxc@)K z4UDRR00@8p2!H?xfB*=900@8p2!H?)!1_PB0SJHq2!H?xfB*=900@8p2!H?x9Df41 z|3Ch1j2eOf2!H?xfB*=900@8p2!H?xfB>HVM;`zI5C8!X009sH0T2KI5C8!X0DD z5LPM2&P8KF9_fk2JLe|@%1%M*XjTonxx0F^R%x{=je1M9zVk|~WlGsnA*Bmt1IANE zE$s3Kj1KT<6F*RdSD{Y_iC>-wC?6v6uMZR7^+dXh$yB0L+%CB5o;;H70cVFvHVee! z`-$;@va%vQoNjByYL}j^vpCy1WazoR{>tSI_ZFoFj8^JxeMfIbx2{}|Zr!+iIr_%c z&DYnj-i*GsaWlGpQHuWznjzI@Zzx$&yKKOMob0CF_X@$u)SfBCTnC<;H-1opQ^14^MFJx3Sgkr^-~V7J zpscS;4_~rIs30LWR@J(r)HNsUGE!X)A8WJ81eN zx>K#Rb+x76+0*MK-I4V6A#`M(t!p*?9F>fBZb|`VbyeEegwOJAt-jMxYn7d**5=o7 z13O0g{Kf|IAFD;Q-Q8I9uGTEw(wb*e*=)RbZ9-VN)`N4YXtko#)L7Qqdb?87dqv|| z|9`JjC+tB01V8`;KmY_l00ck)1V8`;K;YRU5cXd5Ond&z!)`}*!=DTN+t|BfKNI|e z(eI2-`Tw1s1E`JsnfIpm%Ok%cy*+Y)%$}l;_oDR0$CA^TNd}Z_aj8=faYiFL)6&JF zoT}g7tu%F&M@m#JTgs(!xpcot)QIjFD{=jquI0Od!hS;@1F{{9R^4)|0VN)n_Fu3^ znG9L|Wyf6OONJ?TY*yyO`U5K}lu1}2Bl~g|g^x^os!dBQh+50-$nOqJii;GgHX2niLs8yJV zZ>%4XmGM}SxBlQ-GHd794JOCEzu}qc?~Lhle&yog(28~A!`+&R?+fH2)$iu8)JK1M zC7>)UNFO_6!QcVe9FakSl4QE$tweB^V6iGA#k;&AW)vKtlRJB%Htu}Tl zb$hgt7Emd#(o$P{6x;3|zSw9~?Y)feJ)E?8h#{7|$JVq;)e@P?X8J^OA#H7YI~H|{ zpd78#uH22)C(My9QF}pxc!u)qRMtc;~qoEvzs1Ev#Fyeznw~Mdx~3Z9mx6 z`I<&*JCo1pneO6?Wfw78)cz!Zm3yEU+iHbn&Bue|urw20TMQ@3mkR)?lnsenYg&;)`W;MAt{I2{no-j-C@(iU#?=)%`#L4tD@zpjVi#qb= z>41_>OZx@;Db-%mMZ4s4aKYWM7TXJX#IVr$6K%C$iJ@R{vGY!R7m4_sn?J`9#eZ+T)$Grvi#fi^y-fzh@ygFhjk>yB)plahR!yr`>G`DSUcFV>q0>?CEE>IVY2(6cd@$JU 
zfA;;+M4V5Bw`ko`S8H^ktL@MU&Uu;Vgl0`p{Q0M!)Z}FL9?QG=gq8gWy~`H^_2HAA zQDwfmX}|I?oX0yquh6WvE_Lo$=QJTjY^`A}YRyI9O_r%F9R-GOE+@V@@ud$6J7 z;XKo`?QG^Mh!e05*By9kCjv@|Ds%I&${4g&t&r8Va?wO|eIBI(Mj4tj z?#H~KfAy%^`zo&{Gg_{gOQ)=h4B^{ZojIdAF4r5b_PzOY$kE>;H${VIdy` zKmY_l00ck)1V8`;KmY_l00f?W0=WNw_S+CO009sH0T2KI5C8!X009sH0T2Lz!wBI1 z|1d(x2LTWO0T2KI5C8!X009sH0T2LzXP*G>|DXLfL=8Xy1V8`;KmY_l00ck)1V8`; zK;SR}xc@(l5b{9)1V8`;KmY_l00ck)1V8`;K;YRY5T+{l+1noWJ@)tPFWDE_JCwi& z1V8`;KmY_l00ck)1V8`;KmY_l;0Y0!_RF66lg2NxwT!51V8`;KmY_l00ck)1V8`;K;Wq$fam|8ie`Wa5C8!X009sH0T2KI5C8!X009tq kq6BdN|3vY^IS7CN2!H?xfB*=900@8p2!H?xJQW1~AC^h?%>V!Z literal 0 HcmV?d00001