From 4590cb54dd22de4af8de42fcff4209b6fa287054 Mon Sep 17 00:00:00 2001
From: henninggaertner
Date: Thu, 9 Nov 2023 15:42:23 +0100
Subject: [PATCH 01/15] Implement simple per-protein ND-sampling imputation
 method

---
 protzilla/constants/location_mapping.py    |  5 ++
 protzilla/constants/workflow_meta.json     | 63 ++++++++++++++++++++
 protzilla/data_preprocessing/imputation.py | 65 ++++++++++++++++++++
 3 files changed, 133 insertions(+)

diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py
index aa59e2cd1..95c229e1b 100644
--- a/protzilla/constants/location_mapping.py
+++ b/protzilla/constants/location_mapping.py
@@ -313,6 +313,11 @@
         "imputation",
         "min_value_per_dataset",
     ): imputation.by_min_per_dataset_plot,
+    (
+        "data_preprocessing",
+        "imputation",
+        "normal_distribution_sampling_per_protein",
+    ): imputation.by_normal_distribution_sampling,
     (
         "data_preprocessing",
         "outlier_detection",
diff --git a/protzilla/constants/workflow_meta.json b/protzilla/constants/workflow_meta.json
index 14ba36f09..e7a8f0fad 100644
--- a/protzilla/constants/workflow_meta.json
+++ b/protzilla/constants/workflow_meta.json
@@ -677,6 +677,69 @@
           }
         }
       ]
+    },
+    "normal_distribution_sampling_per_protein": {
+      "name": "Normal Distribution Sampling",
+      "description": "Imputation methods include normal distribution sampling per Protein or over Dataset",
+      "parameters": {
+        "strategy": {
+          "name": "Strategy:",
+          "type": "categorical",
+          "categories": [
+            "perProtein",
+            "perDataset"
+          ],
+          "default": "perProtein"
+        },
+        "down_shift": {
+          "name": "Downshift:",
+          "type": "numeric",
+          "min": -10,
+          "max": 10,
+          "default": -1
+        },
+        "scaling_factor": {
+          "name": "Scaling Factor:",
+          "type": "numeric",
+          "min": 0,
+          "max": 1,
+          "default": 1
+        }
+      },
+      "graphs": [
+        {
+          "graph_type": {
+            "name": "Graph type:",
+            "type": "categorical",
+            "categories": [
+              "Boxplot",
+              "Histogram"
+            ],
+            "default": "Boxplot"
+          },
+          "group_by": {
+            "name": "Group by:",
+            "type": "categorical",
+            "categories": [
+              "None",
+              "Sample",
+              "Protein ID"
+            ],
+            "default": "None"
+          }
+        },
+        {
+          "graph_type_quantities": {
+            "name": "Graph type Imputed Values:",
+            "type": "categorical",
+            "categories": [
+              "Bar chart",
+              "Pie chart"
+            ],
+            "default": "Pie chart"
+          }
+        }
+      ]
     }
   },
   "filter_peptides": {
diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py
index 14409a2f7..0a19bad6a 100644
--- a/protzilla/data_preprocessing/imputation.py
+++ b/protzilla/data_preprocessing/imputation.py
@@ -221,6 +221,71 @@ def by_min_per_dataset(
     return intensity_df_copy, dict()
 
 
+def by_normal_distribution_sampling(
+    intensity_df: pd.DataFrame,
+    strategy="perProtein",
+    down_shift=0,
+    scaling_factor=1,
+) -> tuple[pd.DataFrame, dict]:
+    """
+    A function to perform imputation via sampling of a normal distribution
+    defined by the existing datapoints and user-defined parameters for down-
+    shifting and scaling. Imputes missing values for each protein taking into
+    account data from each protein.
+
+    :param intensity_df: the dataframe that should be imputed, in\
+        long format
+    :type intensity_df: pandas DataFrame
+    :param strategy: which strategy to use for the definition of the normal\
+        distribution to be sampled. Can be "perProtein" or "perDataset"\
+    :type strategy: str
+    :param down_shift: a factor defining how many dataset standard deviations\
+        to shift the mean of the normal distribution used for imputation.\
+        Default: 0 (no shift)
+    :type down_shift: float
+    :param scaling_factor: a factor determining how the variance of the normal\
+        distribution used for imputation is scaled compared to the dataset.
+        Default: 1 (no scaling)
+    :type scaling_factor: float
+    :return: returns an imputed dataframe in typical protzilla long format\
+        and an empty dict
+    :rtype: tuple[pd.DataFrame, dict]
+    """
+    assert strategy in ["perProtein", "perDataset"]
+
+    transformed_df = long_to_wide(intensity_df)
+    transformed_df.dropna(axis=1, how="all", inplace=True)
+
+    # protein_means = transformed_df.mean(axis=1)
+    # protein_std = transformed_df.std(axis=1)
+    # scaled_protein_means = protein_means + down_shift * protein_std
+    # scaled_protein_std = protein_std * scaling_factor
+
+    # Iterate over proteins to impute minimal value
+    if strategy == "perProtein":
+        for column in transformed_df.columns:
+            # determine mean (loc)
+            protein_mean = transformed_df[column].mean()
+            protein_std = transformed_df[column].std()
+            scaled_protein_mean = max(0, protein_mean + down_shift * protein_std)
+            scaled_protein_std = protein_std * scaling_factor
+            # determine standard deviation (scale)
+            value_to_be_imputed = abs(
+                np.random.normal(
+                    loc=scaled_protein_mean,
+                    scale=scaled_protein_std,
+                )
+            )
+            transformed_df[column].fillna(value_to_be_imputed, inplace=True)
+    else:
+        pass
+        # determine mean of normal distribution of dataset
+        # TODO
+
+    imputed_df = wide_to_long(transformed_df, intensity_df)
+    return imputed_df, dict()
+
+
 def by_knn_plot(
     df, result_df, current_out, graph_type, graph_type_quantities, group_by
 ):
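A minimal, self-contained sketch of the sampling scheme PATCH 01 introduces: one value is drawn from a down-shifted, scaled normal distribution per protein and reused for all missing entries of that protein. The toy table and column names below are invented for illustration; the real code goes through long_to_wide/wide_to_long instead.

import numpy as np
import pandas as pd

# toy wide-format table: rows are samples, columns are protein groups
df = pd.DataFrame(
    {"ProteinA": [20.0, 22.0, np.nan, 21.0], "ProteinB": [5.0, np.nan, 6.0, np.nan]}
)

down_shift, scaling_factor = -1, 0.5
for column in df.columns:
    mean, std = df[column].mean(), df[column].std()
    # shift the mean down by `down_shift` standard deviations (clamped at 0)
    # and shrink the spread by `scaling_factor`, as in the patch above
    loc = max(0, mean + down_shift * std)
    scale = std * scaling_factor
    # a single draw per protein: every NaN in this column gets the same value
    df[column] = df[column].fillna(abs(np.random.normal(loc=loc, scale=scale)))

print(df)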
Can be "perProtein", "perDataset" or "most_frequent"\ + :type strategy: str + :param down_shift: a factor defining how many dataset standard deviations\ + to shift the mean of the normal distribution used for imputation.\ + Default: 0 (no shift) + :type down_shift: float + :param scaling_factor: a factor determining how the variance of the normal\ + distribution used for imputation is scaled compared to dataset. + Default: 1 (no scaling) + :type down_shift: float + :return: returns an imputed dataframe in typical protzilla long format\ + and an empty dict + :rtype: pd.DataFrame, int + """ + assert strategy in ["perProtein", "perDataset"] + + transformed_df = long_to_wide(intensity_df) + transformed_df.dropna(axis=1, how="all", inplace=True) + + # protein_means = transformed_df.mean(axis=1) + # protein_std = transformed_df.std(axis=1) + # scaled_protein_means = protein_means + down_shift * protein_std + # scaled_protein_std = protein_std * scaling_factor + + # Iterate over proteins to impute minimal value + if strategy == "perProtein": + for column in transformed_df.columns: + # determine mean (loc) + protein_mean = transformed_df[column].mean() + protein_std = transformed_df[column].std() + scaled_protein_mean = max(0, protein_mean + down_shift * protein_std) + scaled_protein_std = protein_std * scaling_factor + # determine standard deviation (scale) + value_to_be_imputed = abs( + np.random.normal( + loc=scaled_protein_mean, + scale=scaled_protein_std, + ) + ) + transformed_df[column].fillna(value_to_be_imputed, inplace=True) + else: + pass + # determine mean of normal distribution of dataset + # TODO + + imputed_df = wide_to_long(transformed_df, intensity_df) + return imputed_df, dict() + + def by_knn_plot( df, result_df, current_out, graph_type, graph_type_quantities, group_by ): From aeca5b2d771d8f2ae434b904385294ab8d1f21ba Mon Sep 17 00:00:00 2001 From: antonneubauer Date: Fri, 10 Nov 2023 14:26:38 +0100 Subject: [PATCH 02/15] just moved function --- protzilla/data_analysis/protein_graphs.py | 116 +++++++++++----------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/protzilla/data_analysis/protein_graphs.py b/protzilla/data_analysis/protein_graphs.py index fc1fc548d..78282376b 100644 --- a/protzilla/data_analysis/protein_graphs.py +++ b/protzilla/data_analysis/protein_graphs.py @@ -40,6 +40,64 @@ def variation_graph(protein_id: str, run_name: str): return out +def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: + """ + Creates a Protein-Variation-Graph for a given UniProt Protein ID using ProtGraph. + Included features are just `Variation`, digestion is skipped. + The Graph is saved in .graphml-Format. + + This is designed, so it can be used for peptides_to_isoform but works independently + as well + + ProtGraph: https://github.com/mpc-bioinformatics/ProtGraph/ + + :param protein_id: UniProt Protein-ID + :type: str + :param run_name: name of the run this is executed from. Used for saving the protein + file, graph + :type: str + :param queue_size: Queue Size for ProtGraph, This is yet to be merged by ProtGraph + :type: int + + :return: dict(graph_path, messages) + """ + + logger.info(f"Creating graph for protein {protein_id}") + run_path = RUNS_PATH / run_name + path_to_protein_file, filtered_blocks, request = _get_protein_file( + protein_id, run_path + ) + + path_to_protein_file = Path(path_to_protein_file) + if not path_to_protein_file.exists() and request.status_code != 200: + msg = f"error while downloading protein file for {protein_id}. 
Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. Tip: check if the ID is correct" + logger.error(msg) + return dict( + graph_path=None, + filtered_blocks=filtered_blocks, + messages=[dict(level=messages.ERROR, msg=msg, trace=request.__dict__)], + ) + + output_folder_path = run_path / "graphs" + output_csv = output_folder_path / f"{protein_id}.csv" + graph_path = output_folder_path / f"{protein_id}.graphml" + cmd_str = f"protgraph -egraphml {path_to_protein_file} \ + --export_output_folder={output_folder_path} \ + --output_csv={output_csv} \ + -ft VARIANT \ + -d skip" + + subprocess.run(cmd_str, shell=True) + + msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}" + logger.info(msg) + return dict( + graph_path=str(graph_path), + filtered_blocks=filtered_blocks, + messages=[dict(level=messages.INFO, msg=msg)], + ) + + def peptides_to_isoform( peptide_df: pd.DataFrame, protein_id: str, @@ -164,64 +222,6 @@ def peptides_to_isoform( ) -def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: - """ - Creates a Protein-Variation-Graph for a given UniProt Protein ID using ProtGraph. - Included features are just `Variation`, digestion is skipped. - The Graph is saved in .graphml-Format. - - This is designed, so it can be used for peptides_to_isoform but works independently - as well - - ProtGraph: https://github.com/mpc-bioinformatics/ProtGraph/ - - :param protein_id: UniProt Protein-ID - :type: str - :param run_name: name of the run this is executed from. Used for saving the protein - file, graph - :type: str - :param queue_size: Queue Size for ProtGraph, This is yet to be merged by ProtGraph - :type: int - - :return: dict(graph_path, messages) - """ - - logger.info(f"Creating graph for protein {protein_id}") - run_path = RUNS_PATH / run_name - path_to_protein_file, filtered_blocks, request = _get_protein_file( - protein_id, run_path - ) - - path_to_protein_file = Path(path_to_protein_file) - if not path_to_protein_file.exists() and request.status_code != 200: - msg = f"error while downloading protein file for {protein_id}. Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. 
Tip: check if the ID is correct" - logger.error(msg) - return dict( - graph_path=None, - filtered_blocks=filtered_blocks, - messages=[dict(level=messages.ERROR, msg=msg, trace=request.__dict__)], - ) - - output_folder_path = run_path / "graphs" - output_csv = output_folder_path / f"{protein_id}.csv" - graph_path = output_folder_path / f"{protein_id}.graphml" - cmd_str = f"protgraph -egraphml {path_to_protein_file} \ - --export_output_folder={output_folder_path} \ - --output_csv={output_csv} \ - -ft VARIANT \ - -d skip" - - subprocess.run(cmd_str, shell=True) - - msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}" - logger.info(msg) - return dict( - graph_path=str(graph_path), - filtered_blocks=filtered_blocks, - messages=[dict(level=messages.INFO, msg=msg)], - ) - - def _create_graph_index( protein_graph: nx.DiGraph, seq_len: int ) -> tuple[list | None, str, dict | None]: From 3a44abaf427a8440031678bd627cde0723b547cd Mon Sep 17 00:00:00 2001 From: antonneubauer Date: Fri, 10 Nov 2023 15:00:25 +0100 Subject: [PATCH 03/15] improve doc strings, deal with (most) PEP8 warnings --- protzilla/data_analysis/protein_graphs.py | 58 ++++++++++++----------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/protzilla/data_analysis/protein_graphs.py b/protzilla/data_analysis/protein_graphs.py index 78282376b..5aefe71f0 100644 --- a/protzilla/data_analysis/protein_graphs.py +++ b/protzilla/data_analysis/protein_graphs.py @@ -56,8 +56,6 @@ def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: :param run_name: name of the run this is executed from. Used for saving the protein file, graph :type: str - :param queue_size: Queue Size for ProtGraph, This is yet to be merged by ProtGraph - :type: int :return: dict(graph_path, messages) """ @@ -70,7 +68,7 @@ def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: path_to_protein_file = Path(path_to_protein_file) if not path_to_protein_file.exists() and request.status_code != 200: - msg = f"error while downloading protein file for {protein_id}. Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. Tip: check if the ID is correct" + msg = f"error while downloading protein file for {protein_id}. Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. Tip: check if the ID is correct" # noqa E501 logger.error(msg) return dict( graph_path=None, @@ -89,7 +87,7 @@ def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: subprocess.run(cmd_str, shell=True) - msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}" + msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}" # noqa E501 logger.info(msg) return dict( graph_path=str(graph_path), @@ -125,6 +123,10 @@ def peptides_to_isoform( :type run_name: str :param k: k-mer size to build necessary indices for matching peptides, defaults to 5 :type k: int, optional + :param allowed_mismatches: max number of mismatched amino acids between a given + peptide and the reference sequence at a given starting location of a potential + match + :type allowed_mismatches: int :return: dict of path to graph - either the modified graph or the original graph if the modification failed, the protein id, list of matched peptides, list of unmatched @@ -250,7 +252,7 @@ def _create_graph_index( starting_point = node break else: - msg = "No starting point found in the graph. An error in the graph creation is likely." 
+ msg = "No starting point found in the graph. An error in the graph creation is likely." # noqa E501 logger.error(msg) return None, msg, None @@ -263,15 +265,15 @@ def _create_graph_index( for node in protein_graph.nodes: if protein_graph.nodes[node]["aminoacid"] == "__end__": if longest_paths[node] < seq_len: - msg = f"The longest path to the last node is shorter than the reference sequence. An error in the graph creation is likely. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}" + msg = f"The longest path to the last node is shorter than the reference sequence. An error in the graph creation is likely. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}" # noqa E501 logger.error(msg) return None, msg, longest_paths elif longest_paths[node] > seq_len: - msg = f"The longest path to the last node is longer than the reference sequence. This could occur if a VARIANT substitutes more amino acids than it replaces. This is unexpected behaviour and should have been filtered out of the protein data. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}" + msg = f"The longest path to the last node is longer than the reference sequence. This could occur if a VARIANT substitutes more amino acids than it replaces. This is unexpected behaviour and should have been filtered out of the protein data. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}" # noqa E501 logger.error(msg) return None, msg, longest_paths - index = [[] for i in range(seq_len)] + index = [[] for _ in range(seq_len)] for node in longest_paths: if ( protein_graph.nodes[node]["aminoacid"] == "__start__" @@ -338,7 +340,7 @@ def _longest_paths(protein_graph: nx.DiGraph, start_node: str): distances[succ] = max(distances[succ], distances[node] + node_len) else: raise Exception( - f"The node {node} was not visited in the topological order (distance should be set already)" + f"The node {node} was not visited in the topological order (distance should be set already)" # noqa E501 ) longest_paths = dict(sorted(distances.items(), key=lambda x: x[1])) @@ -418,7 +420,7 @@ def _parse_file(file_path): if variant_block and "/note" in line: # only VARIANTs that don't elongate the sequence are valid - if not "Missing" in line: + if "Missing" not in line: matches = re.search(r"([A-Z]+) -> ([A-Z]+)", line) if matches is None: # if this is invalid we don't want to be the ones who filter it out @@ -561,7 +563,7 @@ def _potential_peptide_matches( raise ValueError(f"k must be positive integer, but is {k}") if not isinstance(allowed_mismatches, int) or allowed_mismatches < 0: raise ValueError( - f"allowed_mismatches must be non-negative integer, but is {allowed_mismatches}" + f"allowed_mismatches must be non-negative integer, but is {allowed_mismatches}" # noqa E501 ) logger.debug("Matching peptides to reference sequence") @@ -578,7 +580,7 @@ def _potential_peptide_matches( # for now potential matches like this will be dismissed even if # match_start_pos + len(peptide) - allowed_mismatches <= seq_len logger.debug( - f"match would be out of bounds for peptide {peptide}, match_start_pos {match_start_pos}" + f"match would be out of bounds for peptide {peptide}, match_start_pos {match_start_pos}" # noqa E501 ) continue matched_starts.append(match_start_pos) @@ -589,7 +591,7 @@ def _potential_peptide_matches( peptide_mismatches.add(peptide) logger.debug( - f"potential peptide matches - peptide:[starting_pos] :: {potential_peptide_matches}" + f"potential peptide matches - 
peptide:[starting_pos] :: {potential_peptide_matches}" # noqa E501 ) logger.debug(f"peptide mismatches: {peptide_mismatches}") @@ -654,9 +656,10 @@ def _match_potential_matches( recursive matching method that branches off at the end of each node that is matches to the end while there is still amino acids left over in the peptide. A new recursion is opened for each successor of a node, given the scenario above. - A recursion is closed if the end of the peptide is reached or if the number of mismatches - exceeds the allowed number of mismatches. This leads to potential run time problems - for many allowed mismatches and long peptides in graphs with frequent Variations. + A recursion is closed if the end of the peptide is reached or if the number of + mismatches exceeds the allowed number of mismatches. This leads to potential run + time problems for many allowed mismatches and long peptides in graphs with frequent + Variations. For the recursive method, see below. @@ -712,7 +715,8 @@ def _match_on_graph( :param graph: protein variation graph, as created by ProtGraph (-> _create_protein_variation_graph) :type graph: networkx.DiGraph - :param current_node: current node in the graph, starting with the node of the match start + :param current_node: current node in the graph, starting with the node of the + match start :type current_node: str :param left_over_peptide: peptide that still needs to be matched to the graph :type left_over_peptide: str @@ -757,7 +761,7 @@ def _match_on_graph( last_index = i # node is matched til end, peptide not done - data_from_succ = [] + data_from_successor = [] recursion_start_mismatches = mismatches for succ in graph.successors(current_node): recursion_start_data = node_match_data.copy() @@ -771,9 +775,9 @@ def _match_on_graph( current_index=0, ) if match: - data_from_succ.append((match, match_data_from_succ, mismatches)) - if data_from_succ: - return_val = min(data_from_succ, key=lambda item: item[2]) + data_from_successor.append((match, match_data_from_succ, mismatches)) + if data_from_successor: + return_val = min(data_from_successor, key=lambda item: item[2]) return return_val else: return False, {}, mismatches @@ -790,7 +794,7 @@ def _match_on_graph( break else: logger.error( - f"No fitting node for match start position {match_start_index} of {peptide} found" + f"No fitting node for match start position {match_start_index} of {peptide} found" # noqa E501 ) continue matched, node_match_data, mismatches = _match_on_graph( @@ -826,11 +830,9 @@ def _modify_graph(graph, contig_positions): :param graph: Protein Graph to be modified :type: nx.DiGraph - :param contig_positions: Dict from current_node to contig-positions {current_node: [(start, end)]}. + :param contig_positions: Dict from current_node to contig-positions + {current_node: [(start, end)]}. 
:type: dict(list[tuple]) - :param longest_paths: mapping from current_node to the longest path to current_node - (-> _longest_paths()) - :type: dict :return: modified protein graph, with contigs & not-matched AAs as nodes, indicated by current_node attribute `matched` """ @@ -974,8 +976,8 @@ def _get_peptides(peptide_df: pd.DataFrame, protein_id: str) -> list[str] | None df = peptide_df[peptide_df["Protein ID"].str.contains(protein_id)] pattern = rf"^({protein_id}-\d+)$" - filter = df["Protein ID"].str.contains(pattern) - df = df[~filter] + protein_id_filter = df["Protein ID"].str.contains(pattern) + df = df[~protein_id_filter] intensity_name = [col for col in df.columns if "intensity" in col.lower()][0] df = df.dropna(subset=[intensity_name]) From efb0a0464636b3cd7f1c8e5e7d451a39e838de9f Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Mon, 13 Nov 2023 16:30:52 +0100 Subject: [PATCH 04/15] generalize python mapping name for ND sampling imputation, append missing location mapping --- protzilla/constants/location_mapping.py | 7 ++++++- protzilla/constants/workflow_meta.json | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py index 95c229e1b..91e4b0e91 100644 --- a/protzilla/constants/location_mapping.py +++ b/protzilla/constants/location_mapping.py @@ -127,6 +127,11 @@ "imputation", "min_value_per_dataset", ): imputation.by_min_per_dataset, + ( + "data_preprocessing", + "imputation", + "normal_distribution_sampling", + ): imputation.by_normal_distribution_sampling, ( "data_preprocessing", "filter_peptides", @@ -316,7 +321,7 @@ ( "data_preprocessing", "imputation", - "normal_distribution_sampling_per_protein", + "normal_distribution_sampling", ): imputation.by_normal_distribution_sampling, ( "data_preprocessing", diff --git a/protzilla/constants/workflow_meta.json b/protzilla/constants/workflow_meta.json index e7a8f0fad..1f76ffe63 100644 --- a/protzilla/constants/workflow_meta.json +++ b/protzilla/constants/workflow_meta.json @@ -678,7 +678,7 @@ } ] }, - "normal_distribution_sampling_per_protein": { + "normal_distribution_sampling": { "name": "Normal Distribution Sampling", "description": "Imputation methods include normal distribution sampling per Protein or over Dataset", "parameters": { From 7ca6f60a1f2ef934a117f9a5830ee6ea74da7518 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Mon, 13 Nov 2023 16:33:16 +0100 Subject: [PATCH 05/15] ND-sampling imputation now uses log-transformed intensities for sampling --- protzilla/data_preprocessing/imputation.py | 23 +++++++++++----------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 0a19bad6a..28060471e 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -256,32 +256,31 @@ def by_normal_distribution_sampling( transformed_df = long_to_wide(intensity_df) transformed_df.dropna(axis=1, how="all", inplace=True) - # protein_means = transformed_df.mean(axis=1) - # protein_std = transformed_df.std(axis=1) - # scaled_protein_means = protein_means + down_shift * protein_std - # scaled_protein_std = protein_std * scaling_factor - - # Iterate over proteins to impute minimal value + # TODO: sample the normal distribution for each missing value instead of sampling once for all missing values if strategy == "perProtein": for column in transformed_df.columns: - # determine mean (loc) - 
protein_mean = transformed_df[column].mean() - protein_std = transformed_df[column].std() + # determine mean and standard deviation of log-transformed protein intensties + protein_mean = np.log10(transformed_df[column]).mean() + protein_std = np.log10(transformed_df[column]).std() + # calculate mean and standard deviation of normal distribution to be sampled scaled_protein_mean = max(0, protein_mean + down_shift * protein_std) scaled_protein_std = protein_std * scaling_factor - # determine standard deviation (scale) - value_to_be_imputed = abs( + # sample from normal distribution and transform back to linear scale + log_value_to_be_imputed = abs( np.random.normal( loc=scaled_protein_mean, scale=scaled_protein_std, ) ) + value_to_be_imputed = 10**log_value_to_be_imputed + # impute missing values for current protein group transformed_df[column].fillna(value_to_be_imputed, inplace=True) else: pass # determine mean of normal distribution of dataset - # TODO + # TODO: implement perDataset strategy + # Turn the wide format into the long format and return imputed dataframe imputed_df = wide_to_long(transformed_df, intensity_df) return imputed_df, dict() From 812449dc306cc332d133caab870aec80c804c010 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Mon, 13 Nov 2023 16:40:42 +0100 Subject: [PATCH 06/15] ND sampling imputation now samples for each missing value instead of sampling once for entire protein group --- protzilla/data_preprocessing/imputation.py | 25 +++++++++++----------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 28060471e..c3705a2a6 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -256,25 +256,26 @@ def by_normal_distribution_sampling( transformed_df = long_to_wide(intensity_df) transformed_df.dropna(axis=1, how="all", inplace=True) - # TODO: sample the normal distribution for each missing value instead of sampling once for all missing values if strategy == "perProtein": for column in transformed_df.columns: - # determine mean and standard deviation of log-transformed protein intensties + # determine mean and standard deviation of log-transformed protein intensities protein_mean = np.log10(transformed_df[column]).mean() protein_std = np.log10(transformed_df[column]).std() # calculate mean and standard deviation of normal distribution to be sampled scaled_protein_mean = max(0, protein_mean + down_shift * protein_std) scaled_protein_std = protein_std * scaling_factor - # sample from normal distribution and transform back to linear scale - log_value_to_be_imputed = abs( - np.random.normal( - loc=scaled_protein_mean, - scale=scaled_protein_std, - ) - ) - value_to_be_imputed = 10**log_value_to_be_imputed - # impute missing values for current protein group - transformed_df[column].fillna(value_to_be_imputed, inplace=True) + # iterate over all values of current protein group + for index, value in transformed_df[column].iteritems(): + # if value is NaN, sample from normal distribution and impute value + if np.isnan(value): + log_value_to_be_imputed = abs( + np.random.normal( + loc=scaled_protein_mean, + scale=scaled_protein_std, + ) + ) + value_to_be_imputed = 10**log_value_to_be_imputed + transformed_df[column].loc[index] = value_to_be_imputed else: pass # determine mean of normal distribution of dataset From 9afddfca8f0fee17be77742b0c71ba0a6acd7bd1 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Tue, 14 Nov 2023 
14:29:06 +0100
Subject: [PATCH 07/15] fix location mapping for normal distribution sampling
 imputation

---
 protzilla/constants/location_mapping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py
index 91e4b0e91..9f35637b9 100644
--- a/protzilla/constants/location_mapping.py
+++ b/protzilla/constants/location_mapping.py
@@ -322,7 +322,7 @@
         "data_preprocessing",
         "imputation",
         "normal_distribution_sampling",
-    ): imputation.by_normal_distribution_sampling,
+    ): imputation.by_normal_distribution_sampling_plot,
     (
         "data_preprocessing",
         "outlier_detection",
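For context on what PATCH 07 rewires: the location mapping is a plain dict from (section, step, method) tuples to callables, and the plot entry must point at the plot function rather than at the imputation function itself. A simplified, hypothetical dispatch sketch (names invented for illustration):

def by_example_method_plot(df):
    return df

plot_locations = {
    ("data_preprocessing", "imputation", "normal_distribution_sampling"): by_example_method_plot,
}

location = ("data_preprocessing", "imputation", "normal_distribution_sampling")
print(plot_locations[location]([1, 2, 3]))  # look up the registered callable and invoke it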
From a92941f4a23de6a5af39995e5c76cd9106ff69ab Mon Sep 17 00:00:00 2001
From: henninggaertner
Date: Tue, 14 Nov 2023 17:46:37 +0100
Subject: [PATCH 08/15] implement fast perProtein and perDataset normal dist.
 sampling imputation method

---
 protzilla/data_preprocessing/imputation.py | 115 ++++++++++++++++-----
 1 file changed, 89 insertions(+), 26 deletions(-)

diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py
index c3705a2a6..0ce54398b 100644
--- a/protzilla/data_preprocessing/imputation.py
+++ b/protzilla/data_preprocessing/imputation.py
@@ -232,6 +232,8 @@ def by_normal_distribution_sampling(
     defined by the existing datapoints and user-defined parameters for down-
     shifting and scaling. Imputes missing values for each protein taking into
     account data from each protein.
+    The downshifted normal distribution that will be sampled for imputation has a lower
+    limit for the mean at 0, meaning that if the downshifted mean were to be negative, it will be set at 0.
 
     :param intensity_df: the dataframe that should be imputed, in\
         long format
@@ -253,37 +255,79 @@ def by_normal_distribution_sampling(
     """
     assert strategy in ["perProtein", "perDataset"]
 
-    transformed_df = long_to_wide(intensity_df)
-    transformed_df.dropna(axis=1, how="all", inplace=True)
-
     if strategy == "perProtein":
-        for column in transformed_df.columns:
+        transformed_df = long_to_wide(intensity_df)
+        # iterate over all protein groups
+        for protein_grp in transformed_df.columns:
+            # determine number of missing values
+            number_of_nans = transformed_df[protein_grp].isnull().sum()
+
+            # get indices of NaN values in current protein group
+            location_of_nans = transformed_df[protein_grp].isnull()
+            indices_of_nans = location_of_nans[location_of_nans].index
+
             # determine mean and standard deviation of log-transformed protein intensities
-            protein_mean = np.log10(transformed_df[column]).mean()
-            protein_std = np.log10(transformed_df[column]).std()
+            protein_grp_mean = np.log10(transformed_df[protein_grp]).mean(skipna=True)
+            protein_grp_std = np.log10(transformed_df[protein_grp]).std(skipna=True)
+
             # calculate mean and standard deviation of normal distribution to be sampled
-            scaled_protein_mean = max(0, protein_mean + down_shift * protein_std)
-            scaled_protein_std = protein_std * scaling_factor
-            # iterate over all values of current protein group
-            for index, value in transformed_df[column].iteritems():
-                # if value is NaN, sample from normal distribution and impute value
-                if np.isnan(value):
-                    log_value_to_be_imputed = abs(
-                        np.random.normal(
-                            loc=scaled_protein_mean,
-                            scale=scaled_protein_std,
-                        )
-                    )
-                    value_to_be_imputed = 10**log_value_to_be_imputed
-                    transformed_df[column].loc[index] = value_to_be_imputed
+            sampling_mean = max(0, protein_grp_mean + down_shift * protein_grp_std)
+            sampling_std = protein_grp_std * scaling_factor
+
+            # calculate log-transformed values to be imputed
+            log_impute_values = abs(
+                np.random.normal(
+                    loc=sampling_mean,
+                    scale=sampling_std,
+                    size=number_of_nans,
+                )
+            )
+
+            # transform log-transformed values to be imputed back to normal scale and round to nearest integer
+            impute_values = np.round(10**log_impute_values, decimals=0)
+
+            # zip indices of NaN values with values to be imputed together as a Series, such that fillna can be used
+            impute_value_series = pd.Series(impute_values, index=indices_of_nans)
+            transformed_df[protein_grp].fillna(impute_value_series, inplace=True)
+
+        imputed_df = wide_to_long(transformed_df, intensity_df)
+        return imputed_df, dict()
+
     else:
-        pass
-        # determine mean of normal distribution of dataset
-        # TODO: implement perDataset strategy
+        # deep copy the dataframe
+        intensity_type = intensity_df.columns[3]
+
+        # get indices of NaN values in the dataset
+        location_of_nans = intensity_df[intensity_type].isnull()
+        indices_of_nans = location_of_nans[location_of_nans].index
+        # calculate the mean and standard deviation of the entire dataset
+        dataset_mean = np.log10(intensity_df[intensity_type]).mean()
+        dataset_std = np.log10(intensity_df[intensity_type]).std()
+
+        # calculate mean and standard deviation of normal distribution to be sampled
+        scaled_dataset_mean = max(0, dataset_mean + down_shift * dataset_std)
+        scaled_dataset_std = dataset_std * scaling_factor
+
+        # get number of NaN values in dataset
+        number_of_nans = intensity_df[intensity_type].isnull().sum()
+
+        # calculate log-transformed values to be imputed
+        log_impute_values = abs(
+            np.random.normal(
+                loc=scaled_dataset_mean,
+                scale=scaled_dataset_std,
+                size=number_of_nans,
+            )
+        )
 
-    # Turn the wide format into the long format and return imputed dataframe
-    imputed_df = wide_to_long(transformed_df, intensity_df)
-    return imputed_df, dict()
+        # transform log-transformed values to be imputed back to normal scale and round to nearest integer
+        impute_values = np.round(10**log_impute_values, decimals=0)
+
+        # zip indices of NaN values with values to be imputed together as a Series, such that fillna can be used
+        impute_value_series = pd.Series(impute_values, index=indices_of_nans)
+        intensity_df[intensity_type].fillna(impute_value_series, inplace=True)
+
+        return intensity_df, dict()
 
 
 def by_knn_plot(
@@ -294,6 +338,14 @@ def by_knn_plot(
     )
 
 
+def by_normal_distribution_sampling_plot(
+    df, result_df, current_out, graph_type, graph_type_quantities, group_by
+):
+    return _build_box_hist_plot(
+        df, result_df, graph_type, graph_type_quantities, group_by
+    )
+
+
 def by_simple_imputer_plot(
     df, result_df, current_out, graph_type, graph_type_quantities, group_by
 ):
@@ -380,3 +432,14 @@ def _build_box_hist_plot(
         heading="Number of Imputed Values",
     )
     return [fig1, fig2]
+
+
+def xf():
+    # load df from intensity_df.csv
+    df = pd.read_csv("intensity_df.csv")
+    # impute missing values
+    for i in range(1):
+        result_df, _ = by_normal_distribution_sampling(
+            df, strategy="perProtein", down_shift=0, scaling_factor=1
+        )
+    print("done")
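The central vectorization idea in PATCH 08: draw all imputation values with one np.random.normal(..., size=n) call and align them to the missing positions through a pd.Series indexed by the NaN locations, so a single fillna places every value. A small sketch with synthetic data:

import numpy as np
import pandas as pd

col = pd.Series([8.0, np.nan, 9.0, np.nan, 8.5], name="ProteinA")

mask = col.isnull()
nan_index = mask[mask].index  # index labels of the missing entries
draws = np.random.normal(loc=col.mean(), scale=col.std(), size=int(mask.sum()))

# fillna aligns the replacement Series on the index, filling exactly the NaNs
col = col.fillna(pd.Series(draws, index=nan_index))
print(col)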
From 50af6a757e19ace2867cdf55c8872a30b0313ddc Mon Sep 17 00:00:00 2001
From: henninggaertner
Date: Tue, 14 Nov 2023 17:54:27 +0100
Subject: [PATCH 09/15] removed unnecessary testing function

---
 protzilla/data_preprocessing/imputation.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py
index 0ce54398b..5e1c84f57 100644
--- a/protzilla/data_preprocessing/imputation.py
+++ b/protzilla/data_preprocessing/imputation.py
@@ -432,14 +432,3 @@ def _build_box_hist_plot(
         heading="Number of Imputed Values",
     )
     return [fig1, fig2]
-
-
-def xf():
-    # load df from intensity_df.csv
-    df = pd.read_csv("intensity_df.csv")
-    # impute missing values
-    for i in range(1):
-        result_df, _ = by_normal_distribution_sampling(
-            df, strategy="perProtein", down_shift=0, scaling_factor=1
-        )
-    print("done")

From dbd99da193d77039afd784ea0a7f3e5fe49f548b Mon Sep 17 00:00:00 2001
From: henninggaertner
Date: Tue, 14 Nov 2023 17:55:54 +0100
Subject: [PATCH 10/15] comment fix

---
 protzilla/data_preprocessing/imputation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py
index 5e1c84f57..9cd5e5356 100644
--- a/protzilla/data_preprocessing/imputation.py
+++ b/protzilla/data_preprocessing/imputation.py
@@ -294,7 +294,7 @@ def by_normal_distribution_sampling(
         return imputed_df, dict()
 
     else:
-        # deep copy the dataframe
+        # determine column for protein intensities
        intensity_type = intensity_df.columns[3]
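PATCH 10 only corrects a comment: intensity_df.columns[3] relies on the protzilla long-format convention that the fourth column holds the intensity values. A name-based lookup, like the pattern used elsewhere in this codebase, would be more robust; a sketch with an assumed four-column layout:

import pandas as pd

df = pd.DataFrame(
    {
        "Sample": ["S1", "S1"],
        "Protein ID": ["P1", "P2"],
        "Gene": ["G1", "G2"],
        "Intensity": [10.0, None],
    }
)

intensity_type = df.columns[3]  # positional convention used in the patch
# name-based alternative, mirroring the lookup used in protein_graphs.py
intensity_name = [c for c in df.columns if "intensity" in c.lower()][0]
assert intensity_type == intensity_name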
Can be "perProtein", "perDataset" or "most_frequent" :type strategy: str - :param down_shift: a factor defining how many dataset standard deviations\ - to shift the mean of the normal distribution used for imputation.\ + :param down_shift: a factor defining how many dataset standard deviations + to shift the mean of the normal distribution used for imputation. Default: 0 (no shift) :type down_shift: float - :param scaling_factor: a factor determining how the variance of the normal\ + :param scaling_factor: a factor determining how the variance of the normal distribution used for imputation is scaled compared to dataset. Default: 1 (no scaling) :type down_shift: float + :param round_values: whether to round the imputed values to the nearest integer + Default: False + :type round_values: bool :return: returns an imputed dataframe in typical protzilla long format\ and an empty dict :rtype: pd.DataFrame, int @@ -262,6 +266,10 @@ def by_normal_distribution_sampling( # determine number of missing values number_of_nans = transformed_df[protein_grp].isnull().sum() + # don't impute values if there not enough values (> 1) to sample from + if number_of_nans > len(transformed_df[protein_grp]) - 2: + continue + # get indices of NaN values in current protein group location_of_nans = transformed_df[protein_grp].isnull() indices_of_nans = location_of_nans[location_of_nans].index @@ -271,20 +279,18 @@ def by_normal_distribution_sampling( protein_grp_std = np.log10(transformed_df[protein_grp]).std(skipna=True) # calculate mean and standard deviation of normal distribution to be sampled - sampling_mean = max(0, protein_grp_mean + down_shift * protein_grp_std) + sampling_mean = protein_grp_mean + down_shift * protein_grp_std sampling_std = protein_grp_std * scaling_factor # calculate log-transformed values to be imputed - log_impute_values = abs( - np.random.normal( - loc=sampling_mean, - scale=sampling_std, - size=number_of_nans, - ) + log_impute_values = np.random.normal( + loc=sampling_mean, + scale=sampling_std, + size=number_of_nans, ) # transform log-transformed values to be imputed back to normal scale and round to nearest integer - impute_values = np.round(10**log_impute_values, decimals=0) + impute_values = 10**log_impute_values # zip indices of NaN values with values to be imputed together as a Series, such that fillna can be used impute_value_series = pd.Series(impute_values, index=indices_of_nans) @@ -297,9 +303,16 @@ def by_normal_distribution_sampling( # determine column for protein intensities intensity_type = intensity_df.columns[3] + # get number of NaN values in dataset + number_of_nans = intensity_df[intensity_type].isnull().sum() + + # throw error if dataset is basically empty, something went wrong + assert number_of_nans <= len(intensity_df[intensity_type]) - 2 + # get indices of NaN values in current protein group location_of_nans = intensity_df[intensity_type].isnull() indices_of_nans = location_of_nans[location_of_nans].index + # calculate the mean and standard deviation of the entire dataset dataset_mean = np.log10(intensity_df[intensity_type]).mean() dataset_std = np.log10(intensity_df[intensity_type]).std() @@ -308,9 +321,6 @@ def by_normal_distribution_sampling( scaled_dataset_mean = max(0, dataset_mean + down_shift * dataset_std) scaled_dataset_std = dataset_std * scaling_factor - # get number of NaN values in dataset - number_of_nans = intensity_df[intensity_type].isnull().sum() - # calculate log-transformed values to be imputed log_impute_values = abs( np.random.normal( @@ 
diff --git a/tests/protzilla/data_preprocessing/test_imputation.py b/tests/protzilla/data_preprocessing/test_imputation.py
index 44b66bb13..40caba91a 100644
--- a/tests/protzilla/data_preprocessing/test_imputation.py
+++ b/tests/protzilla/data_preprocessing/test_imputation.py
@@ -9,6 +9,8 @@
     by_min_per_protein_plot,
     by_min_per_sample,
     by_min_per_sample_plot,
+    by_normal_distribution_sampling,
+    by_normal_distribution_sampling_plot,
     by_simple_imputer,
     by_simple_imputer_plot,
     np,
@@ -18,6 +20,11 @@
 from tests.protzilla.data_preprocessing import test_plots_data_preprocessing
 
 
+def protein_group_intensities(dataframe, protein_group_name):
+    # small helper function for tests
+    return dataframe[dataframe["Protein ID"] == protein_group_name]["Intensity"]
+
+
 @pytest.fixture
 def input_imputation_df():
     test_intensity_list = (
@@ -266,6 +273,44 @@ def test_imputation_knn(show_figures, input_imputation_df, assertion_df_knn):
         {assertion_df} but is\n {result_df}"
 
 
+@pytest.mark.order(2)
+@pytest.mark.dependency(depends=["test_build_box_hist_plot"])
+def test_imputation_normal_distribution_sampling(show_figures, input_imputation_df):
+    # perform imputation on test data frame
+    result_df_perProtein = by_normal_distribution_sampling(
+        input_imputation_df, strategy="perProtein", down_shift=-10
+    )[0]
+    result_df_perDataset = by_normal_distribution_sampling(
+        input_imputation_df, strategy="perDataset", down_shift=-10
+    )[0]
+
+    fig1, fig2 = by_normal_distribution_sampling_plot(
+        input_imputation_df, result_df_perProtein, {}, "Boxplot", "Bar chart", "Sample"
+    )
+    if show_figures:
+        fig1.show()
+        fig2.show()
+
+    assert (
+        result_df_perProtein["Intensity"].min() >= 0
+    ), "Imputation by normal distribution sampling should not have negative values!"
+    assert (
+        result_df_perDataset["Intensity"].min() >= 0
+    ), "Imputation by normal distribution sampling should not have negative values!"
+
+    assert (
+        not protein_group_intensities(result_df_perProtein, "Protein1").hasnans
+    ) and (
+        not protein_group_intensities(result_df_perProtein, "Protein3").hasnans
+    ), "Imputation by normal distribution sampling should not leave NaN values!"
+    assert protein_group_intensities(
+        result_df_perProtein, "Protein2"
+    ).hasnans, "This protein group should have NaN values! Not enough data points to sample from!"
+    assert (
+        not result_df_perDataset["Intensity"].hasnans
+    ), "Imputation by normal distribution sampling per dataset should not leave NaN values!"
+
+
 def test_number_of_imputed_values(input_imputation_df, assertion_df_knn):
     count = number_of_imputed_values(input_imputation_df, assertion_df_knn)
     assert (
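The tests above exercise a stochastic method, so their assertions deliberately check invariants (positivity, which NaNs remain) rather than exact values. If exact-value assertions were ever needed, seeding NumPy's global RNG is the usual approach; a hypothetical pytest sketch, not part of this PR:

import numpy as np
import pytest

@pytest.fixture
def seeded_rng():
    # fix the seed so sampled imputation values are reproducible per test
    np.random.seed(42)

def test_sampling_is_deterministic(seeded_rng):
    first = np.random.normal(loc=0, scale=1, size=3)
    np.random.seed(42)
    second = np.random.normal(loc=0, scale=1, size=3)
    assert np.allclose(first, second)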
From abbb68fc19ad0586e63f4e12eac278250f62c668 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henning=20G=C3=A4rtner?=
 <104069093+henninggaertner@users.noreply.github.com>
Date: Thu, 16 Nov 2023 17:07:58 +0100
Subject: [PATCH 12/15] Update issue.md template to automatically assign
 PROTzilla project

---
 .github/ISSUE_TEMPLATE/issue.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ISSUE_TEMPLATE/issue.md b/.github/ISSUE_TEMPLATE/issue.md
index 387d4a3ee..356273db8 100644
--- a/.github/ISSUE_TEMPLATE/issue.md
+++ b/.github/ISSUE_TEMPLATE/issue.md
@@ -3,7 +3,7 @@ name: Issue Template
 about: Standard Issue Template
 title:
 labels:
-project: PROTzilla 2
+project: PROTzilla
 ---
 

From cfe54281244bda20ba2d1171c6c3e81519e75665 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henning=20G=C3=A4rtner?=
 <104069093+henninggaertner@users.noreply.github.com>
Date: Fri, 17 Nov 2023 11:23:47 +0100
Subject: [PATCH 13/15] Update todo_issue.md to automatically assign correct
 project

---
 .github/ISSUE_TEMPLATE/todo_issue.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ISSUE_TEMPLATE/todo_issue.md b/.github/ISSUE_TEMPLATE/todo_issue.md
index d8c7fbb93..c20221c3b 100644
--- a/.github/ISSUE_TEMPLATE/todo_issue.md
+++ b/.github/ISSUE_TEMPLATE/todo_issue.md
@@ -3,7 +3,7 @@ name: TODO
 about: TODO Issue Template
 title: '# TODO '
 labels: todo
-project: 'PROTzilla2'
+project: 'PROTzilla'
 ---
 

From 7b2fb4f832ea0cb4542b18c2336b174b24c36262 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henning=20G=C3=A4rtner?=
 <104069093+henninggaertner@users.noreply.github.com>
Date: Fri, 17 Nov 2023 12:50:03 +0100
Subject: [PATCH 14/15] Update issue.md

---
 .github/ISSUE_TEMPLATE/issue.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ISSUE_TEMPLATE/issue.md b/.github/ISSUE_TEMPLATE/issue.md
index 356273db8..677a8ce72 100644
--- a/.github/ISSUE_TEMPLATE/issue.md
+++ b/.github/ISSUE_TEMPLATE/issue.md
@@ -3,7 +3,7 @@ name: Issue Template
 about: Standard Issue Template
 title:
 labels:
-project: PROTzilla
+project: "PROTzilla"
 ---
 
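Before the final cleanup patch, one property worth spelling out: sampling in log10 space and back-transforming with 10**x yields strictly positive intensities no matter how far the mean is shifted down, which is why PATCH 11 could drop the abs() and max(0, ...) clamping for the per-protein case. A short demonstration:

import numpy as np

log_draws = np.random.normal(loc=1.0, scale=2.0, size=10_000)  # may be negative
imputed = 10 ** log_draws  # back-transform to the linear intensity scale

# every back-transformed value is positive, so no clamping is required
assert (imputed > 0).all()
print(imputed.min())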
From 5d65d0f20705cfe7507a51cd8f2727ff68f8202d Mon Sep 17 00:00:00 2001
From: "janni.roebbecke"
Date: Fri, 17 Nov 2023 13:30:08 +0100
Subject: [PATCH 15/15] minor, purely stylistic changes

---
 protzilla/data_preprocessing/imputation.py | 24 +++++-----------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py
index a0834c3c9..160112cfb 100644
--- a/protzilla/data_preprocessing/imputation.py
+++ b/protzilla/data_preprocessing/imputation.py
@@ -226,7 +226,6 @@ def by_normal_distribution_sampling(
     strategy="perProtein",
     down_shift=0,
     scaling_factor=1,
-    round_values=False,
 ) -> tuple[pd.DataFrame, dict]:
     """
     A function to perform imputation via sampling of a normal distribution
@@ -262,18 +262,18 @@ def by_normal_distribution_sampling(
         transformed_df = long_to_wide(intensity_df)
         # iterate over all protein groups
         for protein_grp in transformed_df.columns:
-            # determine number of missing values
+
             number_of_nans = transformed_df[protein_grp].isnull().sum()
 
             # don't impute values if there are not enough values (> 1) to sample from
             if number_of_nans > len(transformed_df[protein_grp]) - 2:
                 continue
 
-            # get indices of NaN values in current protein group
             location_of_nans = transformed_df[protein_grp].isnull()
             indices_of_nans = location_of_nans[location_of_nans].index
 
-            # determine mean and standard deviation of log-transformed protein intensities
             protein_grp_mean = np.log10(transformed_df[protein_grp]).mean(skipna=True)
             protein_grp_std = np.log10(transformed_df[protein_grp]).std(skipna=True)
-
-            # calculate mean and standard deviation of normal distribution to be sampled
             sampling_mean = protein_grp_mean + down_shift * protein_grp_std
             sampling_std = protein_grp_std * scaling_factor
@@ -288,7 +283,6 @@ def by_normal_distribution_sampling(
                 scale=sampling_std,
                 size=number_of_nans,
             )
-            # transform log-transformed values to be imputed back to normal scale and round to nearest integer
             impute_values = 10**log_impute_values
@@ -303,33 +297,25 @@ def by_normal_distribution_sampling(
         # determine column for protein intensities
         intensity_type = intensity_df.columns[3]
 
-        # get number of NaN values in dataset
         number_of_nans = intensity_df[intensity_type].isnull().sum()
-
-        # throw error if dataset is basically empty, something went wrong
         assert number_of_nans <= len(intensity_df[intensity_type]) - 2
 
-        # get indices of NaN values in the dataset
         location_of_nans = intensity_df[intensity_type].isnull()
         indices_of_nans = location_of_nans[location_of_nans].index
 
-        # calculate the mean and standard deviation of the entire dataset
         dataset_mean = np.log10(intensity_df[intensity_type]).mean()
         dataset_std = np.log10(intensity_df[intensity_type]).std()
-
-        # calculate mean and standard deviation of normal distribution to be sampled
-        scaled_dataset_mean = max(0, dataset_mean + down_shift * dataset_std)
-        scaled_dataset_std = dataset_std * scaling_factor
+        sampling_mean = max(0, dataset_mean + down_shift * dataset_std)
+        sampling_std = dataset_std * scaling_factor
 
         # calculate log-transformed values to be imputed
         log_impute_values = abs(
             np.random.normal(
-                loc=scaled_dataset_mean,
-                scale=scaled_dataset_std,
+                loc=sampling_mean,
+                scale=sampling_std,
                 size=number_of_nans,
             )
         )
-        # transform log-transformed values to be imputed back to normal scale and round to nearest integer
         impute_values = 10**log_impute_values
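End-to-end usage of the method this series lands, sketched against a synthetic dataframe in the four-column long format assumed throughout the PR (in practice the call is made through PROTzilla's workflow machinery rather than directly):

import numpy as np
import pandas as pd
from protzilla.data_preprocessing.imputation import by_normal_distribution_sampling

intensity_df = pd.DataFrame(
    {
        "Sample": ["S1", "S2", "S1", "S2"],
        "Protein ID": ["P1", "P1", "P2", "P2"],
        "Gene": ["G1", "G1", "G2", "G2"],
        "Intensity": [1e6, np.nan, 2e5, 3e5],
    }
)

imputed_df, _ = by_normal_distribution_sampling(
    intensity_df, strategy="perDataset", down_shift=-1, scaling_factor=0.5
)
print(imputed_df)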