diff --git a/.github/ISSUE_TEMPLATE/issue.md b/.github/ISSUE_TEMPLATE/issue.md
index 387d4a3ee..677a8ce72 100644
--- a/.github/ISSUE_TEMPLATE/issue.md
+++ b/.github/ISSUE_TEMPLATE/issue.md
@@ -3,7 +3,7 @@ name: Issue Template
 about: Standard Issue Template
 title:
 labels:
-project: PROTzilla 2
+project: "PROTzilla"
 ---
diff --git a/.github/ISSUE_TEMPLATE/todo_issue.md b/.github/ISSUE_TEMPLATE/todo_issue.md
index d8c7fbb93..c20221c3b 100644
--- a/.github/ISSUE_TEMPLATE/todo_issue.md
+++ b/.github/ISSUE_TEMPLATE/todo_issue.md
@@ -3,7 +3,7 @@ name: TODO
 about: TODO Issue Template
 title: '# TODO '
 labels: todo
-project: 'PROTzilla2'
+project: 'PROTzilla'
 ---
diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py
index bbc82d783..d9c075e66 100644
--- a/protzilla/constants/location_mapping.py
+++ b/protzilla/constants/location_mapping.py
@@ -132,6 +132,11 @@
         "imputation",
         "min_value_per_dataset",
     ): imputation.by_min_per_dataset,
+    (
+        "data_preprocessing",
+        "imputation",
+        "normal_distribution_sampling",
+    ): imputation.by_normal_distribution_sampling,
     (
         "data_preprocessing",
         "filter_peptides",
@@ -318,6 +323,11 @@
         "imputation",
         "min_value_per_dataset",
     ): imputation.by_min_per_dataset_plot,
+    (
+        "data_preprocessing",
+        "imputation",
+        "normal_distribution_sampling",
+    ): imputation.by_normal_distribution_sampling_plot,
     (
         "data_preprocessing",
         "outlier_detection",
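The two hunks above register the new method and its plot function in the `(section, step, method)` → callable dispatch tables. A minimal, self-contained sketch of that pattern — the function bodies and the `method_map`/`dispatch` names here are illustrative stand-ins, not PROTzilla's actual API:

```python
import pandas as pd

# Stand-ins for the real imputation functions; the actual ones live in
# protzilla/data_preprocessing/imputation.py and return (DataFrame, dict).
def by_knn(df: pd.DataFrame) -> pd.DataFrame:
    return df

def by_normal_distribution_sampling(df: pd.DataFrame) -> pd.DataFrame:
    return df

# (section, step, method) -> callable, mirroring the dict in location_mapping.py
method_map = {
    ("data_preprocessing", "imputation", "knn"): by_knn,
    (
        "data_preprocessing",
        "imputation",
        "normal_distribution_sampling",
    ): by_normal_distribution_sampling,
}

def dispatch(section: str, step: str, method: str, df: pd.DataFrame) -> pd.DataFrame:
    # adding a new method only requires registering it in the mapping above
    return method_map[(section, step, method)](df)
```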
diff --git a/protzilla/constants/workflow_meta.json b/protzilla/constants/workflow_meta.json
index fd685f864..aca0c83b9 100644
--- a/protzilla/constants/workflow_meta.json
+++ b/protzilla/constants/workflow_meta.json
@@ -703,6 +703,69 @@
           }
         }
       ]
+    },
+    "normal_distribution_sampling": {
+      "name": "Normal Distribution Sampling",
+      "description": "Imputes missing values by sampling from a normal distribution defined per protein or over the whole dataset",
+      "parameters": {
+        "strategy": {
+          "name": "Strategy:",
+          "type": "categorical",
+          "categories": [
+            "perProtein",
+            "perDataset"
+          ],
+          "default": "perProtein"
+        },
+        "down_shift": {
+          "name": "Downshift:",
+          "type": "numeric",
+          "min": -10,
+          "max": 10,
+          "default": -1
+        },
+        "scaling_factor": {
+          "name": "Scaling Factor:",
+          "type": "numeric",
+          "min": 0,
+          "max": 1,
+          "default": 1
+        }
+      },
+      "graphs": [
+        {
+          "graph_type": {
+            "name": "Graph type:",
+            "type": "categorical",
+            "categories": [
+              "Boxplot",
+              "Histogram"
+            ],
+            "default": "Boxplot"
+          },
+          "group_by": {
+            "name": "Group by:",
+            "type": "categorical",
+            "categories": [
+              "None",
+              "Sample",
+              "Protein ID"
+            ],
+            "default": "None"
+          }
+        },
+        {
+          "graph_type_quantities": {
+            "name": "Graph type Imputed Values:",
+            "type": "categorical",
+            "categories": [
+              "Bar chart",
+              "Pie chart"
+            ],
+            "default": "Pie chart"
+          }
+        }
+      ]
     }
   },
   "filter_peptides": {
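Each entry under `parameters` corresponds to one keyword argument of the function registered in `location_mapping.py`, with `default` used to pre-fill the UI. A hedged sketch of how such a block can be turned into keyword arguments (abbreviated JSON; how PROTzilla actually wires this up may differ):

```python
import json

# Abbreviated copy of the "parameters" block added above.
meta = json.loads("""
{
    "strategy": {"type": "categorical", "default": "perProtein"},
    "down_shift": {"type": "numeric", "default": -1},
    "scaling_factor": {"type": "numeric", "default": 1}
}
""")

# Collect the defaults as keyword arguments for the registered method.
kwargs = {name: spec["default"] for name, spec in meta.items()}
assert kwargs == {"strategy": "perProtein", "down_shift": -1, "scaling_factor": 1}
```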
diff --git a/protzilla/data_analysis/protein_graphs.py b/protzilla/data_analysis/protein_graphs.py
index fc1fc548d..5aefe71f0 100644
--- a/protzilla/data_analysis/protein_graphs.py
+++ b/protzilla/data_analysis/protein_graphs.py
@@ -40,6 +40,62 @@ def variation_graph(protein_id: str, run_name: str):
     return out
 
 
+def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict:
+    """
+    Creates a protein variation graph for a given UniProt protein ID using ProtGraph.
+    The only included feature is `Variation`; digestion is skipped.
+    The graph is saved in .graphml format.
+
+    This is designed so it can be used by peptides_to_isoform, but it works
+    independently as well.
+
+    ProtGraph: https://github.com/mpc-bioinformatics/ProtGraph/
+
+    :param protein_id: UniProt protein ID
+    :type protein_id: str
+    :param run_name: name of the run this is executed from. Used for saving the
+        protein file and the graph
+    :type run_name: str
+
+    :return: dict(graph_path, messages)
+    """
+
+    logger.info(f"Creating graph for protein {protein_id}")
+    run_path = RUNS_PATH / run_name
+    path_to_protein_file, filtered_blocks, request = _get_protein_file(
+        protein_id, run_path
+    )
+
+    path_to_protein_file = Path(path_to_protein_file)
+    if not path_to_protein_file.exists() and request.status_code != 200:
+        msg = f"error while downloading protein file for {protein_id}. Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. Tip: check if the ID is correct"  # noqa E501
+        logger.error(msg)
+        return dict(
+            graph_path=None,
+            filtered_blocks=filtered_blocks,
+            messages=[dict(level=messages.ERROR, msg=msg, trace=request.__dict__)],
+        )
+
+    output_folder_path = run_path / "graphs"
+    output_csv = output_folder_path / f"{protein_id}.csv"
+    graph_path = output_folder_path / f"{protein_id}.graphml"
+    cmd_str = f"protgraph -egraphml {path_to_protein_file} \
+        --export_output_folder={output_folder_path} \
+        --output_csv={output_csv} \
+        -ft VARIANT \
+        -d skip"
+
+    subprocess.run(cmd_str, shell=True)
+
+    msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}"  # noqa E501
+    logger.info(msg)
+    return dict(
+        graph_path=str(graph_path),
+        filtered_blocks=filtered_blocks,
+        messages=[dict(level=messages.INFO, msg=msg)],
+    )
+
+
 def peptides_to_isoform(
     peptide_df: pd.DataFrame,
     protein_id: str,
@@ -67,6 +123,10 @@
     :type run_name: str
     :param k: k-mer size to build necessary indices for matching peptides, defaults to 5
     :type k: int, optional
+    :param allowed_mismatches: max number of mismatched amino acids between a given
+        peptide and the reference sequence at a given starting location of a potential
+        match
+    :type allowed_mismatches: int
 
     :return: dict of path to graph - either the modified graph or the original graph
         if the modification failed, the protein id, list of matched peptides, list of unmatched
@@ -164,64 +224,6 @@
     )
 
 
-def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict:
-    """
-    Creates a Protein-Variation-Graph for a given UniProt Protein ID using ProtGraph.
-    Included features are just `Variation`, digestion is skipped.
-    The Graph is saved in .graphml-Format.
-
-    This is designed, so it can be used for peptides_to_isoform but works independently
-    as well
-
-    ProtGraph: https://github.com/mpc-bioinformatics/ProtGraph/
-
-    :param protein_id: UniProt Protein-ID
-    :type: str
-    :param run_name: name of the run this is executed from. Used for saving the protein
-        file, graph
-    :type: str
-    :param queue_size: Queue Size for ProtGraph, This is yet to be merged by ProtGraph
-    :type: int
-
-    :return: dict(graph_path, messages)
-    """
-
-    logger.info(f"Creating graph for protein {protein_id}")
-    run_path = RUNS_PATH / run_name
-    path_to_protein_file, filtered_blocks, request = _get_protein_file(
-        protein_id, run_path
-    )
-
-    path_to_protein_file = Path(path_to_protein_file)
-    if not path_to_protein_file.exists() and request.status_code != 200:
-        msg = f"error while downloading protein file for {protein_id}. Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. Tip: check if the ID is correct"
-        logger.error(msg)
-        return dict(
-            graph_path=None,
-            filtered_blocks=filtered_blocks,
-            messages=[dict(level=messages.ERROR, msg=msg, trace=request.__dict__)],
-        )
-
-    output_folder_path = run_path / "graphs"
-    output_csv = output_folder_path / f"{protein_id}.csv"
-    graph_path = output_folder_path / f"{protein_id}.graphml"
-    cmd_str = f"protgraph -egraphml {path_to_protein_file} \
-        --export_output_folder={output_folder_path} \
-        --output_csv={output_csv} \
-        -ft VARIANT \
-        -d skip"
-
-    subprocess.run(cmd_str, shell=True)
-
-    msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}"
-    logger.info(msg)
-    return dict(
-        graph_path=str(graph_path),
-        filtered_blocks=filtered_blocks,
-        messages=[dict(level=messages.INFO, msg=msg)],
-    )
-
-
 def _create_graph_index(
     protein_graph: nx.DiGraph, seq_len: int
 ) -> tuple[list | None, str, dict | None]:
@@ -250,7 +252,7 @@
             starting_point = node
             break
     else:
-        msg = "No starting point found in the graph. An error in the graph creation is likely."
+        msg = "No starting point found in the graph. An error in the graph creation is likely."  # noqa E501
         logger.error(msg)
         return None, msg, None
@@ -263,15 +265,15 @@
     for node in protein_graph.nodes:
         if protein_graph.nodes[node]["aminoacid"] == "__end__":
             if longest_paths[node] < seq_len:
-                msg = f"The longest path to the last node is shorter than the reference sequence. An error in the graph creation is likely. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}"
+                msg = f"The longest path to the last node is shorter than the reference sequence. An error in the graph creation is likely. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}"  # noqa E501
                 logger.error(msg)
                 return None, msg, longest_paths
             elif longest_paths[node] > seq_len:
-                msg = f"The longest path to the last node is longer than the reference sequence. This could occur if a VARIANT substitutes more amino acids than it replaces. This is unexpected behaviour and should have been filtered out of the protein data. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}"
+                msg = f"The longest path to the last node is longer than the reference sequence. This could occur if a VARIANT substitutes more amino acids than it replaces. This is unexpected behaviour and should have been filtered out of the protein data. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}"  # noqa E501
                 logger.error(msg)
                 return None, msg, longest_paths
 
-    index = [[] for i in range(seq_len)]
+    index = [[] for _ in range(seq_len)]
     for node in longest_paths:
         if (
             protein_graph.nodes[node]["aminoacid"] == "__start__"
@@ -338,7 +340,7 @@ def _longest_paths(protein_graph: nx.DiGraph, start_node: str):
             distances[succ] = max(distances[succ], distances[node] + node_len)
         else:
             raise Exception(
-                f"The node {node} was not visited in the topological order (distance should be set already)"
+                f"The node {node} was not visited in the topological order (distance should be set already)"  # noqa E501
             )
 
     longest_paths = dict(sorted(distances.items(), key=lambda x: x[1]))
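For context on the `_create_graph_index` and `_longest_paths` hunks above: on the ProtGraph DAG, visiting nodes in topological order lets each node's longest-path distance be finalized before any of its successors are relaxed; `_create_graph_index` then uses those distances to position nodes on the reference sequence. A simplified, runnable sketch under ProtGraph-style `aminoacid` node attributes (the real function differs in details such as error handling):

```python
import networkx as nx

def longest_paths_sketch(graph: nx.DiGraph, start_node: str) -> dict:
    # sanity check: the entry node carries the ProtGraph "__start__" label
    assert graph.nodes[start_node]["aminoacid"] == "__start__"
    # distance = length in amino acids of the longest path from start_node;
    # simplification: every node is assumed reachable from start_node
    distances = {node: 0 for node in graph.nodes}
    for node in nx.topological_sort(graph):
        aminoacid = graph.nodes[node]["aminoacid"]
        node_len = 0 if aminoacid in ("__start__", "__end__") else len(aminoacid)
        for succ in graph.successors(node):
            distances[succ] = max(distances[succ], distances[node] + node_len)
    return dict(sorted(distances.items(), key=lambda item: item[1]))

# Tiny example: a variant branch "AB" vs "A" between start and end.
g = nx.DiGraph()
g.add_node("n0", aminoacid="__start__")
g.add_node("n1", aminoacid="AB")
g.add_node("n2", aminoacid="A")
g.add_node("n3", aminoacid="__end__")
g.add_edges_from([("n0", "n1"), ("n0", "n2"), ("n1", "n3"), ("n2", "n3")])
assert longest_paths_sketch(g, "n0")["n3"] == 2
```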
@@ -418,7 +420,7 @@ def _parse_file(file_path):
 
         if variant_block and "/note" in line:
             # only VARIANTs that don't elongate the sequence are valid
-            if not "Missing" in line:
+            if "Missing" not in line:
                 matches = re.search(r"([A-Z]+) -> ([A-Z]+)", line)
                 if matches is None:
                     # if this is invalid we don't want to be the ones who filter it out
@@ -561,7 +563,7 @@
         raise ValueError(f"k must be positive integer, but is {k}")
     if not isinstance(allowed_mismatches, int) or allowed_mismatches < 0:
         raise ValueError(
-            f"allowed_mismatches must be non-negative integer, but is {allowed_mismatches}"
+            f"allowed_mismatches must be non-negative integer, but is {allowed_mismatches}"  # noqa E501
         )
 
     logger.debug("Matching peptides to reference sequence")
@@ -578,7 +580,7 @@
                 # for now potential matches like this will be dismissed even if
                 # match_start_pos + len(peptide) - allowed_mismatches <= seq_len
                 logger.debug(
-                    f"match would be out of bounds for peptide {peptide}, match_start_pos {match_start_pos}"
+                    f"match would be out of bounds for peptide {peptide}, match_start_pos {match_start_pos}"  # noqa E501
                 )
                 continue
             matched_starts.append(match_start_pos)
@@ -589,7 +591,7 @@
             peptide_mismatches.add(peptide)
 
     logger.debug(
-        f"potential peptide matches - peptide:[starting_pos] :: {potential_peptide_matches}"
+        f"potential peptide matches - peptide:[starting_pos] :: {potential_peptide_matches}"  # noqa E501
     )
     logger.debug(f"peptide mismatches: {peptide_mismatches}")
@@ -654,9 +656,10 @@
     recursive matching method that branches off at the end of each node that is
     matches to the end while there is still amino acids left over in the peptide.
     A new recursion is opened for each successor of a node, given the scenario above.
-    A recursion is closed if the end of the peptide is reached or if the number of mismatches
-    exceeds the allowed number of mismatches. This leads to potential run time problems
-    for many allowed mismatches and long peptides in graphs with frequent Variations.
+    A recursion is closed if the end of the peptide is reached or if the number of
+    mismatches exceeds the allowed number of mismatches. This leads to potential run
+    time problems for many allowed mismatches and long peptides in graphs with frequent
+    Variations.
 
     For the recursive method, see below.
@@ -712,7 +715,8 @@
     :param graph: protein variation graph, as created by ProtGraph
         (-> _create_protein_variation_graph)
     :type graph: networkx.DiGraph
-    :param current_node: current node in the graph, starting with the node of the match start
+    :param current_node: current node in the graph, starting with the node of the
+        match start
     :type current_node: str
     :param left_over_peptide: peptide that still needs to be matched to the graph
     :type left_over_peptide: str
@@ -757,7 +761,7 @@
             last_index = i
 
     # node is matched til end, peptide not done
-    data_from_succ = []
+    data_from_successor = []
     recursion_start_mismatches = mismatches
     for succ in graph.successors(current_node):
         recursion_start_data = node_match_data.copy()
@@ -771,9 +775,9 @@
             current_index=0,
         )
         if match:
-            data_from_succ.append((match, match_data_from_succ, mismatches))
-    if data_from_succ:
-        return_val = min(data_from_succ, key=lambda item: item[2])
+            data_from_successor.append((match, match_data_from_succ, mismatches))
+    if data_from_successor:
+        return_val = min(data_from_successor, key=lambda item: item[2])
         return return_val
     else:
         return False, {}, mismatches
@@ -790,7 +794,7 @@
                 break
         else:
             logger.error(
-                f"No fitting node for match start position {match_start_index} of {peptide} found"
+                f"No fitting node for match start position {match_start_index} of {peptide} found"  # noqa E501
             )
             continue
         matched, node_match_data, mismatches = _match_on_graph(
@@ -826,11 +830,9 @@ def _modify_graph(graph, contig_positions):
 
     :param graph: Protein Graph to be modified
     :type: nx.DiGraph
-    :param contig_positions: Dict from current_node to contig-positions {current_node: [(start, end)]}.
+    :param contig_positions: Dict from current_node to contig-positions
+        {current_node: [(start, end)]}.
     :type: dict(list[tuple])
-    :param longest_paths: mapping from current_node to the longest path to current_node
-        (-> _longest_paths())
-    :type: dict
     :return: modified protein graph, with contigs & not-matched AAs as nodes, indicated
         by current_node attribute `matched`
     """
@@ -974,8 +976,8 @@ def _get_peptides(peptide_df: pd.DataFrame, protein_id: str) -> list[str] | None
     df = peptide_df[peptide_df["Protein ID"].str.contains(protein_id)]
     pattern = rf"^({protein_id}-\d+)$"
-    filter = df["Protein ID"].str.contains(pattern)
-    df = df[~filter]
+    protein_id_filter = df["Protein ID"].str.contains(pattern)
+    df = df[~protein_id_filter]
 
     intensity_name = [col for col in df.columns if "intensity" in col.lower()][0]
     df = df.dropna(subset=[intensity_name])
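Several hunks above touch `_potential_peptide_matches`, which seeds matching by looking up each peptide's first k-mer in an index over the reference sequence; candidate starts that would run past the end of the sequence are dismissed, and peptides without any candidate are recorded as mismatches. A self-contained sketch of just this seeding step (mismatch tolerance and the recursive graph walk are omitted; names are illustrative):

```python
def build_kmer_index(ref_seq: str, k: int) -> dict:
    # map every k-mer of the reference sequence to its start positions
    index = {}
    for i in range(len(ref_seq) - k + 1):
        index.setdefault(ref_seq[i : i + k], []).append(i)
    return index

def potential_matches_sketch(peptides: list, ref_seq: str, k: int = 5):
    index = build_kmer_index(ref_seq, k)
    potential, mismatches = {}, set()
    for peptide in peptides:
        # candidate starts are wherever the peptide's first k-mer occurs;
        # starts whose full peptide would be out of bounds are dismissed
        starts = [
            pos for pos in index.get(peptide[:k], [])
            if pos + len(peptide) <= len(ref_seq)
        ]
        if starts:
            potential[peptide] = starts
        else:
            mismatches.add(peptide)
    return potential, mismatches

matches, misses = potential_matches_sketch(["PEPTIDE"], "XXPEPTIDEYY", k=5)
assert matches == {"PEPTIDE": [2]} and not misses
```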
diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py
index 14409a2f7..160112cfb 100644
--- a/protzilla/data_preprocessing/imputation.py
+++ b/protzilla/data_preprocessing/imputation.py
@@ -221,6 +221,111 @@ def by_min_per_dataset(
     return intensity_df_copy, dict()
 
 
+def by_normal_distribution_sampling(
+    intensity_df: pd.DataFrame,
+    strategy="perProtein",
+    down_shift=0,
+    scaling_factor=1,
+) -> tuple[pd.DataFrame, dict]:
+    """
+    A function to perform imputation via sampling of a normal distribution
+    defined by the existing datapoints and user-defined parameters for down-
+    shifting and scaling. Imputes missing values for each protein, using data
+    from either that protein alone or the whole dataset. The data is log-
+    transformed before sampling from the normal distribution and transformed
+    back afterwards, meaning only values > 0 are imputed.
+    Will not impute if insufficient data is available for sampling.
+
+    :param intensity_df: the dataframe that should be imputed, in
+        long format
+    :type intensity_df: pandas DataFrame
+    :param strategy: which strategy to use for definition of the normal
+        distribution to be sampled. Can be "perProtein" or "perDataset"
+    :type strategy: str
+    :param down_shift: a factor defining how many dataset standard deviations
+        to shift the mean of the normal distribution used for imputation.
+        Default: 0 (no shift)
+    :type down_shift: float
+    :param scaling_factor: a factor determining how the variance of the normal
+        distribution used for imputation is scaled compared to the dataset.
+        Default: 1 (no scaling)
+    :type scaling_factor: float
+    :return: an imputed dataframe in typical protzilla long format
+        and an empty dict
+    :rtype: tuple[pd.DataFrame, dict]
+    """
+    assert strategy in ["perProtein", "perDataset"]
+
+    if strategy == "perProtein":
+        transformed_df = long_to_wide(intensity_df)
+        # iterate over all protein groups
+        for protein_grp in transformed_df.columns:
+            number_of_nans = transformed_df[protein_grp].isnull().sum()
+
+            # don't impute if there are not enough values (at least 2) to sample from
+            if number_of_nans > len(transformed_df[protein_grp]) - 2:
+                continue
+
+            location_of_nans = transformed_df[protein_grp].isnull()
+            indices_of_nans = location_of_nans[location_of_nans].index
+
+            protein_grp_mean = np.log10(transformed_df[protein_grp]).mean(skipna=True)
+            protein_grp_std = np.log10(transformed_df[protein_grp]).std(skipna=True)
+            sampling_mean = protein_grp_mean + down_shift * protein_grp_std
+            sampling_std = protein_grp_std * scaling_factor
+
+            # calculate log-transformed values to be imputed
+            log_impute_values = np.random.normal(
+                loc=sampling_mean,
+                scale=sampling_std,
+                size=number_of_nans,
+            )
+            # transform the log-scale values to be imputed back to the original scale
+            impute_values = 10**log_impute_values
+
+            # pair the indices of NaN values with the values to be imputed in a
+            # Series, so that fillna can be used
+            impute_value_series = pd.Series(impute_values, index=indices_of_nans)
+            transformed_df[protein_grp].fillna(impute_value_series, inplace=True)
+
+        imputed_df = wide_to_long(transformed_df, intensity_df)
+        return imputed_df, dict()
+
+    else:
+        # determine the column that contains the protein intensities
+        intensity_type = intensity_df.columns[3]
+
+        number_of_nans = intensity_df[intensity_type].isnull().sum()
+        assert number_of_nans <= len(intensity_df[intensity_type]) - 2
+
+        location_of_nans = intensity_df[intensity_type].isnull()
+        indices_of_nans = location_of_nans[location_of_nans].index
+
+        dataset_mean = np.log10(intensity_df[intensity_type]).mean()
+        dataset_std = np.log10(intensity_df[intensity_type]).std()
+        sampling_mean = max(0, dataset_mean + down_shift * dataset_std)
+        sampling_std = dataset_std * scaling_factor
+
+        # calculate log-transformed values to be imputed
+        log_impute_values = abs(
+            np.random.normal(
+                loc=sampling_mean,
+                scale=sampling_std,
+                size=number_of_nans,
+            )
+        )
+        # transform the log-scale values to be imputed back to the original scale
+        impute_values = 10**log_impute_values
+
+        # pair the indices of NaN values with the values to be imputed in a
+        # Series, so that fillna can be used
+        impute_value_series = pd.Series(impute_values, index=indices_of_nans)
+        intensity_df[intensity_type].fillna(impute_value_series, inplace=True)
+
+        return intensity_df, dict()
+
+
 def by_knn_plot(
     df, result_df, current_out, graph_type, graph_type_quantities, group_by
 ):
@@ -229,6 +334,14 @@ def by_knn_plot(
     )
 
 
+def by_normal_distribution_sampling_plot(
+    df, result_df, current_out, graph_type, graph_type_quantities, group_by
+):
+    return _build_box_hist_plot(
+        df, result_df, graph_type, graph_type_quantities, group_by
+    )
+
+
 def by_simple_imputer_plot(
     df, result_df, current_out, graph_type, graph_type_quantities, group_by
 ):
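A hypothetical usage sketch for the new method. The import path matches the test file below; the frame layout follows PROTzilla's long format ("Sample", "Protein ID", "Gene", "Intensity"), but the values are invented for illustration:

```python
import numpy as np
import pandas as pd

from protzilla.data_preprocessing.imputation import by_normal_distribution_sampling

# Long-format intensity frame with one missing value per protein group; each
# group keeps at least two observed values, so imputation can sample.
df = pd.DataFrame({
    "Sample": ["S1", "S2", "S3"] * 2,
    "Protein ID": ["P1"] * 3 + ["P2"] * 3,
    "Gene": ["G1"] * 3 + ["G2"] * 3,
    "Intensity": [10.0, 12.0, np.nan, 1000.0, np.nan, 900.0],
})

# Sample replacements from a down-shifted, narrowed normal distribution.
imputed_df, _ = by_normal_distribution_sampling(
    df, strategy="perProtein", down_shift=-1, scaling_factor=0.5
)
print(imputed_df["Intensity"].isnull().sum())  # expected: 0
```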
diff --git a/tests/protzilla/data_preprocessing/test_imputation.py b/tests/protzilla/data_preprocessing/test_imputation.py
index 44b66bb13..40caba91a 100644
--- a/tests/protzilla/data_preprocessing/test_imputation.py
+++ b/tests/protzilla/data_preprocessing/test_imputation.py
@@ -9,6 +9,8 @@
     by_min_per_protein_plot,
     by_min_per_sample,
     by_min_per_sample_plot,
+    by_normal_distribution_sampling,
+    by_normal_distribution_sampling_plot,
     by_simple_imputer,
     by_simple_imputer_plot,
     np,
@@ -18,6 +20,11 @@
 from tests.protzilla.data_preprocessing import test_plots_data_preprocessing
 
 
+def protein_group_intensities(dataframe, protein_group_name):
+    # small helper function for tests
+    return dataframe[dataframe["Protein ID"] == protein_group_name]["Intensity"]
+
+
 @pytest.fixture
 def input_imputation_df():
     test_intensity_list = (
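Applied to the hypothetical `imputed_df` from the sketch after the imputation diff, the helper above simply isolates one protein group's intensity Series:

```python
# Series of P1's intensities from the hypothetical frame in the earlier sketch
p1_intensities = protein_group_intensities(imputed_df, "P1")
assert not p1_intensities.hasnans
```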
@@ -266,6 +273,44 @@ def test_imputation_knn(show_figures, input_imputation_df, assertion_df_knn):
         {assertion_df} but is\n {result_df}"
 
 
+@pytest.mark.order(2)
+@pytest.mark.dependency(depends=["test_build_box_hist_plot"])
+def test_imputation_normal_distribution_sampling(show_figures, input_imputation_df):
+    # perform imputation on the test data frame
+    result_df_perProtein = by_normal_distribution_sampling(
+        input_imputation_df, strategy="perProtein", down_shift=-10
+    )[0]
+    result_df_perDataset = by_normal_distribution_sampling(
+        input_imputation_df, strategy="perDataset", down_shift=-10
+    )[0]
+
+    fig1, fig2 = by_normal_distribution_sampling_plot(
+        input_imputation_df, result_df_perProtein, {}, "Boxplot", "Bar chart", "Sample"
+    )
+    if show_figures:
+        fig1.show()
+        fig2.show()
+
+    assert (
+        result_df_perProtein["Intensity"].min() >= 0
+    ), "Imputation by normal distribution sampling should not have negative values!"
+    assert (
+        result_df_perDataset["Intensity"].min() >= 0
+    ), "Imputation by normal distribution sampling should not have negative values!"
+
+    assert (
+        not protein_group_intensities(result_df_perProtein, "Protein1").hasnans
+    ) and (
+        not protein_group_intensities(result_df_perProtein, "Protein3").hasnans
+    ), "Imputation by normal distribution sampling should not have NaN values!"
+    assert protein_group_intensities(
+        result_df_perProtein, "Protein2"
+    ).hasnans, "This protein group should have NaN values! Not enough data points to sample from!"
+    assert (
+        not result_df_perDataset["Intensity"].hasnans
+    ), "Imputation by normal distribution sampling per Dataset should not have NaN values!"
+
+
 def test_number_of_imputed_values(input_imputation_df, assertion_df_knn):
     count = number_of_imputed_values(input_imputation_df, assertion_df_knn)
     assert (
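Finally, a hedged sketch of what `number_of_imputed_values` (exercised in the trailing context above) presumably computes — the count of cells that were missing before imputation and are filled afterwards; the real implementation is not shown in this diff:

```python
import pandas as pd

def number_of_imputed_values_sketch(
    input_df: pd.DataFrame, result_df: pd.DataFrame
) -> int:
    # cells that were NaN in the input but hold a value in the result
    return int(
        (input_df["Intensity"].isnull() & result_df["Intensity"].notnull()).sum()
    )
```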