diff --git a/protzilla/data_analysis/protein_graphs.py b/protzilla/data_analysis/protein_graphs.py index fc1fc548d..5aefe71f0 100644 --- a/protzilla/data_analysis/protein_graphs.py +++ b/protzilla/data_analysis/protein_graphs.py @@ -40,6 +40,62 @@ def variation_graph(protein_id: str, run_name: str): return out +def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: + """ + Creates a Protein-Variation-Graph for a given UniProt Protein ID using ProtGraph. + Included features are just `Variation`, digestion is skipped. + The Graph is saved in .graphml-Format. + + This is designed, so it can be used for peptides_to_isoform but works independently + as well + + ProtGraph: https://github.com/mpc-bioinformatics/ProtGraph/ + + :param protein_id: UniProt Protein-ID + :type: str + :param run_name: name of the run this is executed from. Used for saving the protein + file, graph + :type: str + + :return: dict(graph_path, messages) + """ + + logger.info(f"Creating graph for protein {protein_id}") + run_path = RUNS_PATH / run_name + path_to_protein_file, filtered_blocks, request = _get_protein_file( + protein_id, run_path + ) + + path_to_protein_file = Path(path_to_protein_file) + if not path_to_protein_file.exists() and request.status_code != 200: + msg = f"error while downloading protein file for {protein_id}. Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. Tip: check if the ID is correct" # noqa E501 + logger.error(msg) + return dict( + graph_path=None, + filtered_blocks=filtered_blocks, + messages=[dict(level=messages.ERROR, msg=msg, trace=request.__dict__)], + ) + + output_folder_path = run_path / "graphs" + output_csv = output_folder_path / f"{protein_id}.csv" + graph_path = output_folder_path / f"{protein_id}.graphml" + cmd_str = f"protgraph -egraphml {path_to_protein_file} \ + --export_output_folder={output_folder_path} \ + --output_csv={output_csv} \ + -ft VARIANT \ + -d skip" + + subprocess.run(cmd_str, shell=True) + + msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}" # noqa E501 + logger.info(msg) + return dict( + graph_path=str(graph_path), + filtered_blocks=filtered_blocks, + messages=[dict(level=messages.INFO, msg=msg)], + ) + + def peptides_to_isoform( peptide_df: pd.DataFrame, protein_id: str, @@ -67,6 +123,10 @@ def peptides_to_isoform( :type run_name: str :param k: k-mer size to build necessary indices for matching peptides, defaults to 5 :type k: int, optional + :param allowed_mismatches: max number of mismatched amino acids between a given + peptide and the reference sequence at a given starting location of a potential + match + :type allowed_mismatches: int :return: dict of path to graph - either the modified graph or the original graph if the modification failed, the protein id, list of matched peptides, list of unmatched @@ -164,64 +224,6 @@ def peptides_to_isoform( ) -def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: - """ - Creates a Protein-Variation-Graph for a given UniProt Protein ID using ProtGraph. - Included features are just `Variation`, digestion is skipped. - The Graph is saved in .graphml-Format. - - This is designed, so it can be used for peptides_to_isoform but works independently - as well - - ProtGraph: https://github.com/mpc-bioinformatics/ProtGraph/ - - :param protein_id: UniProt Protein-ID - :type: str - :param run_name: name of the run this is executed from. Used for saving the protein - file, graph - :type: str - :param queue_size: Queue Size for ProtGraph, This is yet to be merged by ProtGraph - :type: int - - :return: dict(graph_path, messages) - """ - - logger.info(f"Creating graph for protein {protein_id}") - run_path = RUNS_PATH / run_name - path_to_protein_file, filtered_blocks, request = _get_protein_file( - protein_id, run_path - ) - - path_to_protein_file = Path(path_to_protein_file) - if not path_to_protein_file.exists() and request.status_code != 200: - msg = f"error while downloading protein file for {protein_id}. Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. Tip: check if the ID is correct" - logger.error(msg) - return dict( - graph_path=None, - filtered_blocks=filtered_blocks, - messages=[dict(level=messages.ERROR, msg=msg, trace=request.__dict__)], - ) - - output_folder_path = run_path / "graphs" - output_csv = output_folder_path / f"{protein_id}.csv" - graph_path = output_folder_path / f"{protein_id}.graphml" - cmd_str = f"protgraph -egraphml {path_to_protein_file} \ - --export_output_folder={output_folder_path} \ - --output_csv={output_csv} \ - -ft VARIANT \ - -d skip" - - subprocess.run(cmd_str, shell=True) - - msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}" - logger.info(msg) - return dict( - graph_path=str(graph_path), - filtered_blocks=filtered_blocks, - messages=[dict(level=messages.INFO, msg=msg)], - ) - - def _create_graph_index( protein_graph: nx.DiGraph, seq_len: int ) -> tuple[list | None, str, dict | None]: @@ -250,7 +252,7 @@ def _create_graph_index( starting_point = node break else: - msg = "No starting point found in the graph. An error in the graph creation is likely." + msg = "No starting point found in the graph. An error in the graph creation is likely." # noqa E501 logger.error(msg) return None, msg, None @@ -263,15 +265,15 @@ def _create_graph_index( for node in protein_graph.nodes: if protein_graph.nodes[node]["aminoacid"] == "__end__": if longest_paths[node] < seq_len: - msg = f"The longest path to the last node is shorter than the reference sequence. An error in the graph creation is likely. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}" + msg = f"The longest path to the last node is shorter than the reference sequence. An error in the graph creation is likely. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}" # noqa E501 logger.error(msg) return None, msg, longest_paths elif longest_paths[node] > seq_len: - msg = f"The longest path to the last node is longer than the reference sequence. This could occur if a VARIANT substitutes more amino acids than it replaces. This is unexpected behaviour and should have been filtered out of the protein data. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}" + msg = f"The longest path to the last node is longer than the reference sequence. This could occur if a VARIANT substitutes more amino acids than it replaces. This is unexpected behaviour and should have been filtered out of the protein data. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}" # noqa E501 logger.error(msg) return None, msg, longest_paths - index = [[] for i in range(seq_len)] + index = [[] for _ in range(seq_len)] for node in longest_paths: if ( protein_graph.nodes[node]["aminoacid"] == "__start__" @@ -338,7 +340,7 @@ def _longest_paths(protein_graph: nx.DiGraph, start_node: str): distances[succ] = max(distances[succ], distances[node] + node_len) else: raise Exception( - f"The node {node} was not visited in the topological order (distance should be set already)" + f"The node {node} was not visited in the topological order (distance should be set already)" # noqa E501 ) longest_paths = dict(sorted(distances.items(), key=lambda x: x[1])) @@ -418,7 +420,7 @@ def _parse_file(file_path): if variant_block and "/note" in line: # only VARIANTs that don't elongate the sequence are valid - if not "Missing" in line: + if "Missing" not in line: matches = re.search(r"([A-Z]+) -> ([A-Z]+)", line) if matches is None: # if this is invalid we don't want to be the ones who filter it out @@ -561,7 +563,7 @@ def _potential_peptide_matches( raise ValueError(f"k must be positive integer, but is {k}") if not isinstance(allowed_mismatches, int) or allowed_mismatches < 0: raise ValueError( - f"allowed_mismatches must be non-negative integer, but is {allowed_mismatches}" + f"allowed_mismatches must be non-negative integer, but is {allowed_mismatches}" # noqa E501 ) logger.debug("Matching peptides to reference sequence") @@ -578,7 +580,7 @@ def _potential_peptide_matches( # for now potential matches like this will be dismissed even if # match_start_pos + len(peptide) - allowed_mismatches <= seq_len logger.debug( - f"match would be out of bounds for peptide {peptide}, match_start_pos {match_start_pos}" + f"match would be out of bounds for peptide {peptide}, match_start_pos {match_start_pos}" # noqa E501 ) continue matched_starts.append(match_start_pos) @@ -589,7 +591,7 @@ def _potential_peptide_matches( peptide_mismatches.add(peptide) logger.debug( - f"potential peptide matches - peptide:[starting_pos] :: {potential_peptide_matches}" + f"potential peptide matches - peptide:[starting_pos] :: {potential_peptide_matches}" # noqa E501 ) logger.debug(f"peptide mismatches: {peptide_mismatches}") @@ -654,9 +656,10 @@ def _match_potential_matches( recursive matching method that branches off at the end of each node that is matches to the end while there is still amino acids left over in the peptide. A new recursion is opened for each successor of a node, given the scenario above. - A recursion is closed if the end of the peptide is reached or if the number of mismatches - exceeds the allowed number of mismatches. This leads to potential run time problems - for many allowed mismatches and long peptides in graphs with frequent Variations. + A recursion is closed if the end of the peptide is reached or if the number of + mismatches exceeds the allowed number of mismatches. This leads to potential run + time problems for many allowed mismatches and long peptides in graphs with frequent + Variations. For the recursive method, see below. @@ -712,7 +715,8 @@ def _match_on_graph( :param graph: protein variation graph, as created by ProtGraph (-> _create_protein_variation_graph) :type graph: networkx.DiGraph - :param current_node: current node in the graph, starting with the node of the match start + :param current_node: current node in the graph, starting with the node of the + match start :type current_node: str :param left_over_peptide: peptide that still needs to be matched to the graph :type left_over_peptide: str @@ -757,7 +761,7 @@ def _match_on_graph( last_index = i # node is matched til end, peptide not done - data_from_succ = [] + data_from_successor = [] recursion_start_mismatches = mismatches for succ in graph.successors(current_node): recursion_start_data = node_match_data.copy() @@ -771,9 +775,9 @@ def _match_on_graph( current_index=0, ) if match: - data_from_succ.append((match, match_data_from_succ, mismatches)) - if data_from_succ: - return_val = min(data_from_succ, key=lambda item: item[2]) + data_from_successor.append((match, match_data_from_succ, mismatches)) + if data_from_successor: + return_val = min(data_from_successor, key=lambda item: item[2]) return return_val else: return False, {}, mismatches @@ -790,7 +794,7 @@ def _match_on_graph( break else: logger.error( - f"No fitting node for match start position {match_start_index} of {peptide} found" + f"No fitting node for match start position {match_start_index} of {peptide} found" # noqa E501 ) continue matched, node_match_data, mismatches = _match_on_graph( @@ -826,11 +830,9 @@ def _modify_graph(graph, contig_positions): :param graph: Protein Graph to be modified :type: nx.DiGraph - :param contig_positions: Dict from current_node to contig-positions {current_node: [(start, end)]}. + :param contig_positions: Dict from current_node to contig-positions + {current_node: [(start, end)]}. :type: dict(list[tuple]) - :param longest_paths: mapping from current_node to the longest path to current_node - (-> _longest_paths()) - :type: dict :return: modified protein graph, with contigs & not-matched AAs as nodes, indicated by current_node attribute `matched` """ @@ -974,8 +976,8 @@ def _get_peptides(peptide_df: pd.DataFrame, protein_id: str) -> list[str] | None df = peptide_df[peptide_df["Protein ID"].str.contains(protein_id)] pattern = rf"^({protein_id}-\d+)$" - filter = df["Protein ID"].str.contains(pattern) - df = df[~filter] + protein_id_filter = df["Protein ID"].str.contains(pattern) + df = df[~protein_id_filter] intensity_name = [col for col in df.columns if "intensity" in col.lower()][0] df = df.dropna(subset=[intensity_name])