From 4590cb54dd22de4af8de42fcff4209b6fa287054 Mon Sep 17 00:00:00 2001
From: henninggaertner
Date: Thu, 9 Nov 2023 15:42:23 +0100
Subject: [PATCH 01/15] Implement simple per-protein ND-sampling imputation
 method

---
 protzilla/constants/location_mapping.py    |  5 ++
 protzilla/constants/workflow_meta.json     | 63 ++++++++++++++++++++
 protzilla/data_preprocessing/imputation.py | 65 ++++++++++++++++++++
 3 files changed, 133 insertions(+)

diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py
index aa59e2cd1..95c229e1b 100644
--- a/protzilla/constants/location_mapping.py
+++ b/protzilla/constants/location_mapping.py
@@ -313,6 +313,11 @@
         "imputation",
         "min_value_per_dataset",
     ): imputation.by_min_per_dataset_plot,
+    (
+        "data_preprocessing",
+        "imputation",
+        "normal_distribution_sampling_per_protein",
+    ): imputation.by_normal_distribution_sampling,
     (
         "data_preprocessing",
         "outlier_detection",
diff --git a/protzilla/constants/workflow_meta.json b/protzilla/constants/workflow_meta.json
index 14ba36f09..e7a8f0fad 100644
--- a/protzilla/constants/workflow_meta.json
+++ b/protzilla/constants/workflow_meta.json
@@ -677,6 +677,69 @@
           }
         }
       ]
+    },
+    "normal_distribution_sampling_per_protein": {
+      "name": "Normal Distribution Sampling",
+      "description": "Imputation methods include normal distribution sampling per Protein or over Dataset",
+      "parameters": {
+        "strategy": {
+          "name": "Strategy:",
+          "type": "categorical",
+          "categories": [
+            "perProtein",
+            "perDataset"
+          ],
+          "default": "perProtein"
+        },
+        "down_shift": {
+          "name": "Downshift:",
+          "type": "numeric",
+          "min": -10,
+          "max": 10,
+          "default": -1
+        },
+        "scaling_factor": {
+          "name": "Scaling Factor:",
+          "type": "numeric",
+          "min": 0,
+          "max": 1,
+          "default": 1
+        }
+      },
+      "graphs": [
+        {
+          "graph_type": {
+            "name": "Graph type:",
+            "type": "categorical",
+            "categories": [
+              "Boxplot",
+              "Histogram"
+            ],
+            "default": "Boxplot"
+          },
+          "group_by": {
+            "name": "Group by:",
+            "type": "categorical",
+            "categories": [
+              "None",
+              "Sample",
+              "Protein ID"
+            ],
+            "default": "None"
+          }
+        },
+        {
+          "graph_type_quantities": {
+            "name": "Graph type Imputed Values:",
+            "type": "categorical",
+            "categories": [
+              "Bar chart",
+              "Pie chart"
+            ],
+            "default": "Pie chart"
+          }
+        }
+      ]
     }
   },
   "filter_peptides": {
diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py
index 14409a2f7..0a19bad6a 100644
--- a/protzilla/data_preprocessing/imputation.py
+++ b/protzilla/data_preprocessing/imputation.py
@@ -221,6 +221,71 @@ def by_min_per_dataset(
     return intensity_df_copy, dict()
 
 
+def by_normal_distribution_sampling(
+    intensity_df: pd.DataFrame,
+    strategy="perProtein",
+    down_shift=0,
+    scaling_factor=1,
+) -> tuple[pd.DataFrame, dict]:
+    """
+    A function to perform imputation via sampling of a normal distribution
+    defined by the existing datapoints and user-defined parameters for down-
+    shifting and scaling. Imputes missing values for each protein taking into
+    account data from each protein.
+
+    :param intensity_df: the dataframe that should be imputed, in\
+        long format
+    :type intensity_df: pandas DataFrame
+    :param strategy: which strategy to use for the definition of the normal\
+        distribution to be sampled. Can be "perProtein" or "perDataset"\
+    :type strategy: str
+    :param down_shift: a factor defining how many dataset standard deviations\
+        to shift the mean of the normal distribution used for imputation.\
+        Default: 0 (no shift)
+    :type down_shift: float
+    :param scaling_factor: a factor determining how the variance of the normal\
+        distribution used for imputation is scaled compared to the dataset.
+        Default: 1 (no scaling)
+    :type scaling_factor: float
+    :return: returns an imputed dataframe in typical protzilla long format\
+        and an empty dict
+    :rtype: tuple[pd.DataFrame, dict]
+    """
+    assert strategy in ["perProtein", "perDataset"]
+
+    transformed_df = long_to_wide(intensity_df)
+    transformed_df.dropna(axis=1, how="all", inplace=True)
+
+    # protein_means = transformed_df.mean(axis=1)
+    # protein_std = transformed_df.std(axis=1)
+    # scaled_protein_means = protein_means + down_shift * protein_std
+    # scaled_protein_std = protein_std * scaling_factor
+
+    # Iterate over proteins to impute minimal value
+    if strategy == "perProtein":
+        for column in transformed_df.columns:
+            # determine mean (loc)
+            protein_mean = transformed_df[column].mean()
+            protein_std = transformed_df[column].std()
+            scaled_protein_mean = max(0, protein_mean + down_shift * protein_std)
+            scaled_protein_std = protein_std * scaling_factor
+            # determine standard deviation (scale)
+            value_to_be_imputed = abs(
+                np.random.normal(
+                    loc=scaled_protein_mean,
+                    scale=scaled_protein_std,
+                )
+            )
+            transformed_df[column].fillna(value_to_be_imputed, inplace=True)
+    else:
+        pass
+        # determine mean of normal distribution of dataset
+        # TODO
+
+    imputed_df = wide_to_long(transformed_df, intensity_df)
+    return imputed_df, dict()
+
+
 def by_knn_plot(
     df, result_df, current_out, graph_type, graph_type_quantities, group_by
 ):
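A minimal, self-contained sketch of the sampling scheme PATCH 01 introduces: one value is drawn from a down-shifted, scaled normal distribution per protein and reused for all missing entries of that protein. The toy table and column names below are invented for illustration; the real code goes through long_to_wide/wide_to_long instead.

import numpy as np
import pandas as pd

# toy wide-format table: rows are samples, columns are protein groups
df = pd.DataFrame(
    {"ProteinA": [20.0, 22.0, np.nan, 21.0], "ProteinB": [5.0, np.nan, 6.0, np.nan]}
)

down_shift, scaling_factor = -1, 0.5
for column in df.columns:
    mean, std = df[column].mean(), df[column].std()
    # shift the mean down by `down_shift` standard deviations (clamped at 0)
    # and shrink the spread by `scaling_factor`, as in the patch above
    loc = max(0, mean + down_shift * std)
    scale = std * scaling_factor
    # a single draw per protein: every NaN in this column gets the same value
    df[column] = df[column].fillna(abs(np.random.normal(loc=loc, scale=scale)))

print(df)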
Can be "perProtein", "perDataset" or "most_frequent"\ + :type strategy: str + :param down_shift: a factor defining how many dataset standard deviations\ + to shift the mean of the normal distribution used for imputation.\ + Default: 0 (no shift) + :type down_shift: float + :param scaling_factor: a factor determining how the variance of the normal\ + distribution used for imputation is scaled compared to dataset. + Default: 1 (no scaling) + :type down_shift: float + :return: returns an imputed dataframe in typical protzilla long format\ + and an empty dict + :rtype: pd.DataFrame, int + """ + assert strategy in ["perProtein", "perDataset"] + + transformed_df = long_to_wide(intensity_df) + transformed_df.dropna(axis=1, how="all", inplace=True) + + # protein_means = transformed_df.mean(axis=1) + # protein_std = transformed_df.std(axis=1) + # scaled_protein_means = protein_means + down_shift * protein_std + # scaled_protein_std = protein_std * scaling_factor + + # Iterate over proteins to impute minimal value + if strategy == "perProtein": + for column in transformed_df.columns: + # determine mean (loc) + protein_mean = transformed_df[column].mean() + protein_std = transformed_df[column].std() + scaled_protein_mean = max(0, protein_mean + down_shift * protein_std) + scaled_protein_std = protein_std * scaling_factor + # determine standard deviation (scale) + value_to_be_imputed = abs( + np.random.normal( + loc=scaled_protein_mean, + scale=scaled_protein_std, + ) + ) + transformed_df[column].fillna(value_to_be_imputed, inplace=True) + else: + pass + # determine mean of normal distribution of dataset + # TODO + + imputed_df = wide_to_long(transformed_df, intensity_df) + return imputed_df, dict() + + def by_knn_plot( df, result_df, current_out, graph_type, graph_type_quantities, group_by ): From aeca5b2d771d8f2ae434b904385294ab8d1f21ba Mon Sep 17 00:00:00 2001 From: antonneubauer Date: Fri, 10 Nov 2023 14:26:38 +0100 Subject: [PATCH 02/15] just moved function --- protzilla/data_analysis/protein_graphs.py | 116 +++++++++++----------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/protzilla/data_analysis/protein_graphs.py b/protzilla/data_analysis/protein_graphs.py index fc1fc548d..78282376b 100644 --- a/protzilla/data_analysis/protein_graphs.py +++ b/protzilla/data_analysis/protein_graphs.py @@ -40,6 +40,64 @@ def variation_graph(protein_id: str, run_name: str): return out +def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: + """ + Creates a Protein-Variation-Graph for a given UniProt Protein ID using ProtGraph. + Included features are just `Variation`, digestion is skipped. + The Graph is saved in .graphml-Format. + + This is designed, so it can be used for peptides_to_isoform but works independently + as well + + ProtGraph: https://github.com/mpc-bioinformatics/ProtGraph/ + + :param protein_id: UniProt Protein-ID + :type: str + :param run_name: name of the run this is executed from. Used for saving the protein + file, graph + :type: str + :param queue_size: Queue Size for ProtGraph, This is yet to be merged by ProtGraph + :type: int + + :return: dict(graph_path, messages) + """ + + logger.info(f"Creating graph for protein {protein_id}") + run_path = RUNS_PATH / run_name + path_to_protein_file, filtered_blocks, request = _get_protein_file( + protein_id, run_path + ) + + path_to_protein_file = Path(path_to_protein_file) + if not path_to_protein_file.exists() and request.status_code != 200: + msg = f"error while downloading protein file for {protein_id}. 
Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. Tip: check if the ID is correct" + logger.error(msg) + return dict( + graph_path=None, + filtered_blocks=filtered_blocks, + messages=[dict(level=messages.ERROR, msg=msg, trace=request.__dict__)], + ) + + output_folder_path = run_path / "graphs" + output_csv = output_folder_path / f"{protein_id}.csv" + graph_path = output_folder_path / f"{protein_id}.graphml" + cmd_str = f"protgraph -egraphml {path_to_protein_file} \ + --export_output_folder={output_folder_path} \ + --output_csv={output_csv} \ + -ft VARIANT \ + -d skip" + + subprocess.run(cmd_str, shell=True) + + msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}" + logger.info(msg) + return dict( + graph_path=str(graph_path), + filtered_blocks=filtered_blocks, + messages=[dict(level=messages.INFO, msg=msg)], + ) + + def peptides_to_isoform( peptide_df: pd.DataFrame, protein_id: str, @@ -164,64 +222,6 @@ def peptides_to_isoform( ) -def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: - """ - Creates a Protein-Variation-Graph for a given UniProt Protein ID using ProtGraph. - Included features are just `Variation`, digestion is skipped. - The Graph is saved in .graphml-Format. - - This is designed, so it can be used for peptides_to_isoform but works independently - as well - - ProtGraph: https://github.com/mpc-bioinformatics/ProtGraph/ - - :param protein_id: UniProt Protein-ID - :type: str - :param run_name: name of the run this is executed from. Used for saving the protein - file, graph - :type: str - :param queue_size: Queue Size for ProtGraph, This is yet to be merged by ProtGraph - :type: int - - :return: dict(graph_path, messages) - """ - - logger.info(f"Creating graph for protein {protein_id}") - run_path = RUNS_PATH / run_name - path_to_protein_file, filtered_blocks, request = _get_protein_file( - protein_id, run_path - ) - - path_to_protein_file = Path(path_to_protein_file) - if not path_to_protein_file.exists() and request.status_code != 200: - msg = f"error while downloading protein file for {protein_id}. Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. 
Tip: check if the ID is correct" - logger.error(msg) - return dict( - graph_path=None, - filtered_blocks=filtered_blocks, - messages=[dict(level=messages.ERROR, msg=msg, trace=request.__dict__)], - ) - - output_folder_path = run_path / "graphs" - output_csv = output_folder_path / f"{protein_id}.csv" - graph_path = output_folder_path / f"{protein_id}.graphml" - cmd_str = f"protgraph -egraphml {path_to_protein_file} \ - --export_output_folder={output_folder_path} \ - --output_csv={output_csv} \ - -ft VARIANT \ - -d skip" - - subprocess.run(cmd_str, shell=True) - - msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}" - logger.info(msg) - return dict( - graph_path=str(graph_path), - filtered_blocks=filtered_blocks, - messages=[dict(level=messages.INFO, msg=msg)], - ) - - def _create_graph_index( protein_graph: nx.DiGraph, seq_len: int ) -> tuple[list | None, str, dict | None]: From 3a44abaf427a8440031678bd627cde0723b547cd Mon Sep 17 00:00:00 2001 From: antonneubauer Date: Fri, 10 Nov 2023 15:00:25 +0100 Subject: [PATCH 03/15] improve doc strings, deal with (most) PEP8 warnings --- protzilla/data_analysis/protein_graphs.py | 58 ++++++++++++----------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/protzilla/data_analysis/protein_graphs.py b/protzilla/data_analysis/protein_graphs.py index 78282376b..5aefe71f0 100644 --- a/protzilla/data_analysis/protein_graphs.py +++ b/protzilla/data_analysis/protein_graphs.py @@ -56,8 +56,6 @@ def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: :param run_name: name of the run this is executed from. Used for saving the protein file, graph :type: str - :param queue_size: Queue Size for ProtGraph, This is yet to be merged by ProtGraph - :type: int :return: dict(graph_path, messages) """ @@ -70,7 +68,7 @@ def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: path_to_protein_file = Path(path_to_protein_file) if not path_to_protein_file.exists() and request.status_code != 200: - msg = f"error while downloading protein file for {protein_id}. Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. Tip: check if the ID is correct" + msg = f"error while downloading protein file for {protein_id}. Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. Tip: check if the ID is correct" # noqa E501 logger.error(msg) return dict( graph_path=None, @@ -89,7 +87,7 @@ def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: subprocess.run(cmd_str, shell=True) - msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}" + msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}" # noqa E501 logger.info(msg) return dict( graph_path=str(graph_path), @@ -125,6 +123,10 @@ def peptides_to_isoform( :type run_name: str :param k: k-mer size to build necessary indices for matching peptides, defaults to 5 :type k: int, optional + :param allowed_mismatches: max number of mismatched amino acids between a given + peptide and the reference sequence at a given starting location of a potential + match + :type allowed_mismatches: int :return: dict of path to graph - either the modified graph or the original graph if the modification failed, the protein id, list of matched peptides, list of unmatched @@ -250,7 +252,7 @@ def _create_graph_index( starting_point = node break else: - msg = "No starting point found in the graph. An error in the graph creation is likely." 
+ msg = "No starting point found in the graph. An error in the graph creation is likely." # noqa E501 logger.error(msg) return None, msg, None @@ -263,15 +265,15 @@ def _create_graph_index( for node in protein_graph.nodes: if protein_graph.nodes[node]["aminoacid"] == "__end__": if longest_paths[node] < seq_len: - msg = f"The longest path to the last node is shorter than the reference sequence. An error in the graph creation is likely. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}" + msg = f"The longest path to the last node is shorter than the reference sequence. An error in the graph creation is likely. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}" # noqa E501 logger.error(msg) return None, msg, longest_paths elif longest_paths[node] > seq_len: - msg = f"The longest path to the last node is longer than the reference sequence. This could occur if a VARIANT substitutes more amino acids than it replaces. This is unexpected behaviour and should have been filtered out of the protein data. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}" + msg = f"The longest path to the last node is longer than the reference sequence. This could occur if a VARIANT substitutes more amino acids than it replaces. This is unexpected behaviour and should have been filtered out of the protein data. Node: {node}, longest path: {longest_paths[node]}, seq_len: {seq_len}" # noqa E501 logger.error(msg) return None, msg, longest_paths - index = [[] for i in range(seq_len)] + index = [[] for _ in range(seq_len)] for node in longest_paths: if ( protein_graph.nodes[node]["aminoacid"] == "__start__" @@ -338,7 +340,7 @@ def _longest_paths(protein_graph: nx.DiGraph, start_node: str): distances[succ] = max(distances[succ], distances[node] + node_len) else: raise Exception( - f"The node {node} was not visited in the topological order (distance should be set already)" + f"The node {node} was not visited in the topological order (distance should be set already)" # noqa E501 ) longest_paths = dict(sorted(distances.items(), key=lambda x: x[1])) @@ -418,7 +420,7 @@ def _parse_file(file_path): if variant_block and "/note" in line: # only VARIANTs that don't elongate the sequence are valid - if not "Missing" in line: + if "Missing" not in line: matches = re.search(r"([A-Z]+) -> ([A-Z]+)", line) if matches is None: # if this is invalid we don't want to be the ones who filter it out @@ -561,7 +563,7 @@ def _potential_peptide_matches( raise ValueError(f"k must be positive integer, but is {k}") if not isinstance(allowed_mismatches, int) or allowed_mismatches < 0: raise ValueError( - f"allowed_mismatches must be non-negative integer, but is {allowed_mismatches}" + f"allowed_mismatches must be non-negative integer, but is {allowed_mismatches}" # noqa E501 ) logger.debug("Matching peptides to reference sequence") @@ -578,7 +580,7 @@ def _potential_peptide_matches( # for now potential matches like this will be dismissed even if # match_start_pos + len(peptide) - allowed_mismatches <= seq_len logger.debug( - f"match would be out of bounds for peptide {peptide}, match_start_pos {match_start_pos}" + f"match would be out of bounds for peptide {peptide}, match_start_pos {match_start_pos}" # noqa E501 ) continue matched_starts.append(match_start_pos) @@ -589,7 +591,7 @@ def _potential_peptide_matches( peptide_mismatches.add(peptide) logger.debug( - f"potential peptide matches - peptide:[starting_pos] :: {potential_peptide_matches}" + f"potential peptide matches - 
peptide:[starting_pos] :: {potential_peptide_matches}" # noqa E501 ) logger.debug(f"peptide mismatches: {peptide_mismatches}") @@ -654,9 +656,10 @@ def _match_potential_matches( recursive matching method that branches off at the end of each node that is matches to the end while there is still amino acids left over in the peptide. A new recursion is opened for each successor of a node, given the scenario above. - A recursion is closed if the end of the peptide is reached or if the number of mismatches - exceeds the allowed number of mismatches. This leads to potential run time problems - for many allowed mismatches and long peptides in graphs with frequent Variations. + A recursion is closed if the end of the peptide is reached or if the number of + mismatches exceeds the allowed number of mismatches. This leads to potential run + time problems for many allowed mismatches and long peptides in graphs with frequent + Variations. For the recursive method, see below. @@ -712,7 +715,8 @@ def _match_on_graph( :param graph: protein variation graph, as created by ProtGraph (-> _create_protein_variation_graph) :type graph: networkx.DiGraph - :param current_node: current node in the graph, starting with the node of the match start + :param current_node: current node in the graph, starting with the node of the + match start :type current_node: str :param left_over_peptide: peptide that still needs to be matched to the graph :type left_over_peptide: str @@ -757,7 +761,7 @@ def _match_on_graph( last_index = i # node is matched til end, peptide not done - data_from_succ = [] + data_from_successor = [] recursion_start_mismatches = mismatches for succ in graph.successors(current_node): recursion_start_data = node_match_data.copy() @@ -771,9 +775,9 @@ def _match_on_graph( current_index=0, ) if match: - data_from_succ.append((match, match_data_from_succ, mismatches)) - if data_from_succ: - return_val = min(data_from_succ, key=lambda item: item[2]) + data_from_successor.append((match, match_data_from_succ, mismatches)) + if data_from_successor: + return_val = min(data_from_successor, key=lambda item: item[2]) return return_val else: return False, {}, mismatches @@ -790,7 +794,7 @@ def _match_on_graph( break else: logger.error( - f"No fitting node for match start position {match_start_index} of {peptide} found" + f"No fitting node for match start position {match_start_index} of {peptide} found" # noqa E501 ) continue matched, node_match_data, mismatches = _match_on_graph( @@ -826,11 +830,9 @@ def _modify_graph(graph, contig_positions): :param graph: Protein Graph to be modified :type: nx.DiGraph - :param contig_positions: Dict from current_node to contig-positions {current_node: [(start, end)]}. + :param contig_positions: Dict from current_node to contig-positions + {current_node: [(start, end)]}. 
:type: dict(list[tuple]) - :param longest_paths: mapping from current_node to the longest path to current_node - (-> _longest_paths()) - :type: dict :return: modified protein graph, with contigs & not-matched AAs as nodes, indicated by current_node attribute `matched` """ @@ -974,8 +976,8 @@ def _get_peptides(peptide_df: pd.DataFrame, protein_id: str) -> list[str] | None df = peptide_df[peptide_df["Protein ID"].str.contains(protein_id)] pattern = rf"^({protein_id}-\d+)$" - filter = df["Protein ID"].str.contains(pattern) - df = df[~filter] + protein_id_filter = df["Protein ID"].str.contains(pattern) + df = df[~protein_id_filter] intensity_name = [col for col in df.columns if "intensity" in col.lower()][0] df = df.dropna(subset=[intensity_name]) From efb0a0464636b3cd7f1c8e5e7d451a39e838de9f Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Mon, 13 Nov 2023 16:30:52 +0100 Subject: [PATCH 04/15] generalize python mapping name for ND sampling imputation, append missing location mapping --- protzilla/constants/location_mapping.py | 7 ++++++- protzilla/constants/workflow_meta.json | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py index 95c229e1b..91e4b0e91 100644 --- a/protzilla/constants/location_mapping.py +++ b/protzilla/constants/location_mapping.py @@ -127,6 +127,11 @@ "imputation", "min_value_per_dataset", ): imputation.by_min_per_dataset, + ( + "data_preprocessing", + "imputation", + "normal_distribution_sampling", + ): imputation.by_normal_distribution_sampling, ( "data_preprocessing", "filter_peptides", @@ -316,7 +321,7 @@ ( "data_preprocessing", "imputation", - "normal_distribution_sampling_per_protein", + "normal_distribution_sampling", ): imputation.by_normal_distribution_sampling, ( "data_preprocessing", diff --git a/protzilla/constants/workflow_meta.json b/protzilla/constants/workflow_meta.json index e7a8f0fad..1f76ffe63 100644 --- a/protzilla/constants/workflow_meta.json +++ b/protzilla/constants/workflow_meta.json @@ -678,7 +678,7 @@ } ] }, - "normal_distribution_sampling_per_protein": { + "normal_distribution_sampling": { "name": "Normal Distribution Sampling", "description": "Imputation methods include normal distribution sampling per Protein or over Dataset", "parameters": { From 7ca6f60a1f2ef934a117f9a5830ee6ea74da7518 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Mon, 13 Nov 2023 16:33:16 +0100 Subject: [PATCH 05/15] ND-sampling imputation now uses log-transformed intensities for sampling --- protzilla/data_preprocessing/imputation.py | 23 +++++++++++----------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 0a19bad6a..28060471e 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -256,32 +256,31 @@ def by_normal_distribution_sampling( transformed_df = long_to_wide(intensity_df) transformed_df.dropna(axis=1, how="all", inplace=True) - # protein_means = transformed_df.mean(axis=1) - # protein_std = transformed_df.std(axis=1) - # scaled_protein_means = protein_means + down_shift * protein_std - # scaled_protein_std = protein_std * scaling_factor - - # Iterate over proteins to impute minimal value + # TODO: sample the normal distribution for each missing value instead of sampling once for all missing values if strategy == "perProtein": for column in transformed_df.columns: - # determine mean (loc) - 
protein_mean = transformed_df[column].mean() - protein_std = transformed_df[column].std() + # determine mean and standard deviation of log-transformed protein intensties + protein_mean = np.log10(transformed_df[column]).mean() + protein_std = np.log10(transformed_df[column]).std() + # calculate mean and standard deviation of normal distribution to be sampled scaled_protein_mean = max(0, protein_mean + down_shift * protein_std) scaled_protein_std = protein_std * scaling_factor - # determine standard deviation (scale) - value_to_be_imputed = abs( + # sample from normal distribution and transform back to linear scale + log_value_to_be_imputed = abs( np.random.normal( loc=scaled_protein_mean, scale=scaled_protein_std, ) ) + value_to_be_imputed = 10**log_value_to_be_imputed + # impute missing values for current protein group transformed_df[column].fillna(value_to_be_imputed, inplace=True) else: pass # determine mean of normal distribution of dataset - # TODO + # TODO: implement perDataset strategy + # Turn the wide format into the long format and return imputed dataframe imputed_df = wide_to_long(transformed_df, intensity_df) return imputed_df, dict() From 812449dc306cc332d133caab870aec80c804c010 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Mon, 13 Nov 2023 16:40:42 +0100 Subject: [PATCH 06/15] ND sampling imputation now samples for each missing value instead of sampling once for entire protein group --- protzilla/data_preprocessing/imputation.py | 25 +++++++++++----------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 28060471e..c3705a2a6 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -256,25 +256,26 @@ def by_normal_distribution_sampling( transformed_df = long_to_wide(intensity_df) transformed_df.dropna(axis=1, how="all", inplace=True) - # TODO: sample the normal distribution for each missing value instead of sampling once for all missing values if strategy == "perProtein": for column in transformed_df.columns: - # determine mean and standard deviation of log-transformed protein intensties + # determine mean and standard deviation of log-transformed protein intensities protein_mean = np.log10(transformed_df[column]).mean() protein_std = np.log10(transformed_df[column]).std() # calculate mean and standard deviation of normal distribution to be sampled scaled_protein_mean = max(0, protein_mean + down_shift * protein_std) scaled_protein_std = protein_std * scaling_factor - # sample from normal distribution and transform back to linear scale - log_value_to_be_imputed = abs( - np.random.normal( - loc=scaled_protein_mean, - scale=scaled_protein_std, - ) - ) - value_to_be_imputed = 10**log_value_to_be_imputed - # impute missing values for current protein group - transformed_df[column].fillna(value_to_be_imputed, inplace=True) + # iterate over all values of current protein group + for index, value in transformed_df[column].iteritems(): + # if value is NaN, sample from normal distribution and impute value + if np.isnan(value): + log_value_to_be_imputed = abs( + np.random.normal( + loc=scaled_protein_mean, + scale=scaled_protein_std, + ) + ) + value_to_be_imputed = 10**log_value_to_be_imputed + transformed_df[column].loc[index] = value_to_be_imputed else: pass # determine mean of normal distribution of dataset From 9afddfca8f0fee17be77742b0c71ba0a6acd7bd1 Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Tue, 14 Nov 2023 
14:29:06 +0100
Subject: [PATCH 07/15] fix location mapping for normal distribution sampling
 imputation

---
 protzilla/constants/location_mapping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py
index 91e4b0e91..9f35637b9 100644
--- a/protzilla/constants/location_mapping.py
+++ b/protzilla/constants/location_mapping.py
@@ -322,7 +322,7 @@
         "data_preprocessing",
         "imputation",
         "normal_distribution_sampling",
-    ): imputation.by_normal_distribution_sampling,
+    ): imputation.by_normal_distribution_sampling_plot,
     (
         "data_preprocessing",
         "outlier_detection",
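For context on what PATCH 07 rewires: the location mapping is a plain dict from (section, step, method) tuples to callables, and the plot entry must point at the plot function rather than at the imputation function itself. A simplified, hypothetical dispatch sketch (names invented for illustration):

def by_example_method_plot(df):
    return df

plot_locations = {
    ("data_preprocessing", "imputation", "normal_distribution_sampling"): by_example_method_plot,
}

location = ("data_preprocessing", "imputation", "normal_distribution_sampling")
print(plot_locations[location]([1, 2, 3]))  # look up the registered callable and invoke it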
From a92941f4a23de6a5af39995e5c76cd9106ff69ab Mon Sep 17 00:00:00 2001
From: henninggaertner
Date: Tue, 14 Nov 2023 17:46:37 +0100
Subject: [PATCH 08/15] implement fast perProtein and perDataset normal dist.
 sampling imputation method

---
 protzilla/data_preprocessing/imputation.py | 115 ++++++++++++++++-----
 1 file changed, 89 insertions(+), 26 deletions(-)

diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py
index c3705a2a6..0ce54398b 100644
--- a/protzilla/data_preprocessing/imputation.py
+++ b/protzilla/data_preprocessing/imputation.py
@@ -232,6 +232,8 @@ def by_normal_distribution_sampling(
     defined by the existing datapoints and user-defined parameters for down-
     shifting and scaling. Imputes missing values for each protein taking into
     account data from each protein.
+    The downshifted normal distribution that will be sampled for imputation has a lower
+    limit for the mean at 0, meaning that if the downshifted mean were to be negative, it will be set at 0.
 
     :param intensity_df: the dataframe that should be imputed, in\
         long format
@@ -253,37 +255,79 @@ def by_normal_distribution_sampling(
     """
     assert strategy in ["perProtein", "perDataset"]
 
-    transformed_df = long_to_wide(intensity_df)
-    transformed_df.dropna(axis=1, how="all", inplace=True)
-
     if strategy == "perProtein":
-        for column in transformed_df.columns:
+        transformed_df = long_to_wide(intensity_df)
+        # iterate over all protein groups
+        for protein_grp in transformed_df.columns:
+            # determine number of missing values
+            number_of_nans = transformed_df[protein_grp].isnull().sum()
+
+            # get indices of NaN values in current protein group
+            location_of_nans = transformed_df[protein_grp].isnull()
+            indices_of_nans = location_of_nans[location_of_nans].index
+
             # determine mean and standard deviation of log-transformed protein intensities
-            protein_mean = np.log10(transformed_df[column]).mean()
-            protein_std = np.log10(transformed_df[column]).std()
+            protein_grp_mean = np.log10(transformed_df[protein_grp]).mean(skipna=True)
+            protein_grp_std = np.log10(transformed_df[protein_grp]).std(skipna=True)
+
             # calculate mean and standard deviation of normal distribution to be sampled
-            scaled_protein_mean = max(0, protein_mean + down_shift * protein_std)
-            scaled_protein_std = protein_std * scaling_factor
-            # iterate over all values of current protein group
-            for index, value in transformed_df[column].iteritems():
-                # if value is NaN, sample from normal distribution and impute value
-                if np.isnan(value):
-                    log_value_to_be_imputed = abs(
-                        np.random.normal(
-                            loc=scaled_protein_mean,
-                            scale=scaled_protein_std,
-                        )
-                    )
-                    value_to_be_imputed = 10**log_value_to_be_imputed
-                    transformed_df[column].loc[index] = value_to_be_imputed
+            sampling_mean = max(0, protein_grp_mean + down_shift * protein_grp_std)
+            sampling_std = protein_grp_std * scaling_factor
+
+            # calculate log-transformed values to be imputed
+            log_impute_values = abs(
+                np.random.normal(
+                    loc=sampling_mean,
+                    scale=sampling_std,
+                    size=number_of_nans,
+                )
+            )
+
+            # transform log-transformed values to be imputed back to normal scale and round to nearest integer
+            impute_values = np.round(10**log_impute_values, decimals=0)
+
+            # zip indices of NaN values with values to be imputed together as a Series, such that fillna can be used
+            impute_value_series = pd.Series(impute_values, index=indices_of_nans)
+            transformed_df[protein_grp].fillna(impute_value_series, inplace=True)
+
+        imputed_df = wide_to_long(transformed_df, intensity_df)
+        return imputed_df, dict()
+
     else:
-        pass
-        # determine mean of normal distribution of dataset
-        # TODO: implement perDataset strategy
+        # deep copy the dataframe
+        intensity_type = intensity_df.columns[3]
+
+        # get indices of NaN values in the dataset
+        location_of_nans = intensity_df[intensity_type].isnull()
+        indices_of_nans = location_of_nans[location_of_nans].index
+        # calculate the mean and standard deviation of the entire dataset
+        dataset_mean = np.log10(intensity_df[intensity_type]).mean()
+        dataset_std = np.log10(intensity_df[intensity_type]).std()
+
+        # calculate mean and standard deviation of normal distribution to be sampled
+        scaled_dataset_mean = max(0, dataset_mean + down_shift * dataset_std)
+        scaled_dataset_std = dataset_std * scaling_factor
+
+        # get number of NaN values in dataset
+        number_of_nans = intensity_df[intensity_type].isnull().sum()
+
+        # calculate log-transformed values to be imputed
+        log_impute_values = abs(
+            np.random.normal(
+                loc=scaled_dataset_mean,
+                scale=scaled_dataset_std,
+                size=number_of_nans,
+            )
+        )
 
-    # Turn the wide format into the long format and return imputed dataframe
-    imputed_df = wide_to_long(transformed_df, intensity_df)
-    return imputed_df, dict()
+        # transform log-transformed values to be imputed back to normal scale and round to nearest integer
+        impute_values = np.round(10**log_impute_values, decimals=0)
+
+        # zip indices of NaN values with values to be imputed together as a Series, such that fillna can be used
+        impute_value_series = pd.Series(impute_values, index=indices_of_nans)
+        intensity_df[intensity_type].fillna(impute_value_series, inplace=True)
+
+        return intensity_df, dict()
 
 
 def by_knn_plot(
@@ -294,6 +338,14 @@ def by_knn_plot(
     )
 
 
+def by_normal_distribution_sampling_plot(
+    df, result_df, current_out, graph_type, graph_type_quantities, group_by
+):
+    return _build_box_hist_plot(
+        df, result_df, graph_type, graph_type_quantities, group_by
+    )
+
+
 def by_simple_imputer_plot(
     df, result_df, current_out, graph_type, graph_type_quantities, group_by
 ):
@@ -380,3 +432,14 @@ def _build_box_hist_plot(
         heading="Number of Imputed Values",
     )
     return [fig1, fig2]
+
+
+def xf():
+    # load df from intensity_df.csv
+    df = pd.read_csv("intensity_df.csv")
+    # impute missing values
+    for i in range(1):
+        result_df, _ = by_normal_distribution_sampling(
+            df, strategy="perProtein", down_shift=0, scaling_factor=1
+        )
+    print("done")
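The central vectorization idea in PATCH 08: draw all imputation values with one np.random.normal(..., size=n) call and align them to the missing positions through a pd.Series indexed by the NaN locations, so a single fillna places every value. A small sketch with synthetic data:

import numpy as np
import pandas as pd

col = pd.Series([8.0, np.nan, 9.0, np.nan, 8.5], name="ProteinA")

mask = col.isnull()
nan_index = mask[mask].index  # index labels of the missing entries
draws = np.random.normal(loc=col.mean(), scale=col.std(), size=int(mask.sum()))

# fillna aligns the replacement Series on the index, filling exactly the NaNs
col = col.fillna(pd.Series(draws, index=nan_index))
print(col)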
From 50af6a757e19ace2867cdf55c8872a30b0313ddc Mon Sep 17 00:00:00 2001
From: henninggaertner
Date: Tue, 14 Nov 2023 17:54:27 +0100
Subject: [PATCH 09/15] removed unnecessary testing function

---
 protzilla/data_preprocessing/imputation.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py
index 0ce54398b..5e1c84f57 100644
--- a/protzilla/data_preprocessing/imputation.py
+++ b/protzilla/data_preprocessing/imputation.py
@@ -432,14 +432,3 @@ def _build_box_hist_plot(
         heading="Number of Imputed Values",
     )
     return [fig1, fig2]
-
-
-def xf():
-    # load df from intensity_df.csv
-    df = pd.read_csv("intensity_df.csv")
-    # impute missing values
-    for i in range(1):
-        result_df, _ = by_normal_distribution_sampling(
-            df, strategy="perProtein", down_shift=0, scaling_factor=1
-        )
-    print("done")

From dbd99da193d77039afd784ea0a7f3e5fe49f548b Mon Sep 17 00:00:00 2001
From: henninggaertner
Date: Tue, 14 Nov 2023 17:55:54 +0100
Subject: [PATCH 10/15] comment fix

---
 protzilla/data_preprocessing/imputation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py
index 5e1c84f57..9cd5e5356 100644
--- a/protzilla/data_preprocessing/imputation.py
+++ b/protzilla/data_preprocessing/imputation.py
@@ -294,7 +294,7 @@ def by_normal_distribution_sampling(
         return imputed_df, dict()
 
     else:
-        # deep copy the dataframe
+        # determine column for protein intensities
        intensity_type = intensity_df.columns[3]
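PATCH 10 only corrects a comment: intensity_df.columns[3] relies on the protzilla long-format convention that the fourth column holds the intensity values. A name-based lookup, like the pattern used elsewhere in this codebase, would be more robust; a sketch with an assumed four-column layout:

import pandas as pd

df = pd.DataFrame(
    {
        "Sample": ["S1", "S1"],
        "Protein ID": ["P1", "P2"],
        "Gene": ["G1", "G2"],
        "Intensity": [10.0, None],
    }
)

intensity_type = df.columns[3]  # positional convention used in the patch
# name-based alternative, mirroring the lookup used in protein_graphs.py
intensity_name = [c for c in df.columns if "intensity" in c.lower()][0]
assert intensity_type == intensity_name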
Can be "perProtein", "perDataset" or "most_frequent" :type strategy: str - :param down_shift: a factor defining how many dataset standard deviations\ - to shift the mean of the normal distribution used for imputation.\ + :param down_shift: a factor defining how many dataset standard deviations + to shift the mean of the normal distribution used for imputation. Default: 0 (no shift) :type down_shift: float - :param scaling_factor: a factor determining how the variance of the normal\ + :param scaling_factor: a factor determining how the variance of the normal distribution used for imputation is scaled compared to dataset. Default: 1 (no scaling) :type down_shift: float + :param round_values: whether to round the imputed values to the nearest integer + Default: False + :type round_values: bool :return: returns an imputed dataframe in typical protzilla long format\ and an empty dict :rtype: pd.DataFrame, int @@ -262,6 +266,10 @@ def by_normal_distribution_sampling( # determine number of missing values number_of_nans = transformed_df[protein_grp].isnull().sum() + # don't impute values if there not enough values (> 1) to sample from + if number_of_nans > len(transformed_df[protein_grp]) - 2: + continue + # get indices of NaN values in current protein group location_of_nans = transformed_df[protein_grp].isnull() indices_of_nans = location_of_nans[location_of_nans].index @@ -271,20 +279,18 @@ def by_normal_distribution_sampling( protein_grp_std = np.log10(transformed_df[protein_grp]).std(skipna=True) # calculate mean and standard deviation of normal distribution to be sampled - sampling_mean = max(0, protein_grp_mean + down_shift * protein_grp_std) + sampling_mean = protein_grp_mean + down_shift * protein_grp_std sampling_std = protein_grp_std * scaling_factor # calculate log-transformed values to be imputed - log_impute_values = abs( - np.random.normal( - loc=sampling_mean, - scale=sampling_std, - size=number_of_nans, - ) + log_impute_values = np.random.normal( + loc=sampling_mean, + scale=sampling_std, + size=number_of_nans, ) # transform log-transformed values to be imputed back to normal scale and round to nearest integer - impute_values = np.round(10**log_impute_values, decimals=0) + impute_values = 10**log_impute_values # zip indices of NaN values with values to be imputed together as a Series, such that fillna can be used impute_value_series = pd.Series(impute_values, index=indices_of_nans) @@ -297,9 +303,16 @@ def by_normal_distribution_sampling( # determine column for protein intensities intensity_type = intensity_df.columns[3] + # get number of NaN values in dataset + number_of_nans = intensity_df[intensity_type].isnull().sum() + + # throw error if dataset is basically empty, something went wrong + assert number_of_nans <= len(intensity_df[intensity_type]) - 2 + # get indices of NaN values in current protein group location_of_nans = intensity_df[intensity_type].isnull() indices_of_nans = location_of_nans[location_of_nans].index + # calculate the mean and standard deviation of the entire dataset dataset_mean = np.log10(intensity_df[intensity_type]).mean() dataset_std = np.log10(intensity_df[intensity_type]).std() @@ -308,9 +321,6 @@ def by_normal_distribution_sampling( scaled_dataset_mean = max(0, dataset_mean + down_shift * dataset_std) scaled_dataset_std = dataset_std * scaling_factor - # get number of NaN values in dataset - number_of_nans = intensity_df[intensity_type].isnull().sum() - # calculate log-transformed values to be imputed log_impute_values = abs( np.random.normal( @@ 
diff --git a/tests/protzilla/data_preprocessing/test_imputation.py b/tests/protzilla/data_preprocessing/test_imputation.py
index 44b66bb13..40caba91a 100644
--- a/tests/protzilla/data_preprocessing/test_imputation.py
+++ b/tests/protzilla/data_preprocessing/test_imputation.py
@@ -9,6 +9,8 @@
     by_min_per_protein_plot,
     by_min_per_sample,
     by_min_per_sample_plot,
+    by_normal_distribution_sampling,
+    by_normal_distribution_sampling_plot,
     by_simple_imputer,
     by_simple_imputer_plot,
     np,
@@ -18,6 +20,11 @@
 from tests.protzilla.data_preprocessing import test_plots_data_preprocessing
 
 
+def protein_group_intensities(dataframe, protein_group_name):
+    # small helper function for tests
+    return dataframe[dataframe["Protein ID"] == protein_group_name]["Intensity"]
+
+
 @pytest.fixture
 def input_imputation_df():
     test_intensity_list = (
@@ -266,6 +273,44 @@ def test_imputation_knn(show_figures, input_imputation_df, assertion_df_knn):
         {assertion_df} but is\n {result_df}"
 
 
+@pytest.mark.order(2)
+@pytest.mark.dependency(depends=["test_build_box_hist_plot"])
+def test_imputation_normal_distribution_sampling(show_figures, input_imputation_df):
+    # perform imputation on test data frame
+    result_df_perProtein = by_normal_distribution_sampling(
+        input_imputation_df, strategy="perProtein", down_shift=-10
+    )[0]
+    result_df_perDataset = by_normal_distribution_sampling(
+        input_imputation_df, strategy="perDataset", down_shift=-10
+    )[0]
+
+    fig1, fig2 = by_normal_distribution_sampling_plot(
+        input_imputation_df, result_df_perProtein, {}, "Boxplot", "Bar chart", "Sample"
+    )
+    if show_figures:
+        fig1.show()
+        fig2.show()
+
+    assert (
+        result_df_perProtein["Intensity"].min() >= 0
+    ), "Imputation by normal distribution sampling should not have negative values!"
+    assert (
+        result_df_perDataset["Intensity"].min() >= 0
+    ), "Imputation by normal distribution sampling should not have negative values!"
+
+    assert (
+        not protein_group_intensities(result_df_perProtein, "Protein1").hasnans
+    ) and (
+        not protein_group_intensities(result_df_perProtein, "Protein3").hasnans
+    ), "Imputation by normal distribution sampling should not leave NaN values!"
+    assert protein_group_intensities(
+        result_df_perProtein, "Protein2"
+    ).hasnans, "This protein group should have NaN values! Not enough data points to sample from!"
+    assert (
+        not result_df_perDataset["Intensity"].hasnans
+    ), "Imputation by normal distribution sampling per dataset should not leave NaN values!"
+
+
 def test_number_of_imputed_values(input_imputation_df, assertion_df_knn):
     count = number_of_imputed_values(input_imputation_df, assertion_df_knn)
     assert (
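The tests above exercise a stochastic method, so their assertions deliberately check invariants (positivity, which NaNs remain) rather than exact values. If exact-value assertions were ever needed, seeding NumPy's global RNG is the usual approach; a hypothetical pytest sketch, not part of this PR:

import numpy as np
import pytest

@pytest.fixture
def seeded_rng():
    # fix the seed so sampled imputation values are reproducible per test
    np.random.seed(42)

def test_sampling_is_deterministic(seeded_rng):
    first = np.random.normal(loc=0, scale=1, size=3)
    np.random.seed(42)
    second = np.random.normal(loc=0, scale=1, size=3)
    assert np.allclose(first, second)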
From abbb68fc19ad0586e63f4e12eac278250f62c668 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henning=20G=C3=A4rtner?=
 <104069093+henninggaertner@users.noreply.github.com>
Date: Thu, 16 Nov 2023 17:07:58 +0100
Subject: [PATCH 12/15] Update issue.md template to automatically assign
 PROTzilla project

---
 .github/ISSUE_TEMPLATE/issue.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ISSUE_TEMPLATE/issue.md b/.github/ISSUE_TEMPLATE/issue.md
index 387d4a3ee..356273db8 100644
--- a/.github/ISSUE_TEMPLATE/issue.md
+++ b/.github/ISSUE_TEMPLATE/issue.md
@@ -3,7 +3,7 @@ name: Issue Template
 about: Standard Issue Template
 title:
 labels:
-project: PROTzilla 2
+project: PROTzilla
 ---
 

From cfe54281244bda20ba2d1171c6c3e81519e75665 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henning=20G=C3=A4rtner?=
 <104069093+henninggaertner@users.noreply.github.com>
Date: Fri, 17 Nov 2023 11:23:47 +0100
Subject: [PATCH 13/15] Update todo_issue.md to automatically assign correct
 project

---
 .github/ISSUE_TEMPLATE/todo_issue.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ISSUE_TEMPLATE/todo_issue.md b/.github/ISSUE_TEMPLATE/todo_issue.md
index d8c7fbb93..c20221c3b 100644
--- a/.github/ISSUE_TEMPLATE/todo_issue.md
+++ b/.github/ISSUE_TEMPLATE/todo_issue.md
@@ -3,7 +3,7 @@ name: TODO
 about: TODO Issue Template
 title: '# TODO '
 labels: todo
-project: 'PROTzilla2'
+project: 'PROTzilla'
 ---
 

From 7b2fb4f832ea0cb4542b18c2336b174b24c36262 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henning=20G=C3=A4rtner?=
 <104069093+henninggaertner@users.noreply.github.com>
Date: Fri, 17 Nov 2023 12:50:03 +0100
Subject: [PATCH 14/15] Update issue.md

---
 .github/ISSUE_TEMPLATE/issue.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ISSUE_TEMPLATE/issue.md b/.github/ISSUE_TEMPLATE/issue.md
index 356273db8..677a8ce72 100644
--- a/.github/ISSUE_TEMPLATE/issue.md
+++ b/.github/ISSUE_TEMPLATE/issue.md
@@ -3,7 +3,7 @@ name: Issue Template
 about: Standard Issue Template
 title:
 labels:
-project: PROTzilla
+project: "PROTzilla"
 ---
 
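Before the final cleanup patch, one property worth spelling out: sampling in log10 space and back-transforming with 10**x yields strictly positive intensities no matter how far the mean is shifted down, which is why PATCH 11 could drop the abs() and max(0, ...) clamping for the per-protein case. A short demonstration:

import numpy as np

log_draws = np.random.normal(loc=1.0, scale=2.0, size=10_000)  # may be negative
imputed = 10 ** log_draws  # back-transform to the linear intensity scale

# every back-transformed value is positive, so no clamping is required
assert (imputed > 0).all()
print(imputed.min())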
From 5d65d0f20705cfe7507a51cd8f2727ff68f8202d Mon Sep 17 00:00:00 2001
From: "janni.roebbecke"
Date: Fri, 17 Nov 2023 13:30:08 +0100
Subject: [PATCH 15/15] minor, purely stylistic changes

---
 protzilla/data_preprocessing/imputation.py | 24 +++++-----------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py
index a0834c3c9..160112cfb 100644
--- a/protzilla/data_preprocessing/imputation.py
+++ b/protzilla/data_preprocessing/imputation.py
@@ -226,7 +226,6 @@ def by_normal_distribution_sampling(
     strategy="perProtein",
     down_shift=0,
     scaling_factor=1,
-    round_values=False,
 ) -> tuple[pd.DataFrame, dict]:
     """
     A function to perform imputation via sampling of a normal distribution
@@ -262,18 +262,18 @@ def by_normal_distribution_sampling(
         transformed_df = long_to_wide(intensity_df)
         # iterate over all protein groups
         for protein_grp in transformed_df.columns:
-            # determine number of missing values
+
             number_of_nans = transformed_df[protein_grp].isnull().sum()
 
             # don't impute values if there are not enough values (> 1) to sample from
             if number_of_nans > len(transformed_df[protein_grp]) - 2:
                 continue
 
-            # get indices of NaN values in current protein group
             location_of_nans = transformed_df[protein_grp].isnull()
             indices_of_nans = location_of_nans[location_of_nans].index
 
-            # determine mean and standard deviation of log-transformed protein intensities
             protein_grp_mean = np.log10(transformed_df[protein_grp]).mean(skipna=True)
             protein_grp_std = np.log10(transformed_df[protein_grp]).std(skipna=True)
-
-            # calculate mean and standard deviation of normal distribution to be sampled
             sampling_mean = protein_grp_mean + down_shift * protein_grp_std
             sampling_std = protein_grp_std * scaling_factor
@@ -288,7 +283,6 @@ def by_normal_distribution_sampling(
                 scale=sampling_std,
                 size=number_of_nans,
             )
-            # transform log-transformed values to be imputed back to normal scale and round to nearest integer
             impute_values = 10**log_impute_values
@@ -303,33 +297,25 @@ def by_normal_distribution_sampling(
         # determine column for protein intensities
         intensity_type = intensity_df.columns[3]
 
-        # get number of NaN values in dataset
         number_of_nans = intensity_df[intensity_type].isnull().sum()
-
-        # throw error if dataset is basically empty, something went wrong
         assert number_of_nans <= len(intensity_df[intensity_type]) - 2
 
-        # get indices of NaN values in the dataset
         location_of_nans = intensity_df[intensity_type].isnull()
         indices_of_nans = location_of_nans[location_of_nans].index
 
-        # calculate the mean and standard deviation of the entire dataset
         dataset_mean = np.log10(intensity_df[intensity_type]).mean()
         dataset_std = np.log10(intensity_df[intensity_type]).std()
-
-        # calculate mean and standard deviation of normal distribution to be sampled
-        scaled_dataset_mean = max(0, dataset_mean + down_shift * dataset_std)
-        scaled_dataset_std = dataset_std * scaling_factor
+        sampling_mean = max(0, dataset_mean + down_shift * dataset_std)
+        sampling_std = dataset_std * scaling_factor
 
         # calculate log-transformed values to be imputed
         log_impute_values = abs(
             np.random.normal(
-                loc=scaled_dataset_mean,
-                scale=scaled_dataset_std,
+                loc=sampling_mean,
+                scale=sampling_std,
                 size=number_of_nans,
             )
         )
-        # transform log-transformed values to be imputed back to normal scale and round to nearest integer
         impute_values = 10**log_impute_values
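End-to-end usage of the method this series lands, sketched against a synthetic dataframe in the four-column long format assumed throughout the PR (in practice the call is made through PROTzilla's workflow machinery rather than directly):

import numpy as np
import pandas as pd
from protzilla.data_preprocessing.imputation import by_normal_distribution_sampling

intensity_df = pd.DataFrame(
    {
        "Sample": ["S1", "S2", "S1", "S2"],
        "Protein ID": ["P1", "P1", "P2", "P2"],
        "Gene": ["G1", "G1", "G2", "G2"],
        "Intensity": [1e6, np.nan, 2e5, 3e5],
    }
)

imputed_df, _ = by_normal_distribution_sampling(
    intensity_df, strategy="perDataset", down_shift=-1, scaling_factor=0.5
)
print(imputed_df)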