From d3a79431beb8230c791492cd86bb6336169efbdd Mon Sep 17 00:00:00 2001 From: Lilly Zintl Date: Tue, 31 Oct 2023 17:38:46 +0100 Subject: [PATCH] remove \ in data_preprocessing docstrings --- .../data_preprocessing/filter_proteins.py | 10 ++-- protzilla/data_preprocessing/imputation.py | 49 ++++++++++--------- protzilla/data_preprocessing/normalisation.py | 18 +++---- .../data_preprocessing/outlier_detection.py | 10 ++-- .../data_preprocessing/peptide_filter.py | 2 +- protzilla/data_preprocessing/plots.py | 26 +++++----- .../data_preprocessing/transformation.py | 4 +- 7 files changed, 60 insertions(+), 59 deletions(-) diff --git a/protzilla/data_preprocessing/filter_proteins.py b/protzilla/data_preprocessing/filter_proteins.py index d589ab650..54c054136 100644 --- a/protzilla/data_preprocessing/filter_proteins.py +++ b/protzilla/data_preprocessing/filter_proteins.py @@ -8,15 +8,15 @@ def by_samples_missing(intensity_df, percentage): This function filters proteins based on its amount of nan values. If the percentage of existing values is below a threshold (percentage), the protein is filtered out. - :param df: the intensity dataframe that should be filtered\ + :param df: the intensity dataframe that should be filtered in long format :type df: pd.DataFrame - :param percentage: float ranging from 0 to 1. Defining the\ - relative share of samples the proteins should be present in inorder to be kept.\ + :param percentage: float ranging from 0 to 1. Defining the + relative share of samples the proteins should be present in in order to be kept. 
:type percentage: float - :return: returns the filtered df as a Dataframe and a dict with a listof Protein IDs\ - that were discarded and a list of Protein IDs\ + :return: returns the filtered df as a Dataframe and a dict with a list of Protein IDs + that were discarded and a list of Protein IDs that were kept :rtype: Tuple[pandas DataFrame, dict] """ diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 56ca7985b..3cbcecb95 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py @@ -29,17 +29,18 @@ def by_knn( Implements an instance of the sklearn.impute KNNImputer class. https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html - :param intensity_df: the dataframe that should be filtered in\ - long format + + :param intensity_df: the dataframe that should be filtered in + long format :type intensity_df: pandas DataFrame - :param number_of_neighbours: number of neighbouring samples used for\ - imputation. Default: 5 + :param number_of_neighbours: number of neighbouring samples used for + imputation. Default: 5 :type number_of_neighbours: int - :param **kwargs: additional keyword arguments passed to\ + :param **kwargs: additional keyword arguments passed to KNNImputer.fit_transform :type kwargs: dict - :return: returns an imputed dataframe in typical protzilla long format\ - and an empty dict + :return: returns an imputed dataframe in typical protzilla long format + and an empty dict :rtype: pd.DataFrame """ @@ -74,14 +75,15 @@ def by_simple_imputer( no data will be imputed. This function automatically filters out such proteins from the DataFrame beforehand. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be filtered in + long format :type intensity_df: pandas DataFrame - :param strategy: Defines the imputation strategy. 
Can be "mean",\ - "median" or "most_frequent" (for mode). + :param strategy: Defines the imputation strategy. Can be "mean", + "median" or "most_frequent" (for mode). :type strategy: str - :return: returns an imputed dataframe in typical protzilla long format\ - and an empty dict + + :return: returns an imputed dataframe in typical protzilla long format + and an empty dict :rtype: pd.DataFrame, int """ assert strategy in ["mean", "median", "most_frequent"] @@ -157,16 +159,16 @@ def by_min_per_protein( take a fraction of that minimum value for imputation. CAVE: All proteins without any values will be filtered out. - :param intensity_df: the dataframe that should be filtered in\ + :param intensity_df: the dataframe that should be filtered in long format :type intensity_df: pandas DataFrame - :param shrinking_value: a factor to alter the minimum value\ - used for imputation. With a shrinking factor of 0.1 for\ - example, a tenth of the minimum value found will be used for\ + :param shrinking_value: a factor to alter the minimum value + used for imputation. With a shrinking factor of 0.1 for + example, a tenth of the minimum value found will be used for imputation. Default: 1 (no shrinking) :type shrinking_value: float - :return: returns an imputed dataframe in typical protzilla long format\ + :return: returns an imputed dataframe in typical protzilla long format and an empty dict :rtype: pd.DataFrame, dict """ @@ -202,16 +204,16 @@ def by_min_per_dataset( the dataframe. The user can also assign a shrinking factor to take a fraction of that minimum value for imputation. - :param intensity_df: the dataframe that should be filtered in\ + :param intensity_df: the dataframe that should be filtered in long format :type intensity_df: pandas DataFrame - :param shrinking_value: a factor to alter the minimum value\ - used for imputation. 
With a shrinking factor of 0.1 for\ - example, a tenth of the minimum value found will be used for\ + :param shrinking_value: a factor to alter the minimum value + used for imputation. With a shrinking factor of 0.1 for + example, a tenth of the minimum value found will be used for imputation. Default: 1 (no shrinking) :type shrinking_value: float - :return: returns an imputed dataframe in typical protzilla long format\ + :return: returns an imputed dataframe in typical protzilla long format and an empty dict :rtype: pd.DataFrame, dict """ @@ -279,7 +281,6 @@ def _build_box_hist_plot( 2. a graph summarising the amount of filtered proteins. - """ if graph_type == "Boxplot": fig1 = create_box_plots( diff --git a/protzilla/data_preprocessing/normalisation.py b/protzilla/data_preprocessing/normalisation.py index 2f503e53f..036eb9dba 100644 --- a/protzilla/data_preprocessing/normalisation.py +++ b/protzilla/data_preprocessing/normalisation.py @@ -14,11 +14,11 @@ def by_z_score(intensity_df: pd.DataFrame) -> tuple[pd.DataFrame, dict]: Scales the data to zero mean and unit variance. This is often also called z-score normalisation/transformation. - :param intensity_df: the dataframe that should be filtered in\ + :param intensity_df: the dataframe that should be filtered in long format :type intensity_df: pd.DataFrame - :return: returns a scaled dataframe in typical protzilla long format and an empty\ + :return: returns a scaled dataframe in typical protzilla long format and an empty dictionary :rtype: Tuple[pandas DataFrame, dict] """ @@ -56,14 +56,14 @@ def by_median( Divides each intensity by the chosen intensity quartile of the respective sample. By default, the median (50%-quartile) is used. 
- :param intensity_df: the dataframe that should be filtered in\ + :param intensity_df: the dataframe that should be filtered in long format :type intensity_df: pandas DataFrame - :param percentile: the chosen quartile of the sample intensities for\ + :param percentile: the chosen quartile of the sample intensities for normalisation :type percentile: float - :return: returns a scaled dataframe in typical protzilla long format\ + :return: returns a scaled dataframe in typical protzilla long format and a dict, containing all zeroed samples due to quantile being 0 :rtype: Tuple[pandas DataFrame, dict] """ @@ -118,11 +118,11 @@ def by_totalsum(intensity_df: pd.DataFrame) -> tuple[pd.DataFrame, dict]: Normalises the data on the level of each sample. Divides each intensity by the total sum of sample intensities. - :param intensity_df: the dataframe that should be filtered in\ + :param intensity_df: the dataframe that should be filtered in long format :type intensity_df: pandas DataFrame - :return: returns a scaled dataframe in typical protzilla long format\ + :return: returns a scaled dataframe in typical protzilla long format and a dict, containing all zeroed samples due to sum being 0 :rtype: Tuple[pandas DataFrame, dict] """ @@ -181,12 +181,12 @@ def by_reference_protein( protein in each sample. Samples where this value is zero will be removed and returned separately. 
- :param intensity_df: the dataframe that should be filtered in\ + :param intensity_df: the dataframe that should be filtered in long format :type intensity_df: pandas DataFrame :param reference_protein: Protein ID of the protein to normalise by type reference_protein_id: str - :return: returns a scaled dataframe in typical protzilla long format \ + :return: returns a scaled dataframe in typical protzilla long format and dict with a list of the indices of the dropped samples :rtype: Tuple[pandas DataFrame, dict] """ diff --git a/protzilla/data_preprocessing/outlier_detection.py b/protzilla/data_preprocessing/outlier_detection.py index 231b8cdb6..f8d6f3155 100644 --- a/protzilla/data_preprocessing/outlier_detection.py +++ b/protzilla/data_preprocessing/outlier_detection.py @@ -30,7 +30,7 @@ def by_isolation_forest( all kernels (-1) :type n_jobs: integer - :return: returns a Dataframe containing all samples that are not outliers and a\ + :return: returns a Dataframe containing all samples that are not outliers and a dict with list of outlier sample names :rtype: Tuple[pandas DataFrame, dict] """ @@ -95,7 +95,7 @@ def by_local_outlier_factor( all kernels (-1) :type n_jobs: int - :return: returns a Dataframe containing all samples that are not outliers and a\ + :return: returns a Dataframe containing all samples that are not outliers and a dict with list of outlier sample names :rtype: Tuple[pandas DataFrame, dict] """ @@ -154,9 +154,9 @@ def by_pca( :type number_of_components: integer (2 or 3) :return: returns a Dataframe containing all samples that are not outliers. 
- A dict with list of inlier sample names, a DataFrame that contains the projection \ - of the intensity_df on first principal components, a list that contains the \ - explained variation for each component and an int, the number of components \ + A dict with list of inlier sample names, a DataFrame that contains the projection + of the intensity_df on first principal components, a list that contains the + explained variation for each component and an int, the number of components the calculations were executed with :rtype: Tuple[pandas DataFrame, dict] """ diff --git a/protzilla/data_preprocessing/peptide_filter.py b/protzilla/data_preprocessing/peptide_filter.py index 9657cfb44..ce1833aef 100644 --- a/protzilla/data_preprocessing/peptide_filter.py +++ b/protzilla/data_preprocessing/peptide_filter.py @@ -14,7 +14,7 @@ def by_pep_value( :type intensity_df: pd.Dataframe :param peptide_df: the pandas dataframe containing the peptide information :type peptide_df: pd.Dataframe - :param threshold: peptides with a PEP-value below this threshold will be filtered\ + :param threshold: peptides with a PEP-value below this threshold will be filtered out :type threshold: float diff --git a/protzilla/data_preprocessing/plots.py b/protzilla/data_preprocessing/plots.py index a49854099..5775f5f0e 100644 --- a/protzilla/data_preprocessing/plots.py +++ b/protzilla/data_preprocessing/plots.py @@ -123,10 +123,10 @@ def create_box_plots( (for example before and after filtering/normalisation) and creates a visualisation for each one. 
- :param dataframe_a: First dataframe in protzilla long format for\ + :param dataframe_a: First dataframe in protzilla long format for first boxplot :type dataframe_a: pd.DataFrame - :param dataframe_b: Second dataframe in protzilla long format\ + :param dataframe_b: Second dataframe in protzilla long format for second boxplot :type dataframe_b: pd.DataFrame :param name_a: Name of first boxplot @@ -139,10 +139,10 @@ def create_box_plots( :type y_title: str :param x_title: Optional x-axis title for graphs. :type x_title: str - :param group_by: Optional argument to create a grouped boxplot\ - graph. Arguments can be either "Sample" to group by sample or\ - "Protein ID" to group by protein. Leave "None" to get ungrouped\ - conventional graphs. If set the function will ignore the\ + :param group_by: Optional argument to create a grouped boxplot + graph. Arguments can be either "Sample" to group by sample or + "Protein ID" to group by protein. Leave "None" to get ungrouped + conventional graphs. If set the function will ignore the graph_type argument. Default is "None". :type group_by: str @@ -224,10 +224,10 @@ def create_histograms( (for example before and after filtering/normalisation) and creates a visualisation for each one. - :param dataframe_a: First dataframe in protzilla long format for\ + :param dataframe_a: First dataframe in protzilla long format for first histogram :type dataframe_a: pd.DataFrame - :param dataframe_b: Second dataframe in protzilla long format\ + :param dataframe_b: Second dataframe in protzilla long format for second histogram :type dataframe_b: pd.DataFrame :param name_a: Name of first histogram @@ -290,7 +290,7 @@ def create_anomaly_score_bar_plot( This function creates a graph visualising the outlier and non-outlier samples using the anomaly score. 
- :param anomaly_df: pandas Dataframe that contains the anomaly score for each\ + :param anomaly_df: pandas Dataframe that contains the anomaly score for each sample, including outliers and on-outliers samples :type anomaly_df: pd.DataFrame :param colour_outlier: hex code for colour depicting the outliers. @@ -352,10 +352,10 @@ def create_pca_2d_scatter_plot( and non-outlier points by showing the principal components. It returns a ploty Figure object. - :param pca_df: a DataFrame that contains the projection of\ + :param pca_df: a DataFrame that contains the projection of the intensity_df on first principal components :type pca_df: pd.DataFrame - :param explained_variance_ratio: a list that contains the\ + :param explained_variance_ratio: a list that contains the explained variation for each component :type explained_variance_ratio: list :param colour_outlier: hex code for colour depicting the outliers. @@ -407,10 +407,10 @@ def create_pca_3d_scatter_plot( and non-outlier points by showing the principal components. It returns a ploty Figure object. - :param pca_df: a DataFrame that contains the projection of\ + :param pca_df: a DataFrame that contains the projection of the intensity_df on first principal components :type pca_df: pd.DataFrame - :param explained_variance_ratio: a list that contains the\ + :param explained_variance_ratio: a list that contains the explained variation for each component :type explained_variance_ratio: list :param colour_outlier: hex code for colour depicting the outliers. 
diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py index 6e232a79d..5e8e16ba7 100644 --- a/protzilla/data_preprocessing/transformation.py +++ b/protzilla/data_preprocessing/transformation.py @@ -12,11 +12,11 @@ def by_log(intensity_df: pd.DataFrame, log_base="log10"): :param intensity_df: a protein data frame in long format :type intensity_df: pd.DataFrame - :param log_base: String of the used log method "log10" (base 10)\ + :param log_base: String of the used log method "log10" (base 10) or "log2" (base 2). Default: "log10" :type log_base: str - :return: returns a pandas DataFrame in typical protzilla\ + :return: returns a pandas DataFrame in typical protzilla long format with the transformed data and an empty dict. :rtype: Tuple[pandas DataFrame, dict] """