Skip to content

Commit

Permalink
remove \ in data_preprocessing docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
lill28 committed Oct 31, 2023
1 parent e4b002a commit d3a7943
Show file tree
Hide file tree
Showing 7 changed files with 60 additions and 59 deletions.
10 changes: 5 additions & 5 deletions protzilla/data_preprocessing/filter_proteins.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@ def by_samples_missing(intensity_df, percentage):
This function filters proteins based on their amount of NaN values.
If the percentage of existing values is below a threshold (percentage), the protein is filtered out.
:param df: the intensity dataframe that should be filtered\
:param intensity_df: the intensity dataframe that should be filtered
in long format
:type intensity_df: pd.DataFrame
:param percentage: float ranging from 0 to 1. Defining the\
relative share of samples the proteins should be present in inorder to be kept.\
:param percentage: float ranging from 0 to 1. Defining the
relative share of samples the proteins should be present in, in order to be kept.
:type percentage: float
:return: returns the filtered df as a Dataframe and a dict with a listof Protein IDs\
that were discarded and a list of Protein IDs\
:return: returns the filtered df as a Dataframe and a dict with a list of Protein IDs
that were discarded and a list of Protein IDs
that were kept
:rtype: Tuple[pandas DataFrame, dict]
"""
Expand Down
49 changes: 25 additions & 24 deletions protzilla/data_preprocessing/imputation.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,18 @@ def by_knn(
Implements an instance of the sklearn.impute KNNImputer
class.
https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html
:param intensity_df: the dataframe that should be filtered in\
long format
:param intensity_df: the dataframe that should be filtered in
long format
:type intensity_df: pandas DataFrame
:param number_of_neighbours: number of neighbouring samples used for\
imputation. Default: 5
:param number_of_neighbours: number of neighbouring samples used for
imputation. Default: 5
:type number_of_neighbours: int
:param **kwargs: additional keyword arguments passed to\
:param **kwargs: additional keyword arguments passed to
KNNImputer.fit_transform
:type kwargs: dict
:return: returns an imputed dataframe in typical protzilla long format\
and an empty dict
:return: returns an imputed dataframe in typical protzilla long format
and an empty dict
:rtype: pd.DataFrame, dict
"""

Expand Down Expand Up @@ -74,14 +75,15 @@ def by_simple_imputer(
no data will be imputed. This function automatically filters
out such proteins from the DataFrame beforehand.
:param intensity_df: the dataframe that should be filtered in\
long format
:param intensity_df: the dataframe that should be filtered in
long format
:type intensity_df: pandas DataFrame
:param strategy: Defines the imputation strategy. Can be "mean",\
"median" or "most_frequent" (for mode).
:param strategy: Defines the imputation strategy. Can be "mean",
"median" or "most_frequent" (for mode).
:type strategy: str
:return: returns an imputed dataframe in typical protzilla long format\
and an empty dict
:return: returns an imputed dataframe in typical protzilla long format
and an empty dict
:rtype: pd.DataFrame, dict
"""
assert strategy in ["mean", "median", "most_frequent"]
Expand Down Expand Up @@ -157,16 +159,16 @@ def by_min_per_protein(
take a fraction of that minimum value for imputation.
CAVE: All proteins without any values will be filtered out.
:param intensity_df: the dataframe that should be filtered in\
:param intensity_df: the dataframe that should be filtered in
long format
:type intensity_df: pandas DataFrame
:param shrinking_value: a factor to alter the minimum value\
used for imputation. With a shrinking factor of 0.1 for\
example, a tenth of the minimum value found will be used for\
:param shrinking_value: a factor to alter the minimum value
used for imputation. With a shrinking factor of 0.1 for
example, a tenth of the minimum value found will be used for
imputation. Default: 1 (no shrinking)
:type shrinking_value: float
:return: returns an imputed dataframe in typical protzilla long format\
:return: returns an imputed dataframe in typical protzilla long format
and an empty dict
:rtype: pd.DataFrame, dict
"""
Expand Down Expand Up @@ -202,16 +204,16 @@ def by_min_per_dataset(
the dataframe. The user can also assign a shrinking factor to
take a fraction of that minimum value for imputation.
:param intensity_df: the dataframe that should be filtered in\
:param intensity_df: the dataframe that should be filtered in
long format
:type intensity_df: pandas DataFrame
:param shrinking_value: a factor to alter the minimum value\
used for imputation. With a shrinking factor of 0.1 for\
example, a tenth of the minimum value found will be used for\
:param shrinking_value: a factor to alter the minimum value
used for imputation. With a shrinking factor of 0.1 for
example, a tenth of the minimum value found will be used for
imputation. Default: 1 (no shrinking)
:type shrinking_value: float
:return: returns an imputed dataframe in typical protzilla long format\
:return: returns an imputed dataframe in typical protzilla long format
and an empty dict
:rtype: pd.DataFrame, dict
"""
Expand Down Expand Up @@ -279,7 +281,6 @@ def _build_box_hist_plot(
2. a graph summarising the amount of
filtered proteins.
"""
if graph_type == "Boxplot":
fig1 = create_box_plots(
Expand Down
18 changes: 9 additions & 9 deletions protzilla/data_preprocessing/normalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ def by_z_score(intensity_df: pd.DataFrame) -> tuple[pd.DataFrame, dict]:
Scales the data to zero mean and unit variance. This is often also
called z-score normalisation/transformation.
:param intensity_df: the dataframe that should be filtered in\
:param intensity_df: the dataframe that should be filtered in
long format
:type intensity_df: pd.DataFrame
:return: returns a scaled dataframe in typical protzilla long format and an empty\
:return: returns a scaled dataframe in typical protzilla long format and an empty
dictionary
:rtype: Tuple[pandas DataFrame, dict]
"""
Expand Down Expand Up @@ -56,14 +56,14 @@ def by_median(
Divides each intensity by the chosen intensity quartile of the
respective sample. By default, the median (50%-quartile) is used.
:param intensity_df: the dataframe that should be filtered in\
:param intensity_df: the dataframe that should be filtered in
long format
:type intensity_df: pandas DataFrame
:param percentile: the chosen quartile of the sample intensities for\
:param percentile: the chosen quartile of the sample intensities for
normalisation
:type percentile: float
:return: returns a scaled dataframe in typical protzilla long format\
:return: returns a scaled dataframe in typical protzilla long format
and a dict, containing all zeroed samples due to quantile being 0
:rtype: Tuple[pandas DataFrame, dict]
"""
Expand Down Expand Up @@ -118,11 +118,11 @@ def by_totalsum(intensity_df: pd.DataFrame) -> tuple[pd.DataFrame, dict]:
Normalises the data on the level of each sample.
Divides each intensity by the total sum of sample intensities.
:param intensity_df: the dataframe that should be filtered in\
:param intensity_df: the dataframe that should be filtered in
long format
:type intensity_df: pandas DataFrame
:return: returns a scaled dataframe in typical protzilla long format\
:return: returns a scaled dataframe in typical protzilla long format
and a dict, containing all zeroed samples due to sum being 0
:rtype: Tuple[pandas DataFrame, dict]
"""
Expand Down Expand Up @@ -181,12 +181,12 @@ def by_reference_protein(
protein in each sample. Samples where this value is zero will be
removed and returned separately.
:param intensity_df: the dataframe that should be filtered in\
:param intensity_df: the dataframe that should be filtered in
long format
:type intensity_df: pandas DataFrame
:param reference_protein: Protein ID of the protein to normalise by
:type reference_protein_id: str
:return: returns a scaled dataframe in typical protzilla long format \
:return: returns a scaled dataframe in typical protzilla long format
and dict with a list of the indices of the dropped samples
:rtype: Tuple[pandas DataFrame, dict]
"""
Expand Down
10 changes: 5 additions & 5 deletions protzilla/data_preprocessing/outlier_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def by_isolation_forest(
all kernels (-1)
:type n_jobs: integer
:return: returns a Dataframe containing all samples that are not outliers and a\
:return: returns a Dataframe containing all samples that are not outliers and a
dict with list of outlier sample names
:rtype: Tuple[pandas DataFrame, dict]
"""
Expand Down Expand Up @@ -95,7 +95,7 @@ def by_local_outlier_factor(
all kernels (-1)
:type n_jobs: int
:return: returns a Dataframe containing all samples that are not outliers and a\
:return: returns a Dataframe containing all samples that are not outliers and a
dict with list of outlier sample names
:rtype: Tuple[pandas DataFrame, dict]
"""
Expand Down Expand Up @@ -154,9 +154,9 @@ def by_pca(
:type number_of_components: integer (2 or 3)
:return: returns a Dataframe containing all samples that are not outliers.
A dict with list of inlier sample names, a DataFrame that contains the projection \
of the intensity_df on first principal components, a list that contains the \
explained variation for each component and an int, the number of components \
A dict with list of inlier sample names, a DataFrame that contains the projection
of the intensity_df on first principal components, a list that contains the
explained variation for each component and an int, the number of components
the calculations were executed with
:rtype: Tuple[pandas DataFrame, dict]
"""
Expand Down
2 changes: 1 addition & 1 deletion protzilla/data_preprocessing/peptide_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def by_pep_value(
:type intensity_df: pd.DataFrame
:param peptide_df: the pandas dataframe containing the peptide information
:type peptide_df: pd.DataFrame
:param threshold: peptides with a PEP-value below this threshold will be filtered\
:param threshold: peptides with a PEP-value below this threshold will be filtered
out
:type threshold: float
Expand Down
26 changes: 13 additions & 13 deletions protzilla/data_preprocessing/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,10 @@ def create_box_plots(
(for example before and after filtering/normalisation) and creates
a visualisation for each one.
:param dataframe_a: First dataframe in protzilla long format for\
:param dataframe_a: First dataframe in protzilla long format for
first boxplot
:type dataframe_a: pd.DataFrame
:param dataframe_b: Second dataframe in protzilla long format\
:param dataframe_b: Second dataframe in protzilla long format
for second boxplot
:type dataframe_b: pd.DataFrame
:param name_a: Name of first boxplot
Expand All @@ -139,10 +139,10 @@ def create_box_plots(
:type y_title: str
:param x_title: Optional x-axis title for graphs.
:type x_title: str
:param group_by: Optional argument to create a grouped boxplot\
graph. Arguments can be either "Sample" to group by sample or\
"Protein ID" to group by protein. Leave "None" to get ungrouped\
conventional graphs. If set the function will ignore the\
:param group_by: Optional argument to create a grouped boxplot
graph. Arguments can be either "Sample" to group by sample or
"Protein ID" to group by protein. Leave "None" to get ungrouped
conventional graphs. If set the function will ignore the
graph_type argument. Default is "None".
:type group_by: str
Expand Down Expand Up @@ -224,10 +224,10 @@ def create_histograms(
(for example before and after filtering/normalisation) and creates
a visualisation for each one.
:param dataframe_a: First dataframe in protzilla long format for\
:param dataframe_a: First dataframe in protzilla long format for
first histogram
:type dataframe_a: pd.DataFrame
:param dataframe_b: Second dataframe in protzilla long format\
:param dataframe_b: Second dataframe in protzilla long format
for second histogram
:type dataframe_b: pd.DataFrame
:param name_a: Name of first histogram
Expand Down Expand Up @@ -290,7 +290,7 @@ def create_anomaly_score_bar_plot(
This function creates a graph visualising the outlier
and non-outlier samples using the anomaly score.
:param anomaly_df: pandas Dataframe that contains the anomaly score for each\
:param anomaly_df: pandas Dataframe that contains the anomaly score for each
sample, including outlier and non-outlier samples
:type anomaly_df: pd.DataFrame
:param colour_outlier: hex code for colour depicting the outliers.
Expand Down Expand Up @@ -352,10 +352,10 @@ def create_pca_2d_scatter_plot(
and non-outlier points by showing the principal components. It
returns a plotly Figure object.
:param pca_df: a DataFrame that contains the projection of\
:param pca_df: a DataFrame that contains the projection of
the intensity_df on first principal components
:type pca_df: pd.DataFrame
:param explained_variance_ratio: a list that contains the\
:param explained_variance_ratio: a list that contains the
explained variation for each component
:type explained_variance_ratio: list
:param colour_outlier: hex code for colour depicting the outliers.
Expand Down Expand Up @@ -407,10 +407,10 @@ def create_pca_3d_scatter_plot(
and non-outlier points by showing the principal components. It
returns a plotly Figure object.
:param pca_df: a DataFrame that contains the projection of\
:param pca_df: a DataFrame that contains the projection of
the intensity_df on first principal components
:type pca_df: pd.DataFrame
:param explained_variance_ratio: a list that contains the\
:param explained_variance_ratio: a list that contains the
explained variation for each component
:type explained_variance_ratio: list
:param colour_outlier: hex code for colour depicting the outliers.
Expand Down
4 changes: 2 additions & 2 deletions protzilla/data_preprocessing/transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ def by_log(intensity_df: pd.DataFrame, log_base="log10"):
:param intensity_df: a protein data frame in long format
:type intensity_df: pd.DataFrame
:param log_base: String of the used log method "log10" (base 10)\
:param log_base: String of the used log method "log10" (base 10)
or "log2" (base 2). Default: "log10"
:type log_base: str
:return: returns a pandas DataFrame in typical protzilla\
:return: returns a pandas DataFrame in typical protzilla
long format with the transformed data and an empty dict.
:rtype: Tuple[pandas DataFrame, dict]
"""
Expand Down

0 comments on commit d3a7943

Please sign in to comment.