From 08436b3d09bc9bc0937ca7e14df42d5f2f13a03a Mon Sep 17 00:00:00 2001 From: Lilly Zintl <73845790+lill28@users.noreply.github.com> Date: Sat, 25 Nov 2023 09:56:31 +0100 Subject: [PATCH] Docstrings (#289) * correct docstring syntax of data_analysis package * Fix docstring syntax in data_integration package * Fix docstring syntax in utilities package * Fix more docstring syntax * format code * add docstrings for database_integration.py * add returns for database_download.py * add docstrings for database_query.py * add docstrings for upload_handler.py * docstrings runs/views part1 * docstrings 2 runs/views.py * docstrings for runs/fields.py * docstrings for clustergram copy * docstrings for data_analysis/dimension_reduction.py * docstrings for data_analysis/classification.py * add __init__.py * Fix run docstring * Add docs build instructions and packages * update docs build instructions * Implement review suggestions * remove \ in data_preprocessing docstrings * remove more \ in docstrings * adopt PR suggestions --------- Co-authored-by: Fynn Co-authored-by: BelanaZ <66524915+BelanaZ@users.noreply.github.com> Co-authored-by: Sara Grau Co-authored-by: antonneubauer --- docs/Makefile | 20 ++ docs/build_docs.md | 8 + docs/make.bat | 35 +++ docs/source/conf.py | 27 +++ docs/source/index.rst | 20 ++ protzilla/constants/location_mapping.py | 17 +- protzilla/data_analysis/classification.py | 63 ++++- protzilla/data_analysis/clustering.py | 42 ++-- .../differential_expression_anova.py | 6 +- .../differential_expression_linear_model.py | 4 +- .../differential_expression_t_test.py | 29 ++- .../data_analysis/dimension_reduction.py | 59 +++-- protzilla/data_analysis/model_evaluation.py | 2 +- .../data_analysis/model_evaluation_plots.py | 6 +- protzilla/data_analysis/plots.py | 30 +-- protzilla/data_analysis/protein_graphs.py | 103 ++++++-- protzilla/data_integration/__init__.py | 0 .../data_integration/database_download.py | 18 +- .../data_integration/database_integration.py | 29 +++ protzilla/data_integration/database_query.py | 44 +++- protzilla/data_integration/di_plots.py | 4 + .../data_integration/enrichment_analysis.py | 8 + .../enrichment_analysis_gsea.py | 4 + .../enrichment_analysis_helper.py | 5 + .../data_preprocessing/filter_proteins.py | 16 +- .../data_preprocessing/filter_samples.py | 16 +- protzilla/data_preprocessing/imputation.py | 81 ++++--- protzilla/data_preprocessing/normalisation.py | 41 ++-- .../data_preprocessing/outlier_detection.py | 39 +-- .../data_preprocessing/peptide_filter.py | 6 +- protzilla/data_preprocessing/plots.py | 69 +++--- .../data_preprocessing/transformation.py | 11 +- protzilla/history.py | 10 +- protzilla/importing/__init__.py | 0 protzilla/run.py | 64 +++-- protzilla/utilities/clustergram.py | 13 + protzilla/utilities/dunn_score.py | 1 + protzilla/utilities/transform_dfs.py | 26 +- protzilla/utilities/utilities.py | 1 + requirements.txt | 2 + tests/protzilla/data_analysis/__init__.py | 0 tests/protzilla/data_integration/__init__.py | 0 tests/protzilla/importing/__init__.py | 0 tests/ui/__init__.py | 0 ui/main/upload_handler.py | 6 +- ui/runs/fields.py | 126 +++++++++- ui/runs/utilities/__init__.py | 0 ui/runs/views.py | 229 +++++++++++++++++- 48 files changed, 1048 insertions(+), 292 deletions(-) create mode 100644 docs/Makefile create mode 100644 docs/build_docs.md create mode 100644 docs/make.bat create mode 100644 docs/source/conf.py create mode 100644 docs/source/index.rst create mode 100644 protzilla/data_integration/__init__.py create mode 100644 
protzilla/importing/__init__.py create mode 100644 tests/protzilla/data_analysis/__init__.py create mode 100644 tests/protzilla/data_integration/__init__.py create mode 100644 tests/protzilla/importing/__init__.py create mode 100644 tests/ui/__init__.py create mode 100644 ui/runs/utilities/__init__.py diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..d0c3cbf10 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/build_docs.md b/docs/build_docs.md new file mode 100644 index 000000000..42d402fc4 --- /dev/null +++ b/docs/build_docs.md @@ -0,0 +1,8 @@ +## Build docs with Sphinx ## +- Docs are built with sphinx-autoapi +- After installing the packages in requirements.txt, all necessary dependencies for building the docs should be installed (sphinx==7.2.6, sphinx-autoapi==3.0.0, requests==2.31.0) +- To build the docs, open the docs/ folder in a terminal and run "make html" to create the HTML documentation + - If the error "Could not import extension sphinx.builders.linkcheck" occurs, try reinstalling the Python requests package (pip install requests==2.31.0) + - Warnings might occur; they usually do not prevent a successful build of the docs +- To view the docs, open index.html in the docs/build/html folder +- When adding docstrings to the code, they should follow the correct syntax (https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html) in order to be formatted correctly in the generated documentation \ No newline at end of file diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 000000000..dc1312ab0 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 000000000..53667bb5e --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,27 @@ +# Configuration file for the Sphinx documentation builder.
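For quick reference, the reST docstring style that this PR standardises on (and that sphinx-autoapi renders per the tutorial linked in build_docs.md above) looks roughly like the sketch below. The function and its parameters are hypothetical, chosen only to show the `:param:`/`:type:`/`:return:`/`:rtype:` fields:

```python
def by_low_counts(intensity_df, threshold: float = 0.5):
    """
    Filter out proteins whose intensity was measured in fewer samples
    than the given relative threshold. (Hypothetical example function.)

    :param intensity_df: the intensity dataframe in typical protzilla long format
    :type intensity_df: pd.DataFrame
    :param threshold: relative share of samples a protein must be measured in,
        ranging from 0 to 1
    :type threshold: float

    :return: the filtered dataframe and a dict with the discarded protein IDs
    :rtype: tuple[pd.DataFrame, dict]
    """
```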
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "PROTzilla" +copyright = "2023, BP22/23" +author = "BP22/23" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["sphinx.ext.napoleon", "autoapi.extension"] +autoapi_dirs = ["../../"] + +templates_path = ["_templates"] +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "alabaster" +html_static_path = ["_static"] diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 000000000..3140cb153 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,20 @@ +.. PROTzilla documentation master file, created by + sphinx-quickstart on Wed Sep 27 18:24:09 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to PROTzilla's documentation! +===================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/protzilla/constants/location_mapping.py b/protzilla/constants/location_mapping.py index d9c075e66..03ace1f0e 100644 --- a/protzilla/constants/location_mapping.py +++ b/protzilla/constants/location_mapping.py @@ -20,11 +20,9 @@ ) from ..importing import metadata_import, ms_data_import, peptide_import -""" -In this data structure, a method is associated with a location. The location is -determined by the section, step, and method keys found in the workflow_meta -file that correspond to the method. -""" +# In this data structure, a method is associated with a location. The location is +# determined by the section, step, and method keys found in the workflow_meta +# file that correspond to the method. method_map = { ( "importing", @@ -247,11 +245,10 @@ # reversed mapping of method callable and location location_map = {v: k for k, v in method_map.items()} -""" -In this data structure, a plot for a given method is associated with a -location. The location is determined by the section, step, and method keys -found in the workflow_meta file that correspond to the method. -""" + +# In this data structure, a plot for a given method is associated with a +# location. The location is determined by the section, step, and method keys +# found in the workflow_meta file that correspond to the method. plot_map = { ( "data_preprocessing", diff --git a/protzilla/data_analysis/classification.py b/protzilla/data_analysis/classification.py index 3a2ac6772..157331930 100644 --- a/protzilla/data_analysis/classification.py +++ b/protzilla/data_analysis/classification.py @@ -107,33 +107,33 @@ def random_forest( :param metadata_df: A separate dataframe containing additional metadata information. :type metadata_df: pd.DataFrame :param labels_column: The column name in the `metadata_df` dataframe that contains - the target variable (labels) for classification. + the target variable (labels) for classification. 
:type labels_column: str :param train_test_split: The proportion of data to be used for testing. Default is - 0.2 (80-20 train-test split). + 0.2 (80-20 train-test split). :type train_test_split: int, optional :param n_estimators: The number of decision trees to be used in the random forest. :type n_estimators: int, optional :param criterion: The impurity measure used for tree construction. :type criterion: str, optional :param max_depth: The maximum depth of the decision trees. If not specified (None), - the trees will expand until all leaves are pure or contain minimum samples per leaf. + the trees will expand until all leaves are pure or contain minimum samples per leaf. :type max_depth: int or None, optional :param bootstrap: Whether bootstrap samples should be used when building trees. :type bootstrap: bool, optional :param random_state: The random seed for reproducibility. - :type random_state: int, optional + :type random_state: int :param model_selection: The model selection method for hyperparameter tuning. - :type model_selection: str, optional + :type model_selection: str :param validation_strategy: The strategy for model validation. - :type validation_strategy: str, optional + :type validation_strategy: str :param scoring: The scoring metric(s) used to evaluate the model's performance - during validation. - :type scoring: list[str], optional + during validation. + :type scoring: list[str] :param **kwargs: Additional keyword arguments to be passed to the function. :return: A RandomForestClassifier instance, a dataframe consisting of the model's - training parameters and the validation score, along with four dataframes containing - the respective test and training samples and labels. + training parameters and the validation score, along with four dataframes + containing the respective test and training samples and labels. :rtype: dict """ @@ -215,6 +215,49 @@ def svm( scoring: list[str] = ["accuracy"], **kwargs, ): + """ + Perform classification using the support vector machine classifier from sklearn. + + :param input_df: The dataframe that should be classified in wide or long format + :type input_df: pd.DataFrame + :param metadata_df: A separate dataframe containing additional metadata information. + :type metadata_df: pd.DataFrame + :param labels_column: The column name in the `metadata_df` dataframe that contains + the target variable (labels) for classification. + :type labels_column: str + :param C: Regularization parameter + :type C: float + :param kernel: Specifies the kernel type. + :type kernel: str, optional + :param gamma: Kernel coefficient (default: 'scale', relevant for 'rbf', 'poly', and + 'sigmoid'). + :type gamma: str + :param coef0: Independent term in the kernel function (relevant for 'poly' and + 'sigmoid'). + :type coef0: float + :param probability: Whether to enable probability estimates + :type probability: bool, optional + :param tol: Tolerance for stopping criterion + :type tol: float + :param class_weight: Weights associated with classes + :type class_weight: dict or str + :param max_iter: Maximum number of iterations (default: -1, indicating no limit). + :type max_iter: int + :param random_state: The random seed for reproducibility. + :type random_state: int + :param model_selection: The model selection method for hyperparameter tuning. + :type model_selection: str + :param validation_strategy: The strategy for model validation.
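To make the contract these classification docstrings describe concrete, here is a minimal sketch of what such a wrapper around scikit-learn might look like. The helper name, the fixed 80-20 split, and the returned key names are assumptions for illustration, not PROTzilla's actual implementation:

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


def random_forest_sketch(input_df_wide, labels, n_estimators=100, random_state=42):
    # hold out 20% of the samples for testing (the documented default split)
    X_train, X_test, y_train, y_test = train_test_split(
        input_df_wide, labels, test_size=0.2, random_state=random_state
    )
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    # model evaluation dataframe: training parameters plus a validation score
    model_evaluation_df = pd.DataFrame(
        {"n_estimators": [n_estimators], "accuracy": [model.score(X_test, y_test)]}
    )
    return dict(
        model=model,
        model_evaluation_df=model_evaluation_df,
        X_train_df=X_train,
        X_test_df=X_test,
        y_train_df=y_train,
        y_test_df=y_test,
    )
```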
+ :type validation_strategy: str + :param scoring: The scoring metric(s) used to evaluate the model's performance + during validation. + :type scoring: list[str] + :param **kwargs: Additional keyword arguments to be passed to the function. + :return: A dict containing: a SVC instance, a dataframe consisting of the model's + training parameters and the validation score, along with four dataframes + containing the respective test and training samples and labels. + :rtype: dict + """ # TODO 216 add warning to user that data should be to shuffled, give that is being sorted at the beginning! input_df_wide = long_to_wide(input_df) if is_long_format(input_df) else input_df diff --git a/protzilla/data_analysis/clustering.py b/protzilla/data_analysis/clustering.py index f234b664a..acfa64fe7 100644 --- a/protzilla/data_analysis/clustering.py +++ b/protzilla/data_analysis/clustering.py @@ -30,8 +30,8 @@ def k_means( **kwargs, ): """ - A method that uses k-means to partition a number of samples in k clusters. The \ - function returns a dataframe with the corresponding cluster of each sample and \ + A method that uses k-means to partition a number of samples in k clusters. The + function returns a dataframe with the corresponding cluster of each sample and another dataframe with the coordinates of the cluster centers. :param input_df: The dataframe that should be clustered in wide or long format @@ -39,7 +39,7 @@ def k_means( :param metadata_df: A separate dataframe containing additional metadata information. :type metadata_df: pd.DataFrame :param labels_column: The column name in the `metadata_df` dataframe that contains - the true labels of the data + the true labels of the data :type labels_column: str :param positive_label: The positive label for clustering. :type positive_label: str @@ -47,23 +47,23 @@ def k_means( :type model_selection: str :param scoring: The scoring metric(s) used for model evaluation. :type scoring: list[str] - :param n_clusters: the number of clusters to form as well as the number of \ - centroids to generate. + :param n_clusters: the number of clusters to form as well as the number of + centroids to generate. :type n_clusters: int :param random_state: Determines random number generation for centroid initialization :type random_state: int - :param init_centroid_strategy: method for centroid initialization. Possible methods\ - are: k-means++ and random + :param init_centroid_strategy: method for centroid initialization. Possible methods + are: k-means++ and random :type init_centroid_strategy: str - :param n_init: Number of times the k-means algorithm is run with different centroid\ - seeds. + :param n_init: Number of times the k-means algorithm is run with different centroid + seeds. :type n_init: int - :param max_iter: Maximum number of iterations of the k-means algorithm for a single\ - run. + :param max_iter: Maximum number of iterations of the k-means algorithm for a single + run. :type max_iter: int - :param tolerance: Relative tolerance with regards to Frobenius norm of the \ - difference in the cluster centers of two consecutive iterations to declare\ - convergence. + :param tolerance: Relative tolerance with regards to Frobenius norm of the + difference in the cluster centers of two consecutive iterations to declare + convergence. :type tolerance: float :returns: A dictionary containing the following elements: - model: The trained Gaussian Mixture Model. 
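The k-means docstring above maps almost one-to-one onto scikit-learn's `KMeans`. A hedged sketch of that mapping, where the returned key names are assumptions:

```python
import pandas as pd
from sklearn.cluster import KMeans


def k_means_sketch(df_wide, n_clusters=8, init_centroid_strategy="k-means++",
                   n_init=10, max_iter=300, tolerance=1e-4, random_state=6):
    model = KMeans(n_clusters=n_clusters, init=init_centroid_strategy, n_init=n_init,
                   max_iter=max_iter, tol=tolerance, random_state=random_state)
    cluster_labels = model.fit_predict(df_wide)
    # one row per sample with its assigned cluster ...
    cluster_labels_df = pd.DataFrame({"Cluster": cluster_labels}, index=df_wide.index)
    # ... and one row per cluster centre, in feature space
    cluster_centers_df = pd.DataFrame(model.cluster_centers_, columns=df_wide.columns)
    return dict(cluster_labels_df=cluster_labels_df, cluster_centers_df=cluster_centers_df)
```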
@@ -171,9 +171,10 @@ def expectation_maximisation( :param metadata_df: A separate dataframe containing additional metadata information. :type metadata_df: pd.DataFrame :param labels_column: The column name in the `metadata_df` dataframe that contains - the true labels of the data + the true labels of the data :type labels_column: str :param positive_label: The positive label for clustering. + :type positive_label: str :param model_selection: The model selection method for hyperparameter tuning. :type model_selection: str :param scoring: The scoring metric(s) used for model evaluation. @@ -183,17 +184,17 @@ def expectation_maximisation( :param covariance_type: The covariance type for the Gaussian Mixture Model. :type covariance_type: str, optional :param reg_covar: Non-negative regularization added to the diagonal of covariance - matrices. + matrices. :type reg_covar: float :param init_params: The method used to initialize the weights, the means and - the precisions. + the precisions. :type init_params: str :param max_iter: The number of EM iterations to perform. :type max_iter: int, optional :param random_state: The random seed for reproducibility. :type random_state: int :param **kwargs: Additional keyword arguments to be passed to the - `perform_clustering` function. + `perform_clustering` function. :returns: A dictionary containing the following elements: - model: The trained Gaussian Mixture Model. - model_evaluation_df: dataframe consisting of the model's parameters and the @@ -275,9 +276,10 @@ def hierarchical_agglomerative_clustering( :param metadata_df: A separate dataframe containing additional metadata information. :type metadata_df: pd.DataFrame :param labels_column: The column name in the `metadata_df` dataframe that contains - the true labels of the data + the true labels of the data :type labels_column: str :param positive_label: The positive label for clustering. + :type positive_label: str :param model_selection: The model selection method for hyperparameter tuning. :type model_selection: str :param scoring: The scoring metric(s) used for model evaluation. @@ -287,7 +289,7 @@ def hierarchical_agglomerative_clustering( :param metric: Metric used to compute the linkage. :type metric: str :param linkage: Which linkage criterion to use. The linkage criterion determines - which distance to use between sets of observation + which distance to use between sets of observation :type linkage: str :returns: A dictionary containing the following elements: - model: The trained Gaussian Mixture Model. diff --git a/protzilla/data_analysis/differential_expression_anova.py b/protzilla/data_analysis/differential_expression_anova.py index 31c0cd045..54adcceb2 100644 --- a/protzilla/data_analysis/differential_expression_anova.py +++ b/protzilla/data_analysis/differential_expression_anova.py @@ -40,9 +40,9 @@ def anova( :rtype: pandas DataFrame, dict :return: a dataframe in typical protzilla long format - with the differentially expressed proteins and a dict, containing - the corrected p-values and the log2 fold change, the alpha used - and the corrected alpha, as well as filtered out proteins. + with the differentially expressed proteins and a dict, containing + the corrected p-values and the log2 fold change, the alpha used + and the corrected alpha, as well as filtered out proteins. 
""" # Check if the grouping variable is present in the metadata_df assert grouping in metadata_df.columns, f"{grouping} not found in metadata_df" diff --git a/protzilla/data_analysis/differential_expression_linear_model.py b/protzilla/data_analysis/differential_expression_linear_model.py index 1a0b3440c..9988eaed5 100644 --- a/protzilla/data_analysis/differential_expression_linear_model.py +++ b/protzilla/data_analysis/differential_expression_linear_model.py @@ -42,8 +42,8 @@ def linear_model( :type fc_threshold: float :return: a dataframe in typical protzilla long format with the differentially expressed - proteins and a dict, containing the corrected p-values and the log2 fold change (coefficients), the alpha used - and the corrected alpha, as well as filtered out proteins. + proteins and a dict, containing the corrected p-values and the log2 fold change (coefficients), the alpha used + and the corrected alpha, as well as filtered out proteins. :rtype: Tuple[pandas DataFrame, dict] """ assert grouping in metadata_df.columns diff --git a/protzilla/data_analysis/differential_expression_t_test.py b/protzilla/data_analysis/differential_expression_t_test.py index 22e7eb6f1..7fe3ffc96 100644 --- a/protzilla/data_analysis/differential_expression_t_test.py +++ b/protzilla/data_analysis/differential_expression_t_test.py @@ -45,20 +45,25 @@ def t_test( :type multiple_testing_correction_method: str :param alpha: the alpha value for the t-test :type alpha: float + :param fc_threshold: threshold for the abs(log_2(fold_change)) (vertical line in a volcano plot). + Only proteins with a larger abs(log_2(fold_change)) than the fc_threshold are seen as differentially expressed + :type fc-threshold: float + :param log_base: in case the data was previously log transformed this parameter contains the base (e.g. 2 if the data was log_2 transformed). 
:return: a dict containing - a df corrected_p_values, containing the p_values after application of multiple testing correction, - a df log2_fold_change, containing the log2 fold changes per protein, - a float fc_threshold, containing the absolute threshold for the log fold change, above which a protein is considered differentially expressed, - a float corrected_alpha, containing the alpha value after application of multiple testing correction (depending on the selected multiple testing correction method corrected_alpha may be equal to alpha), - a df filtered_proteins, containing the filtered out proteins (proteins where the mean of a group was 0), - a df fold_change_df, containing the fold_changes per protein, - a df t_statistic_df, containing the t-statistic per protein, - a df de_proteins_df in typical protzilla long format containing the differentially expressed proteins; - corrected_p_value, log2_fold_change, fold_change and t_statistic per protein, - a df significant_proteins_df, containing the proteins where the p-values are smaller than alpha (if fc_threshold = 0, the significant proteins equal the differentially expressed ones) - corrected_p_value, log2_fold_change, fold_change and t_statistic per protein, - + a df corrected_p_values, containing the p_values after application of multiple testing correction, + a df log2_fold_change, containing the log2 fold changes per protein, + a float fc_threshold, containing the absolute threshold for the log fold change, above which a protein is considered differentially expressed, + a float corrected_alpha, containing the alpha value after application of multiple testing correction (depending on the selected multiple testing correction method corrected_alpha may be equal to alpha), + a df filtered_proteins, containing the filtered out proteins (proteins where the mean of a group was 0), + a df fold_change_df, containing the fold_changes per protein, + a df t_statistic_df, containing the t-statistic per protein, + a df de_proteins_df in typical protzilla long format containing the differentially expressed proteins; + corrected_p_value, log2_fold_change, fold_change and t_statistic per protein, + a df significant_proteins_df, containing the proteins where the p-values are smaller than alpha (if fc_threshold = 0, the significant proteins equal the differentially expressed ones) + corrected_p_value, log2_fold_change, fold_change and t_statistic per protein, :rtype: dict """ assert grouping in metadata_df.columns diff --git a/protzilla/data_analysis/dimension_reduction.py b/protzilla/data_analysis/dimension_reduction.py index d38eab6bc..6ec487c65 100644 --- a/protzilla/data_analysis/dimension_reduction.py +++ b/protzilla/data_analysis/dimension_reduction.py @@ -16,10 +16,10 @@ def t_sne( method: str = "barnes_hut", ): """ - A function that uses t-SNE to reduce the dimension of a dataframe and returns a \ + A function that uses t-SNE to reduce the dimension of a dataframe and returns a dataframe in wide format with the entered number of components. - Please note that this function is a simplified version of t-SNE, and it only \ - enables you to adjust the most significant parameters that affect the output. \ + Please note that this function is a simplified version of t-SNE, and it only + enables you to adjust the most significant parameters that affect the output.
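A sketch of what such a simplified t-SNE wrapper boils down to with scikit-learn; the "embedded_data" key and Component column names follow the return description documented further below, everything else is an assumption (note that newer scikit-learn versions rename n_iter to max_iter):

```python
import pandas as pd
from sklearn.manifold import TSNE


def t_sne_sketch(df_wide, n_components=2, perplexity=30.0, metric="euclidean",
                 random_state=42, n_iter=1000, method="barnes_hut"):
    embedding = TSNE(
        n_components=n_components, perplexity=perplexity, metric=metric,
        random_state=random_state, n_iter=n_iter, method=method,
    ).fit_transform(df_wide)
    # one column per embedded component, same index as the input dataframe
    columns = [f"Component{i + 1}" for i in range(n_components)]
    return dict(embedded_data=pd.DataFrame(embedding, index=df_wide.index, columns=columns))
```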
You can find the default values for the non-adjustable parameters here: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html @@ -29,22 +29,27 @@ def t_sne( :type n_components: int :param perplexity: the perplexity is related to the number of nearest neighbors :type perplexity: float - :param metric: The metric to use when calculating distance between instances in a \ - feature array. Possible metrics are: euclidean, manhattan, cosine and haversine + :param metric: The metric to use when calculating distance between instances in a + feature array. Possible metrics are: euclidean, manhattan, cosine and haversine :type metric: str :param random_state: determines the random number generator. :type random_state: int :param n_iter: maximum number of iterations for the optimization :type n_iter: int - :param n_iter_without_progress: Maximum number of iterations without progress \ - before we abort the optimization, used after 250 initial iterations with early \ - exaggeration. Note that progress is only checked every 50 iterations so this \ - value is rounded to the next multiple of 50. + :param n_iter_without_progress: Maximum number of iterations without progress + before we abort the optimization, used after 250 initial iterations with early + exaggeration. Note that progress is only checked every 50 iterations so this + value is rounded to the next multiple of 50. :type n_iter_without_progress: int - :param method: the method exact will run on the slower, but exact, algorithm in \ - O(N^2) time. However, the exact method cannot scale to millions of examples. \ - Barnes-Hut approximation will run faster, but not exact, in O(NlogN) time. + :param method: the method 'exact' will run on the slower, but exact, algorithm in + O(N^2) time. However, the 'exact' method cannot scale to millions of examples. + Barnes-Hut approximation will run faster, but not exact, in O(NlogN) time. :type method: str + :return: a dictionary with a single key, "embedded_data", which contains a new + DataFrame in wide format. This DataFrame consists of the t-SNE embedded data + with two columns, "Component1" and "Component2", and shares the same index as + the input_df. + :rtype: dict """ intensity_df_wide = long_to_wide(input_df) if is_long_format(input_df) else input_df try: @@ -108,10 +113,10 @@ def umap( transform_seed: int = 42, ): """ - A function that uses UMAP to reduce the dimension of a dataframe and returns a \ + A function that uses UMAP to reduce the dimension of a dataframe and returns a dataframe in wide format with the entered number of components. - Please note that this function is a simplified version of UMAP, and it only \ - enables you to adjust the most significant parameters that affect the output. \ + Please note that this function is a simplified version of UMAP, and it only + enables you to adjust the most significant parameters that affect the output. You can find the default values for the non-adjustable parameters here: https://umap-learn.readthedocs.io/en/latest/api.html @@ -119,19 +124,27 @@ def umap( :type input_df: pd.DataFrame :param n_components: The dimension of the space to embed into. :type n_components: int - :param n_neighbors: The size of local neighborhood in terms of number of \ - neighboring sample points + :param n_neighbors: The size of local neighborhood in terms of number of + neighboring sample points :type n_neighbors: float - :param min_dist: the effective minimum distance between embedded points. 
Smaller \ - values will result in a more clustered/clumped embedding where nearby points on \ - the manifold are drawn closer together, while larger values will result on a more \ - even dispersal of points. + :param min_dist: the effective minimum distance between embedded points. Smaller + values will result in a more clustered/clumped embedding where nearby points on + the manifold are drawn closer together, while larger values will result in a more + even dispersal of points. :type min_dist: float - :param metric: The metric to use when calculating distance between instances in a \ - feature array. + :param metric: The metric to use when calculating distance between instances in a + feature array. :type metric: str :param random_state: determines the random number generator. :type random_state: int + :param transform_seed: Random seed used for the stochastic aspects of the transform + operation. + :type transform_seed: int + :return: a dictionary with a single key, "embedded_data", which contains a new + DataFrame in wide format. This DataFrame consists of the UMAP embedded data + with two columns, "Component1" and "Component2", and shares the same index as + the input_df. + :rtype: dict """ # umap import is slow, so it should only get imported when needed diff --git a/protzilla/data_analysis/model_evaluation.py b/protzilla/data_analysis/model_evaluation.py index 0fba7d783..5a1f045c5 100644 --- a/protzilla/data_analysis/model_evaluation.py +++ b/protzilla/data_analysis/model_evaluation.py @@ -18,7 +18,7 @@ def evaluate_classification_model(model, input_test_df, labels_test_df, scoring) :param labels_test_df: The true labels of the testing data as a DataFrame. :type labels_test_df: pd.DataFrame :param scoring: The scoring metric to be used for evaluation. It can be a string - representing a predefined metric e.g. accuracy, precision, recall, matthews_corrcoef + representing a predefined metric e.g. accuracy, precision, recall, matthews_corrcoef :type scoring: str or callable :return: A dataframe with the metric name and its corresponding score. :rtype: dict diff --git a/protzilla/data_analysis/model_evaluation_plots.py b/protzilla/data_analysis/model_evaluation_plots.py index f1167b210..2d2079d92 100644 --- a/protzilla/data_analysis/model_evaluation_plots.py +++ b/protzilla/data_analysis/model_evaluation_plots.py @@ -8,6 +8,7 @@ def precision_recall_curve_plot(model, input_test_df, labels_test_df, title=None): """ Calculate and plot the precision-recall curve for a classification model. + :param model: The trained classification model instance to be evaluated. :type model: BaseEstimator :param input_test_df: The input features of the testing data as a DataFrame. :type input_test_df: pd.DataFrame :param labels_test_df: The true labels of the testing data as a DataFrame. :type labels_test_df: pd.DataFrame :param title: The title of the precision-recall curve plot. This is an optional - parameter. + parameter. :type title: str, optional :return: Base64 encoded image of the plot :rtype: bytes @@ -34,6 +35,7 @@ def precision_recall_curve_plot(model, input_test_df, labels_test_df, title=None def roc_curve_plot(model, input_test_df, labels_test_df, title=None): """ Calculate and plot the roc curve for a classification model. + :param model: The trained classification model instance to be evaluated. :type model: BaseEstimator :param input_test_df: The input features of the testing data as a DataFrame.
@@ -41,7 +43,7 @@ def roc_curve_plot(model, input_test_df, labels_test_df, title=None): :param labels_test_df: The true labels of the testing data as a DataFrame. :type labels_test_df: pd.DataFrame :param title: The title of the precision-recall curve plot. This is an optional - parameter. + parameter. :type title: str, optional :return: Base64 encoded image of the plot :rtype: bytes diff --git a/protzilla/data_analysis/plots.py b/protzilla/data_analysis/plots.py index c158a3fbf..00facc486 100644 --- a/protzilla/data_analysis/plots.py +++ b/protzilla/data_analysis/plots.py @@ -13,14 +13,17 @@ def scatter_plot( color_df: pd.DataFrame | None = None, ): """ - Function to create a scatter plot from data. - - :param input_df: the dataframe that should be plotted. It should have either 2 \ - or 3 dimension + Function to create a scatter plot from data. + + :param input_df: the dataframe that should be plotted. It should have either 2 + or 3 dimensions :type input_df: pd.Dataframe - :param color_df: the Dataframe with one column according to which the marks should \ - be colored. This is an optional parameter + :param color_df: the Dataframe with one column according to which the marks should + be colored. This is an optional parameter :type color_df: pd.Dataframe + + :return: returns a list with a plotly figure or a list with a dictionary if an error occurs + :rtype: list[plotly figure]/dict """ intensity_df_wide = long_to_wide(input_df) if is_long_format(input_df) else input_df try: @@ -83,6 +86,9 @@ def create_volcano_plot( :type alpha: float :param proteins_of_interest: the proteins that should be annotated in the plot :type proteins_of_interest: list or None + + :return: returns a list with a plotly figure + :rtype: [plotly figure] """ plot_df = p_values.join(log2_fc.set_index("Protein ID"), on="Protein ID") @@ -149,11 +155,6 @@ def clustergram_plot( input_df: pd.DataFrame, sample_group_df: pd.DataFrame | None, flip_axes: str ): """ - - :param grouping: the column name of the grouping variable in the - metadata_df - :type grouping: str - Creates a clustergram plot from a dataframe in protzilla wide format. The rows or columns of the clustergram are ordered according to the clustering resulting from the dendrogram. Optionally, a colorbar representing the different groups present @@ -170,8 +171,9 @@ def clustergram_plot( :param flip_axes: If "yes", the rows and columns of the clustergram will be swapped. If "no", the default orientation is used. :type flip_axes: str - :return: returns a list with a figure or a list with a dictionary if an error occurs - :rtype: [go.Figure] + + :return: returns a list with a plotly figure or a list with a dictionary if an error occurs + :rtype: list[plotly figure]/dict """ try: assert isinstance(input_df, pd.DataFrame) and not input_df.empty @@ -188,7 +190,7 @@ def clustergram_plot( ) # In the clustergram each row represents a sample that can pertain to a # group. In the following code the necessary data structures are created - # to assign eachgroup to a unique color. + # to assign each group to a unique color. 
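The 2D/3D branching that such a scatter plot function needs can be sketched with plotly express as below; the column handling and the error-dict shape are assumptions, not the module's actual code:

```python
import plotly.express as px


def scatter_plot_sketch(df_wide, color_df=None):
    # the input must have exactly 2 or 3 feature columns, as documented above
    color = color_df.iloc[:, 0] if color_df is not None else None
    dimensions = df_wide.shape[1]
    if dimensions == 2:
        x_name, y_name = df_wide.columns
        fig = px.scatter(df_wide, x=x_name, y=y_name, color=color)
    elif dimensions == 3:
        x_name, y_name, z_name = df_wide.columns
        fig = px.scatter_3d(df_wide, x=x_name, y=y_name, z=z_name, color=color)
    else:
        return [dict(level="ERROR", msg="input_df must have 2 or 3 dimensions")]
    return [fig]
```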
sample_group_dict = dict( zip(sample_group_df.index, sample_group_df[sample_group_df.columns[0]]) ) diff --git a/protzilla/data_analysis/protein_graphs.py b/protzilla/data_analysis/protein_graphs.py index 5aefe71f0..999a0dc93 100644 --- a/protzilla/data_analysis/protein_graphs.py +++ b/protzilla/data_analysis/protein_graphs.py @@ -129,8 +129,8 @@ def peptides_to_isoform( :type allowed_mismatches: int :return: dict of path to graph - either the modified graph or the original graph if - the modification failed, the protein id, list of matched peptides, list of unmatched - peptides, messages passed to the frontend + the modification failed, the protein id, list of matched peptides, list of unmatched + peptides, messages passed to the frontend :rtype: dict[str, str, list, list, list] """ @@ -224,11 +224,69 @@ ) +def _create_protein_variation_graph(protein_id: str, run_name: str) -> dict: + """ + Creates a Protein-Variation-Graph for a given UniProt Protein ID using ProtGraph. + Included features are just `Variation`; digestion is skipped. + The graph is saved in .graphml format. + + This is designed so that it can be used by peptides_to_isoform, but it works + independently as well. + + ProtGraph: https://github.com/mpc-bioinformatics/ProtGraph/ + + :param protein_id: UniProt Protein-ID + :type protein_id: str + :param run_name: name of the run this is executed from. Used for saving the protein + file, graph + :type run_name: str + :param queue_size: queue size for ProtGraph; this is yet to be merged into ProtGraph + :type queue_size: int + + :return: dict(graph_path, filtered_blocks, messages) + """ + + logger.info(f"Creating graph for protein {protein_id}") + run_path = RUNS_PATH / run_name + path_to_protein_file, filtered_blocks, request = _get_protein_file( + protein_id, run_path + ) + + path_to_protein_file = Path(path_to_protein_file) + if not path_to_protein_file.exists() and request.status_code != 200: + msg = f"error while downloading protein file for {protein_id}. Statuscode:{request.status_code}, {request.reason}. Got: {request.text}. Tip: check if the ID is correct" + logger.error(msg) + return dict( + graph_path=None, + filtered_blocks=filtered_blocks, + messages=[dict(level=messages.ERROR, msg=msg, trace=request.__dict__)], + ) + + output_folder_path = run_path / "graphs" + output_csv = output_folder_path / f"{protein_id}.csv" + graph_path = output_folder_path / f"{protein_id}.graphml" + cmd_str = f"protgraph -egraphml {path_to_protein_file} \ + --export_output_folder={output_folder_path} \ + --output_csv={output_csv} \ + -ft VARIANT \ + -d skip" + + subprocess.run(cmd_str, shell=True) + + msg = f"Graph created for protein {protein_id} at {graph_path} using {path_to_protein_file}" + logger.info(msg) + return dict( + graph_path=str(graph_path), + filtered_blocks=filtered_blocks, + messages=[dict(level=messages.INFO, msg=msg)], + ) + + def _create_graph_index( protein_graph: nx.DiGraph, seq_len: int ) -> tuple[list | None, str, dict | None]: """ - create a mapping from the position in the protein (using the longest path) to + Create a mapping from the position in the protein (using the longest path) to node(s) in the graph For information about _longest_path() please see the docstring of that function.
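The longest-path mapping that the following _longest_paths hunk documents can be computed in topological order over the DAG. A minimal, self-contained sketch with networkx; the "aminoacid" node attribute mirrors the docstring's example, but the helper itself is an assumption, not the module's code:

```python
import networkx as nx


def longest_paths_sketch(protein_graph: nx.DiGraph, start_node: str) -> dict:
    # distance = number of amino acids on the longest path from start_node to
    # each node; visiting nodes in topological order guarantees all
    # predecessors are final before a node's successors are relaxed
    assert protein_graph.in_degree(start_node) == 0  # assumed single source
    distances = {node: 0 for node in protein_graph.nodes}
    for node in nx.topological_sort(protein_graph):
        node_len = len(protein_graph.nodes[node].get("aminoacid", ""))
        for successor in protein_graph.successors(node):
            distances[successor] = max(distances[successor], distances[node] + node_len)
    return distances
```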
@@ -314,7 +372,7 @@ def _longest_paths(protein_graph: nx.DiGraph, start_node: str): n4 longest_paths: {n1: 0, n2: 3, n3: 5, n4: 5, n5: 6, __end__: 8} - :param protein_graph: Protein-Graph as created by ProtGraph \ + :param protein_graph: Protein-Graph as created by ProtGraph (-> _create_protein_variation_graph) :type protein_graph: nx.DiGraph :param start_node: Source of protein_graph @@ -442,13 +500,14 @@ def _create_reference_sequence_index( protein_path: str, k: int = 5 ) -> tuple[dict, str, int]: """ - Create mapping from kmer of reference_sequence of protein to starting position(s) \ + Create mapping from kmer of reference_sequence of protein to starting position(s) of kmer in reference_sequence :param protein_path: Path to protein file from UniProt (.txt) :type protein_path: str :param k: length of kmers :type k: int + :return: index {kmer: [starting positions]}, reference sequence, length of reference sequence :rtype: tuple(dict, str, int) @@ -554,8 +613,9 @@ def _potential_peptide_matches( :type peptides: list :param ref_index: mapping from kmer to match-positions on reference sequence :type ref_index: dict(kmer: [starting position]} + :return: dict(peptide: [match start on reference sequence]), - list(peptides without match) + list(peptides without match) :rtype: dict, list """ @@ -604,11 +664,11 @@ def _create_contigs_dict(node_start_end: dict): peptide(s) that is responsible for the match. :param node_start_end: dict of peptide to dict of start index of peptide match to - dict of node to tuple of start and end positions of matches within the node + dict of node to tuple of start and end positions of matches within the node :type node_start_end: dict[str, dict[int, dict[str, tuple[int, int]]]] :return: dict of node to list of triple of start position, end position and - peptide(s) responsible for match + peptide(s) responsible for match """ node_match_data = defaultdict(lambda: {"match_locations": []}) @@ -666,22 +726,22 @@ def _match_potential_matches( :param potential_peptide_matches: dict of peptide to list of starting positions :type potential_peptide_matches: dict[str, list[int]] :param graph_index: list of lists, each list contains the nodes and AAs at that - given index along the longest path through the graph + given index along the longest path through the graph :type graph_index: list[list[tuple[str, str]]] :param peptide_mismatches: list of peptides that did not match to the reference - sequence + sequence :type peptide_mismatches: list[str] :param allowed_mismatches: number of mismatches allowed for a peptide to be - considered a match + considered a match :type allowed_mismatches: int :param graph: protein variation graph, as created by ProtGraph - (-> _create_protein_variation_graph) + (-> _create_protein_variation_graph) :type graph: networkx.DiGraph :param longest_paths: length of longest path through the graph to each node :type longest_paths: dict[str, int] :return: dict of peptide to dict of start index of peptide match to dict of node to - tuple of start and end position of match in this node + tuple of start and end position of match in this node :rtype: dict[str, dict[int, dict[str, tuple[int, int]]]] """ @@ -713,7 +773,7 @@ def _match_on_graph( :param allowed_mismatches: number of mismatches allowed per start position :type allowed_mismatches: int :param graph: protein variation graph, as created by ProtGraph - (-> _create_protein_variation_graph) + (-> _create_protein_variation_graph) :type graph: networkx.DiGraph :param current_node: current node in the 
graph, starting with the node of the match start @@ -721,12 +781,13 @@ def _match_on_graph( :param left_over_peptide: peptide that still needs to be matched to the graph :type left_over_peptide: str :param node_match_data: dict of node to tuple of start position, end position + :type node_match_data: dict :param current_index: index of the amino acid in the current node that is being - matched to the peptide + matched to the peptide :type current_index: int :return: tuple of bool, dict of node to tuple of start position, end position, - number of mismatches + number of mismatches :rtype: tuple[bool, dict[str, tuple[int, int]], int] """ @@ -829,12 +890,16 @@ def _modify_graph(graph, contig_positions): will end up without `match`-attribute. :param graph: Protein Graph to be modified - :type: nx.DiGraph + :type graph: nx.DiGraph :param contig_positions: Dict from current_node to contig-positions {current_node: [(start, end)]}. - :type: dict(list[tuple]) + :type contig_positions: dict(list[tuple]) + :param longest_paths: mapping from current_node to the longest path to current_node + (-> _longest_paths()) + :type longest_paths: dict + :return: modified protein graph, with contigs & not-matched AAs as nodes, indicated - by current_node attribute `matched` + by current_node attribute `matched` """ def _node_length(node): diff --git a/protzilla/data_integration/__init__.py b/protzilla/data_integration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/protzilla/data_integration/database_download.py b/protzilla/data_integration/database_download.py index f8ee4830e..147c09b84 100644 --- a/protzilla/data_integration/database_download.py +++ b/protzilla/data_integration/database_download.py @@ -33,10 +33,15 @@ def get_batch(batch_url, session): def download_uniprot_paged(name): """ - downloads basic info on all human proteins from the uniprot paged rest api. + Downloads basic info on all human proteins from the uniprot paged rest api. this will take very long due to limitations in the api, therefore stream should be used. code taken from https://www.uniprot.org/help/api_queries including get_next_link and get_batch - parameter name: str = name the database will be saved as + + :param name: name the database will be saved as + :type name: str + + :return: the number of proteins that were downloaded + :rtype: int """ retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) @@ -60,9 +65,14 @@ def download_uniprot_paged(name): def download_uniprot_stream(name): """ - downloads basic info on all human proteins from the streamed uniprot rest api. + Downloads basic info on all human proteins from the streamed uniprot rest api. can fail due to unstable internet connection or problems with the api. - parameter name: str = name the database will be saved as + + :param name: name the database will be saved as + :type name: str + + :return: nothing + :rtype: NoneType """ with requests.get( "https://rest.uniprot.org/uniprotkb/stream", diff --git a/protzilla/data_integration/database_integration.py b/protzilla/data_integration/database_integration.py index f461b2e0d..5ff2104ee 100644 --- a/protzilla/data_integration/database_integration.py +++ b/protzilla/data_integration/database_integration.py @@ -6,6 +6,19 @@ def add_uniprot_data(dataframe, database_name=None, fields=None): + """ + Extend a protein dataframe with information from UniProt for each protein. 
+ + :param dataframe: the protein dataframe to be extended + :type dataframe: pd.DataFrame + :param database_name: name of the database file that will be queried + :type database_name: str + :param fields: the fields of the database that will be added to the dataframe + :type fields: list[str] + + :return: the extended dataframe, and a message if applicable + :rtype: dict + """ if not fields: msg = "No fields that should be added specified." return dict( @@ -68,6 +81,22 @@ def gene_mapping(dataframe, database_names, use_biomart=False): + """ + Maps the protein ID groups to HGNC gene symbols, filtering out ones that are not + found. + + :param dataframe: the protein dataframe of which the protein ID groups will be + mapped. + :type dataframe: pd.DataFrame + :param database_names: names of the database files that will be queried + :type database_names: list[str] | str + :param use_biomart: should biomart be used to map ids that could not be mapped with + databases + :type use_biomart: bool + + :return: the gene mapping, consisting of group_to_genes, gene_to_groups and filtered + :rtype: dict + """ try: groups = dataframe["Protein ID"].unique().tolist() except KeyError: diff --git a/protzilla/data_integration/database_query.py b/protzilla/data_integration/database_query.py index 5f34f638e..2758505f4 100644 --- a/protzilla/data_integration/database_query.py +++ b/protzilla/data_integration/database_query.py @@ -10,6 +10,19 @@ def biomart_query(queries, filter_name, attributes, use_grch37=False): + """ + Construct an XML query for BioMart, send it, decode the result and return it as an + iterator. + + :param queries: what entities to look for with the filter + :type queries: list[str] + :param filter_name: the name of the BioMart category the queries will be searched in + :type filter_name: str + :param attributes: what BioMart categories to return for each found entity + :type attributes: Iterable[str] + :param use_grch37: if truthy, use the outdated GRCh37 biomart endpoint + :type use_grch37: bool + """ if not queries: return @@ -86,12 +99,17 @@ def uniprot_to_genes(uniprot_ids, databases, use_biomart): First uses all uniprot databases that contain genes, then uses biomart to map proteins that have not been found with uniprot if biomart is enabled. - :param uniprot_ids: cleaned uniprot IDs, not containing isoforms or other modifications + :param uniprot_ids: cleaned uniprot IDs, not containing isoforms or other + modifications :type uniprot_ids: list[str] :param databases: names of uniprot databases that should be used for mapping :type databases: list[str] - :param use_biomart: should biomart be used to map ids that could not be mapped with databases - :return: a dict that maps uniprot ids to genes and a list of uniprot ids that were not found + :param use_biomart: if true, biomart should be used to map ids that could not be + mapped with databases + :type use_biomart: bool + + :return: a dict that maps uniprot ids to genes and a list of uniprot ids that were + not found :rtype: tuple[dict[str, str], list[str]] """ @@ -125,7 +143,7 @@ def merge_dict(gene_mapping, new_gene_mapping): if not ids_to_search: logger.info( - "All proteins mapped using uniprot, no biomart mapping will be performed." + "All proteins mapped using uniprot, no biomart mapping will be performed."
# noqa E501 ) return out_dict, [] if not use_biomart: @@ -151,6 +169,24 @@ def merge_dict(gene_mapping, new_gene_mapping): def uniprot_groups_to_genes(uniprot_groups, databases, use_biomart): + """ + Maps uniprot ID groups to hgnc gene symbols. Also returns groups that could not be + mapped. Merges the mappings per group and creates a reverse mapping, from genes to + groups. + + :param uniprot_groups: groups of UniProt IDs, as found in a protein dataframe, may + contain isoforms and modifications + :type uniprot_groups: list[str] + :param databases: names of uniprot databases that should be used for mapping + :type databases: list[str] + :param use_biomart: should biomart be used to map ids that could not be mapped with + databases + :type use_biomart: bool + + :return: a dict that maps genes to groups, one that maps groups to genes and a list + of uniprot ids that were not found + :rtype: tuple[dict[str, list[str]], dict[str, list[str]], list[str]] + """ proteins = set() for group in uniprot_groups: for protein in group.split(";"): diff --git a/protzilla/data_integration/di_plots.py b/protzilla/data_integration/di_plots.py index f8ae20bf7..c253d604d 100644 --- a/protzilla/data_integration/di_plots.py +++ b/protzilla/data_integration/di_plots.py @@ -42,6 +42,7 @@ def GO_enrichment_bar_plot( :type colors: list, optional :param figsize: Size of the plot, defaults to None and is calculated dynamically if not provided. :type figsize: tuple, optional + :return: Base64 encoded image of the plot :rtype: bytes """ @@ -162,6 +163,7 @@ def GO_enrichment_dot_plot( :type dot_size: int :param figsize: Size of the plot, defaults to None and is calculated dynamically if not provided. :type figsize: tuple, optional + :return: Base64 encoded image of the plot :rtype: bytes """ @@ -276,6 +278,7 @@ def gsea_dot_plot( :type remove_library_names: bool :param figsize: Size of the plot, defaults to None and is calculated dynamically if not provided. :type figsize: tuple, optional + :return: Base64 encoded image of the plot :rtype: bytes """ @@ -350,6 +353,7 @@ def gsea_enrichment_plot( :type neg_pheno_label: str, optional :param figsize: Size of the plot, defaults to None and is calculated dynamically if not provided. :type figsize: tuple, optional + :return: Base64 encoded image of the plot :rtype: bytes """ diff --git a/protzilla/data_integration/enrichment_analysis.py b/protzilla/data_integration/enrichment_analysis.py index 2671db6e0..f4e336348 100644 --- a/protzilla/data_integration/enrichment_analysis.py +++ b/protzilla/data_integration/enrichment_analysis.py @@ -33,10 +33,12 @@ def get_functional_enrichment_with_delay(protein_list, **string_params): This method performs online functional enrichment analysis using the STRING DB API via the restring package. It adds a delay between calls to the API to avoid exceeding the rate limit. 
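A throttling pattern like the one described here can be as simple as remembering the time of the last call; a hedged sketch (the one-second minimum interval is an assumption, not STRING's documented limit, and this helper is not the package's actual code):

```python
import time

MIN_WAIT_SECONDS = 1.0  # assumed minimum interval between API calls
_last_call_time = 0.0


def call_with_delay(func, *args, **kwargs):
    # sleep just long enough that consecutive calls are MIN_WAIT_SECONDS apart,
    # then forward the call unchanged
    global _last_call_time
    elapsed = time.monotonic() - _last_call_time
    if elapsed < MIN_WAIT_SECONDS:
        time.sleep(MIN_WAIT_SECONDS - elapsed)
    _last_call_time = time.monotonic()
    return func(*args, **kwargs)
```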
+ :param protein_list: list of protein IDs to perform enrichment analysis for :type protein_list: list :param string_params: parameters for the restring package :type string_params: dict + :return: dataframe with functional enrichment results :rtype: pandas.DataFrame """ @@ -62,6 +64,7 @@ def merge_up_down_regulated_dfs_restring(up_df, down_df): :type up_df: pandas.DataFrame :param down_df: dataframe with enrichment results for downregulated proteins :type down_df: pandas.DataFrame + :return: merged dataframe :rtype: pandas.DataFrame """ @@ -146,6 +149,7 @@ def GO_analysis_with_STRING( - both: functional enrichment info is retrieved for upregulated and downregulated proteins separately, but the terms are aggregated for the result dataframe :type direction: str + :return: dictionary with enrichment dataframe :rtype: dict """ @@ -306,6 +310,7 @@ def merge_up_down_regulated_dfs_gseapy(up_enriched, down_enriched): :type up_enriched: pandas.DataFrame :param down_enriched: dataframe with enrichment results for downregulated proteins :type down_enriched: pandas.DataFrame + :return: merged dataframe :rtype: pandas.DataFrame """ @@ -368,6 +373,7 @@ def gseapy_enrichment( :type background: list or None :param offline: whether to run the enrichment offline :type offline: bool + :return: enrichment results, filtered groups, error message if occurred {level, msg, trace(optional)} :rtype: tuple[pandas.DataFrame, list, dict] """ @@ -502,6 +508,7 @@ def GO_analysis_with_Enrichr( :type background_number: int or None :param background_biomart: name of biomart dataset to use as background :type background_biomart: str or None + :return: dictionary with results and filtered groups :rtype: dict """ @@ -706,6 +713,7 @@ def GO_analysis_offline( - both: functional enrichment info is retrieved for upregulated and downregulated proteins separately, but the terms are aggregated for the resulting dataframe :type direction: str + :return: dictionary with results dataframe :rtype: dict """ diff --git a/protzilla/data_integration/enrichment_analysis_gsea.py b/protzilla/data_integration/enrichment_analysis_gsea.py index 73be1e2a8..62cdb2ada 100644 --- a/protzilla/data_integration/enrichment_analysis_gsea.py +++ b/protzilla/data_integration/enrichment_analysis_gsea.py @@ -35,6 +35,7 @@ def create_ranked_df( :type group_to_genes: dict :param filtered_groups: list of protein groups that were filtered out :type filtered_groups: list + :return: ranked dataframe of genes :rtype: pd.DataFrame """ @@ -133,6 +134,7 @@ def gsea_preranked( :type seed: int :param threads: Number of threads :type threads: int + :return: dictionary with results dataframe, ranking, enrichment detail dataframe per enriched gene set and messages :rtype: dict """ @@ -241,6 +243,7 @@ def create_genes_intensity_wide_df( :type group_to_genes: dict :param filtered_groups: list of protein IDs that could not be mapped to gene symbols :type filtered_groups: list + :return: dataframe with genes in rows and samples in columns with intensity values :rtype: pd.DataFrame """ @@ -348,6 +351,7 @@ def gsea( :type seed: int :param threads: Number of threads to use :type threads: int + :return: dict with enriched dataframe, ranking, enrichment detail dataframe per enriched gene set and messages :rtype: dict """ diff --git a/protzilla/data_integration/enrichment_analysis_helper.py b/protzilla/data_integration/enrichment_analysis_helper.py index fb40b72a9..8919332c2 100644 --- a/protzilla/data_integration/enrichment_analysis_helper.py +++ 
b/protzilla/data_integration/enrichment_analysis_helper.py @@ -25,8 +25,10 @@ def read_protein_or_gene_sets_file(path): - .json: {Set_name: [Protein1, Protein2, ...], Set_name2: [Protein2, Protein3, ...]} Empty strings are removed from the list of proteins or genes. + :param path: path to file :type path: str + :return: dict with protein or gene sets, a path to a gmt file or error message :rtype: dict """ @@ -74,8 +76,10 @@ def read_background_file(path): Reads a file of background proteins or genes. Accepts .csv and .txt files with one protein or gene per line. Empty strings are removed from the list of proteins or genes. + :param path: path to file :type path: str or None + :return: list of background proteins or genes or error message :rtype: list """ @@ -110,6 +114,7 @@ def map_to_STRING_ids(proteins_list, organism): :type proteins_list: list :param organism: organism NCBI identifier :type organism: str + :return: list of STRING IDs or None if no IDs could be found :rtype: list or None """ diff --git a/protzilla/data_preprocessing/filter_proteins.py b/protzilla/data_preprocessing/filter_proteins.py index b1b047832..eea81e6a8 100644 --- a/protzilla/data_preprocessing/filter_proteins.py +++ b/protzilla/data_preprocessing/filter_proteins.py @@ -7,15 +7,17 @@ def by_samples_missing(intensity_df, percentage): """ This function filters proteins based on its amount of nan values. If the percentage of existing values is below a threshold (percentage), the protein is filtered out. + :param df: the intensity dataframe that should be filtered + in long format :type df: pd.DataFrame :param percentage: float ranging from 0 to 1. Defining the - relative share of samples the proteins should be present in inorder to be kept.\ + relative share of samples in which a protein must be present in order to be kept. :type percentage: float + :return: returns the filtered df as a Dataframe and a dict with a list of Protein IDs + that were discarded and a list of Protein IDs + that were kept :rtype: Tuple[pandas DataFrame, dict] """ diff --git a/protzilla/data_preprocessing/filter_samples.py b/protzilla/data_preprocessing/filter_samples.py index fbcdd50bc..6e518af8a 100644 --- a/protzilla/data_preprocessing/filter_samples.py +++ b/protzilla/data_preprocessing/filter_samples.py @@ -48,15 +48,17 @@ def by_proteins_missing(intensity_df: pd.DataFrame, percentage): """ This function filters samples based on the amount of nan values. If the percentage of existing values is below a threshold (percentage), the sample is filtered out. + :param df: the intensity dataframe that should be filtered + in long format :type df: pd.DataFrame :param percentage: float ranging from 0 to 1. Defining the - relative share of proteins that were detected in the sample in inorder to be kept.\ + relative share of proteins that must be detected in a sample in order for it to be kept.
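The core of such a missing-value filter is a per-protein share of non-NaN samples. A hedged pandas sketch follows; the "Sample"/"Protein ID" column names follow PROTzilla's long format and the intensity column is taken from position 3 as in the code below, but the helper itself and its return keys are illustrative only:

```python
import pandas as pd


def by_samples_missing_sketch(intensity_df: pd.DataFrame, percentage: float):
    intensity_name = intensity_df.columns[3]  # the intensity column in long format
    # assumes one row per (Sample, Protein ID) pair
    wide = intensity_df.pivot(index="Sample", columns="Protein ID", values=intensity_name)
    share_present = wide.notna().mean(axis=0)  # per protein: fraction of samples with a value
    kept = share_present[share_present >= percentage].index.tolist()
    discarded = share_present[share_present < percentage].index.tolist()
    filtered_df = intensity_df[intensity_df["Protein ID"].isin(kept)]
    return filtered_df, dict(remaining_proteins=kept, filtered_proteins=discarded)
```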
diff --git a/protzilla/data_preprocessing/filter_samples.py b/protzilla/data_preprocessing/filter_samples.py index fbcdd50bc..6e518af8a 100644 --- a/protzilla/data_preprocessing/filter_samples.py +++ b/protzilla/data_preprocessing/filter_samples.py
@@ -48,15 +48,17 @@ def by_proteins_missing(intensity_df: pd.DataFrame, percentage): """ This function filters samples based on the amount of nan values. If the percentage of existing values is below a threshold (percentage), the sample is filtered out. - :param df: the intensity dataframe that should be filtered\ - in long format + + :param intensity_df: the intensity dataframe that should be filtered + in long format :type intensity_df: pd.DataFrame - :param percentage: float ranging from 0 to 1. Defining the\ - relative share of proteins that were detected in the sample in inorder to be kept.\ + :param percentage: float ranging from 0 to 1. Defining the + relative share of proteins that were detected in the sample, in order to be kept. :type percentage: float - :return: returns the filtered df as a Dataframe and a dict with a list of Protein IDs\ - that were discarded and a list of Protein IDs\ - that were kept + + :return: returns the filtered df as a DataFrame and a dict with a list of Protein IDs + that were discarded and a list of Protein IDs + that were kept :rtype: Tuple[pandas DataFrame, dict] """ intensity_name = intensity_df.columns.values.tolist()[3]
diff --git a/protzilla/data_preprocessing/imputation.py b/protzilla/data_preprocessing/imputation.py index 160112cfb..11c743c02 100644 --- a/protzilla/data_preprocessing/imputation.py +++ b/protzilla/data_preprocessing/imputation.py
@@ -29,17 +29,18 @@ def by_knn( Implements an instance of the sklearn.impute KNNImputer class. https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html - :param intensity_df: the dataframe that should be filtered in\ - long format + + :param intensity_df: the dataframe that should be imputed, in + long format :type intensity_df: pandas DataFrame - :param number_of_neighbours: number of neighbouring samples used for\ - imputation. Default: 5 + :param number_of_neighbours: number of neighbouring samples used for + imputation. Default: 5 :type number_of_neighbours: int - :param **kwargs: additional keyword arguments passed to\ + :param **kwargs: additional keyword arguments passed to KNNImputer.fit_transform :type kwargs: dict - :return: returns an imputed dataframe in typical protzilla long format\ - and an empty dict + :return: returns an imputed dataframe in typical protzilla long format + and an empty dict :rtype: pd.DataFrame, dict """
@@ -74,14 +75,15 @@ def by_simple_imputer( no data will be imputed. This function automatically filters out such proteins from the DataFrame beforehand. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be imputed, in + long format :type intensity_df: pandas DataFrame - :param strategy: Defines the imputation strategy. Can be "mean",\ - "median" or "most_frequent" (for mode). + :param strategy: Defines the imputation strategy. Can be "mean", + "median" or "most_frequent" (for mode). :type strategy: str + + :return: returns an imputed dataframe in typical protzilla long format + and an empty dict :rtype: pd.DataFrame, dict """ assert strategy in ["mean", "median", "most_frequent"]
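Both imputers wrap standard scikit-learn estimators and operate on the wide sample-by-protein matrix. A minimal sketch of their behaviour (the wide pivot mirrors long_to_wide from this patch; everything else is plain scikit-learn):

```python
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer

# Wide matrix: one row per sample, one column per protein group.
wide = pd.DataFrame(
    {"P1": [1.0, np.nan, 3.0], "P2": [4.0, 5.0, np.nan]},
    index=["Sample1", "Sample2", "Sample3"],
)

# KNN imputation: each missing value is estimated from the
# number_of_neighbours most similar samples.
knn_imputed = KNNImputer(n_neighbors=2).fit_transform(wide)

# Simple imputation: per-column (per-protein) mean, median or mode.
median_imputed = SimpleImputer(strategy="median").fit_transform(wide)
```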
@@ -117,16 +119,17 @@ def by_min_per_sample( If not wanted, make sure to filter 0 intensity samples in the filtering step. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be imputed, in + long format :type intensity_df: pandas DataFrame - :param shrinking_value: a factor to alter the minimum value\ - used for imputation. With a shrinking factor of 0.1 for\ - example, a tenth of the minimum value found will be used for\ - imputation. Default: 1 (no shrinking) + :param shrinking_value: a factor to alter the minimum value + used for imputation. With a shrinking factor of 0.1 for + example, a tenth of the minimum value found will be used for + imputation. Default: 1 (no shrinking) :type shrinking_value: float - :return: returns an imputed dataframe in typical protzilla long format\ - and an empty dict + + :return: returns an imputed dataframe in typical protzilla long format + and an empty dict :rtype: pd.DataFrame, dict """ intensity_df_copy = intensity_df.copy(deep=True)
@@ -156,16 +159,17 @@ def by_min_per_protein( take a fraction of that minimum value for imputation. CAVE: All proteins without any values will be filtered out. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be imputed, in + long format :type intensity_df: pandas DataFrame - :param shrinking_value: a factor to alter the minimum value\ - used for imputation. With a shrinking factor of 0.1 for\ - example, a tenth of the minimum value found will be used for\ - imputation. Default: 1 (no shrinking) + :param shrinking_value: a factor to alter the minimum value + used for imputation. With a shrinking factor of 0.1 for + example, a tenth of the minimum value found will be used for + imputation. Default: 1 (no shrinking) :type shrinking_value: float - :return: returns an imputed dataframe in typical protzilla long format\ - and an empty dict + + :return: returns an imputed dataframe in typical protzilla long format + and an empty dict :rtype: pd.DataFrame, dict """ transformed_df = long_to_wide(intensity_df)
@@ -200,16 +204,17 @@ def by_min_per_dataset( the dataframe. The user can also assign a shrinking factor to take a fraction of that minimum value for imputation. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be imputed, in + long format :type intensity_df: pandas DataFrame - :param shrinking_value: a factor to alter the minimum value\ - used for imputation. With a shrinking factor of 0.1 for\ - example, a tenth of the minimum value found will be used for\ - imputation. Default: 1 (no shrinking) + :param shrinking_value: a factor to alter the minimum value + used for imputation. With a shrinking factor of 0.1 for + example, a tenth of the minimum value found will be used for + imputation. Default: 1 (no shrinking) :type shrinking_value: float - :return: returns an imputed dataframe in typical protzilla long format\ - and an empty dict + + :return: returns an imputed dataframe in typical protzilla long format + and an empty dict :rtype: pd.DataFrame, dict """ intensity_df_copy = intensity_df.copy(deep=True)
@@ -262,7 +267,6 @@ def by_normal_distribution_sampling( transformed_df = long_to_wide(intensity_df) # iterate over all protein groups for protein_grp in transformed_df.columns: - number_of_nans = transformed_df[protein_grp].isnull().sum() # don't impute values if there are not enough values (> 1) to sample from
@@ -391,7 +395,6 @@ def _build_box_hist_plot( 2. a graph summarising the amount of filtered proteins. - """ if graph_type == "Boxplot": fig1 = create_box_plots(
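The three minimum-value strategies differ only in the scope over which the minimum is taken. A hedged pandas sketch of the per-protein variant (names are illustrative; the shrinking behaviour follows the docstrings above):

```python
import pandas as pd

def impute_min_per_protein(wide: pd.DataFrame, shrinking_value: float = 1.0):
    """Fill NaNs per protein column with shrinking_value * that column's minimum."""
    column_minima = wide.min()  # per-protein minimum across all samples
    return wide.fillna(column_minima * shrinking_value)

# The per-sample variant would take minima per row (wide.min(axis=1)),
# and the per-dataset variant the single global minimum, wide.min().min().
```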
diff --git a/protzilla/data_preprocessing/normalisation.py b/protzilla/data_preprocessing/normalisation.py index 11c82f0b5..85d32433f 100644 --- a/protzilla/data_preprocessing/normalisation.py +++ b/protzilla/data_preprocessing/normalisation.py
@@ -14,11 +14,12 @@ def by_z_score(intensity_df: pd.DataFrame) -> tuple[pd.DataFrame, dict]: Scales the data to zero mean and unit variance. This is often also called z-score normalisation/transformation. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be normalised, in + long format :type intensity_df: pd.DataFrame - :return: returns a scaled dataframe in typical protzilla long format and an empty\ - dictionary + + :return: returns a scaled dataframe in typical protzilla long format and an empty + dictionary :rtype: Tuple[pandas DataFrame, dict] """
@@ -55,14 +56,15 @@ def by_median( Divides each intensity by the chosen intensity quantile of the respective sample. By default, the median (50%-quantile) is used. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be normalised, in + long format :type intensity_df: pandas DataFrame - :param percentile: the chosen quartile of the sample intensities for\ - normalisation + :param percentile: the chosen quantile of the sample intensities for + normalisation :type percentile: float - :return: returns a scaled dataframe in typical protzilla long format\ - and a dict, containing all zeroed samples due to quantile being 0 + + :return: returns a scaled dataframe in typical protzilla long format + and a dict, containing all zeroed samples due to quantile being 0 :rtype: Tuple[pandas DataFrame, dict] """
@@ -116,11 +118,12 @@ def by_totalsum(intensity_df: pd.DataFrame) -> tuple[pd.DataFrame, dict]: Normalises the data on the level of each sample. Divides each intensity by the total sum of sample intensities. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be normalised, in + long format :type intensity_df: pandas DataFrame - :return: returns a scaled dataframe in typical protzilla long format\ - and a dict, containing all zeroed samples due to sum being 0 + + :return: returns a scaled dataframe in typical protzilla long format + and a dict, containing all zeroed samples due to sum being 0 :rtype: Tuple[pandas DataFrame, dict] """
@@ -178,13 +181,13 @@ def by_reference_protein( protein in each sample. Samples where this value is zero will be removed and returned separately. - :param intensity_df: the dataframe that should be filtered in\ - long format + :param intensity_df: the dataframe that should be normalised, in + long format :type intensity_df: pandas DataFrame :param reference_protein: Protein ID of the protein to normalise by - type reference_protein_id: str - :return: returns a scaled dataframe in typical protzilla long format \ - and dict with a list of the indices of the dropped samples + :type reference_protein: str + :return: returns a scaled dataframe in typical protzilla long format + and dict with a list of the indices of the dropped samples :rtype: Tuple[pandas DataFrame, dict] """ scaled_df = pd.DataFrame()
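All of these normalisations are per-sample rescalings of the long-format intensities. A hedged sketch of the median variant with a pandas groupby (column names follow the docstrings; protzilla's actual implementation may differ in details such as zero handling):

```python
import pandas as pd

def normalise_by_median(intensity_df: pd.DataFrame) -> pd.DataFrame:
    """Divide each intensity by the median intensity of its sample."""
    intensity_col = intensity_df.columns[3]
    sample_medians = intensity_df.groupby("Sample")[intensity_col].transform("median")
    out = intensity_df.copy()
    out[intensity_col] = out[intensity_col] / sample_medians
    return out

# by_totalsum is the same idea with .transform("sum");
# by_z_score subtracts the per-sample mean and divides by the std.
```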
diff --git a/protzilla/data_preprocessing/outlier_detection.py b/protzilla/data_preprocessing/outlier_detection.py index bde4ea5dd..f88807eb3 100644 --- a/protzilla/data_preprocessing/outlier_detection.py +++ b/protzilla/data_preprocessing/outlier_detection.py
@@ -21,16 +21,17 @@ def by_isolation_forest( isolation forest approach. :param intensity_df: a dataframe in typical protzilla long format - on which the outlier detection is performed + on which the outlier detection is performed :type intensity_df: pandas DataFrame :param n_estimators: the number of estimators used by the algorithm, - default: 100 + default: 100 :type n_estimators: integer :param n_jobs: Number of kernels used by the algorithm, default: - all kernels (-1) + all kernels (-1) :type n_jobs: integer + :return: returns a Dataframe containing all samples that are not outliers and a + dict with list of outlier sample names :rtype: Tuple[pandas DataFrame, dict] """ try:
@@ -85,16 +86,17 @@ def by_local_outlier_factor( https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html :param intensity_df: a dataframe in typical protzilla long format - on which the outlier detection is performed + on which the outlier detection is performed :type intensity_df: pandas DataFrame :param number_of_neighbors: number of neighbors used by the - algorithm, default: 20 + algorithm, default: 20 :type number_of_neighbors: int :param n_jobs: Number of kernels used by the algorithm, default: - all kernels (-1) + all kernels (-1) :type n_jobs: int + :return: returns a Dataframe containing all samples that are not outliers and a + dict with list of outlier sample names :rtype: Tuple[pandas DataFrame, dict] """ try:
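Both detectors come straight from scikit-learn and flag whole samples on the wide sample-by-protein matrix. A minimal sketch (an imputed, wide-format input is assumed, since these estimators cannot handle NaNs):

```python
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

def find_outlier_samples(wide: pd.DataFrame) -> dict:
    """Return sample names flagged as outliers by each method (-1 = outlier)."""
    iso_labels = IsolationForest(n_estimators=100, n_jobs=-1).fit_predict(wide)
    lof_labels = LocalOutlierFactor(n_neighbors=20, n_jobs=-1).fit_predict(wide)
    return {
        "isolation_forest": wide.index[iso_labels == -1].tolist(),
        "local_outlier_factor": wide.index[lof_labels == -1].tolist(),
    }
```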
@@ -141,20 +143,21 @@ def by_pca( https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html :param intensity_df: a dataframe in typical protzilla long format - on which the outlier detection is performed + on which the outlier detection is performed :type intensity_df: pandas DataFrame :param threshold: distance from the median in - number of standard deviations to be included, - default: 2 + number of standard deviations to be included, + default: 2 :type threshold: float :param number_of_components: number of principal components - used in the PCA. Allowed: 2 or 3. Default: 3 + used in the PCA. Allowed: 2 or 3. Default: 3 :type number_of_components: integer (2 or 3) + :return: returns a Dataframe containing all samples that are not outliers. - A dict with list of inlier sample names, a DataFrame that contains the projection \ - of the intensity_df on first principal components, a list that contains the \ - explained variation for each component and an int, the number of components \ - the calculations were executed with + A dict with list of inlier sample names, a DataFrame that contains the projection + of the intensity_df on first principal components, a list that contains the + explained variation for each component and an int, the number of components + the calculations were executed with :rtype: Tuple[pandas DataFrame, dict] """ try:
diff --git a/protzilla/data_preprocessing/peptide_filter.py b/protzilla/data_preprocessing/peptide_filter.py index 4be6adde2..ce1833aef 100644 --- a/protzilla/data_preprocessing/peptide_filter.py +++ b/protzilla/data_preprocessing/peptide_filter.py
@@ -14,12 +14,12 @@ def by_pep_value( :type intensity_df: pd.Dataframe :param peptide_df: the pandas dataframe containing the peptide information :type peptide_df: pd.Dataframe - :param threshold: peptides with a PEP-value below this threshold will be filtered\ - out + :param threshold: peptides with a PEP-value below this threshold will be filtered + out :type threshold: float :return: intensity-df, piped through, dict with peptide_df without the peptides - below the threshold and a list with filtered-out peptides (Sequences) + below the threshold and a list with filtered-out peptides (Sequences) :rtype: Tuple[pd.Dataframe, dict(pd.Dataframe, list)] """
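The PEP filter itself amounts to a one-line pandas predicate. A hedged sketch following the semantics stated in the docstring (the `PEP` and `Sequence` column names are assumptions; the real peptide dataframe columns may differ):

```python
import pandas as pd

def filter_peptides_by_pep(peptide_df: pd.DataFrame, threshold: float):
    """Drop peptides whose PEP value lies below the threshold."""
    filtered_out = peptide_df[peptide_df["PEP"] < threshold]
    kept = peptide_df[peptide_df["PEP"] >= threshold]
    return kept, filtered_out["Sequence"].tolist()
```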
diff --git a/protzilla/data_preprocessing/plots.py b/protzilla/data_preprocessing/plots.py index c8add29b2..a4cfcb411 100644 --- a/protzilla/data_preprocessing/plots.py +++ b/protzilla/data_preprocessing/plots.py
@@ -123,11 +123,11 @@ def create_box_plots( (for example before and after filtering/normalisation) and creates a visualisation for each one. - :param dataframe_a: First dataframe in protzilla long format for\ - first boxplot + :param dataframe_a: First dataframe in protzilla long format for + first boxplot :type dataframe_a: pd.DataFrame - :param dataframe_b: Second dataframe in protzilla long format\ - for second boxplot + :param dataframe_b: Second dataframe in protzilla long format + for second boxplot :type dataframe_b: pd.DataFrame :param name_a: Name of first boxplot :type name_a: str :param name_b: Name of second boxplot :type name_b: str :param heading: Header for the graph :type heading: str :param y_title: Optional y axis title for graphs. :type y_title: str :param x_title: Optional x-axis title for graphs. :type x_title: str - :param group_by: Optional argument to create a grouped boxplot\ - graph. Arguments can be either "Sample" to group by sample or\ - "Protein ID" to group by protein. Leave "None" to get ungrouped\ - conventional graphs. If set the function will ignore the\ - graph_type argument. Default is "None". + :param group_by: Optional argument to create a grouped boxplot + graph. Arguments can be either "Sample" to group by sample or + "Protein ID" to group by protein. Leave "None" to get ungrouped + conventional graphs. If set the function will ignore the + graph_type argument. Default is "None". :type group_by: str + :return: returns a boxplot of the data :rtype: Figure (plotly object) """
@@ -223,11 +224,11 @@ def create_histograms( (for example before and after filtering/normalisation) and creates a visualisation for each one. - :param dataframe_a: First dataframe in protzilla long format for\ - first histogram + :param dataframe_a: First dataframe in protzilla long format for + first histogram :type dataframe_a: pd.DataFrame - :param dataframe_b: Second dataframe in protzilla long format\ - for second histogram + :param dataframe_b: Second dataframe in protzilla long format + for second histogram :type dataframe_b: pd.DataFrame :param name_a: Name of first histogram :type name_a: str :param name_b: Name of second histogram :type name_b: str :param heading: Header for the graph :type heading: str :param y_title: Optional y axis title for graphs. :type y_title: str :param x_title: Optional x axis title for graphs. :type x_title: str + :return: returns a histogram of the data :rtype: Figure (plotly object) """
@@ -288,16 +290,17 @@ def create_anomaly_score_bar_plot( This function creates a graph visualising the outlier and non-outlier samples using the anomaly score. - :param anomaly_df: pandas Dataframe that contains the anomaly score for each\ - sample, including outliers and on-outliers samples + :param anomaly_df: pandas Dataframe that contains the anomaly score for each + sample, including outlier and non-outlier samples :type anomaly_df: pd.DataFrame :param colour_outlier: hex code for colour depicting the outliers. - Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE outlier colour + Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE outlier colour :type colour_outlier: str :param colour_non_outlier: hex code for colour depicting the - non-outliers. Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE - non-outlier colour + non-outliers. Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE + non-outlier colour :type colour_non_outlier: str + :return: returns a plotly Figure object :rtype: Figure (plotly object) """
@@ -349,19 +352,20 @@ def create_pca_2d_scatter_plot( and non-outlier points by showing the principal components. It returns a plotly Figure object. - :param pca_df: a DataFrame that contains the projection of\ - the intensity_df on first principal components + :param pca_df: a DataFrame that contains the projection of + the intensity_df on first principal components :type pca_df: pd.DataFrame - :param explained_variance_ratio: a list that contains the\ - explained variation for each component + :param explained_variance_ratio: a list that contains the + explained variation for each component :type explained_variance_ratio: list :param colour_outlier: hex code for colour depicting the outliers. - Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE outlier colour + Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE outlier colour :type colour_outlier: str :param colour_non_outlier: hex code for colour depicting the - non-outliers. Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE - non-outlier colour + non-outliers. Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE + non-outlier colour :type colour_non_outlier: str + :return: returns a plotly Figure object :rtype: Figure (plotly object) """
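Since all of these helpers return plotly Figure objects, the same kind of figure can be prototyped in a few lines with plotly express. A hedged sketch of a per-sample box plot over a long-format frame (column names mirror the protzilla long format; styling is omitted):

```python
import pandas as pd
import plotly.express as px

long_df = pd.DataFrame({
    "Sample": ["S1", "S1", "S2", "S2"],
    "Protein ID": ["P1", "P2", "P1", "P2"],
    "Intensity": [1.2, 3.4, 2.1, 2.9],
})

# One box per sample, comparable to group_by="Sample" in create_box_plots.
fig = px.box(long_df, x="Sample", y="Intensity")
fig.show()
```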
@@ -403,19 +407,20 @@ def create_pca_3d_scatter_plot( and non-outlier points by showing the principal components. It returns a plotly Figure object. - :param pca_df: a DataFrame that contains the projection of\ - the intensity_df on first principal components + :param pca_df: a DataFrame that contains the projection of + the intensity_df on first principal components :type pca_df: pd.DataFrame - :param explained_variance_ratio: a list that contains the\ - explained variation for each component + :param explained_variance_ratio: a list that contains the + explained variation for each component :type explained_variance_ratio: list :param colour_outlier: hex code for colour depicting the outliers. - Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE outlier colour + Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE outlier colour :type colour_outlier: str :param colour_non_outlier: hex code for colour depicting the - non-outliers. Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE - non-outlier colour + non-outliers. Default: PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE + non-outlier colour :type colour_non_outlier: str + :return: returns a plotly Figure object :rtype: Figure (plotly object) """
diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py index 4b546a0ca..5e8e16ba7 100644 --- a/protzilla/data_preprocessing/transformation.py +++ b/protzilla/data_preprocessing/transformation.py
@@ -12,11 +12,12 @@ def by_log(intensity_df: pd.DataFrame, log_base="log10"): :param intensity_df: a protein data frame in long format :type intensity_df: pd.DataFrame - :param log_base: String of the used log method "log10" (base 10)\ - or "log2" (base 2). Default: "log10" - :type log_base: Str + :param log_base: String of the used log method "log10" (base 10) + or "log2" (base 2). Default: "log10" :type log_base: str + + :return: returns a pandas DataFrame in typical protzilla + long format with the transformed data and an empty dict. :rtype: Tuple[pandas DataFrame, dict] """ intensity_name = intensity_df.columns.values.tolist()[3]
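The log transformation is a direct numpy operation on the intensity column. A hedged sketch (selecting the intensity column as the fourth column, as in the snippet above; the real function may differ in details):

```python
import numpy as np
import pandas as pd

def log_transform(intensity_df: pd.DataFrame, log_base: str = "log10") -> pd.DataFrame:
    """Apply log10 or log2 to the intensity column of a long-format frame."""
    intensity_name = intensity_df.columns.values.tolist()[3]
    out = intensity_df.copy()
    log_fn = np.log10 if log_base == "log10" else np.log2
    out[intensity_name] = log_fn(out[intensity_name])
    return out
```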
diff --git a/protzilla/history.py b/protzilla/history.py index 8de6f99fd..4af01a468 100644 --- a/protzilla/history.py +++ b/protzilla/history.py
@@ -15,13 +15,17 @@ class History: This class has the responsibility to save what methods were previously executed in a Run. Each Run has one History. It is responsible for saving dataframes to disk. - :ivar steps is a list of the steps that have been executed, represented by + + :param steps: is a list of the steps that have been executed, represented by ExecutedStep instances. - :ivar df_mode determines if the dataframe of a completed step that is added to the + :type steps: list[ExecutedStep] + :param df_mode: determines if the dataframe of a completed step that is added to the history is saved to disk and not held in memory ("disk" mode), held in memory but not saved to disk ("memory" mode) or both ("disk_memory" mode). - :ivar run_name is the name of the run a history instance belongs to. It is used to + :type df_mode: str + :param run_name: is the name of the run a history instance belongs to. It is used to save things at the correct disk location. + :type run_name: str """ @classmethod
diff --git a/protzilla/importing/__init__.py b/protzilla/importing/__init__.py new file mode 100644 index 000000000..e69de29bb
diff --git a/protzilla/run.py b/protzilla/run.py index 97e6054bd..ba5a35af5 100644 --- a/protzilla/run.py +++ b/protzilla/run.py
@@ -22,24 +22,52 @@ class Run: """ - :ivar run_path: the path to this runs' dir - :ivar workflow_config - :ivar run_name - :ivar history - :ivar step_index - :ivar workflow_meta - - :ivar section - :ivar step - :ivar method - :ivar df: dataframe that will be used as input for the next data preprocessing step, not used in data analysis - :ivar result_df - :ivar current_out - :ivar current_parameters: calculation parameters that were used to calculate for each method - :ivar current_plot_parameters: plot parameters that were used to generate plots for each method, not used in data analysis - :ivar calculated_method: method that was last used to calculate - :ivar plots - :ivar plotted_for_parameters: calculation parameters that were used to generate the results that were used to generate plots, not used in data analysis + A class to represent a complete data analysis run in protzilla. + + :param run_path: the path to this run's dir + :type run_path: str + :param workflow_config: Contains the contents of the workflow .json + that was selected for this run at first. It is always updated when + the workflow gets changed throughout the run (e.g. change of a parameter). + :type workflow_config: dict + :param run_name: name of the run + :type run_name: str + :param history: an instance of the History class to access the history of this run + :type history: protzilla.History + :param step_index: index of the current step over all steps in the workflow + :type step_index: int + :param workflow_meta: contents of the workflow meta file, which lists all + methods and parameters that exist in protzilla + :type workflow_meta: dict + + :param section: current section + :type section: str + :param step: current step + :type step: str + :param method: current method + :type method: str + :param df: dataframe that will be used as input for the next data preprocessing step + (Not used in data analysis! Due to the more flexible dataflow during analysis + the input dataframe for an analysis step needs to be selectable in the frontend and is an + input parameter for each new step) + :type df: pandas.DataFrame + :param result_df: contains the modified intensity dataframe after a step + :type result_df: pandas.DataFrame + :param current_out: contains other outputs from the current step + :type current_out: dict + :param current_parameters: calculation parameters that were used to calculate the current step + (e.g. to update workflow_config correctly) + :type current_parameters: dict + :param current_plot_parameters: plot parameters that were used to generate plots for the + current step (Not used in data analysis!
A plot is its own step in that section + to allow for more flexibility) + :type current_plot_parameters: dict + :param calculated_method: method that was used to calculate the current step + :type calculated_method: str + :param plots: contains the plots generated in the current step + :type plots: list[Figure] + :param plotted_for_parameters: calculation parameters that were used to generate the results that were used to generate current plots, not used in data analysis + :type plotted_for_parameters: dict """ @classmethod
diff --git a/protzilla/utilities/clustergram.py b/protzilla/utilities/clustergram.py index 178aa048c..edfdd47d6 100644 --- a/protzilla/utilities/clustergram.py +++ b/protzilla/utilities/clustergram.py
@@ -65,6 +65,19 @@ def Clustergram( height=800, width=1000, ): + """ + This is an adapted version of Plotly's clustergram found in the Dash Bio package. + In this adaptation, we've made it possible to display a vertical or horizontal + colorbar, as well as a legend for the color bar. To achieve this, we've made changes + to the "row_colors" parameter and introduced a new parameter called + "row_colors_to_label_dict." + + The "row_colors" parameter now consists of a list with a length equal to the number + of samples, where each element represents a specific color corresponding to that + sample. The "row_colors_to_label_dict" is a dictionary that associates each color + in the "row_colors" list with a specific group. This mapping is used to create the + color bar legend. + """ if color_threshold is None: color_threshold = dict(row=0, col=0)
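The description above translates into two small data structures. A hedged sketch of what a caller might pass (the colors and group names are made up; the parameter semantics follow the docstring):

```python
# One color per sample, in the same order as the clustergram's rows.
row_colors = ["#1f77b4", "#1f77b4", "#ff7f0e", "#ff7f0e"]

# Maps each color used above to the group it stands for;
# this mapping is what the color bar legend is built from.
row_colors_to_label_dict = {
    "#1f77b4": "Control",
    "#ff7f0e": "Treatment",
}
```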
diff --git a/protzilla/utilities/dunn_score.py b/protzilla/utilities/dunn_score.py index 1ba0924c8..8862901a7 100644 --- a/protzilla/utilities/dunn_score.py +++ b/protzilla/utilities/dunn_score.py
@@ -34,6 +34,7 @@ def dunn_score(X, labels=None): :type X: pd.DataFrame :param labels: the predicted labels/classes by the clustering algorithm :type labels: pd.DataFrame + :returns: the dunn index for the clusters found for a given data set X :rtype: float """
diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py index bea8fffab..df36dab49 100644 --- a/protzilla/utilities/transform_dfs.py +++ b/protzilla/utilities/transform_dfs.py
@@ -3,17 +3,18 @@ def long_to_wide(intensity_df: pd.DataFrame): """ - This function transforms the dataframe to a wide format that - can be more easily handled by packages such as sklearn. - Each sample gets one row with all observations as columns. + This function transforms the dataframe to a wide format that + can be more easily handled by packages such as sklearn. + Each sample gets one row with all observations as columns. - :param intensity_df: the dataframe that should be changed in\ + :param intensity_df: the dataframe in long format that should be transformed into wide format :type intensity_df: pd.DataFrame - :return: returns dataframe in wide format suitable for use by\ + + :return: returns dataframe in wide format suitable for use by packages such as sklearn - :rtype: pd.DataFrame - """ + :rtype: pd.DataFrame + """ values_name = intensity_df.columns[3] return pd.pivot( intensity_df, index="Sample", columns="Protein ID", values=values_name )
@@ -25,12 +26,13 @@ def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): This function transforms the dataframe from a wide format to the typical protzilla long format. - :param wide_df: the dataframe in wide format that\ - should be changed + :param wide_df: the dataframe in wide format that + should be changed :type wide_df: pd.DataFrame - :param original_long_df: the original long protzilla format\ - dataframe, that was the source of the wide format dataframe + :param original_long_df: the original long protzilla format + dataframe, that was the source of the wide format dataframe :type original_long_df: pd.DataFrame + :return: returns dataframe in typical protzilla long format :rtype: pd.DataFrame """
@@ -63,8 +65,10 @@ def is_intensity_df(df: pd.DataFrame): Checks if the dataframe is an intensity dataframe. An intensity dataframe should have the columns "Sample", "Protein ID" and an intensity column. + :param df: the dataframe that should be checked :type df: pd.DataFrame + :return: returns True if the dataframe is an intensity dataframe :rtype: bool """
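The two reshapes are thin wrappers around pandas pivot/melt. A hedged round-trip sketch (the pivot call is taken from the diff above; the melt-based inverse is an illustrative reconstruction, not necessarily protzilla's exact code, which restores extra columns from original_long_df):

```python
import pandas as pd

long_df = pd.DataFrame({
    "Sample": ["S1", "S1", "S2", "S2"],
    "Protein ID": ["P1", "P2", "P1", "P2"],
    "Gene": ["G1", "G2", "G1", "G2"],
    "Intensity": [1.0, 2.0, 3.0, 4.0],
})

# long -> wide, as in long_to_wide: one row per sample.
wide = pd.pivot(long_df, index="Sample", columns="Protein ID", values="Intensity")

# wide -> long again (simplified inverse).
back = wide.reset_index().melt(
    id_vars="Sample", var_name="Protein ID", value_name="Intensity"
)
```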
diff --git a/protzilla/utilities/utilities.py b/protzilla/utilities/utilities.py index c1bc418fd..a188dfdc8 100644 --- a/protzilla/utilities/utilities.py +++ b/protzilla/utilities/utilities.py
@@ -40,6 +40,7 @@ def fig_to_base64(fig): :param fig: matplotlib figure :type fig: matplotlib.figure.Figure + :return: base64 encoded image :rtype: bytes """
diff --git a/requirements.txt b/requirements.txt index 2d41a18ff..da2023280 100644 --- a/requirements.txt +++ b/requirements.txt
@@ -33,3 +33,5 @@ protgraph @ git+https://github.com/antonneubauer/ProtGraph@master joblib==1.2.0 networkx==3.1 beautifulsoup4==4.12.2 +sphinx==7.2.6 +sphinx-autoapi==3.0.0
diff --git a/tests/protzilla/data_analysis/__init__.py b/tests/protzilla/data_analysis/__init__.py new file mode 100644 index 000000000..e69de29bb
diff --git a/tests/protzilla/data_integration/__init__.py b/tests/protzilla/data_integration/__init__.py new file mode 100644 index 000000000..e69de29bb
diff --git a/tests/protzilla/importing/__init__.py b/tests/protzilla/importing/__init__.py new file mode 100644 index 000000000..e69de29bb
diff --git a/tests/ui/__init__.py b/tests/ui/__init__.py new file mode 100644 index 000000000..e69de29bb
diff --git a/ui/main/upload_handler.py b/ui/main/upload_handler.py index 4d412f050..a49286581 100644 --- a/ui/main/upload_handler.py +++ b/ui/main/upload_handler.py
@@ -9,6 +9,10 @@ # copied from TemporaryFileUploadHandler class CustomFileUploadHandler(FileUploadHandler): + """ + The same as Django's TemporaryFileUploadHandler, except for writing to a CustomUploadedFile. + """ + def new_file(self, *args, **kwargs): """ Create the file object to append to as data is coming in.
@@ -39,7 +43,7 @@ def upload_interrupted(self): # copied from TemporaryUploadedFile class CustomUploadedFile(UploadedFile): """ - A file uploaded to a temporary location (i.e. stream-to-disk). + The same as Django's TemporaryUploadedFile, except for passing different arguments when creating the file. """ def __init__(self, name, content_type, size, charset, content_type_extra=None):
diff --git a/ui/runs/fields.py b/ui/runs/fields.py index 66d3b53cb..05125c186 100644 --- a/ui/runs/fields.py +++ b/ui/runs/fields.py
@@ -13,6 +13,22 @@ def make_current_fields(run, section, step, method): + """ + Wrapper method that generates the fields for the current method + based on the data in the workflow_meta.json file. + + :param run: The current run object + :type run: Run + :param section: The current section + :type section: str + :param step: The current step + :type step: str + :param method: The current method + :type method: str + + :return: A list of fields for the current method + :rtype: list + """ if not step: return [] parameters = get_parameters(run, section, step, method)
@@ -28,10 +44,26 @@ def make_parameter_input(key, param_dict, all_parameters_dict, disabled): - # In this method param_dict refers to the dictionary that contains all - # meta information about a specific parameter e.g. type, default value. The - # all_parameters_dict refers to the dictionary that contains all parameters for - # a method with its corresponding meta information + """ + Generates the html for a single parameter input field. The + type of the input field is determined by the type of the parameter as specified + in the workflow_meta.json. + May be called recursively by make_dynamic_fields if the parameter is a dynamic parameter. + + :param key: The name of the parameter, matches the key in the workflow_meta.json + :type key: str + :param param_dict: The dictionary containing all meta information about the parameter + e.g. type, default value + :type param_dict: dict + :param all_parameters_dict: The dictionary containing all parameters for the current method + with corresponding meta information + :type all_parameters_dict: dict + :param disabled: Whether the input field should be disabled + :type disabled: bool + + :return: The html for the input field + :rtype: str + """ if param_dict["type"] == "numeric": param_dict["multiple"] = param_dict.get("multiple", False) template = "runs/field_number.html"
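For orientation, a hedged sketch of what such a parameter entry might look like, written as the Python dict that make_parameter_input receives. Only "type" and "multiple" appear in the code above; the remaining key is an illustrative assumption about workflow_meta.json:

```python
# Hypothetical param_dict for a numeric input field.
param_dict = {
    "type": "numeric",   # selects the field template, e.g. runs/field_number.html
    "multiple": False,   # numeric fields default this to False when missing
    "default": 0.5,      # illustrative: a default value shown in the field
}
```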
@@ -71,6 +103,21 @@ def make_dynamic_fields(param_dict, selected_category, all_parameters_dict, disabled): + """ + Generates the html for the dynamic fields of a "categorical_dynamic" type parameter. + This is used to dynamically add fields based on the selected_category. + + :param param_dict: The dictionary containing all meta information about the parameter + e.g. type, default value + :type param_dict: dict + :param selected_category: The currently selected category of the field described by param_dict + :type selected_category: str + :param all_parameters_dict: The dictionary containing all parameters for the current method + with corresponding meta information + :type all_parameters_dict: dict + :param disabled: Whether the fields should be disabled + :type disabled: bool + """ dynamic_fields = [] if selected_category in param_dict["dynamic_parameters"]: dynamic_parameters_list = param_dict["dynamic_parameters"][selected_category]
@@ -85,6 +132,19 @@ def make_sidebar(request, run, run_name): + """ + Renders the sidebar of the run detail page. + + :param request: The current request + :type request: HttpRequest + :param run: The current run object + :type run: Run + :param run_name: The name of the current run + :type run_name: str + + :return: The html for the sidebar + :rtype: str + """ csrf_token = request.META["CSRF_COOKIE"] template = "runs/sidebar.html" return render_to_string(
@@ -100,6 +160,23 @@ def make_plot_fields(run, section, step, method): + """ + Generates the html for the plot fields of the current method. + This is only used when a plot is a part of a step and not its own step, + as is the case for the data preprocessing section. + + :param run: The current run object + :type run: Run + :param section: The current section + :type section: str + :param step: The current step + :type step: str + :param method: The current method + :type method: str + + :return: The html for the plot fields + :rtype: str + """ if not step: return plots = run.workflow_meta[section][step][method].get("graphs", [])
@@ -115,6 +192,21 @@ def make_method_dropdown(run, section, step, method): + """ + Generates the html for the method dropdown of the current step. + + :param run: The current run object + :type run: Run + :param section: The current section + :type section: str + :param step: The current step + :type step: str + :param method: The current method + :type method: str + + :return: The html for the method dropdown + :rtype: str + """ if not step: return "" methods = run.workflow_meta[section][step].keys()
@@ -133,6 +225,16 @@ def make_displayed_history(run): + """ + Generates the html for the history that is displayed at the + top of the current run. + + :param run: The current run object + :type run: Run + + :return: The html for the displayed history + :rtype: str + """ displayed_history = [] for i, history_step in enumerate(run.history.steps): fields = []
@@ -215,6 +317,22 @@ def make_name_field(allow_next, form, run, end_of_run): + """ + Generates the html for the field that allows the user to name the output of the + current method. + + :param allow_next: Whether the next button should be enabled + :type allow_next: bool + :param form: The form that the field belongs to + :type form: Form + :param run: The current run object + :type run: Run + :param end_of_run: Whether the current step is the last step of the run + :type end_of_run: bool + + :return: The html for the name field + :rtype: str + """ if end_of_run: return "" default = get_workflow_default_param_value(
diff --git a/ui/runs/utilities/__init__.py b/ui/runs/utilities/__init__.py new file mode 100644 index 000000000..e69de29bb
diff --git a/ui/runs/views.py b/ui/runs/views.py index 7ec7393d3..0140f361a 100644 --- a/ui/runs/views.py +++ b/ui/runs/views.py
@@ -43,6 +43,15 @@ def index(request): + """ + Renders the main index page of the PROTzilla application. + + :param request: the request object + :type request: HttpRequest + + :return: the rendered index page + :rtype: HttpResponse + """ return render( request, "runs/index.html",
@@ -54,6 +63,20 @@ def detail(request, run_name): + """ + Renders the details page of a specific run. + For rendering, a context dict is created that contains all the dynamic information + that is needed to display the page. This wraps other methods that provide subparts + for the page, e.g. make_displayed_history() to show the history. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered details page + :rtype: HttpResponse + """ if run_name not in active_runs: active_runs[run_name] = Run.continue_existing(run_name) run = active_runs[run_name]
@@ -120,7 +143,19 @@ def change_method(request, run_name): - # TODO 92 extract into a seperate method like try_reactivate_run + """ + Changes the method during a step of a run.
+ This is called when the user selects a new method in the first dropdown of a step. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a JSON response object containing the new fields for the selected method + :rtype: JsonResponse + """ + # TODO 92 extract into a separate method like try_reactivate_run try: if run_name not in active_runs: active_runs[run_name] = Run.continue_existing(run_name)
@@ -153,6 +188,21 @@ def change_dynamic_fields(request, run_name): + """ + Renders fields that depend on the value of another field, e.g. a dropdown; the value + is the dynamic_trigger_value below. The field is specified by its key, which is part of + the request. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a JSON response object containing the new fields depending on the value of + the dynamic trigger + :rtype: JsonResponse + """ + try: if run_name not in active_runs: active_runs[run_name] = Run.continue_existing(run_name)
@@ -178,6 +228,22 @@ def change_field(request, run_name): + """ + Changes the value of one or multiple fields during a method of a run depending on a + selected value in another field. The field that triggers this method is identified by + the post_id variable. + In contrast to change_dynamic_fields, this method changes the value of the field itself + instead of rendering new fields. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a JSON response object containing the updated fields depending on the value of the + dynamic trigger field + :rtype: JsonResponse + """ try: if run_name not in active_runs: active_runs[run_name] = Run.continue_existing(run_name)
@@ -310,6 +376,15 @@ def create(request): + """ + Creates a new run. The user is then redirected to the detail page of the run. + + :param request: the request object + :type request: HttpRequest + + :return: the rendered details page of the new run + :rtype: HttpResponse + """ run_name = request.POST["run_name"] run = Run.create( run_name,
@@ -321,24 +396,67 @@ def continue_(request): + """ + Continues an existing run. The user is redirected to the detail page of the run and + can resume working on the run. + + :param request: the request object + :type request: HttpRequest + + :return: the rendered details page of the run + :rtype: HttpResponse + """ run_name = request.POST["run_name"] active_runs[run_name] = Run.continue_existing(run_name) return HttpResponseRedirect(reverse("runs:detail", args=(run_name,))) def next_(request, run_name): + """ + Advances to and renders the next step/method of the run. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run with the next step/method + :rtype: HttpResponse + """ run = active_runs[run_name] run.next_step(request.POST["name"]) return HttpResponseRedirect(reverse("runs:detail", args=(run_name,))) def back(request, run_name): + """ + Goes back to and renders the previous step/method of the run.
+ + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run with the previous step/method + :rtype: HttpResponse + """ run = active_runs[run_name] run.back_step() return HttpResponseRedirect(reverse("runs:detail", args=(run_name,))) def add(request, run_name): + """ + Adds a new method to the run. The method is added as the next step. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run, new method visible in sidebar + :rtype: HttpResponse + """ run = active_runs[run_name] post = dict(request.POST)
@@ -352,6 +470,17 @@ def delete_step(request, run_name): + """ + Deletes a step/method from the run. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run, deleted method no longer visible in sidebar + :rtype: HttpResponse + """ run = active_runs[run_name] post = dict(request.POST)
@@ -363,6 +492,17 @@ def export_workflow(request, run_name): + """ + Exports the workflow of the run as a JSON file so that it can be reused and shared. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run + :rtype: HttpResponse + """ run = active_runs[run_name] post = dict(request.POST) del post["csrfmiddlewaretoken"]
@@ -375,6 +515,18 @@ def calculate(request, run_name): + """ + Performs the current method's calculation during the run. Django messages are used to + display additional information, warnings and errors to the user. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run + :rtype: HttpResponse + """ run = active_runs[run_name] parameters = parameters_from_post(request.POST) del parameters["chosen_method"]
@@ -406,6 +558,20 @@ def plot(request, run_name): + """ + Creates a plot from the current step/method of the run. + This is only called by the plot button in the data preprocessing section, i.e. when a plot is + simultaneously a step of its own. + Django messages are used to display additional information, warnings and errors to the user. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run, now with the plot + :rtype: HttpResponse + """ run = active_runs[run_name] section, step, method = run.current_run_location() parameters = parameters_from_post(request.POST)
@@ -439,17 +605,54 @@ def add_name(request, run_name): + """ + Adds a name to the results of a calculated method of the run. The name can be used + to identify the results and use them later.
+ + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: the rendered detail page of the run + :rtype: HttpResponse + """ run = active_runs[run_name] run.name_step(int(request.POST["index"]), request.POST["name"]) return HttpResponseRedirect(reverse("runs:detail", args=(run_name,))) def results_exist(request, run_name): + """ + Checks if the results of the run exist. This is used to determine if the Next button + should be enabled or not. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a JSON response with a boolean value + :rtype: JsonResponse + """ run = active_runs[run_name] return JsonResponse(dict(results_exist=run.result_df is not None)) def all_button_parameters(request, run_name): + """ + Returns all parameters that are needed to render the buttons as enabled or disabled + in the run detail page. + See ui/runs/templates/runs/form_buttons.html for detailed documentation. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a JSON response with the parameters + :rtype: JsonResponse + """ run = active_runs[run_name] d = dict() d["current_plot_parameters"] = run.current_plot_parameters.get(run.method, {}) @@ -468,12 +671,36 @@ def all_button_parameters(request, run_name): def outputs_of_step(request, run_name): + """ + Returns the output keys of a named step of the run. This is used to determine which + parameters can be used as input for future steps. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a JSON response with the output keys + :rtype: JsonResponse + """ run = active_runs[run_name] step_name = request.POST["step_name"] return JsonResponse(run.history.output_keys_of_named_step(step_name), safe=False) def download_plots(request, run_name): + """ + Downloads all plots of the current method in the run. If multiple plots are created, + they are zipped together. The format of the plots is specified in the request. + + :param request: the request object + :type request: HttpRequest + :param run_name: the name of the run + :type run_name: str + + :return: a FileResponse with the plots + :rtype: FileResponse + """ run = active_runs[run_name] format_ = request.GET["format"] exported = run.export_plots(format_=format_)