Docstrings (#289)
* correct docstring syntax of data_analysis package

* Fix docstring syntax in data_integration package

* Fix docstring syntax in utilities package

* Fix more docstring syntax

* format code

* add docstrings for database_integration.py

* add returns for database_download.py

* add docstrings for database_query.py

* add docstrings for upload_handler.py

* docstrings runs/views part1

* docstrings 2 runs/views.py

* docstrings for runs/fields.py

* docstrings for clustergram copy

* docstrings for data_analysis/dimension_reduction.py

* docstrings for data_analysis/classification.py

* add __init__.py

* Fix run docstring

* Add docs build instructions and packages

* update docs build instructions

* Implement review suggestions

* remove \ in data_preprocessing docstrings

* remove more \ in docstrings

* adopt PR suggestions

---------

Co-authored-by: Fynn <[email protected]>
Co-authored-by: BelanaZ <[email protected]>
Co-authored-by: Sara Grau <[email protected]>
Co-authored-by: antonneubauer <[email protected]>
5 people authored Nov 25, 2023
1 parent 5bd8231 commit 67c3a6f
Showing 48 changed files with 1,048 additions and 292 deletions.
20 changes: 20 additions & 0 deletions docs/Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
8 changes: 8 additions & 0 deletions docs/build_docs.md
@@ -0,0 +1,8 @@
## Build docs with Sphinx ##
- Docs are built with sphinx-autoapi
- After installing the packages in requirements.txt, all dependencies needed to build the docs should be present (sphinx==7.2.6, sphinx-autoapi==3.0.0, requests==2.31.0)
- To build the docs, open the docs/ folder in a terminal and run "make html" to create the HTML documentation
- If the error "Could not import extension sphinx.builders.linkcheck" occurs, try reinstalling Python requests (pip install requests==2.31.0)
- Warnings might occur; they usually do not prevent a successful build of the docs
- To view the docs, open index.html in the docs/build/html folder
- When adding docstrings to the code, they should follow the correct syntax (https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html) so that they are formatted correctly in the generated documentation; a sketch follows below
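For reference, a docstring written in this syntax looks like the following minimal sketch (the function and its parameters are hypothetical, not taken from the PROTzilla codebase):

```python
def scale_intensities(intensity_df, factor=1.0):
    """
    Scale every intensity value in a dataframe by a constant factor.

    :param intensity_df: dataframe containing the intensity values
    :type intensity_df: pd.DataFrame
    :param factor: multiplier applied to every intensity value
    :type factor: float
    :return: a dataframe with the scaled intensities
    :rtype: pd.DataFrame
    """
    return intensity_df * factor
```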
35 changes: 35 additions & 0 deletions docs/make.bat
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)

if "%1" == "" goto help

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
27 changes: 27 additions & 0 deletions docs/source/conf.py
@@ -0,0 +1,27 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = "PROTzilla"
copyright = "2023, BP22/23"
author = "BP22/23"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = ["sphinx.ext.napoleon", "autoapi.extension"]
autoapi_dirs = ["../../"]

templates_path = ["_templates"]
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = "alabaster"
html_static_path = ["_static"]
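sphinx-autoapi recursively documents everything under the directories in `autoapi_dirs`, so pointing it at the repository root covers all packages at once. If the build ever picks up too much, the extension also accepts ignore patterns; a sketch of options one might add here (assumptions, not part of this commit):

```python
# Hypothetical additions to conf.py: narrow what sphinx-autoapi documents.
autoapi_type = "python"  # explicit, though "python" is already the default
autoapi_ignore = ["*/tests/*", "*/migrations/*"]  # glob patterns to skip
```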
20 changes: 20 additions & 0 deletions docs/source/index.rst
@@ -0,0 +1,20 @@
.. PROTzilla documentation master file, created by
   sphinx-quickstart on Wed Sep 27 18:24:09 2023.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to PROTzilla's documentation!
=====================================

.. toctree::
   :maxdepth: 2
   :caption: Contents:



Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
17 changes: 7 additions & 10 deletions protzilla/constants/location_mapping.py
@@ -20,11 +20,9 @@
)
from ..importing import metadata_import, ms_data_import, peptide_import

"""
In this data structure, a method is associated with a location. The location is
determined by the section, step, and method keys found in the workflow_meta
file that correspond to the method.
"""
# In this data structure, a method is associated with a location. The location is
# determined by the section, step, and method keys found in the workflow_meta
# file that correspond to the method.
method_map = {
(
"importing",
@@ -247,11 +245,10 @@
# reversed mapping of method callable and location
location_map = {v: k for k, v in method_map.items()}

"""
In this data structure, a plot for a given method is associated with a
location. The location is determined by the section, step, and method keys
found in the workflow_meta file that correspond to the method.
"""

# In this data structure, a plot for a given method is associated with a
# location. The location is determined by the section, step, and method keys
# found in the workflow_meta file that correspond to the method.
plot_map = {
(
"data_preprocessing",
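To make the shape of these mappings concrete, a hedged sketch of a lookup (the key values are illustrative, not copied from the real tables):

```python
# Hypothetical lookup: resolve the callable registered for a workflow location.
location = ("importing", "ms_data_import", "max_quant_import")  # example key
method_callable = method_map[location]

# location_map is the reverse view, mapping a callable back to the
# (section, step, method) location it occupies in the workflow_meta file.
assert location_map[method_callable] == location
```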
63 changes: 53 additions & 10 deletions protzilla/data_analysis/classification.py
@@ -107,33 +107,33 @@ def random_forest(
:param metadata_df: A separate dataframe containing additional metadata information.
:type metadata_df: pd.DataFrame
:param labels_column: The column name in the `metadata_df` dataframe that contains
-the target variable (labels) for classification.
+    the target variable (labels) for classification.
:type labels_column: str
:param train_test_split: The proportion of data to be used for testing. Default is
-0.2 (80-20 train-test split).
+    0.2 (80-20 train-test split).
:type train_test_split: int, optional
:param n_estimators: The number of decision trees to be used in the random forest.
:type n_estimators: int, optional
:param criterion: The impurity measure used for tree construction.
:type criterion: str, optional
:param max_depth: The maximum depth of the decision trees. If not specified (None),
-the trees will expand until all leaves are pure or contain minimum samples per leaf.
+    the trees will expand until all leaves are pure or contain minimum samples per leaf.
:type max_depth: int or None, optional
:param bootstrap: Whether bootstrap samples should be used when building trees.
:type bootstrap: bool, optional
:param random_state: The random seed for reproducibility.
-:type random_state: int, optional
+:type random_state: int
:param model_selection: The model selection method for hyperparameter tuning.
-:type model_selection: str, optional
+:type model_selection: str
:param validation_strategy: The strategy for model validation.
-:type validation_strategy: str, optional
+:type validation_strategy: str
:param scoring: The scoring metric(s) used to evaluate the model's performance
-during validation.
-:type scoring: list[str], optional
+    during validation.
+:type scoring: list[str]
:param **kwargs: Additional keyword arguments to be passed to the function.
:return: A RandomForestClassifier instance, a dataframe consisting of the model's
-training parameters and the validation score, along with four dataframes containing
-the respective test and training samples and labels.
+    training parameters and the validation score, along with four dataframes
+    containing the respective test and training samples and labels.
:rtype: dict
"""
@@ -215,6 +215,49 @@ def svm(
scoring: list[str] = ["accuracy"],
**kwargs,
):
"""
Perform classification using the support vector machine classifier from sklearn.
:param input_df: The dataframe that should be classified in wide or long format
:type input_df: pd.DataFrame
:param metadata_df: A separate dataframe containing additional metadata information.
:type metadata_df: pd.DataFrame
:param labels_column: The column name in the `metadata_df` dataframe that contains
the target variable (labels) for classification.
:type labels_column: str
:param C: Regularization parameter
:type C: float
:param kernel: Specifies the kernel type.
:type kernel: str, optional
:param gamma: Kernel coefficient (default: 'scale', relevant for 'rbf', 'poly', and
'sigmoid').
:type gamma: str
:param coef0: Independent term in the kernel function (relevant for 'poly' and
'sigmoid').
:type coef0: float
:param probability: Whether to enable probability estimates
:type probability: bool, optional
:param tol: Tolerance for stopping criterion
:type tol: float
:param class_weight: Weights associated with classes
:type class_weight: float
:param max_iter: Maximum number of iterations (default: -1, indicating no limit).
:type max_iter: int
:param random_state: The random seed for reproducibility.
:type random_state: int
:param model_selection: The model selection method for hyperparameter tuning.
:type model_selection: str
:param validation_strategy: The strategy for model validation.
:type validation_strategy: str
:param scoring: The scoring metric(s) used to evaluate the model's performance
during validation.
:type scoring: list[str]
:param **kwargs: Additional keyword arguments to be passed to the function.
:return: A dict containing: a SVC instance, a dataframe consisting of the model's
training parameters and the validation score, along with four dataframes
containing the respective test and training samples and labels.
:rtype: dict
"""
# TODO 216 add warning to user that data should be shuffled, given that it is being sorted at the beginning!

input_df_wide = long_to_wide(input_df) if is_long_format(input_df) else input_df
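Since `random_forest` and `svm` share the same calling convention, one usage sketch covers both (the file paths and the label column name are assumptions, and the docstrings above do not fix the returned dict's key names, so inspect the result before relying on any particular key):

```python
import pandas as pd

from protzilla.data_analysis.classification import random_forest, svm

# Hypothetical inputs: a protein intensity dataframe plus matching metadata.
intensity_df = pd.read_csv("intensities.csv")  # placeholder path
metadata_df = pd.read_csv("metadata.csv")      # placeholder path

rf_out = random_forest(intensity_df, metadata_df, labels_column="Group")
svm_out = svm(intensity_df, metadata_df, labels_column="Group")

# Each call returns a dict holding the trained model, a model-evaluation
# dataframe, and the four train/test sample and label dataframes.
print(rf_out.keys(), svm_out.keys())
```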
42 changes: 22 additions & 20 deletions protzilla/data_analysis/clustering.py
@@ -30,40 +30,40 @@ def k_means(
**kwargs,
):
"""
-A method that uses k-means to partition a number of samples in k clusters. The \
-function returns a dataframe with the corresponding cluster of each sample and \
+A method that uses k-means to partition a number of samples in k clusters. The
+function returns a dataframe with the corresponding cluster of each sample and
another dataframe with the coordinates of the cluster centers.
:param input_df: The dataframe that should be clustered in wide or long format
:type input_df: pd.DataFrame
:param metadata_df: A separate dataframe containing additional metadata information.
:type metadata_df: pd.DataFrame
:param labels_column: The column name in the `metadata_df` dataframe that contains
-the true labels of the data
+    the true labels of the data
:type labels_column: str
:param positive_label: The positive label for clustering.
:type positive_label: str
:param model_selection: The model selection method for hyperparameter tuning.
:type model_selection: str
:param scoring: The scoring metric(s) used for model evaluation.
:type scoring: list[str]
-:param n_clusters: the number of clusters to form as well as the number of \
-centroids to generate.
+:param n_clusters: the number of clusters to form as well as the number of
+    centroids to generate.
:type n_clusters: int
:param random_state: Determines random number generation for centroid initialization
:type random_state: int
-:param init_centroid_strategy: method for centroid initialization. Possible methods\
-are: k-means++ and random
+:param init_centroid_strategy: method for centroid initialization. Possible methods
+    are: k-means++ and random
:type init_centroid_strategy: str
-:param n_init: Number of times the k-means algorithm is run with different centroid\
-seeds.
+:param n_init: Number of times the k-means algorithm is run with different centroid
+    seeds.
:type n_init: int
-:param max_iter: Maximum number of iterations of the k-means algorithm for a single\
-run.
+:param max_iter: Maximum number of iterations of the k-means algorithm for a single
+    run.
:type max_iter: int
-:param tolerance: Relative tolerance with regards to Frobenius norm of the \
-difference in the cluster centers of two consecutive iterations to declare\
-convergence.
+:param tolerance: Relative tolerance with regards to Frobenius norm of the
+    difference in the cluster centers of two consecutive iterations to declare
+    convergence.
:type tolerance: float
:returns: A dictionary containing the following elements:
- model: The trained Gaussian Mixture Model.
@@ -171,9 +171,10 @@ def expectation_maximisation(
:param metadata_df: A separate dataframe containing additional metadata information.
:type metadata_df: pd.DataFrame
:param labels_column: The column name in the `metadata_df` dataframe that contains
-the true labels of the data
+    the true labels of the data
:type labels_column: str
:param positive_label: The positive label for clustering.
+:type positive_label: str
:param model_selection: The model selection method for hyperparameter tuning.
:type model_selection: str
:param scoring: The scoring metric(s) used for model evaluation.
@@ -183,17 +184,17 @@
:param covariance_type: The covariance type for the Gaussian Mixture Model.
:type covariance_type: str, optional
:param reg_covar: Non-negative regularization added to the diagonal of covariance
-matrices.
+    matrices.
:type reg_covar: float
:param init_params: The method used to initialize the weights, the means and
-the precisions.
+    the precisions.
:type init_params: str
:param max_iter: The number of EM iterations to perform.
:type max_iter: int, optional
:param random_state: The random seed for reproducibility.
:type random_state: int
:param **kwargs: Additional keyword arguments to be passed to the
-`perform_clustering` function.
+    `perform_clustering` function.
:returns: A dictionary containing the following elements:
- model: The trained Gaussian Mixture Model.
- model_evaluation_df: dataframe consisting of the model's parameters and the
@@ -275,9 +276,10 @@ def hierarchical_agglomerative_clustering(
:param metadata_df: A separate dataframe containing additional metadata information.
:type metadata_df: pd.DataFrame
:param labels_column: The column name in the `metadata_df` dataframe that contains
-the true labels of the data
+    the true labels of the data
:type labels_column: str
:param positive_label: The positive label for clustering.
+:type positive_label: str
:param model_selection: The model selection method for hyperparameter tuning.
:type model_selection: str
:param scoring: The scoring metric(s) used for model evaluation.
@@ -287,7 +289,7 @@
:param metric: Metric used to compute the linkage.
:type metric: str
:param linkage: Which linkage criterion to use. The linkage criterion determines
-which distance to use between sets of observation
+    which distance to use between sets of observation
:type linkage: str
:returns: A dictionary containing the following elements:
- model: The trained Gaussian Mixture Model.
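A matching sketch for the clustering side; the "model" and "model_evaluation_df" keys come from the returns sections above, while the dataframes, the label column, and the assumption that the remaining parameters have defaults are all hypothetical:

```python
import pandas as pd

from protzilla.data_analysis.clustering import k_means

intensity_df = pd.read_csv("intensities.csv")  # placeholder path
metadata_df = pd.read_csv("metadata.csv")      # placeholder path

out = k_means(
    intensity_df,
    metadata_df,
    labels_column="Group",  # assumed metadata column
    n_clusters=3,
    init_centroid_strategy="k-means++",
)

model = out["model"]  # the fitted clustering model
model_evaluation_df = out["model_evaluation_df"]  # parameters and scores
```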
6 changes: 3 additions & 3 deletions protzilla/data_analysis/differential_expression_anova.py
@@ -40,9 +40,9 @@ def anova(
:rtype: pandas DataFrame, dict
:return: a dataframe in typical protzilla long format
-with the differentially expressed proteins and a dict, containing
-the corrected p-values and the log2 fold change, the alpha used
-and the corrected alpha, as well as filtered out proteins.
+    with the differentially expressed proteins and a dict, containing
+    the corrected p-values and the log2 fold change, the alpha used
+    and the corrected alpha, as well as filtered out proteins.
"""
# Check if the grouping variable is present in the metadata_df
assert grouping in metadata_df.columns, f"{grouping} not found in metadata_df"
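The return contract above reads naturally as a two-value unpacking; a sketch under the assumption that `anova` takes the intensity and metadata dataframes positionally (the `grouping` keyword and its membership check against `metadata_df.columns` are confirmed by the hunk above, the "Group" column name is not):

```python
import pandas as pd

from protzilla.data_analysis.differential_expression_anova import anova

intensity_df = pd.read_csv("intensities.csv")  # placeholder path
metadata_df = pd.read_csv("metadata.csv")      # placeholder path

# de_df: long-format dataframe of differentially expressed proteins.
# info: corrected p-values, log2 fold changes, the alpha used, the
# corrected alpha, and the filtered-out proteins.
de_df, info = anova(intensity_df, metadata_df, grouping="Group")
```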
@@ -42,8 +42,8 @@ def linear_model(
:type fc_threshold: float
:return: a dataframe in typical protzilla long format with the differentially expressed
-proteins and a dict, containing the corrected p-values and the log2 fold change (coefficients), the alpha used
-and the corrected alpha, as well as filtered out proteins.
+    proteins and a dict, containing the corrected p-values and the log2 fold change (coefficients), the alpha used
+    and the corrected alpha, as well as filtered out proteins.
:rtype: Tuple[pandas DataFrame, dict]
"""
assert grouping in metadata_df.columns