Merge pull request #6 from lzj1769/develop

Develop
pinellolab · Jan 31, 2023 · 25aad67 · 25aad67
2 parents dc41d4f + b66a2ec
commit 25aad67
Show file tree

Hide file tree

Showing 12 changed files with 168 additions and 99 deletions.
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -9,7 +9,7 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.11"
+    python: "3.10"
     # You can also specify other tool versions:
     # nodejs: "19"
     # rust: "1.64"
@@ -18,6 +18,7 @@ build:
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
    configuration: docs/source/conf.py
+   fail_on_warning: true
 
 # If using Sphinx, optionally build your docs in additional formats such as PDF
 # formats:
@@ -26,4 +27,6 @@ sphinx:
 # Optionally declare the Python requirements required to build your docs
 python:
    install:
-   - requirements: docs/source/requirements.txt
+      - requirements: docs/source/requirements.txt
+      - method: pip
+        path: .
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -0,0 +1,40 @@
+API
+=============================
+
+.. automodule:: pychromvar
+   :members:
+
+Preprocessing
+------------------
+
+.. autosummary::
+   :toctree: generated
+
+   get_bg_peaks
+   add_peak_seq
+   add_gc_bias
+
+Motif match
+------------------
+
+.. autosummary::
+   :toctree: generated
+
+   match_motif
+
+Genome
+------------------
+
+.. autosummary::
+   :toctree: generated
+
+   get_genome
+
+Compute deviation
+------------------
+
+.. autosummary::
+   :toctree: generated
+
+   compute_deviations
+   compute_expectation
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -1,4 +1,4 @@
-pychromVAR
+Welcome to pychromVAR's documentation!
 ==============================================================
 
 pychromVAR is a python package for inferring transcription factor binding variability from 
@@ -11,12 +11,25 @@ with `scanpy <https://scanpy.readthedocs.io/en/stable/>`__ and
 
 For more methodological details, please refer to the original `paper <https://www.nature.com/articles/nmeth.4401>`__.
 
+Installation
+============
+
+**pychromVAR** requires Python version >= 3.8 to run.
+
+PyPI
+----
+**pychromVAR** is available on PyPI and can be installed with pip:
+
+.. code-block:: console
+
+   pip install pychromvar
+
 .. toctree::
-   :caption: mail
+   :caption: pychromvar
    :maxdepth: 1
    :hidden:
-
-   installation
+   
+   api
 
 .. toctree::
    :caption: notebooks

diff --git a/docs/source/installation.rst b/docs/source/installation.rst
diff --git a/docs/source/requirements.txt b/docs/source/requirements.txt
@@ -1,5 +1,4 @@
 numpydoc
 nbsphinx
 ipython
-scikit-learn
 skranger
diff --git a/pychromvar/__init__.py b/pychromvar/__init__.py
@@ -1,8 +1,8 @@
 __version__ = "0.0.3"
 __version_info__ = tuple([int(num) for num in __version__.split('.')])  # noqa: F401
 
-from .preprocessing import *
+from .preprocessing import get_bg_peaks, add_gc_bias, add_peak_seq
 from .match_motif import match_motif
-from .compute_deviations import compute_deviations
+from .compute_deviations import compute_deviations, compute_expectation
 from .get_genome import get_genome
 
diff --git a/pychromvar/compute_deviations.py b/pychromvar/compute_deviations.py
@@ -12,15 +12,20 @@
     datefmt='%Y-%m-%d %H:%M:%S')
 
 
-def compute_deviations(data: Union[AnnData, MuData], n_jobs=-1):
-    """
-    Compute raw and bias-corrected deviations.
-
-    Args:
-        data (Union[AnnData, MuData]): 
-            AnnData object with peak counts or MuData object with 'atac' modality.
-        n_jobs:
-            Number of cpus used for motif matching. If set to -1, all cpus will be used. Default: -1
+def compute_deviations(data: Union[AnnData, MuData], n_jobs=-1) -> AnnData:
+    """Compute raw and bias-corrected deviations.
+
+    Parameters
+    ----------
+    data : Union[AnnData, MuData]
+        AnnData object with peak counts or MuData object with 'atac' modality.
+    n_jobs : int, optional
+        Number of cpus used for computing background deviations. If set to -1, all cpus will be used. Default: -1.
+
+    Returns
+    -------
+    AnnData
+        An AnnData object containing estimated deviations.
     """
 
     if isinstance(data, AnnData):
@@ -60,7 +65,8 @@ def compute_deviations(data: Union[AnnData, MuData], n_jobs=-1):
     if n_jobs == 1:
         for i in range(n_bg_peaks):
             bg_peak_idx = adata.varm['bg_peaks'][:, i]
-            bg_motif_match = adata.varm['motif_match'][bg_peak_idx, :].transpose()
+            bg_motif_match = adata.varm['motif_match'][bg_peak_idx, :].transpose(
+            )
             bg_dev[i, :, :] = _compute_dev((bg_motif_match, adata.X.transpose(),
                                             expectation.transpose())).transpose()
 
@@ -69,8 +75,10 @@ def compute_deviations(data: Union[AnnData, MuData], n_jobs=-1):
         arguments_list = list()
         for i in range(n_bg_peaks):
             bg_peak_idx = adata.varm['bg_peaks'][:, i]
-            bg_motif_match = adata.varm['motif_match'][bg_peak_idx, :].transpose()
-            arguments = (bg_motif_match, adata.X.transpose(), expectation.transpose())
+            bg_motif_match = adata.varm['motif_match'][bg_peak_idx, :].transpose(
+            )
+            arguments = (bg_motif_match, adata.X.transpose(),
+                         expectation.transpose())
             arguments_list.append(arguments)
 
         # run the function with multiple cpus
@@ -105,11 +113,17 @@ def _compute_dev(arguments):
 
 def compute_expectation(count: np.array) -> np.array:
     """
-    Compute expetation accessibility per peak and per cell by assuming identical 
-    read probability per peak for each cell with a sequencing depth matched to that cell
-    observed sequencing depth.
-    Args:
-        count (_type_): _description_
+    Compute the expected accessibility per peak and per cell by assuming identical read probability per peak for each cell, with a sequencing depth matched to that cell's observed sequencing depth.
+
+    Parameters
+    ----------
+    count : np.array
+        Count matrix containing raw accessibility data.
+
+    Returns
+    -------
+    np.array
+        Expectation matrix
     """
 
     a = np.sum(count, axis=0, keepdims=True)

diff --git a/pychromvar/get_genome.py b/pychromvar/get_genome.py
@@ -13,17 +13,16 @@
 }
 
 def get_genome(genome:str="hg38", output_dir:str=None):
+    """Download genome
+
+    Parameters
+    ----------
+    genome : str, optional
+        Which genome should be downloaded. Available options are: "hg19", "hg38", "mm10", "mm39". By default "hg38".
+    output_dir : str, optional
+        Output directory. Default: current directory.
     """
-    Download genome
 
-    Args:
-        genome (str, optional): 
-            Which genome should be downloaded. Available options are: "hg19", "hg38", "mm9", "mm10".
-            Defaults to "hg38".
-
-        output_dir (str):
-            Output directory. Default: current directory.
-    """
     assert genome in ["hg19", "hg38", "mm10", "mm39"], f"Cannot find {genome}!"
 
     if not os.path.exists(output_dir):

diff --git a/pychromvar/match_motif.py b/pychromvar/match_motif.py
@@ -13,28 +13,28 @@
 
 def match_motif(data: Union[AnnData, MuData], motifs, pseudocounts=0.0001, p_value=5e-05,
                 background: _BACKGROUND = "even", genome_file: str = None):
-    """
-    Perform motif matching to predict binding sites using MOODS. 
-    This function wraps 
-
-    Args:
-        data (Union[AnnData, MuData]): 
-            AnnData object with peak counts or MuData object with 'atac' modality.
-        motifs: 
-            List of motifs
-        pseudocounts:
-            Pseudocounts for each nucleotide. Default value is 0.0001
-        p_value:
-            P-value threshold for motif matching. Default: 5e-05
-        background:
-            Background distribution of nucleotides for computing thresholds from p-value. 
-            Three options are available: "subject" to use the subject sequences, "genome" to use the
-            whole genome (need to provide a genome file), or even using 0.25 for each base.
-            Default: "subject".
-        genome_file:
-            If background is set to genome, a genome file must be provided. Default: None
-        n_jobs:
-            Number of cpus used for motif matching. If set to -1, all cpus will be used. Default: 1
+    """Perform motif matching to predict binding sites using MOODS. 
+
+    Parameters
+    ----------
+    data : Union[AnnData, MuData]
+        AnnData object with peak counts or MuData object with 'atac' modality.
+    motifs : list
+        List of motifs
+    pseudocounts : float, optional
+        Pseudocounts for each nucleotide, by default 0.0001
+    p_value : float, optional
+        P-value threshold for motif matching, by default 5e-05
+    background : _BACKGROUND, optional
+        Background distribution of nucleotides for computing thresholds from p-value.
+        Three options are available: "subject" to use the subject sequences, "genome" to use the
+        whole genome (need to provide a genome file), or "even" to use 0.25 for each base, by default "even"
+    genome_file : str, optional
+        If background is set to genome, a genome file must be provided, by default None
+
+    Returns
+    -------
+    Update data.
     """
 
     if isinstance(data, AnnData):

diff --git a/pychromvar/preprocessing.py b/pychromvar/preprocessing.py
@@ -9,20 +9,25 @@
 from tqdm import tqdm
 from pynndescent import NNDescent
 
+
 def get_bg_peaks(data: Union[AnnData, MuData], niterations=50, n_jobs=-1):
-    """
-    Find background peaks based on GC bias.
+    """Find background peaks based on GC bias and number of reads per peak
 
-    Args:
-        data (Union[AnnData, MuData]):
-            AnnData object with peak counts or MuData object with 'atac' modality.
-        niterations (int, optional): 
-            Number of background peaks to sample. Defaults to 50.
-        n_jobs:
+    Parameters
+    ----------
+    data : Union[AnnData, MuData]
+        AnnData object with peak counts or MuData object with 'atac' modality
+    niterations : int, optional
+        Number of background peaks to sample, by default 50
+    n_jobs : int, optional
+        Number of cpus for compute. If set to -1, all cpus will be used, by default -1
 
-    Raises:
-        TypeError: _description_
+    Returns
+    -------
+
+    updates `data`.
     """
+
     if isinstance(data, AnnData):
         adata = data
     elif isinstance(data, MuData) and "atac" in data.mod:
@@ -54,18 +59,22 @@ def get_bg_peaks(data: Union[AnnData, MuData], niterations=50, n_jobs=-1):
 
 
 def add_peak_seq(data: Union[AnnData, MuData], genome_file: str, delimiter="-"):
+    """Add the DNA sequence of each peak to data object. 
+    
+    Parameters
+    ----------
+    data : Union[AnnData, MuData]
+        AnnData object with peak counts or MuData object with 'atac' modality.
+    genome_file : str
+        Filename of genome reference
+    delimiter : str, optional
+        Delimiter that separates peaks, by default "-"
+
+    Returns
+    -------
+    Update `data`
     """
-    Add the DNA sequence of each peak to data object. 
-    The sequences will be used in GC bias estimation and motif binding sites matching.
-
-    Args:
-        data (Union[AnnData, MuData]): 
-            AnnData object with peak counts or MuData object with 'atac' modality.
-        genome_file (str): 
-            Filename of genome reference
-        delimiter (str, optional): 
-            Delimiter that separates peaks. Defaults to "-".
-    """
+
 
     if isinstance(data, AnnData):
         adata = data
@@ -86,14 +95,16 @@ def add_peak_seq(data: Union[AnnData, MuData], genome_file: str, delimiter="-"):
 
 
 def add_gc_bias(data: Union[AnnData, MuData]):
-    """
-    Compute GC bias for each peak.
+    """Compute GC bias for each peak.
+
+    Parameters
+    ----------
+    data : Union[AnnData, MuData]
+        AnnData object with peak counts or MuData object with 'atac' modality.
 
-    Args:
-        data (Union[AnnData, MuData]): 
-            AnnData object with peak counts or MuData object with 'atac' modality.
-    Returns:
-        _type_: _description_
+    Returns
+    -------
+    Update data
     """
 
     if isinstance(data, AnnData):

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "scipy",
     "mudata",
     "scanpy",
+    "scikit-learn",
     "muon",
     "biopython",
     "MOODS-python",
@@ -33,4 +34,5 @@ dependencies = [
 ]
 
 [project.urls]
-Source = "https://github.com/lzj1769/pychromVAR"
+Source = "https://github.com/lzj1769/pychromVAR"
+Documentation = "https://pychromvar.readthedocs.io/en/latest/"