Skip to content

Commit

Permalink
Merge pull request #6 from lzj1769/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
lzj1769 authored Jan 31, 2023
2 parents dc41d4f + b66a2ec commit 25aad67
Show file tree
Hide file tree
Showing 12 changed files with 168 additions and 99 deletions.
7 changes: 5 additions & 2 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ version: 2
build:
os: ubuntu-22.04
tools:
python: "3.11"
python: "3.10"
# You can also specify other tool versions:
# nodejs: "19"
# rust: "1.64"
Expand All @@ -18,6 +18,7 @@ build:
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/source/conf.py
fail_on_warning: true

# If using Sphinx, optionally build your docs in additional formats such as PDF
# formats:
Expand All @@ -26,4 +27,6 @@ sphinx:
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/source/requirements.txt
- requirements: docs/source/requirements.txt
- method: pip
path: .
40 changes: 40 additions & 0 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
API
=============================

.. automodule:: pychromvar
:members:

Preprocessing
------------------

.. autosummary::
:toctree: generated

get_bg_peaks
add_peak_seq
add_gc_bias

Motif match
------------------

.. autosummary::
:toctree: generated

match_motif

Genome
------------------

.. autosummary::
:toctree: generated

get_genome

Compute deviation
------------------

.. autosummary::
:toctree: generated

compute_deviations
compute_expectation
21 changes: 17 additions & 4 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pychromVAR
Welcome to pychromVAR's documentation!
==============================================================

pychromVAR is a python package for inferring transcription factor binding variability from
Expand All @@ -11,12 +11,25 @@ with `scanpy <https://scanpy.readthedocs.io/en/stable/>`__ and

For more methodological details, please refer to the original `paper <https://www.nature.com/articles/nmeth.4401>`__.

Installation
============

**pychromVAR** requires Python version >= 3.8 to run.

PyPI
----
**pychromVAR** is also available on PyPI:

.. code-block:: console
pip install pychromvar
.. toctree::
:caption: mail
:caption: pychromvar
:maxdepth: 1
:hidden:

installation
api

.. toctree::
:caption: notebooks
Expand Down
12 changes: 0 additions & 12 deletions docs/source/installation.rst

This file was deleted.

1 change: 0 additions & 1 deletion docs/source/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
numpydoc
nbsphinx
ipython
scikit-learn
skranger
4 changes: 2 additions & 2 deletions pychromvar/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
__version__ = "0.0.3"
__version_info__ = tuple([int(num) for num in __version__.split('.')]) # noqa: F401

from .preprocessing import *
from .preprocessing import get_bg_peaks, add_gc_bias, add_peak_seq
from .match_motif import match_motif
from .compute_deviations import compute_deviations
from .compute_deviations import compute_deviations, compute_expectation
from .get_genome import get_genome

48 changes: 31 additions & 17 deletions pychromvar/compute_deviations.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,20 @@
datefmt='%Y-%m-%d %H:%M:%S')


def compute_deviations(data: Union[AnnData, MuData], n_jobs=-1):
"""
Compute raw and bias-corrected deviations.
Args:
data (Union[AnnData, MuData]):
AnnData object with peak counts or MuData object with 'atac' modality.
n_jobs:
Number of cpus used for motif matching. If set to -1, all cpus will be used. Default: -1
def compute_deviations(data: Union[AnnData, MuData], n_jobs=-1) -> AnnData:
"""Compute raw and bias-corrected deviations.
Parameters
----------
data : Union[AnnData, MuData]
AnnData object with peak counts or MuData object with 'atac' modality.
n_jobs : int, optional
Number of cpus used for motif matching. If set to -1, all cpus will be used. Default: -1.
Returns
-------
AnnData
An anndata object containing estimated deviations.
"""

if isinstance(data, AnnData):
Expand Down Expand Up @@ -60,7 +65,8 @@ def compute_deviations(data: Union[AnnData, MuData], n_jobs=-1):
if n_jobs == 1:
for i in range(n_bg_peaks):
bg_peak_idx = adata.varm['bg_peaks'][:, i]
bg_motif_match = adata.varm['motif_match'][bg_peak_idx, :].transpose()
bg_motif_match = adata.varm['motif_match'][bg_peak_idx, :].transpose(
)
bg_dev[i, :, :] = _compute_dev((bg_motif_match, adata.X.transpose(),
expectation.transpose())).transpose()

Expand All @@ -69,8 +75,10 @@ def compute_deviations(data: Union[AnnData, MuData], n_jobs=-1):
arguments_list = list()
for i in range(n_bg_peaks):
bg_peak_idx = adata.varm['bg_peaks'][:, i]
bg_motif_match = adata.varm['motif_match'][bg_peak_idx, :].transpose()
arguments = (bg_motif_match, adata.X.transpose(), expectation.transpose())
bg_motif_match = adata.varm['motif_match'][bg_peak_idx, :].transpose(
)
arguments = (bg_motif_match, adata.X.transpose(),
expectation.transpose())
arguments_list.append(arguments)

# run the function with multiple cpus
Expand Down Expand Up @@ -105,11 +113,17 @@ def _compute_dev(arguments):

def compute_expectation(count: np.array) -> np.array:
"""
Compute expetation accessibility per peak and per cell by assuming identical
read probability per peak for each cell with a sequencing depth matched to that cell
observed sequencing depth.
Args:
count (_type_): _description_
Compute the expected accessibility per peak and per cell, assuming an identical read probability per peak for each cell, with a sequencing depth matched to that cell's observed sequencing depth.
Parameters
----------
count : np.array
Count matrix containing raw accessibility data.
Returns
-------
np.array
Expectation matrix
"""

a = np.sum(count, axis=0, keepdims=True)
Expand Down
17 changes: 8 additions & 9 deletions pychromvar/get_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,16 @@
}

def get_genome(genome:str="hg38", output_dir:str=None):
"""Download genome
Parameters
----------
genome : str, optional
Which genome should be downloaded. Available options are: "hg19", "hg38", "mm10", "mm39". By default "hg38".
output_dir : str, optional
Output directory. Default: current directory.
"""
Download genome

Args:
genome (str, optional):
Which genome should be downloaded. Available options are: "hg19", "hg38", "mm9", "mm10".
Defaults to "hg38".
output_dir (str):
Output directory. Default: current directory.
"""
assert genome in ["hg19", "hg38", "mm10", "mm39"], f"Cannot find {genome}!"

if not os.path.exists(output_dir):
Expand Down
44 changes: 22 additions & 22 deletions pychromvar/match_motif.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,28 @@

def match_motif(data: Union[AnnData, MuData], motifs, pseudocounts=0.0001, p_value=5e-05,
background: _BACKGROUND = "even", genome_file: str = None):
"""
Perform motif matching to predict binding sites using MOODS.
This function wraps
Args:
data (Union[AnnData, MuData]):
AnnData object with peak counts or MuData object with 'atac' modality.
motifs:
List of motifs
pseudocounts:
Pseudocounts for each nucleotide. Default value is 0.0001
p_value:
P-value threshold for motif matching. Default: 5e-05
background:
Background distribution of nucleotides for computing thresholds from p-value.
Three options are available: "subject" to use the subject sequences, "genome" to use the
whole genome (need to provide a genome file), or even using 0.25 for each base.
Default: "subject".
genome_file:
If background is set to genome, a genome file must be provided. Default: None
n_jobs:
Number of cpus used for motif matching. If set to -1, all cpus will be used. Default: 1
"""Perform motif matching to predict binding sites using MOODS.
Parameters
----------
data : Union[AnnData, MuData]
AnnData object with peak counts or MuData object with 'atac' modality.
motifs : list
List of motifs
pseudocounts : float, optional
Pseudocounts for each nucleotide, by default 0.0001
p_value : float, optional
P-value threshold for motif matching, by default 5e-05
background : _BACKGROUND, optional
Background distribution of nucleotides for computing thresholds from p-value.
Three options are available: "subject" to use the subject sequences, "genome" to use the
whole genome (need to provide a genome file), or "even" to use 0.25 for each base, by default "even"
genome_file : str, optional
If background is set to genome, a genome file must be provided, by default None
Returns
-------
Update data.
"""

if isinstance(data, AnnData):
Expand Down
67 changes: 39 additions & 28 deletions pychromvar/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,25 @@
from tqdm import tqdm
from pynndescent import NNDescent


def get_bg_peaks(data: Union[AnnData, MuData], niterations=50, n_jobs=-1):
"""
Find background peaks based on GC bias.
"""Find background peaks based on GC bias and number of reads per peak
Args:
data (Union[AnnData, MuData]):
AnnData object with peak counts or MuData object with 'atac' modality.
niterations (int, optional):
Number of background peaks to sample. Defaults to 50.
n_jobs:
Parameters
----------
data : Union[AnnData, MuData]
AnnData object with peak counts or MuData object with 'atac' modality
niterations : int, optional
Number of background peaks to sample, by default 50
n_jobs : int, optional
Number of cpus for compute. If set to -1, all cpus will be used, by default -1
Raises:
TypeError: _description_
Returns
-------
updates `data`.
"""

if isinstance(data, AnnData):
adata = data
elif isinstance(data, MuData) and "atac" in data.mod:
Expand Down Expand Up @@ -54,18 +59,22 @@ def get_bg_peaks(data: Union[AnnData, MuData], niterations=50, n_jobs=-1):


def add_peak_seq(data: Union[AnnData, MuData], genome_file: str, delimiter="-"):
"""Add the DNA sequence of each peak to data object.
Parameters
----------
data : Union[AnnData, MuData]
AnnData object with peak counts or MuData object with 'atac' modality.
genome_file : str
Filename of genome reference
delimiter : str, optional
Delimiter that separates peaks, by default "-"
Returns
-------
Update `data`
"""
Add the DNA sequence of each peak to data object.
The sequences will be used in GC bias estimation and motif binding sites matching.
Args:
data (Union[AnnData, MuData]):
AnnData object with peak counts or MuData object with 'atac' modality.
genome_file (str):
Filename of genome reference
delimiter (str, optional):
Delimiter that separates peaks. Defaults to "-".
"""


if isinstance(data, AnnData):
adata = data
Expand All @@ -86,14 +95,16 @@ def add_peak_seq(data: Union[AnnData, MuData], genome_file: str, delimiter="-"):


def add_gc_bias(data: Union[AnnData, MuData]):
"""
Compute GC bias for each peak.
"""Compute GC bias for each peak.
Parameters
----------
data : Union[AnnData, MuData]
AnnData object with peak counts or MuData object with 'atac' modality.
Args:
data (Union[AnnData, MuData]):
AnnData object with peak counts or MuData object with 'atac' modality.
Returns:
_type_: _description_
Returns
-------
Update data
"""

if isinstance(data, AnnData):
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dependencies = [
"scipy",
"mudata",
"scanpy",
"scikit-learn",
"muon",
"biopython",
"MOODS-python",
Expand All @@ -33,4 +34,5 @@ dependencies = [
]

[project.urls]
Source = "https://github.com/lzj1769/pychromVAR"
Source = "https://github.com/lzj1769/pychromVAR"
Documentation = "https://pychromvar.readthedocs.io/en/latest/"
Loading

0 comments on commit 25aad67

Please sign in to comment.