Skip to content

Commit

Permalink
GSEA supports blitzGSEA
Browse files Browse the repository at this point in the history
  • Loading branch information
yihming committed May 29, 2024
1 parent 6ca9763 commit 6d1dc3a
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 37 deletions.
4 changes: 2 additions & 2 deletions docs/api/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@ Differential Expression and Gene Set Enrichment Analysis
de_analysis
markers
write_results_to_excel
fgsea
write_fgsea_results_to_excel
gsea
write_gsea_results_to_excel

Annotate clusters
-------------------
Expand Down
4 changes: 2 additions & 2 deletions pegasus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@
clone_subset,
pseudobulk,
deseq2,
fgsea,
write_fgsea_results_to_excel,
gsea,
write_gsea_results_to_excel,
run_scvi,
train_scarches_scanvi,
predict_scarches_scanvi,
Expand Down
8 changes: 4 additions & 4 deletions pegasus/plotting/plot_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -2381,7 +2381,7 @@ def _make_one_gsea_plot(df, ax, color, size=10, fontsize=5):

def plot_gsea(
data: Union[MultimodalData, UnimodalData],
gsea_keyword: Optional[str] = "fgsea_out",
gsea_keyword: Optional[str] = "gsea_out",
alpha: Optional[float] = 0.1,
top_n: Optional[int] = 20,
panel_size: Optional[Tuple[float, float]] = (6, 4),
Expand All @@ -2397,8 +2397,8 @@ def plot_gsea(
data : ``UnimodalData`` or ``MultimodalData`` object
The main data object.
gsea_keyword: ``str``, optional, default: ``"fgsea_out"``
Keyword in data.uns that stores the fGSEA results in pandas data frame.
gsea_keyword: ``str``, optional, default: ``"gsea_out"``
Keyword in data.uns that stores the GSEA results in pandas data frame.
alpha: ``float``, optional, default: ``0.1``
False discovery rate threshold.
top_n: ``int``, optional, default: ``20``
Expand All @@ -2421,7 +2421,7 @@ def plot_gsea(
Examples
--------
>>> fig = pg.plot_gsea(data, 'fgsea_out', dpi = 500)
>>> fig = pg.plot_gsea(data, 'gsea_out', dpi = 500)
"""
df = data.uns[gsea_keyword]
df = df[df['padj'] <= alpha].copy()
Expand Down
2 changes: 1 addition & 1 deletion pegasus/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
from .doublet_detection import infer_doublets, mark_doublets
from .nmf import nmf, integrative_nmf
from .pseudobulk import pseudobulk, deseq2
from .fgsea import fgsea, write_fgsea_results_to_excel
from .gsea import gsea, write_gsea_results_to_excel
from .scvitools import (
run_scvi,
train_scarches_scanvi,
Expand Down
175 changes: 149 additions & 26 deletions pegasus/tools/gsea.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,65 +4,179 @@

import pandas as pd

from pegasus.tools import predefined_pathways, load_signatures_from_file
from pegasus.tools import predefined_pathways, load_signatures_from_file, eff_n_jobs
from pegasusio import MultimodalData, UnimodalData, timer
from typing import Union, Optional


@timer(logger=logger)
def fgsea(
def gsea(
data: Union[MultimodalData, UnimodalData],
log2fc_key: str,
pathways: str,
de_key: Optional[str] = "de_res",
minSize: Optional[int] = 15,
maxSize: Optional[int] = 500,
nproc: Optional[int] = 0,
seed: Optional[int] = 0,
fgsea_key: Optional[str] = "fgsea_out",
de_key: str = "de_res:stat",
method: str = "blitzgsea",
gsea_key: str = "gsea_out",
min_size: int = 5,
max_size: int = 4000,
n_jobs: int = 1,
seed: int = 0,
**kwargs,
) -> None:
"""Perform Gene Set Enrichment Analysis using fGSEA. This function calls R package fGSEA, requiring fGSEA in R installed.
"""Perform Gene Set Enrichment Analysis (GSEA).
Parameters
----------
data: Union[``MultimodalData``, ``UnimodalData``]
Single-cell or pseudo-bulk data.
log2fc_key: ``str``
Key in pre-computed DE results representing log2 fold change.
rank_key: ``str``, optional, default: ``stat``
Key in pre-computed DE results representing gene ranks.
By default, use the test statistics in the pre-computed DE results.
pathways: ``str``
Either a string or a path to the gene set file in GMT format. If string, choosing from "hallmark" and "canonical_pathways" (MSigDB H and C2/CP).
de_key: ``str``, optional, default: ``"de_res"``
Key name of DE analysis results stored. data.varm[de_key] should contain a record array of DE results.
backend: ``str``, optional, default: ``blitzgsea``
Specify which package to use as the backend for GSEA.
By default, use ``blitzgsea`` which is a pure Python fast implementation of the pre-rank GSEA algorithm.
Alternatively, if specify ``fgsea``, then use R package ``fgsea``, which requires ``rpy2`` and R installation.
de_key: ``str``, optional, default: ``de_res:stat``
Key name in the DE results to be used for gene signatures.
Specify in format ``df_key:attr``, where ``df_key`` must exist in ``data.varm``, and ``attr`` must exist in columns of ``data.varm[df_key]``.
minSize: ``int``, optional, default: ``15``
minSize: ``int``, optional, default: ``5``
Minimal size of a gene set to consider.
maxSize: ``int``, optional, default: ``500``
maxSize: ``int``, optional, default: ``4000``
Maximal size of a gene set to consider.
nproc: ``int``, optional, default: ``0``
Numbr of processes for parallel computation. If nproc > 0, set BPPARAM.
n_jobs: ``int``, optional, default: ``1``
Number of threads for parallel computation.
If ``method='fgsea'``, set ``n_jobs=0`` by default to use one thread. If ``n_jobs>0``, set BPPARAM.
seed: ``int``, optional, default: ``0``
Random seed to make sure fGSEA results are reproducible.
fgsea_key: ``str``, optional, default: ``"fgsea_out"``
gsea_key: ``str``, optional, default: ``"gsea_out"``
Key to use to store fGSEA results as a data frame.
Returns
-------
``None``
Update ``data.uns``:
``data.uns[fgsea_key]``: fGSEA outputs sorted by padj.
``data.uns[gsea_key]``: GSEA outputs sorted by ``padj``.
Examples
--------
>>> pg.fgsea(data, '3:log2FC', hallmark', fgsea_key='fgsea_res')
>>> pg.gsea(data, "de_res:stat", "hallmark", gsea_key='gsea_res')
"""
if method == "blitzgsea":
# center default is False, unless explicitly specified by users
center = False
if 'center' in kwargs:
center = kwargs['center']
del kwargs['center']

_run_blitzgsea(
data=data,
de_key=de_key,
pathways=pathways,
gsea_key=gsea_key,
min_size=min_size,
max_size=max_size,
n_jobs=n_jobs,
seed=seed,
center=center,
**kwargs,
)
else:
_run_fgsea(
data=data,
de_key=de_key,
pathways=pathways,
gsea_key=gsea_key,
minSize=min_size,
maxSize=max_size,
nproc=n_jobs,
seed=seed,
)


def _split_de_key(de_key):
assert ":" in de_key, f"Key '{de_key}' does not satisfy format 'df_key:attr'!"
keys = de_key.split(":")
df_key = keys[0]
rank_key = ":".join(keys[1:])

return df_key, rank_key


def _run_blitzgsea(
data,
de_key,
pathways,
gsea_key,
min_size,
max_size,
n_jobs,
seed,
center,
**kwargs,
) -> None:
try:
import blitzgsea as blitz
except ModuleNotFoundError as e:
import sys
logger.error(f"{e}\nNeed blitzgsea! Try 'pip install blitzgsea'.")
sys.exit(-1)

df_key, rank_key = _split_de_key(de_key)
assert df_key in data.varm, f"Key '{df_key}' not in data.varm! Wrong key name or run DE analysis beforehand!"
assert rank_key in data.varm[df_key].dtype.names, f"Key '{rank_key}' not in DE result! Wrong key name specified!"

rank_df = pd.DataFrame(data.varm[df_key], index=data.var_names)[[rank_key]].reset_index()
rank_df.columns = [0, 1]
rank_df = rank_df.sort_values(by=1, ascending=False)

library = load_signatures_from_file(predefined_pathways.get(pathways, pathways))

n_jobs = eff_n_jobs(n_jobs)

result = blitz.gsea(
signature=rank_df,
library=library,
min_size=min_size,
max_size=max_size,
seed=seed,
processes=n_jobs,
center=center,
**kwargs,
)
# Rename column fdr to padj
result = result.reset_index()
result.rename(columns={
"Term": "pathway",
"fdr": "padj",
"nes": "NES",
"es": "ES",
}, inplace=True)
result.sort_values('padj', inplace=True)

data.uns[gsea_key] = result


def _run_fgsea(
data: Union[MultimodalData, UnimodalData],
de_key: str,
pathways: str,
gsea_key: str,
minSize: int,
maxSize: int,
nproc: int,
seed: int,
) -> None:
try:
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
Expand All @@ -88,12 +202,21 @@ def fgsea(
logger.error(text)
sys.exit(-1)

nproc = eff_n_jobs(nproc)
if nproc == 1:
nproc = 0 # Avoid BPPARAM to just use 1 core

ro.r(f"set.seed({seed})")
pwdict = load_signatures_from_file(predefined_pathways.get(pathways, pathways))
pathways_r = ro.ListVector(pwdict)
log2fc = ro.FloatVector(data.varm[de_key][log2fc_key])
log2fc.names = ro.StrVector(data.var_names)
res = fgsea.fgsea(pathways_r, log2fc, minSize=minSize, maxSize=maxSize, nproc=nproc)

df_key, rank_key = _split_de_key(de_key)
assert df_key in data.varm, f"Key '{df_key}' not in data.varm! Wrong key name or run DE analysis beforehand!"
assert rank_key in data.varm[df_key].dtype.names, f"Key '{rank_key}' not in DE result! Wrong key name specified!"

rank_vec = ro.FloatVector(data.varm[df_key][rank_key])
rank_vec.names = ro.StrVector(data.var_names)
res = fgsea.fgsea(pathways_r, rank_vec, minSize=minSize, maxSize=maxSize, nproc=nproc)
unlist = ro.r(
"""
function(df) {
Expand All @@ -105,11 +228,11 @@ def fgsea(
with localconverter(ro.default_converter + pandas2ri.converter):
res_df = ro.conversion.rpy2py(unlist(res))
res_df.sort_values("padj", inplace=True)
data.uns[fgsea_key] = res_df
data.uns[gsea_key] = res_df


@timer(logger=logger)
def write_fgsea_results_to_excel(
def write_gsea_results_to_excel(
data: Union[MultimodalData, UnimodalData],
output_file: str,
fgsea_key: Optional[str] = "fgsea_out",
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@
torch=["torch", "harmony-pytorch", "nmf-torch"],
forceatlas=["forceatlas2-python"],
mkl=["mkl"],
rpy2=["rpy2"],
scvi=["scvi-tools"],
all=["fitsne", "louvain", "scanorama", "torch", "harmony-pytorch", "nmf-torch", "rpy2", "forceatlas2-python", "scvi-tools"]
pseudo=["pydeseq2", "blitzgsea"],
all=["fitsne", "louvain", "scanorama", "torch", "harmony-pytorch", "nmf-torch", "forceatlas2-python", "scvi-tools", "pydeseq2", "blitzgsea"],
),
python_requires="~=3.8",
package_data={
Expand Down

0 comments on commit 6d1dc3a

Please sign in to comment.