Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ship cellxgene #227

Merged
merged 40 commits into from
Mar 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
f16dd06
simple testing on cellxgene api
amva13 Feb 27, 2024
1d479b1
cellxgene census is a resource
amva13 Mar 1, 2024
f2b7caa
make census api a resource
amva13 Mar 2, 2024
8ffa110
mvp cellxgene resource implemented
amva13 Mar 4, 2024
bb83822
implement memory-efficient retrieval of count matrix and update envir…
amva13 Mar 5, 2024
5cc822d
lint all base files
amva13 Mar 5, 2024
b804323
yapf lint google format on tdc/benchmark_group, chem_utils, generatio…
amva13 Mar 5, 2024
13acccd
google lint on resource and single_pred in tdc
amva13 Mar 5, 2024
d0d31e4
google lint on tdc/test
amva13 Mar 5, 2024
4657198
google lint on tdc/utils
amva13 Mar 5, 2024
d638516
google lint tdc base files
amva13 Mar 5, 2024
3285ae4
fix mistake in yields.py
amva13 Mar 5, 2024
d750208
completed google lint. added torch dependencies to conda. tests pass.…
amva13 Mar 5, 2024
657b6ae
mend
amva13 Mar 5, 2024
7321014
mend
amva13 Mar 5, 2024
abfbbbb
mend
amva13 Mar 5, 2024
953720f
mend
amva13 Mar 5, 2024
cc4f545
add YAPF to GH Action
amva13 Mar 5, 2024
5b2f69a
remaining lint errors
amva13 Mar 5, 2024
ef3ecae
simple testing on cellxgene api
amva13 Feb 27, 2024
86f5670
implement memory-efficient retrieval of count matrix and update envir…
amva13 Mar 5, 2024
5ce0996
mend
amva13 Mar 5, 2024
23a0ae3
run conda-tests on all branches
amva13 Mar 5, 2024
4c95fde
completed google lint. added torch dependencies to conda. tests pass.…
amva13 Mar 5, 2024
0eed238
mend
amva13 Mar 5, 2024
d04ad8a
implement memory-efficient retrieval of count matrix and update envir…
amva13 Mar 5, 2024
7e0fc32
mend
amva13 Mar 5, 2024
1ddc828
merge and lint
amva13 Mar 5, 2024
2d4e4f4
decorator for X and feature presence checks
amva13 Mar 5, 2024
87776c9
documentation
amva13 Mar 5, 2024
f7bf2b1
ship cellxgene loader
amva13 Mar 6, 2024
cd65f41
mend
amva13 Mar 6, 2024
88d7e59
mend
amva13 Mar 6, 2024
38c754b
mend
amva13 Mar 6, 2024
0410503
mend
amva13 Mar 6, 2024
5afd1b1
circle-ci problems.. should migrate
amva13 Mar 6, 2024
95c2d19
mend
amva13 Mar 6, 2024
8088dba
Merge remote-tracking branch 'origin' into avelez-cellxgene-dev
amva13 Mar 6, 2024
64abd2a
mend
amva13 Mar 6, 2024
ad00907
mend
amva13 Mar 6, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
no_output_timeout: 30m
command: |
. venv/bin/activate
pytest --ignore=tdc/test/dev_tests/
pytest --ignore=tdc/test/dev_tests/ --ignore=tdc/test/test_resources.py

- store_artifacts:
path: test-reports
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/conda-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ on:
push:
branches:
- main
- avelez-cellxgene-dev
- avelez-dev
- avelez-cellxgene-dev
- '*'
Expand Down
14 changes: 14 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ channels:
- defaults
- pyg
- pytorch
- pyg
- pytorch
dependencies:
- dataclasses=0.8
- fuzzywuzzy=0.18.0
Expand All @@ -14,14 +16,26 @@ dependencies:
- pandas=2.1.4
- pyg=2.5.0
- pytorch=2.2.1
- pyg=2.5.0
- pytorch=2.2.1
- requests=2.31.0
- scikit-learn=1.3.0
- seaborn=0.12.2
- tqdm=4.65.0
- torchaudio=2.2.1
- torchvision=0.17.1
- torchaudio=2.2.1
- torchvision=0.17.1
- pip:
- cellxgene-census==1.10.2
- gget==0.28.4
- pydantic==2.6.3
- gget==0.28.4
- pydantic==2.6.3
- gget==0.28.4
- pydantic==2.6.3
- rdkit==2023.9.5
- tiledbsoma==1.7.2
- yapf==0.40.2
variables:
KMP_DUPLICATE_LIB_OK: "TRUE"
2 changes: 1 addition & 1 deletion requirements_ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ scikit-learn
torch
tqdm
huggingface_hub
dataclasses
dataclasses
Empty file.
78 changes: 78 additions & 0 deletions tdc/cellxgene-census-loaders/cellxgene-census.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import cellxgene_census
from pandas import concat
import tiledbsoma

from tdc import base_dataset
"""

Open question: are we only supporting memory-efficient queries? See
https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_quick_start.html#memory-efficient-queries


"""


class CXGDataLoader(base_dataset.DataLoader):
    """Load cell metadata from a CELLxGENE Census release into pandas.

    The query is executed eagerly when the loader is constructed; the
    resulting DataFrame is kept on ``self.df`` and returned by
    ``get_dataframe``.
    """

    def __init__(self,
                 num_slices=None,
                 census_version="2023-12-15",
                 dataset="census_data",
                 organism="homo_sapiens",
                 measurement_name="RNA",
                 value_filter="",
                 column_names=None):
        """Configure the loader and immediately fetch the requested metadata.

        Args:
            num_slices: Number of slices to read; defaults to 1 when None.
                Stored on the instance but not consumed by the current
                (non-sliced) read path.
            census_version: Census release passed to
                ``cellxgene_census.open_soma``.
            dataset: Top-level key looked up in the opened census object.
            organism: Key looked up under ``dataset``.
            measurement_name: Forwarded to ``fetch_data``; not used by the
                current read path.
            value_filter: Filter expression applied to the ``obs`` read.
                An empty string means "no filter".
            column_names: Columns to retrieve from ``obs``. Required.

        Raises:
            ValueError: If ``column_names`` is None.
        """
        if column_names is None:
            raise ValueError("column_names is required for this loader")
        self.column_names = column_names
        self.num_slices = 1 if num_slices is None else num_slices
        self.df = None
        self.fetch_data(census_version, dataset, organism, measurement_name,
                        value_filter)

    def fetch_data(self, census_version, dataset, organism, measurement_name,
                   value_filter):
        """Query the census ``obs`` table and store the result on ``self.df``.

        Args:
            census_version: Census release passed to
                ``cellxgene_census.open_soma``.
            dataset: Top-level key looked up in the opened census object.
            organism: Key looked up under ``dataset``.
            measurement_name: Accepted for interface stability; unused by
                the current read path.
            value_filter: Filter expression for the read. Empty or None
                means no filtering.

        Raises:
            ValueError: If ``self.column_names`` is None.
        """
        if self.column_names is None:
            raise ValueError(
                "Column names must be provided to CXGDataLoader class")

        # An empty string is not a usable filter expression; hand the reader
        # None so that "no filter" is expressed explicitly.
        effective_filter = value_filter or None

        with cellxgene_census.open_soma(
                census_version=census_version) as census:
            # Reads SOMADataFrame as a slice
            cell_metadata = census[dataset][organism].obs.read(
                value_filter=effective_filter,
                column_names=self.column_names)
            # Materialise inside the `with` block, while the census handle
            # is still open.
            self.df = cell_metadata.concat().to_pandas()
        # TODO: note that latency on memory-efficient queries is poor.
        # The intended sliced path is: build
        # census[dataset][organism].axis_query(measurement_name=...,
        # obs_query=tiledbsoma.AxisQuery(value_filter=...)), iterate
        # query.X("raw").tables(), take self.num_slices slices, convert each
        # with to_pandas() and concatenate them into self.df.

    def get_dataframe(self):
        """Return the DataFrame produced by the most recent ``fetch_data``.

        Raises:
            RuntimeError: If no DataFrame has been fetched yet.
        """
        if self.df is None:
            raise RuntimeError(
                "Haven't instantiated a DataFrame yet. You can call self.fetch_data first."
            )
        return self.df


if __name__ == "__main__":
    # TODO: tmp, run testing suite when this file is called as main
    # Ad-hoc smoke test: fetch a few metadata columns for male brain cells
    # and show the first rows.
    smoke_filter = "tissue == 'brain' and sex == 'male'"
    smoke_columns = ["assay", "cell_type", "tissue"]
    loader = CXGDataLoader(value_filter=smoke_filter,
                           column_names=smoke_columns)
    df = loader.get_dataframe()
    print(df.head())
1 change: 1 addition & 0 deletions tdc/resource/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .primekg import PrimeKG
from .cellxgene_census import CensusResource
Loading
Loading