Skip to content

Commit

Permalink
Merge branch 'mims-harvard:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
abearab authored Mar 6, 2024
2 parents 7c896b9 + 74d2702 commit 7434782
Show file tree
Hide file tree
Showing 11 changed files with 525 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
no_output_timeout: 30m
command: |
. venv/bin/activate
pytest --ignore=tdc/test/dev_tests/
pytest --ignore=tdc/test/dev_tests/ --ignore=tdc/test/test_resources.py
- store_artifacts:
path: test-reports
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/conda-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ on:
push:
branches:
- main
- avelez-cellxgene-dev
- avelez-dev
- avelez-cellxgene-dev
- '*'
Expand Down
14 changes: 14 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ channels:
- defaults
- pyg
- pytorch
- pyg
- pytorch
dependencies:
- dataclasses=0.8
- fuzzywuzzy=0.18.0
Expand All @@ -14,14 +16,26 @@ dependencies:
- pandas=2.1.4
- pyg=2.5.0
- pytorch=2.2.1
- pyg=2.5.0
- pytorch=2.2.1
- requests=2.31.0
- scikit-learn=1.3.0
- seaborn=0.12.2
- tqdm=4.65.0
- torchaudio=2.2.1
- torchvision=0.17.1
- torchaudio=2.2.1
- torchvision=0.17.1
- pip:
- cellxgene-census==1.10.2
- gget==0.28.4
- pydantic==2.6.3
- gget==0.28.4
- pydantic==2.6.3
- gget==0.28.4
- pydantic==2.6.3
- rdkit==2023.9.5
- tiledbsoma==1.7.2
- yapf==0.40.2
variables:
KMP_DUPLICATE_LIB_OK: "TRUE"
2 changes: 1 addition & 1 deletion requirements_ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ scikit-learn
torch
tqdm
huggingface_hub
dataclasses
dataclasses
Empty file.
78 changes: 78 additions & 0 deletions tdc/cellxgene-census-loaders/cellxgene-census.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import cellxgene_census
from pandas import concat
import tiledbsoma

from tdc import base_dataset
"""
Are we only supporting memory-efficient queries?
https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_quick_start.html#memory-efficient-queries
"""


class CXGDataLoader(base_dataset.DataLoader):
    """Load cell (obs) metadata from a cellxgene-census SOMA collection.

    The query is executed eagerly in the constructor and the resulting pandas
    DataFrame is stored on ``self.df``; retrieve it with ``get_dataframe()``.
    """

    def __init__(self,
                 num_slices=None,
                 census_version="2023-12-15",
                 dataset="census_data",
                 organism="homo_sapiens",
                 measurement_name="RNA",
                 value_filter="",
                 column_names=None):
        """Validate the arguments and immediately run the census query.

        Args:
            num_slices: Number of slices to read. Stored (defaulting to 1) but
                not used by the current eager query; reserved for the
                memory-efficient path described in ``fetch_data``.
            census_version: Census release passed to
                ``cellxgene_census.open_soma``.
            dataset: Top-level key of the opened census collection.
            organism: Key under ``dataset`` selecting the organism.
            measurement_name: Currently unused; reserved for the
                memory-efficient path described in ``fetch_data``.
            value_filter: Filter expression for the obs read. An empty string
                means "no filter".
            column_names: Columns to retrieve. Required.

        Raises:
            ValueError: If ``column_names`` is None.
        """
        # NOTE(review): the base class initializer is not invoked here;
        # confirm base_dataset.DataLoader.__init__ needs no setup.
        if column_names is None:
            raise ValueError("column_names is required for this loader")
        self.column_names = column_names
        self.num_slices = num_slices if num_slices is not None else 1
        self.df = None
        self.fetch_data(census_version, dataset, organism, measurement_name,
                        value_filter)

    def fetch_data(self, census_version, dataset, organism, measurement_name,
                   value_filter):
        """Query the census obs dataframe and store the result in ``self.df``.

        Opens the census at ``census_version``, reads
        ``census[dataset][organism].obs`` restricted to ``self.column_names``
        and ``value_filter``, concatenates the returned slices and converts
        them to a pandas DataFrame.

        Args:
            census_version: Census release passed to
                ``cellxgene_census.open_soma``.
            dataset: Top-level key of the opened census collection.
            organism: Key under ``dataset`` selecting the organism.
            measurement_name: Currently unused (see TODO below).
            value_filter: Filter expression for the obs read. Empty or None
                means "no filter".

        Raises:
            ValueError: If ``self.column_names`` is None.
        """
        if self.column_names is None:
            raise ValueError(
                "Column names must be provided to CXGDataLoader class")

        # The constructor default for value_filter is "", which is not a valid
        # filter expression; pass None so the read is unfiltered instead.
        # (Presumably tiledbsoma only skips filtering on None - TODO confirm.)
        effective_filter = value_filter or None

        with cellxgene_census.open_soma(
                census_version=census_version) as census:
            # Reads SOMADataFrame as a slice
            cell_metadata = census[dataset][organism].obs.read(
                value_filter=effective_filter,
                column_names=self.column_names)
            self.df = cell_metadata.concat().to_pandas()
        # TODO: note that latency on memory-efficient queries is poor. The
        # intended alternative is to build
        # census[dataset][organism].axis_query(
        #     measurement_name=measurement_name,
        #     obs_query=tiledbsoma.AxisQuery(value_filter=value_filter)),
        # iterate query.X("raw").tables(), convert the first self.num_slices
        # slices with to_pandas() and combine them with pandas.concat.

    def get_dataframe(self):
        """Return the DataFrame produced by the most recent ``fetch_data``.

        Raises:
            RuntimeError: If no DataFrame has been fetched yet.
        """
        if self.df is None:
            # RuntimeError subclasses Exception, so callers that catch the
            # previously raised bare Exception keep working.
            raise RuntimeError(
                "Haven't instantiated a DataFrame yet. You can call self.fetch_data first."
            )
        return self.df


if __name__ == "__main__":
    # TODO: tmp, run testing suite when this file is called as main
    # Manual smoke test: fetch a few metadata columns for one filter and
    # print the first rows of the result.
    demo_filter = "tissue == 'brain' and sex == 'male'"
    demo_columns = ["assay", "cell_type", "tissue"]
    cxg_loader = CXGDataLoader(value_filter=demo_filter,
                               column_names=demo_columns)
    print(cxg_loader.get_dataframe().head())
1 change: 1 addition & 0 deletions tdc/resource/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .primekg import PrimeKG
from .cellxgene_census import CensusResource
Loading

0 comments on commit 7434782

Please sign in to comment.