Merge branch 'mims-harvard:main' into main

mims-harvard · Mar 6, 2024 · 7434782 · 7434782
2 parents 7c896b9 + 74d2702
commit 7434782
Show file tree

Hide file tree

Showing 11 changed files with 525 additions and 10 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -50,7 +50,7 @@ jobs:
           no_output_timeout: 30m
           command: |
             . venv/bin/activate
-            pytest --ignore=tdc/test/dev_tests/
+            pytest --ignore=tdc/test/dev_tests/ --ignore=tdc/test/test_resources.py
 
       - store_artifacts:
           path: test-reports

diff --git a/.github/workflows/conda-tests.yml b/.github/workflows/conda-tests.yml
@@ -8,6 +8,7 @@ on:
   push:
     branches:
       - main
+      - avelez-cellxgene-dev
       - avelez-dev
       - avelez-cellxgene-dev
       - '*'

diff --git a/environment.yml b/environment.yml
@@ -4,6 +4,8 @@ channels:
   - defaults
   - pyg
   - pytorch
+  - pyg
+  - pytorch
 dependencies:
   - dataclasses=0.8
   - fuzzywuzzy=0.18.0
@@ -14,14 +16,26 @@ dependencies:
   - pandas=2.1.4
   - pyg=2.5.0
   - pytorch=2.2.1
+  - pyg=2.5.0
+  - pytorch=2.2.1
   - requests=2.31.0
   - scikit-learn=1.3.0
   - seaborn=0.12.2
   - tqdm=4.65.0
   - torchaudio=2.2.1
   - torchvision=0.17.1
+  - torchaudio=2.2.1
+  - torchvision=0.17.1
   - pip:
     - cellxgene-census==1.10.2 
+    - gget==0.28.4
+    - pydantic==2.6.3
+    - gget==0.28.4
+    - pydantic==2.6.3
+    - gget==0.28.4
     - pydantic==2.6.3
     - rdkit==2023.9.5
+    - tiledbsoma==1.7.2
     - yapf==0.40.2
+variables:
+  KMP_DUPLICATE_LIB_OK: "TRUE"
diff --git a/requirements_ci.txt b/requirements_ci.txt
@@ -8,4 +8,4 @@ scikit-learn
 torch
 tqdm
 huggingface_hub
-dataclasses
+dataclasses
diff --git a/tdc/cellxgene-census-loaders/__init__.py b/tdc/cellxgene-census-loaders/__init__.py
diff --git a/tdc/cellxgene-census-loaders/cellxgene-census.py b/tdc/cellxgene-census-loaders/cellxgene-census.py
@@ -0,0 +1,78 @@
+import cellxgene_census
+from pandas import concat
+import tiledbsoma
+
+from tdc import base_dataset
+"""
+    
+Are we only supporting memory-efficient queries?
+https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_quick_start.html#memory-efficient-queries
+
+
+"""
+
+
+class CXGDataLoader(base_dataset.DataLoader):
+
+    def __init__(self,
+                 num_slices=None,
+                 census_version="2023-12-15",
+                 dataset="census_data",
+                 organism="homo_sapiens",
+                 measurement_name="RNA",
+                 value_filter="",
+                 column_names=None):
+        if column_names is None:
+            raise ValueError("column_names is required for this loader")
+        self.column_names = column_names
+        num_slices = num_slices if num_slices is not None else 1
+        self.num_slices = num_slices
+        self.df = None
+        self.fetch_data(census_version, dataset, organism, measurement_name,
+                        value_filter)
+
+    def fetch_data(self, census_version, dataset, organism, measurement_name,
+                   value_filter):
+        """TODO: docs
+        Queries the census SOMA collection object with the specified params and stores the resulting dataframe in self.df
+        """
+        if self.column_names is None:
+            raise ValueError(
+                "Column names must be provided to CXGDataLoader class")
+
+        with cellxgene_census.open_soma(
+                census_version=census_version) as census:
+            # Reads SOMADataFrame as a slice
+            cell_metadata = census[dataset][organism].obs.read(
+                value_filter=value_filter, column_names=self.column_names)
+            self.df = cell_metadata.concat().to_pandas()
+            # TODO: note that latency on memory-efficient queries is poor...
+            # organismCollection = census[dataset][organism]
+            # query = organismCollection.axis_query(
+            #     measurement_name = measurement_name,
+            #     obs_query = tiledbsoma.AxisQuery(
+            #         value_filter = value_filter
+            #     )
+            # )
+            # it = query.X("raw").tables()
+            # dfs =[]
+            # for  _ in range(self.num_slices):
+            #     slice = next (it)
+            #     df_slice = slice.to_pandas()
+            #     dfs.append(df_slice)
+            # self.df = concat(dfs)
+
+    def get_dataframe(self):
+        if self.df is None:
+            raise Exception(
+                "Haven't instantiated a DataFrame yet. You can call self.fetch_data first."
+            )
+        return self.df
+
+
+if __name__ == "__main__":
+    # TODO: tmp, run testing suite when this file is called as main
+    loader = CXGDataLoader(value_filter="tissue == 'brain' and sex == 'male'",
+                           column_names=["assay", "cell_type", "tissue"])
+    df = loader.get_dataframe()
+    print(df.head())
diff --git a/tdc/resource/__init__.py b/tdc/resource/__init__.py
@@ -1 +1,2 @@
 from .primekg import PrimeKG
+from .cellxgene_census import CensusResource