Skip to content

Commit

Permalink
Merge branch 'main' into bkmartinjr/experimentdatapipe
Browse files Browse the repository at this point in the history
  • Loading branch information
bkmartinjr committed Sep 12, 2024
2 parents 33e8ff6 + 7039921 commit 0b73c23
Show file tree
Hide file tree
Showing 57 changed files with 1,586 additions and 430 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/python-ci-packaging.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ jobs:
run: |
mkdir -p external
# Please do not edit manually -- let scripts/update-tiledb-version.py update this
wget --quiet https://github.com/TileDB-Inc/TileDB/releases/download/2.25.0/tiledb-linux-x86_64-2.25.0-bbcbd3f.tar.gz
wget --quiet https://github.com/TileDB-Inc/TileDB/releases/download/2.26.0/tiledb-linux-x86_64-2.26.0-983b716.tar.gz
tar -C external -xzf tiledb-linux-x86_64-*.tar.gz
ls external/lib/
echo "LD_LIBRARY_PATH=$(pwd)/external/lib" >> $GITHUB_ENV
Expand Down Expand Up @@ -172,7 +172,7 @@ jobs:
run: |
mkdir -p external
# Please do not edit manually -- let scripts/update-tiledb-version.py update this
wget --quiet https://github.com/TileDB-Inc/TileDB/releases/download/2.25.0/tiledb-macos-x86_64-2.25.0-bbcbd3f.tar.gz
wget --quiet https://github.com/TileDB-Inc/TileDB/releases/download/2.26.0/tiledb-macos-x86_64-2.26.0-983b716.tar.gz
tar -C external -xzf tiledb-macos-x86_64-*.tar.gz
ls external/lib/
echo "DYLD_LIBRARY_PATH=$(pwd)/external/lib" >> $GITHUB_ENV
Expand Down Expand Up @@ -264,10 +264,10 @@ jobs:
if [ `uname -s` == "Darwin" ];
then
# Please do not edit manually -- let scripts/update-tiledb-version.py update this
wget --quiet https://github.com/TileDB-Inc/TileDB/releases/download/2.25.0/tiledb-macos-x86_64-2.25.0-bbcbd3f.tar.gz
wget --quiet https://github.com/TileDB-Inc/TileDB/releases/download/2.26.0/tiledb-macos-x86_64-2.26.0-983b716.tar.gz
else
# Please do not edit manually -- let scripts/update-tiledb-version.py update this
wget --quiet https://github.com/TileDB-Inc/TileDB/releases/download/2.25.0/tiledb-linux-x86_64-2.25.0-bbcbd3f.tar.gz
wget --quiet https://github.com/TileDB-Inc/TileDB/releases/download/2.26.0/tiledb-linux-x86_64-2.26.0-983b716.tar.gz
fi
tar -C external -xzf tiledb-*.tar.gz
ls external/lib/
Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/python-ci-single.yml
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,20 @@ jobs:
env:
TILEDB_SOMA_INIT_BUFFER_BYTES: 33554432 # accommodate tiny runners

- name: Run pytests for Python
- name: Run pytests for Python without new shape
shell: bash
# Setting PYTHONPATH ensures the tests load the in-tree source code under apis/python/src
# instead of the copy we `pip install`ed to site-packages above. That's needed for the code
# coverage analysis to work.
run: PYTHONPATH=$(pwd)/apis/python/src python -m pytest --cov=apis/python/src --cov-report=xml apis/python/tests -v --durations=20

- name: Run pytests for Python with new shape
shell: bash
# Setting PYTHONPATH ensures the tests load the in-tree source code under apis/python/src
# instead of the copy we `pip install`ed to site-packages above. That's needed for the code
# coverage analysis to work.
run: export SOMA_PY_NEW_SHAPE=true; PYTHONPATH=$(pwd)/apis/python/src python -m pytest --cov=apis/python/src --cov-report=xml apis/python/tests -v --durations=20

- name: Report coverage to Codecov
if: inputs.report_codecov
uses: codecov/codecov-action@v4
Expand Down
9 changes: 7 additions & 2 deletions .github/workflows/r-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,15 @@ jobs:
# -e "devtools::install(upgrade = FALSE)" \
# -e "testthat::test_local('tests/testthat', load_package = 'installed')"

- name: Test
- name: Test without new shape
if: ${{ matrix.covr == 'no' }}
run: cd apis/r/tests && Rscript testthat.R


# https://github.com/single-cell-data/TileDB-SOMA/issues/2407
- name: Test with new shape
if: ${{ matrix.covr == 'no' }}
run: export SOMA_R_NEW_SHAPE=true && cd apis/r/tests && Rscript testthat.R

- name: Coverage
if: ${{ matrix.os == 'ubuntu-latest' && matrix.covr == 'yes' && github.event_name == 'workflow_dispatch' }}
run: apis/r/tools/r-ci.sh coverage
Expand Down
4 changes: 3 additions & 1 deletion apis/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,9 @@ def run(self):
"scipy",
# Note: the somacore version is in .pre-commit-config.yaml too
"somacore==1.0.14",
"tiledb~=0.31.0",
# TEMP WHILE WE AWAIT WHEELS
# "tiledb~=0.32.0",
"tiledb",
"typing-extensions", # Note "-" even though `import typing_extensions`
],
extras_require={
Expand Down
12 changes: 12 additions & 0 deletions apis/python/src/tiledbsoma/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,18 @@
import os
import sys

# Temporary for https://github.com/single-cell-data/TileDB-SOMA/issues/2407
_new_shape_feature_flag = os.getenv("SOMA_PY_NEW_SHAPE") is not None


def _new_shape_feature_flag_enabled() -> bool:
"""
This is temporary only and will be removed once
https://github.com/single-cell-data/TileDB-SOMA/issues/2407
is complete.
"""
return _new_shape_feature_flag


# Load native libraries. On wheel builds, we may have a shared library
# already linked. In this case, we can import directly
Expand Down
112 changes: 99 additions & 13 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from somacore import options
from typing_extensions import Self

from tiledbsoma import _new_shape_feature_flag_enabled

from . import _arrow_types, _util
from . import pytiledbsoma as clib
from ._constants import SOMA_JOINID
Expand Down Expand Up @@ -215,10 +217,33 @@ def create(
"""
context = _validate_soma_tiledb_context(context)
schema = _canonicalize_schema(schema, index_column_names)
if domain is None:
domain = tuple(None for _ in index_column_names)

# SOMA-to-core mappings:
#
# Before the current-domain feature was enabled (possible after core 2.25):
#
# * SOMA domain <-> core domain, AKA "max domain" which is a name we'll use for clarity
# * core current domain did not exist
#
# After the current-domain feature was enabled:
#
# * SOMA max_domain <-> core domain
# * SOMA domain <-> core current domain
#
# As far as the user is concerned, the SOMA-level domain is the only
# thing they see and care about. Before 2.25 support, it was immutable
# (since it was implemented by core domain). After 2.25 support, it is
# mutable/up-resizeable (since it is implemented by core current domain).

# At this point shift from API terminology "domain" to specifying a soma_ or core_
# prefix for these variables. This is crucial to avoid developer confusion.
soma_domain = domain
domain = None

if soma_domain is None:
soma_domain = tuple(None for _ in index_column_names)
else:
ndom = len(domain)
ndom = len(soma_domain)
nidx = len(index_column_names)
if ndom != nidx:
raise ValueError(
Expand All @@ -228,25 +253,56 @@ def create(
index_column_schema = []
index_column_data = {}

for index_column_name, slot_domain in zip(index_column_names, domain):
for index_column_name, slot_soma_domain in zip(index_column_names, soma_domain):
pa_field = schema.field(index_column_name)
dtype = _arrow_types.tiledb_type_from_arrow_type(
pa_field.type, is_indexed_column=True
)

slot_domain = _fill_out_slot_domain(
slot_domain, index_column_name, pa_field.type, dtype
(slot_core_current_domain, saturated_cd) = _fill_out_slot_soma_domain(
slot_soma_domain, index_column_name, pa_field.type, dtype
)
(slot_core_max_domain, saturated_md) = _fill_out_slot_soma_domain(
None, index_column_name, pa_field.type, dtype
)

extent = _find_extent_for_domain(
index_column_name,
TileDBCreateOptions.from_platform_config(platform_config),
dtype,
slot_domain,
slot_core_current_domain,
)

# Necessary to avoid core array-creation error "Reduce domain max by
# 1 tile extent to allow for expansion."
slot_core_current_domain = _revise_domain_for_extent(
slot_core_current_domain, extent, saturated_cd
)
slot_core_max_domain = _revise_domain_for_extent(
slot_core_max_domain, extent, saturated_md
)

# Here is our Arrow data API for communicating schema info between
# Python/R and C++ libtiledbsoma:
#
# [0] core max domain lo
# [1] core max domain hi
# [2] core extent parameter
# If present, these next two signal to use the current-domain feature:
# [3] core current domain lo
# [4] core current domain hi

index_column_schema.append(pa_field)
index_column_data[pa_field.name] = [*slot_domain, extent]
if _new_shape_feature_flag_enabled():

index_column_data[pa_field.name] = [
*slot_core_max_domain,
extent,
*slot_core_current_domain,
]

else:
index_column_data[pa_field.name] = [*slot_core_current_domain, extent]

index_column_info = pa.RecordBatch.from_pydict(
index_column_data, schema=pa.schema(index_column_schema)
Expand Down Expand Up @@ -304,7 +360,17 @@ def domain(self) -> Tuple[Tuple[Any, Any], ...]:
Lifecycle:
Maturing.
"""
return self._tiledb_domain()
return self._domain()

@property
def maxdomain(self) -> Tuple[Tuple[Any, Any], ...]:
"""Returns a tuple of minimum and maximum values, inclusive, storable
on each index column of the dataframe.
Lifecycle:
Maturing.
"""
return self._maxdomain()

@property
def count(self) -> int:
Expand Down Expand Up @@ -708,17 +774,20 @@ def _canonicalize_schema(
return schema


def _fill_out_slot_domain(
def _fill_out_slot_soma_domain(
slot_domain: AxisDomain,
index_column_name: str,
pa_type: pa.DataType,
dtype: Any,
) -> Tuple[Any, Any]:
) -> Tuple[Tuple[Any, Any], bool]:
"""Helper function for _build_tiledb_schema. Given a user-specified domain for a
dimension slot -- which may be ``None``, or a two-tuple of which either element
may be ``None`` -- return either what the user specified (if adequate) or
sensible type-inferred values appropriate to the datatype.
Returns a boolean for whether the underlying datatype's max range was used.
"""
saturated_range = False
if slot_domain is not None:
# User-specified; go with it when possible
if (
Expand Down Expand Up @@ -759,9 +828,12 @@ def _fill_out_slot_domain(
# The SOMA spec disallows negative soma_joinid.
if index_column_name == SOMA_JOINID:
slot_domain = (0, 2**31 - 2) # R-friendly, which 2**63-1 is not
else:
saturated_range = True
elif np.issubdtype(dtype, NPFloating):
finfo = np.finfo(cast(NPFloating, dtype))
slot_domain = finfo.min, finfo.max
saturated_range = True

# The `iinfo.min+1` is necessary as of tiledb core 2.15 / tiledb-py 0.21.1 since
# `iinfo.min` maps to `NaT` (not a time), resulting in
Expand Down Expand Up @@ -796,7 +868,7 @@ def _fill_out_slot_domain(
else:
raise TypeError(f"Unsupported dtype {dtype}")

return slot_domain
return (slot_domain, saturated_range)


def _find_extent_for_domain(
Expand All @@ -813,7 +885,7 @@ def _find_extent_for_domain(
# Default 2048 mods to 0 for 8-bit types and 0 is an invalid extent
extent = tiledb_create_write_options.dim_tile(index_column_name)
if isinstance(dtype, np.dtype) and dtype.itemsize == 1:
extent = 64
extent = 1

if isinstance(dtype, str):
return ""
Expand Down Expand Up @@ -850,3 +922,17 @@ def _find_extent_for_domain(
return np.datetime64(iextent, "ns")

return extent


# We need to do this to avoid this error at array-creation time:
#
# Error: Tile extent check failed; domain max expanded to multiple of tile
# extent exceeds max value representable by domain type. Reduce domain max
# by 1 tile extent to allow for expansion.
def _revise_domain_for_extent(
domain: Tuple[Any, Any], extent: Any, saturated_range: bool
) -> Tuple[Any, Any]:
if saturated_range:
return (domain[0], domain[1] - extent)
else:
return domain
2 changes: 2 additions & 0 deletions apis/python/src/tiledbsoma/_dense_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ def create(
TileDBCreateOptions.from_platform_config(platform_config),
)
index_column_schema.append(pa_field)
# TODO: support current domain for dense arrays once we have core support.
# https://github.com/single-cell-data/TileDB-SOMA/issues/2955
index_column_data[pa_field.name] = [0, dim_capacity - 1, dim_extent]

index_column_info = pa.RecordBatch.from_pydict(
Expand Down
29 changes: 28 additions & 1 deletion apis/python/src/tiledbsoma/_soma_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,36 @@ def _tiledb_attr_names(self) -> Tuple[str, ...]:
"""
return self._handle.attr_names

def _tiledb_domain(self) -> Tuple[Tuple[Any, Any], ...]:
def _domain(self) -> Tuple[Tuple[Any, Any], ...]:
"""This is the SOMA domain, not the core domain.
* For arrays with core current-domain support:
o soma domain is core current domain
o soma maxdomain is core domain
* For arrays without core current-domain support:
o soma domain is core domain
o soma maxdomain is core domain
o core current domain is not accessed at the soma level
* Core domain has been around forever and is immutable.
* Core current domain is new as of core 2.25 and can be
resized up to core (max) domain.
"""
return self._handle.domain

def _maxdomain(self) -> Tuple[Tuple[Any, Any], ...]:
"""This is the SOMA maxdomain, not the core domain.
* For arrays with core current-domain support:
o soma domain is core current domain
o soma maxdomain is core domain
* For arrays without core current-domain support:
o soma domain is core domain
o soma maxdomain is core domain
o core current domain is not accessed at the soma level
* Core domain has been around forever and is immutable.
* Core current domain is new as of core 2.25 and can be
resized up to core (max) domain.
"""
return self._handle.maxdomain

def _set_reader_coords(self, sr: clib.SOMAArray, coords: Sequence[object]) -> None:
"""Parses the given coords and sets them on the SOMA Reader."""
if not is_nonstringy_sequence(coords):
Expand Down
Loading

0 comments on commit 0b73c23

Please sign in to comment.