Numpy 2 testing (#1237)
* Fix type

* Removing the case '0.0005' as it was previously passing by accident.

* Fix test_ld on numpy 2

* Fix test_hash_array

* Add GitHub Actions workflow to run using NumPy 2

* Restrict to numpy<2.1 for numba compatibility

* Don't run NumPy 2 on Python 3.9 due to scikit-allel incompatibility
tomwhite authored Sep 2, 2024
1 parent ee90b6e commit 03daf6b
Showing 5 changed files with 53 additions and 49 deletions.
38 changes: 38 additions & 0 deletions .github/workflows/build-numpy-2.yml
@@ -0,0 +1,38 @@
name: Build NumPy 2

on:
  push:
  pull_request:

jobs:
  build:
    # Scheduled runs only on the origin org
    if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule')
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.11"]

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt -r requirements-dev.txt
          pip install -U 'numpy<2.1'
      - name: Run pre-commit
        uses: pre-commit/[email protected]
      - name: Test with pytest (numba jit disabled)
        env:
          NUMBA_DISABLE_JIT: 1
        run: |
          # avoid guvectorized functions #1194
          pytest -v sgkit/tests/test_pedigree.py
          pytest -v sgkit/tests/io/vcf/test_vcf_writer_utils.py
      - name: Test with pytest and coverage
        run: |
          pytest -v --cov=sgkit --cov-report=term-missing
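
The 'numpy<2.1' pin exists because numba did not yet support NumPy 2.1 (per the commit message). As a rough local sanity check, separate from the workflow, one could confirm the installed NumPy falls in the window this job targets (packaging is assumed to be available, as pytest already depends on it):

import numpy as np
from packaging.version import Version

# the job installs the newest NumPy below 2.1, i.e. a 2.0.x release at the time
v = Version(np.__version__)
assert Version("2.0") <= v < Version("2.1"), f"unexpected NumPy {np.__version__}"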
1 change: 0 additions & 1 deletion sgkit/tests/io/vcf/test_vcf_writer_utils.py
@@ -66,7 +66,6 @@ def test_itoa_out_of_range():
    [
        (0.0, "0"),
        (0.0001, "0"),
        (0.0005, "0.001"),
        (0.3, "0.3"),
        (0.32, "0.32"),
        (0.329, "0.329"),
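
The dropped (0.0005, "0.001") case sits exactly on a decimal rounding boundary, which is presumably why the commit message says it was passing by accident. A small illustration, not part of the diff, of why an expected output for that value is fragile:

from decimal import Decimal

# 0.0005 has no exact binary representation; the stored double is slightly
# above the true value, so different formatting paths (round-to-nearest vs
# truncation) can legitimately yield either "0.001" or "0" at three decimals.
print(Decimal(0.0005))  # 0.000500000000000000010408...
print(f"{0.0005:.3f}")  # '0.001' with Python's own round-to-nearest formatting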
58 changes: 10 additions & 48 deletions sgkit/tests/test_ld.py
@@ -1,12 +1,9 @@
from typing import Optional

import allel
import dask.array as da
import numpy as np
import numpy.testing as npt
import pytest
from dask.dataframe import DataFrame
from hypothesis import Phase, example, given, settings
from hypothesis import strategies as st
from hypothesis.extra.numpy import arrays

@@ -27,40 +24,27 @@ def test_rogers_huff_r_between():
    gnb = np.array([[0, 1, 2]])
    npt.assert_allclose(rogers_huff_r_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
    npt.assert_allclose(rogers_huff_r2_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
    npt.assert_allclose(
        allel.rogers_huff_r_between(gna, gnb),
        rogers_huff_r_between(gna[0], gnb[0]),
        rtol=1e-06,
    )

    gna = np.array([[0, 1, 2]])
    gnb = np.array([[2, 1, 0]])
    npt.assert_allclose(rogers_huff_r_between(gna[0], gnb[0]), -1.0, rtol=1e-06)
    npt.assert_allclose(rogers_huff_r2_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
    npt.assert_allclose(
        allel.rogers_huff_r_between(gna, gnb),
        rogers_huff_r_between(gna[0], gnb[0]),
        rtol=1e-06,
    )

    gna = np.array([[0, 0, 0]])
    gnb = np.array([[1, 1, 1]])
    assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
    assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))

    gna = np.array([[1, 1, 1]])
    gnb = np.array([[1, 1, 1]])
    assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
    assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))

    # a case which fails if fastmath=True is enabled for rogers_huff_r_between
    gna = np.full((1, 49), 2)
    gnb = np.full((1, 49), 2)
    assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
    assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))
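
The removed lines are the scikit-allel cross-checks (allel is dropped from this module). As an independent sanity check of the expectations above: the Rogers-Huff estimator of r between two biallelic variants is the Pearson correlation of their dosage vectors, so np.corrcoef reproduces the same values (a sketch, not part of the test):

import numpy as np

print(np.corrcoef([0, 1, 2], [0, 1, 2])[0, 1])  # 1.0
print(np.corrcoef([0, 1, 2], [2, 1, 0])[0, 1])  # -1.0
# constant dosage vectors such as [1, 1, 1] have zero variance, so the
# correlation (and hence r) is undefined and comes out as NaN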


def ldm_df(
@@ -115,7 +99,16 @@ def test_threshold():

@pytest.mark.parametrize(
    "dtype",
    [dtype for k, v in np.sctypes.items() for dtype in v if k in ["int", "uint"]],  # type: ignore
    [
        np.int8,
        np.int16,
        np.int32,
        np.int64,
        np.uint8,
        np.uint16,
        np.uint32,
        np.uint64,
    ],
)
def test_dtypes(dtype):
    # Input matrices should work regardless of integer type
@@ -148,37 +141,6 @@ def ld_prune_args(draw):
    return x, window, step, threshold, chunks


# Phases setting without shrinking for complex, conditional draws in
# which shrinking wastes time and adds little information
# (see https://hypothesis.readthedocs.io/en/latest/settings.html#hypothesis.settings.phases)
PHASES_NO_SHRINK = (Phase.explicit, Phase.reuse, Phase.generate, Phase.target)


@given(args=ld_prune_args())  # pylint: disable=no-value-for-parameter
@settings(max_examples=50, deadline=None, phases=PHASES_NO_SHRINK)
@example(args=(np.array([[1, 1], [1, 1]], dtype="uint8"), 1, 1, 0.0, -1))
@pytest.mark.skip(
    reason="Hypothesis generates failures that need investigation: https://github.com/sgkit-dev/sgkit/issues/864"
)
def test_vs_skallel(args):
    x, size, step, threshold, chunks = args

    ds = simulate_genotype_call_dataset(n_variant=x.shape[0], n_sample=x.shape[1])
    ds["call_dosage"] = (["variants", "samples"], da.asarray(x).rechunk({0: chunks}))
    ds = window_by_variant(ds, size=size, step=step)

    ldm = ld_matrix(ds, threshold=threshold)
    has_duplicates = ldm.compute().duplicated(subset=["i", "j"]).any()
    assert not has_duplicates
    idx_drop_ds = maximal_independent_set(ldm)

    idx_drop = np.sort(idx_drop_ds.ld_prune_index_to_drop.data)
    m = allel.locate_unlinked(x, size=size, step=step, threshold=threshold)
    idx_drop_ska = np.sort(np.argwhere(~m).squeeze(axis=1))

    npt.assert_equal(idx_drop_ska, idx_drop)


def test_scores():
    # Create zero row vectors except for 1st and 11th
    # (make them have non-zero variance)
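
The explicit dtype list replaces the np.sctypes lookup, which was removed from the main namespace in NumPy 2.0. A short check, not part of the change, that the spelled-out list covers the same integer kinds the old expression selected:

import numpy as np

int_dtypes = [np.int8, np.int16, np.int32, np.int64,
              np.uint8, np.uint16, np.uint32, np.uint64]

# every entry is a signed or unsigned integer type, broadly what the old
# np.sctypes["int"] + np.sctypes["uint"] lookup yielded on NumPy 1.x
assert all(np.issubdtype(t, np.integer) for t in int_dtypes)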
3 changes: 3 additions & 0 deletions sgkit/tests/test_popgen.py
@@ -712,6 +712,9 @@ def test_hash_array(n_rows, n_cols):
    _, expected_inverse, expected_counts = np.unique(
        x, axis=0, return_inverse=True, return_counts=True
    )
    # following is needed due to https://github.com/numpy/numpy/issues/26738
    # (workaround from https://github.com/lmcinnes/umap/issues/1138)
    expected_inverse = expected_inverse.reshape(-1)

    # hash columns, then find unique column counts using the hash values
    h = hash_array(x)
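
The reshape works around a NumPy 2.0 regression where np.unique with axis=0 and return_inverse=True can return the inverse with an extra axis (numpy/numpy#26738). A minimal sketch, outside the diff, of the normalisation:

import numpy as np

x = np.array([[0, 1], [0, 1], [2, 3]])
_, inverse = np.unique(x, axis=0, return_inverse=True)
# on affected NumPy 2.0.x releases the inverse is 2-D; flattening it keeps
# the downstream comparison in test_hash_array version-independent
inverse = inverse.reshape(-1)
print(inverse)  # [0 0 1] after the reshape, on any NumPy version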
2 changes: 2 additions & 0 deletions sgkit/utils.py
@@ -362,6 +362,8 @@ def split_array_chunks(n: int, blocks: int) -> Tuple[int, ...]:
    if blocks <= 0:
        raise ValueError(f"Number of blocks ({blocks}) must be >= 0")
    n_div, n_mod = np.divmod(n, blocks)
    n_div = int(n_div)
    n_mod = int(n_mod)
    chunks = n_mod * (n_div + 1,) + (blocks - n_mod) * (n_div,)
    return chunks  # type: ignore[no-any-return]

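One plausible reading of the "Fix type" item in the commit message: np.divmod returns NumPy scalar integers, and the added int() casts keep the chunks tuple built from plain Python ints across NumPy versions. A brief sketch, not part of the diff, of what the casts do:

import numpy as np

n_div, n_mod = np.divmod(10, 3)
print(type(n_div))                      # e.g. <class 'numpy.int64'>, a NumPy scalar
n_div, n_mod = int(n_div), int(n_mod)   # plain Python ints from here on
chunks = n_mod * (n_div + 1,) + (3 - n_mod) * (n_div,)
print(chunks)                           # (4, 3, 3)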
