Numpy 2 testing (#1237)
* Fix type

* Removing the case '0.0005' as it was previously passing by accident.

* Fix test_ld on numpy 2

* Fix test_hash_array

* Add GitHub Actions workflow to run using NumPy 2

* Restrict to numpy<2.1 for numba compatibility

* Don't run NumPy 2 on Python 3.9 due to scikit-allel incompatibility
tomwhite authored Sep 2, 2024
1 parent ee90b6e commit 03daf6b
Showing 5 changed files with 53 additions and 49 deletions.
38 changes: 38 additions & 0 deletions .github/workflows/build-numpy-2.yml
@@ -0,0 +1,38 @@
name: Build NumPy 2

on:
  push:
  pull_request:

jobs:
  build:
    # Scheduled runs only on the origin org
    if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule')
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.11"]

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt -r requirements-dev.txt
          pip install -U 'numpy<2.1'
      - name: Run pre-commit
        uses: pre-commit/[email protected]
      - name: Test with pytest (numba jit disabled)
        env:
          NUMBA_DISABLE_JIT: 1
        run: |
          # avoid guvectorized functions #1194
          pytest -v sgkit/tests/test_pedigree.py
          pytest -v sgkit/tests/io/vcf/test_vcf_writer_utils.py
      - name: Test with pytest and coverage
        run: |
          pytest -v --cov=sgkit --cov-report=term-missing
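
The 'numpy<2.1' pin exists because numba did not yet support NumPy 2.1 (per the commit message). As a rough local sanity check, separate from the workflow, one could confirm the installed NumPy falls in the window this job targets (packaging is assumed to be available, as pytest already depends on it):

import numpy as np
from packaging.version import Version

# the job installs the newest NumPy below 2.1, i.e. a 2.0.x release at the time
v = Version(np.__version__)
assert Version("2.0") <= v < Version("2.1"), f"unexpected NumPy {np.__version__}"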
1 change: 0 additions & 1 deletion sgkit/tests/io/vcf/test_vcf_writer_utils.py
@@ -66,7 +66,6 @@ def test_itoa_out_of_range():
    [
        (0.0, "0"),
        (0.0001, "0"),
        (0.0005, "0.001"),
        (0.3, "0.3"),
        (0.32, "0.32"),
        (0.329, "0.329"),
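
The dropped (0.0005, "0.001") case sits exactly on a decimal rounding boundary, which is presumably why the commit message says it was passing by accident. A small illustration, not part of the diff, of why an expected output for that value is fragile:

from decimal import Decimal

# 0.0005 has no exact binary representation; the stored double is slightly
# above the true value, so different formatting paths (round-to-nearest vs
# truncation) can legitimately yield either "0.001" or "0" at three decimals.
print(Decimal(0.0005))  # 0.000500000000000000010408...
print(f"{0.0005:.3f}")  # '0.001' with Python's own round-to-nearest formatting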
58 changes: 10 additions & 48 deletions sgkit/tests/test_ld.py
@@ -1,12 +1,9 @@
from typing import Optional

import allel
import dask.array as da
import numpy as np
import numpy.testing as npt
import pytest
from dask.dataframe import DataFrame
from hypothesis import Phase, example, given, settings
from hypothesis import strategies as st
from hypothesis.extra.numpy import arrays

@@ -27,40 +24,27 @@ def test_rogers_huff_r_between():
    gnb = np.array([[0, 1, 2]])
    npt.assert_allclose(rogers_huff_r_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
    npt.assert_allclose(rogers_huff_r2_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
    npt.assert_allclose(
        allel.rogers_huff_r_between(gna, gnb),
        rogers_huff_r_between(gna[0], gnb[0]),
        rtol=1e-06,
    )

    gna = np.array([[0, 1, 2]])
    gnb = np.array([[2, 1, 0]])
    npt.assert_allclose(rogers_huff_r_between(gna[0], gnb[0]), -1.0, rtol=1e-06)
    npt.assert_allclose(rogers_huff_r2_between(gna[0], gnb[0]), 1.0, rtol=1e-06)
    npt.assert_allclose(
        allel.rogers_huff_r_between(gna, gnb),
        rogers_huff_r_between(gna[0], gnb[0]),
        rtol=1e-06,
    )

    gna = np.array([[0, 0, 0]])
    gnb = np.array([[1, 1, 1]])
    assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
    assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))

    gna = np.array([[1, 1, 1]])
    gnb = np.array([[1, 1, 1]])
    assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
    assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))

    # a case which fails if fastmath=True is enabled for rogers_huff_r_between
    gna = np.full((1, 49), 2)
    gnb = np.full((1, 49), 2)
    assert np.isnan(rogers_huff_r_between(gna[0], gnb[0]))
    assert np.isnan(rogers_huff_r2_between(gna[0], gnb[0]))
    assert np.isnan(allel.rogers_huff_r_between(gna, gnb))
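
The removed lines are the scikit-allel cross-checks (allel is dropped from this module). As an independent sanity check of the expectations above: the Rogers-Huff estimator of r between two biallelic variants is the Pearson correlation of their dosage vectors, so np.corrcoef reproduces the same values (a sketch, not part of the test):

import numpy as np

print(np.corrcoef([0, 1, 2], [0, 1, 2])[0, 1])  # 1.0
print(np.corrcoef([0, 1, 2], [2, 1, 0])[0, 1])  # -1.0
# constant dosage vectors such as [1, 1, 1] have zero variance, so the
# correlation (and hence r) is undefined and comes out as NaN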


def ldm_df(
@@ -115,7 +99,16 @@ def test_threshold():

@pytest.mark.parametrize(
    "dtype",
    [dtype for k, v in np.sctypes.items() for dtype in v if k in ["int", "uint"]],  # type: ignore
    [
        np.int8,
        np.int16,
        np.int32,
        np.int64,
        np.uint8,
        np.uint16,
        np.uint32,
        np.uint64,
    ],
)
def test_dtypes(dtype):
    # Input matrices should work regardless of integer type
@@ -148,37 +141,6 @@ def ld_prune_args(draw):
    return x, window, step, threshold, chunks


# Phases setting without shrinking for complex, conditional draws in
# which shrinking wastes time and adds little information
# (see https://hypothesis.readthedocs.io/en/latest/settings.html#hypothesis.settings.phases)
PHASES_NO_SHRINK = (Phase.explicit, Phase.reuse, Phase.generate, Phase.target)


@given(args=ld_prune_args())  # pylint: disable=no-value-for-parameter
@settings(max_examples=50, deadline=None, phases=PHASES_NO_SHRINK)
@example(args=(np.array([[1, 1], [1, 1]], dtype="uint8"), 1, 1, 0.0, -1))
@pytest.mark.skip(
    reason="Hypothesis generates failures that need investigation: https://github.com/sgkit-dev/sgkit/issues/864"
)
def test_vs_skallel(args):
    x, size, step, threshold, chunks = args

    ds = simulate_genotype_call_dataset(n_variant=x.shape[0], n_sample=x.shape[1])
    ds["call_dosage"] = (["variants", "samples"], da.asarray(x).rechunk({0: chunks}))
    ds = window_by_variant(ds, size=size, step=step)

    ldm = ld_matrix(ds, threshold=threshold)
    has_duplicates = ldm.compute().duplicated(subset=["i", "j"]).any()
    assert not has_duplicates
    idx_drop_ds = maximal_independent_set(ldm)

    idx_drop = np.sort(idx_drop_ds.ld_prune_index_to_drop.data)
    m = allel.locate_unlinked(x, size=size, step=step, threshold=threshold)
    idx_drop_ska = np.sort(np.argwhere(~m).squeeze(axis=1))

    npt.assert_equal(idx_drop_ska, idx_drop)


def test_scores():
    # Create zero row vectors except for 1st and 11th
    # (make them have non-zero variance)
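
The explicit dtype list replaces the np.sctypes lookup, which was removed from the main namespace in NumPy 2.0. A short check, not part of the change, that the spelled-out list covers the same integer kinds the old expression selected:

import numpy as np

int_dtypes = [np.int8, np.int16, np.int32, np.int64,
              np.uint8, np.uint16, np.uint32, np.uint64]

# every entry is a signed or unsigned integer type, broadly what the old
# np.sctypes["int"] + np.sctypes["uint"] lookup yielded on NumPy 1.x
assert all(np.issubdtype(t, np.integer) for t in int_dtypes)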
3 changes: 3 additions & 0 deletions sgkit/tests/test_popgen.py
@@ -712,6 +712,9 @@ def test_hash_array(n_rows, n_cols):
    _, expected_inverse, expected_counts = np.unique(
        x, axis=0, return_inverse=True, return_counts=True
    )
    # following is needed due to https://github.com/numpy/numpy/issues/26738
    # (workaround from https://github.com/lmcinnes/umap/issues/1138)
    expected_inverse = expected_inverse.reshape(-1)

    # hash columns, then find unique column counts using the hash values
    h = hash_array(x)
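
The reshape works around a NumPy 2.0 regression where np.unique with axis=0 and return_inverse=True can return the inverse with an extra axis (numpy/numpy#26738). A minimal sketch, outside the diff, of the normalisation:

import numpy as np

x = np.array([[0, 1], [0, 1], [2, 3]])
_, inverse = np.unique(x, axis=0, return_inverse=True)
# on affected NumPy 2.0.x releases the inverse is 2-D; flattening it keeps
# the downstream comparison in test_hash_array version-independent
inverse = inverse.reshape(-1)
print(inverse)  # [0 0 1] after the reshape, on any NumPy version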
2 changes: 2 additions & 0 deletions sgkit/utils.py
@@ -362,6 +362,8 @@ def split_array_chunks(n: int, blocks: int) -> Tuple[int, ...]:
    if blocks <= 0:
        raise ValueError(f"Number of blocks ({blocks}) must be >= 0")
    n_div, n_mod = np.divmod(n, blocks)
    n_div = int(n_div)
    n_mod = int(n_mod)
    chunks = n_mod * (n_div + 1,) + (blocks - n_mod) * (n_div,)
    return chunks  # type: ignore[no-any-return]

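One plausible reading of the "Fix type" item in the commit message: np.divmod returns NumPy scalar integers, and the added int() casts keep the chunks tuple built from plain Python ints across NumPy versions. A brief sketch, not part of the diff, of what the casts do:

import numpy as np

n_div, n_mod = np.divmod(10, 3)
print(type(n_div))                      # e.g. <class 'numpy.int64'>, a NumPy scalar
n_div, n_mod = int(n_div), int(n_mod)   # plain Python ints from here on
chunks = n_mod * (n_div + 1,) + (3 - n_mod) * (n_div,)
print(chunks)                           # (4, 3, 3)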
