Skip to content

Commit

Permalink
ENH: Fix tests hang in CI (xorbitsai#794)
Browse files Browse the repository at this point in the history
  • Loading branch information
luweizheng authored Aug 17, 2024
1 parent 4032b78 commit 2f99f8c
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 74 deletions.
119 changes: 61 additions & 58 deletions .github/workflows/python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,34 +67,34 @@ jobs:
needs: lint
env:
CONDA_ENV: xorbits-test
SELF_HOST_PYTHON: /root/miniconda3/bin/python
SELF_HOST_CONDA: /root/miniconda3/bin/conda
defaults:
run:
shell: bash -l {0}
strategy:
fail-fast: false
matrix:
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
os: ["ubuntu-latest", "macos-13", "windows-latest"]
python-version: ["3.9", "3.10", "3.11"]
module: ["xorbits", "kubernetes"]
module: ["xorbits", "xorbits/numpy", "xorbits/pandas", "kubernetes"]
exclude:
- { os: macos-latest, python-version: 3.10}
- { os: macos-latest, python-version: 3.9}
- { os: macos-13, python-version: 3.10}
- { os: macos-13, python-version: 3.9}
- { os: windows-latest, python-version: 3.10}
- { os: windows-latest, python-version: 3.9}
- { os: windows-latest, module: kubernetes}
- { os: macos-latest, module: kubernetes}
- { os: macos-13, module: kubernetes}
include:
- { os: ubuntu-latest, module: _mars/dataframe, python-version: 3.9 }
- { os: ubuntu-latest, module: _mars/tensor, python-version: 3.9 }
- { os: ubuntu-latest, module: _mars/learn, python-version: 3.9 }
- { os: ubuntu-latest, module: learn, python-version: 3.9 }
- { os: ubuntu-latest, module: mars-core, python-version: 3.9 }
- { os: ubuntu-20.04, module: hadoop, python-version: 3.9 }
- { os: ubuntu-latest, module: vineyard, python-version: 3.9 }
- { os: ubuntu-latest, module: external-storage, python-version: 3.9 }
- { os: ubuntu-latest, module: compatibility, python-version: 3.9 }
- { os: ubuntu-latest, module: doc-build, python-version: 3.9 }
- { os: self-hosted, module: gpu, python-version: 3.9}
- { os: [self-hosted, gpu], module: gpu, python-version: 3.11}
- { os: ubuntu-latest, module: jax, python-version: 3.9 }
- { os: juicefs-ci, module: kubernetes-juicefs, python-version: 3.9 }
- { os: ubuntu-latest, module: slurm, python-version: 3.9 }
Expand Down Expand Up @@ -126,11 +126,11 @@ jobs:
minikube-version: 1.31.2

- name: Install ucx dependencies
if: ${{ (matrix.module != 'gpu') && (matrix.os == 'ubuntu-latest') && (matrix.python-version != '3.11') }}
if: ${{ (matrix.module != 'gpu') && (matrix.os == 'ubuntu-latest')}}
run: |
conda install -c conda-forge -c rapidsai ucx-proc=*=cpu ucx ucx-py
- name: Install libomp (macOS)
if: matrix.os == 'macos-latest'
if: ${{ matrix.os == 'macos-latest' || matrix.os == 'macos-13' }}
run: brew install libomp
- name: Install dependencies
env:
Expand All @@ -139,9 +139,9 @@ jobs:
if: ${{ matrix.module != 'gpu' }}
run: |
pip install -e "git+https://github.com/xorbitsai/xoscar.git@main#subdirectory=python&egg=xoscar"
pip install "numpy<2.0.0" scipy cython pyftpdlib coverage flaky "numexpr<2.8.5"
pip install "numpy<2.0.0" scipy cython pyftpdlib coverage flaky numexpr
if [[ "$MODULE" == "xorbits" ]]; then
if [[ "$MODULE" == "xorbits/pandas" ]]; then
pip install openpyxl
fi
if [[ "$MODULE" == "mars-core" ]]; then
Expand Down Expand Up @@ -227,9 +227,6 @@ jobs:
if [[ "$MODULE" == "learn" ]]; then
pip install xgboost lightgbm
fi
if [[ "$MODULE" == "ray-dag" ]] || [[ "$MODULE" == "ray-deploy" ]]; then
pip install "xgboost_ray<0.1.14" "protobuf<4" "sqlalchemy<2"
fi
if [[ "$MODULE" == "compatibility" ]]; then
# test if compatible with older versions
pip install "pandas==1.5.3" "scipy<=1.10.1" "numpy<=1.24.1" "matplotlib<=3.7.0" "pyarrow<12.0.0" "sqlalchemy<2"
Expand Down Expand Up @@ -267,6 +264,10 @@ jobs:
- name: Install on GPU
if: ${{ matrix.module == 'gpu' }}
run: |
pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12==24.8.*
pip install ucx-py-cu12 cython "numpy>=1.14.0,<2.0.0" cloudpickle scikit-learn \
pyyaml psutil tornado sqlalchemy defusedxml tqdm uvloop coverage \
pytest pytest-cov pytest-timeout pytest-forked pytest-asyncio pytest-mock
pip install -U xoscar
python setup.py build_ext -i
working-directory: ./python
Expand All @@ -279,15 +280,44 @@ jobs:
working-directory: ./doc

- name: Test with pytest
if: ${{ matrix.module != 'doc-build' }}
if: ${{ matrix.module != 'doc-build' && matrix.module != 'gpu' }}
env:
MODULE: ${{ matrix.module }}
run: |
if [[ "$MODULE" == "xorbits" ]]; then
pytest --ignore xorbits/_mars/ --ignore xorbits/xgboost --ignore xorbits/lightgbm \
--ignore xorbits/datasets --timeout=1500 \
pytest --ignore xorbits/_mars/ --ignore xorbits/pandas --ignore xorbits/numpy \
--ignore xorbits/xgboost --ignore xorbits/lightgbm --ignore xorbits/sklearn \
--ignore xorbits/datasets \
--ignore xorbits/core/tests/test_execution_exit.py \
--timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xorbits \
-k "not test_execution_with_process_exit_message" \
xorbits
# workaround: this case will hang, run it separately.
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xorbits \
-k "test_execution_with_process_exit_message" \
xorbits/core/tests/test_execution.py
elif [[ "$MODULE" == "xorbits/pandas" ]]; then
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits
--cov-config=setup.cfg --cov-report=xml \
--cov=xorbits xorbits/pandas
elif [[ "$MODULE" == "xorbits/numpy" ]]; then
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xorbits \
-k "not test_numpy_fallback" \
xorbits/numpy
# workaround: this case will hang, run it separately.
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml \
--cov=xorbits \
-k "test_numpy_fallback" \
xorbits/numpy/numpy_adapters/tests/test_numpy_adapters.py
elif [[ "$MODULE" == "mars-core" ]]; then
pytest --forked --log-level=DEBUG --ignore xorbits/_mars/dataframe --ignore xorbits/_mars/tensor \
--ignore xorbits/_mars/learn --ignore xorbits/_mars/remote \
Expand Down Expand Up @@ -343,48 +373,12 @@ jobs:
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xorbits \
xorbits/xgboost xorbits/lightgbm
elif [ "$MODULE" == "ray-deploy" ]; then
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
--log-level=DEBUG --timeout=200 xorbits/_mars --ignore=xorbits/_mars/deploy/oscar/ -m ray
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
--log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray.py -m ray
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
--log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_load_modules.py -m ray
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
--log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_cluster_standalone.py -m ray
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
--log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_client.py -m ray
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
--log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_fault_injection.py -m ray
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
--log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_scheduling.py -m ray
elif [ "$MODULE" == "ray-dag" ]; then
export MARS_CI_BACKEND=ray
export RAY_idle_worker_killing_time_threshold_ms=60000
pytest --cov-config=setup.cfg --cov-report=xml --durations=0 --timeout=500 xorbits/_mars/dataframe \
-v -s -m "not skip_ray_dag" --ignore=xorbits/_mars/dataframe/contrib/raydataset
pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
--timeout=500 xorbits/_mars/dataframe/contrib/raydataset -v -s -m "not skip_ray_dag"
pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
--timeout=500 xorbits/_mars/tensor -v -s -m "not skip_ray_dag"
pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
--timeout=500 xorbits/_mars/learn --ignore xorbits/_mars/learn/contrib \
--ignore xorbits/_mars/learn/utils/tests/test_collect_ports.py -m "not skip_ray_dag"
pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
--timeout=200 xorbits -v -s -m ray_dag
pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
--timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_dag.py
pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
--timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_dag_failover.py
pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
--timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_dag_oscar.py -m ray
elif [ "$MODULE" == "gpu" ]; then
pytest -m cuda --gpu --ignore xorbits/datasets --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits
elif [ "$MODULE" == "jax" ]; then
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/tensor/fuse/tests/test_runtime_fusion.py
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/tensor/
elif [ "$MODULE" == "datasets" ]; then
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/datasets
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/datasets/backends
pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/datasets/tests
elif [ "$MODULE" == "compatibility" ]; then
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
Expand All @@ -398,6 +392,15 @@ jobs:
--cov-config=setup.cfg --cov-report=xml --cov=xorbits/deploy --cov=xorbits xorbits/$MODULE
fi
working-directory: ./python

- name: Test with pytest GPU
if: ${{ matrix.module == 'gpu' }}
run: |
pytest -m cuda --gpu --ignore xorbits/datasets \
--ignore xorbits/sklearn --ignore xorbits/_mars/learn \
--cov-config=setup.cfg --cov-report=xml --cov=xorbits \
xorbits
working-directory: ./python


- name: Cleanup on slurm
Expand All @@ -407,7 +410,7 @@ jobs:
jobqueue_after_script
- name: Report coverage data
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
with:
working-directory: ./python
flags: unittests
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1079,7 +1079,6 @@ def __call__(self, s):
)


@support_cuda
@pytest.mark.parametrize(
"chunked,as_index", [(True, True), (True, False), (False, True), (False, False)]
)
Expand All @@ -1092,6 +1091,7 @@ def test_groupby_apply_as_index(chunked, as_index, setup_gpu, gpu):
}
)

# cudf does not support a UDF like this
def udf(v):
denominator = v["a"].sum() * v["a"].mean()
v = v[v["c"] == "c"]
Expand Down
8 changes: 1 addition & 7 deletions python/xorbits/_mars/dataframe/reduction/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -630,14 +630,8 @@ def _wrap_df(cls, op, value, index=None):
if isinstance(value, (np.generic, int, float, complex)):
value = xdf.DataFrame([value], columns=index)
elif not isinstance(value, xdf.DataFrame):
new_index = None if not op.gpu else getattr(value, "index", None)
dtype = getattr(value, "dtype", None)
if xdf is pd:
value = xdf.DataFrame(value, columns=index, index=new_index)
else: # pragma: no cover
value = xdf.DataFrame(value)
value.index = new_index
value.columns = index
value = xdf.DataFrame(value, columns=index, index=None)
else:
return value

Expand Down
4 changes: 1 addition & 3 deletions python/xorbits/_mars/dataframe/reduction/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1161,9 +1161,7 @@ def _interpret_var(v):
and isinstance(t.op.lhs, DATAFRAME_TYPE)
and isinstance(t.op.rhs, str)
):
# for a cudf dataframe, df == 'foo' doesn't work, so we convert the rhs
# to a tuple.
rhs = f"({rhs},) * len({lhs}.columns)"
rhs = f"{rhs} "
statements = [
f"try:",
f" {var_name} = {lhs}.{func_name}({rhs}, {axis_expr})",
Expand Down
4 changes: 4 additions & 0 deletions python/xorbits/_mars/lib/groupby_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,10 @@ def wrapped_groupby(
groupby_kw.pop("squeeze")
if not _HAS_DROPNA: # pragma: no branch
groupby_kw.pop("dropna")
# cudf does not currently support the `observed` option;
# cudf defaults to True, while the pandas default is False
if is_cudf(obj): # pragma: no branch
groupby_kw["observed"] = True

groupby_obj = obj.groupby(**groupby_kw)
return GroupByWrapper(obj, groupby_obj=groupby_obj, as_index=as_index)
16 changes: 13 additions & 3 deletions python/xorbits/_mars/storage/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,12 @@ def set_buffers_by_sizes(self, sizes: List[int]):
from rmm import DeviceBuffer

self._buffers = [
_convert_to_cupy_ndarray(DeviceBuffer(size=size)) for size in sizes
(
cupy.ndarray(shape=0, dtype="u1")
if size == 0
else _convert_to_cupy_ndarray(DeviceBuffer(size=size))
)
for size in sizes
]

@property
Expand Down Expand Up @@ -96,7 +101,10 @@ def _initialize_read(self):
self._buffers.append(buf.astype("u1", copy=False))
buffer_types.append(["cuda", buf.size])
elif isinstance(buf, Buffer):
self._buffers.append(_convert_to_cupy_ndarray(buf))
if buf.size == 0:
self._buffers.append(cupy.ndarray(shape=0, dtype="u1"))
else:
self._buffers.append(_convert_to_cupy_ndarray(buf))
buffer_types.append(["cuda", buf.size])
else:
size = getattr(buf, "size", len(buf))
Expand Down Expand Up @@ -275,7 +283,9 @@ async def get(self, object_id: str, **kwargs) -> object:
if isinstance(buf, cupy.ndarray):
new_buffers.append(DeviceBuffer(ptr=buf.data.ptr, size=buf.size))
elif isinstance(buf, CPBuffer):
new_buffers.append(DeviceBuffer(ptr=buf.ptr, size=buf.size))
new_buffers.append(
DeviceBuffer(ptr=buf.owner._ptr + buf._offset, size=buf.size)
)
else:
new_buffers.append(buf)
return deserialize(headers, new_buffers)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
SAMPLE_DATASET_IDENTIFIER = "lhoestq/test" # has dataset script
SAMPLE_DATASET_IDENTIFIER2 = "lhoestq/test2" # only has data files
SAMPLE_DATASET_IDENTIFIER3 = (
"mariosasko/test_multi_dir_dataset" # has multiple data directories
"hf-internal-testing/multi_dir_dataset" # has multiple data directories
)
SAMPLE_DATASET_IDENTIFIER4 = "mariosasko/test_imagefolder_with_metadata" # imagefolder with a metadata file outside of the train/test directories
SAMPLE_DATASET_IDENTIFIER4 = "hf-internal-testing/imagefolder_with_metadata" # imagefolder with a metadata file outside of the train/test directories


def test_split_arg_required():
Expand Down

0 comments on commit 2f99f8c

Please sign in to comment.