From e5ea9b2acb0ed12ee6f185259751ace538015e17 Mon Sep 17 00:00:00 2001 From: Weizheng Lu Date: Sat, 17 Aug 2024 11:40:55 +0800 Subject: [PATCH] ENH: Fix tests hang in CI (#794) --- .github/workflows/python.yaml | 117 +++++++++--------- .../groupby/tests/test_groupby_execution.py | 2 +- .../_mars/dataframe/reduction/aggregation.py | 8 +- .../xorbits/_mars/dataframe/reduction/core.py | 4 +- python/xorbits/_mars/lib/groupby_wrapper.py | 4 + python/xorbits/_mars/storage/cuda.py | 16 ++- .../tests/test_huggingface_dataset.py | 4 +- 7 files changed, 82 insertions(+), 73 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 0874c5de6..98db2c7ee 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -67,26 +67,26 @@ jobs: needs: lint env: CONDA_ENV: xorbits-test + SELF_HOST_PYTHON: /root/miniconda3/bin/python + SELF_HOST_CONDA: /root/miniconda3/bin/conda defaults: run: shell: bash -l {0} strategy: fail-fast: false matrix: - os: ["ubuntu-latest", "macos-latest", "windows-latest"] + os: ["ubuntu-latest", "macos-13", "windows-latest"] python-version: ["3.9", "3.10", "3.11"] - module: ["xorbits", "kubernetes"] + module: ["xorbits", "xorbits/numpy", "xorbits/pandas", "kubernetes"] exclude: - - { os: macos-latest, python-version: 3.10} - - { os: macos-latest, python-version: 3.9} + - { os: macos-13, python-version: 3.10} + - { os: macos-13, python-version: 3.9} - { os: windows-latest, python-version: 3.10} - { os: windows-latest, python-version: 3.9} - { os: windows-latest, module: kubernetes} - - { os: macos-latest, module: kubernetes} + - { os: macos-13, module: kubernetes} include: - { os: ubuntu-latest, module: _mars/dataframe, python-version: 3.9 } - - { os: ubuntu-latest, module: _mars/tensor, python-version: 3.9 } - - { os: ubuntu-latest, module: _mars/learn, python-version: 3.9 } - { os: ubuntu-latest, module: learn, python-version: 3.9 } - { os: ubuntu-latest, module: mars-core, python-version: 3.9 } - { os: ubuntu-20.04, module: hadoop, python-version: 3.9 } @@ -94,7 +94,7 @@ jobs: - { os: ubuntu-latest, module: external-storage, python-version: 3.9 } - { os: ubuntu-latest, module: compatibility, python-version: 3.9 } - { os: ubuntu-latest, module: doc-build, python-version: 3.9 } - - { os: self-hosted, module: gpu, python-version: 3.9} + - { os: [self-hosted, gpu], module: gpu, python-version: 3.11} - { os: ubuntu-latest, module: jax, python-version: 3.9 } - { os: juicefs-ci, module: kubernetes-juicefs, python-version: 3.9 } - { os: ubuntu-latest, module: slurm, python-version: 3.9 } @@ -126,11 +126,11 @@ jobs: minikube-version: 1.31.2 - name: Install ucx dependencies - if: ${{ (matrix.module != 'gpu') && (matrix.os == 'ubuntu-latest') && (matrix.python-version != '3.11') }} + if: ${{ (matrix.module != 'gpu') && (matrix.os == 'ubuntu-latest')}} run: | conda install -c conda-forge -c rapidsai ucx-proc=*=cpu ucx ucx-py - name: Install libomp (macOS) - if: matrix.os == 'macos-latest' + if: ${{ matrix.os == 'macos-latest' || matrix.os == 'macos-13' }} run: brew install libomp - name: Install dependencies env: @@ -141,7 +141,7 @@ jobs: pip install -e "git+https://github.com/xorbitsai/xoscar.git@main#subdirectory=python&egg=xoscar" pip install "numpy<2.0.0" scipy cython pyftpdlib coverage flaky numexpr - if [[ "$MODULE" == "xorbits" ]]; then + if [[ "$MODULE" == "xorbits/pandas" ]]; then pip install openpyxl fi if [[ "$MODULE" == "mars-core" ]]; then @@ -227,9 +227,6 @@ jobs: if [[ "$MODULE" == "learn" ]]; then pip install xgboost lightgbm fi - if [[ "$MODULE" == "ray-dag" ]] || [[ "$MODULE" == "ray-deploy" ]]; then - pip install "xgboost_ray<0.1.14" "protobuf<4" "sqlalchemy<2" - fi if [[ "$MODULE" == "compatibility" ]]; then # test if compatible with older versions pip install "pandas==1.5.3" "scipy<=1.10.1" "numpy<=1.24.1" "matplotlib<=3.7.0" "pyarrow<12.0.0" "sqlalchemy<2" @@ -267,6 +264,10 @@ jobs: - name: Install on GPU if: ${{ matrix.module == 'gpu' }} run: | + pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12==24.8.* + pip install ucx-py-cu12 cython "numpy>=1.14.0,<2.0.0" cloudpickle scikit-learn \ + pyyaml psutil tornado sqlalchemy defusedxml tqdm uvloop coverage \ + pytest pytest-cov pytest-timeout pytest-forked pytest-asyncio pytest-mock pip install -U xoscar python setup.py build_ext -i working-directory: ./python @@ -279,15 +280,44 @@ jobs: working-directory: ./doc - name: Test with pytest - if: ${{ matrix.module != 'doc-build' }} + if: ${{ matrix.module != 'doc-build' && matrix.module != 'gpu' }} env: MODULE: ${{ matrix.module }} run: | if [[ "$MODULE" == "xorbits" ]]; then - pytest --ignore xorbits/_mars/ --ignore xorbits/xgboost --ignore xorbits/lightgbm \ - --ignore xorbits/datasets --timeout=1500 \ + pytest --ignore xorbits/_mars/ --ignore xorbits/pandas --ignore xorbits/numpy \ + --ignore xorbits/xgboost --ignore xorbits/lightgbm --ignore xorbits/sklearn \ + --ignore xorbits/datasets \ + --ignore xorbits/core/tests/test_execution_exit.py \ + --timeout=1500 \ + -W ignore::PendingDeprecationWarning \ + --cov-config=setup.cfg --cov-report=xml --cov=xorbits \ + -k "not test_execution_with_process_exit_message" \ + xorbits + # workaround: this case will hang, run it separately. + pytest --timeout=1500 \ + -W ignore::PendingDeprecationWarning \ + --cov-config=setup.cfg --cov-report=xml --cov=xorbits \ + -k "test_execution_with_process_exit_message" \ + xorbits/core/tests/test_execution.py + elif [[ "$MODULE" == "xorbits/pandas" ]]; then + pytest --timeout=1500 \ -W ignore::PendingDeprecationWarning \ - --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits + --cov-config=setup.cfg --cov-report=xml \ + --cov=xorbits xorbits/pandas + elif [[ "$MODULE" == "xorbits/numpy" ]]; then + pytest --timeout=1500 \ + -W ignore::PendingDeprecationWarning \ + --cov-config=setup.cfg --cov-report=xml --cov=xorbits \ + -k "not test_numpy_fallback" \ + xorbits/numpy + # workaround: this case will hang, run it separately. + pytest --timeout=1500 \ + -W ignore::PendingDeprecationWarning \ + --cov-config=setup.cfg --cov-report=xml \ + --cov=xorbits \ + -k "test_numpy_fallback" \ + xorbits/numpy/numpy_adapters/tests/test_numpy_adapters.py elif [[ "$MODULE" == "mars-core" ]]; then pytest --forked --log-level=DEBUG --ignore xorbits/_mars/dataframe --ignore xorbits/_mars/tensor \ --ignore xorbits/_mars/learn --ignore xorbits/_mars/remote \ @@ -343,48 +373,12 @@ jobs: -W ignore::PendingDeprecationWarning \ --cov-config=setup.cfg --cov-report=xml --cov=xorbits \ xorbits/xgboost xorbits/lightgbm - elif [ "$MODULE" == "ray-deploy" ]; then - pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \ - --log-level=DEBUG --timeout=200 xorbits/_mars --ignore=xorbits/_mars/deploy/oscar/ -m ray - pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \ - --log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray.py -m ray - pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \ - --log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_load_modules.py -m ray - pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \ - --log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_cluster_standalone.py -m ray - pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \ - --log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_client.py -m ray - pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \ - --log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_fault_injection.py -m ray - pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \ - --log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_scheduling.py -m ray - elif [ "$MODULE" == "ray-dag" ]; then - export MARS_CI_BACKEND=ray - export RAY_idle_worker_killing_time_threshold_ms=60000 - pytest --cov-config=setup.cfg --cov-report=xml --durations=0 --timeout=500 xorbits/_mars/dataframe \ - -v -s -m "not skip_ray_dag" --ignore=xorbits/_mars/dataframe/contrib/raydataset - pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \ - --timeout=500 xorbits/_mars/dataframe/contrib/raydataset -v -s -m "not skip_ray_dag" - pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \ - --timeout=500 xorbits/_mars/tensor -v -s -m "not skip_ray_dag" - pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \ - --timeout=500 xorbits/_mars/learn --ignore xorbits/_mars/learn/contrib \ - --ignore xorbits/_mars/learn/utils/tests/test_collect_ports.py -m "not skip_ray_dag" - pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \ - --timeout=200 xorbits -v -s -m ray_dag - pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \ - --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_dag.py - pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \ - --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_dag_failover.py - pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \ - --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_dag_oscar.py -m ray - elif [ "$MODULE" == "gpu" ]; then - pytest -m cuda --gpu --ignore xorbits/datasets --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits elif [ "$MODULE" == "jax" ]; then pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/tensor/fuse/tests/test_runtime_fusion.py pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/tensor/ elif [ "$MODULE" == "datasets" ]; then - pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/datasets + pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/datasets/backends + pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/datasets/tests elif [ "$MODULE" == "compatibility" ]; then pytest --timeout=1500 \ -W ignore::PendingDeprecationWarning \ @@ -398,6 +392,15 @@ jobs: --cov-config=setup.cfg --cov-report=xml --cov=xorbits/deploy --cov=xorbits xorbits/$MODULE fi working-directory: ./python + + - name: Test with pytest GPU + if: ${{ matrix.module == 'gpu' }} + run: | + pytest -m cuda --gpu --ignore xorbits/datasets \ + --ignore xorbits/sklearn --ignore xorbits/_mars/learn \ + --cov-config=setup.cfg --cov-report=xml --cov=xorbits \ + xorbits + working-directory: ./python - name: Cleanup on slurm @@ -407,7 +410,7 @@ jobs: jobqueue_after_script - name: Report coverage data - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 with: working-directory: ./python flags: unittests diff --git a/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py index f3ebbc56d..b36823a12 100644 --- a/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py +++ b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py @@ -1079,7 +1079,6 @@ def __call__(self, s): ) -@support_cuda @pytest.mark.parametrize( "chunked,as_index", [(True, True), (True, False), (False, True), (False, False)] ) @@ -1092,6 +1091,7 @@ def test_groupby_apply_as_index(chunked, as_index, setup_gpu, gpu): } ) + # cudf not support udf like this def udf(v): denominator = v["a"].sum() * v["a"].mean() v = v[v["c"] == "c"] diff --git a/python/xorbits/_mars/dataframe/reduction/aggregation.py b/python/xorbits/_mars/dataframe/reduction/aggregation.py index d9aa5768f..3989bb402 100644 --- a/python/xorbits/_mars/dataframe/reduction/aggregation.py +++ b/python/xorbits/_mars/dataframe/reduction/aggregation.py @@ -630,14 +630,8 @@ def _wrap_df(cls, op, value, index=None): if isinstance(value, (np.generic, int, float, complex)): value = xdf.DataFrame([value], columns=index) elif not isinstance(value, xdf.DataFrame): - new_index = None if not op.gpu else getattr(value, "index", None) dtype = getattr(value, "dtype", None) - if xdf is pd: - value = xdf.DataFrame(value, columns=index, index=new_index) - else: # pragma: no cover - value = xdf.DataFrame(value) - value.index = new_index - value.columns = index + value = xdf.DataFrame(value, columns=index, index=None) else: return value diff --git a/python/xorbits/_mars/dataframe/reduction/core.py b/python/xorbits/_mars/dataframe/reduction/core.py index c82b652cd..9e560b78f 100644 --- a/python/xorbits/_mars/dataframe/reduction/core.py +++ b/python/xorbits/_mars/dataframe/reduction/core.py @@ -1161,9 +1161,7 @@ def _interpret_var(v): and isinstance(t.op.lhs, DATAFRAME_TYPE) and isinstance(t.op.rhs, str) ): - # for a cudf dataframe, df == 'foo' doesn't work, so we convert the rhs - # to a tuple. - rhs = f"({rhs},) * len({lhs}.columns)" + rhs = f"{rhs} " statements = [ f"try:", f" {var_name} = {lhs}.{func_name}({rhs}, {axis_expr})", diff --git a/python/xorbits/_mars/lib/groupby_wrapper.py b/python/xorbits/_mars/lib/groupby_wrapper.py index 744813836..ab7b0d0b4 100644 --- a/python/xorbits/_mars/lib/groupby_wrapper.py +++ b/python/xorbits/_mars/lib/groupby_wrapper.py @@ -290,6 +290,10 @@ def wrapped_groupby( groupby_kw.pop("squeeze") if not _HAS_DROPNA: # pragma: no branch groupby_kw.pop("dropna") + # cudf currently not support observed, + # cudf default to True, while pandas default is False + if is_cudf(obj): # pragma: no branch + groupby_kw["observed"] = True groupby_obj = obj.groupby(**groupby_kw) return GroupByWrapper(obj, groupby_obj=groupby_obj, as_index=as_index) diff --git a/python/xorbits/_mars/storage/cuda.py b/python/xorbits/_mars/storage/cuda.py index 08186fc70..14e320e68 100644 --- a/python/xorbits/_mars/storage/cuda.py +++ b/python/xorbits/_mars/storage/cuda.py @@ -67,7 +67,12 @@ def set_buffers_by_sizes(self, sizes: List[int]): from rmm import DeviceBuffer self._buffers = [ - _convert_to_cupy_ndarray(DeviceBuffer(size=size)) for size in sizes + ( + cupy.ndarray(shape=0, dtype="u1") + if size == 0 + else _convert_to_cupy_ndarray(DeviceBuffer(size=size)) + ) + for size in sizes ] @property @@ -96,7 +101,10 @@ def _initialize_read(self): self._buffers.append(buf.astype("u1", copy=False)) buffer_types.append(["cuda", buf.size]) elif isinstance(buf, Buffer): - self._buffers.append(_convert_to_cupy_ndarray(buf)) + if buf.size == 0: + self._buffers.append(cupy.ndarray(shape=0, dtype="u1")) + else: + self._buffers.append(_convert_to_cupy_ndarray(buf)) buffer_types.append(["cuda", buf.size]) else: size = getattr(buf, "size", len(buf)) @@ -275,7 +283,9 @@ async def get(self, object_id: str, **kwargs) -> object: if isinstance(buf, cupy.ndarray): new_buffers.append(DeviceBuffer(ptr=buf.data.ptr, size=buf.size)) elif isinstance(buf, CPBuffer): - new_buffers.append(DeviceBuffer(ptr=buf.owner._ptr, size=buf.size)) + new_buffers.append( + DeviceBuffer(ptr=buf.owner._ptr + buf._offset, size=buf.size) + ) else: new_buffers.append(buf) return deserialize(headers, new_buffers) diff --git a/python/xorbits/datasets/backends/huggingface/tests/test_huggingface_dataset.py b/python/xorbits/datasets/backends/huggingface/tests/test_huggingface_dataset.py index 073ad4c06..7781c2479 100644 --- a/python/xorbits/datasets/backends/huggingface/tests/test_huggingface_dataset.py +++ b/python/xorbits/datasets/backends/huggingface/tests/test_huggingface_dataset.py @@ -29,9 +29,9 @@ SAMPLE_DATASET_IDENTIFIER = "lhoestq/test" # has dataset script SAMPLE_DATASET_IDENTIFIER2 = "lhoestq/test2" # only has data files SAMPLE_DATASET_IDENTIFIER3 = ( - "mariosasko/test_multi_dir_dataset" # has multiple data directories + "hf-internal-testing/multi_dir_dataset" # has multiple data directories ) -SAMPLE_DATASET_IDENTIFIER4 = "mariosasko/test_imagefolder_with_metadata" # imagefolder with a metadata file outside of the train/test directories +SAMPLE_DATASET_IDENTIFIER4 = "hf-internal-testing/imagefolder_with_metadata" # imagefolder with a metadata file outside of the train/test directories def test_split_arg_required():