Skip to content

Commit

Permalink
initial work on adding wheel build for libcudf
Browse files Browse the repository at this point in the history
  • Loading branch information
msarahan committed Apr 23, 2024
1 parent 73306f1 commit a3de193
Show file tree
Hide file tree
Showing 23 changed files with 312 additions and 80 deletions.
11 changes: 10 additions & 1 deletion .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ jobs:
- static-configure
- conda-notebook-tests
- docs-build
- wheel-build-libcudf
- wheel-build-cudf
- wheel-tests-cudf
- wheel-build-dask-cudf
Expand Down Expand Up @@ -118,10 +119,18 @@ jobs:
arch: "amd64"
container_image: "rapidsai/ci-conda:latest"
run_script: "ci/build_docs.sh"
wheel-build-cudf:
wheel-build-libcudf:
needs: checks
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
build_type: pull-request
script: "ci/build_wheel_libcudf.sh"
wheel-build-cudf:
needs: [checks, wheel-build-libcudf]
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
build_type: pull-request
script: "ci/build_wheel_cudf.sh"
Expand Down
16 changes: 12 additions & 4 deletions build.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

# cuDF build script

Expand All @@ -17,12 +17,13 @@ ARGS=$*
# script, and that this script resides in the repo dir!
REPODIR=$(cd $(dirname $0); pwd)

VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats"
HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"<args>\\\"]
VALIDARGS="clean libcudf cudf libcudfwheel cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats"
HELP="$0 [clean] [libcudf] [cudf] [libcudfwheel] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"<args>\\\"]
clean - remove all existing build artifacts and configuration (start
over)
libcudf - build the cudf C++ code only
cudf - build the cudf Python package
libcudfwheel - build the cudf C++ code packaged as a python wheel package
cudfjar - build cudf JAR with static libcudf using devtoolset toolchain
dask_cudf - build the dask_cudf Python package
benchmarks - build benchmarks
Expand Down Expand Up @@ -333,7 +334,14 @@ if buildAll || hasArg libcudf; then
fi
fi

# Build and install the cudf Python package
# Build the libcudf wheel from python/libcudf when buildAll holds or the
# `libcudfwheel` argument was given.
if buildAll || hasArg libcudfwheel; then

cd ${REPODIR}/python/libcudf
# SKBUILD_CMAKE_ARGS is set in the environment of this one python invocation only;
# it carries a ';'-separated list of CMake -D options plus any EXTRA_CMAKE_ARGS.
SKBUILD_CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX};-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR};-DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES};${EXTRA_CMAKE_ARGS}" \
python ${PYTHON_ARGS_FOR_INSTALL} .
fi

# Build and install the cudf Python packages
if buildAll || hasArg cudf; then

cd ${REPODIR}/python/cudf
Expand Down
2 changes: 2 additions & 0 deletions ci/build_wheel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ if ! rapids-is-release-build; then
alpha_spec=',>=0.0.0a0'
fi

sed -r -i "s/libcudf==(.*)\"/libcudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}

if [[ ${package_name} == "dask-cudf" ]]; then
sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
Expand Down
14 changes: 10 additions & 4 deletions ci/build_wheel_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,20 @@

set -euo pipefail

package_name="cudf"
package_dir="python/cudf"

export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON"

./ci/build_wheel.sh cudf ${package_dir}
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
# Downloads libcudf wheel from this current build, then points pip to it in PIP_FIND_LINKS below
RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libcudf_dist

python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
export PIP_FIND_LINKS="/tmp/libcudf_dist"
./ci/build_wheel.sh cudf ${package_dir}

cd ${package_dir}
mkdir -p final_dist
python -m auditwheel repair --exclude libcudf.so --exclude libarrow.so.1400 -w final_dist dist/*

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist
11 changes: 11 additions & 0 deletions ci/build_wheel_libcudf.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

# CI entry point for the libcudf wheel: runs the shared ci/build_wheel.sh script
# for python/libcudf, then passes that package's dist/ directory to
# rapids-upload-wheels-to-s3 under the "cpp" category.

set -euo pipefail

package_dir="python/libcudf"

./ci/build_wheel.sh libcudf "${package_dir}"

# RAPIDS_CUDA_VERSION must be provided by the environment; `set -u` aborts if it is unset.
# Expansions are quoted so the values are never word-split or glob-expanded.
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")"
RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/dist"
5 changes: 3 additions & 2 deletions ci/cudf_pandas_scripts/pandas-tests/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids
rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,pandas-tests]
RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-cudf-dep
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-cudf-dep
python -m pip install --find-links $(pwd)/local-cudf-dep $(ls ./local-cudf-dep/cudf*.whl)[test,pandas-tests]

RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
Expand Down
5 changes: 3 additions & 2 deletions ci/cudf_pandas_scripts/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ if [ "$no_cudf" = true ]; then
echo "Skipping cudf install"
else
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf-pandas-tests]
RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-cudf-dep
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-cudf-dep
python -m pip install --find-links $(pwd)/local-cudf-dep $(ls ./local-cudf-dep/cudf*.whl)[test,cudf-pandas-tests]
fi

python -m pytest -p cudf.pandas ./python/cudf/cudf_pandas_tests/
5 changes: 3 additions & 2 deletions ci/test_wheel_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
set -eou pipefail

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist

# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install $(echo ./dist/cudf*.whl)[test]
python -m pip install --find-links $(pwd)/dist $(echo ./dist/cudf*.whl)[test]

RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
Expand Down
5 changes: 3 additions & 2 deletions ci/test_wheel_dask_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist

# Download the cudf built in the previous step
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
python -m pip install --no-deps ./local-cudf-dep/cudf*.whl
RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-cudf-dep
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-cudf-dep
python -m pip install --no-deps --find-links $(pwd)/local-cudf-dep ./local-cudf-dep/cudf*.whl

# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install $(echo ./dist/dask_cudf*.whl)[test]
Expand Down
22 changes: 22 additions & 0 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ files:
- py_version
- run_common
- run_cudf
- run_libcudf
- run_dask_cudf
- run_custreamz
- test_cpp
Expand Down Expand Up @@ -75,6 +76,18 @@ files:
- docs
- libarrow_run
- py_version
# This is the shared library, bundled as a wheel. It is meant to be consumed by the wrapper.
py_build_libcudf:
output: pyproject
pyproject_dir: python/libcudf
extras:
table: build-system
includes:
- build_base
- build_cpp
- build_python_common
- build_python_cudf
# This is the wrapper that gets used in Python, not the shared library wheel
py_build_cudf:
output: pyproject
pyproject_dir: python/cudf
Expand All @@ -84,6 +97,7 @@ files:
- build_base
- build_python_common
- build_python_cudf
- run_libcudf
py_run_cudf:
output: pyproject
pyproject_dir: python/cudf
Expand All @@ -94,6 +108,7 @@ files:
- run_cudf
- pyarrow_run
- depends_on_cupy
- run_libcudf
py_test_cudf:
output: pyproject
pyproject_dir: python/cudf
Expand Down Expand Up @@ -336,6 +351,13 @@ dependencies:
# Allow runtime version to float up to minor version
# Disallow pyarrow 14.0.0 due to a CVE
- pyarrow>=14.0.1,<15.0.0a0
run_libcudf:
common:
# TODO: Currently this is a hack for devcontainers. Need to figure out the best solution.
#- output_types: [requirements, pyproject]
- output_types: [pyproject]
packages:
- libcudf==24.6.*
cuda_version:
specific:
- output_types: conda
Expand Down
84 changes: 21 additions & 63 deletions python/cudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,72 +24,15 @@ project(
LANGUAGES CXX CUDA
)

option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files"
OFF
)
option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF)
mark_as_advanced(USE_LIBARROW_FROM_PYARROW)

# Find Python early so that later commands can use it
find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)

# If the user requested it we attempt to find CUDF.
if(FIND_CUDF_CPP)
include(rapids-cpm)
include(rapids-export)
include(rapids-find)
rapids_cpm_init()

if(USE_LIBARROW_FROM_PYARROW)
# We need to find arrow before libcudf since libcudf requires it but doesn't bundle arrow
# libraries. These variables have no effect because we are always searching for arrow via
# pyarrow, but they must be set as they are required arguments to the function in
# get_arrow.cmake.
set(CUDF_USE_ARROW_STATIC OFF)
set(CUDF_ENABLE_ARROW_S3 OFF)
set(CUDF_ENABLE_ARROW_ORC OFF)
set(CUDF_ENABLE_ARROW_PYTHON OFF)
set(CUDF_ENABLE_ARROW_PARQUET OFF)
include(../../cpp/cmake/thirdparty/get_arrow.cmake)
endif()
find_package(cudf "${RAPIDS_VERSION}" REQUIRED)

find_package(cudf "${RAPIDS_VERSION}" REQUIRED)

# an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack
# for the interop.pyx
include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
else()
set(cudf_FOUND OFF)
endif()
# an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack
# for the interop.pyx
include(rapids-cpm)
rapids_cpm_init()
include(../../cpp/cmake/thirdparty/get_dlpack.cmake)

include(rapids-cython-core)

if(NOT cudf_FOUND)
set(BUILD_TESTS OFF)
set(BUILD_BENCHMARKS OFF)
set(CUDF_BUILD_TESTUTIL OFF)
set(CUDF_BUILD_STREAMS_TEST_UTIL OFF)
set(CUDA_STATIC_RUNTIME ON)

add_subdirectory(../../cpp cudf-cpp EXCLUDE_FROM_ALL)

# libcudf targets are excluded by default above via EXCLUDE_FROM_ALL to remove extraneous
# components like headers from libcudacxx, but we do need the libraries. However, we want to
# control where they are installed to. Since there are multiple subpackages of cudf._lib that
# require access to libcudf, we place the library and all its dependent artifacts in the cudf
# directory as a single source of truth and modify the other rpaths appropriately.
set(cython_lib_dir cudf)
include(cmake/Modules/WheelHelpers.cmake)
# TODO: This install is currently overzealous. We should only install the libraries that are
# downloaded by CPM during the build, not libraries that were found on the system. However, in
# practice right now this would only be a problem if libcudf was not found but some of the
# dependencies were, and we have no real use cases where that happens.
install_aliased_imported_targets(
TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp
DESTINATION ${cython_lib_dir}
)
endif()

rapids_cython_init()

include(cmake/Modules/LinkPyarrowHeaders.cmake)
Expand All @@ -99,3 +42,18 @@ add_subdirectory(udf_cpp)
if(DEFINED cython_lib_dir)
rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}")
endif()

# libcudf targets are excluded by default above via EXCLUDE_FROM_ALL to remove extraneous components
# like headers from libcudacxx, but we do need the libraries. However, we want to control where they
# are installed to. Since there are multiple subpackages of cudf._lib that require access to
# libcudf, we place the library and all its dependent artifacts in the cudf directory as a single
# source of truth and modify the other rpaths appropriately.
#
# NOTE(review): confirm that cython_lib_dir is set before this point. The only visible assignment,
# `set(cython_lib_dir cudf)`, sits inside the `if(NOT cudf_FOUND)` branch earlier in this file; if
# the variable is unset, DESTINATION below expands to nothing.
include(cmake/Modules/WheelHelpers.cmake)
# TODO: This install is currently overzealous. We should only install the libraries that are
# downloaded by CPM during the build, not libraries that were found on the system. However, in
# practice right now this would only be a problem if libcudf was not found but some of the
# dependencies were, and we have no real use cases where that happens.
install_aliased_imported_targets(
TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp
DESTINATION ${cython_lib_dir}
)
10 changes: 10 additions & 0 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

# If libcudf was installed as a wheel, we must request it to load the library symbols.
# Otherwise, we assume that the library was installed in a system path that ld can find.
try:
    import libcudf
except ModuleNotFoundError as err:
    # Only the absence of the top-level ``libcudf`` package means "no wheel installed".
    # If libcudf is present but one of its own imports is missing, re-raise so the
    # broken installation is reported here instead of being silently ignored.
    if err.name != "libcudf":
        raise
else:
    libcudf.load_library()
    # Drop the module name so it is not left bound in this package's namespace.
    del libcudf

# _setup_numba _must_ be called before numba.cuda is imported, because
# it sets the numba config variable responsible for enabling
# Minor Version Compatibility. Setting it after importing numba.cuda has no effect.
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ target_link_libraries(strings_udf PUBLIC cudf_strings_udf)
set(targets_using_arrow_headers interop avro csv orc json parquet)
link_to_pyarrow_headers("${targets_using_arrow_headers}")

target_include_directories(interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")

add_subdirectory(cpp)
add_subdirectory(io)
add_subdirectory(nvtext)
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,6 @@ rapids_cython_create_modules(
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
)
link_to_pyarrow_headers(pylibcudf_interop)
target_include_directories(pylibcudf_interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")

add_subdirectory(strings)
2 changes: 2 additions & 0 deletions python/cudf/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ build-backend = "scikit_build_core.build"
requires = [
"cmake>=3.26.4",
"cython>=3.0.3",
"libcudf==24.6.*",
"ninja",
"numpy==1.23.*",
"pyarrow==14.0.2.*",
Expand All @@ -28,6 +29,7 @@ dependencies = [
"cuda-python>=11.7.1,<12.0a0",
"cupy-cuda11x>=12.0.0",
"fsspec>=0.6.0",
"libcudf==24.6.*",
"numba>=0.57",
"numpy>=1.23,<2.0a0",
"nvtx>=0.2.1",
Expand Down
Loading

0 comments on commit a3de193

Please sign in to comment.