diff --git a/.clang-format b/.clang-format index 9448dc8d8c80d..abd823c103904 100644 --- a/.clang-format +++ b/.clang-format @@ -19,3 +19,4 @@ BasedOnStyle: Google ColumnLimit: 90 DerivePointerAlignment: false IncludeBlocks: Preserve +IndentPPDirectives: AfterHash diff --git a/.dockerignore b/.dockerignore index 3791cca95e3fe..1f1715d8e833d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -27,11 +27,11 @@ # include explicitly !ci/** !c_glib/Gemfile -!dev/archery/setup.py !dev/release/setup-*.sh !docs/requirements*.txt +!go/go.mod +!go/go.sum !python/requirements*.txt -!python/manylinux1/** !r/DESCRIPTION !ruby/Gemfile !ruby/red-arrow/Gemfile @@ -46,20 +46,3 @@ !ruby/red-parquet/Gemfile !ruby/red-parquet/lib/parquet/version.rb !ruby/red-parquet/red-parquet.gemspec -!ruby/red-plasma/Gemfile -!ruby/red-plasma/lib/plasma/version.rb -!ruby/red-plasma/red-plasma.gemspec -!rust/Cargo.toml -!rust/benchmarks/Cargo.toml -!rust/arrow/Cargo.toml -!rust/arrow/benches -!rust/arrow-flight/Cargo.toml -!rust/parquet/Cargo.toml -!rust/parquet/build.rs -!rust/parquet_derive/Cargo.toml -!rust/parquet_derive_test/Cargo.toml -!rust/datafusion/Cargo.toml -!rust/datafusion/benches -!rust/integration-testing/Cargo.toml -!go/go.mod -!go/go.sum \ No newline at end of file diff --git a/.env b/.env index be35921f94c3a..f41a142490716 100644 --- a/.env +++ b/.env @@ -58,10 +58,8 @@ CUDA=11.2.2 DASK=latest DOTNET=8.0 GCC_VERSION="" -GO=1.21.8 -STATICCHECK=v0.4.7 HDFS=3.2.1 -JDK=8 +JDK=11 KARTOTHEK=latest # LLVM 12 and GCC 11 reports -Wmismatched-new-delete. LLVM=14 @@ -70,7 +68,9 @@ NODE=18 NUMBA=latest NUMPY=latest PANDAS=latest -PYTHON=3.8 +PYTHON=3.9 +PYTHON_IMAGE_TAG=3.9 +PYTHON_ABI_TAG=cp39 R=4.4 SPARK=master TURBODBC=latest @@ -95,7 +95,7 @@ VCPKG="943c5ef1c8f6b5e6ced092b242c8299caae2ff01" # 2024.04.26 Release # ci/docker/python-wheel-windows-vs2019.dockerfile. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-06-18 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-08-06 # Use conanio/${CONAN_BASE}:{CONAN_VERSION} for "docker-compose run --rm conan". # See https://github.com/conan-io/conan-docker-tools#readme and diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e495bfd147de6..793dbb3806f80 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -40,7 +40,7 @@ /matlab/ @kevingurney @kou @sgilmore10 /python/pyarrow/_flight.pyx @lidavidm /python/pyarrow/**/*gandiva* @wjones127 -/r/ @paleolimbot @thisisnic +/r/ @jonkeane @thisisnic /ruby/ @kou /swift/ @kou @@ -53,7 +53,7 @@ # *.txt # PR CI and repository files -/.github/ @assignUser @kou @raulcd +/.github/ @assignUser @jonkeane @kou @raulcd .asf.yaml @assignUser @kou @raulcd .pre-commit-config.yaml @raulcd .travis.yml @assignUser @kou @raulcd @@ -61,11 +61,11 @@ appveyor.yml @assignUser @kou @raulcd # .git* # release scripts, archery etc. 
-/ci/ @assignUser @kou @raulcd -/dev/ @assignUser @kou @raulcd +/ci/ @assignUser @jonkeane @kou @raulcd +/dev/ @assignUser @jonkeane @kou @raulcd .dockerignore @raulcd -.env @assignUser @kou @raulcd -docker-compose.yml @assignUser @kou @raulcd +.env @assignUser @jonkeane @kou @raulcd +docker-compose.yml @assignUser @jonkeane @kou @raulcd # R specific packaging tooling /r/configure* @assignUser diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7d9ff2f42e887..7ba9744ef005d 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -24,13 +24,6 @@ updates: commit-message: prefix: "MINOR: [CI] " open-pull-requests-limit: 10 - - package-ecosystem: "gomod" - directory: "/go/" - schedule: - interval: "weekly" - commit-message: - prefix: "MINOR: [Go] " - open-pull-requests-limit: 10 - package-ecosystem: "maven" directory: "/java/" schedule: diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index c698baba2c816..e448209056d78 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -20,12 +20,14 @@ name: Archery & Crossbow on: push: paths: + - '.dockerignore' - '.github/workflows/archery.yml' - 'dev/archery/**' - 'dev/tasks/**' - 'docker-compose.yml' pull_request: paths: + - '.dockerignore' - '.github/workflows/archery.yml' - 'dev/archery/**' - 'dev/tasks/**' @@ -34,7 +36,6 @@ on: env: ARCHERY_DEBUG: 1 ARCHERY_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} - ARCHERY_USE_DOCKER_CLI: 1 concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} @@ -59,7 +60,7 @@ jobs: shell: bash run: git branch $ARCHERY_DEFAULT_BRANCH origin/$ARCHERY_DEFAULT_BRANCH || true - name: Setup Python - uses: actions/setup-python@v5.1.0 + uses: actions/setup-python@v5.2.0 with: python-version: '3.9' - name: Install pygit2 binary wheel diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index 6b9a9256a5290..2306ed6db0dc9 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -41,7 +41,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index e539fadb859fe..f5c8b6a7201be 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -20,6 +20,7 @@ name: C++ on: push: paths: + - '.dockerignore' - '.github/workflows/cpp.yml' - 'ci/conda_env_*' - 'ci/docker/**' @@ -35,6 +36,7 @@ on: - 'testing' pull_request: paths: + - '.dockerignore' - '.github/workflows/cpp.yml' - 'ci/conda_env_*' - 'ci/docker/**' @@ -99,7 +101,6 @@ jobs: cat <> "$GITHUB_OUTPUT" { "arch": "arm64v8", - "archery-use-docker-cli": "0", "clang-tools": "10", "image": "ubuntu-cpp", "llvm": "10", @@ -124,9 +125,6 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} - # By default, use Docker CLI because docker-compose v1 is obsolete, - # except where the Docker client version is too old. 
- ARCHERY_USE_DOCKER_CLI: ${{ matrix.archery-use-docker-cli || '1' }} ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} CLANG_TOOLS: ${{ matrix.clang-tools }} LLVM: ${{ matrix.llvm }} @@ -147,6 +145,7 @@ jobs: run: | sudo apt update sudo apt install -y --no-install-recommends python3 python3-dev python3-pip + python3 -m pip install -U pip - name: Setup Archery run: python3 -m pip install -e dev/archery[docker] - name: Execute Docker Build @@ -156,8 +155,7 @@ jobs: run: | # GH-40558: reduce ASLR to avoid ASAN/LSAN crashes sudo sysctl -w vm.mmap_rnd_bits=28 - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run ${{ matrix.image }} - name: Docker Push if: >- @@ -189,7 +187,7 @@ jobs: - name: Run minimal example run: | cd cpp/examples/minimal_build - docker-compose run --rm minimal + docker compose run --rm minimal macos: name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} C++ @@ -246,7 +244,7 @@ jobs: $(brew --prefix bash)/bin/bash \ ci/scripts/install_minio.sh latest ${ARROW_HOME} - name: Set up Python - uses: actions/setup-python@v5.1.0 + uses: actions/setup-python@v5.2.0 with: python-version: 3.12 - name: Install Google Cloud Storage Testbench @@ -273,7 +271,7 @@ jobs: shell: bash run: | sudo sysctl -w kern.coredump=1 - sudo sysctl -w kern.corefile=core.%N.%P + sudo sysctl -w kern.corefile=/tmp/core.%N.%P ulimit -c unlimited # must enable within the same shell ci/scripts/cpp_test.sh $(pwd) $(pwd)/build @@ -412,12 +410,10 @@ jobs: ARROW_WITH_SNAPPY: ON ARROW_WITH_ZLIB: ON ARROW_WITH_ZSTD: ON - # Don't use preinstalled Boost by empty BOOST_ROOT and - # -DBoost_NO_BOOST_CMAKE=ON + # Don't use preinstalled Boost by empty BOOST_ROOT BOOST_ROOT: "" ARROW_CMAKE_ARGS: >- -DARROW_PACKAGE_PREFIX=/${{ matrix.msystem_lower}} - -DBoost_NO_BOOST_CMAKE=ON -DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON # We can't use unity build because we don't have enough memory on # GitHub Actions. 
@@ -467,16 +463,18 @@ jobs: https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z chmod +x /usr/local/bin/minio.exe - name: Set up Python - uses: actions/setup-python@v5.1.0 + uses: actions/setup-python@v5.2.0 + id: python-install with: python-version: 3.9 - name: Install Google Cloud Storage Testbench - shell: bash + shell: msys2 {0} + env: + PIPX_BIN_DIR: /usr/local/bin + PIPX_BASE_PYTHON: ${{ steps.python-install.outputs.python-path }} run: | ci/scripts/install_gcs_testbench.sh default - echo "PYTHON_BIN_DIR=$(cygpath --windows $(dirname $(which python3.exe)))" >> $GITHUB_ENV - name: Test shell: msys2 {0} run: | - PATH="$(cygpath --unix ${PYTHON_BIN_DIR}):${PATH}" ci/scripts/cpp_test.sh "$(pwd)" "$(pwd)/build" diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index e4db9f482e206..c618350affbeb 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -49,7 +49,7 @@ jobs: dotnet: ['8.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v4 + uses: actions/setup-dotnet@v4.0.1 with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow @@ -77,7 +77,7 @@ jobs: dotnet: ['8.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v4 + uses: actions/setup-dotnet@v4.0.1 with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow @@ -104,11 +104,11 @@ jobs: dotnet: ['8.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v4 + uses: actions/setup-dotnet@v4.0.1 with: dotnet-version: ${{ matrix.dotnet }} - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Checkout Arrow diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 5aec3638a8967..d2436fe3c4525 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -31,7 +31,6 @@ permissions: env: ARCHERY_DEBUG: 1 - ARCHERY_USE_DOCKER_CLI: 1 jobs: @@ -46,7 +45,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Install pre-commit @@ -67,9 +66,9 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} + UBUNTU: 22.04 run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run -e GITHUB_ACTIONS=true ubuntu-lint - name: Docker Push if: >- @@ -105,7 +104,7 @@ jobs: with: fetch-depth: 0 - name: Install Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: '3.12' - name: Install Ruby @@ -113,7 +112,7 @@ jobs: with: ruby-version: ruby - name: Install .NET - uses: actions/setup-dotnet@4d6c8fcf3c8f7a60068d26b594648e99df24cee3 # v4.0.0 + uses: actions/setup-dotnet@6bd8b7f7774af54e05809fcc5431931b3eb1ddee # v4.0.1 with: dotnet-version: '8.0.x' - name: Install Dependencies diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 36a0dc014db8d..1219f7526f9f2 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -25,7 +25,6 @@ permissions: env: ARCHERY_DEBUG: 1 - ARCHERY_USE_DOCKER_CLI: 1 ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" @@ 
-53,7 +52,7 @@ jobs: key: debian-docs-${{ hashFiles('cpp/**') }} restore-keys: debian-docs- - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index 947e2ac21b83c..7d540b7cecdc9 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -20,6 +20,7 @@ name: Docs on: pull_request: paths: + - '.dockerignore' - 'docs/**' - '.github/workflows/docs_light.yml' - 'ci/docker/conda.dockerfile' @@ -34,7 +35,6 @@ permissions: env: ARCHERY_DEBUG: 1 - ARCHERY_USE_DOCKER_CLI: 1 ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" @@ -59,7 +59,7 @@ jobs: key: conda-docs-${{ hashFiles('cpp/**') }} restore-keys: conda-docs- - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml deleted file mode 100644 index c247a89128b34..0000000000000 --- a/.github/workflows/go.yml +++ /dev/null @@ -1,488 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -name: Go - -on: - push: - paths: - - '.github/workflows/go.yml' - - 'ci/docker/*_go.dockerfile' - - 'ci/scripts/go_*' - - 'docker-compose.yml' - - 'go/**' - pull_request: - paths: - - '.github/workflows/go.yml' - - 'ci/docker/*_go.dockerfile' - - 'ci/docker/**' - - 'ci/scripts/go_*' - - 'docker-compose.yml' - - 'go/**' - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -permissions: - contents: read - -env: - ARCHERY_DEBUG: 1 - ARCHERY_USE_DOCKER_CLI: 1 - -jobs: - - docker-targets: - name: Docker targets - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - outputs: - targets: ${{ steps.detect-targets.outputs.targets }} - steps: - - name: Detect targets - id: detect-targets - run: | - echo "targets<> "$GITHUB_OUTPUT" - echo "[" >> "$GITHUB_OUTPUT" - cat <> "$GITHUB_OUTPUT" - { - "arch-label": "AMD64", - "arch": "amd64", - "go": "1.21", - "runs-on": "ubuntu-latest" - }, - { - "arch-label": "AMD64", - "arch": "amd64", - "go": "1.22", - "runs-on": "ubuntu-latest" - } - JSON - if [ "$GITHUB_REPOSITORY_OWNER" = "apache" ]; then - echo "," >> "$GITHUB_OUTPUT" - cat <> "$GITHUB_OUTPUT" - { - "arch-label": "ARM64", - "arch": "arm64v8", - "archery-use-docker-cli": "0", - "go": "1.21", - "runs-on": ["self-hosted", "arm", "linux"] - }, - { - "arch-label": "ARM64", - "arch": "arm64v8", - "archery-use-docker-cli": "0", - "go": "1.22", - "runs-on": ["self-hosted", "arm", "linux"] - } - JSON - fi - echo "]" >> "$GITHUB_OUTPUT" - echo "JSON" >> "$GITHUB_OUTPUT" - - docker: - name: ${{ matrix.arch-label }} Debian 12 Go ${{ matrix.go }} - needs: docker-targets - runs-on: ${{ matrix.runs-on }} - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 60 - strategy: - fail-fast: false - matrix: - include: ${{ fromJson(needs.docker-targets.outputs.targets) }} - env: - ARCH: ${{ matrix.arch }} - # By default, use Docker CLI because docker-compose v1 is obsolete, - # except where the Docker client version is too old. 
- ARCHERY_USE_DOCKER_CLI: ${{ matrix.archery-use-docker-cli || '1' }} - GO: ${{ matrix.go }} - steps: - - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 - with: - fetch-depth: 0 - submodules: recursive - - name: Setup Python - run: | - sudo apt update - sudo apt install -y --no-install-recommends python3 python3-dev python3-pip - - name: Setup Archery - run: python3 -m pip install -e dev/archery[docker] - - name: Execute Docker Build - env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: archery docker run debian-go - - name: Docker Push - if: >- - success() && - github.event_name == 'push' && - github.repository == 'apache/arrow' && - github.ref_name == 'main' - env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - run: archery docker push debian-go - - name: Install Go ${{ matrix.go }} for Benchmarks - if: >- - success() && - matrix.arch == 'amd64' && - github.event_name == 'push' && - github.repository == 'apache/arrow' && - github.ref_name == 'main' - uses: actions/setup-go@0c52d547c9bc32b1aa3301fd7a9cb496313a4491 # v5.0.0 - with: - go-version: ${{ matrix.go }} - cache: true - cache-dependency-path: go/go.sum - - name: Run Benchmarks - if: >- - success() && - matrix.arch == 'amd64' && - github.event_name == 'push' && - github.repository == 'apache/arrow' && - github.ref_name == 'main' - env: - CONBENCH_URL: https://conbench.ursa.dev - CONBENCH_EMAIL: ${{ secrets.CONBENCH_EMAIL }} - CONBENCH_PASSWORD: ${{ secrets.CONBENCH_PASS }} - CONBENCH_REF: ${{ github.ref_name }} - CONBENCH_MACHINE_INFO_NAME: ${{ matrix.arch }}-debian-12 - run: | - python3 -m pip install benchadapt@git+https://github.com/conbench/conbench.git@main#subdirectory=benchadapt/python - python3 ci/scripts/go_bench_adapt.py - - build_test_386: - name: Go Cross-build and test for 386 - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 20 - steps: - - name: Checkout Arrow - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Get required Go version - run: | - (. .env && echo "GO_VERSION=${GO}") >> $GITHUB_ENV - - name: Install Go - uses: actions/setup-go@v5 - with: - go-version: "${{ env.GO_VERSION }}" - cache: true - cache-dependency-path: go/go.sum - - name: Run build - run: GOARCH=386 go build ./... - working-directory: ./go - - name: Run test - # WIP refactor, only tests in the specified dirs have been fixed - run: GOARCH=386 go test ./parquet/file/... 
- working-directory: ./go - - docker_cgo: - name: AMD64 Debian 12 Go ${{ matrix.go }} - CGO - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 20 - strategy: - fail-fast: false - matrix: - go: ['1.21', '1.22'] - env: - GO: ${{ matrix.go }} - steps: - - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 - with: - fetch-depth: 0 - submodules: recursive - - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: archery docker run debian-go-cgo - - name: Docker Push - if: >- - success() && - github.event_name == 'push' && - github.repository == 'apache/arrow' && - github.ref_name == 'main' - env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - run: archery docker push debian-go-cgo - - - docker_cgo_python: - name: AMD64 Debian 12 Go ${{ matrix.go }} - CGO Python - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 15 - strategy: - fail-fast: false - matrix: - go: ['1.21', '1.22'] - env: - GO: ${{ matrix.go }} - steps: - - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 - with: - fetch-depth: 0 - - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: archery docker run debian-go-cgo-python - - name: Docker Push - if: >- - success() && - github.event_name == 'push' && - github.repository == 'apache/arrow' && - github.ref_name == 'main' - env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - run: archery docker push debian-go-cgo-python - - windows: - name: AMD64 Windows 2019 Go ${{ matrix.go }} - runs-on: windows-2019 - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 25 - strategy: - fail-fast: false - matrix: - go: ['1.21', '1.22'] - steps: - - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 - with: - fetch-depth: 0 - submodules: recursive - - name: Install go - uses: actions/setup-go@0c52d547c9bc32b1aa3301fd7a9cb496313a4491 # v5.0.0 - with: - go-version: ${{ matrix.go }} - cache: true - cache-dependency-path: go/go.sum - - name: Install staticcheck - shell: bash - run: | - . 
.env - go install honnef.co/go/tools/cmd/staticcheck@${STATICCHECK} - - name: Build - shell: bash - run: ci/scripts/go_build.sh $(pwd) - - name: Test - shell: bash - run: ci/scripts/go_test.sh $(pwd) - - macos: - name: AMD64 macOS 12 Go ${{ matrix.go }} - runs-on: macos-12 - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 60 - strategy: - fail-fast: false - matrix: - go: ['1.21', '1.22'] - steps: - - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 - with: - fetch-depth: 0 - submodules: recursive - - name: Install go - uses: actions/setup-go@0c52d547c9bc32b1aa3301fd7a9cb496313a4491 # v5.0.0 - with: - go-version: ${{ matrix.go }} - cache: true - cache-dependency-path: go/go.sum - - name: Install staticcheck - run: | - . .env - go install honnef.co/go/tools/cmd/staticcheck@${STATICCHECK} - - name: Build - shell: bash - run: ci/scripts/go_build.sh $(pwd) - - name: Test - shell: bash - run: ci/scripts/go_test.sh $(pwd) - - name: Setup Python - if: >- - success() && - github.event_name == 'push' && - github.repository == 'apache/arrow' && - github.ref_name == 'main' - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 - with: - python-version: '3.10' - - name: Run Benchmarks - if: >- - success() && - github.event_name == 'push' && - github.repository == 'apache/arrow' && - github.ref_name == 'main' - shell: bash - env: - CONBENCH_URL: 'https://conbench.ursa.dev' - CONBENCH_EMAIL: ${{ secrets.CONBENCH_EMAIL }} - CONBENCH_PASSWORD: ${{ secrets.CONBENCH_PASS }} - CONBENCH_REF: ${{ github.ref_name }} - CONBENCH_MACHINE_INFO_NAME: amd64-macos-11 - run: | - pip install benchadapt@git+https://github.com/conbench/conbench.git@main#subdirectory=benchadapt/python - python ci/scripts/go_bench_adapt.py - - - macos-cgo: - name: AMD64 macOS 12 Go ${{ matrix.go }} - CGO - runs-on: macos-12 - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 60 - strategy: - fail-fast: false - matrix: - go: ['1.21', '1.22'] - env: - ARROW_GO_TESTCGO: "1" - steps: - - name: Checkout Arrow - uses: actions/checkout@v4 - with: - fetch-depth: 0 - submodules: recursive - - name: Install go - uses: actions/setup-go@v5 - with: - go-version: ${{ matrix.go }} - cache: true - cache-dependency-path: go/go.sum - - name: Brew Install Arrow and pkg-config - shell: bash - run: brew install apache-arrow pkg-config - - name: Install staticcheck - run: | - . 
.env - go install honnef.co/go/tools/cmd/staticcheck@${STATICCHECK} - - name: Add To pkg config path - shell: bash - run: | - echo "PKG_CONFIG_PATH=$(brew --prefix openssl@3)/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV - - name: Build - shell: bash - run: ci/scripts/go_build.sh $(pwd) - - name: Test - shell: bash - run: ci/scripts/go_test.sh $(pwd) - - windows-mingw: - name: AMD64 Windows MinGW ${{ matrix.mingw-n-bits }} CGO - runs-on: windows-2019 - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 60 - strategy: - fail-fast: false - matrix: - mingw-n-bits: - #- 32 runtime handling for CGO needs 64-bit currently - - 64 - env: - ARROW_GO_TESTCGO: "1" - MINGW_LINT: "1" - steps: - - name: Disable Crash Dialogs - run: | - reg add ` - "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" ` - /v DontShowUI ` - /t REG_DWORD ` - /d 1 ` - /f - - name: Checkout Arrow - uses: actions/checkout@v4 - with: - fetch-depth: 0 - submodules: recursive - - uses: msys2/setup-msys2@v2 - with: - msystem: MINGW${{ matrix.mingw-n-bits }} - update: true - - name: Setup MSYS2 - shell: msys2 {0} - run: | - ci/scripts/msys2_setup.sh cgo - - name: Get required Go version - run: | - (. .env && echo "GO_VERSION=${GO}") >> $GITHUB_ENV - - name: Update CGO Env vars - shell: msys2 {0} - run: | - echo "CGO_CPPFLAGS=-I$(cygpath --windows ${MINGW_PREFIX}/include)" >> $GITHUB_ENV - echo "CGO_LDFLAGS=-g -O2 -L$(cygpath --windows ${MINGW_PREFIX}/lib) -L$(cygpath --windows ${MINGW_PREFIX}/bin)" >> $GITHUB_ENV - echo "MINGW_PREFIX=$(cygpath --windows ${MINGW_PREFIX})" >> $GITHUB_ENV - - name: Install go - uses: actions/setup-go@v5 - with: - go-version: "${{ env.GO_VERSION }}" - cache: true - cache-dependency-path: go/go.sum - - name: Install staticcheck - shell: bash - run: | - . 
.env - go install honnef.co/go/tools/cmd/staticcheck@${STATICCHECK} - - name: Build - shell: bash - run: ci/scripts/go_build.sh $(pwd) - - name: Test - shell: bash - run: ci/scripts/go_test.sh $(pwd) - - tinygo: - name: TinyGo - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - env: - TINYGO_VERSION: 0.27.0 - timeout-minutes: 60 - steps: - - name: Checkout Arrow - uses: actions/checkout@v4 - with: - fetch-depth: 0 - submodules: recursive - - name: Build and Run Example - run: | - docker run --rm -v $(pwd)/go:/src -v $(pwd)/ci/scripts:/ci-scripts "tinygo/tinygo:$TINYGO_VERSION" /ci-scripts/go_tinygo_example.sh diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index f53f4aeb505d2..af9a98ed437f8 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -20,6 +20,7 @@ name: Integration on: push: paths: + - '.dockerignore' - '.github/workflows/integration.yml' - 'ci/**' - 'dev/archery/**' @@ -33,6 +34,7 @@ on: - 'format/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/integration.yml' - 'ci/**' - 'dev/archery/**' @@ -54,7 +56,6 @@ permissions: env: ARCHERY_DEBUG: 1 - ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -80,6 +81,11 @@ jobs: with: repository: apache/arrow-nanoarrow path: nanoarrow + - name: Checkout Arrow Go + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + repository: apache/arrow-go + path: go - name: Free up disk space run: | ci/scripts/util_free_space.sh @@ -90,18 +96,20 @@ jobs: key: conda-${{ hashFiles('cpp/**') }} restore-keys: conda- - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: - python-version: 3.8 + python-version: 3.12 - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: > + run: | + source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ + -e ARCHERY_INTEGRATION_WITH_GO=1 \ -e ARCHERY_INTEGRATION_WITH_NANOARROW=1 \ -e ARCHERY_INTEGRATION_WITH_RUST=1 \ conda-integration diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 08dbe7c8068c0..ad39dbc7d01e6 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -20,6 +20,7 @@ name: Java on: push: paths: + - '.dockerignore' - '.github/workflows/java.yml' - 'ci/docker/*java*' - 'ci/scripts/java*.sh' @@ -29,6 +30,7 @@ on: - 'java/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/java.yml' - 'ci/docker/*java*' - 'ci/scripts/java*.sh' @@ -46,7 +48,6 @@ permissions: env: ARCHERY_DEBUG: 1 - ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -58,7 +59,7 @@ jobs: strategy: fail-fast: false matrix: - jdk: [8, 11, 17, 21, 22] + jdk: [11, 17, 21, 22] maven: [3.9.6] image: [java] env: @@ -77,9 +78,9 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: - python-version: 3.8 + python-version: 3.12 - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build diff --git 
a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index ea5f8d694a9c6..56aa1d0992887 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -20,6 +20,7 @@ name: Java JNI on: push: paths: + - '.dockerignore' - '.github/workflows/java_jni.yml' - 'ci/docker/**' - 'ci/scripts/cpp_build.sh' @@ -29,6 +30,7 @@ on: - 'java/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/java_jni.yml' - 'ci/docker/**' - 'ci/scripts/cpp_build.sh' @@ -46,7 +48,6 @@ permissions: env: ARCHERY_DEBUG: 1 - ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -71,16 +72,18 @@ jobs: key: java-jni-manylinux-2014-${{ hashFiles('cpp/**', 'java/**') }} restore-keys: java-jni-manylinux-2014- - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: - python-version: 3.8 + python-version: 3.12 - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: archery docker run java-jni-manylinux-2014 + run: | + source ci/scripts/util_enable_core_dumps.sh + archery docker run java-jni-manylinux-2014 - name: Docker Push if: >- success() && @@ -111,9 +114,9 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: - python-version: 3.8 + python-version: 3.12 - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build diff --git a/.github/workflows/java_nightly.yml b/.github/workflows/java_nightly.yml index f40d4ce5b42d6..0bf0c27288faf 100644 --- a/.github/workflows/java_nightly.yml +++ b/.github/workflows/java_nightly.yml @@ -58,7 +58,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: cache: 'pip' python-version: 3.12 diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index c11c8254011f6..c7693c05133b0 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -20,12 +20,14 @@ name: NodeJS on: push: paths: + - '.dockerignore' - '.github/workflows/js.yml' - 'ci/docker/*js.dockerfile' - 'ci/scripts/js_*' - 'js/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/js.yml' - 'ci/docker/*js.dockerfile' - 'ci/scripts/js_*' @@ -40,7 +42,6 @@ permissions: env: ARCHERY_DEBUG: 1 - ARCHERY_USE_DOCKER_CLI: 1 jobs: @@ -55,9 +56,9 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: - python-version: 3.8 + python-version: 3.12 - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build @@ -65,8 +66,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run debian-js - name: Docker Push if: >- diff --git a/.github/workflows/pr_bot.yml b/.github/workflows/pr_bot.yml 
index e589610f536b3..bbb1a2d7228d0 100644 --- a/.github/workflows/pr_bot.yml +++ b/.github/workflows/pr_bot.yml @@ -82,7 +82,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/pr_review_trigger.yml b/.github/workflows/pr_review_trigger.yml index 0cd89b3206715..68f922ce8b4d9 100644 --- a/.github/workflows/pr_review_trigger.yml +++ b/.github/workflows/pr_review_trigger.yml @@ -29,7 +29,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Upload PR review Payload" - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4.4.0 with: path: "${{ github.event_path }}" name: "pr_review_payload" diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index a568f8346e7fc..4916287556b0c 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -20,6 +20,7 @@ name: Python on: push: paths: + - '.dockerignore' - '.github/workflows/python.yml' - 'ci/**' - 'cpp/**' @@ -27,6 +28,7 @@ on: - 'python/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/python.yml' - 'ci/**' - 'cpp/**' @@ -42,7 +44,6 @@ permissions: env: ARCHERY_DEBUG: 1 - ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -57,35 +58,41 @@ jobs: matrix: name: - conda-python-docs - - conda-python-3.9-nopandas - - conda-python-3.8-pandas-1.0 - - conda-python-3.10-pandas-latest + - conda-python-3.10-nopandas + - conda-python-3.9-pandas-1.1.3 + - conda-python-3.11-pandas-latest + - conda-python-3.11-no-numpy include: - name: conda-python-docs - cache: conda-python-3.9 + cache: conda-python-3.10 image: conda-python-docs - title: AMD64 Conda Python 3.9 Sphinx & Numpydoc - python: 3.9 - - name: conda-python-3.9-nopandas - cache: conda-python-3.9 + title: AMD64 Conda Python 3.10 Sphinx & Numpydoc + python: "3.10" + - name: conda-python-3.10-nopandas + cache: conda-python-3.10 image: conda-python - title: AMD64 Conda Python 3.9 Without Pandas - python: 3.9 - - name: conda-python-3.8-pandas-1.0 - cache: conda-python-3.8 + title: AMD64 Conda Python 3.10 Without Pandas + python: "3.10" + - name: conda-python-3.9-pandas-1.1.3 + cache: conda-python-3.9 image: conda-python-pandas - title: AMD64 Conda Python 3.8 Pandas 1.0 - python: 3.8 - pandas: "1.0" - numpy: 1.16 - - name: conda-python-3.10-pandas-latest - cache: conda-python-3.10 + title: AMD64 Conda Python 3.9 Pandas 1.1.3 + python: 3.9 + pandas: "1.1.3" + numpy: 1.19.5 + - name: conda-python-3.11-pandas-latest + cache: conda-python-3.11 image: conda-python-pandas - title: AMD64 Conda Python 3.10 Pandas latest - python: "3.10" + title: AMD64 Conda Python 3.11 Pandas latest + python: "3.11" pandas: latest + - name: conda-python-3.11-no-numpy + cache: conda-python-3.11 + image: conda-python-no-numpy + title: AMD64 Conda Python 3.11 without NumPy + python: "3.11" env: - PYTHON: ${{ matrix.python || 3.8 }} + PYTHON: ${{ matrix.python || 3.9 }} UBUNTU: ${{ matrix.ubuntu || 20.04 }} PANDAS: ${{ matrix.pandas || 'latest' }} NUMPY: ${{ matrix.numpy || 'latest' }} @@ -102,9 +109,9 @@ jobs: key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} restore-keys: ${{ matrix.cache }}- - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: 
actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: - python-version: 3.8 + python-version: 3.12 - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build @@ -112,8 +119,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run ${{ matrix.image }} - name: Docker Push if: >- @@ -164,7 +170,7 @@ jobs: ARROW_BUILD_TESTS: OFF PYARROW_TEST_LARGE_MEMORY: ON # Current oldest supported version according to https://endoflife.date/macos - MACOSX_DEPLOYMENT_TARGET: 10.15 + MACOSX_DEPLOYMENT_TARGET: 12.0 steps: - name: Checkout Arrow uses: actions/checkout@v4 @@ -172,7 +178,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@v5.1.0 + uses: actions/setup-python@v5.2.0 with: python-version: '3.11' - name: Install Dependencies @@ -182,6 +188,10 @@ jobs: python -m pip install \ -r python/requirements-build.txt \ -r python/requirements-test.txt + - name: Install MinIO + run: | + $(brew --prefix bash)/bin/bash \ + ci/scripts/install_minio.sh latest /usr/local - name: Setup ccache shell: bash run: ci/scripts/ccache_setup.sh diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index e8f57db99c28c..9abedcd767150 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -20,6 +20,7 @@ name: R on: push: paths: + - '.dockerignore' - ".github/workflows/r.yml" - "ci/docker/**" - "ci/etc/rprofile" @@ -32,6 +33,7 @@ on: - "r/**" pull_request: paths: + - '.dockerignore' - ".github/workflows/r.yml" - "ci/docker/**" - "ci/etc/rprofile" @@ -52,7 +54,6 @@ permissions: env: ARCHERY_DEBUG: 1 - ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -122,7 +123,7 @@ jobs: fail-fast: false matrix: r: ["4.4"] - ubuntu: [20.04] + ubuntu: [24.04] force-tests: ["true"] env: R: ${{ matrix.r }} @@ -133,6 +134,9 @@ jobs: with: fetch-depth: 0 submodules: recursive + - name: Free up disk space + run: | + ci/scripts/util_free_space.sh - name: Cache Docker Volumes uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 with: @@ -144,9 +148,9 @@ jobs: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}- ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}- - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: - python-version: 3.8 + python-version: 3.12 - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build @@ -154,8 +158,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh # Setting a non-default and non-probable Marquesas French Polynesia time # it has both with a .45 offset and very very few people who live there. 
archery docker run -e TZ=MART -e ARROW_R_FORCE_TESTS=${{ matrix.force-tests }} ubuntu-r @@ -167,9 +170,9 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: - name: test-output + name: test-output-${{ matrix.ubuntu }}-${{ matrix.r }} path: r/check/arrow.Rcheck/tests/testthat.Rout* - name: Docker Push if: >- @@ -204,9 +207,9 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: - python-version: 3.8 + python-version: 3.12 - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build @@ -214,8 +217,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh # Don't set a TZ here to test that case. These builds will have the following warning in them: # System has not been booted with systemd as init system (PID 1). Can't operate. # Failed to connect to bus: Host is down @@ -228,9 +230,9 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: - name: test-output + name: test-output-bundled path: r/check/arrow.Rcheck/tests/testthat.Rout* - name: Docker Push if: >- @@ -290,7 +292,7 @@ jobs: # So that they're unique when multiple are downloaded in the next step shell: bash run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # # v4.0.0 with: name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip @@ -328,7 +330,7 @@ jobs: echo "$HOME/.local/bin" >> $GITHUB_PATH - run: mkdir r/windows - name: Download artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4.1.8 with: name: libarrow-rtools40-ucrt64.zip path: r/windows diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml index af5382f90834c..9817e41d3b61d 100644 --- a/.github/workflows/r_nightly.yml +++ b/.github/workflows/r_nightly.yml @@ -60,7 +60,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: cache: 'pip' python-version: 3.12 diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 6a29ec8e72cab..83a066dc27386 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -20,6 +20,7 @@ name: C GLib & Ruby on: push: paths: + - '.dockerignore' - '.github/workflows/ruby.yml' - 'ci/docker/**' - 'ci/scripts/c_glib_*' @@ -33,6 +34,7 @@ on: - 'ruby/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/ruby.yml' - 'ci/docker/**' - 'ci/scripts/c_glib_*' @@ -54,7 +56,6 @@ permissions: env: ARCHERY_DEBUG: 1 - ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -84,9 
+85,9 @@ jobs: key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} restore-keys: ubuntu-${{ matrix.ubuntu }}-ruby- - name: Setup Python - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: - python-version: 3.8 + python-version: 3.12 - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build @@ -94,8 +95,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e ARROW_FLIGHT=ON \ -e ARROW_FLIGHT_SQL=ON \ @@ -407,7 +407,10 @@ jobs: -source "https://nuget.pkg.github.com/$GITHUB_REPOSITORY_OWNER/index.json" - name: Build C++ vcpkg dependencies run: | - vcpkg\vcpkg.exe install --triplet $env:VCPKG_TRIPLET --x-manifest-root cpp --x-install-root build\cpp\vcpkg_installed + vcpkg\vcpkg.exe install ` + --triplet $env:VCPKG_TRIPLET ` + --x-manifest-root cpp ` + --x-install-root build\cpp\vcpkg_installed - name: Build C++ shell: cmd run: | diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml index 3f039315b505a..87aa5cb83f714 100644 --- a/.github/workflows/swift.yml +++ b/.github/workflows/swift.yml @@ -20,6 +20,7 @@ name: Swift on: push: paths: + - '.dockerignore' - '.github/workflows/swift.yml' - 'ci/docker/*swift*' - 'ci/scripts/swift_*' @@ -27,6 +28,7 @@ on: - 'swift/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/swift.yml' - 'ci/docker/*swift*' - 'ci/scripts/swift_*' @@ -42,7 +44,6 @@ permissions: env: ARCHERY_DEBUG: 1 - ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: @@ -64,8 +65,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run ubuntu-swift - name: Docker Push if: >- diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bf0bcde14622a..bee20369c017e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -78,6 +78,26 @@ repos: ?^cpp/src/generated/| ?^cpp/thirdparty/| ) + - repo: https://github.com/cpplint/cpplint + rev: 1.6.1 + hooks: + - id: cpplint + name: C++ Lint + args: + - "--verbose=2" + types_or: + - c++ + files: >- + ^cpp/ + exclude: >- + ( + ?\.grpc\.fb\.(cc|h)$| + ?\.pb\.(cc|h)$| + ?_generated.*\.(cc|h)$| + ?^cpp/src/arrow/vendored/| + ?^cpp/src/generated/| + ?^cpp/thirdparty/| + ) - repo: https://github.com/pre-commit/mirrors-clang-format rev: v14.0.6 hooks: @@ -148,17 +168,3 @@ repos: '--disable', 'dangling-hyphen,line-too-long', ] - - repo: https://github.com/golangci/golangci-lint - rev: v1.59.0 - hooks: - # no built-in support for multiple go.mod - # https://github.com/golangci/golangci-lint/issues/828 - - id: golangci-lint-full - name: golangci-lint-full-arrow - entry: bash -c 'cd go/arrow && golangci-lint run' - - id: golangci-lint-full - name: golangci-lint-full-parquet - entry: bash -c 'cd go/parquet && golangci-lint run' - - id: golangci-lint-full - name: golangci-lint-full-internal - entry: bash -c 'cd go/internal && golangci-lint run' diff --git a/CPPLINT.cfg b/CPPLINT.cfg new file mode 100644 index 0000000000000..2f47b4dbf57b7 --- /dev/null +++ b/CPPLINT.cfg @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation 
(ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +filter = -build/c++11 +filter = -build/header_guard +filter = -build/include_order +filter = -build/include_what_you_use +filter = -readability/alt_tokens +# readability/casting is disabled as it aggressively warns about +# functions with names like "int32", so "int32(x)", where int32 is a +# function name, warns with +filter = -readability/casting +filter = -readability/todo +filter = -runtime/references +filter = -whitespace/comments +linelength = 90 diff --git a/appveyor.yml b/appveyor.yml index 5954251d34733..9e4582f1d8d7f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -24,6 +24,7 @@ only_commits: - appveyor.yml - ci/appveyor* - ci/conda* + - ci/scripts/*.bat - cpp/ - format/ - python/ diff --git a/c_glib/Gemfile b/c_glib/Gemfile index d32bc87ba72c6..cc6adecabe230 100644 --- a/c_glib/Gemfile +++ b/c_glib/Gemfile @@ -20,4 +20,4 @@ source "https://rubygems.org/" gem "test-unit" -gem "gobject-introspection", ">= 4.1.1" +gem "gobject-introspection", ">= 4.2.3" diff --git a/c_glib/arrow-cuda-glib/meson.build b/c_glib/arrow-cuda-glib/meson.build index 47bed70f03b60..36730dec6c4b7 100644 --- a/c_glib/arrow-cuda-glib/meson.build +++ b/c_glib/arrow-cuda-glib/meson.build @@ -58,14 +58,15 @@ libarrow_cuda_glib = library('arrow-cuda-glib', arrow_cuda_glib = declare_dependency(link_with: libarrow_cuda_glib, include_directories: base_include_directories, dependencies: dependencies) - -pkgconfig.generate(libarrow_cuda_glib, - description: 'C API for Apache Arrow CUDA based on GLib', - filebase: 'arrow-cuda-glib', - name: 'Apache Arrow CUDA GLib', - requires: ['arrow-glib', 'arrow-cuda'], - variables: pkgconfig_variables, - version: version) +if target_machine.system() != 'windows' + pkgconfig.generate(libarrow_cuda_glib, + description: 'C API for Apache Arrow CUDA based on GLib', + filebase: 'arrow-cuda-glib', + name: 'Apache Arrow CUDA GLib', + requires: ['arrow-glib', 'arrow-cuda'], + variables: pkgconfig_variables, + version: version) +endif if have_gi gir_dependencies = [ diff --git a/c_glib/arrow-flight-glib/client.cpp b/c_glib/arrow-flight-glib/client.cpp index 80c47e336f872..8ec8e9729a2d9 100644 --- a/c_glib/arrow-flight-glib/client.cpp +++ b/c_glib/arrow-flight-glib/client.cpp @@ -33,10 +33,19 @@ G_BEGIN_DECLS * #GAFlightStreamReader is a class for reading record batches from a * server. * + * #GAFlightStreamWriter is a class for writing record batches to a + * server. + * + * #GAFlightMetadataReader is a class for reading metadata from a + * server. + * * #GAFlightCallOptions is a class for options of each call. * * #GAFlightClientOptions is a class for options of each client. * + * #GAFlightDoPutResult is a class that has gaflight_client_do_put() + * result. + * * #GAFlightClient is a class for Apache Arrow Flight client. 
* * Since: 5.0.0 @@ -56,16 +65,142 @@ gaflight_stream_reader_class_init(GAFlightStreamReaderClass *klass) { } -typedef struct GAFlightCallOptionsPrivate_ +G_DEFINE_TYPE(GAFlightStreamWriter, + gaflight_stream_writer, + GAFLIGHT_TYPE_RECORD_BATCH_WRITER) + +static void +gaflight_stream_writer_init(GAFlightStreamWriter *object) +{ +} + +static void +gaflight_stream_writer_class_init(GAFlightStreamWriterClass *klass) +{ +} + +/** + * gaflight_stream_writer_done_writing: + * @writer: A #GAFlightStreamWriter. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +gaflight_stream_writer_done_writing(GAFlightStreamWriter *writer, GError **error) +{ + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); + return garrow::check(error, + flight_writer->DoneWriting(), + "[flight-stream-writer][done-writing]"); +} + +struct GAFlightMetadataReaderPrivate +{ + arrow::flight::FlightMetadataReader *reader; +}; + +enum { + PROP_METADATA_READER_READER = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightMetadataReader, + gaflight_metadata_reader, + G_TYPE_OBJECT) + +#define GAFLIGHT_METADATA_READER_GET_PRIVATE(object) \ + static_cast( \ + gaflight_metadata_reader_get_instance_private(GAFLIGHT_METADATA_READER(object))) + +static void +gaflight_metadata_reader_finalize(GObject *object) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(object); + delete priv->reader; + G_OBJECT_CLASS(gaflight_metadata_reader_parent_class)->finalize(object); +} + +static void +gaflight_metadata_reader_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_METADATA_READER_READER: + priv->reader = + static_cast(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_metadata_reader_init(GAFlightMetadataReader *object) +{ +} + +static void +gaflight_metadata_reader_class_init(GAFlightMetadataReaderClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_metadata_reader_finalize; + gobject_class->set_property = gaflight_metadata_reader_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "reader", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_METADATA_READER_READER, spec); +} + +/** + * gaflight_metadata_reader_read: + * @reader: A #GAFlightMetadataReader. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): The metadata on success, %NULL on error. 
+ * + * Since: 18.0.0 + */ +GArrowBuffer * +gaflight_metadata_reader_read(GAFlightMetadataReader *reader, GError **error) +{ + auto flight_reader = gaflight_metadata_reader_get_raw(reader); + std::shared_ptr metadata; + if (garrow::check(error, + flight_reader->ReadMetadata(&metadata), + "[flight-metadata-reader][read]")) { + return garrow_buffer_new_raw(&metadata); + } else { + return nullptr; + } +} + +struct GAFlightCallOptionsPrivate { arrow::flight::FlightCallOptions options; -} GAFlightCallOptionsPrivate; +}; + +enum { + PROP_TIMEOUT = 1, +}; G_DEFINE_TYPE_WITH_PRIVATE(GAFlightCallOptions, gaflight_call_options, G_TYPE_OBJECT) -#define GAFLIGHT_CALL_OPTIONS_GET_PRIVATE(obj) \ +#define GAFLIGHT_CALL_OPTIONS_GET_PRIVATE(object) \ static_cast( \ - gaflight_call_options_get_instance_private(GAFLIGHT_CALL_OPTIONS(obj))) + gaflight_call_options_get_instance_private(GAFLIGHT_CALL_OPTIONS(object))) static void gaflight_call_options_finalize(GObject *object) @@ -77,6 +212,42 @@ gaflight_call_options_finalize(GObject *object) G_OBJECT_CLASS(gaflight_call_options_parent_class)->finalize(object); } +static void +gaflight_call_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_CALL_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_TIMEOUT: + priv->options.timeout = arrow::flight::TimeoutDuration(g_value_get_double(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_call_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_CALL_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_TIMEOUT: + g_value_set_double(value, priv->options.timeout.count()); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + static void gaflight_call_options_init(GAFlightCallOptions *object) { @@ -90,6 +261,28 @@ gaflight_call_options_class_init(GAFlightCallOptionsClass *klass) auto gobject_class = G_OBJECT_CLASS(klass); gobject_class->finalize = gaflight_call_options_finalize; + gobject_class->set_property = gaflight_call_options_set_property; + gobject_class->get_property = gaflight_call_options_get_property; + + arrow::flight::FlightCallOptions options; + GParamSpec *spec; + /** + * GAFlightCallOptions:timeout: + * + * An optional timeout for this call. Negative durations mean an + * implementation-defined default behavior will be used + * instead. This is the default value. + * + * Since: 18.0.0 + */ + spec = g_param_spec_double("timeout", + nullptr, + nullptr, + -G_MAXDOUBLE, + G_MAXDOUBLE, + options.timeout.count(), + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_TIMEOUT, spec); } /** @@ -143,8 +336,8 @@ gaflight_call_options_clear_headers(GAFlightCallOptions *options) /** * gaflight_call_options_foreach_header: * @options: A #GAFlightCallOptions. - * @func: (scope call): The user's callback function. - * @user_data: (closure): Data for @func. + * @func: (scope call) (closure user_data): The user's callback function. + * @user_data: Data for @func. * * Iterates over all headers in the options. 
* @@ -385,6 +578,139 @@ gaflight_client_options_new(void) g_object_new(GAFLIGHT_TYPE_CLIENT_OPTIONS, NULL)); } +struct GAFlightDoPutResultPrivate +{ + GAFlightStreamWriter *writer; + GAFlightMetadataReader *reader; +}; + +enum { + PROP_DO_PUT_RESULT_RESULT = 1, + PROP_DO_PUT_RESULT_WRITER, + PROP_DO_PUT_RESULT_READER, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightDoPutResult, gaflight_do_put_result, G_TYPE_OBJECT) + +#define GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object) \ + static_cast( \ + gaflight_do_put_result_get_instance_private(GAFLIGHT_DO_PUT_RESULT(object))) + +static void +gaflight_do_put_result_dispose(GObject *object) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + if (priv->writer) { + g_object_unref(priv->writer); + priv->writer = nullptr; + } + + if (priv->reader) { + g_object_unref(priv->reader); + priv->reader = nullptr; + } + + G_OBJECT_CLASS(gaflight_do_put_result_parent_class)->dispose(object); +} + +static void +gaflight_do_put_result_init(GAFlightDoPutResult *object) +{ +} + +static void +gaflight_do_put_result_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DO_PUT_RESULT_RESULT: + { + auto result = static_cast( + g_value_get_pointer(value)); + std::shared_ptr writer = + std::move(result->writer); + priv->writer = gaflight_stream_writer_new_raw(&writer); + priv->reader = gaflight_metadata_reader_new_raw(result->reader.release()); + break; + } + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_do_put_result_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DO_PUT_RESULT_WRITER: + g_value_set_object(value, priv->writer); + break; + case PROP_DO_PUT_RESULT_READER: + g_value_set_object(value, priv->reader); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_do_put_result_class_init(GAFlightDoPutResultClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gaflight_do_put_result_dispose; + gobject_class->set_property = gaflight_do_put_result_set_property; + gobject_class->get_property = gaflight_do_put_result_get_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "result", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_RESULT, spec); + + /** + * GAFlightDoPutResult:writer: + * + * A writer to write record batches to. + * + * Since: 18.0.0 + */ + spec = g_param_spec_object("writer", + nullptr, + nullptr, + GAFLIGHT_TYPE_STREAM_WRITER, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_WRITER, spec); + + /** + * GAFlightDoPutResult:reader: + * + * A reader for application metadata from the server. 
+ * + * Since: 18.0.0 + */ + spec = g_param_spec_object("reader", + nullptr, + nullptr, + GAFLIGHT_TYPE_METADATA_READER, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_READER, spec); +} + struct GAFlightClientPrivate { std::shared_ptr client; @@ -661,6 +987,51 @@ gaflight_client_do_get(GAFlightClient *client, return gaflight_stream_reader_new_raw(flight_reader.release(), TRUE); } +/** + * gaflight_client_do_put: + * @client: A #GAFlightClient. + * @descriptor: A #GAFlightDescriptor. + * @schema: A #GArrowSchema. + * @options: (nullable): A #GAFlightCallOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Upload data to a Flight described by the given descriptor. The + * caller must call garrow_record_batch_writer_close() on the + * returned stream once they are done writing. + * + * The reader and writer are linked; closing the writer will also + * close the reader. Use garrow_flight_stream_writer_done_writing() to + * only close the write side of the channel. + * + * Returns: (nullable) (transfer full): + * The #GAFlighDoPutResult holding a reader and a writer on success, + * %NULL on error. + * + * Since: 18.0.0 + */ +GAFlightDoPutResult * +gaflight_client_do_put(GAFlightClient *client, + GAFlightDescriptor *descriptor, + GArrowSchema *schema, + GAFlightCallOptions *options, + GError **error) +{ + auto flight_client = gaflight_client_get_raw(client); + auto flight_descriptor = gaflight_descriptor_get_raw(descriptor); + auto arrow_schema = garrow_schema_get_raw(schema); + arrow::flight::FlightCallOptions flight_default_options; + auto flight_options = &flight_default_options; + if (options) { + flight_options = gaflight_call_options_get_raw(options); + } + auto result = flight_client->DoPut(*flight_options, *flight_descriptor, arrow_schema); + if (!garrow::check(error, result, "[flight-client][do-put]")) { + return nullptr; + } + auto flight_result = std::move(*result); + return gaflight_do_put_result_new_raw(&flight_result); +} + G_END_DECLS GAFlightStreamReader * @@ -672,7 +1043,31 @@ gaflight_stream_reader_new_raw(arrow::flight::FlightStreamReader *flight_reader, flight_reader, "is-owner", is_owner, - NULL)); + nullptr)); +} + +GAFlightStreamWriter * +gaflight_stream_writer_new_raw( + std::shared_ptr *flight_writer) +{ + return GAFLIGHT_STREAM_WRITER(g_object_new(GAFLIGHT_TYPE_STREAM_WRITER, + "record-batch-writer", + flight_writer, + nullptr)); +} + +GAFlightMetadataReader * +gaflight_metadata_reader_new_raw(arrow::flight::FlightMetadataReader *flight_reader) +{ + return GAFLIGHT_METADATA_READER( + g_object_new(GAFLIGHT_TYPE_METADATA_READER, "reader", flight_reader, nullptr)); +} + +arrow::flight::FlightMetadataReader * +gaflight_metadata_reader_get_raw(GAFlightMetadataReader *reader) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(reader); + return priv->reader; } arrow::flight::FlightCallOptions * @@ -689,6 +1084,13 @@ gaflight_client_options_get_raw(GAFlightClientOptions *options) return &(priv->options); } +GAFlightDoPutResult * +gaflight_do_put_result_new_raw(arrow::flight::FlightClient::DoPutResult *flight_result) +{ + return GAFLIGHT_DO_PUT_RESULT( + g_object_new(GAFLIGHT_TYPE_DO_PUT_RESULT, "result", flight_result, nullptr)); +} + std::shared_ptr gaflight_client_get_raw(GAFlightClient *client) { diff --git a/c_glib/arrow-flight-glib/client.h b/c_glib/arrow-flight-glib/client.h index a91bbe55e3c04..12c5a06b810e1 100644 --- a/c_glib/arrow-flight-glib/client.h +++ 
b/c_glib/arrow-flight-glib/client.h @@ -35,6 +35,35 @@ struct _GAFlightStreamReaderClass GAFlightRecordBatchReaderClass parent_class; }; +#define GAFLIGHT_TYPE_STREAM_WRITER (gaflight_stream_writer_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE(GAFlightStreamWriter, + gaflight_stream_writer, + GAFLIGHT, + STREAM_WRITER, + GAFlightRecordBatchWriter) +struct _GAFlightStreamWriterClass +{ + GAFlightRecordBatchWriterClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_stream_writer_done_writing(GAFlightStreamWriter *writer, GError **error); + +#define GAFLIGHT_TYPE_METADATA_READER (gaflight_metadata_reader_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GAFlightMetadataReader, gaflight_metadata_reader, GAFLIGHT, METADATA_READER, GObject) +struct _GAFlightMetadataReaderClass +{ + GObjectClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +GArrowBuffer * +gaflight_metadata_reader_read(GAFlightMetadataReader *reader, GError **error); + #define GAFLIGHT_TYPE_CALL_OPTIONS (gaflight_call_options_get_type()) GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( @@ -75,6 +104,15 @@ GAFLIGHT_AVAILABLE_IN_5_0 GAFlightClientOptions * gaflight_client_options_new(void); +#define GAFLIGHT_TYPE_DO_PUT_RESULT (gaflight_do_put_result_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GAFlightDoPutResult, gaflight_do_put_result, GAFLIGHT, DO_PUT_RESULT, GObject) +struct _GAFlightDoPutResultClass +{ + GObjectClass parent_class; +}; + #define GAFLIGHT_TYPE_CLIENT (gaflight_client_get_type()) GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightClient, gaflight_client, GAFLIGHT, CLIENT, GObject) @@ -124,4 +162,12 @@ gaflight_client_do_get(GAFlightClient *client, GAFlightCallOptions *options, GError **error); +GAFLIGHT_AVAILABLE_IN_18_0 +GAFlightDoPutResult * +gaflight_client_do_put(GAFlightClient *client, + GAFlightDescriptor *descriptor, + GArrowSchema *schema, + GAFlightCallOptions *options, + GError **error); + G_END_DECLS diff --git a/c_glib/arrow-flight-glib/client.hpp b/c_glib/arrow-flight-glib/client.hpp index 185a28e6dc4bd..32ad35845aa12 100644 --- a/c_glib/arrow-flight-glib/client.hpp +++ b/c_glib/arrow-flight-glib/client.hpp @@ -28,6 +28,19 @@ GAFlightStreamReader * gaflight_stream_reader_new_raw(arrow::flight::FlightStreamReader *flight_reader, gboolean is_owner); +GAFLIGHT_EXTERN +GAFlightStreamWriter * +gaflight_stream_writer_new_raw( + std::shared_ptr *flight_writer); + +GAFLIGHT_EXTERN +GAFlightMetadataReader * +gaflight_metadata_reader_new_raw(arrow::flight::FlightMetadataReader *flight_reader); + +GAFLIGHT_EXTERN +arrow::flight::FlightMetadataReader * +gaflight_metadata_reader_get_raw(GAFlightMetadataReader *reader); + GAFLIGHT_EXTERN arrow::flight::FlightCallOptions * gaflight_call_options_get_raw(GAFlightCallOptions *options); @@ -36,6 +49,10 @@ GAFLIGHT_EXTERN arrow::flight::FlightClientOptions * gaflight_client_options_get_raw(GAFlightClientOptions *options); +GAFLIGHT_EXTERN +GAFlightDoPutResult * +gaflight_do_put_result_new_raw(arrow::flight::FlightClient::DoPutResult *flight_result); + GAFLIGHT_EXTERN std::shared_ptr gaflight_client_get_raw(GAFlightClient *client); diff --git a/c_glib/arrow-flight-glib/common.cpp b/c_glib/arrow-flight-glib/common.cpp index efc544f10cf66..3deaf67cc14e8 100644 --- a/c_glib/arrow-flight-glib/common.cpp +++ b/c_glib/arrow-flight-glib/common.cpp @@ -48,7 +48,11 @@ G_BEGIN_DECLS * * #GAFlightStreamChunk is a class for a chunk in stream. 
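A hedged end-to-end sketch of the client side of the new DoPut binding declared above, mirroring the Ruby test added later in this diff. It assumes the read-only "writer"/"reader" GObject properties installed on GAFlightDoPutResult and the existing GArrowRecordBatchWriter API; upload_table and its arguments are illustrative.

#include <arrow-flight-glib/arrow-flight-glib.h>

static gboolean
upload_table(GAFlightClient *client,
             GAFlightDescriptor *descriptor,
             GArrowTable *table,
             GError **error)
{
  GArrowSchema *schema = garrow_table_get_schema(table);
  GAFlightDoPutResult *result =
    gaflight_client_do_put(client, descriptor, schema, NULL, error);
  g_object_unref(schema);
  if (!result) {
    return FALSE;
  }

  GAFlightStreamWriter *writer = NULL;
  GAFlightMetadataReader *reader = NULL;
  /* The writer and reader are exposed as read-only GObject properties. */
  g_object_get(result, "writer", &writer, "reader", &reader, NULL);

  gboolean success =
    garrow_record_batch_writer_write_table(GARROW_RECORD_BATCH_WRITER(writer),
                                           table,
                                           error) &&
    gaflight_stream_writer_done_writing(writer, error);
  if (success) {
    /* Application metadata sent back by the server. */
    GArrowBuffer *metadata = gaflight_metadata_reader_read(reader, error);
    success = (metadata != NULL);
    if (metadata) {
      g_object_unref(metadata);
    }
  }
  if (!garrow_record_batch_writer_close(GARROW_RECORD_BATCH_WRITER(writer),
                                        success ? error : NULL)) {
    success = FALSE;
  }

  g_object_unref(writer);
  g_object_unref(reader);
  g_object_unref(result);
  return success;
}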
* - * #GAFlightRecordBatchReader is a class for reading record batches. + * #GAFlightRecordBatchReader is an abstract class for reading record + * batches with metadata. + * + * #GAFlightRecordBatchWeriter is an abstract class for + * writing record batches with metadata. * * Since: 5.0.0 */ @@ -1172,13 +1176,13 @@ typedef struct GAFlightRecordBatchReaderPrivate_ } GAFlightRecordBatchReaderPrivate; enum { - PROP_READER = 1, - PROP_IS_OWNER, + PROP_RECORD_BATCH_READER_READER = 1, + PROP_RECORD_BATCH_READER_IS_OWNER, }; -G_DEFINE_TYPE_WITH_PRIVATE(GAFlightRecordBatchReader, - gaflight_record_batch_reader, - G_TYPE_OBJECT) +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GAFlightRecordBatchReader, + gaflight_record_batch_reader, + G_TYPE_OBJECT) #define GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(obj) \ static_cast( \ @@ -1192,7 +1196,7 @@ gaflight_record_batch_reader_finalize(GObject *object) if (priv->is_owner) { delete priv->reader; } - G_OBJECT_CLASS(gaflight_info_parent_class)->finalize(object); + G_OBJECT_CLASS(gaflight_record_batch_reader_parent_class)->finalize(object); } static void @@ -1204,11 +1208,11 @@ gaflight_record_batch_reader_set_property(GObject *object, auto priv = GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(object); switch (prop_id) { - case PROP_READER: + case PROP_RECORD_BATCH_READER_READER: priv->reader = static_cast(g_value_get_pointer(value)); break; - case PROP_IS_OWNER: + case PROP_RECORD_BATCH_READER_IS_OWNER: priv->is_owner = g_value_get_boolean(value); break; default: @@ -1236,7 +1240,7 @@ gaflight_record_batch_reader_class_init(GAFlightRecordBatchReaderClass *klass) nullptr, nullptr, static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_READER, spec); + g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_READER_READER, spec); spec = g_param_spec_boolean( "is-owner", @@ -1244,7 +1248,7 @@ gaflight_record_batch_reader_class_init(GAFlightRecordBatchReaderClass *klass) nullptr, TRUE, static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_IS_OWNER, spec); + g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_READER_IS_OWNER, spec); } /** @@ -1296,6 +1300,108 @@ gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, GError } } +G_DEFINE_ABSTRACT_TYPE(GAFlightRecordBatchWriter, + gaflight_record_batch_writer, + GARROW_TYPE_RECORD_BATCH_WRITER) + +static void +gaflight_record_batch_writer_init(GAFlightRecordBatchWriter *object) +{ +} + +static void +gaflight_record_batch_writer_class_init(GAFlightRecordBatchWriterClass *klass) +{ +} + +/** + * gaflight_record_batch_writer_begin: + * @writer: A #GAFlightRecordBatchWriter. + * @schema: A #GArrowSchema. + * @options: (nullable): A #GArrowWriteOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Begins writing data with the given schema. Only used with + * `DoExchange`. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 18.0.0 + */ +gboolean +gaflight_record_batch_writer_begin(GAFlightRecordBatchWriter *writer, + GArrowSchema *schema, + GArrowWriteOptions *options, + GError **error) +{ + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); + auto arrow_schema = garrow_schema_get_raw(schema); + arrow::ipc::IpcWriteOptions arrow_write_options; + if (options) { + arrow_write_options = *garrow_write_options_get_raw(options); + } else { + arrow_write_options = arrow::ipc::IpcWriteOptions::Defaults(); + } + return garrow::check(error, + flight_writer->Begin(arrow_schema, arrow_write_options), + "[flight-record-batch-writer][begin]"); +} + +/** + * gaflight_record_batch_writer_write_metadata: + * @writer: A #GAFlightRecordBatchWriter. + * @metadata: A #GArrowBuffer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Write metadata. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, + GArrowBuffer *metadata, + GError **error) +{ + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); + auto arrow_metadata = garrow_buffer_get_raw(metadata); + return garrow::check(error, + flight_writer->WriteMetadata(arrow_metadata), + "[flight-record-batch-writer][write-metadata]"); +} + +/** + * gaflight_record_batch_writer_write_record_batch: + * @writer: A #GAFlightRecordBatchWriter. + * @record_batch: A #GArrowRecordBatch. + * @metadata: (nullable): A #GArrowBuffer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Write a record batch with metadata. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 18.0.0 + */ +gboolean +gaflight_record_batch_writer_write_record_batch(GAFlightRecordBatchWriter *writer, + GArrowRecordBatch *record_batch, + GArrowBuffer *metadata, + GError **error) +{ + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); + auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + auto arrow_metadata = garrow_buffer_get_raw(metadata); + return garrow::check( + error, + flight_writer->WriteWithMetadata(*arrow_record_batch, arrow_metadata), + "[flight-record-batch-writer][write]"); +} + G_END_DECLS GAFlightCriteria * diff --git a/c_glib/arrow-flight-glib/common.h b/c_glib/arrow-flight-glib/common.h index b1d89f79c357e..726132fe4921b 100644 --- a/c_glib/arrow-flight-glib/common.h +++ b/c_glib/arrow-flight-glib/common.h @@ -232,4 +232,36 @@ GAFLIGHT_AVAILABLE_IN_6_0 GArrowTable * gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, GError **error); +#define GAFLIGHT_TYPE_RECORD_BATCH_WRITER (gaflight_record_batch_writer_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE(GAFlightRecordBatchWriter, + gaflight_record_batch_writer, + GAFLIGHT, + RECORD_BATCH_WRITER, + GArrowRecordBatchWriter) +struct _GAFlightRecordBatchWriterClass +{ + GArrowRecordBatchWriterClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_record_batch_writer_begin(GAFlightRecordBatchWriter *writer, + GArrowSchema *schema, + GArrowWriteOptions *options, + GError **error); + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, + GArrowBuffer *metadata, + GError **error); + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_record_batch_writer_write_record_batch(GAFlightRecordBatchWriter *writer, + GArrowRecordBatch *record_batch, + GArrowBuffer *metadata, + GError **error); + G_END_DECLS diff --git a/c_glib/arrow-flight-glib/common.hpp b/c_glib/arrow-flight-glib/common.hpp index db56fff579baf..ae5a7703397dd 100644 --- a/c_glib/arrow-flight-glib/common.hpp +++ b/c_glib/arrow-flight-glib/common.hpp @@ -79,3 +79,7 @@ gaflight_stream_chunk_get_raw(GAFlightStreamChunk *chunk); GAFLIGHT_EXTERN arrow::flight::MetadataRecordBatchReader * gaflight_record_batch_reader_get_raw(GAFlightRecordBatchReader *reader); + +GAFLIGHT_EXTERN +arrow::flight::MetadataRecordBatchWriter * +gaflight_record_batch_writer_get_raw(GAFlightRecordBatchWriter *writer); diff --git a/c_glib/arrow-flight-glib/server.cpp b/c_glib/arrow-flight-glib/server.cpp index f7444918e90f6..2feeb853e2c51 100644 --- a/c_glib/arrow-flight-glib/server.cpp +++ b/c_glib/arrow-flight-glib/server.cpp @@ -45,6 +45,9 @@ G_BEGIN_DECLS * client. Also allows reading application-defined metadata via the * Flight protocol. * + * #GAFlightMetadataWriter is a class for sending application-specific + * metadata back to client during an upload. + * * #GAFlightServerAuthSender is a class for sending messages to the * client during an authentication handshake. 
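A short sketch of driving the GAFlightRecordBatchWriter API added in common.cpp above. How the writer itself is obtained (from DoPut, or a future DoExchange binding) is outside this snippet; send_batch_with_metadata and the literal metadata payloads are illustrative.

static gboolean
send_batch_with_metadata(GAFlightRecordBatchWriter *writer,
                         GArrowSchema *schema,
                         GArrowRecordBatch *record_batch,
                         GError **error)
{
  /* Announce the schema; per the documentation above this is only
   * needed for DoExchange. */
  if (!gaflight_record_batch_writer_begin(writer, schema, NULL, error)) {
    return FALSE;
  }

  /* Standalone application metadata... */
  GArrowBuffer *note = garrow_buffer_new((const guint8 *)"checkpoint", 10);
  gboolean success =
    gaflight_record_batch_writer_write_metadata(writer, note, error);
  g_object_unref(note);
  if (!success) {
    return FALSE;
  }

  /* ...or a record batch with metadata attached to it. */
  GArrowBuffer *tag = garrow_buffer_new((const guint8 *)"batch-0", 7);
  success = gaflight_record_batch_writer_write_record_batch(writer,
                                                            record_batch,
                                                            tag,
                                                            error);
  g_object_unref(tag);
  return success;
}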
* @@ -290,6 +293,98 @@ gaflight_message_reader_get_descriptor(GAFlightMessageReader *reader) return gaflight_descriptor_new_raw(&flight_descriptor); } +struct GAFlightMetadataWriterPrivate +{ + arrow::flight::FlightMetadataWriter *writer; +}; + +enum { + PROP_WRITER = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightMetadataWriter, + gaflight_metadata_writer, + G_TYPE_OBJECT) + +#define GAFLIGHT_METADATA_WRITER_GET_PRIVATE(object) \ + static_cast( \ + gaflight_metadata_writer_get_instance_private(GAFLIGHT_METADATA_WRITER(object))) + +static void +gaflight_metadata_writer_finalize(GObject *object) +{ + auto priv = GAFLIGHT_METADATA_WRITER_GET_PRIVATE(object); + + delete priv->writer; + + G_OBJECT_CLASS(gaflight_metadata_writer_parent_class)->finalize(object); +} + +static void +gaflight_metadata_writer_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_METADATA_WRITER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_WRITER: + priv->writer = + static_cast(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_metadata_writer_init(GAFlightMetadataWriter *object) +{ +} + +static void +gaflight_metadata_writer_class_init(GAFlightMetadataWriterClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_metadata_writer_finalize; + gobject_class->set_property = gaflight_metadata_writer_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "writer", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_WRITER, spec); +} + +/** + * gaflight_metadata_writer_write: + * @writer: A #GAFlightMetadataWriter. + * @metadata: A #GArrowBuffer to be sent. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Writes metadata to the client. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +gaflight_metadata_writer_write(GAFlightMetadataWriter *writer, + GArrowBuffer *metadata, + GError **error) +{ + auto flight_writer = gaflight_metadata_writer_get_raw(writer); + auto flight_metadata = garrow_buffer_get_raw(metadata); + return garrow::check(error, + flight_writer->WriteMetadata(*flight_metadata), + "[flight-metadata-writer][write]"); +} + struct GAFlightServerCallContextPrivate { arrow::flight::ServerCallContext *call_context; @@ -366,8 +461,8 @@ gaflight_server_call_context_class_init(GAFlightServerCallContextClass *klass) /** * gaflight_server_call_context_foreach_incoming_header: * @context: A #GAFlightServerCallContext. - * @func: (scope call): The user's callback function. - * @user_data: (closure): Data for @func. + * @func: (scope call) (closure user_data): The user's callback function. + * @user_data: Data for @func. * * Iterates over all incoming headers. 
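A sketch of what a C handler for the do_put virtual function (declared later in this diff) could look like, mirroring the Ruby helper server: it drains the uploaded stream and acknowledges with application metadata through the new GAFlightMetadataWriter. The function name example_do_put is illustrative, and wiring it into a GAFlightServer subclass's class_init is omitted.

static gboolean
example_do_put(GAFlightServer *server,
               GAFlightServerCallContext *context,
               GAFlightMessageReader *reader,
               GAFlightMetadataWriter *writer,
               GError **error)
{
  /* Drain the uploaded stream into a table. */
  GArrowTable *table =
    gaflight_record_batch_reader_read_all(GAFLIGHT_RECORD_BATCH_READER(reader),
                                          error);
  if (!table) {
    return FALSE;
  }
  /* A real handler would store or process the table here. */
  g_object_unref(table);

  /* Acknowledge the upload with application metadata. */
  GArrowBuffer *ack = garrow_buffer_new((const guint8 *)"done", 4);
  gboolean success = gaflight_metadata_writer_write(writer, ack, error);
  g_object_unref(ack);
  return success;
}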
* @@ -1034,6 +1129,34 @@ namespace gaflight { return arrow::Status::OK(); } + arrow::Status + DoPut(const arrow::flight::ServerCallContext &context, + std::unique_ptr reader, + std::unique_ptr writer) override + { + auto gacontext = gaflight_server_call_context_new_raw(&context); + auto gareader = gaflight_message_reader_new_raw(reader.release(), TRUE); + auto gawriter = gaflight_metadata_writer_new_raw(writer.release()); + GError *gerror = nullptr; + auto success = + gaflight_server_do_put(gaserver_, gacontext, gareader, gawriter, &gerror); + g_object_unref(gawriter); + g_object_unref(gareader); + g_object_unref(gacontext); + if (!success && !gerror) { + g_set_error(&gerror, + GARROW_ERROR, + GARROW_ERROR_UNKNOWN, + "GAFlightServerClass::do_put() returns FALSE but error isn't set"); + } + if (gerror) { + return garrow_error_to_status(gerror, + arrow::StatusCode::UnknownError, + "[flight-server][do-put]"); + } + return arrow::Status::OK(); + } + private: GAFlightServer *gaserver_; }; @@ -1228,6 +1351,35 @@ gaflight_server_do_get(GAFlightServer *server, return (*(klass->do_get))(server, context, ticket, error); } +/** + * gaflight_server_do_put: + * @server: A #GAFlightServer. + * @context: A #GAFlightServerCallContext. + * @reader: A #GAFlightMessageReader. + * @writer: A #GAFlightMetadataWriter. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Processes a stream of IPC payloads sent from a client. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +gaflight_server_do_put(GAFlightServer *server, + GAFlightServerCallContext *context, + GAFlightMessageReader *reader, + GAFlightMetadataWriter *writer, + GError **error) +{ + auto klass = GAFLIGHT_SERVER_GET_CLASS(server); + if (!(klass && klass->do_put)) { + g_set_error(error, GARROW_ERROR, GARROW_ERROR_NOT_IMPLEMENTED, "not implemented"); + return false; + } + return klass->do_put(server, context, reader, writer, error); +} + G_END_DECLS arrow::flight::FlightDataStream * @@ -1257,6 +1409,20 @@ gaflight_message_reader_get_raw(GAFlightMessageReader *reader) return static_cast(flight_reader); } +GAFlightMetadataWriter * +gaflight_metadata_writer_new_raw(arrow::flight::FlightMetadataWriter *flight_writer) +{ + return GAFLIGHT_METADATA_WRITER( + g_object_new(GAFLIGHT_TYPE_METADATA_WRITER, "writer", flight_writer, nullptr)); +} + +arrow::flight::FlightMetadataWriter * +gaflight_metadata_writer_get_raw(GAFlightMetadataWriter *writer) +{ + auto priv = GAFLIGHT_METADATA_WRITER_GET_PRIVATE(writer); + return priv->writer; +} + GAFlightServerCallContext * gaflight_server_call_context_new_raw( const arrow::flight::ServerCallContext *flight_call_context) diff --git a/c_glib/arrow-flight-glib/server.h b/c_glib/arrow-flight-glib/server.h index 7e594febb172f..e3a469098b32c 100644 --- a/c_glib/arrow-flight-glib/server.h +++ b/c_glib/arrow-flight-glib/server.h @@ -65,6 +65,21 @@ GAFLIGHT_AVAILABLE_IN_14_0 GAFlightDescriptor * gaflight_message_reader_get_descriptor(GAFlightMessageReader *reader); +#define GAFLIGHT_TYPE_METADATA_WRITER (gaflight_metadata_writer_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GAFlightMetadataWriter, gaflight_metadata_writer, GAFLIGHT, METADATA_WRITER, GObject) +struct _GAFlightMetadataWriterClass +{ + GObjectClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_metadata_writer_write(GAFlightMetadataWriter *writer, + GArrowBuffer *metadata, + GError **error); + #define GAFLIGHT_TYPE_SERVER_CALL_CONTEXT 
(gaflight_server_call_context_get_type()) GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightServerCallContext, @@ -199,6 +214,7 @@ G_DECLARE_DERIVABLE_TYPE(GAFlightServer, gaflight_server, GAFLIGHT, SERVER, GObj * GAFlightServerClass: * @list_flights: A virtual function to implement `ListFlights` API. * @do_get: A virtual function to implement `DoGet` API. + * @do_put: A virtual function to implement `DoPut` API. * * Since: 5.0.0 */ @@ -218,6 +234,11 @@ struct _GAFlightServerClass GAFlightServerCallContext *context, GAFlightTicket *ticket, GError **error); + gboolean (*do_put)(GAFlightServer *server, + GAFlightServerCallContext *context, + GAFlightMessageReader *reader, + GAFlightMetadataWriter *writer, + GError **error); }; GAFLIGHT_AVAILABLE_IN_5_0 @@ -254,4 +275,12 @@ gaflight_server_do_get(GAFlightServer *server, GAFlightTicket *ticket, GError **error); +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_server_do_put(GAFlightServer *server, + GAFlightServerCallContext *context, + GAFlightMessageReader *reader, + GAFlightMetadataWriter *writer, + GError **error); + G_END_DECLS diff --git a/c_glib/arrow-flight-glib/server.hpp b/c_glib/arrow-flight-glib/server.hpp index ec4815751c8d8..f68eef83781ec 100644 --- a/c_glib/arrow-flight-glib/server.hpp +++ b/c_glib/arrow-flight-glib/server.hpp @@ -36,6 +36,14 @@ GAFLIGHT_EXTERN arrow::flight::FlightMessageReader * gaflight_message_reader_get_raw(GAFlightMessageReader *reader); +GAFLIGHT_EXTERN +GAFlightMetadataWriter * +gaflight_metadata_writer_new_raw(arrow::flight::FlightMetadataWriter *flight_writer); + +GAFLIGHT_EXTERN +arrow::flight::FlightMetadataWriter * +gaflight_metadata_writer_get_raw(GAFlightMetadataWriter *writer); + GAFLIGHT_EXTERN GAFlightServerCallContext * gaflight_server_call_context_new_raw( diff --git a/c_glib/arrow-glib/arrow-glib.h b/c_glib/arrow-glib/arrow-glib.h index 7ba20882610e8..272b6ba1dae10 100644 --- a/c_glib/arrow-glib/arrow-glib.h +++ b/c_glib/arrow-glib/arrow-glib.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/c_glib/arrow-glib/arrow-glib.hpp b/c_glib/arrow-glib/arrow-glib.hpp index 79e8dcbcce61a..49571eeae4929 100644 --- a/c_glib/arrow-glib/arrow-glib.hpp +++ b/c_glib/arrow-glib/arrow-glib.hpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/c_glib/arrow-glib/decoder.cpp b/c_glib/arrow-glib/decoder.cpp new file mode 100644 index 0000000000000..83af6bc484394 --- /dev/null +++ b/c_glib/arrow-glib/decoder.cpp @@ -0,0 +1,607 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: decoder + * @section_id: decoder-classes + * @title: Decoder classes + * @include: arrow-glib/arrow-glib.h + * + * #GArrowStreamListener is a class for receiving decoded information + * from #GArrowStreamDecoder. + * + * #GArrowStreamDecoder is a class for decoding record batches in + * stream format from given data chunks. + */ + +struct GArrowStreamListenerPrivate +{ + std::shared_ptr listener; +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GArrowStreamListener, + garrow_stream_listener, + G_TYPE_OBJECT); + +#define GARROW_STREAM_LISTENER_GET_PRIVATE(object) \ + static_cast( \ + garrow_stream_listener_get_instance_private(GARROW_STREAM_LISTENER(object))) + +G_END_DECLS + +namespace garrow { + class StreamListener : public arrow::ipc::Listener { + public: + StreamListener(GArrowStreamListener *listener) : listener_(listener) + { + g_object_ref(listener_); + } + ~StreamListener() { g_object_unref(listener_); } + + arrow::Status + OnEOS() override + { + if (!klass()->on_eos) { + return arrow::Status::OK(); + } + + GError *error = nullptr; + if (garrow_stream_listener_on_eos(listener_, &error)) { + return arrow::Status::OK(); + } else { + return garrow_error_to_status(error, + arrow::StatusCode::UnknownError, + "[stream-listener][on-eos]"); + } + } + + arrow::Status + OnRecordBatchWithMetadataDecoded( + arrow::RecordBatchWithMetadata arrow_record_batch_with_metadata) override + { + if (!klass()->on_record_batch_decoded) { + return arrow::Status::OK(); + } + + auto record_batch = + garrow_record_batch_new_raw(&(arrow_record_batch_with_metadata.batch)); + GHashTable *metadata = nullptr; + if (arrow_record_batch_with_metadata.custom_metadata) { + metadata = garrow_internal_hash_table_from_metadata( + arrow_record_batch_with_metadata.custom_metadata); + } + GError *error = nullptr; + auto success = garrow_stream_listener_on_record_batch_decoded(listener_, + record_batch, + metadata, + &error); + g_object_unref(record_batch); + if (metadata) { + g_hash_table_unref(metadata); + } + if (success) { + return arrow::Status::OK(); + } else { + return garrow_error_to_status(error, + arrow::StatusCode::UnknownError, + "[stream-listener][on-record-batch-decoded]"); + } + } + + arrow::Status + OnSchemaDecoded(std::shared_ptr arrow_schema, + std::shared_ptr arrow_filtered_schema) override + { + if (!klass()->on_schema_decoded) { + return arrow::Status::OK(); + } + + auto schema = garrow_schema_new_raw(&arrow_schema); + auto filtered_schema = garrow_schema_new_raw(&arrow_filtered_schema); + GError *error = nullptr; + auto success = garrow_stream_listener_on_schema_decoded(listener_, + schema, + filtered_schema, + &error); + g_object_unref(schema); + g_object_unref(filtered_schema); + if (success) { + return arrow::Status::OK(); + } else { + return garrow_error_to_status(error, + arrow::StatusCode::UnknownError, + "[stream-listener][on-schema-decoded]"); + } + } + + private: + GArrowStreamListener *listener_; + + GArrowStreamListenerClass * + klass() + { + return GARROW_STREAM_LISTENER_GET_CLASS(listener_); + } + }; +}; // namespace garrow + +G_BEGIN_DECLS + +static void +garrow_stream_listener_finalize(GObject *object) +{ + auto priv = GARROW_STREAM_LISTENER_GET_PRIVATE(object); + priv->listener.~shared_ptr(); + G_OBJECT_CLASS(garrow_stream_listener_parent_class)->finalize(object); +} + +static void +garrow_stream_listener_init(GArrowStreamListener *object) +{ + auto priv = 
GARROW_STREAM_LISTENER_GET_PRIVATE(object); + new (&priv->listener) + std::shared_ptr(new garrow::StreamListener(object)); +} + +static void +garrow_stream_listener_class_init(GArrowStreamListenerClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = garrow_stream_listener_finalize; + + klass->on_eos = nullptr; + klass->on_record_batch_decoded = nullptr; + klass->on_schema_decoded = nullptr; +} + +/** + * garrow_stream_listener_on_eos: + * @listener: A #GArrowStreamListener. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Processes an EOS event. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +garrow_stream_listener_on_eos(GArrowStreamListener *listener, GError **error) +{ + auto klass = GARROW_STREAM_LISTENER_GET_CLASS(listener); + if (!(klass && klass->on_eos)) { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_NOT_IMPLEMENTED, + "[stream-listener][on-eos] not implemented"); + return false; + } + return klass->on_eos(listener, error); +} + +/** + * garrow_stream_listener_on_record_batch_decoded: + * @listener: A #GArrowStreamListener. + * @record_batch: A decoded #GArrowRecordBatch. + * @metadata: (element-type utf8 utf8) (nullable): A decoded metadata. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Processes a decoded record batch. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +garrow_stream_listener_on_record_batch_decoded(GArrowStreamListener *listener, + GArrowRecordBatch *record_batch, + GHashTable *metadata, + GError **error) +{ + auto klass = GARROW_STREAM_LISTENER_GET_CLASS(listener); + if (!(klass && klass->on_record_batch_decoded)) { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_NOT_IMPLEMENTED, + "[stream-listener][on-record-batch-decoded] not implemented"); + return false; + } + return klass->on_record_batch_decoded(listener, record_batch, metadata, error); +} + +/** + * garrow_stream_listener_on_schema_decoded: + * @listener: A #GArrowStreamListener. + * @schema: A decoded #GArrowSchema. + * @filtered_schema: A decoded #GArrowSchema that only has read fields. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Processes a decoded schema. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 18.0.0 + */ +gboolean +garrow_stream_listener_on_schema_decoded(GArrowStreamListener *listener, + GArrowSchema *schema, + GArrowSchema *filtered_schema, + GError **error) +{ + auto klass = GARROW_STREAM_LISTENER_GET_CLASS(listener); + if (!(klass && klass->on_schema_decoded)) { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_NOT_IMPLEMENTED, + "[stream-listener][on-schema-decoded] not implemented"); + return false; + } + return klass->on_schema_decoded(listener, schema, filtered_schema, error); +} + +struct GArrowStreamDecoderPrivate +{ + std::shared_ptr decoder; + GArrowStreamListener *listener; +}; + +enum { + PROP_DECODER = 1, + PROP_LISTENER, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowStreamDecoder, garrow_stream_decoder, G_TYPE_OBJECT); + +#define GARROW_STREAM_DECODER_GET_PRIVATE(object) \ + static_cast( \ + garrow_stream_decoder_get_instance_private(GARROW_STREAM_DECODER(object))) + +static void +garrow_stream_decoder_finalize(GObject *object) +{ + auto priv = GARROW_STREAM_DECODER_GET_PRIVATE(object); + priv->decoder.~shared_ptr(); + G_OBJECT_CLASS(garrow_stream_decoder_parent_class)->finalize(object); +} + +static void +garrow_stream_decoder_dispose(GObject *object) +{ + auto priv = GARROW_STREAM_DECODER_GET_PRIVATE(object); + + if (priv->listener) { + g_object_unref(priv->listener); + priv->listener = nullptr; + } + + G_OBJECT_CLASS(garrow_stream_decoder_parent_class)->dispose(object); +} + +static void +garrow_stream_decoder_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_STREAM_DECODER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DECODER: + priv->decoder = *static_cast *>( + g_value_get_pointer(value)); + break; + case PROP_LISTENER: + priv->listener = GARROW_STREAM_LISTENER(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_stream_decoder_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_STREAM_DECODER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_LISTENER: + g_value_set_object(value, priv->listener); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_stream_decoder_init(GArrowStreamDecoder *object) +{ + auto priv = GARROW_STREAM_DECODER_GET_PRIVATE(object); + new (&priv->decoder) std::shared_ptr; +} + +static void +garrow_stream_decoder_class_init(GArrowStreamDecoderClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = garrow_stream_decoder_finalize; + gobject_class->dispose = garrow_stream_decoder_dispose; + gobject_class->set_property = garrow_stream_decoder_set_property; + gobject_class->get_property = garrow_stream_decoder_get_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "decoder", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DECODER, spec); + + /** + * GArrowStreamDecoder:listener: + * + * A listener that receives decoded events. + * + * Since: 18.0.0 + */ + spec = g_param_spec_object( + "listener", + nullptr, + nullptr, + GARROW_TYPE_STREAM_LISTENER, + static_cast(G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_LISTENER, spec); +} + +/** + * garrow_stream_decoder_new: + * @listener: The #GArrowStreamListener that receives decoded events. 
+ * @options: (nullable): The #GArrowReadOptions. + * + * Returns: A newly created #GArrowStreamDecoder. + * + * Since: 18.0.0 + */ +GArrowStreamDecoder * +garrow_stream_decoder_new(GArrowStreamListener *listener, GArrowReadOptions *options) +{ + auto arrow_listener = garrow_stream_listener_get_raw(listener); + arrow::ipc::IpcReadOptions arrow_options; + if (options) { + arrow_options = *garrow_read_options_get_raw(options); + } else { + arrow_options = arrow::ipc::IpcReadOptions::Defaults(); + } + auto arrow_decoder = + std::make_shared(arrow_listener, arrow_options); + return garrow_stream_decoder_new_raw(&arrow_decoder, listener); +} + +/** + * garrow_stream_decoder_consume_bytes: + * @decoder: A #GArrowStreamDecoder. + * @bytes: A #GBytes to be decoded. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Feed data to the decoder as a raw data. + * + * If the decoder can read one or more record batches by the data, the + * decoder calls [vfunc@GArrowStreamListener.on_record_batch_decoded] + * with a decoded record batch multiple times. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 18.0.0 + */ +gboolean +garrow_stream_decoder_consume_bytes(GArrowStreamDecoder *decoder, + GBytes *bytes, + GError **error) +{ + auto arrow_decoder = garrow_stream_decoder_get_raw(decoder); + gsize size; + gconstpointer data = g_bytes_get_data(bytes, &size); + return garrow::check(error, + arrow_decoder->Consume(static_cast(data), size), + "[stream-decoder][consume-bytes]"); +} + +/** + * garrow_stream_decoder_consume_buffer: + * @decoder: A #GArrowStreamDecoder. + * @buffer: A #GArrowBuffer to be decoded. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Feed data to the decoder as a #GArrowBuffer. + * + * If the decoder can read one or more record batches by the data, the + * decoder calls [vfunc@GArrowStreamListener.on_record_batch_decoded] + * with a decoded record batch multiple times. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 18.0.0 + */ +gboolean +garrow_stream_decoder_consume_buffer(GArrowStreamDecoder *decoder, + GArrowBuffer *buffer, + GError **error) +{ + auto arrow_decoder = garrow_stream_decoder_get_raw(decoder); + auto arrow_buffer = garrow_buffer_get_raw(buffer); + return garrow::check(error, + arrow_decoder->Consume(arrow_buffer), + "[stream-decoder][consume-buffer]"); +} + +/** + * garrow_stream_decoder_reset: + * @decoder: A #GArrowStreamDecoder. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Reset the internal status. + * + * You can reuse this decoder for new stream after calling this. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 18.0.0 + */ +gboolean +garrow_stream_decoder_reset(GArrowStreamDecoder *decoder, GError **error) +{ + auto arrow_decoder = garrow_stream_decoder_get_raw(decoder); + return garrow::check(error, arrow_decoder->Reset(), "[stream-decoder][reset]"); +} + +/** + * garrow_stream_decoder_get_schema: + * @decoder: A #GArrowStreamDecoder. + * + * Returns: (nullable) (transfer full): The shared #GArrowSchema of + * the record batches in the stream. 
+ *
+ * Since: 18.0.0
+ */
+GArrowSchema *
+garrow_stream_decoder_get_schema(GArrowStreamDecoder *decoder)
+{
+  auto arrow_decoder = garrow_stream_decoder_get_raw(decoder);
+  auto arrow_schema = arrow_decoder->schema();
+  if (arrow_schema) {
+    return garrow_schema_new_raw(&arrow_schema);
+  } else {
+    return nullptr;
+  }
+}
+
+/**
+ * garrow_stream_decoder_get_next_required_size:
+ * @decoder: A #GArrowStreamDecoder.
+ *
+ * This method is provided for users who want to optimize performance.
+ * Normal users don't need to use this method.
+ *
+ * Here is an example usage for normal users:
+ *
+ *     garrow_stream_decoder_consume_buffer(decoder, buffer1);
+ *     garrow_stream_decoder_consume_buffer(decoder, buffer2);
+ *     garrow_stream_decoder_consume_buffer(decoder, buffer3);
+ *
+ * The decoder has an internal buffer. If consumed data isn't enough to
+ * advance the state of the decoder, consumed data is buffered in
+ * the internal buffer. This causes performance overhead.
+ *
+ * If you pass data of the size returned by
+ * garrow_stream_decoder_get_next_required_size() to each
+ * garrow_stream_decoder_consume_bytes()/garrow_stream_decoder_consume_buffer()
+ * call, the decoder doesn't use its internal buffer, which improves
+ * performance.
+ *
+ * Here is an example usage to avoid using the internal buffer:
+ *
+ *     buffer1 = get_data(garrow_stream_decoder_get_next_required_size(decoder));
+ *     garrow_stream_decoder_consume_buffer(decoder, buffer1);
+ *     buffer2 = get_data(garrow_stream_decoder_get_next_required_size(decoder));
+ *     garrow_stream_decoder_consume_buffer(decoder, buffer2);
+ *
+ * Users can also use this method to avoid creating small chunks. Record
+ * batch data must be contiguous. If users pass small chunks to the
+ * decoder, the decoder needs to concatenate them internally, which
+ * causes performance overhead.
+ *
+ * Here is an example usage to reduce small chunks:
+ *
+ *     GArrowResizableBuffer *buffer = garrow_resizable_buffer_new(1024, NULL);
+ *     while ((small_chunk = get_data(&small_chunk_size))) {
+ *       size_t current_buffer_size = garrow_buffer_get_size(GARROW_BUFFER(buffer));
+ *       garrow_resizable_buffer_resize(buffer,
+ *                                      current_buffer_size + small_chunk_size,
+ *                                      NULL);
+ *       garrow_mutable_buffer_set_data(GARROW_MUTABLE_BUFFER(buffer),
+ *                                      current_buffer_size,
+ *                                      small_chunk,
+ *                                      small_chunk_size,
+ *                                      NULL);
+ *       if (garrow_buffer_get_size(GARROW_BUFFER(buffer)) <
+ *           garrow_stream_decoder_get_next_required_size(decoder)) {
+ *         continue;
+ *       }
+ *       garrow_stream_decoder_consume_buffer(decoder, GARROW_BUFFER(buffer), NULL);
+ *       g_object_unref(buffer);
+ *       buffer = garrow_resizable_buffer_new(1024, NULL);
+ *     }
+ *     if (garrow_buffer_get_size(GARROW_BUFFER(buffer)) > 0) {
+ *       garrow_stream_decoder_consume_buffer(decoder, GARROW_BUFFER(buffer), NULL);
+ *     }
+ *     g_object_unref(buffer);
+ *
+ * Returns: The number of bytes needed to advance the state of
+ * the decoder.
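A hedged C sketch of the new listener/decoder pair: a minimal GArrowStreamListener subclass that counts decoded record batches, fed by a loop that sizes each read with garrow_stream_decoder_get_next_required_size(). It assumes the existing garrow_readable_read_bytes() API for pulling bytes from an input stream; MyListener and decode_stream are illustrative names.

#include <arrow-glib/arrow-glib.h>

#define MY_TYPE_LISTENER (my_listener_get_type())
G_DECLARE_FINAL_TYPE(MyListener, my_listener, MY, LISTENER, GArrowStreamListener)

struct _MyListener
{
  GArrowStreamListener parent_instance;
  guint n_record_batches;
};
G_DEFINE_TYPE(MyListener, my_listener, GARROW_TYPE_STREAM_LISTENER)

static gboolean
my_listener_on_record_batch_decoded(GArrowStreamListener *listener,
                                    GArrowRecordBatch *record_batch,
                                    GHashTable *metadata,
                                    GError **error)
{
  MY_LISTENER(listener)->n_record_batches++;
  return TRUE;
}

static void
my_listener_init(MyListener *listener)
{
}

static void
my_listener_class_init(MyListenerClass *klass)
{
  GARROW_STREAM_LISTENER_CLASS(klass)->on_record_batch_decoded =
    my_listener_on_record_batch_decoded;
}

static gboolean
decode_stream(GArrowInputStream *input, GError **error)
{
  MyListener *listener = g_object_new(MY_TYPE_LISTENER, NULL);
  GArrowStreamDecoder *decoder =
    garrow_stream_decoder_new(GARROW_STREAM_LISTENER(listener), NULL);
  gboolean success = TRUE;
  while (success) {
    /* Read exactly what the decoder needs next to avoid internal buffering. */
    gsize n_required = garrow_stream_decoder_get_next_required_size(decoder);
    GBytes *chunk =
      garrow_readable_read_bytes(GARROW_READABLE(input), n_required, error);
    if (!chunk) {
      success = FALSE;
      break;
    }
    if (g_bytes_get_size(chunk) == 0) {
      g_bytes_unref(chunk);
      break;
    }
    success = garrow_stream_decoder_consume_bytes(decoder, chunk, error);
    g_bytes_unref(chunk);
  }
  g_print("decoded %u record batches\n", listener->n_record_batches);
  g_object_unref(decoder);
  g_object_unref(listener);
  return success;
}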
+ * + * Since: 18.0.0 + */ +gsize +garrow_stream_decoder_get_next_required_size(GArrowStreamDecoder *decoder) +{ + auto arrow_decoder = garrow_stream_decoder_get_raw(decoder); + return arrow_decoder->next_required_size(); +} + +G_END_DECLS + +std::shared_ptr +garrow_stream_listener_get_raw(GArrowStreamListener *listener) +{ + auto priv = GARROW_STREAM_LISTENER_GET_PRIVATE(listener); + return priv->listener; +} + +GArrowStreamDecoder * +garrow_stream_decoder_new_raw(std::shared_ptr *arrow_decoder, + GArrowStreamListener *listener) +{ + return GARROW_STREAM_DECODER(g_object_new(GARROW_TYPE_STREAM_DECODER, + "decoder", + arrow_decoder, + "listener", + listener, + nullptr)); +} + +std::shared_ptr +garrow_stream_decoder_get_raw(GArrowStreamDecoder *decoder) +{ + auto priv = GARROW_STREAM_DECODER_GET_PRIVATE(decoder); + return priv->decoder; +} diff --git a/c_glib/arrow-glib/decoder.h b/c_glib/arrow-glib/decoder.h new file mode 100644 index 0000000000000..2ac0efbabfc7b --- /dev/null +++ b/c_glib/arrow-glib/decoder.h @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include +#include + +G_BEGIN_DECLS + +#define GARROW_TYPE_STREAM_LISTENER (garrow_stream_listener_get_type()) +GARROW_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GArrowStreamListener, garrow_stream_listener, GARROW, STREAM_LISTENER, GObject) +struct _GArrowStreamListenerClass +{ + GObjectClass parent_class; + + gboolean (*on_eos)(GArrowStreamListener *listener, GError **error); + gboolean (*on_record_batch_decoded)(GArrowStreamListener *listener, + GArrowRecordBatch *record_batch, + GHashTable *metadata, + GError **error); + gboolean (*on_schema_decoded)(GArrowStreamListener *listener, + GArrowSchema *schema, + GArrowSchema *filtered_schema, + GError **error); +}; + +GARROW_AVAILABLE_IN_18_0 +gboolean +garrow_stream_listener_on_eos(GArrowStreamListener *listener, GError **error); + +GARROW_AVAILABLE_IN_18_0 +gboolean +garrow_stream_listener_on_record_batch_decoded(GArrowStreamListener *listener, + GArrowRecordBatch *record_batch, + GHashTable *metadata, + GError **error); + +GARROW_AVAILABLE_IN_18_0 +gboolean +garrow_stream_listener_on_schema_decoded(GArrowStreamListener *listener, + GArrowSchema *schema, + GArrowSchema *filtered_schema, + GError **error); + +#define GARROW_TYPE_STREAM_DECODER (garrow_stream_decoder_get_type()) +GARROW_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GArrowStreamDecoder, garrow_stream_decoder, GARROW, STREAM_DECODER, GObject) +struct _GArrowStreamDecoderClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_18_0 +GArrowStreamDecoder * +garrow_stream_decoder_new(GArrowStreamListener *listener, GArrowReadOptions *options); +GARROW_AVAILABLE_IN_18_0 +gboolean +garrow_stream_decoder_consume_bytes(GArrowStreamDecoder *decoder, + GBytes *bytes, + GError **error); +GARROW_AVAILABLE_IN_18_0 +gboolean +garrow_stream_decoder_consume_buffer(GArrowStreamDecoder *decoder, + GArrowBuffer *buffer, + GError **error); +GARROW_AVAILABLE_IN_18_0 +gboolean +garrow_stream_decoder_reset(GArrowStreamDecoder *decoder, GError **error); +GARROW_AVAILABLE_IN_18_0 +GArrowSchema * +garrow_stream_decoder_get_schema(GArrowStreamDecoder *decoder); +GARROW_AVAILABLE_IN_18_0 +gsize +garrow_stream_decoder_get_next_required_size(GArrowStreamDecoder *decoder); + +G_END_DECLS diff --git a/c_glib/arrow-glib/decoder.hpp b/c_glib/arrow-glib/decoder.hpp new file mode 100644 index 0000000000000..24b329867c685 --- /dev/null +++ b/c_glib/arrow-glib/decoder.hpp @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include +#include + +#include + +GARROW_EXTERN +std::shared_ptr +garrow_stream_listener_get_raw(GArrowStreamListener *listener); + +GARROW_EXTERN +GArrowStreamDecoder * +garrow_stream_decoder_new_raw(std::shared_ptr *arrow_decoder, + GArrowStreamListener *listener); + +GARROW_EXTERN +std::shared_ptr +garrow_stream_decoder_get_raw(GArrowStreamDecoder *decoder); diff --git a/c_glib/arrow-glib/file-system.cpp b/c_glib/arrow-glib/file-system.cpp index b6efa2b872635..9ba494e405957 100644 --- a/c_glib/arrow-glib/file-system.cpp +++ b/c_glib/arrow-glib/file-system.cpp @@ -56,6 +56,8 @@ G_BEGIN_DECLS * #GArrowS3FileSystem is a class for S3-backed file system. * * #GArrowGCSFileSystem is a class for GCS-backed file system. + * + * #GArrowAzureFileSystem is a class for Azure-backed file system. */ /* arrow::fs::FileInfo */ @@ -1561,6 +1563,18 @@ garrow_gcs_file_system_class_init(GArrowGCSFileSystemClass *klass) { } +G_DEFINE_TYPE(GArrowAzureFileSystem, garrow_azure_file_system, GARROW_TYPE_FILE_SYSTEM) + +static void +garrow_azure_file_system_init(GArrowAzureFileSystem *file_system) +{ +} + +static void +garrow_azure_file_system_class_init(GArrowAzureFileSystemClass *klass) +{ +} + G_END_DECLS GArrowFileInfo * @@ -1592,6 +1606,8 @@ garrow_file_system_new_raw(std::shared_ptr *arrow_file_sy file_system_type = GARROW_TYPE_S3_FILE_SYSTEM; } else if (type_name == "gcs") { file_system_type = GARROW_TYPE_GCS_FILE_SYSTEM; + } else if (type_name == "abfs") { + file_system_type = GARROW_TYPE_AZURE_FILE_SYSTEM; } else if (type_name == "mock") { file_system_type = GARROW_TYPE_MOCK_FILE_SYSTEM; } diff --git a/c_glib/arrow-glib/file-system.h b/c_glib/arrow-glib/file-system.h index 2e500672e145c..9a903c6af68cf 100644 --- a/c_glib/arrow-glib/file-system.h +++ b/c_glib/arrow-glib/file-system.h @@ -337,4 +337,16 @@ struct _GArrowGCSFileSystemClass GArrowFileSystemClass parent_class; }; +#define GARROW_TYPE_AZURE_FILE_SYSTEM (garrow_azure_file_system_get_type()) +GARROW_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE(GArrowAzureFileSystem, + garrow_azure_file_system, + GARROW, + AZURE_FILE_SYSTEM, + GArrowFileSystem) +struct _GArrowAzureFileSystemClass +{ + GArrowFileSystemClass parent_class; +}; + G_END_DECLS diff --git a/c_glib/arrow-glib/internal-hash-table.hpp b/c_glib/arrow-glib/internal-hash-table.hpp index 27ec060994c98..2e0a72561a7d8 100644 --- a/c_glib/arrow-glib/internal-hash-table.hpp +++ b/c_glib/arrow-glib/internal-hash-table.hpp @@ -37,3 +37,21 @@ garrow_internal_hash_table_to_metadata(GHashTable *metadata) &arrow_metadata); return arrow_metadata; } + +static inline GHashTable * +garrow_internal_hash_table_from_metadata( + const std::shared_ptr &arrow_metadata) +{ + auto metadata = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, g_free); + const auto &keys = arrow_metadata->keys(); + const auto &values = arrow_metadata->values(); + auto n = arrow_metadata->size(); + for (int64_t i = 0; i < n; ++i) { + const auto &key = keys[i]; + const auto &value = values[i]; + g_hash_table_insert(metadata, + g_strndup(key.data(), key.size()), + g_strndup(value.data(), value.size())); + } + return metadata; +} diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build index 36a8274513ed2..854988e348986 100644 --- a/c_glib/arrow-glib/meson.build +++ b/c_glib/arrow-glib/meson.build @@ -28,6 +28,7 @@ sources = files( 'composite-data-type.cpp', 'datum.cpp', 'decimal.cpp', + 'decoder.cpp', 'error.cpp', 'expression.cpp', 'field.cpp', @@ -91,6 +92,7 @@ c_headers = files( 
'data-type.h', 'datum.h', 'decimal.h', + 'decoder.h', 'error.h', 'expression.h', 'field.h', @@ -153,6 +155,7 @@ cpp_headers = files( 'data-type.hpp', 'datum.hpp', 'decimal.hpp', + 'decoder.hpp', 'error.hpp', 'expression.hpp', 'field.hpp', diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp index 8a1c3722d4a0f..9fe9d9d1b3199 100644 --- a/c_glib/arrow-glib/reader.cpp +++ b/c_glib/arrow-glib/reader.cpp @@ -668,10 +668,10 @@ garrow_record_batch_file_reader_read_record_batch(GArrowRecordBatchFileReader *r } } -typedef struct GArrowFeatherFileReaderPrivate_ +struct GArrowFeatherFileReaderPrivate { std::shared_ptr feather_reader; -} GArrowFeatherFileReaderPrivate; +}; enum { PROP_FEATHER_READER = 1, @@ -714,22 +714,11 @@ garrow_feather_file_reader_set_property(GObject *object, } } -static void -garrow_feather_file_reader_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) -{ - switch (prop_id) { - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} - static void garrow_feather_file_reader_init(GArrowFeatherFileReader *object) { + auto priv = GARROW_FEATHER_FILE_READER_GET_PRIVATE(object); + new (&priv->feather_reader) std::shared_ptr; } static void @@ -739,7 +728,6 @@ garrow_feather_file_reader_class_init(GArrowFeatherFileReaderClass *klass) gobject_class->finalize = garrow_feather_file_reader_finalize; gobject_class->set_property = garrow_feather_file_reader_set_property; - gobject_class->get_property = garrow_feather_file_reader_get_property; GParamSpec *spec; spec = g_param_spec_pointer( diff --git a/c_glib/arrow-glib/writer.cpp b/c_glib/arrow-glib/writer.cpp index b0321d51b3ba4..08af1c7976965 100644 --- a/c_glib/arrow-glib/writer.cpp +++ b/c_glib/arrow-glib/writer.cpp @@ -45,14 +45,14 @@ G_BEGIN_DECLS * batches in file format into output. */ -typedef struct GArrowRecordBatchWriterPrivate_ +struct GArrowRecordBatchWriterPrivate { std::shared_ptr record_batch_writer; -} GArrowRecordBatchWriterPrivate; + bool is_closed; +}; enum { - PROP_0, - PROP_RECORD_BATCH_WRITER + PROP_RECORD_BATCH_WRITER = 1, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowRecordBatchWriter, @@ -111,6 +111,7 @@ garrow_record_batch_writer_init(GArrowRecordBatchWriter *object) { auto priv = GARROW_RECORD_BATCH_WRITER_GET_PRIVATE(object); new (&priv->record_batch_writer) std::shared_ptr; + priv->is_closed = false; } static void @@ -193,7 +194,27 @@ garrow_record_batch_writer_close(GArrowRecordBatchWriter *writer, GError **error auto arrow_writer = garrow_record_batch_writer_get_raw(writer); auto status = arrow_writer->Close(); - return garrow_error_check(error, status, "[record-batch-writer][close]"); + auto success = garrow_error_check(error, status, "[record-batch-writer][close]"); + if (success) { + auto priv = GARROW_RECORD_BATCH_WRITER_GET_PRIVATE(writer); + priv->is_closed = true; + } + return success; +} + +/** + * garrow_record_batch_writer_is_closed: + * @writer: A #GArrowRecordBatchWriter. + * + * Returns: %TRUE if the writer is closed, %FALSE otherwise. 
+ * + * Since: 18.0.0 + */ +gboolean +garrow_record_batch_writer_is_closed(GArrowRecordBatchWriter *writer) +{ + auto priv = GARROW_RECORD_BATCH_WRITER_GET_PRIVATE(writer); + return priv->is_closed; } G_DEFINE_TYPE(GArrowRecordBatchStreamWriter, diff --git a/c_glib/arrow-glib/writer.h b/c_glib/arrow-glib/writer.h index 46bbdddec8c9d..cea8390d9028f 100644 --- a/c_glib/arrow-glib/writer.h +++ b/c_glib/arrow-glib/writer.h @@ -53,6 +53,10 @@ GARROW_AVAILABLE_IN_ALL gboolean garrow_record_batch_writer_close(GArrowRecordBatchWriter *writer, GError **error); +GARROW_AVAILABLE_IN_18_0 +gboolean +garrow_record_batch_writer_is_closed(GArrowRecordBatchWriter *writer); + #define GARROW_TYPE_RECORD_BATCH_STREAM_WRITER \ (garrow_record_batch_stream_writer_get_type()) GARROW_AVAILABLE_IN_ALL diff --git a/c_glib/arrow-glib/writer.hpp b/c_glib/arrow-glib/writer.hpp index aa87ffe77d79b..1d85ac52f88d1 100644 --- a/c_glib/arrow-glib/writer.hpp +++ b/c_glib/arrow-glib/writer.hpp @@ -25,16 +25,20 @@ #include +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchWriter * garrow_record_batch_writer_new_raw( std::shared_ptr *arrow_writer); +GARROW_AVAILABLE_IN_ALL std::shared_ptr garrow_record_batch_writer_get_raw(GArrowRecordBatchWriter *writer); +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchStreamWriter * garrow_record_batch_stream_writer_new_raw( std::shared_ptr *arrow_writer); +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchFileWriter * garrow_record_batch_file_writer_new_raw( std::shared_ptr *arrow_writer); diff --git a/c_glib/meson.build b/c_glib/meson.build index 06aa5b941e77c..214c57747033e 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -35,7 +35,7 @@ project('arrow-glib', 'c', 'cpp', # * 22.04: 0.61.2 meson_version: '>=0.53.2') -version = '17.0.0-SNAPSHOT' +version = '18.0.0-SNAPSHOT' if version.endswith('-SNAPSHOT') version_numbers = version.split('-')[0].split('.') version_tag = version.split('-')[1] diff --git a/c_glib/parquet-glib/arrow-file-writer.cpp b/c_glib/parquet-glib/arrow-file-writer.cpp index b6f019ed27d46..2b8e2bdeac026 100644 --- a/c_glib/parquet-glib/arrow-file-writer.cpp +++ b/c_glib/parquet-glib/arrow-file-writer.cpp @@ -316,14 +316,13 @@ gparquet_writer_properties_get_data_page_size(GParquetWriterProperties *properti return parquet_properties->data_pagesize(); } -typedef struct GParquetArrowFileWriterPrivate_ +struct GParquetArrowFileWriterPrivate { parquet::arrow::FileWriter *arrow_file_writer; -} GParquetArrowFileWriterPrivate; +}; enum { - PROP_0, - PROP_ARROW_FILE_WRITER + PROP_ARROW_FILE_WRITER = 1, }; G_DEFINE_TYPE_WITH_PRIVATE(GParquetArrowFileWriter, @@ -496,6 +495,58 @@ gparquet_arrow_file_writer_new_path(GArrowSchema *schema, } } +/** + * gparquet_arrow_file_writer_get_schema: + * @writer: A #GParquetArrowFileWriter. + * + * Returns: (transfer full): The schema to be written to. + * + * Since: 18.0.0 + */ +GArrowSchema * +gparquet_arrow_file_writer_get_schema(GParquetArrowFileWriter *writer) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + auto arrow_schema = parquet_arrow_file_writer->schema(); + return garrow_schema_new_raw(&arrow_schema); +} + +/** + * gparquet_arrow_file_writer_write_record_batch: + * @writer: A #GParquetArrowFileWriter. + * @record_batch: A record batch to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Write a record batch into the buffered row group. + * + * Multiple record batches can be written into the same row group + * through this function. 
+ * + * gparquet_writer_properties_get_max_row_group_length() is respected + * and a new row group will be created if the current row group + * exceeds the limit. + * + * Record batches get flushed to the output stream once + * gparquet_file_writer_new_buffered_row_group() or + * gparquet_file_writer_close() is called. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 18.0.0 + */ +gboolean +gparquet_arrow_file_writer_write_record_batch(GParquetArrowFileWriter *writer, + GArrowRecordBatch *record_batch, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + auto arrow_record_batch = garrow_record_batch_get_raw(record_batch).get(); + auto status = parquet_arrow_file_writer->WriteRecordBatch(*arrow_record_batch); + return garrow_error_check(error, + status, + "[parquet][arrow][file-writer][write-record-batch]"); +} + /** * gparquet_arrow_file_writer_write_table: * @writer: A #GParquetArrowFileWriter. @@ -510,13 +561,82 @@ gparquet_arrow_file_writer_new_path(GArrowSchema *schema, gboolean gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, GArrowTable *table, - guint64 chunk_size, + gsize chunk_size, GError **error) { auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); auto arrow_table = garrow_table_get_raw(table).get(); - auto status = parquet_arrow_file_writer->WriteTable(*arrow_table, chunk_size); - return garrow_error_check(error, status, "[parquet][arrow][file-writer][write-table]"); + return garrow::check(error, + parquet_arrow_file_writer->WriteTable(*arrow_table, chunk_size), + "[parquet][arrow][file-writer][write-table]"); +} + +/** + * gparquet_arrow_file_writer_new_row_group: + * @writer: A #GParquetArrowFileWriter. + * @chunk_size: The max number of rows in a row group. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Start a new row group. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 18.0.0 + */ +gboolean +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, + gsize chunk_size, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + return garrow::check(error, + parquet_arrow_file_writer->NewRowGroup(chunk_size), + "[parquet][arrow][file-writer][new-row-group]"); +} + +/** + * gparquet_arrow_file_writer_new_buffered_row_group: + * @writer: A #GParquetArrowFileWriter. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Start a new buffered row group. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 18.0.0 + */ +gboolean +gparquet_arrow_file_writer_new_buffered_row_group(GParquetArrowFileWriter *writer, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + return garrow::check(error, + parquet_arrow_file_writer->NewBufferedRowGroup(), + "[parquet][arrow][file-writer][new-buffered-row-group]"); +} + +/** + * gparquet_arrow_file_writer_write_chunked_array: + * @writer: A #GParquetArrowFileWriter. + * @chunked_array: A #GArrowChunkedArray to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Start a chunked array as a column chunk. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 18.0.0 + */ +gboolean +gparquet_arrow_file_writer_write_chunked_array(GParquetArrowFileWriter *writer, + GArrowChunkedArray *chunked_array, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array); + return garrow::check(error, + parquet_arrow_file_writer->WriteColumnChunk(arrow_chunked_array), + "[parquet][arrow][file-writer][write-chunked-array]"); } /** diff --git a/c_glib/parquet-glib/arrow-file-writer.h b/c_glib/parquet-glib/arrow-file-writer.h index 71cbfa195e842..2c82f7c1f87de 100644 --- a/c_glib/parquet-glib/arrow-file-writer.h +++ b/c_glib/parquet-glib/arrow-file-writer.h @@ -116,13 +116,40 @@ gparquet_arrow_file_writer_new_path(GArrowSchema *schema, GParquetWriterProperties *writer_properties, GError **error); +GPARQUET_AVAILABLE_IN_18_0 +GArrowSchema * +gparquet_arrow_file_writer_get_schema(GParquetArrowFileWriter *writer); + +GPARQUET_AVAILABLE_IN_18_0 +gboolean +gparquet_arrow_file_writer_write_record_batch(GParquetArrowFileWriter *writer, + GArrowRecordBatch *record_batch, + GError **error); + GPARQUET_AVAILABLE_IN_0_11 gboolean gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, GArrowTable *table, - guint64 chunk_size, + gsize chunk_size, GError **error); +GPARQUET_AVAILABLE_IN_18_0 +gboolean +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, + gsize chunk_size, + GError **error); + +GPARQUET_AVAILABLE_IN_18_0 +gboolean +gparquet_arrow_file_writer_new_buffered_row_group(GParquetArrowFileWriter *writer, + GError **error); + +GPARQUET_AVAILABLE_IN_18_0 +gboolean +gparquet_arrow_file_writer_write_chunked_array(GParquetArrowFileWriter *writer, + GArrowChunkedArray *chunked_array, + GError **error); + GPARQUET_AVAILABLE_IN_0_11 gboolean gparquet_arrow_file_writer_close(GParquetArrowFileWriter *writer, GError **error); diff --git a/c_glib/test/flight/test-call-options.rb b/c_glib/test/flight/test-call-options.rb index bf4dd6ae81a84..2574a9f7cbdbf 100644 --- a/c_glib/test/flight/test-call-options.rb +++ b/c_glib/test/flight/test-call-options.rb @@ -44,4 +44,10 @@ def test_clear_headers @options.clear_headers assert_equal([], collect_headers) end + + def test_timeout + assert_in_delta(-1, @options.timeout) + @options.timeout = 10.1 + assert_in_delta(10.1, @options.timeout) + end end diff --git a/c_glib/test/flight/test-client.rb b/c_glib/test/flight/test-client.rb index 7eb093d3cab80..f1e3f31234ab4 100644 --- a/c_glib/test/flight/test-client.rb +++ b/c_glib/test/flight/test-client.rb @@ -84,4 +84,37 @@ def test_error end end end + + sub_test_case("#do_put") do + def test_success + client = ArrowFlight::Client.new(@location) + generator = Helper::FlightInfoGenerator.new + descriptor = generator.page_view_descriptor + table = generator.page_view_table + result = client.do_put(descriptor, table.schema) + writer = result.writer + writer.write_table(table) + writer.done_writing + reader = result.reader + metadata = reader.read + writer.close + assert_equal(["done", table], + [metadata.data.to_s, @server.uploaded_table]) + end + + def test_error + client = ArrowFlight::Client.new(@location) + generator = Helper::FlightInfoGenerator.new + descriptor = generator.page_view_descriptor + table = generator.page_view_table + result = client.do_put(descriptor, table.schema) + assert_raise(Arrow::Error::Invalid) do + writer = result.writer + writer.done_writing + reader = result.reader + reader.read + writer.close + end + 
end + end end diff --git a/c_glib/test/helper/flight-server.rb b/c_glib/test/helper/flight-server.rb index 8c47029d41791..80b8a5c96cf9f 100644 --- a/c_glib/test/helper/flight-server.rb +++ b/c_glib/test/helper/flight-server.rb @@ -34,6 +34,8 @@ def virtual_do_is_valid(context, token) class FlightServer < ArrowFlight::Server type_register + attr_reader :uploaded_table + private def virtual_do_list_flights(context, criteria) generator = FlightInfoGenerator.new @@ -54,5 +56,14 @@ def virtual_do_do_get(context, ticket) reader = Arrow::TableBatchReader.new(table) ArrowFlight::RecordBatchStream.new(reader) end + + def virtual_do_do_put(context, reader, writer) + @uploaded_table = reader.read_all + writer.write(Arrow::Buffer.new("done")) + if @uploaded_table.n_rows.zero? + raise Arrow::Error::Invalid.new("empty table") + end + true + end end end diff --git a/c_glib/test/parquet/test-arrow-file-writer.rb b/c_glib/test/parquet/test-arrow-file-writer.rb index f899e7273b2a2..d8344bf1c50b0 100644 --- a/c_glib/test/parquet/test-arrow-file-writer.rb +++ b/c_glib/test/parquet/test-arrow-file-writer.rb @@ -26,7 +26,42 @@ def setup end end - def test_write + def test_schema + schema = build_schema("enabled" => :boolean) + writer = Parquet::ArrowFileWriter.new(schema, @file.path) + assert_equal(schema, writer.schema) + writer.close + end + + def test_write_record_batch + enabled_values = [true, nil, false, true] + record_batch = + build_record_batch("enabled" => build_boolean_array(enabled_values)) + + writer = Parquet::ArrowFileWriter.new(record_batch.schema, @file.path) + writer.write_record_batch(record_batch) + writer.new_buffered_row_group + writer.write_record_batch(record_batch) + writer.close + + reader = Parquet::ArrowFileReader.new(@file.path) + begin + reader.use_threads = true + assert_equal([ + 2, + Arrow::Table.new(record_batch.schema, + [record_batch, record_batch]), + ], + [ + reader.n_row_groups, + reader.read_table, + ]) + ensure + reader.unref + end + end + + def test_write_table enabled_values = [true, nil, false, true] table = build_table("enabled" => build_boolean_array(enabled_values)) chunk_size = 2 @@ -40,11 +75,41 @@ def test_write reader.use_threads = true assert_equal([ enabled_values.length / chunk_size, - true, + table, + ], + [ + reader.n_row_groups, + reader.read_table, + ]) + ensure + reader.unref + end + end + + def test_write_chunked_array + schema = build_schema("enabled" => :boolean) + writer = Parquet::ArrowFileWriter.new(schema, @file.path) + writer.new_row_group(2) + chunked_array = Arrow::ChunkedArray.new([build_boolean_array([true, nil])]) + writer.write_chunked_array(chunked_array) + writer.new_row_group(1) + chunked_array = Arrow::ChunkedArray.new([build_boolean_array([false])]) + writer.write_chunked_array(chunked_array) + writer.close + + reader = Parquet::ArrowFileReader.new(@file.path) + begin + reader.use_threads = true + assert_equal([ + 2, + build_table("enabled" => [ + build_boolean_array([true, nil]), + build_boolean_array([false]), + ]), ], [ reader.n_row_groups, - table.equal_metadata(reader.read_table, false), + reader.read_table, ]) ensure reader.unref diff --git a/c_glib/test/parquet/test-column-chunk-metadata.rb b/c_glib/test/parquet/test-column-chunk-metadata.rb index f0012f0124577..4612e5bf0cc59 100644 --- a/c_glib/test/parquet/test-column-chunk-metadata.rb +++ b/c_glib/test/parquet/test-column-chunk-metadata.rb @@ -77,7 +77,7 @@ def setup test("#file_offset") do assert do - @metadata.file_offset > 0 + @metadata.file_offset == 0 end end diff --git 
a/c_glib/test/test-file-writer.rb b/c_glib/test/test-file-writer.rb index 5f9c3c4e19aa9..06c9dfa25c7fc 100644 --- a/c_glib/test/test-file-writer.rb +++ b/c_glib/test/test-file-writer.rb @@ -34,6 +34,9 @@ def test_write_record_batch file_writer.write_record_batch(record_batch) ensure file_writer.close + assert do + file_writer.closed? + end end ensure output.close @@ -68,6 +71,9 @@ def test_write_table file_writer.write_table(table) ensure file_writer.close + assert do + file_writer.closed? + end end ensure output.close diff --git a/c_glib/test/test-stream-decoder.rb b/c_glib/test/test-stream-decoder.rb new file mode 100644 index 0000000000000..108e687e3aa6b --- /dev/null +++ b/c_glib/test/test-stream-decoder.rb @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestStreamDecoder < Test::Unit::TestCase + include Helper::Buildable + + class Listener < Arrow::StreamListener + type_register + + attr_reader :events + def initialize + super + @events = [] + end + + private + def virtual_do_on_eos + @events << [:eos] + true + end + + def virtual_do_on_record_batch_decoded(record_batch, metadata) + @events << [:record_batch_decoded, record_batch, metadata] + true + end + + def virtual_do_on_schema_decoded(schema, filtered_schema) + @events << [:schema_decoded, schema, filtered_schema] + true + end + end + + def setup + columns = { + "enabled": build_boolean_array([true, false, nil, true]), + } + @record_batch = build_record_batch(columns) + @schema = @record_batch.schema + + @buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(@buffer) + stream_writer = Arrow::RecordBatchStreamWriter.new(output, @schema) + stream_writer.write_record_batch(@record_batch) + stream_writer.close + output.close + + @listener = Listener.new + @decoder = Arrow::StreamDecoder.new(@listener) + end + + def test_listener + assert_equal(@listener, @decoder.listener) + end + + def test_consume_bytes + @buffer.data.to_s.each_byte do |byte| + @decoder.consume_bytes(GLib::Bytes.new(byte.chr)) + end + assert_equal([ + [:schema_decoded, @schema, @schema], + [:record_batch_decoded, @record_batch, nil], + [:eos], + ], + @listener.events) + end + + def test_consume_buffer + @buffer.data.to_s.each_byte do |byte| + @decoder.consume_buffer(Arrow::Buffer.new(byte.chr)) + end + assert_equal([ + [:schema_decoded, @schema, @schema], + [:record_batch_decoded, @record_batch, nil], + [:eos], + ], + @listener.events) + end + + def test_reset + @decoder.consume_bytes(@buffer.data.to_s[0, 10]) + @decoder.reset + @decoder.consume_bytes(@buffer.data) + assert_equal([ + [:schema_decoded, @schema, @schema], + [:record_batch_decoded, @record_batch, nil], + [:eos], + ], + @listener.events) + end + + def test_schema + assert_nil(@decoder.schema) + 
@decoder.consume_bytes(@buffer.data) + assert_equal(@schema, @decoder.schema) + end + + def test_next_required_size + data = @buffer.data.to_s + loop do + next_required_size = @decoder.next_required_size + break if next_required_size.zero? + @decoder.consume_bytes(data[0, next_required_size]) + data = data[next_required_size..-1] + end + assert_equal([ + [:schema_decoded, @schema, @schema], + [:record_batch_decoded, @record_batch, nil], + [:eos], + ], + @listener.events) + end +end diff --git a/c_glib/test/test-stream-writer.rb b/c_glib/test/test-stream-writer.rb index 32754e20838b4..261732ae91e15 100644 --- a/c_glib/test/test-stream-writer.rb +++ b/c_glib/test/test-stream-writer.rb @@ -35,6 +35,9 @@ def test_write_record_batch stream_writer.write_record_batch(record_batch) ensure stream_writer.close + assert do + stream_writer.closed? + end end ensure output.close diff --git a/c_glib/tool/generate-version-header.py b/c_glib/tool/generate-version-header.py index 7422432251ff1..ba8cb03d15a3e 100755 --- a/c_glib/tool/generate-version-header.py +++ b/c_glib/tool/generate-version-header.py @@ -140,6 +140,7 @@ def generate_availability_macros(library: str) -> str: ALL_VERSIONS = [ + (18, 0), (17, 0), (16, 0), (15, 0), diff --git a/c_glib/vcpkg.json b/c_glib/vcpkg.json index e88d2b8fe30d5..3941edbfec527 100644 --- a/c_glib/vcpkg.json +++ b/c_glib/vcpkg.json @@ -1,6 +1,6 @@ { "name": "arrow-glib", - "version-string": "17.0.0-SNAPSHOT", + "version-string": "18.0.0-SNAPSHOT", "dependencies": [ "glib", "gobject-introspection", diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index f688fbb63a9ad..08a052e82f24d 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -46,7 +46,9 @@ set ARROW_CMAKE_ARGS=-DARROW_DEPENDENCY_SOURCE=CONDA -DARROW_WITH_BZ2=ON set ARROW_CXXFLAGS=/WX /MP @rem Install GCS testbench +set PIPX_BIN_DIR=C:\Windows\ call %CD%\ci\scripts\install_gcs_testbench.bat +storage-testbench -h || exit /B @rem @rem Build and test Arrow C++ libraries (including Parquet) diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile index dff1f2224809a..f0084894e19dc 100644 --- a/ci/docker/conda-cpp.dockerfile +++ b/ci/docker/conda-cpp.dockerfile @@ -42,17 +42,19 @@ RUN mamba install -q -y \ valgrind && \ mamba clean --all +# We want to install the GCS testbench using the Conda base environment's Python, +# because the test environment's Python may later change. +ENV PIPX_BASE_PYTHON=/opt/conda/bin/python3 +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + # Ensure npm, node and azurite are on path. npm and node are required to install azurite, which will then need to -# be on the path for the tests to run. +# be on the path for the tests to run. ENV PATH=/opt/conda/envs/arrow/bin:$PATH COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_azurite.sh -# We want to install the GCS testbench using the same Python binary that the Conda code will use. 
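As a usage illustration of the new GParquetArrowFileWriter entry points added above (outside the diff itself), here is a minimal Ruby sketch of how they surface through the red-parquet bindings, mirroring the test-arrow-file-writer.rb cases in this patch. The output path and the sample column are placeholders; Arrow::Table.new with a column Hash and Arrow::TableBatchReader are ordinary red-arrow usage, not APIs introduced here.

require "parquet" # red-parquet, loads red-arrow as well

# Placeholder data: build a one-column table and take its single record batch.
table = Arrow::Table.new("enabled" => [true, nil, false, true])
record_batch = Arrow::TableBatchReader.new(table).read_next

writer = Parquet::ArrowFileWriter.new(record_batch.schema, "/tmp/enabled.parquet")
writer.schema # => the schema passed at construction time (new accessor)

# write_record_batch appends to the current buffered row group;
# new_buffered_row_group starts the next one, so two row groups are written.
writer.write_record_batch(record_batch)
writer.new_buffered_row_group
writer.write_record_batch(record_batch)
writer.close

reader = Parquet::ArrowFileReader.new("/tmp/enabled.parquet")
reader.n_row_groups # => 2

For column-at-a-time writes, new_row_group plus write_chunked_array plays the same role for unbuffered row groups, as exercised by test_write_chunked_array above.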
-COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index 30b9cd5199fab..bc268e484d019 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -23,8 +23,7 @@ ARG arch=amd64 ARG maven=3.8.7 ARG node=16 ARG yarn=1.22 -ARG jdk=8 -ARG go=1.21.8 +ARG jdk=11 # Install Archery and integration dependencies COPY ci/conda_env_archery.txt /arrow/ci/ @@ -44,15 +43,30 @@ RUN mamba install -q -y \ # Install Rust with only the needed components # (rustfmt is needed for tonic-build to compile the protobuf definitions) +# GH-41637: Version pinned at 1.77 because the glibc for conda-cpp is currently too old RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --profile=minimal -y && \ - $HOME/.cargo/bin/rustup toolchain install stable && \ + $HOME/.cargo/bin/rustup override set 1.77 && \ + $HOME/.cargo/bin/rustup toolchain install 1.77 && \ $HOME/.cargo/bin/rustup component add rustfmt ENV GOROOT=/opt/go \ GOBIN=/opt/go/bin \ GOPATH=/go \ PATH=/opt/go/bin:$PATH -RUN wget -nv -O - https://dl.google.com/go/go${go}.linux-${arch}.tar.gz | tar -xzf - -C /opt +# Use always latest go +RUN wget -nv -O - https://dl.google.com/go/go$( \ + curl \ + --fail \ + --location \ + --show-error \ + --silent \ + https://api.github.com/repos/golang/go/git/matching-refs/tags/go | \ + grep -o '"ref": "refs/tags/go.*"' | \ + tail -n 1 | \ + sed \ + -e 's,^"ref": "refs/tags/go,,g' \ + -e 's/"$//g' \ + ).linux-${arch}.tar.gz | tar -xzf - -C /opt ENV DOTNET_ROOT=/opt/dotnet \ PATH=/opt/dotnet:$PATH diff --git a/ci/docker/debian-12-go.dockerfile b/ci/docker/conda-python-cpython-debug.dockerfile similarity index 67% rename from ci/docker/debian-12-go.dockerfile rename to ci/docker/conda-python-cpython-debug.dockerfile index c958e6bdee211..36ba7865a888c 100644 --- a/ci/docker/debian-12-go.dockerfile +++ b/ci/docker/conda-python-cpython-debug.dockerfile @@ -15,15 +15,14 @@ # specific language governing permissions and limitations # under the License. 
-ARG arch=amd64 -ARG go=1.21 -ARG staticcheck=v0.4.7 -FROM ${arch}/golang:${go}-bookworm +ARG repo +ARG arch +ARG python=3.9 +FROM ${repo}:${arch}-conda-python-${python} -# FROM collects all the args, get back the staticcheck version arg -ARG staticcheck -RUN GO111MODULE=on go install honnef.co/go/tools/cmd/staticcheck@${staticcheck} - -# Copy the go.mod and go.sum over and pre-download all the dependencies -COPY go/ /arrow/go -RUN cd /arrow/go && go mod download +# (Docker oddity: ARG needs to be repeated after FROM) +ARG python=3.9 +RUN mamba install -y "conda-forge/label/python_debug::python=${python}[build=*_cpython]" && \ + mamba clean --all +# Quick check that we do have a debug mode CPython +RUN python -c "import sys; sys.gettotalrefcount()" diff --git a/ci/docker/conda-python-cython2.dockerfile b/ci/docker/conda-python-cython2.dockerfile index d67ef677276c7..859ad868b0c71 100644 --- a/ci/docker/conda-python-cython2.dockerfile +++ b/ci/docker/conda-python-cython2.dockerfile @@ -17,7 +17,7 @@ ARG repo ARG arch -ARG python=3.8 +ARG python=3.9 FROM ${repo}:${arch}-conda-python-${python} RUN mamba install -q -y "cython<3" && \ diff --git a/ci/docker/conda-python-dask.dockerfile b/ci/docker/conda-python-dask.dockerfile index 44840110817e9..2c063b2e643b6 100644 --- a/ci/docker/conda-python-dask.dockerfile +++ b/ci/docker/conda-python-dask.dockerfile @@ -17,7 +17,7 @@ ARG repo ARG arch=amd64 -ARG python=3.8 +ARG python=3.9 FROM ${repo}:${arch}-conda-python-${python} ARG dask=latest diff --git a/ci/docker/conda-python-hdfs.dockerfile b/ci/docker/conda-python-hdfs.dockerfile index fa4fa0d1fb772..4cf35f4b37a56 100644 --- a/ci/docker/conda-python-hdfs.dockerfile +++ b/ci/docker/conda-python-hdfs.dockerfile @@ -17,10 +17,10 @@ ARG repo ARG arch=amd64 -ARG python=3.8 +ARG python=3.9 FROM ${repo}:${arch}-conda-python-${python} -ARG jdk=8 +ARG jdk=11 ARG maven=3.8.7 RUN mamba install -q -y \ maven=${maven} \ diff --git a/ci/docker/conda-python-jpype.dockerfile b/ci/docker/conda-python-jpype.dockerfile index d9b43afdaec9e..c28400f0262da 100644 --- a/ci/docker/conda-python-jpype.dockerfile +++ b/ci/docker/conda-python-jpype.dockerfile @@ -17,7 +17,7 @@ ARG repo ARG arch=amd64 -ARG python=3.8 +ARG python=3.9 FROM ${repo}:${arch}-conda-python-${python} ARG jdk=11 diff --git a/ci/docker/conda-python-pandas.dockerfile b/ci/docker/conda-python-pandas.dockerfile index 83ad52a13d639..9ee62cd282d36 100644 --- a/ci/docker/conda-python-pandas.dockerfile +++ b/ci/docker/conda-python-pandas.dockerfile @@ -17,7 +17,7 @@ ARG repo ARG arch=amd64 -ARG python=3.8 +ARG python=3.9 FROM ${repo}:${arch}-conda-python-${python} ARG pandas=latest diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index 866f6f37f8bd9..a8e8250797fa8 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -17,10 +17,10 @@ ARG repo ARG arch=amd64 -ARG python=3.8 +ARG python=3.9 FROM ${repo}:${arch}-conda-python-${python} -ARG jdk=8 +ARG jdk=11 ARG maven=3.8.7 ARG numpy=latest diff --git a/ci/docker/conda-python.dockerfile b/ci/docker/conda-python.dockerfile index 027fd589cecca..3897a7217d975 100644 --- a/ci/docker/conda-python.dockerfile +++ b/ci/docker/conda-python.dockerfile @@ -20,7 +20,7 @@ ARG arch FROM ${repo}:${arch}-conda-cpp # install python specific packages -ARG python=3.8 +ARG python=3.9 COPY ci/conda_env_python.txt \ /arrow/ci/ # If the Python version being tested is the same as the Python used by the system gdb, @@ -32,11 +32,6 @@ RUN mamba 
install -q -y \ nomkl && \ mamba clean --all -# XXX The GCS testbench was already installed in conda-cpp.dockerfile, -# but we changed the installed Python version above, so we need to reinstall it. -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - ENV ARROW_ACERO=ON \ ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ diff --git a/ci/docker/debian-go-cgo.dockerfile b/ci/docker/debian-go-cgo.dockerfile deleted file mode 100644 index a494d1e1564ff..0000000000000 --- a/ci/docker/debian-go-cgo.dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG base -FROM ${base} - -ENV DEBIAN_FRONTEND noninteractive - -# install libarrow-dev to link against with CGO -RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends ca-certificates lsb-release wget && \ - wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb && \ - apt-get install -y -q --no-install-recommends ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb && \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - cmake \ - libarrow-dev && \ - apt-get clean diff --git a/ci/docker/fedora-39-cpp.dockerfile b/ci/docker/fedora-39-cpp.dockerfile index 33d11823094ce..2ac5afe7b91f6 100644 --- a/ci/docker/fedora-39-cpp.dockerfile +++ b/ci/docker/fedora-39-cpp.dockerfile @@ -34,6 +34,7 @@ RUN dnf update -y && \ curl-devel \ gcc \ gcc-c++ \ + gdb \ gflags-devel \ git \ glog-devel \ diff --git a/ci/docker/java-jni-manylinux-201x.dockerfile b/ci/docker/java-jni-manylinux-201x.dockerfile index 8b73c73c1d240..479f4aa598b18 100644 --- a/ci/docker/java-jni-manylinux-201x.dockerfile +++ b/ci/docker/java-jni-manylinux-201x.dockerfile @@ -33,7 +33,7 @@ RUN vcpkg install \ --x-feature=s3 # Install Java -ARG java=1.8.0 +ARG java=11 ARG maven=3.9.3 RUN yum install -y java-$java-openjdk-devel && \ yum clean all && \ diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index 1c916840e071b..0804f3543c283 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -19,7 +19,7 @@ ARG base FROM ${base} ARG r=4.4 -ARG jdk=8 +ARG jdk=11 ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium diff --git a/ci/docker/linux-apt-python-3.dockerfile b/ci/docker/linux-apt-python-3.dockerfile index 2e07c244017a0..e215976d44850 100644 --- a/ci/docker/linux-apt-python-3.dockerfile +++ b/ci/docker/linux-apt-python-3.dockerfile @@ -23,6 +23,7 @@ COPY python/requirements-build.txt \ /arrow/python/ ENV ARROW_PYTHON_VENV /arrow-dev + RUN python3 -m venv ${ARROW_PYTHON_VENV} && \ . 
${ARROW_PYTHON_VENV}/bin/activate && \ pip install -U pip setuptools wheel && \ diff --git a/ci/docker/linux-apt-python-313-freethreading.dockerfile b/ci/docker/linux-apt-python-313-freethreading.dockerfile new file mode 100644 index 0000000000000..f5505e67f00bb --- /dev/null +++ b/ci/docker/linux-apt-python-313-freethreading.dockerfile @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG base +FROM ${base} + +RUN apt-get update -y -q && \ + apt install -y -q --no-install-recommends software-properties-common gpg-agent && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt-get update -y -q && \ + apt install -y -q --no-install-recommends python3.13-dev python3.13-nogil python3.13-venv && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +COPY python/requirements-build.txt \ + python/requirements-test.txt \ + /arrow/python/ + +ENV ARROW_PYTHON_VENV /arrow-dev +RUN python3.13t -m venv ${ARROW_PYTHON_VENV} +RUN ${ARROW_PYTHON_VENV}/bin/python -m pip install -U pip setuptools wheel +RUN ${ARROW_PYTHON_VENV}/bin/python -m pip install \ + --pre \ + --prefer-binary \ + --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" \ + -r arrow/python/requirements-build.txt \ + -r arrow/python/requirements-test.txt + +# We want to run the PyArrow test suite with the GIL disabled, but cffi +# (more precisely, the `_cffi_backend` module) currently doesn't declare +# itself safe to run without the GIL. +# Therefore set PYTHON_GIL to 0. 
+ENV ARROW_ACERO=ON \ + ARROW_BUILD_STATIC=OFF \ + ARROW_BUILD_TESTS=OFF \ + ARROW_BUILD_UTILITIES=OFF \ + ARROW_COMPUTE=ON \ + ARROW_CSV=ON \ + ARROW_DATASET=ON \ + ARROW_FILESYSTEM=ON \ + ARROW_GDB=ON \ + ARROW_HDFS=ON \ + ARROW_JSON=ON \ + ARROW_USE_GLOG=OFF \ + PYTHON_GIL=0 diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index 630b96e1007b9..4be5adf246b88 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -58,6 +58,7 @@ RUN apt-get update -y && \ locales \ # Need Python to check py-to-r bridge python3 \ + python3-venv \ python3-pip \ python3-dev && \ locale-gen en_US.UTF-8 && \ @@ -81,15 +82,16 @@ RUN cat /arrow/ci/etc/rprofile >> $(R RHOME)/etc/Rprofile.site # Also ensure parallel compilation of C/C++ code RUN echo "MAKEFLAGS=-j$(R -s -e 'cat(parallel::detectCores())')" >> $(R RHOME)/etc/Renviron.site -# Set up Python 3 and its dependencies -RUN ln -s /usr/bin/python3 /usr/local/bin/python && \ - ln -s /usr/bin/pip3 /usr/local/bin/pip - COPY ci/scripts/r_deps.sh /arrow/ci/scripts/ COPY r/DESCRIPTION /arrow/r/ RUN /arrow/ci/scripts/r_deps.sh /arrow -RUN pip install -U pip setuptools wheel +ENV ARROW_PYTHON_VENV /arrow-dev +COPY python/requirements-build.txt /arrow/python/ +RUN python3 -m venv ${ARROW_PYTHON_VENV} && \ + source ${ARROW_PYTHON_VENV}/bin/activate && \ + pip install -U pip setuptools wheel && \ + pip install -r arrow/python/requirements-build.txt COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local @@ -97,9 +99,6 @@ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_gcs_testbench.sh default -COPY python/requirements-build.txt /arrow/python/ -RUN pip install -r arrow/python/requirements-build.txt - ENV \ ARROW_ACERO=ON \ ARROW_BUILD_STATIC=OFF \ diff --git a/ci/docker/debian-12-go-cgo-python.dockerfile b/ci/docker/python-free-threaded-wheel-manylinux-test-imports.dockerfile similarity index 67% rename from ci/docker/debian-12-go-cgo-python.dockerfile rename to ci/docker/python-free-threaded-wheel-manylinux-test-imports.dockerfile index a24955f76e666..09530560e4f20 100644 --- a/ci/docker/debian-12-go-cgo-python.dockerfile +++ b/ci/docker/python-free-threaded-wheel-manylinux-test-imports.dockerfile @@ -18,17 +18,16 @@ ARG base FROM ${base} -ENV DEBIAN_FRONTEND noninteractive - -# Install python3 and pip so we can install pyarrow to test the C data interface. RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - python3 \ - python3-pip \ - python3-venv && \ - apt-get clean + apt install -y -q --no-install-recommends software-properties-common gpg-agent && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt-get update -y -q && \ + apt install -y -q --no-install-recommends python3.13-dev python3.13-nogil python3.13-venv && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* ENV ARROW_PYTHON_VENV /arrow-dev -RUN python3 -m venv ${ARROW_PYTHON_VENV} && \ - . 
${ARROW_PYTHON_VENV}/bin/activate && \ - pip install pyarrow cffi --only-binary pyarrow +RUN python3.13t -m venv ${ARROW_PYTHON_VENV} + +ENV PYTHON_GIL 0 +ENV PATH "${ARROW_PYTHON_VENV}/bin:${PATH}" diff --git a/ci/docker/python-free-threaded-wheel-manylinux-test-unittests.dockerfile b/ci/docker/python-free-threaded-wheel-manylinux-test-unittests.dockerfile new file mode 100644 index 0000000000000..13b3bc140a9a8 --- /dev/null +++ b/ci/docker/python-free-threaded-wheel-manylinux-test-unittests.dockerfile @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG base +FROM ${base} + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update -y -q && \ + apt install -y -q --no-install-recommends software-properties-common gpg-agent && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt-get update -y -q && \ + apt install -y -q --no-install-recommends \ + build-essential \ + libffi-dev \ + python3.13-dev \ + python3.13-nogil \ + python3.13-venv && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +ENV ARROW_PYTHON_VENV /arrow-dev +RUN python3.13t -m venv ${ARROW_PYTHON_VENV} + +ENV PYTHON_GIL 0 +ENV PATH "${ARROW_PYTHON_VENV}/bin:${PATH}" + +# pandas doesn't provide wheels for aarch64 yet, so we have to install nightly Cython +# along with the rest of pandas' build dependencies and disable build isolation +COPY python/requirements-wheel-test.txt /arrow/python/ +RUN python -m pip install \ + --pre \ + --prefer-binary \ + --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" \ + Cython numpy +RUN python -m pip install "meson-python==0.13.1" "meson==1.2.1" wheel "versioneer[toml]" ninja +RUN python -m pip install --no-build-isolation -r /arrow/python/requirements-wheel-test.txt diff --git a/ci/docker/python-wheel-manylinux-test.dockerfile b/ci/docker/python-wheel-manylinux-test.dockerfile index cdd0ae3ced756..09883f9780a36 100644 --- a/ci/docker/python-wheel-manylinux-test.dockerfile +++ b/ci/docker/python-wheel-manylinux-test.dockerfile @@ -16,15 +16,22 @@ # under the License. 
ARG arch -ARG python -FROM ${arch}/python:${python} - -# RUN pip install --upgrade pip +ARG python_image_tag +FROM ${arch}/python:${python_image_tag} # pandas doesn't provide wheel for aarch64 yet, so cache the compiled # test dependencies in a docker image COPY python/requirements-wheel-test.txt /arrow/python/ RUN pip install -r /arrow/python/requirements-wheel-test.txt +# Install the GCS testbench with the system Python +RUN apt-get update -y -q && \ + apt-get install -y -q \ + build-essential \ + python3-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN PYTHON=python /arrow/ci/scripts/install_gcs_testbench.sh default +ENV PIPX_PYTHON=/usr/bin/python3 PIPX_PIP_ARGS=--prefer-binary +RUN /arrow/ci/scripts/install_gcs_testbench.sh default diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index cb39667af1e10..d22a70a2d777b 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -47,7 +47,7 @@ RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget # on manylinux_2_28, no system python is installed. # We therefore override the PATH with Python 3.8 in /opt/python # so that we have a consistent Python version across base images. -ENV CPYTHON_VERSION=cp38 +ENV CPYTHON_VERSION=cp39 ENV PATH=/opt/python/${CPYTHON_VERSION}-${CPYTHON_VERSION}/bin:${PATH} # Install CMake @@ -100,10 +100,15 @@ RUN vcpkg install \ --x-feature=parquet \ --x-feature=s3 +# Make sure auditwheel is up-to-date +RUN pipx upgrade auditwheel + # Configure Python for applications running in the bash shell of this Dockerfile -ARG python=3.8 +ARG python=3.9 +ARG python_abi_tag=cp39 ENV PYTHON_VERSION=${python} -RUN PYTHON_ROOT=$(find /opt/python -name cp${PYTHON_VERSION/./}-*) && \ +ENV PYTHON_ABI_TAG=${python_abi_tag} +RUN PYTHON_ROOT=$(find /opt/python -name cp${PYTHON_VERSION/./}-${PYTHON_ABI_TAG}) && \ echo "export PATH=$PYTHON_ROOT/bin:\$PATH" >> /etc/profile.d/python.sh SHELL ["/bin/bash", "-i", "-c"] diff --git a/ci/docker/python-wheel-windows-test-vs2019.dockerfile b/ci/docker/python-wheel-windows-test-vs2019.dockerfile index 32bbb55e82689..8c17ebfa2fe0a 100644 --- a/ci/docker/python-wheel-windows-test-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile @@ -27,23 +27,37 @@ FROM abrarov/msvc-2019:2.11.0 # Add unix tools to path RUN setx path "%path%;C:\Program Files\Git\usr\bin" -# Remove previous installations of python from the base image +# 1. Remove previous installations of python from the base image # NOTE: a more recent base image (tried with 2.12.1) comes with python 3.9.7 # and the msi installers are failing to remove pip and tcl/tk "products" making # the subsequent choco python installation step failing for installing python # version 3.9.* due to existing python version +# 2. Install Minio for S3 testing. 
RUN wmic product where "name like 'python%%'" call uninstall /nointeractive && \ - rm -rf Python* + rm -rf Python* && \ + curl https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z \ + --output "C:\Windows\Minio.exe" -# Define the full version number otherwise choco falls back to patch number 0 (3.8 => 3.8.0) -ARG python=3.8 -RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\Python38;C:\Python38\Scripts") & \ - (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ - (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ - (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.5" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.0" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") +# Install the GCS testbench using a well-known Python version. +# NOTE: cannot use pipx's `--fetch-missing-python` because of +# https://github.com/pypa/pipx/issues/1521, therefore download Python ourselves. +RUN choco install -r -y --pre --no-progress python --version=3.11.9 +ENV PIPX_BIN_DIR=C:\\Windows\\ +ENV PIPX_PYTHON="C:\Python311\python.exe" +COPY ci/scripts/install_gcs_testbench.bat C:/arrow/ci/scripts/ +RUN call "C:\arrow\ci\scripts\install_gcs_testbench.bat" && \ + storage-testbench -h + +# Define the full version number otherwise choco falls back to patch number 0 (3.9 => 3.9.0) +ARG python=3.9 +RUN (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13") & \ + (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11") & \ + (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9") & \ + (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.5") & \ + (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1") # Install archiver to extract xz archives -RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% & \ - python -m pip install --no-cache-dir -U pip setuptools & \ +RUN choco install -r -y --pre --no-progress --force python --version=%PYTHON_VERSION% && \ choco install --no-progress -r -y archiver + +ENV PYTHON=$python diff --git a/ci/docker/python-wheel-windows-vs2019.dockerfile b/ci/docker/python-wheel-windows-vs2019.dockerfile index ff42de939d91f..f9d31eb5771ef 100644 --- a/ci/docker/python-wheel-windows-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-vs2019.dockerfile @@ -78,14 +78,14 @@ RUN vcpkg install \ RUN wmic product where "name like 'python%%'" call uninstall /nointeractive && \ rm -rf Python* -# Define the full version number otherwise choco falls back to patch number 0 (3.8 => 3.8.0) -ARG python=3.8 -RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\Python38;C:\Python38\Scripts") & \ - (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ +# Define the full version number otherwise choco falls back to patch number 0 (3.9 => 3.9.0) +ARG python=3.9 +RUN (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ - (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.5" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.0" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") -RUN choco install -r -y --no-progress python 
--version=%PYTHON_VERSION% + (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ + (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.5" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") & \ + (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1" && setx PATH "%PATH%;C:\Python313;C:\Python313\Scripts") +RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION% RUN python -m pip install -U pip setuptools COPY python/requirements-wheel-build.txt arrow/python/ diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index e17c0306f115d..1b342df596c9d 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -29,10 +29,12 @@ RUN apt-get update -y -q && \ ccache \ cmake \ curl \ + gdb \ git \ libssl-dev \ libcurl4-openssl-dev \ python3-pip \ + python3-venv \ tzdata \ wget && \ apt-get clean && \ diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index 341d8a87e8661..ce31c457e909e 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -29,10 +29,12 @@ RUN apt-get update -y -q && \ ccache \ cmake \ curl \ + gdb \ git \ libssl-dev \ libcurl4-openssl-dev \ python3-pip \ + python3-venv \ tzdata \ wget && \ apt-get clean && \ diff --git a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile new file mode 100644 index 0000000000000..a1fd178a2c754 --- /dev/null +++ b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG base=amd64/ubuntu:24.04 +FROM ${base} + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +RUN echo "debconf debconf/frontend select Noninteractive" | \ + debconf-set-selections + +RUN apt-get update -y -q && \ + apt-get install -y -q \ + build-essential \ + ccache \ + cmake \ + curl \ + gdb \ + git \ + libssl-dev \ + libcurl4-openssl-dev \ + python3-pip \ + python3-venv \ + tzdata \ + tzdata-legacy \ + wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Installs LLVM toolchain, for Gandiva and testing other compilers +# +# Note that this is installed before the base packages to improve iteration +# while debugging package list with docker build. 
+ARG llvm +RUN latest_system_llvm=14 && \ + if [ ${llvm} -gt ${latest_system_llvm} ]; then \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + gnupg \ + lsb-release \ + wget && \ + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ + code_name=$(lsb_release --codename --short) && \ + if [ ${llvm} -gt 10 ]; then \ + echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${llvm} main" > \ + /etc/apt/sources.list.d/llvm.list; \ + fi; \ + fi && \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + clang-${llvm} \ + llvm-${llvm}-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh latest /usr/local + +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + +ENV ARROW_ACERO=ON \ + ARROW_AZURE=OFF \ + ARROW_BUILD_TESTS=ON \ + ARROW_DATASET=ON \ + ARROW_FLIGHT=ON \ + ARROW_GANDIVA=ON \ + ARROW_GCS=ON \ + ARROW_HDFS=ON \ + ARROW_HOME=/usr/local \ + ARROW_INSTALL_NAME_RPATH=OFF \ + ARROW_ORC=ON \ + ARROW_PARQUET=ON \ + ARROW_S3=ON \ + ARROW_USE_CCACHE=ON \ + ARROW_WITH_BROTLI=ON \ + ARROW_WITH_BZ2=ON \ + ARROW_WITH_LZ4=ON \ + ARROW_WITH_OPENTELEMETRY=OFF \ + ARROW_WITH_SNAPPY=ON \ + ARROW_WITH_ZLIB=ON \ + ARROW_WITH_ZSTD=ON \ + CMAKE_GENERATOR="Unix Makefiles" \ + PARQUET_BUILD_EXAMPLES=ON \ + PARQUET_BUILD_EXECUTABLES=ON \ + PATH=/usr/lib/ccache/:$PATH \ + PYTHON=python3 diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index ecfb5e2f5096d..7d0772c33a255 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -57,6 +57,7 @@ RUN latest_system_llvm=18 && \ clang-${llvm} \ clang-format-${clang_tools} \ clang-tidy-${clang_tools} \ + libclang-rt-${llvm}-dev \ llvm-${llvm}-dev && \ apt-get clean && \ rm -rf /var/lib/apt/lists* diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index e12099f2b405d..ed68faae950b1 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=16.1.0.9000 +pkgver=17.0.0.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 2c640f2c1fb6a..7912bf23e491c 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -80,7 +80,7 @@ case "$(uname)" in ;; esac -if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then +if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then n_jobs=1 # avoid spurious fails on emscripten due to loading too many big executables fi diff --git a/ci/scripts/go_bench.sh b/ci/scripts/go_bench.sh deleted file mode 100755 index 6d5305f9eeff2..0000000000000 --- a/ci/scripts/go_bench.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# this will output the benchmarks to STDOUT but if `-json` is passed -# as the second argument, it will create a file "bench_stats.json" -# in the directory this is called from containing a json representation - -set -ex - -# simplistic semver comparison -verlte() { - [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ] -} -verlt() { - [ "$1" = "$2" ] && return 1 || verlte $1 $2 -} - -ver=`go env GOVERSION` - -source_dir=${1}/go - -export PARQUET_TEST_DATA=${1}/cpp/submodules/parquet-testing/data -pushd ${source_dir} - -# lots of benchmarks, they can take a while -# the timeout is for *ALL* benchmarks together, -# not per benchmark -go test -bench=. -benchmem -timeout 40m -run=^$ ./... | tee bench_stat.dat - -popd - -if [[ "$2" = "-json" ]]; then - go install go.bobheadxi.dev/gobenchdata@latest - export PATH=`go env GOPATH`/bin:$PATH - cat ${source_dir}/bench_*.dat | gobenchdata --json bench_stats.json -fi - -rm ${source_dir}/bench_*.dat \ No newline at end of file diff --git a/ci/scripts/go_bench_adapt.py b/ci/scripts/go_bench_adapt.py deleted file mode 100644 index a05e25de8bdd3..0000000000000 --- a/ci/scripts/go_bench_adapt.py +++ /dev/null @@ -1,127 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import json -import os -import uuid -import logging -from pathlib import Path -from typing import List - -from benchadapt import BenchmarkResult -from benchadapt.adapters import BenchmarkAdapter -from benchadapt.log import log - -log.setLevel(logging.DEBUG) - -ARROW_ROOT = Path(__file__).parent.parent.parent.resolve() -SCRIPTS_PATH = ARROW_ROOT / "ci" / "scripts" - -# `github_commit_info` is meant to communicate GitHub-flavored commit -# information to Conbench. See -# https://github.com/conbench/conbench/blob/cf7931f/benchadapt/python/benchadapt/result.py#L66 -# for a specification. -github_commit_info = {"repository": "https://github.com/apache/arrow"} - -if os.environ.get("CONBENCH_REF") == "main": - # Assume GitHub Actions CI. The environment variable lookups below are - # expected to fail when not running in GitHub Actions. 
- github_commit_info = { - "repository": f'{os.environ["GITHUB_SERVER_URL"]}/{os.environ["GITHUB_REPOSITORY"]}', - "commit": os.environ["GITHUB_SHA"], - "pr_number": None, # implying default branch - } - run_reason = "commit" -else: - # Assume that the environment is not GitHub Actions CI. Error out if that - # assumption seems to be wrong. - assert os.getenv("GITHUB_ACTIONS") is None - - # This is probably a local dev environment, for testing. In this case, it - # does usually not make sense to provide commit information (not a - # controlled CI environment). Explicitly leave out "commit" and "pr_number" to - # reflect that (to not send commit information). - - # Reflect 'local dev' scenario in run_reason. Allow user to (optionally) - # inject a custom piece of information into the run reason here, from - # environment. - run_reason = "localdev" - custom_reason_suffix = os.getenv("CONBENCH_CUSTOM_RUN_REASON") - if custom_reason_suffix is not None: - run_reason += f" {custom_reason_suffix.strip()}" - - -class GoAdapter(BenchmarkAdapter): - result_file = "bench_stats.json" - command = ["bash", SCRIPTS_PATH / "go_bench.sh", ARROW_ROOT, "-json"] - - def __init__(self, *args, **kwargs) -> None: - super().__init__(command=self.command, *args, **kwargs) - - def _transform_results(self) -> List[BenchmarkResult]: - with open(self.result_file, "r") as f: - raw_results = json.load(f) - - run_id = uuid.uuid4().hex - parsed_results = [] - for suite in raw_results[0]["Suites"]: - batch_id = uuid.uuid4().hex - pkg = suite["Pkg"] - - for benchmark in suite["Benchmarks"]: - data = benchmark["Mem"]["MBPerSec"] * 1e6 - time = 1 / benchmark["NsPerOp"] * 1e9 - - name = benchmark["Name"].removeprefix("Benchmark") - ncpu = name[name.rfind("-") + 1 :] - pieces = name[: -(len(ncpu) + 1)].split("/") - - parsed = BenchmarkResult( - run_id=run_id, - batch_id=batch_id, - stats={ - "data": [data], - "unit": "B/s", - "times": [time], - "time_unit": "i/s", - "iterations": benchmark["Runs"], - }, - context={ - "benchmark_language": "Go", - "goos": suite["Goos"], - "goarch": suite["Goarch"], - }, - tags={ - "pkg": pkg, - "num_cpu": ncpu, - "name": pieces[0], - "params": "/".join(pieces[1:]), - }, - run_reason=run_reason, - github=github_commit_info, - ) - parsed.run_name = ( - f"{parsed.run_reason}: {github_commit_info.get('commit')}" - ) - parsed_results.append(parsed) - - return parsed_results - - -if __name__ == "__main__": - go_adapter = GoAdapter(result_fields_override={"info": {}}) - go_adapter() diff --git a/ci/scripts/go_build.sh b/ci/scripts/go_build.sh deleted file mode 100755 index ea77ecf56ac0e..0000000000000 --- a/ci/scripts/go_build.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -ex - -source_dir=${1}/go - -# Need "all=" as per https://github.com/golang/go/issues/42131#issuecomment-713917379 -export GOFLAGS="${GOFLAGS} -gcflags=all=-d=checkptr" - -pushd ${source_dir}/arrow - -if [[ -n "${ARROW_GO_TESTCGO}" ]]; then - if [[ "${MSYSTEM}" = "MINGW64" ]]; then - export PATH=${MINGW_PREFIX}/bin:$PATH - go clean -cache - go clean -testcache - fi - TAGS="-tags assert,test,ccalloc" -fi - -go install $TAGS -v ./... - -popd - -pushd ${source_dir}/parquet - -go install -v ./... - -popd - -: ${ARROW_INTEGRATION_GO:=ON} - -if [ "${ARROW_INTEGRATION_GO}" == "ON" ]; then - pushd ${source_dir}/arrow/internal/cdata_integration - - case "$(uname)" in - Linux) - go_lib="arrow_go_integration.so" - ;; - Darwin) - go_lib="arrow_go_integration.dylib" - ;; - MINGW*) - go_lib="arrow_go_integration.dll" - ;; - esac - go build -buildvcs=false -tags cdata_integration,assert -buildmode=c-shared -o ${go_lib} . - - popd -fi diff --git a/ci/scripts/go_test.sh b/ci/scripts/go_test.sh deleted file mode 100755 index bad2ffe619026..0000000000000 --- a/ci/scripts/go_test.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -# simplistic semver comparison -verlte() { - [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ] -} -verlt() { - [ "$1" = "$2" ] && return 1 || verlte $1 $2 -} - -ver=`go env GOVERSION` - -source_dir=${1}/go - -testargs="-race" -if verlte "1.18" "${ver#go}" && [ "$(go env GOOS)" != "darwin" ]; then - # asan not supported on darwin/amd64 - testargs="-asan" -fi - -case "$(uname)" in - MINGW*) - # -asan and -race don't work on windows currently - testargs="" - ;; -esac - -if [[ "$(go env GOHOSTARCH)" = "s390x" ]]; then - testargs="" # -race and -asan not supported on s390x -fi - -# Go static check (skipped in MinGW) -if [[ -z "${MINGW_LINT}" ]]; then - pushd ${source_dir} - "$(go env GOPATH)"/bin/staticcheck ./... - popd -fi - - -pushd ${source_dir}/arrow - -TAGS="assert,test" -if [[ -n "${ARROW_GO_TESTCGO}" ]]; then - if [[ "${MSYSTEM}" = "MINGW64" ]]; then - export PATH=${MINGW_PREFIX}\\bin:${MINGW_PREFIX}\\lib:$PATH - fi - TAGS="${TAGS},ccalloc" -fi - -# the cgo implementation of the c data interface requires the "test" -# tag in order to run its tests so that the testing functions implemented -# in .c files don't get included in non-test builds. - -go test $testargs -tags $TAGS ./... - -# run it again but with the noasm tag -go test $testargs -tags $TAGS,noasm ./... - -popd - -export PARQUET_TEST_DATA=${1}/cpp/submodules/parquet-testing/data -export ARROW_TEST_DATA=${1}/testing/data -pushd ${source_dir}/parquet - -go test $testargs -tags assert ./... - -# run the tests again but with the noasm tag -go test $testargs -tags assert,noasm ./... 
- -popd diff --git a/ci/scripts/install_azurite.sh b/ci/scripts/install_azurite.sh index dda5e99405b7f..b8b1618bed314 100755 --- a/ci/scripts/install_azurite.sh +++ b/ci/scripts/install_azurite.sh @@ -19,20 +19,32 @@ set -e -# Pin azurite to 3.29.0 due to https://github.com/apache/arrow/issues/41505 +node_version="$(node --version)" +echo "node version = ${node_version}" + +case "${node_version}" in + v12*) + # Pin azurite to 3.29.0 due to https://github.com/apache/arrow/issues/41505 + azurite_version=v3.29.0 + ;; + *) + azurite_version=latest + ;; +esac + case "$(uname)" in Darwin) - npm install -g azurite@v3.29.0 + npm install -g azurite@${azurite_version} which azurite ;; MINGW*) choco install nodejs.install - npm install -g azurite@v3.29.0 + npm install -g azurite@${azurite_version} ;; Linux) - npm install -g azurite@v3.29.0 + npm install -g azurite@${azurite_version} which azurite ;; esac -echo "node version = $(node --version)" -echo "azurite version = $(azurite --version)" \ No newline at end of file + +echo "azurite version = $(azurite --version)" diff --git a/ci/scripts/install_gcs_testbench.bat b/ci/scripts/install_gcs_testbench.bat index b03d0c2ad6608..f54f98db7cac8 100644 --- a/ci/scripts/install_gcs_testbench.bat +++ b/ci/scripts/install_gcs_testbench.bat @@ -17,9 +17,18 @@ @echo on -set GCS_TESTBENCH_VERSION="v0.36.0" +set GCS_TESTBENCH_VERSION="v0.40.0" + +set PIPX_FLAGS=--verbose +if NOT "%PIPX_PYTHON%"=="" ( + set PIPX_FLAGS=--python %PIPX_PYTHON% %PIPX_FLAGS% +) + +python -m pip install -U pipx || exit /B 1 @REM Install GCS testbench %GCS_TESTBENCH_VERSION% -python -m pip install ^ +pipx install %PIPX_FLAGS% ^ "https://github.com/googleapis/storage-testbench/archive/%GCS_TESTBENCH_VERSION%.tar.gz" ^ || exit /B 1 + +pipx list --verbose diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index 2090290c99322..48a5858a358c9 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. 
-set -e +set -ex if [ "$#" -ne 1 ]; then echo "Usage: $0 " @@ -34,15 +34,26 @@ case "$(uname -m)" in ;; esac -# On newer pythons install into the system will fail, so override that -export PIP_BREAK_SYSTEM_PACKAGES=1 - version=$1 if [[ "${version}" -eq "default" ]]; then version="v0.39.0" - # Latests versions of Testbench require newer setuptools - ${PYTHON:-python3} -m pip install --upgrade setuptools fi -${PYTHON:-python3} -m pip install \ +# The Python to install pipx with +: ${PIPX_BASE_PYTHON:=$(which python3)} +# The Python to install the GCS testbench with +: ${PIPX_PYTHON:=${PIPX_BASE_PYTHON:-$(which python3)}} + +export PIP_BREAK_SYSTEM_PACKAGES=1 +${PIPX_BASE_PYTHON} -m pip install -U pipx + +pipx_flags=(--verbose --python ${PIPX_PYTHON}) +if [[ $(id -un) == "root" ]]; then + # Install globally as /root/.local/bin is typically not in $PATH + pipx_flags+=(--global) +fi +if [[ -n "${PIPX_PIP_ARGS}" ]]; then + pipx_flags+=(--pip-args "'${PIPX_PIP_ARGS}'") +fi +${PIPX_BASE_PYTHON} -m pipx install ${pipx_flags[@]} \ "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz" diff --git a/ci/scripts/install_python.sh b/ci/scripts/install_python.sh index 5f962f02b911b..0f8a0804691e7 100755 --- a/ci/scripts/install_python.sh +++ b/ci/scripts/install_python.sh @@ -25,11 +25,12 @@ platforms=([windows]=Windows [linux]=Linux) declare -A versions -versions=([3.8]=3.8.10 - [3.9]=3.9.13 +versions=([3.9]=3.9.13 [3.10]=3.10.11 - [3.11]=3.11.5 - [3.12]=3.12.0) + [3.11]=3.11.9 + [3.12]=3.12.5 + [3.13]=3.13.0 + [3.13t]=3.13.0) if [ "$#" -ne 2 ]; then echo "Usage: $0 " @@ -46,7 +47,14 @@ full_version=${versions[$2]} if [ $platform = "macOS" ]; then echo "Downloading Python installer..." - if [ "$(uname -m)" = "arm64" ] || [ "$version" = "3.10" ] || [ "$version" = "3.11" ] || [ "$version" = "3.12" ]; then + if [ "$version" = "3.13" ] || [ "$version" = "3.13t" ]; + then + fname="python-${full_version}rc2-macos11.pkg" + elif [ "$(uname -m)" = "arm64" ] || \ + [ "$version" = "3.10" ] || \ + [ "$version" = "3.11" ] || \ + [ "$version" = "3.12" ]; + then fname="python-${full_version}-macos11.pkg" else fname="python-${full_version}-macosx10.9.pkg" @@ -54,15 +62,40 @@ if [ $platform = "macOS" ]; then wget "https://www.python.org/ftp/python/${full_version}/${fname}" echo "Installing Python..." - installer -pkg $fname -target / + if [[ $2 == "3.13t" ]]; then + # See https://github.com/python/cpython/issues/120098#issuecomment-2151122033 for more info on this. + cat > ./choicechanges.plist < + + + + + attributeSetting + 1 + choiceAttribute + selected + choiceIdentifier + org.python.Python.PythonTFramework-3.13 + + + +EOF + installer -pkg $fname -applyChoiceChangesXML ./choicechanges.plist -target / + rm ./choicechanges.plist + else + installer -pkg $fname -target / + fi rm $fname - echo "Installing Pip..." python="/Library/Frameworks/Python.framework/Versions/${version}/bin/python${version}" - pip="${python} -m pip" + if [[ $2 == "3.13t" ]]; then + python="/Library/Frameworks/PythonT.framework/Versions/3.13/bin/python3.13t" + fi + echo "Installing Pip..." $python -m ensurepip - $pip install -U pip setuptools + $python -m pip install -U pip setuptools else echo "Unsupported platform: $platform" + exit 1 fi diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index 2eb58e8dc75ec..8d0a343ebb443 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. 
-set -ex +set -e arrow_dir=${1} build_dir=${2} @@ -26,12 +26,19 @@ gold_dir=$arrow_dir/testing/data/arrow-ipc-stream/integration : ${ARROW_INTEGRATION_CPP:=ON} : ${ARROW_INTEGRATION_CSHARP:=ON} -: ${ARROW_INTEGRATION_GO:=ON} : ${ARROW_INTEGRATION_JAVA:=ON} : ${ARROW_INTEGRATION_JS:=ON} +: ${ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS:=cpp,csharp,java,js} +export ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS + +. ${arrow_dir}/ci/scripts/util_log.sh + +github_actions_group_begin "Integration: Prepare: Archery" pip install -e $arrow_dir/dev/archery[integration] +github_actions_group_end +github_actions_group_begin "Integration: Prepare: Dependencies" # For C Data Interface testing if [ "${ARROW_INTEGRATION_CSHARP}" == "ON" ]; then pip install pythonnet @@ -39,6 +46,7 @@ fi if [ "${ARROW_INTEGRATION_JAVA}" == "ON" ]; then pip install jpype1 fi +github_actions_group_end export ARROW_BUILD_ROOT=${build_dir} @@ -58,7 +66,6 @@ time archery integration \ --run-flight \ --with-cpp=$([ "$ARROW_INTEGRATION_CPP" == "ON" ] && echo "1" || echo "0") \ --with-csharp=$([ "$ARROW_INTEGRATION_CSHARP" == "ON" ] && echo "1" || echo "0") \ - --with-go=$([ "$ARROW_INTEGRATION_GO" == "ON" ] && echo "1" || echo "0") \ --with-java=$([ "$ARROW_INTEGRATION_JAVA" == "ON" ] && echo "1" || echo "0") \ --with-js=$([ "$ARROW_INTEGRATION_JS" == "ON" ] && echo "1" || echo "0") \ --gold-dirs=$gold_dir/0.14.1 \ diff --git a/ci/scripts/integration_arrow_build.sh b/ci/scripts/integration_arrow_build.sh index 9b54049a2b803..4dfcf8768c71f 100755 --- a/ci/scripts/integration_arrow_build.sh +++ b/ci/scripts/integration_arrow_build.sh @@ -17,33 +17,45 @@ # specific language governing permissions and limitations # under the License. -set -ex +set -e arrow_dir=${1} build_dir=${2} : ${ARROW_INTEGRATION_CPP:=ON} : ${ARROW_INTEGRATION_CSHARP:=ON} -: ${ARROW_INTEGRATION_GO:=ON} : ${ARROW_INTEGRATION_JAVA:=ON} : ${ARROW_INTEGRATION_JS:=ON} +. 
${arrow_dir}/ci/scripts/util_log.sh + +github_actions_group_begin "Integration: Build: Rust" ${arrow_dir}/ci/scripts/rust_build.sh ${arrow_dir} ${build_dir} +github_actions_group_end +github_actions_group_begin "Integration: Build: nanoarrow" ${arrow_dir}/ci/scripts/nanoarrow_build.sh ${arrow_dir} ${build_dir} +github_actions_group_end + +github_actions_group_begin "Integration: Build: Go" +if [ "${ARCHERY_INTEGRATION_WITH_GO}" -gt "0" ]; then + ${arrow_dir}/go/ci/scripts/build.sh ${arrow_dir}/go +fi +github_actions_group_end +github_actions_group_begin "Integration: Build: C++" if [ "${ARROW_INTEGRATION_CPP}" == "ON" ]; then ${arrow_dir}/ci/scripts/cpp_build.sh ${arrow_dir} ${build_dir} fi +github_actions_group_end +github_actions_group_begin "Integration: Build: C#" if [ "${ARROW_INTEGRATION_CSHARP}" == "ON" ]; then ${arrow_dir}/ci/scripts/csharp_build.sh ${arrow_dir} ${build_dir} fi +github_actions_group_end -if [ "${ARROW_INTEGRATION_GO}" == "ON" ]; then - ${arrow_dir}/ci/scripts/go_build.sh ${arrow_dir} ${build_dir} -fi - +github_actions_group_begin "Integration: Build: Java" if [ "${ARROW_INTEGRATION_JAVA}" == "ON" ]; then export ARROW_JAVA_CDATA="ON" export JAVA_JNI_CMAKE_ARGS="-DARROW_JAVA_JNI_ENABLE_DEFAULT=OFF -DARROW_JAVA_JNI_ENABLE_C=ON" @@ -51,7 +63,10 @@ if [ "${ARROW_INTEGRATION_JAVA}" == "ON" ]; then ${arrow_dir}/ci/scripts/java_jni_build.sh ${arrow_dir} ${ARROW_HOME} ${build_dir} /tmp/dist/java ${arrow_dir}/ci/scripts/java_build.sh ${arrow_dir} ${build_dir} /tmp/dist/java fi +github_actions_group_end +github_actions_group_begin "Integration: Build: JavaScript" if [ "${ARROW_INTEGRATION_JS}" == "ON" ]; then ${arrow_dir}/ci/scripts/js_build.sh ${arrow_dir} ${build_dir} fi +github_actions_group_end diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 0fa1edab429c0..212ec6eb11476 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -72,9 +72,6 @@ if [ $ARROW_JAVA_SKIP_GIT_PLUGIN ]; then mvn="${mvn} -Dmaven.gitcommitid.skip=true" fi -# Use `2 * ncores` threads -mvn="${mvn} -T 2C" - # https://github.com/apache/arrow/issues/41429 # TODO: We want to out-of-source build. This is a workaround. 
We copy # all needed files to the build directory from the source directory @@ -98,10 +95,12 @@ if [ "${ARROW_JAVA_JNI}" = "ON" ]; then mvn="${mvn} -Darrow.cpp.build.dir=${java_jni_dist_dir} -Parrow-jni" fi -${mvn} clean install +# Use `2 * ncores` threads +${mvn} -T 2C clean install if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 + # GH-43378: Maven site plugins not compatible with multithreading mkdir -p ${build_dir}/docs/java/reference ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false clean install site rsync -a target/site/apidocs/ ${build_dir}/docs/java/reference diff --git a/ci/scripts/java_test.sh b/ci/scripts/java_test.sh index dd483ff254197..5efda4318f15a 100755 --- a/ci/scripts/java_test.sh +++ b/ci/scripts/java_test.sh @@ -38,14 +38,10 @@ pushd ${source_dir} ${mvn} clean test projects=() -if [ "${ARROW_DATASET}" = "ON" ]; then - projects+=(gandiva) -fi -if [ "${ARROW_GANDIVA}" = "ON" ]; then - projects+=(gandiva) -fi -if [ "${ARROW_ORC}" = "ON" ]; then +if [ "${ARROW_JAVA_JNI}" = "ON" ]; then projects+=(adapter/orc) + projects+=(dataset) + projects+=(gandiva) fi if [ "${#projects[@]}" -gt 0 ]; then ${mvn} clean test \ diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 3ed9d5d8dd12f..d2c392e6b9db3 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -34,7 +34,7 @@ rm -rf ${source_dir}/python/pyarrow/*.so.* echo "=== (${PYTHON_VERSION}) Set SDK, C++ and Wheel flags ===" export _PYTHON_HOST_PLATFORM="macosx-${MACOSX_DEPLOYMENT_TARGET}-${arch}" -export MACOSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET:-10.15} +export MACOSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET:-12.0} export SDKROOT=${SDKROOT:-$(xcrun --sdk macosx --show-sdk-path)} if [ $arch = "arm64" ]; then @@ -48,13 +48,11 @@ fi echo "=== (${PYTHON_VERSION}) Install Python build dependencies ===" export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') -export PIP_TARGET_PLATFORM="macosx_${MACOSX_DEPLOYMENT_TARGET//./_}_${arch}" pip install \ --upgrade \ --only-binary=:all: \ --target $PIP_SITE_PACKAGES \ - --platform $PIP_TARGET_PLATFORM \ -r ${source_dir}/python/requirements-wheel-build.txt pip install "delocate>=0.10.3" @@ -152,7 +150,6 @@ echo "=== (${PYTHON_VERSION}) Building wheel ===" export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE} export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} -export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_ACERO=${ARROW_ACERO} export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index aa86494a9d47d..885019ff3049f 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -140,7 +140,6 @@ echo "=== (${PYTHON_VERSION}) Building wheel ===" export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE} export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} -export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_ACERO=${ARROW_ACERO} export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} @@ -181,5 +180,5 @@ popd rm -rf dist/temp-fix-wheel echo "=== (${PYTHON_VERSION}) Tag the wheel with manylinux${MANYLINUX_VERSION} ===" -auditwheel repair -L . 
dist/pyarrow-*.whl -w repaired_wheels +auditwheel repair dist/pyarrow-*.whl -w repaired_wheels popd diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index a25e5c51bddbc..1487581eaef51 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -34,6 +34,7 @@ source_dir=${1} : ${ARROW_S3:=ON} : ${ARROW_SUBSTRAIT:=ON} : ${CHECK_IMPORTS:=ON} +: ${CHECK_WHEEL_CONTENT:=ON} : ${CHECK_UNITTESTS:=ON} : ${INSTALL_PYARROW:=ON} @@ -54,11 +55,11 @@ export PYARROW_TEST_S3=${ARROW_S3} export PYARROW_TEST_TENSORFLOW=ON export ARROW_TEST_DATA=${source_dir}/testing/data -export PARQUET_TEST_DATA=${source_dir}/submodules/parquet-testing/data +export PARQUET_TEST_DATA=${source_dir}/cpp/submodules/parquet-testing/data if [ "${INSTALL_PYARROW}" == "ON" ]; then # Install the built wheels - pip install ${source_dir}/python/repaired_wheels/*.whl + python -m pip install ${source_dir}/python/repaired_wheels/*.whl fi if [ "${CHECK_IMPORTS}" == "ON" ]; then @@ -87,9 +88,14 @@ import pyarrow.parquet fi fi +if [ "${CHECK_WHEEL_CONTENT}" == "ON" ]; then + python ${source_dir}/ci/scripts/python_wheel_validate_contents.py \ + --path ${source_dir}/python/repaired_wheels +fi + if [ "${CHECK_UNITTESTS}" == "ON" ]; then # Install testing dependencies - pip install -U -r ${source_dir}/python/requirements-wheel-test.txt + python -m pip install -U -r ${source_dir}/python/requirements-wheel-test.txt # Execute unittest, test dependencies must be installed python -c 'import pyarrow; pyarrow.create_library_symlinks()' diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py new file mode 100644 index 0000000000000..22b3a890f036b --- /dev/null +++ b/ci/scripts/python_wheel_validate_contents.py @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import argparse +from pathlib import Path +import re +import zipfile + + +def validate_wheel(path): + p = Path(path) + wheels = list(p.glob('*.whl')) + error_msg = f"{len(wheels)} wheels found but only 1 expected ({wheels})" + assert len(wheels) == 1, error_msg + f = zipfile.ZipFile(wheels[0]) + outliers = [ + info.filename for info in f.filelist if not re.match( + r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/)', info.filename + ) + ] + assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" + print(f"The wheel: {wheels[0]} seems valid.") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--path", type=str, required=True, + help="Directory where wheel is located") + args = parser.parse_args() + validate_wheel(args.path) + + +if __name__ == '__main__': + main() diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 54f02ec6f6ed0..1f1d5dca721d9 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -106,7 +106,6 @@ echo "=== (%PYTHON_VERSION%) Building wheel ===" set PYARROW_BUILD_TYPE=%CMAKE_BUILD_TYPE% set PYARROW_BUNDLE_ARROW_CPP=ON set PYARROW_CMAKE_GENERATOR=%CMAKE_GENERATOR% -set PYARROW_INSTALL_TESTS=ON set PYARROW_WITH_ACERO=%ARROW_ACERO% set PYARROW_WITH_DATASET=%ARROW_DATASET% set PYARROW_WITH_FLIGHT=%ARROW_FLIGHT% diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index a928c3571d0cb..ae5b7e36ad7ab 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -28,37 +28,44 @@ set PYARROW_TEST_ORC=ON set PYARROW_TEST_PARQUET=ON set PYARROW_TEST_PARQUET_ENCRYPTION=ON set PYARROW_TEST_SUBSTRAIT=ON -set PYARROW_TEST_S3=OFF +set PYARROW_TEST_S3=ON set PYARROW_TEST_TENSORFLOW=ON @REM Enable again once https://github.com/scipy/oldest-supported-numpy/pull/27 gets merged @REM set PYARROW_TEST_PANDAS=ON set ARROW_TEST_DATA=C:\arrow\testing\data -set PARQUET_TEST_DATA=C:\arrow\submodules\parquet-testing\data +set PARQUET_TEST_DATA=C:\arrow\cpp\submodules\parquet-testing\data -@REM Install testing dependencies -pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1 +@REM List installed Pythons +py -0p + +set PYTHON_CMD=py -%PYTHON% -@REM Install GCS testbench -call "C:\arrow\ci\scripts\install_gcs_testbench.bat" +%PYTHON_CMD% -m pip install -U pip setuptools || exit /B 1 + +@REM Install testing dependencies +%PYTHON_CMD% -m pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1 @REM Install the built wheels -python -m pip install --no-index --find-links=C:\arrow\python\dist\ pyarrow || exit /B 1 +%PYTHON_CMD% -m pip install --no-index --find-links=C:\arrow\python\dist\ pyarrow || exit /B 1 @REM Test that the modules are importable -python -c "import pyarrow" || exit /B 1 -python -c "import pyarrow._gcsfs" || exit /B 1 -python -c "import pyarrow._hdfs" || exit /B 1 -python -c "import pyarrow._s3fs" || exit /B 1 -python -c "import pyarrow.csv" || exit /B 1 -python -c "import pyarrow.dataset" || exit /B 1 -python -c "import pyarrow.flight" || exit /B 1 -python -c "import pyarrow.fs" || exit /B 1 -python -c "import pyarrow.json" || exit /B 1 -python -c "import pyarrow.orc" || exit /B 1 -python -c "import pyarrow.parquet" || exit /B 1 -python -c "import pyarrow.substrait" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow._gcsfs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow._hdfs" || exit /B 1 
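Both the Unix and Windows wheel test scripts now run this validator against the directory holding the built wheel; on Unix the call is guarded by the new CHECK_WHEEL_CONTENT switch. A sketch of a standalone invocation, with an illustrative checkout path:

#!/usr/bin/env bash
# Run the wheel-content check against a directory containing exactly one wheel.
set -e

source_dir=/path/to/arrow                      # illustrative checkout location
wheel_dir=${source_dir}/python/repaired_wheels

# The validator asserts that the wheel contains only pyarrow/ and
# pyarrow-*.dist-info/ entries and exits non-zero otherwise.
python ${source_dir}/ci/scripts/python_wheel_validate_contents.py \
  --path ${wheel_dir}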
+%PYTHON_CMD% -c "import pyarrow._s3fs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.csv" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.dataset" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.flight" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.fs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.json" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.orc" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.parquet" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.substrait" || exit /B 1 + +@REM Validate wheel contents +%PYTHON_CMD% C:\arrow\ci\scripts\python_wheel_validate_contents.py --path C:\arrow\python\dist || exit /B 1 @rem Download IANA Timezone Database for ORC C++ curl https://cygwin.osuosl.org/noarch/release/tzdata/tzdata-2024a-1.tar.xz --output tzdata.tar.xz || exit /B @@ -67,4 +74,4 @@ arc unarchive tzdata.tar.xz %USERPROFILE%\Downloads\test\tzdata set TZDIR=%USERPROFILE%\Downloads\test\tzdata\usr\share\zoneinfo @REM Execute unittest -pytest -r s --pyargs pyarrow || exit /B 1 +%PYTHON_CMD% -m pytest -r s --pyargs pyarrow || exit /B 1 diff --git a/ci/scripts/r_install_system_dependencies.sh b/ci/scripts/r_install_system_dependencies.sh index 7ddc2604f661a..ae2a04656c528 100755 --- a/ci/scripts/r_install_system_dependencies.sh +++ b/ci/scripts/r_install_system_dependencies.sh @@ -54,7 +54,7 @@ if [ "$ARROW_S3" == "ON" ] || [ "$ARROW_GCS" == "ON" ] || [ "$ARROW_R_DEV" == "T case "$PACKAGE_MANAGER" in zypper) # python3 is Python 3.6 on OpenSUSE 15.3. - # PyArrow supports Python 3.8 or later. + # PyArrow supports Python 3.9 or later. $PACKAGE_MANAGER install -y python39-pip ln -s /usr/bin/python3.9 /usr/local/bin/python ln -s /usr/bin/pip3.9 /usr/local/bin/pip diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index fe9d18edb8cbb..d5fd78914755e 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -26,6 +26,10 @@ pushd ${source_dir} printenv +if [ -n "${ARROW_PYTHON_VENV:-}" ]; then + . "${ARROW_PYTHON_VENV}/bin/activate" +fi + # Run the nixlibs.R test suite, which is not included in the installed package ${R_BIN} -e 'setwd("tools"); testthat::test_dir(".", stop_on_warning = TRUE)' diff --git a/ci/scripts/util_enable_core_dumps.sh b/ci/scripts/util_enable_core_dumps.sh new file mode 100644 index 0000000000000..09f8d2d727099 --- /dev/null +++ b/ci/scripts/util_enable_core_dumps.sh @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# NOTE: this script is not marked executable as it should be source'd +# for `ulimit` to take effect. + +set -e + +platform=$(uname) + +if [ "${platform}" = "Linux" ]; then + # We need to override `core_pattern` because + # 1. the original setting may reference apport, which is not available under + # most Docker containers; + # 2. we want to write the core file in a well-known directory. 
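The r_test.sh change above activates a Python virtual environment whenever ARROW_PYTHON_VENV is set, presumably so the Python picked up during the R tests is a controlled one rather than whatever is first on PATH. A hypothetical caller could hand it such an environment as sketched below; the venv location is illustrative and not taken from this diff.

#!/usr/bin/env bash
# Sketch: provide a pre-built virtualenv to ci/scripts/r_test.sh via ARROW_PYTHON_VENV.
set -e

venv_dir=/tmp/arrow-r-python-venv    # illustrative location

python3 -m venv "${venv_dir}"
"${venv_dir}/bin/pip" install --upgrade pip

# r_test.sh sources "${ARROW_PYTHON_VENV}/bin/activate" when this is set.
export ARROW_PYTHON_VENV="${venv_dir}"
# ...then run ci/scripts/r_test.sh with its usual arguments; it activates this venv first.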
+ sudo sysctl -w kernel.core_pattern="/tmp/core.%e.%p" +fi + +ulimit -c unlimited diff --git a/ci/scripts/go_tinygo_example.sh b/ci/scripts/util_log.sh old mode 100755 new mode 100644 similarity index 84% rename from ci/scripts/go_tinygo_example.sh rename to ci/scripts/util_log.sh index 7bde56226db7b..b34c44059adb2 --- a/ci/scripts/go_tinygo_example.sh +++ b/ci/scripts/util_log.sh @@ -1,5 +1,3 @@ -#!/usr/bin/env bash -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,11 +15,12 @@ # specific language governing permissions and limitations # under the License. -set -ex - -cd ~ -pushd /src -tinygo build -tags noasm -o ~/example_tinygo arrow/_examples/helloworld/main.go -popd +github_actions_group_begin() { + echo "::group::$1" + set -x +} -./example_tinygo +github_actions_group_end() { + set +x + echo "::endgroup::" +} diff --git a/ci/vcpkg/arm64-osx-static-debug.cmake b/ci/vcpkg/arm64-osx-static-debug.cmake index f511819a2edd9..32ae7bc433489 100644 --- a/ci/vcpkg/arm64-osx-static-debug.cmake +++ b/ci/vcpkg/arm64-osx-static-debug.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES arm64) -set(VCPKG_OSX_DEPLOYMENT_TARGET "11.0") +set(VCPKG_OSX_DEPLOYMENT_TARGET "12.0") set(VCPKG_BUILD_TYPE debug) diff --git a/ci/vcpkg/arm64-osx-static-release.cmake b/ci/vcpkg/arm64-osx-static-release.cmake index 43d65efb2651b..dde46cd763afe 100644 --- a/ci/vcpkg/arm64-osx-static-release.cmake +++ b/ci/vcpkg/arm64-osx-static-release.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES arm64) -set(VCPKG_OSX_DEPLOYMENT_TARGET "11.0") +set(VCPKG_OSX_DEPLOYMENT_TARGET "12.0") set(VCPKG_BUILD_TYPE release) diff --git a/ci/vcpkg/universal2-osx-static-debug.cmake b/ci/vcpkg/universal2-osx-static-debug.cmake index 8abc1ebf838f1..d3ef0d67eb719 100644 --- a/ci/vcpkg/universal2-osx-static-debug.cmake +++ b/ci/vcpkg/universal2-osx-static-debug.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") -set(VCPKG_OSX_DEPLOYMENT_TARGET "10.15") +set(VCPKG_OSX_DEPLOYMENT_TARGET "12.0") set(VCPKG_BUILD_TYPE debug) diff --git a/ci/vcpkg/universal2-osx-static-release.cmake b/ci/vcpkg/universal2-osx-static-release.cmake index 2eb36c15175b2..3018aa93e5fbb 100644 --- a/ci/vcpkg/universal2-osx-static-release.cmake +++ b/ci/vcpkg/universal2-osx-static-release.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") -set(VCPKG_OSX_DEPLOYMENT_TARGET "10.15") +set(VCPKG_OSX_DEPLOYMENT_TARGET "12.0") set(VCPKG_BUILD_TYPE release) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8247043b8bf84..423744c388471 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -79,12 +79,12 @@ if(POLICY CMP0170) cmake_policy(SET CMP0170 NEW) endif() -set(ARROW_VERSION "17.0.0-SNAPSHOT") +set(ARROW_VERSION "18.0.0-SNAPSHOT") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") # if no build type is specified, default to release builds -if(NOT DEFINED CMAKE_BUILD_TYPE) +if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build.") @@ -301,7 +301,8 @@ add_custom_target(lint --cpplint_binary ${CPPLINT_BIN} ${COMMON_LINT_OPTIONS} - 
${ARROW_LINT_QUIET}) + ${ARROW_LINT_QUIET} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) # # "make format" and "make check-format" targets diff --git a/cpp/build-support/fuzzing/generate_corpuses.sh b/cpp/build-support/fuzzing/generate_corpuses.sh index e3f00e64782c1..ffd5c54e4436a 100755 --- a/cpp/build-support/fuzzing/generate_corpuses.sh +++ b/cpp/build-support/fuzzing/generate_corpuses.sh @@ -56,4 +56,5 @@ rm -rf ${CORPUS_DIR} ${OUT}/parquet-arrow-generate-fuzz-corpus ${CORPUS_DIR} # Add Parquet testing examples cp ${ARROW_CPP}/submodules/parquet-testing/data/*.parquet ${CORPUS_DIR} +cp ${ARROW_CPP}/submodules/parquet-testing/bad_data/*.parquet ${CORPUS_DIR} ${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/parquet-arrow-fuzz_seed_corpus.zip diff --git a/cpp/build-support/lint_cpp_cli.py b/cpp/build-support/lint_cpp_cli.py index a0eb8f0efe6d5..47abd53fe925d 100755 --- a/cpp/build-support/lint_cpp_cli.py +++ b/cpp/build-support/lint_cpp_cli.py @@ -31,6 +31,7 @@ _NULLPTR_REGEX = re.compile(r'.*\bnullptr\b.*') _RETURN_NOT_OK_REGEX = re.compile(r'.*\sRETURN_NOT_OK.*') _ASSIGN_OR_RAISE_REGEX = re.compile(r'.*\sASSIGN_OR_RAISE.*') +_DCHECK_REGEX = re.compile(r'.*\sDCHECK.*') def _paths(paths): @@ -54,14 +55,12 @@ def lint_file(path): (lambda x: re.match(_RETURN_NOT_OK_REGEX, x), 'Use ARROW_RETURN_NOT_OK in header files', _paths('''\ arrow/status.h - test - arrow/util/hash.h arrow/python/util''')), (lambda x: re.match(_ASSIGN_OR_RAISE_REGEX, x), - 'Use ARROW_ASSIGN_OR_RAISE in header files', _paths('''\ - arrow/result_internal.h - test - ''')) + 'Use ARROW_ASSIGN_OR_RAISE in header files', []), + (lambda x: re.match(_DCHECK_REGEX, x), + 'Use ARROW_DCHECK in header files', _paths('''\ + arrow/util/logging.h''')) ] diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh index 8e42438a23c1c..55e3fe0980749 100755 --- a/cpp/build-support/run-test.sh +++ b/cpp/build-support/run-test.sh @@ -121,12 +121,15 @@ function print_coredumps() { # patterns must be set with prefix `core.{test-executable}*`: # # In case of macOS: - # sudo sysctl -w kern.corefile=core.%N.%P + # sudo sysctl -w kern.corefile=/tmp/core.%N.%P # On Linux: - # sudo sysctl -w kernel.core_pattern=core.%e.%p + # sudo sysctl -w kernel.core_pattern=/tmp/core.%e.%p # # and the ulimit must be increased: # ulimit -c unlimited + # + # If the tests are run in a Docker container, the instructions are slightly + # different: see the 'Coredumps' comment section in `docker-compose.yml`. # filename is truncated to the first 15 characters in case of linux, so limit # the pattern for the first 15 characters @@ -134,19 +137,21 @@ function print_coredumps() { FILENAME=$(echo ${FILENAME} | cut -c-15) PATTERN="^core\.${FILENAME}" - COREFILES=$(ls | grep $PATTERN) + COREFILES=$(ls /tmp | grep $PATTERN) if [ -n "$COREFILES" ]; then - echo "Found core dump, printing backtrace:" - for COREFILE in $COREFILES; do + COREPATH="/tmp/${COREFILE}" + echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" 
+ echo "Running '${TEST_EXECUTABLE}' produced core dump at '${COREPATH}', printing backtrace:" # Print backtrace if [ "$(uname)" == "Darwin" ]; then - lldb -c "${COREFILE}" --batch --one-line "thread backtrace all -e true" + lldb -c "${COREPATH}" --batch --one-line "thread backtrace all -e true" else - gdb -c "${COREFILE}" $TEST_EXECUTABLE -ex "thread apply all bt" -ex "set pagination 0" -batch + gdb -c "${COREPATH}" $TEST_EXECUTABLE -ex "thread apply all bt" -ex "set pagination 0" -batch fi - # Remove the coredump, regenerate it via running the test case directly - rm "${COREFILE}" + echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" + # Remove the coredump, it can be regenerated via running the test case directly + rm "${COREPATH}" done fi } diff --git a/cpp/build-support/run_cpplint.py b/cpp/build-support/run_cpplint.py index 76c0fe0aefaca..a81acf2eb2ff9 100755 --- a/cpp/build-support/run_cpplint.py +++ b/cpp/build-support/run_cpplint.py @@ -26,24 +26,6 @@ from functools import partial -# NOTE(wesm): -# -# * readability/casting is disabled as it aggressively warns about functions -# with names like "int32", so "int32(x)", where int32 is a function name, -# warns with -_filters = ''' --whitespace/comments --readability/casting --readability/todo --readability/alt_tokens --build/header_guard --build/c++11 --build/include_what_you_use --runtime/references --build/include_order -'''.split() - - def _get_chunk_key(filenames): # lists are not hashable so key on the first filename in a chunk return filenames[0] @@ -87,8 +69,6 @@ def _check_some_files(completed_processes, filenames): cmd = [ arguments.cpplint_binary, '--verbose=2', - '--linelength=90', - '--filter=' + ','.join(_filters) ] if (arguments.cpplint_binary.endswith('.py') and platform.system() == 'Windows'): diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index e7523add27223..692efa78376f4 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -721,6 +721,11 @@ function(ADD_TEST_CASE REL_TEST_NAME) "${EXECUTABLE_OUTPUT_PATH};$ENV{CONDA_PREFIX}/lib") endif() + # Ensure using bundled GoogleTest when we use bundled GoogleTest. + # ARROW_GTEST_GTEST_HEADERS is defined only when we use bundled + # GoogleTest. 
+ target_link_libraries(${TEST_NAME} PRIVATE ${ARROW_GTEST_GTEST_HEADERS}) + if(ARG_STATIC_LINK_LIBS) # Customize link libraries target_link_libraries(${TEST_NAME} PRIVATE ${ARG_STATIC_LINK_LIBS}) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 41466a1c22404..755887314d110 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -303,7 +303,10 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_IPC) define_option(ARROW_AZURE - "Build Arrow with Azure support (requires the Azure SDK for C++)" OFF) + "Build Arrow with Azure support (requires the Azure SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" OFF) @@ -346,9 +349,16 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_WITH_UTF8PROC) define_option(ARROW_GCS - "Build Arrow with GCS support (requires the GCloud SDK for C++)" OFF) + "Build Arrow with GCS support (requires the GCloud SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) - define_option(ARROW_HDFS "Build the Arrow HDFS bridge" OFF) + define_option(ARROW_HDFS + "Build the Arrow HDFS bridge" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_IPC "Build the Arrow IPC extensions" ON) @@ -398,7 +408,11 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_HDFS ARROW_JSON) - define_option(ARROW_S3 "Build Arrow with S3 support (requires the AWS SDK for C++)" OFF) + define_option(ARROW_S3 + "Build Arrow with S3 support (requires the AWS SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_SKYHOOK "Build the Skyhook libraries" diff --git a/cpp/cmake_modules/FindThriftAlt.cmake b/cpp/cmake_modules/FindThriftAlt.cmake index f3e49021d5738..98a706deb9919 100644 --- a/cpp/cmake_modules/FindThriftAlt.cmake +++ b/cpp/cmake_modules/FindThriftAlt.cmake @@ -191,6 +191,10 @@ if(ThriftAlt_FOUND) # thrift/windows/config.h for Visual C++. set_target_properties(thrift::thrift PROPERTIES INTERFACE_LINK_LIBRARIES "ws2_32") endif() + # Workaround: thrift.pc doesn't have Boost dependency. 
+ if(TARGET Boost::headers) + target_link_libraries(thrift::thrift INTERFACE Boost::headers) + endif() if(Thrift_COMPILER_FOUND) add_executable(thrift::compiler IMPORTED) diff --git a/cpp/cmake_modules/Findlz4Alt.cmake b/cpp/cmake_modules/Findlz4Alt.cmake index 77a22957f7964..91e735107a954 100644 --- a/cpp/cmake_modules/Findlz4Alt.cmake +++ b/cpp/cmake_modules/Findlz4Alt.cmake @@ -29,9 +29,15 @@ endif() find_package(lz4 ${find_package_args}) if(lz4_FOUND) set(lz4Alt_FOUND TRUE) - # Conan uses lz4::lz4 not LZ4::lz4 - if(NOT TARGET LZ4::lz4 AND TARGET lz4::lz4) - add_library(LZ4::lz4 ALIAS lz4::lz4) + if(NOT TARGET LZ4::lz4) + # Conan uses lz4::lz4 not LZ4::lz4 + if(TARGET lz4::lz4) + add_library(LZ4::lz4 ALIAS lz4::lz4) + elseif(ARROW_LZ4_USE_SHARED) + add_library(LZ4::lz4 ALIAS LZ4::lz4_shared) + else() + add_library(LZ4::lz4 ALIAS LZ4::lz4_static) + endif() endif() return() endif() diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 5b89a831ff7fe..db151b4e0f44b 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -259,7 +259,7 @@ macro(resolve_dependency DEPENDENCY_NAME) IS_RUNTIME_DEPENDENCY REQUIRED_VERSION USE_CONFIG) - set(multi_value_args COMPONENTS PC_PACKAGE_NAMES) + set(multi_value_args COMPONENTS OPTIONAL_COMPONENTS PC_PACKAGE_NAMES) cmake_parse_arguments(ARG "${options}" "${one_value_args}" @@ -287,6 +287,9 @@ macro(resolve_dependency DEPENDENCY_NAME) if(ARG_COMPONENTS) list(APPEND FIND_PACKAGE_ARGUMENTS COMPONENTS ${ARG_COMPONENTS}) endif() + if(ARG_OPTIONAL_COMPONENTS) + list(APPEND FIND_PACKAGE_ARGUMENTS OPTIONAL_COMPONENTS ${ARG_OPTIONAL_COMPONENTS}) + endif() if(${DEPENDENCY_NAME}_SOURCE STREQUAL "AUTO") find_package(${FIND_PACKAGE_ARGUMENTS}) set(COMPATIBLE ${${PACKAGE_NAME}_FOUND}) @@ -1289,15 +1292,19 @@ if(ARROW_USE_BOOST) set(Boost_USE_STATIC_LIBS ON) endif() if(ARROW_BOOST_REQUIRE_LIBRARY) - set(ARROW_BOOST_COMPONENTS system filesystem) + set(ARROW_BOOST_COMPONENTS filesystem system) + set(ARROW_BOOST_OPTIONAL_COMPONENTS process) else() set(ARROW_BOOST_COMPONENTS) + set(ARROW_BOOST_OPTIONAL_COMPONENTS) endif() resolve_dependency(Boost REQUIRED_VERSION ${ARROW_BOOST_REQUIRED_VERSION} COMPONENTS ${ARROW_BOOST_COMPONENTS} + OPTIONAL_COMPONENTS + ${ARROW_BOOST_OPTIONAL_COMPONENTS} IS_RUNTIME_DEPENDENCY # libarrow.so doesn't depend on libboost*. FALSE) @@ -1316,14 +1323,35 @@ if(ARROW_USE_BOOST) endif() endforeach() - if(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - # boost/process/detail/windows/handle_workaround.hpp doesn't work - # without BOOST_USE_WINDOWS_H with MinGW because MinGW doesn't - # provide __kernel_entry without winternl.h. 
- # - # See also: - # https://github.com/boostorg/process/blob/develop/include/boost/process/detail/windows/handle_workaround.hpp - target_compile_definitions(Boost::headers INTERFACE "BOOST_USE_WINDOWS_H=1") + if(TARGET Boost::process) + # Boost >= 1.86 + target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_HAVE_V1") + target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_HAVE_V2") + else() + # Boost < 1.86 + add_library(Boost::process INTERFACE IMPORTED) + if(TARGET Boost::filesystem) + target_link_libraries(Boost::process INTERFACE Boost::filesystem) + endif() + if(TARGET Boost::system) + target_link_libraries(Boost::process INTERFACE Boost::system) + endif() + if(TARGET Boost::headers) + target_link_libraries(Boost::process INTERFACE Boost::headers) + endif() + if(Boost_VERSION VERSION_GREATER_EQUAL 1.80) + target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_HAVE_V2") + # Boost < 1.86 has a bug that + # boost::process::v2::process_environment::on_setup() isn't + # defined. We need to build Boost Process source to define it. + # + # See also: + # https://github.com/boostorg/process/issues/312 + target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_NEED_SOURCE") + if(WIN32) + target_link_libraries(Boost::process INTERFACE bcrypt ntdll) + endif() + endif() endif() message(STATUS "Boost include dir: ${Boost_INCLUDE_DIRS}") @@ -1355,15 +1383,23 @@ macro(build_snappy) "-DCMAKE_INSTALL_PREFIX=${SNAPPY_PREFIX}") # Snappy unconditionally enables -Werror when building with clang this can lead # to build failures by way of new compiler warnings. This adds a flag to disable - # Werror to the very end of the invocation to override the snappy internal setting. + # -Werror to the very end of the invocation to override the snappy internal setting. + set(SNAPPY_ADDITIONAL_CXX_FLAGS "") if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - foreach(CONFIG DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) - list(APPEND - SNAPPY_CMAKE_ARGS - "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS_${CONFIG}} -Wno-error" - ) - endforeach() + string(APPEND SNAPPY_ADDITIONAL_CXX_FLAGS " -Wno-error") endif() + # Snappy unconditionally disables RTTI, which is incompatible with some other + # build settings (https://github.com/apache/arrow/issues/43688). 
+ if(NOT MSVC) + string(APPEND SNAPPY_ADDITIONAL_CXX_FLAGS " -frtti") + endif() + + foreach(CONFIG DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + list(APPEND + SNAPPY_CMAKE_ARGS + "-DCMAKE_CXX_FLAGS_${CONFIG}=${EP_CXX_FLAGS_${CONFIG}} ${SNAPPY_ADDITIONAL_CXX_FLAGS}" + ) + endforeach() if(APPLE AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) # On macOS 10.13 we need to explicitly add to avoid a missing include error @@ -2306,6 +2342,10 @@ function(build_gtest) install(DIRECTORY "${googletest_SOURCE_DIR}/googlemock/include/" "${googletest_SOURCE_DIR}/googletest/include/" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") + add_library(arrow::GTest::gtest_headers INTERFACE IMPORTED) + target_include_directories(arrow::GTest::gtest_headers + INTERFACE "${googletest_SOURCE_DIR}/googlemock/include/" + "${googletest_SOURCE_DIR}/googletest/include/") install(TARGETS gmock gmock_main gtest gtest_main EXPORT arrow_testing_targets RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" @@ -2350,12 +2390,14 @@ if(ARROW_TESTING) string(APPEND ARROW_TESTING_PC_LIBS " $") endif() + set(ARROW_GTEST_GTEST_HEADERS) set(ARROW_GTEST_GMOCK GTest::gmock) set(ARROW_GTEST_GTEST GTest::gtest) set(ARROW_GTEST_GTEST_MAIN GTest::gtest_main) else() string(APPEND ARROW_TESTING_PC_CFLAGS " -I\${includedir}/arrow-gtest") string(APPEND ARROW_TESTING_PC_LIBS " -larrow_gtest") + set(ARROW_GTEST_GTEST_HEADERS arrow::GTest::gtest_headers) set(ARROW_GTEST_GMOCK arrow::GTest::gmock) set(ARROW_GTEST_GTEST arrow::GTest::gtest) set(ARROW_GTEST_GTEST_MAIN arrow::GTest::gtest_main) @@ -2882,6 +2924,10 @@ macro(build_absl) set(ABSL_INCLUDE_DIR "${ABSL_PREFIX}/include") set(ABSL_CMAKE_ARGS "${EP_COMMON_CMAKE_ARGS}" -DABSL_RUN_TESTS=OFF "-DCMAKE_INSTALL_PREFIX=${ABSL_PREFIX}") + if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) + set(ABSL_CXX_FLAGS "${EP_CXX_FLAGS} -include stdint.h") + list(APPEND ABSL_CMAKE_ARGS "-DCMAKE_CXX_FLAGS=${ABSL_CXX_FLAGS}") + endif() set(ABSL_BUILD_BYPRODUCTS) set(ABSL_LIBRARIES) @@ -4506,9 +4552,12 @@ function(build_orc) OFF CACHE BOOL "" FORCE) get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) + if(NOT LZ4_INCLUDE_DIR) + find_path(LZ4_INCLUDE_DIR NAMES lz4.h) + endif() get_filename_component(LZ4_ROOT "${LZ4_INCLUDE_DIR}" DIRECTORY) set(LZ4_HOME - ${LZ4_ROOT} + "${LZ4_ROOT}" CACHE STRING "" FORCE) set(LZ4_LIBRARY LZ4::lz4 @@ -4944,8 +4993,24 @@ macro(build_awssdk) set(AWSSDK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/awssdk_ep-install") set(AWSSDK_INCLUDE_DIR "${AWSSDK_PREFIX}/include") + # The AWS SDK has a few warnings around shortening lengths + set(AWS_C_FLAGS "${EP_C_FLAGS}") + set(AWS_CXX_FLAGS "${EP_CXX_FLAGS}") + if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL + "Clang") + # Negate warnings that AWS SDK cannot build under + string(APPEND AWS_C_FLAGS " -Wno-error=shorten-64-to-32") + string(APPEND AWS_CXX_FLAGS " -Wno-error=shorten-64-to-32") + endif() + if(NOT MSVC) + string(APPEND AWS_C_FLAGS " -Wno-deprecated") + string(APPEND AWS_CXX_FLAGS " -Wno-deprecated") + endif() + set(AWSSDK_COMMON_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} + -DCMAKE_C_FLAGS=${AWS_C_FLAGS} + -DCMAKE_CXX_FLAGS=${AWS_CXX_FLAGS} -DCPP_STANDARD=${CMAKE_CXX_STANDARD} -DCMAKE_INSTALL_PREFIX=${AWSSDK_PREFIX} -DCMAKE_PREFIX_PATH=${AWSSDK_PREFIX} diff --git a/cpp/cmake_modules/UseCython.cmake b/cpp/cmake_modules/UseCython.cmake index e15ac59490c6e..7d88daa4fade9 100644 --- a/cpp/cmake_modules/UseCython.cmake +++ b/cpp/cmake_modules/UseCython.cmake @@ -184,4 +184,9 @@ 
function(cython_add_module _name pyx_target_name generated_files) add_dependencies(${_name} ${pyx_target_name}) endfunction() +execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from Cython.Compiler.Version import version; print(version)" + OUTPUT_VARIABLE CYTHON_VERSION_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE) +set(CYTHON_VERSION "${CYTHON_VERSION_OUTPUT}") + include(CMakeParseArguments) diff --git a/cpp/examples/minimal_build/CMakeLists.txt b/cpp/examples/minimal_build/CMakeLists.txt index b4a7cde938c87..95dad34221add 100644 --- a/cpp/examples/minimal_build/CMakeLists.txt +++ b/cpp/examples/minimal_build/CMakeLists.txt @@ -30,7 +30,7 @@ endif() # We require a C++17 compliant compiler set(CMAKE_CXX_STANDARD_REQUIRED ON) -if(NOT DEFINED CMAKE_BUILD_TYPE) +if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 6dc8358f502f5..e77a02d0c0800 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -373,7 +373,11 @@ set(ARROW_SRCS config.cc datum.cc device.cc + device_allocation_type_set.cc extension_type.cc + extension/bool8.cc + extension/json.cc + extension/uuid.cc pretty_print.cc record_batch.cc result.cc @@ -412,6 +416,7 @@ arrow_add_object_library(ARROW_ARRAY array/concatenate.cc array/data.cc array/diff.cc + array/statistics.cc array/util.cc array/validate.cc) @@ -640,9 +645,13 @@ else() endif() set(ARROW_TESTING_SHARED_LINK_LIBS arrow_shared ${ARROW_GTEST_GTEST}) -set(ARROW_TESTING_SHARED_PRIVATE_LINK_LIBS arrow::flatbuffers RapidJSON) -set(ARROW_TESTING_STATIC_LINK_LIBS arrow::flatbuffers RapidJSON arrow_static - ${ARROW_GTEST_GTEST}) +set(ARROW_TESTING_SHARED_PRIVATE_LINK_LIBS arrow::flatbuffers RapidJSON Boost::process) +set(ARROW_TESTING_STATIC_LINK_LIBS + arrow::flatbuffers + RapidJSON + Boost::process + arrow_static + ${ARROW_GTEST_GTEST}) set(ARROW_TESTING_SHARED_INSTALL_INTERFACE_LIBS Arrow::arrow_shared) set(ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS Arrow::arrow_static) # that depend on gtest @@ -663,9 +672,10 @@ set(ARROW_TESTING_SRCS io/test_common.cc ipc/test_common.cc testing/fixed_width_test_util.cc + testing/generator.cc testing/gtest_util.cc + testing/process.cc testing/random.cc - testing/generator.cc testing/util.cc) # @@ -722,7 +732,6 @@ set(ARROW_COMPUTE_SRCS compute/ordering.cc compute/registry.cc compute/kernels/codegen_internal.cc - compute/kernels/row_encoder.cc compute/kernels/ree_util_internal.cc compute/kernels/scalar_cast_boolean.cc compute/kernels/scalar_cast_dictionary.cc @@ -741,6 +750,7 @@ set(ARROW_COMPUTE_SRCS compute/row/encode_internal.cc compute/row/compare_internal.cc compute/row/grouper.cc + compute/row/row_encoder_internal.cc compute/row/row_internal.cc compute/util.cc compute/util_internal.cc) @@ -906,6 +916,7 @@ endif() if(ARROW_JSON) arrow_add_object_library(ARROW_JSON extension/fixed_shape_tensor.cc + extension/opaque.cc json/options.cc json/chunked_builder.cc json/chunker.cc @@ -1168,6 +1179,7 @@ add_arrow_test(array_test array/array_struct_test.cc array/array_union_test.cc array/array_view_test.cc + array/statistics_test.cc PRECOMPILED_HEADERS "$<$:arrow/testing/pch.h>") @@ -1221,6 +1233,7 @@ add_subdirectory(testing) add_subdirectory(array) add_subdirectory(c) add_subdirectory(compute) +add_subdirectory(extension) add_subdirectory(io) add_subdirectory(tensor) add_subdirectory(util) @@ -1263,7 +1276,6 @@ endif() if(ARROW_JSON) add_subdirectory(json) - add_subdirectory(extension) endif() if(ARROW_ORC) diff --git 
a/cpp/src/arrow/acero/aggregate_benchmark.cc b/cpp/src/arrow/acero/aggregate_benchmark.cc index 854862e3e48ca..9c90b63904eb3 100644 --- a/cpp/src/arrow/acero/aggregate_benchmark.cc +++ b/cpp/src/arrow/acero/aggregate_benchmark.cc @@ -24,6 +24,7 @@ #include "arrow/array/array_primitive.h" #include "arrow/compute/api.h" #include "arrow/table.h" +#include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/util/benchmark_util.h" @@ -165,11 +166,11 @@ struct SumSentinelUnrolled : public Summer { static void Sum(const ArrayType& array, SumState* state) { SumState local; -#define SUM_NOT_NULL(ITEM) \ - do { \ - local.total += values[i + ITEM] * Traits::NotNull(values[i + ITEM]); \ - local.valid_count++; \ - } while (0) +# define SUM_NOT_NULL(ITEM) \ + do { \ + local.total += values[i + ITEM] * Traits::NotNull(values[i + ITEM]); \ + local.valid_count++; \ + } while (0) const auto values = array.raw_values(); const auto length = array.length(); @@ -185,7 +186,7 @@ struct SumSentinelUnrolled : public Summer { SUM_NOT_NULL(7); } -#undef SUM_NOT_NULL +# undef SUM_NOT_NULL for (int64_t i = length_rounded * 8; i < length; ++i) { local.total += values[i] * Traits::NotNull(values[i]); @@ -256,7 +257,7 @@ struct SumBitmapVectorizeUnroll : public Summer { for (int64_t i = 0; i < length_rounded; i += 8) { const uint8_t valid_byte = bitmap[i / 8]; -#define SUM_SHIFT(ITEM) (values[i + ITEM] * ((valid_byte >> ITEM) & 1)) +# define SUM_SHIFT(ITEM) (values[i + ITEM] * ((valid_byte >> ITEM) & 1)) if (valid_byte < 0xFF) { // Some nulls @@ -277,7 +278,7 @@ struct SumBitmapVectorizeUnroll : public Summer { } } -#undef SUM_SHIFT +# undef SUM_SHIFT for (int64_t i = length_rounded; i < length; ++i) { if (bit_util::GetBit(bitmap, i)) { @@ -325,7 +326,8 @@ BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapVectorizeUnroll) std::shared_ptr RecordBatchFromArrays( const std::vector>& arguments, - const std::vector>& keys) { + const std::vector>& keys, + const std::vector>& segment_keys) { std::vector> fields; std::vector> all_arrays; int64_t length = -1; @@ -347,37 +349,56 @@ std::shared_ptr RecordBatchFromArrays( fields.push_back(field("key" + ToChars(key_idx), key->type())); all_arrays.push_back(key); } + for (std::size_t segment_key_idx = 0; segment_key_idx < segment_keys.size(); + segment_key_idx++) { + const auto& segment_key = segment_keys[segment_key_idx]; + DCHECK_EQ(segment_key->length(), length); + fields.push_back( + field("segment_key" + ToChars(segment_key_idx), segment_key->type())); + all_arrays.push_back(segment_key); + } return RecordBatch::Make(schema(std::move(fields)), length, std::move(all_arrays)); } Result> BatchGroupBy( std::shared_ptr batch, std::vector aggregates, - std::vector keys, bool use_threads = false, - MemoryPool* memory_pool = default_memory_pool()) { + std::vector keys, std::vector segment_keys, + bool use_threads = false, MemoryPool* memory_pool = default_memory_pool()) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, Table::FromRecordBatches({std::move(batch)})); Declaration plan = Declaration::Sequence( {{"table_source", TableSourceNodeOptions(std::move(table))}, - {"aggregate", AggregateNodeOptions(std::move(aggregates), std::move(keys))}}); + {"aggregate", AggregateNodeOptions(std::move(aggregates), std::move(keys), + std::move(segment_keys))}}); return DeclarationToTable(std::move(plan), use_threads, memory_pool); } -static void BenchmarkGroupBy(benchmark::State& state, std::vector aggregates, - const std::vector>& arguments, - 
const std::vector>& keys) { - std::shared_ptr batch = RecordBatchFromArrays(arguments, keys); +static void BenchmarkAggregate( + benchmark::State& state, std::vector aggregates, + const std::vector>& arguments, + const std::vector>& keys, + const std::vector>& segment_keys = {}) { + std::shared_ptr batch = + RecordBatchFromArrays(arguments, keys, segment_keys); std::vector key_refs; for (std::size_t key_idx = 0; key_idx < keys.size(); key_idx++) { key_refs.emplace_back(static_cast(key_idx + arguments.size())); } + std::vector segment_key_refs; + for (std::size_t segment_key_idx = 0; segment_key_idx < segment_keys.size(); + segment_key_idx++) { + segment_key_refs.emplace_back( + static_cast(segment_key_idx + arguments.size() + keys.size())); + } for (std::size_t arg_idx = 0; arg_idx < arguments.size(); arg_idx++) { aggregates[arg_idx].target = {FieldRef(static_cast(arg_idx))}; } int64_t total_bytes = TotalBufferSize(*batch); for (auto _ : state) { - ABORT_NOT_OK(BatchGroupBy(batch, aggregates, key_refs)); + ABORT_NOT_OK(BatchGroupBy(batch, aggregates, key_refs, segment_key_refs)); } state.SetBytesProcessed(total_bytes * state.iterations()); + state.SetItemsProcessed(batch->num_rows() * state.iterations()); } #define GROUP_BY_BENCHMARK(Name, Impl) \ @@ -404,7 +425,7 @@ GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyStringSet, [&] { /*min_length=*/3, /*max_length=*/32); - BenchmarkGroupBy(state, {{"hash_sum", ""}}, {summand}, {key}); + BenchmarkAggregate(state, {{"hash_sum", ""}}, {summand}, {key}); }); GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallStringSet, [&] { @@ -419,7 +440,7 @@ GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallStringSet, [&] { /*min_length=*/3, /*max_length=*/32); - BenchmarkGroupBy(state, {{"hash_sum", ""}}, {summand}, {key}); + BenchmarkAggregate(state, {{"hash_sum", ""}}, {summand}, {key}); }); GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumStringSet, [&] { @@ -434,7 +455,7 @@ GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumStringSet, [&] { /*min_length=*/3, /*max_length=*/32); - BenchmarkGroupBy(state, {{"hash_sum", ""}}, {summand}, {key}); + BenchmarkAggregate(state, {{"hash_sum", ""}}, {summand}, {key}); }); GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyIntegerSet, [&] { @@ -448,7 +469,7 @@ GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyIntegerSet, [&] { /*min=*/0, /*max=*/15); - BenchmarkGroupBy(state, {{"hash_sum", ""}}, {summand}, {key}); + BenchmarkAggregate(state, {{"hash_sum", ""}}, {summand}, {key}); }); GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallIntegerSet, [&] { @@ -462,7 +483,7 @@ GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallIntegerSet, [&] { /*min=*/0, /*max=*/255); - BenchmarkGroupBy(state, {{"hash_sum", ""}}, {summand}, {key}); + BenchmarkAggregate(state, {{"hash_sum", ""}}, {summand}, {key}); }); GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumIntegerSet, [&] { @@ -476,7 +497,7 @@ GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumIntegerSet, [&] { /*min=*/0, /*max=*/4095); - BenchmarkGroupBy(state, {{"hash_sum", ""}}, {summand}, {key}); + BenchmarkAggregate(state, {{"hash_sum", ""}}, {summand}, {key}); }); GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyIntStringPairSet, [&] { @@ -494,7 +515,7 @@ GROUP_BY_BENCHMARK(SumDoublesGroupedByTinyIntStringPairSet, [&] { /*min_length=*/3, /*max_length=*/32); - BenchmarkGroupBy(state, {{"hash_sum", ""}}, {summand}, {int_key, str_key}); + BenchmarkAggregate(state, {{"hash_sum", ""}}, {summand}, {int_key, str_key}); }); GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallIntStringPairSet, [&] { @@ -512,7 +533,7 @@ 
GROUP_BY_BENCHMARK(SumDoublesGroupedBySmallIntStringPairSet, [&] { /*min_length=*/3, /*max_length=*/32); - BenchmarkGroupBy(state, {{"hash_sum", ""}}, {summand}, {int_key, str_key}); + BenchmarkAggregate(state, {{"hash_sum", ""}}, {summand}, {int_key, str_key}); }); GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumIntStringPairSet, [&] { @@ -530,7 +551,7 @@ GROUP_BY_BENCHMARK(SumDoublesGroupedByMediumIntStringPairSet, [&] { /*min_length=*/3, /*max_length=*/32); - BenchmarkGroupBy(state, {{"hash_sum", ""}}, {summand}, {int_key, str_key}); + BenchmarkAggregate(state, {{"hash_sum", ""}}, {summand}, {int_key, str_key}); }); // Grouped MinMax @@ -543,7 +564,7 @@ GROUP_BY_BENCHMARK(MinMaxDoublesGroupedByMediumInt, [&] { /*nan_probability=*/args.null_proportion / 10); auto int_key = rng.Int64(args.size, /*min=*/0, /*max=*/63); - BenchmarkGroupBy(state, {{"hash_min_max", ""}}, {input}, {int_key}); + BenchmarkAggregate(state, {{"hash_min_max", ""}}, {input}, {int_key}); }); GROUP_BY_BENCHMARK(MinMaxShortStringsGroupedByMediumInt, [&] { @@ -553,7 +574,7 @@ GROUP_BY_BENCHMARK(MinMaxShortStringsGroupedByMediumInt, [&] { /*null_probability=*/args.null_proportion); auto int_key = rng.Int64(args.size, /*min=*/0, /*max=*/63); - BenchmarkGroupBy(state, {{"hash_min_max", ""}}, {input}, {int_key}); + BenchmarkAggregate(state, {{"hash_min_max", ""}}, {input}, {int_key}); }); GROUP_BY_BENCHMARK(MinMaxLongStringsGroupedByMediumInt, [&] { @@ -563,7 +584,7 @@ GROUP_BY_BENCHMARK(MinMaxLongStringsGroupedByMediumInt, [&] { /*null_probability=*/args.null_proportion); auto int_key = rng.Int64(args.size, /*min=*/0, /*max=*/63); - BenchmarkGroupBy(state, {{"hash_min_max", ""}}, {input}, {int_key}); + BenchmarkAggregate(state, {{"hash_min_max", ""}}, {input}, {int_key}); }); // @@ -866,5 +887,61 @@ BENCHMARK(TDigestKernelDoubleMedian)->Apply(QuantileKernelArgs); BENCHMARK(TDigestKernelDoubleDeciles)->Apply(QuantileKernelArgs); BENCHMARK(TDigestKernelDoubleCentiles)->Apply(QuantileKernelArgs); +// +// Segmented Aggregate +// + +static void BenchmarkSegmentedAggregate( + benchmark::State& state, int64_t num_rows, std::vector aggregates, + const std::vector>& arguments, + const std::vector>& keys, int64_t num_segment_keys, + int64_t num_segments) { + ASSERT_GT(num_segments, 0); + + auto rng = random::RandomArrayGenerator(42); + auto segment_key = rng.Int64(num_rows, /*min=*/0, /*max=*/num_segments - 1); + int64_t* values = segment_key->data()->GetMutableValues(1); + std::sort(values, values + num_rows); + // num_segment_keys copies of the segment key. + ArrayVector segment_keys(num_segment_keys, segment_key); + + BenchmarkAggregate(state, std::move(aggregates), arguments, keys, segment_keys); +} + +template +static void CountScalarSegmentedByInts(benchmark::State& state, Args&&...) { + constexpr int64_t num_rows = 32 * 1024; + + // A trivial column to count from. + auto arg = ConstantArrayGenerator::Zeroes(num_rows, int32()); + + BenchmarkSegmentedAggregate(state, num_rows, {{"count", ""}}, {arg}, /*keys=*/{}, + state.range(0), state.range(1)); +} +BENCHMARK(CountScalarSegmentedByInts) + ->ArgNames({"SegmentKeys", "Segments"}) + ->ArgsProduct({{0, 1, 2}, benchmark::CreateRange(1, 256, 8)}); + +template +static void CountGroupByIntsSegmentedByInts(benchmark::State& state, Args&&...) { + constexpr int64_t num_rows = 32 * 1024; + + // A trivial column to count from. 
+ auto arg = ConstantArrayGenerator::Zeroes(num_rows, int32()); + + auto rng = random::RandomArrayGenerator(42); + int64_t num_keys = state.range(0); + ArrayVector keys(num_keys); + for (auto& key : keys) { + key = rng.Int64(num_rows, /*min=*/0, /*max=*/64); + } + + BenchmarkSegmentedAggregate(state, num_rows, {{"hash_count", ""}}, {arg}, keys, + state.range(1), state.range(2)); +} +BENCHMARK(CountGroupByIntsSegmentedByInts) + ->ArgNames({"Keys", "SegmentKeys", "Segments"}) + ->ArgsProduct({{1, 2}, {0, 1, 2}, benchmark::CreateRange(1, 256, 8)}); + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/aggregate_internal.h b/cpp/src/arrow/acero/aggregate_internal.h index 5730d99f93f88..7cdc424cbb76b 100644 --- a/cpp/src/arrow/acero/aggregate_internal.h +++ b/cpp/src/arrow/acero/aggregate_internal.h @@ -131,17 +131,14 @@ void AggregatesToString(std::stringstream* ss, const Schema& input_schema, template Status HandleSegments(RowSegmenter* segmenter, const ExecBatch& batch, const std::vector& ids, const BatchHandler& handle_batch) { - int64_t offset = 0; ARROW_ASSIGN_OR_RAISE(auto segment_exec_batch, batch.SelectValues(ids)); ExecSpan segment_batch(segment_exec_batch); - while (true) { - ARROW_ASSIGN_OR_RAISE(compute::Segment segment, - segmenter->GetNextSegment(segment_batch, offset)); - if (segment.offset >= segment_batch.length) break; // condition of no-next-segment + ARROW_ASSIGN_OR_RAISE(auto segments, segmenter->GetSegments(segment_batch)); + for (const auto& segment : segments) { ARROW_RETURN_NOT_OK(handle_batch(batch, segment)); - offset = segment.offset + segment.length; } + return Status::OK(); } diff --git a/cpp/src/arrow/acero/aggregate_node_test.cc b/cpp/src/arrow/acero/aggregate_node_test.cc index d398fb24b73d5..c623271db9fb4 100644 --- a/cpp/src/arrow/acero/aggregate_node_test.cc +++ b/cpp/src/arrow/acero/aggregate_node_test.cc @@ -210,5 +210,57 @@ TEST(GroupByNode, NoSkipNulls) { AssertExecBatchesEqualIgnoringOrder(out_schema, {expected_batch}, out_batches.batches); } +TEST(ScalarAggregateNode, AnyAll) { + // GH-43768: boolean_any and boolean_all with constant input should work well + // when min_count != 0. 
+ std::shared_ptr in_schema = schema({field("not_used", int32())}); + std::shared_ptr out_schema = schema({field("agg_out", boolean())}); + struct AnyAllCase { + std::string batches_json; + Expression literal; + std::string expected_json; + bool skip_nulls = false; + uint32_t min_count = 2; + }; + std::vector cases{ + {"[[42], [42], [42], [42]]", literal(true), "[[true]]"}, + {"[[42], [42], [42], [42]]", literal(false), "[[false]]"}, + {"[[42], [42], [42], [42]]", literal(BooleanScalar{}), "[[null]]"}, + {"[[42]]", literal(true), "[[null]]"}, + {"[[42], [42], [42]]", literal(true), "[[true]]"}, + {"[[42], [42], [42]]", literal(true), "[[null]]", /*skip_nulls=*/false, + /*min_count=*/4}, + {"[[42], [42], [42], [42]]", literal(BooleanScalar{}), "[[null]]", + /*skip_nulls=*/true}, + }; + for (const AnyAllCase& any_all_case : cases) { + for (auto func_name : {"any", "all"}) { + std::vector batches{ + ExecBatchFromJSON({int32()}, any_all_case.batches_json)}; + std::vector aggregates = { + Aggregate(func_name, + std::make_shared( + /*skip_nulls=*/any_all_case.skip_nulls, + /*min_count=*/any_all_case.min_count), + FieldRef("literal"))}; + + // And a projection to make the input including a Scalar Boolean + Declaration plan = Declaration::Sequence( + {{"exec_batch_source", ExecBatchSourceNodeOptions(in_schema, batches)}, + {"project", ProjectNodeOptions({any_all_case.literal}, {"literal"})}, + {"aggregate", AggregateNodeOptions(aggregates)}}); + + ASSERT_OK_AND_ASSIGN(BatchesWithCommonSchema out_batches, + DeclarationToExecBatches(plan)); + + ExecBatch expected_batch = + ExecBatchFromJSON({boolean()}, any_all_case.expected_json); + + AssertExecBatchesEqualIgnoringOrder(out_schema, {expected_batch}, + out_batches.batches); + } + } +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc index 848cbdf7506ad..c4f11d01f3d5c 100644 --- a/cpp/src/arrow/acero/asof_join_node.cc +++ b/cpp/src/arrow/acero/asof_join_node.cc @@ -32,9 +32,9 @@ #include "arrow/acero/exec_plan.h" #include "arrow/acero/options.h" -#include "arrow/acero/unmaterialized_table.h" +#include "arrow/acero/unmaterialized_table_internal.h" #ifndef NDEBUG -#include "arrow/acero/options_internal.h" +# include "arrow/acero/options_internal.h" #endif #include "arrow/acero/query_context.h" #include "arrow/acero/schema_util.h" @@ -42,7 +42,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" #ifndef NDEBUG -#include "arrow/compute/function_internal.h" +# include "arrow/compute/function_internal.h" #endif #include "arrow/acero/time_series_util.h" #include "arrow/compute/key_hash_internal.h" @@ -207,16 +207,16 @@ class DebugSync { std::unique_lock debug_lock_; }; -#define DEBUG_SYNC(node, ...) DebugSync(node).insert(__VA_ARGS__) -#define DEBUG_MANIP(manip) \ - DebugSync::Manip([](DebugSync& d) -> DebugSync& { return d << manip; }) -#define NDEBUG_EXPLICIT -#define DEBUG_ADD(ndebug, ...) ndebug, __VA_ARGS__ +# define DEBUG_SYNC(node, ...) DebugSync(node).insert(__VA_ARGS__) +# define DEBUG_MANIP(manip) \ + DebugSync::Manip([](DebugSync& d) -> DebugSync& { return d << manip; }) +# define NDEBUG_EXPLICIT +# define DEBUG_ADD(ndebug, ...) ndebug, __VA_ARGS__ #else -#define DEBUG_SYNC(...) -#define DEBUG_MANIP(...) -#define NDEBUG_EXPLICIT explicit -#define DEBUG_ADD(ndebug, ...) ndebug +# define DEBUG_SYNC(...) +# define DEBUG_MANIP(...) +# define NDEBUG_EXPLICIT explicit +# define DEBUG_ADD(ndebug, ...) 
ndebug #endif struct MemoStore { diff --git a/cpp/src/arrow/acero/asof_join_node_test.cc b/cpp/src/arrow/acero/asof_join_node_test.cc index 051e280a4c53c..5d3e9fba08bbf 100644 --- a/cpp/src/arrow/acero/asof_join_node_test.cc +++ b/cpp/src/arrow/acero/asof_join_node_test.cc @@ -26,13 +26,13 @@ #include "arrow/acero/exec_plan.h" #include "arrow/testing/future_util.h" #ifndef NDEBUG -#include +# include #endif #include #include "arrow/acero/options.h" #ifndef NDEBUG -#include "arrow/acero/options_internal.h" +# include "arrow/acero/options_internal.h" #endif #include "arrow/acero/map_node.h" #include "arrow/acero/query_context.h" @@ -41,8 +41,8 @@ #include "arrow/acero/util.h" #include "arrow/api.h" #include "arrow/compute/api_scalar.h" -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" diff --git a/cpp/src/arrow/acero/bloom_filter.h b/cpp/src/arrow/acero/bloom_filter.h index 50d07bfd948e0..530beaea64827 100644 --- a/cpp/src/arrow/acero/bloom_filter.h +++ b/cpp/src/arrow/acero/bloom_filter.h @@ -18,7 +18,7 @@ #pragma once #if defined(ARROW_HAVE_RUNTIME_AVX2) -#include +# include #endif #include diff --git a/cpp/src/arrow/acero/bloom_filter_test.cc b/cpp/src/arrow/acero/bloom_filter_test.cc index a2d6e9575a1aa..30cafd120caea 100644 --- a/cpp/src/arrow/acero/bloom_filter_test.cc +++ b/cpp/src/arrow/acero/bloom_filter_test.cc @@ -503,9 +503,9 @@ TEST(BloomFilter, Scaling) { num_build.push_back(4000000); std::vector strategies; -#ifdef ARROW_ENABLE_THREADING +# ifdef ARROW_ENABLE_THREADING strategies.push_back(BloomFilterBuildStrategy::PARALLEL); -#endif +# endif strategies.push_back(BloomFilterBuildStrategy::SINGLE_THREADED); for (const auto hardware_flags : HardwareFlagsForTesting()) { diff --git a/cpp/src/arrow/acero/groupby_aggregate_node.cc b/cpp/src/arrow/acero/groupby_aggregate_node.cc index 723c8b7377e13..06b034ab2d459 100644 --- a/cpp/src/arrow/acero/groupby_aggregate_node.cc +++ b/cpp/src/arrow/acero/groupby_aggregate_node.cc @@ -369,13 +369,14 @@ Status GroupByNode::InputReceived(ExecNode* input, ExecBatch batch) { DCHECK_EQ(input, inputs_[0]); auto handler = [this](const ExecBatch& full_batch, const Segment& segment) { - if (!segment.extends && segment.offset == 0) RETURN_NOT_OK(OutputResult(false)); + if (!segment.extends && segment.offset == 0) + RETURN_NOT_OK(OutputResult(/*is_last=*/false)); auto exec_batch = full_batch.Slice(segment.offset, segment.length); auto batch = ExecSpan(exec_batch); RETURN_NOT_OK(Consume(batch)); RETURN_NOT_OK( ExtractSegmenterValues(&segmenter_values_, exec_batch, segment_key_field_ids_)); - if (!segment.is_open) RETURN_NOT_OK(OutputResult(false)); + if (!segment.is_open) RETURN_NOT_OK(OutputResult(/*is_last=*/false)); return Status::OK(); }; ARROW_RETURN_NOT_OK( diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index 743cb20d1960d..1e2975afc91b3 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -585,19 +585,12 @@ void TestGroupClassSupportedKeys( void TestSegments(std::unique_ptr& segmenter, const ExecSpan& batch, std::vector expected_segments) { - int64_t offset = 0, segment_num = 0; - for (auto expected_segment : expected_segments) { - SCOPED_TRACE("segment #" + ToChars(segment_num++)); - ASSERT_OK_AND_ASSIGN(auto segment, 
segmenter->GetNextSegment(batch, offset)); - ASSERT_EQ(expected_segment, segment); - offset = segment.offset + segment.length; + ASSERT_OK_AND_ASSIGN(auto actual_segments, segmenter->GetSegments(batch)); + ASSERT_EQ(actual_segments.size(), expected_segments.size()); + for (size_t i = 0; i < actual_segments.size(); ++i) { + SCOPED_TRACE("segment #" + ToChars(i)); + ASSERT_EQ(actual_segments[i], expected_segments[i]); } - // Assert next is the last (empty) segment. - ASSERT_OK_AND_ASSIGN(auto segment, segmenter->GetNextSegment(batch, offset)); - ASSERT_GE(segment.offset, batch.length); - ASSERT_EQ(segment.length, 0); - ASSERT_TRUE(segment.is_open); - ASSERT_TRUE(segment.extends); } Result> MakeGrouper(const std::vector& key_types) { @@ -629,91 +622,68 @@ TEST(RowSegmenter, Basics) { auto batch2 = ExecBatchFromJSON(types2, "[[1, 1], [1, 2], [2, 2]]"); auto batch1 = ExecBatchFromJSON(types1, "[[1], [1], [2]]"); ExecBatch batch0({}, 3); - { - SCOPED_TRACE("offset"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types0)); - ExecSpan span0(batch0); - for (int64_t offset : {-1, 4}) { - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, - HasSubstr("invalid grouping segmenter offset"), - segmenter->GetNextSegment(span0, offset)); - } - } { SCOPED_TRACE("types0 segmenting of batch2"); ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types0)); ExecSpan span2(batch2); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("expected batch size 0 "), - segmenter->GetNextSegment(span2, 0)); + segmenter->GetSegments(span2)); ExecSpan span0(batch0); - TestSegments(segmenter, span0, {{0, 3, true, true}, {3, 0, true, true}}); + TestSegments(segmenter, span0, {{0, 3, true, true}}); } { SCOPED_TRACE("bad_types1 segmenting of batch1"); ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(bad_types1)); ExecSpan span1(batch1); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("expected batch value 0 of type "), - segmenter->GetNextSegment(span1, 0)); + segmenter->GetSegments(span1)); } { SCOPED_TRACE("types1 segmenting of batch2"); ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types1)); ExecSpan span2(batch2); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("expected batch size 1 "), - segmenter->GetNextSegment(span2, 0)); + segmenter->GetSegments(span2)); ExecSpan span1(batch1); - TestSegments(segmenter, span1, - {{0, 2, false, true}, {2, 1, true, false}, {3, 0, true, true}}); + TestSegments(segmenter, span1, {{0, 2, false, true}, {2, 1, true, false}}); } { SCOPED_TRACE("bad_types2 segmenting of batch2"); ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(bad_types2)); ExecSpan span2(batch2); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("expected batch value 1 of type "), - segmenter->GetNextSegment(span2, 0)); + segmenter->GetSegments(span2)); } { SCOPED_TRACE("types2 segmenting of batch1"); ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types2)); ExecSpan span1(batch1); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("expected batch size 2 "), - segmenter->GetNextSegment(span1, 0)); + segmenter->GetSegments(span1)); ExecSpan span2(batch2); TestSegments(segmenter, span2, - {{0, 1, false, true}, - {1, 1, false, false}, - {2, 1, true, false}, - {3, 0, true, true}}); + {{0, 1, false, true}, {1, 1, false, false}, {2, 1, true, false}}); } } TEST(RowSegmenter, NonOrdered) { - { - std::vector types = {int32()}; - auto batch = ExecBatchFromJSON(types, "[[1], [1], [2], [1], [2]]"); + for (int num_keys = 1; num_keys <= 2; ++num_keys) { + SCOPED_TRACE("non-ordered " + ToChars(num_keys) + " 
int32(s)"); + std::vector types(num_keys, int32()); + std::vector values(num_keys, ArrayFromJSON(int32(), "[1, 1, 2, 1, 2]")); + ExecBatch batch(std::move(values), 5); ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); TestSegments(segmenter, ExecSpan(batch), {{0, 2, false, true}, {2, 1, false, false}, {3, 1, false, false}, - {4, 1, true, false}, - {5, 0, true, true}}); - } - { - std::vector types = {int32(), int32()}; - auto batch = ExecBatchFromJSON(types, "[[1, 1], [1, 1], [2, 2], [1, 2], [2, 2]]"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batch), - {{0, 2, false, true}, - {2, 1, false, false}, - {3, 1, false, false}, - {4, 1, true, false}, - {5, 0, true, true}}); + {4, 1, true, false}}); } } TEST(RowSegmenter, EmptyBatches) { { + SCOPED_TRACE("empty batches {int32}"); std::vector types = {int32()}; std::vector batches = { ExecBatchFromJSON(types, "[]"), ExecBatchFromJSON(types, "[]"), @@ -732,6 +702,7 @@ TEST(RowSegmenter, EmptyBatches) { TestSegments(segmenter, ExecSpan(batches[7]), {}); } { + SCOPED_TRACE("empty batches {int32, int32}"); std::vector types = {int32(), int32()}; std::vector batches = { ExecBatchFromJSON(types, "[]"), @@ -756,25 +727,12 @@ TEST(RowSegmenter, EmptyBatches) { } TEST(RowSegmenter, MultipleSegments) { - { - std::vector types = {int32()}; - auto batch = - ExecBatchFromJSON(types, "[[1], [1], [2], [5], [3], [3], [5], [5], [4]]"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batch), - {{0, 2, false, true}, - {2, 1, false, false}, - {3, 1, false, false}, - {4, 2, false, false}, - {6, 2, false, false}, - {8, 1, true, false}, - {9, 0, true, true}}); - } - { - std::vector types = {int32(), int32()}; - auto batch = ExecBatchFromJSON( - types, - "[[1, 1], [1, 1], [2, 2], [5, 5], [3, 3], [3, 3], [5, 5], [5, 5], [4, 4]]"); + auto test_with_keys = [](int num_keys, const std::shared_ptr& key) { + SCOPED_TRACE("multiple segments " + ToChars(num_keys) + " " + + key->type()->ToString()); + std::vector types(num_keys, key->type()); + std::vector values(num_keys, key); + ExecBatch batch(std::move(values), key->length()); ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); TestSegments(segmenter, ExecSpan(batch), {{0, 2, false, true}, @@ -782,13 +740,23 @@ TEST(RowSegmenter, MultipleSegments) { {3, 1, false, false}, {4, 2, false, false}, {6, 2, false, false}, - {8, 1, true, false}, - {9, 0, true, true}}); + {8, 1, true, false}}); + }; + for (int num_keys = 1; num_keys <= 2; ++num_keys) { + test_with_keys(num_keys, ArrayFromJSON(int32(), "[1, 1, 2, 5, 3, 3, 5, 5, 4]")); + test_with_keys( + num_keys, + ArrayFromJSON(fixed_size_binary(2), + R"(["aa", "aa", "bb", "ee", "cc", "cc", "ee", "ee", "dd"])")); + test_with_keys(num_keys, DictArrayFromJSON(dictionary(int8(), utf8()), + "[0, 0, 1, 4, 2, 2, 4, 4, 3]", + R"(["a", "b", "c", "d", "e"])")); } } TEST(RowSegmenter, MultipleSegmentsMultipleBatches) { { + SCOPED_TRACE("multiple segments multiple batches {int32}"); std::vector types = {int32()}; std::vector batches = { ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[[1], [2]]"), @@ -806,6 +774,7 @@ TEST(RowSegmenter, MultipleSegmentsMultipleBatches) { TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, false}}); } { + SCOPED_TRACE("multiple segments multiple batches {int32, int32}"); std::vector types = {int32(), int32()}; std::vector batches = { ExecBatchFromJSON(types, "[[1, 1]]"), @@ -829,74 +798,217 @@ TEST(RowSegmenter, 
MultipleSegmentsMultipleBatches) { namespace { void TestRowSegmenterConstantBatch( - std::function shape_func, + const std::shared_ptr& type, + std::function shape_func, + std::function>(int64_t key)> value_func, std::function>(const std::vector&)> make_segmenter) { - constexpr size_t n = 3, repetitions = 3; - std::vector types = {int32(), int32(), int32()}; - std::vector shapes(n); - for (size_t i = 0; i < n; i++) shapes[i] = shape_func(i); - auto full_batch = ExecBatchFromJSON(types, shapes, "[[1, 1, 1], [1, 1, 1], [1, 1, 1]]"); - auto test_by_size = [&](size_t size) -> Status { - SCOPED_TRACE("constant-batch with " + ToChars(size) + " key(s)"); - std::vector values(full_batch.values.begin(), - full_batch.values.begin() + size); - ExecBatch batch(values, full_batch.length); - std::vector key_types(types.begin(), types.begin() + size); + constexpr int64_t n_keys = 3, n_rows = 3, repetitions = 3; + std::vector types(n_keys, type); + std::vector full_values(n_keys); + for (int64_t i = 0; i < n_keys; i++) { + auto shape = shape_func(i); + ASSERT_OK_AND_ASSIGN(auto scalar, value_func(i)); + if (shape == ArgShape::SCALAR) { + full_values[i] = std::move(scalar); + } else { + ASSERT_OK_AND_ASSIGN(full_values[i], MakeArrayFromScalar(*scalar, n_rows)); + } + } + auto test_with_keys = [&](int64_t keys) -> Status { + SCOPED_TRACE("constant-batch with " + ToChars(keys) + " key(s)"); + std::vector values(full_values.begin(), full_values.begin() + keys); + ExecBatch batch(values, n_rows); + std::vector key_types(types.begin(), types.begin() + keys); ARROW_ASSIGN_OR_RAISE(auto segmenter, make_segmenter(key_types)); - for (size_t i = 0; i < repetitions; i++) { - TestSegments(segmenter, ExecSpan(batch), {{0, 3, true, true}, {3, 0, true, true}}); + for (int64_t i = 0; i < repetitions; i++) { + TestSegments(segmenter, ExecSpan(batch), {{0, n_rows, true, true}}); ARROW_RETURN_NOT_OK(segmenter->Reset()); } return Status::OK(); }; - for (size_t i = 0; i <= 3; i++) { - ASSERT_OK(test_by_size(i)); + for (int64_t i = 0; i <= n_keys; i++) { + ASSERT_OK(test_with_keys(i)); } } } // namespace TEST(RowSegmenter, ConstantArrayBatch) { - TestRowSegmenterConstantBatch([](size_t i) { return ArgShape::ARRAY; }, - MakeRowSegmenter); + TestRowSegmenterConstantBatch( + int32(), [](int64_t key) { return ArgShape::ARRAY; }, + [](int64_t key) { return MakeScalar(1); }, MakeRowSegmenter); } TEST(RowSegmenter, ConstantScalarBatch) { - TestRowSegmenterConstantBatch([](size_t i) { return ArgShape::SCALAR; }, - MakeRowSegmenter); + TestRowSegmenterConstantBatch( + int32(), [](int64_t key) { return ArgShape::SCALAR; }, + [](int64_t key) { return MakeScalar(1); }, MakeRowSegmenter); } TEST(RowSegmenter, ConstantMixedBatch) { TestRowSegmenterConstantBatch( - [](size_t i) { return i % 2 == 0 ? ArgShape::SCALAR : ArgShape::ARRAY; }, - MakeRowSegmenter); + int32(), + [](int64_t key) { return key % 2 == 0 ? 
ArgShape::SCALAR : ArgShape::ARRAY; }, + [](int64_t key) { return MakeScalar(1); }, MakeRowSegmenter); } TEST(RowSegmenter, ConstantArrayBatchWithAnyKeysSegmenter) { - TestRowSegmenterConstantBatch([](size_t i) { return ArgShape::ARRAY; }, - MakeGenericSegmenter); + TestRowSegmenterConstantBatch( + int32(), [](int64_t key) { return ArgShape::ARRAY; }, + [](int64_t key) { return MakeScalar(1); }, MakeGenericSegmenter); } TEST(RowSegmenter, ConstantScalarBatchWithAnyKeysSegmenter) { - TestRowSegmenterConstantBatch([](size_t i) { return ArgShape::SCALAR; }, - MakeGenericSegmenter); + TestRowSegmenterConstantBatch( + int32(), [](int64_t key) { return ArgShape::SCALAR; }, + [](int64_t key) { return MakeScalar(1); }, MakeGenericSegmenter); } TEST(RowSegmenter, ConstantMixedBatchWithAnyKeysSegmenter) { TestRowSegmenterConstantBatch( - [](size_t i) { return i % 2 == 0 ? ArgShape::SCALAR : ArgShape::ARRAY; }, - MakeGenericSegmenter); + int32(), + [](int64_t key) { return key % 2 == 0 ? ArgShape::SCALAR : ArgShape::ARRAY; }, + [](int64_t key) { return MakeScalar(1); }, MakeGenericSegmenter); +} + +TEST(RowSegmenter, ConstantFixedSizeBinaryArrayBatch) { + constexpr int fsb = 8; + auto type = fixed_size_binary(fsb); + ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X'))); + TestRowSegmenterConstantBatch( + type, [](int64_t key) { return ArgShape::ARRAY; }, + [&](int64_t key) { return value; }, MakeRowSegmenter); +} + +TEST(RowSegmenter, ConstantFixedSizeBinaryScalarBatch) { + constexpr int fsb = 8; + auto type = fixed_size_binary(fsb); + ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X'))); + TestRowSegmenterConstantBatch( + fixed_size_binary(8), [](int64_t key) { return ArgShape::SCALAR; }, + [&](int64_t key) { return value; }, MakeRowSegmenter); +} + +TEST(RowSegmenter, ConstantFixedSizeBinaryMixedBatch) { + constexpr int fsb = 8; + auto type = fixed_size_binary(fsb); + ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X'))); + TestRowSegmenterConstantBatch( + fixed_size_binary(8), + [](int64_t key) { return key % 2 == 0 ? ArgShape::SCALAR : ArgShape::ARRAY; }, + [&](int64_t key) { return value; }, MakeRowSegmenter); +} + +TEST(RowSegmenter, ConstantFixedSizeBinaryArrayBatchWithAnyKeysSegmenter) { + constexpr int fsb = 8; + auto type = fixed_size_binary(fsb); + ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X'))); + TestRowSegmenterConstantBatch( + type, [](int64_t key) { return ArgShape::ARRAY; }, + [&](int64_t key) { return value; }, MakeGenericSegmenter); +} + +TEST(RowSegmenter, ConstantFixedSizeBinaryScalarBatchWithAnyKeysSegmenter) { + constexpr int fsb = 8; + auto type = fixed_size_binary(fsb); + ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X'))); + TestRowSegmenterConstantBatch( + fixed_size_binary(8), [](int64_t key) { return ArgShape::SCALAR; }, + [&](int64_t key) { return value; }, MakeGenericSegmenter); +} + +TEST(RowSegmenter, ConstantFixedSizeBinaryMixedBatchWithAnyKeysSegmenter) { + constexpr int fsb = 8; + auto type = fixed_size_binary(fsb); + ASSERT_OK_AND_ASSIGN(auto value, MakeScalar(type, std::string(fsb, 'X'))); + TestRowSegmenterConstantBatch( + fixed_size_binary(8), + [](int64_t key) { return key % 2 == 0 ? 
ArgShape::SCALAR : ArgShape::ARRAY; }, + [&](int64_t key) { return value; }, MakeGenericSegmenter); +} + +TEST(RowSegmenter, ConstantDictionaryArrayBatch) { + auto index_type = int32(); + auto value_type = utf8(); + auto dict_type = dictionary(index_type, value_type); + auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])"); + ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0)); + auto dict_value = DictionaryScalar::Make(std::move(index_value), dict); + TestRowSegmenterConstantBatch( + dict_type, [](int64_t key) { return ArgShape::ARRAY; }, + [&](int64_t key) { return dict_value; }, MakeRowSegmenter); +} + +TEST(RowSegmenter, ConstantDictionaryScalarBatch) { + auto index_type = int32(); + auto value_type = utf8(); + auto dict_type = dictionary(index_type, value_type); + auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])"); + ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0)); + auto dict_value = DictionaryScalar::Make(std::move(index_value), dict); + TestRowSegmenterConstantBatch( + dict_type, [](int64_t key) { return ArgShape::SCALAR; }, + [&](int64_t key) { return dict_value; }, MakeRowSegmenter); +} + +TEST(RowSegmenter, ConstantDictionaryMixedBatch) { + auto index_type = int32(); + auto value_type = utf8(); + auto dict_type = dictionary(index_type, value_type); + auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])"); + ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0)); + auto dict_value = DictionaryScalar::Make(std::move(index_value), dict); + TestRowSegmenterConstantBatch( + dict_type, + [](int64_t key) { return key % 2 == 0 ? ArgShape::SCALAR : ArgShape::ARRAY; }, + [&](int64_t key) { return dict_value; }, MakeRowSegmenter); +} + +TEST(RowSegmenter, ConstantDictionaryArrayBatchWithAnyKeysSegmenter) { + auto index_type = int32(); + auto value_type = utf8(); + auto dict_type = dictionary(index_type, value_type); + auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])"); + ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0)); + auto dict_value = DictionaryScalar::Make(std::move(index_value), dict); + TestRowSegmenterConstantBatch( + dict_type, [](int64_t key) { return ArgShape::ARRAY; }, + [&](int64_t key) { return dict_value; }, MakeGenericSegmenter); +} + +TEST(RowSegmenter, ConstantDictionaryScalarBatchWithAnyKeysSegmenter) { + auto index_type = int32(); + auto value_type = utf8(); + auto dict_type = dictionary(index_type, value_type); + auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])"); + ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0)); + auto dict_value = DictionaryScalar::Make(std::move(index_value), dict); + TestRowSegmenterConstantBatch( + dict_type, [](int64_t key) { return ArgShape::SCALAR; }, + [&](int64_t key) { return dict_value; }, MakeGenericSegmenter); +} + +TEST(RowSegmenter, ConstantDictionaryMixedBatchWithAnyKeysSegmenter) { + auto index_type = int32(); + auto value_type = utf8(); + auto dict_type = dictionary(index_type, value_type); + auto dict = ArrayFromJSON(value_type, R"(["alpha", null, "gamma"])"); + ASSERT_OK_AND_ASSIGN(auto index_value, MakeScalar(index_type, 0)); + auto dict_value = DictionaryScalar::Make(std::move(index_value), dict); + TestRowSegmenterConstantBatch( + dict_type, + [](int64_t key) { return key % 2 == 0 ? 
ArgShape::SCALAR : ArgShape::ARRAY; }, + [&](int64_t key) { return dict_value; }, MakeGenericSegmenter); } TEST(RowSegmenter, RowConstantBatch) { constexpr size_t n = 3; std::vector types = {int32(), int32(), int32()}; auto full_batch = ExecBatchFromJSON(types, "[[1, 1, 1], [2, 2, 2], [3, 3, 3]]"); - std::vector expected_segments_for_size_0 = {{0, 3, true, true}, - {3, 0, true, true}}; + std::vector expected_segments_for_size_0 = {{0, 3, true, true}}; std::vector expected_segments = { - {0, 1, false, true}, {1, 1, false, false}, {2, 1, true, false}, {3, 0, true, true}}; + {0, 1, false, true}, {1, 1, false, false}, {2, 1, true, false}}; auto test_by_size = [&](size_t size) -> Status { SCOPED_TRACE("constant-batch with " + ToChars(size) + " key(s)"); std::vector values(full_batch.values.begin(), diff --git a/cpp/src/arrow/acero/hash_join.cc b/cpp/src/arrow/acero/hash_join.cc index 5aa70a23f7c9e..ddcd2a0995701 100644 --- a/cpp/src/arrow/acero/hash_join.cc +++ b/cpp/src/arrow/acero/hash_join.cc @@ -27,8 +27,8 @@ #include "arrow/acero/hash_join_dict.h" #include "arrow/acero/task_util.h" -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/row/encode_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/util/tracing_internal.h" namespace arrow { diff --git a/cpp/src/arrow/acero/hash_join_benchmark.cc b/cpp/src/arrow/acero/hash_join_benchmark.cc index 1f8e02e9f0fcf..e3e37e249e6a3 100644 --- a/cpp/src/arrow/acero/hash_join_benchmark.cc +++ b/cpp/src/arrow/acero/hash_join_benchmark.cc @@ -23,7 +23,7 @@ #include "arrow/acero/test_util_internal.h" #include "arrow/acero/util.h" #include "arrow/api.h" -#include "arrow/compute/kernels/row_encoder_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/testing/random.h" #include "arrow/util/thread_pool.h" @@ -104,7 +104,7 @@ class JoinBenchmark { key_cmp.push_back(JoinKeyCmp::EQ); } - for (size_t i = 0; i < settings.build_payload_types.size(); i++) { + for (size_t i = 0; i < settings.probe_payload_types.size(); i++) { std::string name = "lp" + std::to_string(i); DCHECK_OK(l_schema_builder.AddField(field(name, settings.probe_payload_types[i]))); } @@ -279,7 +279,7 @@ static void BM_HashJoinBasic_MatchesPerRow(benchmark::State& st) { settings.cardinality = 1.0 / static_cast(st.range(0)); settings.num_build_batches = static_cast(st.range(1)); - settings.num_probe_batches = settings.num_probe_batches; + settings.num_probe_batches = settings.num_build_batches; HashJoinBasicBenchmarkImpl(st, settings); } @@ -291,7 +291,7 @@ static void BM_HashJoinBasic_PayloadSize(benchmark::State& st) { settings.cardinality = 1.0 / static_cast(st.range(1)); settings.num_build_batches = static_cast(st.range(2)); - settings.num_probe_batches = settings.num_probe_batches; + settings.num_probe_batches = settings.num_build_batches; HashJoinBasicBenchmarkImpl(st, settings); } diff --git a/cpp/src/arrow/acero/hash_join_dict.cc b/cpp/src/arrow/acero/hash_join_dict.cc index 3aef08e6e9ccf..8db9dddb2c3a0 100644 --- a/cpp/src/arrow/acero/hash_join_dict.cc +++ b/cpp/src/arrow/acero/hash_join_dict.cc @@ -225,21 +225,20 @@ Status HashJoinDictBuild::Init(ExecContext* ctx, std::shared_ptr dictiona return Status::OK(); } - dictionary_ = dictionary; + dictionary_ = std::move(dictionary); // Initialize encoder RowEncoder encoder; - std::vector encoder_types; - encoder_types.emplace_back(value_type_); + std::vector encoder_types{value_type_}; encoder.Init(encoder_types, ctx); // Encode all dictionary values - int64_t 
length = dictionary->data()->length; + int64_t length = dictionary_->data()->length; if (length >= std::numeric_limits::max()) { return Status::Invalid( "Dictionary length in hash join must fit into signed 32-bit integer."); } - RETURN_NOT_OK(encoder.EncodeAndAppend(ExecSpan({*dictionary->data()}, length))); + RETURN_NOT_OK(encoder.EncodeAndAppend(ExecSpan({*dictionary_->data()}, length))); std::vector entries_to_take; diff --git a/cpp/src/arrow/acero/hash_join_dict.h b/cpp/src/arrow/acero/hash_join_dict.h index c7d8d785d079e..02454a7146278 100644 --- a/cpp/src/arrow/acero/hash_join_dict.h +++ b/cpp/src/arrow/acero/hash_join_dict.h @@ -22,7 +22,7 @@ #include "arrow/acero/schema_util.h" #include "arrow/compute/exec.h" -#include "arrow/compute/kernels/row_encoder_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/acero/hash_join_node.cc b/cpp/src/arrow/acero/hash_join_node.cc index 67f902e64be93..80dd163ced740 100644 --- a/cpp/src/arrow/acero/hash_join_node.cc +++ b/cpp/src/arrow/acero/hash_join_node.cc @@ -61,30 +61,30 @@ Result> HashJoinSchema::ComputePayload( const std::vector& filter, const std::vector& keys) { // payload = (output + filter) - keys, with no duplicates std::unordered_set payload_fields; - for (auto ref : output) { + for (const auto& ref : output) { ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema)); payload_fields.insert(match[0]); } - for (auto ref : filter) { + for (const auto& ref : filter) { ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema)); payload_fields.insert(match[0]); } - for (auto ref : keys) { + for (const auto& ref : keys) { ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema)); payload_fields.erase(match[0]); } std::vector payload_refs; - for (auto ref : output) { + for (const auto& ref : output) { ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema)); if (payload_fields.find(match[0]) != payload_fields.end()) { payload_refs.push_back(ref); payload_fields.erase(match[0]); } } - for (auto ref : filter) { + for (const auto& ref : filter) { ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema)); if (payload_fields.find(match[0]) != payload_fields.end()) { payload_refs.push_back(ref); @@ -198,7 +198,7 @@ Status HashJoinSchema::ValidateSchemas(JoinType join_type, const Schema& left_sc return Status::Invalid("Different number of key fields on left (", left_keys.size(), ") and right (", right_keys.size(), ") side of the join"); } - if (left_keys.size() < 1) { + if (left_keys.empty()) { return Status::Invalid("Join key cannot be empty"); } for (size_t i = 0; i < left_keys.size() + right_keys.size(); ++i) { @@ -432,7 +432,7 @@ Status HashJoinSchema::CollectFilterColumns(std::vector& left_filter, indices[0] -= left_schema.num_fields(); FieldPath corrected_path(std::move(indices)); if (right_seen_paths.find(*path) == right_seen_paths.end()) { - right_filter.push_back(corrected_path); + right_filter.emplace_back(corrected_path); right_seen_paths.emplace(std::move(corrected_path)); } } else if (left_seen_paths.find(*path) == left_seen_paths.end()) { @@ -698,7 +698,7 @@ class HashJoinNode : public ExecNode, public TracedNode { std::shared_ptr output_schema, std::unique_ptr schema_mgr, Expression filter, std::unique_ptr impl) - : ExecNode(plan, inputs, {"left", "right"}, + : ExecNode(plan, std::move(inputs), {"left", "right"}, /*output_schema=*/std::move(output_schema)), TracedNode(this), join_type_(join_options.join_type), diff --git 
a/cpp/src/arrow/acero/hash_join_node.h b/cpp/src/arrow/acero/hash_join_node.h index ad60019ceabc4..19745b8675cf0 100644 --- a/cpp/src/arrow/acero/hash_join_node.h +++ b/cpp/src/arrow/acero/hash_join_node.h @@ -65,9 +65,9 @@ class ARROW_ACERO_EXPORT HashJoinSchema { std::shared_ptr MakeOutputSchema(const std::string& left_field_name_suffix, const std::string& right_field_name_suffix); - bool LeftPayloadIsEmpty() { return PayloadIsEmpty(0); } + bool LeftPayloadIsEmpty() const { return PayloadIsEmpty(0); } - bool RightPayloadIsEmpty() { return PayloadIsEmpty(1); } + bool RightPayloadIsEmpty() const { return PayloadIsEmpty(1); } static int kMissingField() { return SchemaProjectionMaps::kMissingField; @@ -88,7 +88,7 @@ class ARROW_ACERO_EXPORT HashJoinSchema { const SchemaProjectionMap& right_to_filter, const Expression& filter); - bool PayloadIsEmpty(int side) { + bool PayloadIsEmpty(int side) const { assert(side == 0 || side == 1); return proj_maps[side].num_cols(HashJoinProjection::PAYLOAD) == 0; } diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index f7b442cc3c624..76ad9c7d650eb 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -26,10 +26,12 @@ #include "arrow/acero/test_util_internal.h" #include "arrow/acero/util.h" #include "arrow/api.h" -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/kernels/test_util.h" #include "arrow/compute/light_array_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" +#include "arrow/extension/uuid.h" #include "arrow/testing/extension_type.h" +#include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" @@ -40,6 +42,10 @@ using testing::UnorderedElementsAreArray; namespace arrow { +using arrow::gen::Constant; +using arrow::random::kSeedMax; +using arrow::random::RandomArrayGenerator; +using compute::and_; using compute::call; using compute::default_exec_context; using compute::ExecBatchBuilder; @@ -3253,5 +3259,192 @@ TEST(HashJoin, ManyJoins) { ASSERT_OK_AND_ASSIGN(std::ignore, DeclarationToTable(std::move(root))); } +namespace { + +void AssertRowCountEq(Declaration source, int64_t expected) { + Declaration count{"aggregate", + {std::move(source)}, + AggregateNodeOptions{/*aggregates=*/{{"count_all", "count(*)"}}}}; + ASSERT_OK_AND_ASSIGN(auto batches, DeclarationToExecBatches(std::move(count))); + ASSERT_EQ(batches.batches.size(), 1); + ASSERT_EQ(batches.batches[0].values.size(), 1); + ASSERT_TRUE(batches.batches[0].values[0].is_scalar()); + ASSERT_EQ(batches.batches[0].values[0].scalar()->type->id(), Type::INT64); + ASSERT_TRUE(batches.batches[0].values[0].scalar_as().is_valid); + ASSERT_EQ(batches.batches[0].values[0].scalar_as().value, expected); +} + +} // namespace + +// GH-43495: Test that both the key and the payload of the right side (the build side) are +// fixed length and larger than 4GB, and the 64-bit offset in the hash table can handle it +// correctly. 
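Before the test itself, a rough standalone check of the sizes involved may help: with the constants used below (fixed_size_binary(128), 4096 rows per right-side batch, a 5 GiB target), the build side holds roughly 42 million rows and 5 GiB of key bytes, so a 32-bit byte offset wraps long before the last row. This is an illustration only, not part of the patch; the constant names mirror the test, but the program is self-contained.

// Standalone sanity check of the sizes used by the >4GB fixed-length test below.
// Illustration only; it just reproduces the arithmetic with the test's constants.
#include <cstdint>
#include <cstdio>

int main() {
  constexpr int64_t k5GB = 5LL * 1024 * 1024 * 1024;
  constexpr int64_t kByteWidth = 128;            // fixed_size_binary(128)
  constexpr int64_t kRowsPerBatchRight = 4096;
  const int64_t num_batches_right = k5GB / (kRowsPerBatchRight * kByteWidth);  // 10240
  const int64_t total_rows = num_batches_right * kRowsPerBatchRight;           // ~41.9M
  const int64_t total_key_bytes = total_rows * kByteWidth;                     // 5 GiB

  // A 32-bit byte offset (row_id * byte_width) wraps before the last row, which is
  // exactly what a 64-bit row-table offset type avoids.
  const int64_t last_row = total_rows - 1;
  const uint32_t offset32 = static_cast<uint32_t>(last_row * kByteWidth);  // wrapped
  const int64_t offset64 = last_row * kByteWidth;                          // correct

  std::printf("batches=%lld rows=%lld key bytes=%lld\n", (long long)num_batches_right,
              (long long)total_rows, (long long)total_key_bytes);
  std::printf("offset of last row: 32-bit=%u vs 64-bit=%lld\n", offset32,
              (long long)offset64);
  return 0;
}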
+TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBFixedLength)) { + constexpr int64_t k5GB = 5ll * 1024 * 1024 * 1024; + constexpr int fixed_length = 128; + const auto type = fixed_size_binary(fixed_length); + constexpr uint8_t byte_no_match_min = static_cast('A'); + constexpr uint8_t byte_no_match_max = static_cast('y'); + constexpr uint8_t byte_match = static_cast('z'); + const auto value_match = + std::make_shared(std::string(fixed_length, byte_match)); + constexpr int16_t num_rows_per_batch_left = 128; + constexpr int16_t num_rows_per_batch_right = 4096; + const int64_t num_batches_left = 8; + const int64_t num_batches_right = + k5GB / (num_rows_per_batch_right * type->byte_width()); + + // Left side composed of num_batches_left identical batches of num_rows_per_batch_left + // rows of value_match-es. + BatchesWithSchema batches_left; + { + // A column with num_rows_per_batch_left value_match-es. + ASSERT_OK_AND_ASSIGN(auto column, + Constant(value_match)->Generate(num_rows_per_batch_left)); + + // Use the column as both the key and the payload. + ExecBatch batch({column, column}, num_rows_per_batch_left); + batches_left = + BatchesWithSchema{std::vector(num_batches_left, std::move(batch)), + schema({field("l_key", type), field("l_payload", type)})}; + } + + // Right side composed of num_batches_right identical batches of + // num_rows_per_batch_right rows containing only 1 value_match. + BatchesWithSchema batches_right; + { + // A column with (num_rows_per_batch_right - 1) non-value_match-es (possibly null) and + // 1 value_match. + auto non_matches = RandomArrayGenerator(kSeedMax).FixedSizeBinary( + num_rows_per_batch_right - 1, fixed_length, + /*null_probability =*/0.01, /*min_byte=*/byte_no_match_min, + /*max_byte=*/byte_no_match_max); + ASSERT_OK_AND_ASSIGN(auto match, Constant(value_match)->Generate(1)); + ASSERT_OK_AND_ASSIGN(auto column, Concatenate({non_matches, match})); + + // Use the column as both the key and the payload. + ExecBatch batch({column, column}, num_rows_per_batch_right); + batches_right = + BatchesWithSchema{std::vector(num_batches_right, std::move(batch)), + schema({field("r_key", type), field("r_payload", type)})}; + } + + Declaration left{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_left.schema), + std::move(batches_left.batches))}; + + Declaration right{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_right.schema), + std::move(batches_right.batches))}; + + HashJoinNodeOptions join_opts(JoinType::INNER, /*left_keys=*/{"l_key"}, + /*right_keys=*/{"r_key"}); + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; + + ASSERT_OK_AND_ASSIGN(auto batches_result, DeclarationToExecBatches(std::move(join))); + Declaration result{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_result.schema), + std::move(batches_result.batches))}; + + // The row count of hash join should be (number of value_match-es in left side) * + // (number of value_match-es in right side). + AssertRowCountEq(result, + num_batches_left * num_rows_per_batch_left * num_batches_right); + + // All rows should be value_match-es. 
+ auto predicate = and_({equal(field_ref("l_key"), literal(value_match)), + equal(field_ref("l_payload"), literal(value_match)), + equal(field_ref("r_key"), literal(value_match)), + equal(field_ref("r_payload"), literal(value_match))}); + Declaration filter{"filter", {result}, FilterNodeOptions{std::move(predicate)}}; + AssertRowCountEq(std::move(filter), + num_batches_left * num_rows_per_batch_left * num_batches_right); +} + +// GH-43495: Test that both the key and the payload of the right side (the build side) are +// var length and larger than 4GB, and the 64-bit offset in the hash table can handle it +// correctly. +TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { + constexpr int64_t k5GB = 5ll * 1024 * 1024 * 1024; + const auto type = utf8(); + constexpr int value_no_match_length_min = 128; + constexpr int value_no_match_length_max = 129; + constexpr int value_match_length = 130; + const auto value_match = + std::make_shared(std::string(value_match_length, 'X')); + constexpr int16_t num_rows_per_batch_left = 128; + constexpr int16_t num_rows_per_batch_right = 4096; + const int64_t num_batches_left = 8; + const int64_t num_batches_right = + k5GB / (num_rows_per_batch_right * value_no_match_length_min); + + // Left side composed of num_batches_left identical batches of num_rows_per_batch_left + // rows of value_match-es. + BatchesWithSchema batches_left; + { + // A column with num_rows_per_batch_left value_match-es. + ASSERT_OK_AND_ASSIGN(auto column, + Constant(value_match)->Generate(num_rows_per_batch_left)); + + // Use the column as both the key and the payload. + ExecBatch batch({column, column}, num_rows_per_batch_left); + batches_left = + BatchesWithSchema{std::vector(num_batches_left, std::move(batch)), + schema({field("l_key", type), field("l_payload", type)})}; + } + + // Right side composed of num_batches_right identical batches of + // num_rows_per_batch_right rows containing only 1 value_match. + BatchesWithSchema batches_right; + { + // A column with (num_rows_per_batch_right - 1) non-value_match-es (possibly null) and + // 1 value_match. + auto non_matches = + RandomArrayGenerator(kSeedMax).String(num_rows_per_batch_right - 1, + /*min_length=*/value_no_match_length_min, + /*max_length=*/value_no_match_length_max, + /*null_probability =*/0.01); + ASSERT_OK_AND_ASSIGN(auto match, Constant(value_match)->Generate(1)); + ASSERT_OK_AND_ASSIGN(auto column, Concatenate({non_matches, match})); + + // Use the column as both the key and the payload. 
+ ExecBatch batch({column, column}, num_rows_per_batch_right); + batches_right = + BatchesWithSchema{std::vector(num_batches_right, std::move(batch)), + schema({field("r_key", type), field("r_payload", type)})}; + } + + Declaration left{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_left.schema), + std::move(batches_left.batches))}; + + Declaration right{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_right.schema), + std::move(batches_right.batches))}; + + HashJoinNodeOptions join_opts(JoinType::INNER, /*left_keys=*/{"l_key"}, + /*right_keys=*/{"r_key"}); + Declaration join{"hashjoin", {std::move(left), std::move(right)}, join_opts}; + + ASSERT_OK_AND_ASSIGN(auto batches_result, DeclarationToExecBatches(std::move(join))); + Declaration result{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(batches_result.schema), + std::move(batches_result.batches))}; + + // The row count of hash join should be (number of value_match-es in left side) * + // (number of value_match-es in right side). + AssertRowCountEq(result, + num_batches_left * num_rows_per_batch_left * num_batches_right); + + // All rows should be value_match-es. + auto predicate = and_({equal(field_ref("l_key"), literal(value_match)), + equal(field_ref("l_payload"), literal(value_match)), + equal(field_ref("r_key"), literal(value_match)), + equal(field_ref("r_payload"), literal(value_match))}); + Declaration filter{"filter", {result}, FilterNodeOptions{std::move(predicate)}}; + AssertRowCountEq(std::move(filter), + num_batches_left * num_rows_per_batch_left * num_batches_right); +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/options_internal.h b/cpp/src/arrow/acero/options_internal.h index d4bf79a7cd008..fd3ea78116572 100644 --- a/cpp/src/arrow/acero/options_internal.h +++ b/cpp/src/arrow/acero/options_internal.h @@ -18,8 +18,8 @@ #pragma once #ifndef NDEBUG -#include -#include +# include +# include #endif namespace arrow { diff --git a/cpp/src/arrow/acero/scalar_aggregate_node.cc b/cpp/src/arrow/acero/scalar_aggregate_node.cc index c7805f4d24eb2..b34f7511cc12b 100644 --- a/cpp/src/arrow/acero/scalar_aggregate_node.cc +++ b/cpp/src/arrow/acero/scalar_aggregate_node.cc @@ -234,7 +234,8 @@ Status ScalarAggregateNode::InputReceived(ExecNode* input, ExecBatch batch) { // (1) The segment is starting of a new segment group and points to // the beginning of the batch, then it means no data in the batch belongs // to the current segment group. We can output and reset kernel states. - if (!segment.extends && segment.offset == 0) RETURN_NOT_OK(OutputResult(false)); + if (!segment.extends && segment.offset == 0) + RETURN_NOT_OK(OutputResult(/*is_last=*/false)); // We add segment to the current segment group aggregation auto exec_batch = full_batch.Slice(segment.offset, segment.length); @@ -244,7 +245,7 @@ Status ScalarAggregateNode::InputReceived(ExecNode* input, ExecBatch batch) { // If the segment closes the current segment group, we can output segment group // aggregation. 
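The two OutputResult call sites in this handler implement a simple rule: flush the running aggregate when a batch's first segment does not extend the current group, and flush again whenever a segment is closed. A minimal plain-C++ model of that rule, not the Arrow node itself; the Segment fields mirror the {offset, length, is_open, extends} quadruples used in the segmenter tests, and the running sum is a stand-in for the kernel state.

#include <cstdint>
#include <cstdio>
#include <vector>

struct Segment {
  int64_t offset;
  int64_t length;
  bool is_open;   // true if the segment may continue into the next batch
  bool extends;   // true if it continues the previous segment group
};

int main() {
  // One batch of values, pre-split into segments as a row segmenter would report them.
  std::vector<int> batch = {1, 1, 2, 2, 2, 3};
  std::vector<Segment> segments = {{0, 2, false, true}, {2, 3, false, false},
                                   {5, 1, true, false}};
  int64_t sum = 0;  // "kernel state" for a sum aggregate
  for (const Segment& seg : segments) {
    if (!seg.extends && seg.offset == 0) {  // nothing in this batch belongs to the group
      std::printf("group result: %lld\n", (long long)sum);
      sum = 0;
    }
    for (int64_t i = seg.offset; i < seg.offset + seg.length; ++i) sum += batch[i];
    if (!seg.is_open) {  // the group is complete: output and reset
      std::printf("group result: %lld\n", (long long)sum);
      sum = 0;
    }
  }
  // An open trailing segment carries its partial sum into the next batch.
  return 0;
}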
- if (!segment.is_open) RETURN_NOT_OK(OutputResult(false)); + if (!segment.is_open) RETURN_NOT_OK(OutputResult(/*is_last=*/false)); return Status::OK(); }; diff --git a/cpp/src/arrow/acero/sorted_merge_node.cc b/cpp/src/arrow/acero/sorted_merge_node.cc index a71ac79efcc46..2845383cee982 100644 --- a/cpp/src/arrow/acero/sorted_merge_node.cc +++ b/cpp/src/arrow/acero/sorted_merge_node.cc @@ -28,7 +28,7 @@ #include "arrow/acero/options.h" #include "arrow/acero/query_context.h" #include "arrow/acero/time_series_util.h" -#include "arrow/acero/unmaterialized_table.h" +#include "arrow/acero/unmaterialized_table_internal.h" #include "arrow/acero/util.h" #include "arrow/array/builder_base.h" #include "arrow/result.h" diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 732deb72861d6..6c783110af571 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -24,10 +24,10 @@ #include "arrow/acero/swiss_join_internal.h" #include "arrow/acero/util.h" #include "arrow/array/util.h" // MakeArrayFromScalar -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/key_hash_internal.h" #include "arrow/compute/row/compare_internal.h" #include "arrow/compute/row/encode_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/tracing_internal.h" @@ -122,7 +122,7 @@ void RowArrayAccessor::Visit(const RowTableImpl& rows, int column_id, int num_ro if (!is_fixed_length_column) { int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); const uint8_t* row_ptr_base = rows.data(2); - const uint32_t* row_offsets = rows.offsets(); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); uint32_t field_offset_within_row, field_length; if (varbinary_column_id == 0) { @@ -173,7 +173,7 @@ void RowArrayAccessor::Visit(const RowTableImpl& rows, int column_id, int num_ro // Case 4: This is a fixed length column in a varying length row // const uint8_t* row_ptr_base = rows.data(2) + field_offset_within_row; - const uint32_t* row_offsets = rows.offsets(); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); for (int i = 0; i < num_rows; ++i) { uint32_t row_id = row_ids[i]; const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; @@ -473,17 +473,10 @@ Status RowArrayMerge::PrepareForMerge(RowArray* target, (*first_target_row_id)[sources.size()] = num_rows; } - if (num_bytes > std::numeric_limits::max()) { - return Status::Invalid( - "There are more than 2^32 bytes of key data. Acero cannot " - "process a join of this magnitude"); - } - // Allocate target memory // target->rows_.Clean(); - RETURN_NOT_OK(target->rows_.AppendEmpty(static_cast(num_rows), - static_cast(num_bytes))); + RETURN_NOT_OK(target->rows_.AppendEmpty(static_cast(num_rows), num_bytes)); // In case of varying length rows, // initialize the first row offset for each range of rows corresponding to a @@ -565,15 +558,15 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& int64_t first_target_row_offset, const int64_t* source_rows_permutation) { int64_t num_source_rows = source.length(); - uint32_t* target_offsets = target->mutable_offsets(); - const uint32_t* source_offsets = source.offsets(); + RowTableImpl::offset_type* target_offsets = target->mutable_offsets(); + const RowTableImpl::offset_type* source_offsets = source.offsets(); // Permutation of source rows is optional. 
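The merge loop below now carries 64-bit row offsets end to end while still requiring each individual row length to fit in 32 bits. A minimal model of that invariant follows; AppendOffsets is a made-up name for illustration, and the real code writes into a preallocated row table and deliberately skips the closing offset to allow concurrent copies.

#include <cassert>
#include <cstdint>
#include <limits>
#include <vector>

using offset_type = int64_t;  // mirrors the 64-bit row-table offset type

void AppendOffsets(const std::vector<offset_type>& source_offsets,  // size = n_rows + 1
                   offset_type first_target_row_offset,
                   std::vector<offset_type>* target_offsets) {
  const size_t num_rows = source_offsets.size() - 1;
  offset_type target_row_offset = first_target_row_offset;
  for (size_t i = 0; i < num_rows; ++i) {
    const int64_t length = source_offsets[i + 1] - source_offsets[i];
    // A single row must still fit in 32 bits, even though the running offset is 64-bit.
    assert(length <= std::numeric_limits<uint32_t>::max());
    target_offsets->push_back(target_row_offset);
    target_row_offset += length;
  }
  target_offsets->push_back(target_row_offset);  // closing offset of the appended range
}

int main() {
  std::vector<offset_type> merged;
  // The first target offset may already be past the 4GB mark.
  AppendOffsets({0, 128, 300}, /*first_target_row_offset=*/5'000'000'000LL, &merged);
  return 0;
}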
// if (!source_rows_permutation) { int64_t target_row_offset = first_target_row_offset; for (int64_t i = 0; i < num_source_rows; ++i) { - target_offsets[first_target_row_id + i] = static_cast(target_row_offset); + target_offsets[first_target_row_id + i] = target_row_offset; target_row_offset += source_offsets[i + 1] - source_offsets[i]; } // We purposefully skip outputting of N+1 offset, to allow concurrent @@ -593,7 +586,10 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& int64_t source_row_id = source_rows_permutation[i]; const uint64_t* source_row_ptr = reinterpret_cast( source.data(2) + source_offsets[source_row_id]); - uint32_t length = source_offsets[source_row_id + 1] - source_offsets[source_row_id]; + int64_t length = source_offsets[source_row_id + 1] - source_offsets[source_row_id]; + // Though the row offset is 64-bit, the length of a single row must be 32-bit as + // required by current row table implementation. + DCHECK_LE(length, std::numeric_limits::max()); // Rows should be 64-bit aligned. // In that case we can copy them using a sequence of 64-bit read/writes. @@ -604,7 +600,7 @@ void RowArrayMerge::CopyVaryingLength(RowTableImpl* target, const RowTableImpl& *target_row_ptr++ = *source_row_ptr++; } - target_offsets[first_target_row_id + i] = static_cast(target_row_offset); + target_offsets[first_target_row_id + i] = target_row_offset; target_row_offset += length; } } @@ -1671,7 +1667,7 @@ Result> JoinResultMaterialize::FlushBuildColumn( const std::shared_ptr& data_type, const RowArray* row_array, int column_id, uint32_t* row_ids) { ResizableArrayData output; - output.Init(data_type, pool_, bit_util::Log2(num_rows_)); + RETURN_NOT_OK(output.Init(data_type, pool_, bit_util::Log2(num_rows_))); for (size_t i = 0; i <= null_ranges_.size(); ++i) { int row_id_begin = @@ -2251,8 +2247,9 @@ Result JoinResidualFilter::MaterializeFilterInput( build_schemas_->map(HashJoinProjection::FILTER, HashJoinProjection::PAYLOAD); for (int i = 0; i < num_build_cols; ++i) { ResizableArrayData column_data; - column_data.Init(build_schemas_->data_type(HashJoinProjection::FILTER, i), pool_, - bit_util::Log2(num_batch_rows)); + RETURN_NOT_OK( + column_data.Init(build_schemas_->data_type(HashJoinProjection::FILTER, i), + pool_, bit_util::Log2(num_batch_rows))); if (auto idx = to_key.get(i); idx != SchemaProjectionMap::kMissingField) { RETURN_NOT_OK(build_keys_->DecodeSelected(&column_data, idx, num_batch_rows, key_ids_maybe_null, pool_)); diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index 0888dd8938455..1076073523448 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -15,14 +15,16 @@ // specific language governing permissions and limitations // under the License. -#include - #include "arrow/acero/swiss_join_internal.h" #include "arrow/util/bit_util.h" +#include "arrow/util/simd.h" namespace arrow { namespace acero { +// TODO(GH-43693): The functions in this file are not wired anywhere. We may consider +// actually utilizing them or removing them. 
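The rewritten Visit_avx2 below repeatedly uses one pattern: split 8 32-bit row ids into two 128-bit halves and gather 4 64-bit row offsets per half with _mm256_i32gather_epi64. A self-contained sketch of just that pattern (illustration only, not the Arrow kernel; compile with -mavx2; the array contents are made up):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t row_offsets[] = {0, 100, 250, 4'500'000'000LL, 4'500'000'128LL,
                                 9'000'000'000LL, 9'000'000'064LL, 9'000'000'192LL};
  const uint32_t row_ids[8] = {7, 0, 3, 5, 1, 6, 2, 4};

  // Load 8 32-bit row ids.
  __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids));
  // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 row ids.
  __m256i offset_lo =
      _mm256_i32gather_epi64(reinterpret_cast<const long long*>(row_offsets),
                             _mm256_castsi256_si128(row_id), sizeof(int64_t));
  __m256i offset_hi =
      _mm256_i32gather_epi64(reinterpret_cast<const long long*>(row_offsets),
                             _mm256_extracti128_si256(row_id, 1), sizeof(int64_t));

  int64_t out[8];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), offset_lo);
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out + 4), offset_hi);
  for (int i = 0; i < 8; ++i) {
    std::printf("row %u -> offset %lld (scalar %lld)\n", row_ids[i], (long long)out[i],
                (long long)row_offsets[row_ids[i]]);
  }
  return 0;
}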
+ template int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int num_rows, const uint32_t* row_ids, @@ -45,48 +47,78 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu if (!is_fixed_length_column) { int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); const uint8_t* row_ptr_base = rows.data(2); - const uint32_t* row_offsets = rows.offsets(); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); + static_assert( + sizeof(RowTableImpl::offset_type) == sizeof(int64_t), + "RowArrayAccessor::Visit_avx2 only supports 64-bit RowTableImpl::offset_type"); if (varbinary_column_id == 0) { // Case 1: This is the first varbinary column // __m256i field_offset_within_row = _mm256_set1_epi32(rows.metadata().fixed_length); __m256i varbinary_end_array_offset = - _mm256_set1_epi32(rows.metadata().varbinary_end_array_offset); + _mm256_set1_epi64x(rows.metadata().varbinary_end_array_offset); for (int i = 0; i < num_rows / unroll; ++i) { + // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_offset = _mm256_i32gather_epi32( - reinterpret_cast(row_offsets), row_id, sizeof(uint32_t)); + // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit + // row ids. + __m256i row_offset_lo = + _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + sizeof(RowTableImpl::offset_type)); + __m256i row_offset_hi = + _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + sizeof(RowTableImpl::offset_type)); + // Gather the lower/higher 4 32-bit field lengths based on the lower/higher 4 + // 64-bit row offsets. + __m128i field_length_lo = _mm256_i64gather_epi32( + reinterpret_cast(row_ptr_base), + _mm256_add_epi64(row_offset_lo, varbinary_end_array_offset), 1); + __m128i field_length_hi = _mm256_i64gather_epi32( + reinterpret_cast(row_ptr_base), + _mm256_add_epi64(row_offset_hi, varbinary_end_array_offset), 1); + // The final 8 32-bit field lengths, subtracting the field offset within row. __m256i field_length = _mm256_sub_epi32( - _mm256_i32gather_epi32( - reinterpret_cast(row_ptr_base), - _mm256_add_epi32(row_offset, varbinary_end_array_offset), 1), - field_offset_within_row); + _mm256_set_m128i(field_length_hi, field_length_lo), field_offset_within_row); process_8_values_fn(i * unroll, row_ptr_base, - _mm256_add_epi32(row_offset, field_offset_within_row), + _mm256_add_epi64(row_offset_lo, field_offset_within_row), + _mm256_add_epi64(row_offset_hi, field_offset_within_row), field_length); } } else { // Case 2: This is second or later varbinary column // __m256i varbinary_end_array_offset = - _mm256_set1_epi32(rows.metadata().varbinary_end_array_offset + - sizeof(uint32_t) * (varbinary_column_id - 1)); + _mm256_set1_epi64x(rows.metadata().varbinary_end_array_offset + + sizeof(uint32_t) * (varbinary_column_id - 1)); auto row_ptr_base_i64 = reinterpret_cast(row_ptr_base); for (int i = 0; i < num_rows / unroll; ++i) { + // Load 8 32-bit row ids. 
__m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_offset = _mm256_i32gather_epi32( - reinterpret_cast(row_offsets), row_id, sizeof(uint32_t)); - __m256i end_array_offset = - _mm256_add_epi32(row_offset, varbinary_end_array_offset); - - __m256i field_offset_within_row_A = _mm256_i32gather_epi64( - row_ptr_base_i64, _mm256_castsi256_si128(end_array_offset), 1); - __m256i field_offset_within_row_B = _mm256_i32gather_epi64( - row_ptr_base_i64, _mm256_extracti128_si256(end_array_offset, 1), 1); + // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit + // row ids. + __m256i row_offset_lo = + _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + sizeof(RowTableImpl::offset_type)); + // Gather the lower/higher 4 32-bit field lengths based on the lower/higher 4 + // 64-bit row offsets. + __m256i row_offset_hi = + _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + sizeof(RowTableImpl::offset_type)); + // Prepare the lower/higher 4 64-bit end array offsets based on the lower/higher 4 + // 64-bit row offsets. + __m256i end_array_offset_lo = + _mm256_add_epi64(row_offset_lo, varbinary_end_array_offset); + __m256i end_array_offset_hi = + _mm256_add_epi64(row_offset_hi, varbinary_end_array_offset); + + __m256i field_offset_within_row_A = + _mm256_i64gather_epi64(row_ptr_base_i64, end_array_offset_lo, 1); + __m256i field_offset_within_row_B = + _mm256_i64gather_epi64(row_ptr_base_i64, end_array_offset_hi, 1); field_offset_within_row_A = _mm256_permutevar8x32_epi32( field_offset_within_row_A, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); field_offset_within_row_B = _mm256_permutevar8x32_epi32( @@ -110,8 +142,14 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu 0x4e); // Swapping low and high 128-bits field_length = _mm256_sub_epi32(field_length, field_offset_within_row); + field_offset_within_row_A = + _mm256_add_epi32(field_offset_within_row_A, alignment_padding); + field_offset_within_row_B = + _mm256_add_epi32(field_offset_within_row_B, alignment_padding); + process_8_values_fn(i * unroll, row_ptr_base, - _mm256_add_epi32(row_offset, field_offset_within_row), + _mm256_add_epi64(row_offset_lo, field_offset_within_row_A), + _mm256_add_epi64(row_offset_hi, field_offset_within_row_B), field_length); } } @@ -119,7 +157,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu if (is_fixed_length_column) { __m256i field_offset_within_row = - _mm256_set1_epi32(rows.metadata().encoded_field_offset( + _mm256_set1_epi64x(rows.metadata().encoded_field_offset( rows.metadata().pos_after_encoding(column_id))); __m256i field_length = _mm256_set1_epi32(rows.metadata().column_metadatas[column_id].fixed_length); @@ -130,24 +168,51 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu // const uint8_t* row_ptr_base = rows.data(1); for (int i = 0; i < num_rows / unroll; ++i) { + // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_offset = _mm256_mullo_epi32(row_id, field_length); - __m256i field_offset = _mm256_add_epi32(row_offset, field_offset_within_row); - process_8_values_fn(i * unroll, row_ptr_base, field_offset, field_length); + // Widen the 32-bit row ids to 64-bit and store the lower/higher 4 of them into 2 + // 256-bit registers. 
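For fixed-length rows the offset is computed rather than gathered: the row ids are widened to 64-bit lanes and _mm256_mul_epi32 multiplies their low 32 bits by the byte width, yielding full 64-bit products so row_id * byte_width no longer wraps at 4GB. A standalone sketch of that widening multiply (illustration only; compile with -mavx2):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t row_ids[4] = {0, 1, 40'000'000u, 41'943'039u};  // large ids * 128 > 4GB
  const int32_t byte_width = 128;

  __m128i id32 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(row_ids));
  // Widen the 4 32-bit row ids to 64-bit lanes, then multiply their low 32 bits by the
  // fixed byte width, producing 64-bit byte offsets.
  __m256i id64 = _mm256_cvtepi32_epi64(id32);
  __m256i width = _mm256_set1_epi32(byte_width);
  __m256i offsets = _mm256_mul_epi32(id64, width);

  int64_t out[4];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), offsets);
  for (int i = 0; i < 4; ++i) {
    std::printf("row %u -> offset %lld (scalar %lld)\n", row_ids[i], (long long)out[i],
                (long long)row_ids[i] * byte_width);
  }
  return 0;
}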
+ __m256i row_id_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(row_id)); + __m256i row_id_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(row_id, 1)); + // Calculate the lower/higher 4 64-bit row offsets based on the lower/higher 4 + // 64-bit row ids and the fixed field length. + __m256i row_offset_lo = _mm256_mul_epi32(row_id_lo, field_length); + __m256i row_offset_hi = _mm256_mul_epi32(row_id_hi, field_length); + // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 + // 64-bit row offsets and field offset within row. + __m256i field_offset_lo = + _mm256_add_epi64(row_offset_lo, field_offset_within_row); + __m256i field_offset_hi = + _mm256_add_epi64(row_offset_hi, field_offset_within_row); + process_8_values_fn(i * unroll, row_ptr_base, field_offset_lo, field_offset_hi, + field_length); } } else { // Case 4: This is a fixed length column in varying length row // const uint8_t* row_ptr_base = rows.data(2); - const uint32_t* row_offsets = rows.offsets(); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); for (int i = 0; i < num_rows / unroll; ++i) { + // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast(row_ids) + i); - __m256i row_offset = _mm256_i32gather_epi32( - reinterpret_cast(row_offsets), row_id, sizeof(uint32_t)); - __m256i field_offset = _mm256_add_epi32(row_offset, field_offset_within_row); - process_8_values_fn(i * unroll, row_ptr_base, field_offset, field_length); + // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit + // row ids. + __m256i row_offset_lo = + _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + sizeof(RowTableImpl::offset_type)); + __m256i row_offset_hi = + _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + sizeof(RowTableImpl::offset_type)); + // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 + // 64-bit row offsets and field offset within row. 
+ __m256i field_offset_lo = + _mm256_add_epi64(row_offset_lo, field_offset_within_row); + __m256i field_offset_hi = + _mm256_add_epi64(row_offset_hi, field_offset_within_row); + process_8_values_fn(i * unroll, row_ptr_base, field_offset_lo, field_offset_hi, + field_length); } } } diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index dceb74abe4f1b..4d749c1c529ae 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -22,10 +22,10 @@ #include "arrow/acero/partition_util.h" #include "arrow/acero/schema_util.h" #include "arrow/acero/task_util.h" -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/key_map_internal.h" #include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/encode_internal.h" +#include "arrow/compute/row/row_encoder_internal.h" namespace arrow { diff --git a/cpp/src/arrow/acero/tpch_node_test.cc b/cpp/src/arrow/acero/tpch_node_test.cc index 076bcf634a6ba..17fb43452bc58 100644 --- a/cpp/src/arrow/acero/tpch_node_test.cc +++ b/cpp/src/arrow/acero/tpch_node_test.cc @@ -27,8 +27,8 @@ #include "arrow/acero/test_util_internal.h" #include "arrow/acero/tpch_node.h" #include "arrow/acero/util.h" -#include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/row/row_encoder_internal.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" diff --git a/cpp/src/arrow/acero/unmaterialized_table.h b/cpp/src/arrow/acero/unmaterialized_table_internal.h similarity index 100% rename from cpp/src/arrow/acero/unmaterialized_table.h rename to cpp/src/arrow/acero/unmaterialized_table_internal.h diff --git a/cpp/src/arrow/acero/util.h b/cpp/src/arrow/acero/util.h index 0eb9f4c87e180..ee46e8527422a 100644 --- a/cpp/src/arrow/acero/util.h +++ b/cpp/src/arrow/acero/util.h @@ -65,7 +65,7 @@ class ARROW_ACERO_EXPORT AtomicCounter { // return true if the counter is complete bool Increment() { - DCHECK_NE(count_.load(), total_.load()); + ARROW_DCHECK_NE(count_.load(), total_.load()); int count = count_.fetch_add(1) + 1; if (count != total_.load()) return false; return DoneOnce(); diff --git a/cpp/src/arrow/acero/visibility.h b/cpp/src/arrow/acero/visibility.h index 02382232b69dd..21a697a56eca9 100644 --- a/cpp/src/arrow/acero/visibility.h +++ b/cpp/src/arrow/acero/visibility.h @@ -20,31 +20,31 @@ #pragma once #if defined(_WIN32) || defined(__CYGWIN__) -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4251) -#else -#pragma GCC diagnostic ignored "-Wattributes" -#endif +# if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable : 4251) +# else +# pragma GCC diagnostic ignored "-Wattributes" +# endif -#ifdef ARROW_ACERO_STATIC -#define ARROW_ACERO_EXPORT -#elif defined(ARROW_ACERO_EXPORTING) -#define ARROW_ACERO_EXPORT __declspec(dllexport) -#else -#define ARROW_ACERO_EXPORT __declspec(dllimport) -#endif +# ifdef ARROW_ACERO_STATIC +# define ARROW_ACERO_EXPORT +# elif defined(ARROW_ACERO_EXPORTING) +# define ARROW_ACERO_EXPORT __declspec(dllexport) +# else +# define ARROW_ACERO_EXPORT __declspec(dllimport) +# endif -#define ARROW_ACERO_NO_EXPORT +# define ARROW_ACERO_NO_EXPORT #else // Not Windows -#ifndef ARROW_ACERO_EXPORT -#define ARROW_ACERO_EXPORT __attribute__((visibility("default"))) -#endif -#ifndef ARROW_ACERO_NO_EXPORT -#define ARROW_ACERO_NO_EXPORT __attribute__((visibility("hidden"))) -#endif +# 
ifndef ARROW_ACERO_EXPORT +# define ARROW_ACERO_EXPORT __attribute__((visibility("default"))) +# endif +# ifndef ARROW_ACERO_NO_EXPORT +# define ARROW_ACERO_NO_EXPORT __attribute__((visibility("hidden"))) +# endif #endif // Not-Windows #if defined(_MSC_VER) -#pragma warning(pop) +# pragma warning(pop) #endif diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 25759f8471365..d16b6cfd2e97d 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -25,7 +25,7 @@ #include #ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK -#include +# include #endif #include "arrow/adapters/orc/util.h" diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 716ae0722069e..e4af67d7e5f0b 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -232,6 +232,14 @@ class ARROW_EXPORT Array { /// \return DeviceAllocationType DeviceAllocationType device_type() const { return data_->device_type(); } + /// \brief Return the statistics of this Array + /// + /// This just delegates to calling statistics on the underlying ArrayData + /// object which backs this Array. + /// + /// \return const ArrayStatistics& + std::shared_ptr statistics() const { return data_->statistics; } + protected: Array() = default; ARROW_DEFAULT_MOVE_AND_ASSIGN(Array); diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc index d83ba0ca8936d..1266819bdb311 100644 --- a/cpp/src/arrow/array/array_binary.cc +++ b/cpp/src/arrow/array/array_binary.cc @@ -125,12 +125,8 @@ FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& type int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, - int64_t null_count, int64_t offset) - : PrimitiveArray(type, length, data, null_bitmap, null_count, offset), - byte_width_(checked_cast(*type).byte_width()) {} - -const uint8_t* FixedSizeBinaryArray::GetValue(int64_t i) const { - return raw_values_ + (i + data_->offset) * byte_width_; + int64_t null_count, int64_t offset) { + SetData(ArrayData::Make(type, length, {null_bitmap, data}, null_count, offset)); } } // namespace arrow diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 19fdee61243d1..63903eac46d41 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -57,8 +57,6 @@ class BaseBinaryArray : public FlatArray { /// Return the pointer to the given elements bytes // XXX should GetValue(int64_t i) return a string_view? 
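The accessor changes in the hunk that follows move slice-offset accounting out of every call: the raw offsets pointer is advanced once when the array data is set, so value_offset(i) and value_length(i) index it directly. A minimal model of the resulting layout (SlicedBinary is a made-up struct for illustration, not the Arrow class):

#include <cstdint>
#include <cstdio>
#include <vector>

struct SlicedBinary {
  const int32_t* raw_value_offsets;  // already advanced by the slice offset
  int64_t length;

  int32_t value_offset(int64_t i) const { return raw_value_offsets[i]; }
  int32_t value_length(int64_t i) const {
    return raw_value_offsets[i + 1] - raw_value_offsets[i];
  }
};

int main() {
  // Offsets for the values ["a", "bb", "ccc", "dddd"]; slice off the first element.
  std::vector<int32_t> offsets = {0, 1, 3, 6, 10};
  const int64_t slice_offset = 1;
  SlicedBinary sliced{offsets.data() + slice_offset, /*length=*/3};
  for (int64_t i = 0; i < sliced.length; ++i) {
    std::printf("value %lld: offset=%d length=%d\n", (long long)i, sliced.value_offset(i),
                sliced.value_length(i));
  }
  return 0;
}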
const uint8_t* GetValue(int64_t i, offset_type* out_length) const { - // Account for base offset - i += data_->offset; const offset_type pos = raw_value_offsets_[i]; *out_length = raw_value_offsets_[i + 1] - pos; return raw_data_ + pos; @@ -69,8 +67,6 @@ class BaseBinaryArray : public FlatArray { /// \param i the value index /// \return the view over the selected value std::string_view GetView(int64_t i) const { - // Account for base offset - i += data_->offset; const offset_type pos = raw_value_offsets_[i]; return std::string_view(reinterpret_cast(raw_data_ + pos), raw_value_offsets_[i + 1] - pos); @@ -99,9 +95,7 @@ class BaseBinaryArray : public FlatArray { /// Note that this buffer does not account for any slice offset std::shared_ptr value_data() const { return data_->buffers[2]; } - const offset_type* raw_value_offsets() const { - return raw_value_offsets_ + data_->offset; - } + const offset_type* raw_value_offsets() const { return raw_value_offsets_; } const uint8_t* raw_data() const { return raw_data_; } @@ -109,15 +103,12 @@ class BaseBinaryArray : public FlatArray { /// at the passed index. /// /// Does not perform boundschecking - offset_type value_offset(int64_t i) const { - return raw_value_offsets_[i + data_->offset]; - } + offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; } /// \brief Return the length of the data for the value at the passed index. /// /// Does not perform boundschecking offset_type value_length(int64_t i) const { - i += data_->offset; return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; } @@ -126,8 +117,7 @@ class BaseBinaryArray : public FlatArray { /// less than the size of the data buffer (data_->buffers[2]). offset_type total_values_length() const { if (data_->length > 0) { - return raw_value_offsets_[data_->length + data_->offset] - - raw_value_offsets_[data_->offset]; + return raw_value_offsets_[data_->length] - raw_value_offsets_[0]; } else { return 0; } @@ -144,7 +134,7 @@ class BaseBinaryArray : public FlatArray { // Protected method for constructors void SetData(const std::shared_ptr& data) { this->Array::SetData(data); - raw_value_offsets_ = data->GetValuesSafe(1, /*offset=*/0); + raw_value_offsets_ = data->GetValuesSafe(1); raw_data_ = data->GetValuesSafe(2, /*offset=*/0); } @@ -293,11 +283,11 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); - const uint8_t* GetValue(int64_t i) const; + const uint8_t* GetValue(int64_t i) const { return values_ + i * byte_width_; } const uint8_t* Value(int64_t i) const { return GetValue(i); } std::string_view GetView(int64_t i) const { - return std::string_view(reinterpret_cast(GetValue(i)), byte_width()); + return std::string_view(reinterpret_cast(GetValue(i)), byte_width_); } std::optional operator[](int64_t i) const { @@ -308,7 +298,7 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { int32_t byte_width() const { return byte_width_; } - const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; } + const uint8_t* raw_values() const { return values_; } IteratorType begin() const { return IteratorType(*this); } @@ -319,8 +309,10 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { this->PrimitiveArray::SetData(data); byte_width_ = internal::checked_cast(*type()).byte_width(); + values_ = raw_values_ + data_->offset * byte_width_; } + const uint8_t* values_; int32_t byte_width_; }; diff --git 
a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 7fd76a1dae81b..55e086af30bc2 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -349,7 +349,7 @@ class DictionaryUnifierImpl : public DictionaryUnifier { using MemoTableType = typename DictTraits::MemoTableType; DictionaryUnifierImpl(MemoryPool* pool, std::shared_ptr value_type) - : pool_(pool), value_type_(value_type), memo_table_(pool) {} + : pool_(pool), value_type_(std::move(value_type)), memo_table_(pool) {} Status Unify(const Array& dictionary, std::shared_ptr* out) override { if (dictionary.null_count() > 0) { @@ -432,7 +432,7 @@ struct MakeUnifier { std::unique_ptr result; MakeUnifier(MemoryPool* pool, std::shared_ptr value_type) - : pool(pool), value_type(value_type) {} + : pool(pool), value_type(std::move(value_type)) {} template enable_if_no_memoize Visit(const T&) { diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 47c0fd35829a1..db52551eadc7f 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -461,8 +461,7 @@ inline void SetListData(VarLengthListLikeArray* self, self->Array::SetData(data); self->list_type_ = checked_cast(data->type.get()); - self->raw_value_offsets_ = - data->GetValuesSafe(1, /*offset=*/0); + self->raw_value_offsets_ = data->GetValuesSafe(1); // BaseListViewArray::SetData takes care of setting raw_value_sizes_. ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); @@ -542,7 +541,7 @@ Result> ListArray::FromArrays( const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, int64_t null_count) { return ListArrayFromArrays(std::make_shared(values.type()), offsets, - values, pool, null_bitmap, null_count); + values, pool, std::move(null_bitmap), null_count); } Result> ListArray::FromListView(const ListViewArray& source, @@ -563,7 +562,7 @@ Result> ListArray::FromArrays( return Status::TypeError("Mismatching list value type"); } return ListArrayFromArrays(std::move(type), offsets, values, pool, - null_bitmap, null_count); + std::move(null_bitmap), null_count); } Result> ListArray::Flatten(MemoryPool* memory_pool) const { @@ -599,8 +598,8 @@ Result> LargeListArray::FromArrays( const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, int64_t null_count) { return ListArrayFromArrays( - std::make_shared(values.type()), offsets, values, pool, null_bitmap, - null_count); + std::make_shared(values.type()), offsets, values, pool, + std::move(null_bitmap), null_count); } Result> LargeListArray::FromListView( @@ -622,7 +621,7 @@ Result> LargeListArray::FromArrays( return Status::TypeError("Mismatching list value type"); } return ListArrayFromArrays(std::move(type), offsets, values, pool, - null_bitmap, null_count); + std::move(null_bitmap), null_count); } Result> LargeListArray::Flatten(MemoryPool* memory_pool) const { @@ -654,7 +653,7 @@ ListViewArray::ListViewArray(std::shared_ptr type, int64_t length, void ListViewArray::SetData(const std::shared_ptr& data) { internal::SetListData(this, data); - raw_value_sizes_ = data->GetValuesSafe(2, /*offset=*/0); + raw_value_sizes_ = data->GetValuesSafe(2); } Result> ListViewArray::FromArrays( @@ -662,7 +661,7 @@ Result> ListViewArray::FromArrays( std::shared_ptr null_bitmap, int64_t null_count) { return ListViewArrayFromArrays( std::make_shared(values.type()), offsets, sizes, values, pool, - null_bitmap, null_count); + 
std::move(null_bitmap), null_count); } Result> ListViewArray::FromArrays( @@ -677,7 +676,7 @@ Result> ListViewArray::FromArrays( return Status::TypeError("Mismatching list-view value type"); } return ListViewArrayFromArrays(std::move(type), offsets, sizes, values, - pool, null_bitmap, null_count); + pool, std::move(null_bitmap), null_count); } Result> ListViewArray::FromList(const ListArray& source, @@ -722,14 +721,14 @@ LargeListViewArray::LargeListViewArray(std::shared_ptr type, int64_t l std::shared_ptr null_bitmap, int64_t null_count, int64_t offset) { LargeListViewArray::SetData(ArrayData::Make( - type, length, + std::move(type), length, {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, /*child_data=*/{values->data()}, null_count, offset)); } void LargeListViewArray::SetData(const std::shared_ptr& data) { internal::SetListData(this, data); - raw_value_sizes_ = data->GetValuesSafe(2, /*offset=*/0); + raw_value_sizes_ = data->GetValuesSafe(2); } Result> LargeListViewArray::FromArrays( @@ -737,7 +736,7 @@ Result> LargeListViewArray::FromArrays( std::shared_ptr null_bitmap, int64_t null_count) { return ListViewArrayFromArrays( std::make_shared(values.type()), offsets, sizes, values, pool, - null_bitmap, null_count); + std::move(null_bitmap), null_count); } Result> LargeListViewArray::FromArrays( @@ -752,7 +751,7 @@ Result> LargeListViewArray::FromArrays( return Status::TypeError("Mismatching large list-view value type"); } return ListViewArrayFromArrays( - std::move(type), offsets, sizes, values, pool, null_bitmap, null_count); + std::move(type), offsets, sizes, values, pool, std::move(null_bitmap), null_count); } Result> LargeListViewArray::Flatten( @@ -854,8 +853,9 @@ Result> MapArray::FromArraysInternal( null_count = kUnknownNullCount; } buffers[1] = typed_offsets.values(); - return std::make_shared(type, offsets->length() - 1, std::move(buffers), keys, - items, /*null_count=*/null_count, offsets->offset()); + return std::make_shared(std::move(type), offsets->length() - 1, + std::move(buffers), keys, items, + /*null_count=*/null_count, offsets->offset()); } Result> MapArray::FromArrays(const std::shared_ptr& offsets, @@ -971,8 +971,8 @@ Result> FixedSizeListArray::FromArrays( int64_t length = values->length() / list_size; auto list_type = std::make_shared(values->type(), list_size); - return std::make_shared(list_type, length, values, null_bitmap, - null_count); + return std::make_shared(list_type, length, values, + std::move(null_bitmap), null_count); } Result> FixedSizeListArray::FromArrays( @@ -992,8 +992,8 @@ Result> FixedSizeListArray::FromArrays( } int64_t length = values->length() / list_type.list_size(); - return std::make_shared(type, length, values, null_bitmap, - null_count); + return std::make_shared(std::move(type), length, values, + std::move(null_bitmap), null_count); } Result> FixedSizeListArray::Flatten( @@ -1015,7 +1015,7 @@ StructArray::StructArray(const std::shared_ptr& type, int64_t length, std::shared_ptr null_bitmap, int64_t null_count, int64_t offset) { ARROW_CHECK_EQ(type->id(), Type::STRUCT); - SetData(ArrayData::Make(type, length, {null_bitmap}, null_count, offset)); + SetData(ArrayData::Make(type, length, {std::move(null_bitmap)}, null_count, offset)); for (const auto& child : children) { data_->child_data.push_back(child->data()); } @@ -1048,7 +1048,7 @@ Result> StructArray::Make( null_count = 0; } return std::make_shared(struct_(fields), length - offset, children, - null_bitmap, null_count, offset); + std::move(null_bitmap), 
null_count, offset); } Result> StructArray::Make( @@ -1085,8 +1085,8 @@ const std::shared_ptr& StructArray::field(int i) const { } else { field_data = data_->child_data[i]; } - std::shared_ptr result = MakeArray(field_data); - std::atomic_store(&boxed_fields_[i], result); + result = MakeArray(field_data); + std::atomic_store(&boxed_fields_[i], std::move(result)); return boxed_fields_[i]; } return boxed_fields_[i]; @@ -1183,7 +1183,7 @@ void UnionArray::SetData(std::shared_ptr data) { union_type_ = checked_cast(data_->type.get()); ARROW_CHECK_GE(data_->buffers.size(), 2); - raw_type_codes_ = data->GetValuesSafe(1, /*offset=*/0); + raw_type_codes_ = data->GetValuesSafe(1); boxed_fields_.resize(data_->child_data.size()); } @@ -1205,7 +1205,7 @@ void DenseUnionArray::SetData(const std::shared_ptr& data) { // No validity bitmap ARROW_CHECK_EQ(data_->buffers[0], nullptr); - raw_value_offsets_ = data->GetValuesSafe(2, /*offset=*/0); + raw_value_offsets_ = data->GetValuesSafe(2); } SparseUnionArray::SparseUnionArray(std::shared_ptr data) { diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index a6d4977839ef1..f122f9378b525 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -94,15 +94,11 @@ class VarLengthListLikeArray : public Array { const std::shared_ptr& value_type() const { return list_type_->value_type(); } /// Return pointer to raw value offsets accounting for any slice offset - const offset_type* raw_value_offsets() const { - return raw_value_offsets_ + data_->offset; - } + const offset_type* raw_value_offsets() const { return raw_value_offsets_; } // The following functions will not perform boundschecking - offset_type value_offset(int64_t i) const { - return raw_value_offsets_[i + data_->offset]; - } + offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; } /// \brief Return the size of the value at a particular index /// @@ -154,7 +150,6 @@ class BaseListArray : public VarLengthListLikeArray { /// /// \pre IsValid(i) offset_type value_length(int64_t i) const final { - i += this->data_->offset; return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i]; } }; @@ -302,9 +297,7 @@ class BaseListViewArray : public VarLengthListLikeArray { const std::shared_ptr& value_sizes() const { return this->data_->buffers[2]; } /// \brief Return pointer to raw value offsets accounting for any slice offset - const offset_type* raw_value_sizes() const { - return raw_value_sizes_ + this->data_->offset; - } + const offset_type* raw_value_sizes() const { return raw_value_sizes_; } /// \brief Return the size of the value at a particular index /// @@ -313,9 +306,7 @@ class BaseListViewArray : public VarLengthListLikeArray { /// length of the child values array. /// /// \pre IsValid(i) - offset_type value_length(int64_t i) const final { - return this->raw_value_sizes_[i + this->data_->offset]; - } + offset_type value_length(int64_t i) const final { return this->raw_value_sizes_[i]; } protected: const offset_type* raw_value_sizes_ = NULLPTR; @@ -744,15 +735,13 @@ class ARROW_EXPORT UnionArray : public Array { /// Note that this buffer does not account for any slice offset const std::shared_ptr& type_codes() const { return data_->buffers[1]; } - const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; } + const type_code_t* raw_type_codes() const { return raw_type_codes_; } /// The logical type code of the value at index. 
- type_code_t type_code(int64_t i) const { return raw_type_codes_[i + data_->offset]; } + type_code_t type_code(int64_t i) const { return raw_type_codes_[i]; } /// The physical child id containing value at index. - int child_id(int64_t i) const { - return union_type_->child_ids()[raw_type_codes_[i + data_->offset]]; - } + int child_id(int64_t i) const { return union_type_->child_ids()[raw_type_codes_[i]]; } const UnionType* union_type() const { return union_type_; } @@ -883,9 +872,9 @@ class ARROW_EXPORT DenseUnionArray : public UnionArray { /// Note that this buffer does not account for any slice offset const std::shared_ptr& value_offsets() const { return data_->buffers[2]; } - int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } + int32_t value_offset(int64_t i) const { return raw_value_offsets_[i]; } - const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } + const int32_t* raw_value_offsets() const { return raw_value_offsets_; } protected: const int32_t* raw_value_offsets_; diff --git a/cpp/src/arrow/array/array_primitive.cc b/cpp/src/arrow/array/array_primitive.cc index da3810aa392c9..10d4e9e6aa284 100644 --- a/cpp/src/arrow/array/array_primitive.cc +++ b/cpp/src/arrow/array/array_primitive.cc @@ -78,15 +78,16 @@ DayTimeIntervalArray::DayTimeIntervalArray(const std::shared_ptr& type int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, - int64_t null_count, int64_t offset) - : PrimitiveArray(type, length, data, null_bitmap, null_count, offset) {} + int64_t null_count, int64_t offset) { + SetData(ArrayData::Make(type, length, {null_bitmap, data}, null_count, offset)); +} DayTimeIntervalArray::DayTimeIntervalArray(int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) - : PrimitiveArray(day_time_interval(), length, data, null_bitmap, null_count, offset) { -} + : DayTimeIntervalArray(day_time_interval(), length, data, null_bitmap, null_count, + offset) {} DayTimeIntervalType::DayMilliseconds DayTimeIntervalArray::GetValue(int64_t i) const { DCHECK(i < length()); @@ -105,14 +106,15 @@ MonthDayNanoIntervalArray::MonthDayNanoIntervalArray( MonthDayNanoIntervalArray::MonthDayNanoIntervalArray( const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, - int64_t null_count, int64_t offset) - : PrimitiveArray(type, length, data, null_bitmap, null_count, offset) {} + int64_t null_count, int64_t offset) { + SetData(ArrayData::Make(type, length, {null_bitmap, data}, null_count, offset)); +} MonthDayNanoIntervalArray::MonthDayNanoIntervalArray( int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) - : PrimitiveArray(month_day_nano_interval(), length, data, null_bitmap, null_count, - offset) {} + : MonthDayNanoIntervalArray(month_day_nano_interval(), length, data, null_bitmap, + null_count, offset) {} MonthDayNanoIntervalType::MonthDayNanos MonthDayNanoIntervalArray::GetValue( int64_t i) const { diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index e6df92e3b788c..3e2893b7dd898 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -90,7 +90,7 @@ class NumericArray : public PrimitiveArray { using value_type = typename TypeClass::c_type; using IteratorType = stl::ArrayIterator>; - explicit NumericArray(const std::shared_ptr& data) : 
PrimitiveArray(data) {} + explicit NumericArray(const std::shared_ptr& data) { SetData(data); } // Only enable this constructor without a type argument for types without additional // metadata @@ -98,18 +98,17 @@ class NumericArray : public PrimitiveArray { NumericArray(enable_if_parameter_free length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0) - : PrimitiveArray(TypeTraits::type_singleton(), length, data, null_bitmap, - null_count, offset) {} - - const value_type* raw_values() const { - return reinterpret_cast(raw_values_) + data_->offset; + int64_t null_count = kUnknownNullCount, int64_t offset = 0) { + SetData(ArrayData::Make(TypeTraits::type_singleton(), length, {null_bitmap, data}, + null_count, offset)); } - value_type Value(int64_t i) const { return raw_values()[i]; } + const value_type* raw_values() const { return values_; } + + value_type Value(int64_t i) const { return values_[i]; } // For API compatibility with BinaryArray etc. - value_type GetView(int64_t i) const { return Value(i); } + value_type GetView(int64_t i) const { return values_[i]; } std::optional operator[](int64_t i) const { return *IteratorType(*this, i); @@ -121,6 +120,15 @@ class NumericArray : public PrimitiveArray { protected: using PrimitiveArray::PrimitiveArray; + + void SetData(const std::shared_ptr& data) { + this->PrimitiveArray::SetData(data); + values_ = raw_values_ + ? (reinterpret_cast(raw_values_) + data_->offset) + : NULLPTR; + } + + const value_type* values_; }; /// DayTimeArray diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 32806d9d2edb3..73e0c692432b6 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -3709,6 +3709,132 @@ TEST(TestSwapEndianArrayData, InvalidLength) { } } +class TestArrayDataStatistics : public ::testing::Test { + public: + void SetUp() { + valids_ = {1, 0, 1, 1}; + null_count_ = std::count(valids_.begin(), valids_.end(), 0); + null_buffer_ = *internal::BytesToBits(valids_); + values_ = {1, 0, 3, -4}; + min_ = *std::min_element(values_.begin(), values_.end()); + max_ = *std::max_element(values_.begin(), values_.end()); + values_buffer_ = Buffer::FromVector(values_); + data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_, values_buffer_}, + null_count_); + data_->statistics = std::make_shared(); + data_->statistics->null_count = null_count_; + data_->statistics->min = min_; + data_->statistics->is_min_exact = true; + data_->statistics->max = max_; + data_->statistics->is_max_exact = true; + } + + protected: + std::vector valids_; + size_t null_count_; + std::shared_ptr null_buffer_; + std::vector values_; + int64_t min_; + int64_t max_; + std::shared_ptr values_buffer_; + std::shared_ptr data_; +}; + +TEST_F(TestArrayDataStatistics, MoveConstructor) { + ArrayData copied_data(*data_); + ArrayData moved_data(std::move(copied_data)); + + ASSERT_TRUE(moved_data.statistics->null_count.has_value()); + ASSERT_EQ(null_count_, moved_data.statistics->null_count.value()); + + ASSERT_TRUE(moved_data.statistics->min.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics->min.value())); + ASSERT_EQ(min_, std::get(moved_data.statistics->min.value())); + ASSERT_TRUE(moved_data.statistics->is_min_exact); + + ASSERT_TRUE(moved_data.statistics->max.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics->max.value())); + ASSERT_EQ(max_, std::get(moved_data.statistics->max.value())); 
+ ASSERT_TRUE(moved_data.statistics->is_max_exact); +} + +TEST_F(TestArrayDataStatistics, CopyConstructor) { + ArrayData copied_data(*data_); + + ASSERT_TRUE(copied_data.statistics->null_count.has_value()); + ASSERT_EQ(null_count_, copied_data.statistics->null_count.value()); + + ASSERT_TRUE(copied_data.statistics->min.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics->min.value())); + ASSERT_EQ(min_, std::get(copied_data.statistics->min.value())); + ASSERT_TRUE(copied_data.statistics->is_min_exact); + + ASSERT_TRUE(copied_data.statistics->max.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics->max.value())); + ASSERT_EQ(max_, std::get(copied_data.statistics->max.value())); + ASSERT_TRUE(copied_data.statistics->is_max_exact); +} + +TEST_F(TestArrayDataStatistics, MoveAssignment) { + ArrayData copied_data(*data_); + ArrayData moved_data; + moved_data = std::move(copied_data); + + ASSERT_TRUE(moved_data.statistics->null_count.has_value()); + ASSERT_EQ(null_count_, moved_data.statistics->null_count.value()); + + ASSERT_TRUE(moved_data.statistics->min.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics->min.value())); + ASSERT_EQ(min_, std::get(moved_data.statistics->min.value())); + ASSERT_TRUE(moved_data.statistics->is_min_exact); + + ASSERT_TRUE(moved_data.statistics->max.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics->max.value())); + ASSERT_EQ(max_, std::get(moved_data.statistics->max.value())); + ASSERT_TRUE(moved_data.statistics->is_max_exact); +} + +TEST_F(TestArrayDataStatistics, CopyAssignment) { + ArrayData copied_data; + copied_data = *data_; + + ASSERT_TRUE(copied_data.statistics->null_count.has_value()); + ASSERT_EQ(null_count_, copied_data.statistics->null_count.value()); + + ASSERT_TRUE(copied_data.statistics->min.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics->min.value())); + ASSERT_EQ(min_, std::get(copied_data.statistics->min.value())); + ASSERT_TRUE(copied_data.statistics->is_min_exact); + + ASSERT_TRUE(copied_data.statistics->max.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics->max.value())); + ASSERT_EQ(max_, std::get(copied_data.statistics->max.value())); + ASSERT_TRUE(copied_data.statistics->is_max_exact); +} + +TEST_F(TestArrayDataStatistics, CopyTo) { + ASSERT_OK_AND_ASSIGN(auto copied_data, + data_->CopyTo(arrow::default_cpu_memory_manager())); + + ASSERT_TRUE(copied_data->statistics->null_count.has_value()); + ASSERT_EQ(null_count_, copied_data->statistics->null_count.value()); + + ASSERT_TRUE(copied_data->statistics->min.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data->statistics->min.value())); + ASSERT_EQ(min_, std::get(copied_data->statistics->min.value())); + ASSERT_TRUE(copied_data->statistics->is_min_exact); + + ASSERT_TRUE(copied_data->statistics->max.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data->statistics->max.value())); + ASSERT_EQ(max_, std::get(copied_data->statistics->max.value())); + ASSERT_TRUE(copied_data->statistics->is_max_exact); +} + +TEST_F(TestArrayDataStatistics, Slice) { + auto sliced_data = data_->Slice(0, 1); + ASSERT_FALSE(sliced_data->statistics); +} + template class TestPrimitiveArray : public ::testing::Test { public: diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index d825f7d32520a..442e4a26320a2 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -500,9 +500,9 @@ class 
ARROW_EXPORT StringHeapBuilder { ARROW_RETURN_NOT_OK(Reserve(length)); } - auto v = - util::ToBinaryView(value, static_cast(length), - static_cast(blocks_.size() - 1), current_offset_); + auto v = util::ToNonInlineBinaryView(value, static_cast(length), + static_cast(blocks_.size() - 1), + current_offset_); memcpy(current_out_buffer_, value, static_cast(length)); current_out_buffer_ += length; diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 87e55246c78fe..b4638dd6593d8 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -75,6 +75,31 @@ struct Bitmap { bool AllSet() const { return data == nullptr; } }; +enum class OffsetBufferOpOutcome { + kOk, + kOffsetOverflow, +}; + +Status OffsetOverflowStatus() { + return Status::Invalid("offset overflow while concatenating arrays"); +} + +#define RETURN_IF_NOT_OK_OUTCOME(outcome) \ + switch (outcome) { \ + case OffsetBufferOpOutcome::kOk: \ + break; \ + case OffsetBufferOpOutcome::kOffsetOverflow: \ + return OffsetOverflowStatus(); \ + } + +struct ErrorHints { + /// \brief Suggested cast to avoid overflow during concatenation. + /// + /// If the concatenation of offsets overflows, this field might be set to + /// a type that uses larger offsets (e.g. large_utf8, large_list). + std::shared_ptr<DataType> suggested_cast; +}; + // Allocate a buffer and concatenate bitmaps into it. Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* pool, std::shared_ptr* out) { @@ -112,15 +137,16 @@ int64_t SumBufferSizesInBytes(const BufferVector& buffers) { // Write offsets in src into dst, adjusting them such that first_offset // will be the first offset written. template -Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, - Range* values_range); +Result<OffsetBufferOpOutcome> PutOffsets(const Buffer& src, Offset first_offset, + Offset* dst, Range* values_range); // Concatenate buffers holding offsets into a single buffer of offsets, // also computing the ranges of values spanned by each buffer of offsets.
template -Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, - std::shared_ptr* out, - std::vector* values_ranges) { +Result ConcatenateOffsets(const BufferVector& buffers, + MemoryPool* pool, + std::shared_ptr* out, + std::vector* values_ranges) { values_ranges->resize(buffers.size()); // allocate output buffer @@ -133,26 +159,30 @@ Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, for (size_t i = 0; i < buffers.size(); ++i) { // the first offset from buffers[i] will be adjusted to values_length // (the cumulative length of values spanned by offsets in previous buffers) - RETURN_NOT_OK(PutOffsets(*buffers[i], values_length, - out_data + elements_length, &(*values_ranges)[i])); + ARROW_ASSIGN_OR_RAISE(auto outcome, PutOffsets(*buffers[i], values_length, + out_data + elements_length, + &(*values_ranges)[i])); + if (ARROW_PREDICT_FALSE(outcome != OffsetBufferOpOutcome::kOk)) { + return outcome; + } elements_length += buffers[i]->size() / sizeof(Offset); values_length += static_cast((*values_ranges)[i].length); } // the final element in out_data is the length of all values spanned by the offsets out_data[out_size_in_bytes / sizeof(Offset)] = values_length; - return Status::OK(); + return OffsetBufferOpOutcome::kOk; } template -Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, - Range* values_range) { +Result PutOffsets(const Buffer& src, Offset first_offset, + Offset* dst, Range* values_range) { if (src.size() == 0) { // It's allowed to have an empty offsets buffer for a 0-length array // (see Array::Validate) values_range->offset = 0; values_range->length = 0; - return Status::OK(); + return OffsetBufferOpOutcome::kOk; } // Get the range of offsets to transfer from src @@ -162,8 +192,9 @@ Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, // Compute the range of values which is spanned by this range of offsets values_range->offset = src_begin[0]; values_range->length = *src_end - values_range->offset; - if (first_offset > std::numeric_limits::max() - values_range->length) { - return Status::Invalid("offset overflow while concatenating arrays"); + if (ARROW_PREDICT_FALSE(first_offset > + std::numeric_limits::max() - values_range->length)) { + return OffsetBufferOpOutcome::kOffsetOverflow; } // Write offsets into dst, ensuring that the first offset written is @@ -175,12 +206,14 @@ Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, std::transform(src_begin, src_end, dst, [displacement](Offset offset) { return SafeSignedAdd(offset, displacement); }); - return Status::OK(); + return OffsetBufferOpOutcome::kOk; } template -Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, - offset_type displacement, offset_type* dst); +Result PutListViewOffsets(const ArrayData& input, + offset_type* sizes, const Buffer& src, + offset_type displacement, + offset_type* dst); // Concatenate buffers holding list-view offsets into a single buffer of offsets // @@ -198,10 +231,10 @@ Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buff // \param[in] in The child arrays // \param[in,out] sizes The concatenated sizes buffer template -Status ConcatenateListViewOffsets(const ArrayDataVector& in, offset_type* sizes, - const BufferVector& offset_buffers, - const std::vector& value_ranges, - MemoryPool* pool, std::shared_ptr* out) { +Result ConcatenateListViewOffsets( + const ArrayDataVector& in, offset_type* sizes, const BufferVector& offset_buffers, + 
const std::vector& value_ranges, MemoryPool* pool, + std::shared_ptr* out) { DCHECK_EQ(offset_buffers.size(), value_ranges.size()); // Allocate resulting offsets buffer and initialize it with zeros @@ -216,26 +249,32 @@ Status ConcatenateListViewOffsets(const ArrayDataVector& in, offset_type* sizes, for (size_t i = 0; i < offset_buffers.size(); ++i) { const auto displacement = static_cast(num_child_values - value_ranges[i].offset); - RETURN_NOT_OK(PutListViewOffsets(*in[i], /*sizes=*/sizes + elements_length, - /*src=*/*offset_buffers[i], displacement, - /*dst=*/out_offsets + elements_length)); + ARROW_ASSIGN_OR_RAISE(auto outcome, + PutListViewOffsets(*in[i], /*sizes=*/sizes + elements_length, + /*src=*/*offset_buffers[i], displacement, + /*dst=*/out_offsets + elements_length)); + if (ARROW_PREDICT_FALSE(outcome != OffsetBufferOpOutcome::kOk)) { + return outcome; + } elements_length += offset_buffers[i]->size() / sizeof(offset_type); num_child_values += value_ranges[i].length; if (num_child_values > std::numeric_limits::max()) { - return Status::Invalid("offset overflow while concatenating arrays"); + return OffsetBufferOpOutcome::kOffsetOverflow; } } DCHECK_EQ(elements_length, static_cast(out_size_in_bytes / sizeof(offset_type))); - return Status::OK(); + return OffsetBufferOpOutcome::kOk; } template -Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, - offset_type displacement, offset_type* dst) { +Result PutListViewOffsets(const ArrayData& input, + offset_type* sizes, const Buffer& src, + offset_type displacement, + offset_type* dst) { if (src.size() == 0) { - return Status::OK(); + return OffsetBufferOpOutcome::kOk; } const auto& validity_buffer = input.buffers[0]; if (validity_buffer) { @@ -291,7 +330,7 @@ Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buff } } } - return Status::OK(); + return OffsetBufferOpOutcome::kOk; } class ConcatenateImpl { @@ -316,11 +355,17 @@ class ConcatenateImpl { } } - Status Concatenate(std::shared_ptr* out) && { + Status Concatenate(std::shared_ptr* out, ErrorHints* out_hints) && { if (out_->null_count != 0 && internal::may_have_validity_bitmap(out_->type->id())) { RETURN_NOT_OK(ConcatenateBitmaps(Bitmaps(0), pool_, &out_->buffers[0])); } - RETURN_NOT_OK(VisitTypeInline(*out_->type, this)); + auto status = VisitTypeInline(*out_->type, this); + if (!status.ok()) { + if (out_hints) { + out_hints->suggested_cast = std::move(suggested_cast_); + } + return status; + } *out = std::move(out_); return Status::OK(); } @@ -337,11 +382,29 @@ class ConcatenateImpl { return ConcatenateBuffers(buffers, pool_).Value(&out_->buffers[1]); } - Status Visit(const BinaryType&) { + Status Visit(const BinaryType& input_type) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t))); - RETURN_NOT_OK(ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], - &value_ranges)); + ARROW_ASSIGN_OR_RAISE( + auto outcome, ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], + &value_ranges)); + switch (outcome) { + case OffsetBufferOpOutcome::kOk: + break; + case OffsetBufferOpOutcome::kOffsetOverflow: + switch (input_type.id()) { + case Type::BINARY: + suggested_cast_ = large_binary(); + break; + case Type::STRING: + suggested_cast_ = large_utf8(); + break; + default: + DCHECK(false) << "unexpected type id from BinaryType: " << input_type; + break; + } + return OffsetOverflowStatus(); + } ARROW_ASSIGN_OR_RAISE(auto value_buffers, Buffers(2, value_ranges)); return 
ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]); } @@ -349,8 +412,10 @@ class ConcatenateImpl { Status Visit(const LargeBinaryType&) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int64_t))); - RETURN_NOT_OK(ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], - &value_ranges)); + ARROW_ASSIGN_OR_RAISE( + auto outcome, ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], + &value_ranges)); + RETURN_IF_NOT_OK_OUTCOME(outcome); ARROW_ASSIGN_OR_RAISE(auto value_buffers, Buffers(2, value_ranges)); return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]); } @@ -394,22 +459,44 @@ class ConcatenateImpl { return Status::OK(); } - Status Visit(const ListType&) { + Status Visit(const ListType& input_type) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t))); - RETURN_NOT_OK(ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], - &value_ranges)); + ARROW_ASSIGN_OR_RAISE(auto offsets_outcome, + ConcatenateOffsets(index_buffers, pool_, + &out_->buffers[1], &value_ranges)); + switch (offsets_outcome) { + case OffsetBufferOpOutcome::kOk: + break; + case OffsetBufferOpOutcome::kOffsetOverflow: + suggested_cast_ = large_list(input_type.value_type()); + return OffsetOverflowStatus(); + } ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, value_ranges)); - return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); + ErrorHints child_error_hints; + auto status = ConcatenateImpl(child_data, pool_) + .Concatenate(&out_->child_data[0], &child_error_hints); + if (!status.ok() && child_error_hints.suggested_cast) { + suggested_cast_ = list(std::move(child_error_hints.suggested_cast)); + } + return status; } Status Visit(const LargeListType&) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int64_t))); - RETURN_NOT_OK(ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], - &value_ranges)); + ARROW_ASSIGN_OR_RAISE( + auto outcome, ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], + &value_ranges)); + RETURN_IF_NOT_OK_OUTCOME(outcome); ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, value_ranges)); - return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); + ErrorHints child_error_hints; + auto status = ConcatenateImpl(child_data, pool_) + .Concatenate(&out_->child_data[0], &child_error_hints); + if (!status.ok() && child_error_hints.suggested_cast) { + suggested_cast_ = large_list(std::move(child_error_hints.suggested_cast)); + } + return status; } template @@ -430,8 +517,17 @@ class ConcatenateImpl { } // Concatenate the values + ErrorHints child_error_hints; ARROW_ASSIGN_OR_RAISE(ArrayDataVector value_data, ChildData(0, value_ranges)); - RETURN_NOT_OK(ConcatenateImpl(value_data, pool_).Concatenate(&out_->child_data[0])); + auto values_status = ConcatenateImpl(value_data, pool_) + .Concatenate(&out_->child_data[0], &child_error_hints); + if (!values_status.ok()) { + if (child_error_hints.suggested_cast) { + suggested_cast_ = std::make_shared>( + std::move(child_error_hints.suggested_cast)); + } + return values_status; + } out_->child_data[0]->type = type.value_type(); // Concatenate the sizes first @@ -440,22 +536,39 @@ class ConcatenateImpl { // Concatenate the offsets ARROW_ASSIGN_OR_RAISE(auto offset_buffers, Buffers(1, sizeof(offset_type))); - RETURN_NOT_OK(ConcatenateListViewOffsets( - in_, /*sizes=*/out_->buffers[2]->mutable_data_as(), offset_buffers, - value_ranges, pool_, 
&out_->buffers[1])); - + ARROW_ASSIGN_OR_RAISE( + auto outcome, ConcatenateListViewOffsets( + in_, /*sizes=*/out_->buffers[2]->mutable_data_as(), + offset_buffers, value_ranges, pool_, &out_->buffers[1])); + switch (outcome) { + case OffsetBufferOpOutcome::kOk: + break; + case OffsetBufferOpOutcome::kOffsetOverflow: + if constexpr (T::type_id == Type::LIST_VIEW) { + suggested_cast_ = large_list_view(type.value_type()); + } + return OffsetOverflowStatus(); + } return Status::OK(); } - Status Visit(const FixedSizeListType& fixed_size_list) { - ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fixed_size_list.list_size())); - return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); + Status Visit(const FixedSizeListType& fsl_type) { + ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fsl_type.list_size())); + ErrorHints hints; + auto status = + ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0], &hints); + if (!status.ok() && hints.suggested_cast) { + suggested_cast_ = + fixed_size_list(std::move(hints.suggested_cast), fsl_type.list_size()); + } + return status; } Status Visit(const StructType& s) { for (int i = 0; i < s.num_fields(); ++i) { ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i)); - RETURN_NOT_OK(ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i])); + RETURN_NOT_OK(ConcatenateImpl(child_data, pool_) + .Concatenate(&out_->child_data[i], /*hints=*/nullptr)); } return Status::OK(); } @@ -570,8 +683,8 @@ class ConcatenateImpl { case UnionMode::SPARSE: { for (int i = 0; i < u.num_fields(); i++) { ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i)); - RETURN_NOT_OK( - ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i])); + RETURN_NOT_OK(ConcatenateImpl(child_data, pool_) + .Concatenate(&out_->child_data[i], /*hints=*/nullptr)); } break; } @@ -581,8 +694,8 @@ class ConcatenateImpl { for (size_t j = 0; j < in_.size(); j++) { child_data[j] = in_[j]->child_data[i]; } - RETURN_NOT_OK( - ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i])); + RETURN_NOT_OK(ConcatenateImpl(child_data, pool_) + .Concatenate(&out_->child_data[i], /*hints=*/nullptr)); } break; } @@ -666,7 +779,8 @@ class ConcatenateImpl { storage_data[i]->type = e.storage_type(); } std::shared_ptr out_storage; - RETURN_NOT_OK(ConcatenateImpl(storage_data, pool_).Concatenate(&out_storage)); + RETURN_NOT_OK(ConcatenateImpl(storage_data, pool_) + .Concatenate(&out_storage, /*hints=*/nullptr)); out_storage->type = in_[0]->type; out_ = std::move(out_storage); return Status::OK(); @@ -797,11 +911,18 @@ class ConcatenateImpl { const ArrayDataVector& in_; MemoryPool* pool_; std::shared_ptr out_; + std::shared_ptr suggested_cast_; }; } // namespace -Result> Concatenate(const ArrayVector& arrays, MemoryPool* pool) { +namespace internal { + +Result> Concatenate( + const ArrayVector& arrays, MemoryPool* pool, + std::shared_ptr* out_suggested_cast) { + DCHECK(out_suggested_cast); + *out_suggested_cast = nullptr; if (arrays.size() == 0) { return Status::Invalid("Must pass at least one array"); } @@ -818,8 +939,31 @@ Result> Concatenate(const ArrayVector& arrays, MemoryPool } std::shared_ptr out_data; - RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(&out_data)); + ErrorHints hints; + auto status = ConcatenateImpl(data, pool).Concatenate(&out_data, &hints); + if (!status.ok()) { + if (hints.suggested_cast) { + DCHECK(status.IsInvalid()); + *out_suggested_cast = std::move(hints.suggested_cast); + } + return status; + } return 
MakeArray(std::move(out_data)); } +} // namespace internal + +Result> Concatenate(const ArrayVector& arrays, MemoryPool* pool) { + std::shared_ptr suggested_cast; + auto result = internal::Concatenate(arrays, pool, &suggested_cast); + if (!result.ok() && suggested_cast && arrays.size() > 0) { + DCHECK(result.status().IsInvalid()); + return Status::Invalid(result.status().message(), ", consider casting input from `", + *arrays[0]->type(), "` to `", *suggested_cast, "` first."); + } + return result; +} + +#undef RETURN_IF_NOT_OK_OUTCOME + } // namespace arrow diff --git a/cpp/src/arrow/array/concatenate.h b/cpp/src/arrow/array/concatenate.h index e7597aad812c4..aada5624d63a3 100644 --- a/cpp/src/arrow/array/concatenate.h +++ b/cpp/src/arrow/array/concatenate.h @@ -24,6 +24,22 @@ #include "arrow/util/visibility.h" namespace arrow { +namespace internal { + +/// \brief Concatenate arrays +/// +/// \param[in] arrays a vector of arrays to be concatenated +/// \param[in] pool memory to store the result will be allocated from this memory pool +/// \param[out] out_suggested_cast if a non-OK Result is returned, the function might set +/// out_suggested_cast to a cast suggestion that would allow concatenating the arrays +/// without overflow of offsets (e.g. string to large_string) +/// +/// \return the concatenated array +ARROW_EXPORT +Result> Concatenate(const ArrayVector& arrays, MemoryPool* pool, + std::shared_ptr* out_suggested_cast); + +} // namespace internal /// \brief Concatenate arrays /// diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index af595e897f9ee..aea5311575299 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -29,6 +29,7 @@ #include #include +#include #include #include "arrow/array.h" @@ -42,6 +43,7 @@ #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/util/list_util.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -661,14 +663,103 @@ TEST_F(ConcatenateTest, ExtensionType) { }); } +std::shared_ptr LargeVersionOfType(const std::shared_ptr& type) { + switch (type->id()) { + case Type::BINARY: + return large_binary(); + case Type::STRING: + return large_utf8(); + case Type::LIST: + return large_list(static_cast(*type).value_type()); + case Type::LIST_VIEW: + return large_list_view(static_cast(*type).value_type()); + case Type::LARGE_BINARY: + case Type::LARGE_STRING: + case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: + return type; + default: + Unreachable(); + } +} + +std::shared_ptr fixed_size_list_of_1(std::shared_ptr type) { + return fixed_size_list(std::move(type), 1); +} + TEST_F(ConcatenateTest, OffsetOverflow) { - auto fake_long = ArrayFromJSON(utf8(), "[\"\"]"); - fake_long->data()->GetMutableValues(1)[1] = + using TypeFactory = std::shared_ptr (*)(std::shared_ptr); + static const std::vector kNestedTypeFactories = { + list, large_list, list_view, large_list_view, fixed_size_list_of_1, + }; + + auto* pool = default_memory_pool(); + std::shared_ptr suggested_cast; + for (auto& ty : {binary(), utf8()}) { + auto large_ty = LargeVersionOfType(ty); + + auto fake_long = ArrayFromJSON(ty, "[\"\"]"); + fake_long->data()->GetMutableValues(1)[1] = + std::numeric_limits::max(); + // XXX: since the data fake_long claims to own isn't there, this would + // segfault if Concatenate didn't detect overflow and raise an error. 
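A sketch of how a caller might use the new internal::Concatenate overload together with the suggested cast it reports. Illustration only: the helper name ConcatenateOrUpcast and the use of compute::Cast for the retry are assumptions, not part of this patch (and internal::Concatenate is, as the namespace says, an internal API).

#include "arrow/api.h"
#include "arrow/array/concatenate.h"
#include "arrow/compute/api.h"

// Try to concatenate; on offset overflow, cast the inputs to the suggested
// larger-offset type (e.g. utf8 -> large_utf8) and retry once.
arrow::Result<std::shared_ptr<arrow::Array>> ConcatenateOrUpcast(
    const arrow::ArrayVector& arrays, arrow::MemoryPool* pool) {
  std::shared_ptr<arrow::DataType> suggested_cast;
  auto result = arrow::internal::Concatenate(arrays, pool, &suggested_cast);
  if (result.ok() || !suggested_cast) {
    return result;
  }
  arrow::ArrayVector casted(arrays.size());
  for (size_t i = 0; i < arrays.size(); ++i) {
    ARROW_ASSIGN_OR_RAISE(arrow::Datum datum,
                          arrow::compute::Cast(arrays[i], suggested_cast));
    casted[i] = datum.make_array();
  }
  return arrow::Concatenate(casted, pool);
}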
+ auto concatenate_status = Concatenate({fake_long, fake_long}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + ::testing::StrEq("Invalid: offset overflow while concatenating arrays, " + "consider casting input from `" + + ty->ToString() + "` to `large_" + ty->ToString() + "` first."), + concatenate_status); + + concatenate_status = + internal::Concatenate({fake_long, fake_long}, pool, &suggested_cast); + // The message doesn't contain the suggested cast type when the caller + // asks for it by passing the output parameter. + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::StrEq("Invalid: offset overflow while concatenating arrays"), + concatenate_status); + ASSERT_TRUE(large_ty->Equals(*suggested_cast)); + + // Check that the suggested cast is correct when concatenation + // fails due to the child array being too large. + for (auto factory : kNestedTypeFactories) { + auto nested_ty = factory(ty); + auto expected_suggestion = factory(large_ty); + auto fake_long_list = ArrayFromJSON(nested_ty, "[[\"\"]]"); + fake_long_list->data()->child_data[0] = fake_long->data(); + + ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list, fake_long_list}, pool, + &suggested_cast) + .status()); + ASSERT_TRUE(suggested_cast->Equals(*expected_suggestion)); + } + } + + auto list_ty = list(utf8()); + auto fake_long_list = ArrayFromJSON(list_ty, "[[\"Hello\"]]"); + fake_long_list->data()->GetMutableValues<int32_t>(1)[1] = std::numeric_limits<int32_t>::max(); - std::shared_ptr concatenated; - // XX since the data fake_long claims to own isn't there, this will segfault if - // Concatenate doesn't detect overflow and raise an error. - ASSERT_RAISES(Invalid, Concatenate({fake_long, fake_long}).status()); + ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list, fake_long_list}, pool, + &suggested_cast) + .status()); + ASSERT_TRUE(suggested_cast->Equals(LargeVersionOfType(list_ty))); + + auto list_view_ty = list_view(null()); + auto fake_long_list_view = ArrayFromJSON(list_view_ty, "[[], []]"); + { + constexpr int kInt32Max = std::numeric_limits<int32_t>::max(); + auto* values = fake_long_list_view->data()->child_data[0].get(); + auto* mutable_offsets = fake_long_list_view->data()->GetMutableValues<int32_t>(1); + auto* mutable_sizes = fake_long_list_view->data()->GetMutableValues<int32_t>(2); + values->length = 2 * static_cast<int64_t>(kInt32Max); + mutable_offsets[1] = kInt32Max; + mutable_offsets[0] = kInt32Max; + mutable_sizes[0] = kInt32Max; + } + ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list_view, fake_long_list_view}, + pool, &suggested_cast) + .status()); + ASSERT_TRUE(suggested_cast->Equals(LargeVersionOfType(list_view_ty))); } TEST_F(ConcatenateTest, DictionaryConcatenateWithEmptyUint16) { diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 83eeb56c496cf..8e29297a8c175 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -165,6 +165,8 @@ Result> CopyToImpl(const ArrayData& data, ARROW_ASSIGN_OR_RAISE(output->dictionary, CopyToImpl(*data.dictionary, to, copy_fn)); } + output->statistics = data.statistics; + return output; } } // namespace @@ -195,6 +197,7 @@ std::shared_ptr ArrayData::Slice(int64_t off, int64_t len) const { } else { copy->null_count = null_count != 0 ?
kUnknownNullCount : 0; } + copy->statistics = nullptr; return copy; } diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index e0508fe6980a7..1e6ee9a1d32ff 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -24,6 +24,7 @@ #include #include +#include "arrow/array/statistics.h" #include "arrow/buffer.h" #include "arrow/result.h" #include "arrow/type.h" @@ -152,7 +153,8 @@ struct ARROW_EXPORT ArrayData { offset(other.offset), buffers(std::move(other.buffers)), child_data(std::move(other.child_data)), - dictionary(std::move(other.dictionary)) { + dictionary(std::move(other.dictionary)), + statistics(std::move(other.statistics)) { SetNullCount(other.null_count); } @@ -163,7 +165,8 @@ struct ARROW_EXPORT ArrayData { offset(other.offset), buffers(other.buffers), child_data(other.child_data), - dictionary(other.dictionary) { + dictionary(other.dictionary), + statistics(other.statistics) { SetNullCount(other.null_count); } @@ -176,6 +179,7 @@ struct ARROW_EXPORT ArrayData { buffers = std::move(other.buffers); child_data = std::move(other.child_data); dictionary = std::move(other.dictionary); + statistics = std::move(other.statistics); return *this; } @@ -188,6 +192,7 @@ struct ARROW_EXPORT ArrayData { buffers = other.buffers; child_data = other.child_data; dictionary = other.dictionary; + statistics = other.statistics; return *this; } @@ -274,6 +279,18 @@ struct ARROW_EXPORT ArrayData { } /// \brief Construct a zero-copy slice of the data with the given offset and length + /// + /// The associated `ArrayStatistics` is always discarded in a sliced + /// `ArrayData`, because statistics computed for the original + /// `ArrayData` may no longer be valid for the slice. If you want to + /// reuse the statistics of the original `ArrayData`, you need to do + /// so yourself. + /// + /// Even if the specified slice covers the same range as the original + /// `ArrayData` (so its statistics would still be valid), the + /// associated `ArrayStatistics` is discarded. Use `Copy()` instead + /// in that case. std::shared_ptr Slice(int64_t offset, int64_t length) const; /// \brief Input-checking variant of Slice @@ -390,6 +407,9 @@ struct ARROW_EXPORT ArrayData { // The dictionary for this Array, if any. Only used for dictionary type std::shared_ptr dictionary; + + // The statistics for this Array. + std::shared_ptr<ArrayStatistics> statistics; }; /// \brief A non-owning Buffer reference diff --git a/cpp/src/arrow/array/statistics.cc b/cpp/src/arrow/array/statistics.cc new file mode 100644 index 0000000000000..b661c9fbaffed --- /dev/null +++ b/cpp/src/arrow/array/statistics.cc @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
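A small sketch of the slicing behavior documented in data.h above: statistics attached to an ArrayData are preserved by Copy() (and by the copy/move constructors), but always dropped by Slice(). Illustration only; the function name is hypothetical.

#include <cassert>
#include <memory>
#include "arrow/api.h"

void SliceDropsStatistics(const std::shared_ptr<arrow::Array>& array) {
  auto data = array->data()->Copy();
  data->statistics = std::make_shared<arrow::ArrayStatistics>();
  data->statistics->null_count = array->null_count();

  auto copied = data->Copy();                  // the statistics pointer is carried over
  auto sliced = data->Slice(0, data->length);  // statistics are always discarded

  assert(copied->statistics != nullptr);
  assert(sliced->statistics == nullptr);
}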
+ +// This empty .cc file is for embedding non-inlined symbols of +// arrow::ArrayStatistics into libarrow. + +#include "arrow/array/statistics.h" diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h new file mode 100644 index 0000000000000..523f877bbe429 --- /dev/null +++ b/cpp/src/arrow/array/statistics.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cstdint> +#include <optional> +#include <string> +#include <variant> + +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Statistics for an Array +/// +/// The Apache Arrow format itself doesn't carry statistics, but a data +/// source such as Apache Parquet may provide them. Statistics associated +/// with a data source can be read through a unified API via this class. +struct ARROW_EXPORT ArrayStatistics { + using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>; + + /// \brief The number of null values, may not be set + std::optional<int64_t> null_count = std::nullopt; + + /// \brief The number of distinct values, may not be set + std::optional<int64_t> distinct_count = std::nullopt; + + /// \brief The minimum value, may not be set + std::optional<ValueType> min = std::nullopt; + + /// \brief Whether the minimum value is exact or not + bool is_min_exact = false; + + /// \brief The maximum value, may not be set + std::optional<ValueType> max = std::nullopt; + + /// \brief Whether the maximum value is exact or not + bool is_max_exact = false; + + /// \brief Check two statistics for equality + bool Equals(const ArrayStatistics& other) const { + return null_count == other.null_count && distinct_count == other.distinct_count && + min == other.min && is_min_exact == other.is_min_exact && max == other.max && + is_max_exact == other.is_max_exact; + } + + /// \brief Check two statistics for equality + bool operator==(const ArrayStatistics& other) const { return Equals(other); } + + /// \brief Check two statistics for inequality + bool operator!=(const ArrayStatistics& other) const { return !Equals(other); } +}; + +} // namespace arrow diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc new file mode 100644 index 0000000000000..cf15a5d382978 --- /dev/null +++ b/cpp/src/arrow/array/statistics_test.cc @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/array/statistics.h" + +namespace arrow { + +TEST(ArrayStatisticsTest, TestNullCount) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.null_count.has_value()); + statistics.null_count = 29; + ASSERT_TRUE(statistics.null_count.has_value()); + ASSERT_EQ(29, statistics.null_count.value()); +} + +TEST(ArrayStatisticsTest, TestDistinctCount) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.distinct_count.has_value()); + statistics.distinct_count = 29; + ASSERT_TRUE(statistics.distinct_count.has_value()); + ASSERT_EQ(29, statistics.distinct_count.value()); +} + +TEST(ArrayStatisticsTest, TestMin) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.min.has_value()); + ASSERT_FALSE(statistics.is_min_exact); + statistics.min = static_cast(29); + statistics.is_min_exact = true; + ASSERT_TRUE(statistics.min.has_value()); + ASSERT_TRUE(std::holds_alternative(statistics.min.value())); + ASSERT_EQ(29, std::get(statistics.min.value())); + ASSERT_TRUE(statistics.is_min_exact); +} + +TEST(ArrayStatisticsTest, TestMax) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.max.has_value()); + ASSERT_FALSE(statistics.is_max_exact); + statistics.max = std::string("hello"); + statistics.is_max_exact = false; + ASSERT_TRUE(statistics.max.has_value()); + ASSERT_TRUE(std::holds_alternative(statistics.max.value())); + ASSERT_EQ("hello", std::get(statistics.max.value())); + ASSERT_FALSE(statistics.is_max_exact); +} + +TEST(ArrayStatisticsTest, TestEquality) { + ArrayStatistics statistics1; + ArrayStatistics statistics2; + + ASSERT_EQ(statistics1, statistics2); + + statistics1.null_count = 29; + ASSERT_NE(statistics1, statistics2); + statistics2.null_count = 29; + ASSERT_EQ(statistics1, statistics2); + + statistics1.distinct_count = 2929; + ASSERT_NE(statistics1, statistics2); + statistics2.distinct_count = 2929; + ASSERT_EQ(statistics1, statistics2); + + statistics1.min = std::string("world"); + ASSERT_NE(statistics1, statistics2); + statistics2.min = std::string("world"); + ASSERT_EQ(statistics1, statistics2); + + statistics1.is_min_exact = true; + ASSERT_NE(statistics1, statistics2); + statistics2.is_min_exact = true; + ASSERT_EQ(statistics1, statistics2); + + statistics1.max = static_cast(-29); + ASSERT_NE(statistics1, statistics2); + statistics2.max = static_cast(-29); + ASSERT_EQ(statistics1, statistics2); + + statistics1.is_max_exact = true; + ASSERT_NE(statistics1, statistics2); + statistics2.is_max_exact = true; + ASSERT_EQ(statistics1, statistics2); +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 0d940d3bc869e..69f1646054f4c 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -985,10 +985,22 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d ARROW_EXPORT Status ValidateUTF8(const ArrayData& data) { - DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW || - data.type->id() == Type::LARGE_STRING); - UTF8DataValidator validator{data}; - return VisitTypeInline(*data.type, 
&validator); + const auto& storage_type = + (data.type->id() == Type::EXTENSION) + ? checked_cast(*data.type).storage_type() + : data.type; + DCHECK(storage_type->id() == Type::STRING || storage_type->id() == Type::STRING_VIEW || + storage_type->id() == Type::LARGE_STRING); + + if (data.type->id() == Type::EXTENSION) { + ArrayData ext_data(data); + ext_data.type = storage_type; + UTF8DataValidator validator{ext_data}; + return VisitTypeInline(*storage_type, &validator); + } else { + UTF8DataValidator validator{data}; + return VisitTypeInline(*storage_type, &validator); + } } ARROW_EXPORT diff --git a/cpp/src/arrow/builder_benchmark.cc b/cpp/src/arrow/builder_benchmark.cc index 84f27d20ee038..8ec7373a1de1f 100644 --- a/cpp/src/arrow/builder_benchmark.cc +++ b/cpp/src/arrow/builder_benchmark.cc @@ -150,6 +150,44 @@ static void BuildBinaryArray(benchmark::State& state) { // NOLINT non-const ref state.SetItemsProcessed(state.iterations() * kItemsProcessed); } +static void BuildInlineBinaryViewArray( + benchmark::State& state) { // NOLINT non-const reference + std::string_view kBinaryStrings[] = {"1", "12345678", "12345", "123456789", + "12", "", " "}; + + for (auto _ : state) { + BinaryViewBuilder builder(memory_tracker.memory_pool()); + + for (int64_t i = 0; i < kRounds * kNumberOfElements; i++) { + ABORT_NOT_OK(builder.Append(kBinaryStrings[i % 7])); + } + + std::shared_ptr out; + ABORT_NOT_OK(builder.Finish(&out)); + } + + state.SetBytesProcessed(state.iterations() * kBytesProcessed); + state.SetItemsProcessed(state.iterations() * kItemsProcessed); +} + +static void BuildNonInlineBinaryViewArray( + benchmark::State& state) { // NOLINT non-const reference + const char* kLargeBinaryString = "12345678901234567890123456789012345678901234567890"; + for (auto _ : state) { + BinaryViewBuilder builder(memory_tracker.memory_pool()); + + for (int64_t i = 0; i < kRounds * kNumberOfElements; i++) { + ABORT_NOT_OK(builder.Append(kLargeBinaryString)); + } + + std::shared_ptr out; + ABORT_NOT_OK(builder.Finish(&out)); + } + + state.SetBytesProcessed(state.iterations() * kBytesProcessed); + state.SetItemsProcessed(state.iterations() * kItemsProcessed); +} + static void BuildChunkedBinaryArray( benchmark::State& state) { // NOLINT non-const reference // 1MB chunks @@ -458,6 +496,8 @@ BENCHMARK(BuildBinaryArray); BENCHMARK(BuildChunkedBinaryArray); BENCHMARK(BuildFixedSizeBinaryArray); BENCHMARK(BuildDecimalArray); +BENCHMARK(BuildInlineBinaryViewArray); +BENCHMARK(BuildNonInlineBinaryViewArray); BENCHMARK(BuildInt64DictionaryArrayRandom); BENCHMARK(BuildInt64DictionaryArraySequential); diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h index 6abe866b5f6f6..db051fff5ff05 100644 --- a/cpp/src/arrow/c/abi.h +++ b/cpp/src/arrow/c/abi.h @@ -41,11 +41,11 @@ extern "C" { #endif #ifndef ARROW_C_DATA_INTERFACE -#define ARROW_C_DATA_INTERFACE +# define ARROW_C_DATA_INTERFACE -#define ARROW_FLAG_DICTIONARY_ORDERED 1 -#define ARROW_FLAG_NULLABLE 2 -#define ARROW_FLAG_MAP_KEYS_SORTED 4 +# define ARROW_FLAG_DICTIONARY_ORDERED 1 +# define ARROW_FLAG_NULLABLE 2 +# define ARROW_FLAG_MAP_KEYS_SORTED 4 struct ArrowSchema { // Array type description @@ -83,7 +83,7 @@ struct ArrowArray { #endif // ARROW_C_DATA_INTERFACE #ifndef ARROW_C_DEVICE_DATA_INTERFACE -#define ARROW_C_DEVICE_DATA_INTERFACE +# define ARROW_C_DEVICE_DATA_INTERFACE // Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html @@ -91,33 +91,33 @@ struct ArrowArray { typedef int32_t ArrowDeviceType; // CPU device, same as 
using ArrowArray directly -#define ARROW_DEVICE_CPU 1 +# define ARROW_DEVICE_CPU 1 // CUDA GPU Device -#define ARROW_DEVICE_CUDA 2 +# define ARROW_DEVICE_CUDA 2 // Pinned CUDA CPU memory by cudaMallocHost -#define ARROW_DEVICE_CUDA_HOST 3 +# define ARROW_DEVICE_CUDA_HOST 3 // OpenCL Device -#define ARROW_DEVICE_OPENCL 4 +# define ARROW_DEVICE_OPENCL 4 // Vulkan buffer for next-gen graphics -#define ARROW_DEVICE_VULKAN 7 +# define ARROW_DEVICE_VULKAN 7 // Metal for Apple GPU -#define ARROW_DEVICE_METAL 8 +# define ARROW_DEVICE_METAL 8 // Verilog simulator buffer -#define ARROW_DEVICE_VPI 9 +# define ARROW_DEVICE_VPI 9 // ROCm GPUs for AMD GPUs -#define ARROW_DEVICE_ROCM 10 +# define ARROW_DEVICE_ROCM 10 // Pinned ROCm CPU memory allocated by hipMallocHost -#define ARROW_DEVICE_ROCM_HOST 11 +# define ARROW_DEVICE_ROCM_HOST 11 // Reserved for extension -#define ARROW_DEVICE_EXT_DEV 12 +# define ARROW_DEVICE_EXT_DEV 12 // CUDA managed/unified memory allocated by cudaMallocManaged -#define ARROW_DEVICE_CUDA_MANAGED 13 +# define ARROW_DEVICE_CUDA_MANAGED 13 // unified shared memory allocated on a oneAPI non-partitioned device. -#define ARROW_DEVICE_ONEAPI 14 +# define ARROW_DEVICE_ONEAPI 14 // GPU support for next-gen WebGPU standard -#define ARROW_DEVICE_WEBGPU 15 +# define ARROW_DEVICE_WEBGPU 15 // Qualcomm Hexagon DSP -#define ARROW_DEVICE_HEXAGON 16 +# define ARROW_DEVICE_HEXAGON 16 struct ArrowDeviceArray { // the Allocated Array @@ -138,7 +138,7 @@ struct ArrowDeviceArray { #endif // ARROW_C_DEVICE_DATA_INTERFACE #ifndef ARROW_C_STREAM_INTERFACE -#define ARROW_C_STREAM_INTERFACE +# define ARROW_C_STREAM_INTERFACE struct ArrowArrayStream { // Callback to get the stream type @@ -179,7 +179,7 @@ struct ArrowArrayStream { #endif // ARROW_C_STREAM_INTERFACE #ifndef ARROW_C_DEVICE_STREAM_INTERFACE -#define ARROW_C_DEVICE_STREAM_INTERFACE +# define ARROW_C_DEVICE_STREAM_INTERFACE // Equivalent to ArrowArrayStream, but for ArrowDeviceArrays. // diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 09bb524adbdf0..01fd56f631d99 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -48,7 +48,7 @@ // TODO(GH-37221): Remove these ifdef checks when compute dependency is removed #ifdef ARROW_COMPUTE -#include "arrow/compute/api_vector.h" +# include "arrow/compute/api_vector.h" #endif namespace arrow { diff --git a/cpp/src/arrow/c/dlpack_abi.h b/cpp/src/arrow/c/dlpack_abi.h index 4af557a7ed5d7..fbe2a56a344b3 100644 --- a/cpp/src/arrow/c/dlpack_abi.h +++ b/cpp/src/arrow/c/dlpack_abi.h @@ -12,9 +12,9 @@ * \brief Compatibility with C++ */ #ifdef __cplusplus -#define DLPACK_EXTERN_C extern "C" +# define DLPACK_EXTERN_C extern "C" #else -#define DLPACK_EXTERN_C +# define DLPACK_EXTERN_C #endif /*! \brief The current major version of dlpack */ @@ -25,13 +25,13 @@ /*! 
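The ARROW_DEVICE_* constants above are what a producer writes into the device_type field of ArrowDeviceArray. A hedged sketch of a CPU-side producer follows; the struct fields match the published C Device Data Interface, but the device_id value used for plain host memory is an assumption here, so check the spec before relying on it.

#include <cstring>

#include "arrow/c/abi.h"

// Wrap an already-exported ArrowArray in the device-aware envelope for data
// that lives in ordinary host memory.
void WrapAsCpuDeviceArray(struct ArrowArray* exported, struct ArrowDeviceArray* out) {
  std::memset(out, 0, sizeof(*out));    // zeroes sync_event and the reserved bytes
  out->array = *exported;               // bitwise move of the C data interface payload
  exported->release = nullptr;          // mark the source as moved
  out->device_id = -1;                  // assumption: no meaningful ordinal for host memory
  out->device_type = ARROW_DEVICE_CPU;  // constant defined above
}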
\brief DLPACK_DLL prefix for windows */ #ifdef _WIN32 -#ifdef DLPACK_EXPORTS -#define DLPACK_DLL __declspec(dllexport) +# ifdef DLPACK_EXPORTS +# define DLPACK_DLL __declspec(dllexport) +# else +# define DLPACK_DLL __declspec(dllimport) +# endif #else -#define DLPACK_DLL __declspec(dllimport) -#endif -#else -#define DLPACK_DLL +# define DLPACK_DLL #endif #include diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index 55eec53ced1c7..854127480744e 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -60,42 +60,38 @@ inline std::vector MakeChunksOffsets(const std::vector& chunks) { template void ResolveManyInline(size_t num_offsets, const int64_t* signed_offsets, int64_t n_indices, const IndexType* logical_index_vec, - IndexType* out_chunk_index_vec, IndexType chunk_hint, - IndexType* out_index_in_chunk_vec) { + TypedChunkLocation* out_chunk_location_vec, + IndexType chunk_hint) { auto* offsets = reinterpret_cast(signed_offsets); const auto num_chunks = static_cast(num_offsets - 1); // chunk_hint in [0, num_offsets) per the precondition. for (int64_t i = 0; i < n_indices; i++) { - const auto index = static_cast(logical_index_vec[i]); + auto typed_logical_index = logical_index_vec[i]; + const auto index = static_cast(typed_logical_index); + // use or update chunk_hint if (index >= offsets[chunk_hint] && (chunk_hint == num_chunks || index < offsets[chunk_hint + 1])) { - out_chunk_index_vec[i] = chunk_hint; // hint is correct! - continue; + // hint is correct! + } else { + // lo < hi is guaranteed by `num_offsets = chunks.size() + 1` + auto chunk_index = + ChunkResolver::Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets); + chunk_hint = static_cast(chunk_index); } - // lo < hi is guaranteed by `num_offsets = chunks.size() + 1` - auto chunk_index = - ChunkResolver::Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets); - chunk_hint = static_cast(chunk_index); - out_chunk_index_vec[i] = chunk_hint; - } - if (out_index_in_chunk_vec != NULLPTR) { - for (int64_t i = 0; i < n_indices; i++) { - auto logical_index = logical_index_vec[i]; - auto chunk_index = out_chunk_index_vec[i]; - // chunk_index is in [0, chunks.size()] no matter what the - // value of logical_index is, so it's always safe to dereference - // offset_ as it contains chunks.size()+1 values. - out_index_in_chunk_vec[i] = - logical_index - static_cast(offsets[chunk_index]); + out_chunk_location_vec[i].chunk_index = chunk_hint; + // chunk_index is in [0, chunks.size()] no matter what the + // value of logical_index is, so it's always safe to dereference + // offset_ as it contains chunks.size()+1 values. + out_chunk_location_vec[i].index_in_chunk = + typed_logical_index - static_cast(offsets[chunk_hint]); #if defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) - // Make it more likely that Valgrind/ASAN can catch an invalid memory - // access by poisoning out_index_in_chunk_vec[i] when the logical - // index is out-of-bounds. - if (chunk_index == num_chunks) { - out_index_in_chunk_vec[i] = std::numeric_limits::max(); - } -#endif + // Make it more likely that Valgrind/ASAN can catch an invalid memory + // access by poisoning the index-in-chunk value when the logical + // index is out-of-bounds. 
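The dlpack change above only re-nests and indents an existing export/import macro ladder, which is the standard pattern for shared-library symbol visibility. The same idiom written out for a made-up library name (MYLIB_* is hypothetical and not part of this patch):

// Export symbols while building the library, import them in consumers,
// and expand to nothing on non-Windows platforms.
#ifdef _WIN32
#  ifdef MYLIB_EXPORTS
#    define MYLIB_DLL __declspec(dllexport)
#  else
#    define MYLIB_DLL __declspec(dllimport)
#  endif
#else
#  define MYLIB_DLL
#endif

MYLIB_DLL int mylib_answer(void);  // declared with the computed visibility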
+ if (chunk_hint == num_chunks) { + out_chunk_location_vec[i].index_in_chunk = std::numeric_limits::max(); } +#endif } } @@ -130,31 +126,31 @@ ChunkResolver& ChunkResolver::operator=(const ChunkResolver& other) noexcept { } void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint8_t* logical_index_vec, - uint8_t* out_chunk_index_vec, uint8_t chunk_hint, - uint8_t* out_index_in_chunk_vec) const { + TypedChunkLocation* out_chunk_location_vec, + uint8_t chunk_hint) const { ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, - out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); + out_chunk_location_vec, chunk_hint); } void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint32_t* logical_index_vec, - uint32_t* out_chunk_index_vec, uint32_t chunk_hint, - uint32_t* out_index_in_chunk_vec) const { + TypedChunkLocation* out_chunk_location_vec, + uint32_t chunk_hint) const { ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, - out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); + out_chunk_location_vec, chunk_hint); } void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint16_t* logical_index_vec, - uint16_t* out_chunk_index_vec, uint16_t chunk_hint, - uint16_t* out_index_in_chunk_vec) const { + TypedChunkLocation* out_chunk_location_vec, + uint16_t chunk_hint) const { ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, - out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); + out_chunk_location_vec, chunk_hint); } void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t* logical_index_vec, - uint64_t* out_chunk_index_vec, uint64_t chunk_hint, - uint64_t* out_index_in_chunk_vec) const { + TypedChunkLocation* out_chunk_location_vec, + uint64_t chunk_hint) const { ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, - out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); + out_chunk_location_vec, chunk_hint); } } // namespace arrow::internal diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index a2a3d5a864243..83fda62387fe1 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -31,28 +31,34 @@ namespace arrow::internal { struct ChunkResolver; -struct ChunkLocation { +template +struct TypedChunkLocation { /// \brief Index of the chunk in the array of chunks /// /// The value is always in the range `[0, chunks.size()]`. `chunks.size()` is used /// to represent out-of-bounds locations. 
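The rewritten loop above resolves each logical index by first re-testing the chunk that answered the previous query and only falling back to a binary search over the chunk offsets when that hint misses, so sorted or clustered index sequences resolve in near-constant time per element. A standalone sketch of the same strategy over a plain offsets vector; the upper_bound-based search and the names are illustrative, not the Arrow implementation:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct Location {
  int64_t chunk_index;
  int64_t index_in_chunk;
};

// offsets holds one entry per chunk boundary plus the total length,
// e.g. chunks of lengths {3, 2, 4} -> offsets {0, 3, 5, 9}.
std::vector<Location> ResolveAll(const std::vector<int64_t>& offsets,
                                 const std::vector<int64_t>& logical) {
  const int64_t num_chunks = static_cast<int64_t>(offsets.size()) - 1;
  std::vector<Location> out(logical.size());
  int64_t hint = 0;  // chunk that answered the previous index
  for (std::size_t i = 0; i < logical.size(); ++i) {
    const int64_t index = logical[i];
    const bool hint_hits =
        index >= offsets[hint] && (hint == num_chunks || index < offsets[hint + 1]);
    if (!hint_hits) {
      // Binary search for the last offset <= index (upper_bound, then step back).
      auto it = std::upper_bound(offsets.begin(), offsets.end(), index);
      hint = static_cast<int64_t>(it - offsets.begin()) - 1;
    }
    out[i] = {hint, index - offsets[hint]};
  }
  return out;
}

int main() {
  std::vector<int64_t> offsets{0, 3, 5, 9};
  for (auto loc : ResolveAll(offsets, {0, 1, 4, 8})) {
    std::cout << loc.chunk_index << ":" << loc.index_in_chunk << "\n";  // 0:0 0:1 1:1 2:3
  }
  return 0;
}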
- int64_t chunk_index = 0; + IndexType chunk_index = 0; /// \brief Index of the value in the chunk /// /// The value is UNDEFINED if chunk_index >= chunks.size() - int64_t index_in_chunk = 0; + IndexType index_in_chunk = 0; - ChunkLocation() = default; + TypedChunkLocation() = default; - ChunkLocation(int64_t chunk_index, int64_t index_in_chunk) - : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {} + TypedChunkLocation(IndexType chunk_index, IndexType index_in_chunk) + : chunk_index(chunk_index), index_in_chunk(index_in_chunk) { + static_assert(sizeof(TypedChunkLocation) == 2 * sizeof(IndexType)); + static_assert(alignof(TypedChunkLocation) == alignof(IndexType)); + } - bool operator==(ChunkLocation other) const { + bool operator==(TypedChunkLocation other) const { return chunk_index == other.chunk_index && index_in_chunk == other.index_in_chunk; } }; +using ChunkLocation = TypedChunkLocation; + /// \brief An utility that incrementally resolves logical indices into /// physical indices in a chunked array. struct ARROW_EXPORT ChunkResolver { @@ -144,26 +150,25 @@ struct ARROW_EXPORT ChunkResolver { /// /// \pre 0 <= logical_index_vec[i] < logical_array_length() /// (for well-defined and valid chunk index results) - /// \pre out_chunk_index_vec has space for `n_indices` + /// \pre out_chunk_location_vec has space for `n_indices` locations /// \pre chunk_hint in [0, chunks.size()] - /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n) + /// \post out_chunk_location_vec[i].chunk_index in [0, chunks.size()] for i in [0, n) /// \post if logical_index_vec[i] >= chunked_array.length(), then - /// out_chunk_index_vec[i] == chunks.size() - /// and out_index_in_chunk_vec[i] is UNDEFINED (can be out-of-bounds) - /// \post if logical_index_vec[i] < 0, then both out_chunk_index_vec[i] and - /// out_index_in_chunk_vec[i] are UNDEFINED + /// out_chunk_location_vec[i].chunk_index == chunks.size() + /// and out_chunk_location_vec[i].index_in_chunk is UNDEFINED (can be + /// out-of-bounds) + /// \post if logical_index_vec[i] < 0, then both values in out_chunk_index_vec[i] + /// are UNDEFINED /// /// \param n_indices The number of logical indices to resolve /// \param logical_index_vec The logical indices to resolve - /// \param out_chunk_index_vec The output array where the chunk indices will be written + /// \param out_chunk_location_vec The output array where the locations will be written /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany - /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the - /// within-chunk indices will be written /// \return false iff chunks.size() > std::numeric_limits::max() template [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec, - IndexType* out_chunk_index_vec, IndexType chunk_hint = 0, - IndexType* out_index_in_chunk_vec = NULLPTR) const { + TypedChunkLocation* out_chunk_location_vec, + IndexType chunk_hint = 0) const { if constexpr (sizeof(IndexType) < sizeof(uint64_t)) { // The max value returned by Bisect is `offsets.size() - 1` (= chunks.size()). constexpr uint64_t kMaxIndexTypeValue = std::numeric_limits::max(); @@ -188,13 +193,11 @@ struct ARROW_EXPORT ChunkResolver { // logical index in the chunked array. 
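A usage sketch for the ResolveMany() signature above: the caller now supplies one TypedChunkLocation per logical index instead of two parallel index arrays, and the boolean result reports whether the chunk count fits the chosen index width. This assumes only what the header shows and, being an internal API, may change:

#include <cstdint>
#include <vector>

#include "arrow/chunk_resolver.h"

using arrow::internal::ChunkResolver;
using arrow::internal::TypedChunkLocation;

// Resolve a batch of logical indices into (chunk, index-in-chunk) pairs.
// Returns false if the chunk count does not fit in the 32-bit index type.
bool ResolveBatch(const ChunkResolver& resolver, const std::vector<uint32_t>& logical,
                  std::vector<TypedChunkLocation<uint32_t>>* out) {
  out->resize(logical.size());
  // chunk_hint defaults to 0; pass the last chunk index from a previous call
  // to speed up resolution of consecutive batches.
  return resolver.ResolveMany(static_cast<int64_t>(logical.size()), logical.data(),
                              out->data());
}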
using U = std::make_unsigned_t; ResolveManyImpl(n_indices, reinterpret_cast(logical_index_vec), - reinterpret_cast(out_chunk_index_vec), - static_cast(chunk_hint), - reinterpret_cast(out_index_in_chunk_vec)); + reinterpret_cast*>(out_chunk_location_vec), + static_cast(chunk_hint)); } else { static_assert(std::is_unsigned_v); - ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec, chunk_hint, - out_index_in_chunk_vec); + ResolveManyImpl(n_indices, logical_index_vec, out_chunk_location_vec, chunk_hint); } return true; } @@ -226,10 +229,14 @@ struct ARROW_EXPORT ChunkResolver { /// \pre all the pre-conditions of ChunkResolver::ResolveMany() /// \pre num_offsets - 1 <= std::numeric_limits::max() - void ResolveManyImpl(int64_t, const uint8_t*, uint8_t*, uint8_t, uint8_t*) const; - void ResolveManyImpl(int64_t, const uint16_t*, uint16_t*, uint16_t, uint16_t*) const; - void ResolveManyImpl(int64_t, const uint32_t*, uint32_t*, uint32_t, uint32_t*) const; - void ResolveManyImpl(int64_t, const uint64_t*, uint64_t*, uint64_t, uint64_t*) const; + void ResolveManyImpl(int64_t, const uint8_t*, TypedChunkLocation*, + uint8_t) const; + void ResolveManyImpl(int64_t, const uint16_t*, TypedChunkLocation*, + uint16_t) const; + void ResolveManyImpl(int64_t, const uint32_t*, TypedChunkLocation*, + uint32_t) const; + void ResolveManyImpl(int64_t, const uint64_t*, TypedChunkLocation*, + uint64_t) const; public: /// \brief Find the index of the chunk that contains the logical index. diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index c36b736d5d5df..dd6aa51534fcb 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -27,6 +27,7 @@ #include "arrow/array/array_nested.h" #include "arrow/array/util.h" #include "arrow/array/validate.h" +#include "arrow/device_allocation_type_set.h" #include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/type.h" @@ -86,6 +87,18 @@ Result> ChunkedArray::MakeEmpty( return std::make_shared(std::move(new_chunks)); } +DeviceAllocationTypeSet ChunkedArray::device_types() const { + if (chunks_.empty()) { + // An empty ChunkedArray is considered to be CPU-only. + return DeviceAllocationTypeSet::CpuOnly(); + } + DeviceAllocationTypeSet set; + for (const auto& chunk : chunks_) { + set.add(chunk->device_type()); + } + return set; +} + bool ChunkedArray::Equals(const ChunkedArray& other, const EqualOptions& opts) const { if (length_ != other.length()) { return false; diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 5d300861d85c2..c65b6cb6e227f 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -25,6 +25,7 @@ #include "arrow/chunk_resolver.h" #include "arrow/compare.h" +#include "arrow/device_allocation_type_set.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" @@ -116,6 +117,13 @@ class ARROW_EXPORT ChunkedArray { /// \return an ArrayVector of chunks const ArrayVector& chunks() const { return chunks_; } + /// \return The set of device allocation types used by the chunks in this + /// chunked array. + DeviceAllocationTypeSet device_types() const; + + /// \return true if all chunks are allocated on CPU-accessible memory. 
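With device_types() and is_cpu() in place, code that only understands host memory can guard itself up front. A small hedged sketch of such a guard; the error message wording is mine, not from the patch:

#include "arrow/chunked_array.h"
#include "arrow/status.h"

// Guard a CPU-only code path: an empty chunked array counts as CPU-resident,
// otherwise every chunk's allocation must be CPU-accessible.
arrow::Status CheckCpuResident(const arrow::ChunkedArray& chunked) {
  if (!chunked.is_cpu()) {
    return arrow::Status::Invalid("chunked array has non-CPU-resident chunks");
  }
  return arrow::Status::OK();
}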
+ bool is_cpu() const { return device_types().is_cpu_only(); } + /// \brief Construct a zero-copy slice of the chunked array with the /// indicated offset and length /// diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index e9cc283b53cd5..bf9d4af7c7bb0 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -37,6 +37,7 @@ namespace arrow { using internal::ChunkLocation; using internal::ChunkResolver; +using internal::TypedChunkLocation; class TestChunkedArray : public ::testing::Test { protected: @@ -61,12 +62,17 @@ TEST_F(TestChunkedArray, Make) { ChunkedArray::Make({}, int64())); AssertTypeEqual(*int64(), *result->type()); ASSERT_EQ(result->num_chunks(), 0); + // Empty chunked arrays are treated as CPU-allocated. + ASSERT_TRUE(result->is_cpu()); auto chunk0 = ArrayFromJSON(int8(), "[0, 1, 2]"); auto chunk1 = ArrayFromJSON(int16(), "[3, 4, 5]"); ASSERT_OK_AND_ASSIGN(result, ChunkedArray::Make({chunk0, chunk0})); ASSERT_OK_AND_ASSIGN(auto result2, ChunkedArray::Make({chunk0, chunk0}, int8())); + // All chunks are CPU-accessible. + ASSERT_TRUE(result->is_cpu()); + ASSERT_TRUE(result2->is_cpu()); AssertChunkedEqual(*result, *result2); ASSERT_RAISES(TypeError, ChunkedArray::Make({chunk0, chunk1})); @@ -375,24 +381,26 @@ class TestChunkResolverMany : public ::testing::Test { Result> ResolveMany( const ChunkResolver& resolver, const std::vector& logical_index_vec) { const size_t n = logical_index_vec.size(); - std::vector chunk_index_vec; - chunk_index_vec.resize(n); - std::vector index_in_chunk_vec; - index_in_chunk_vec.resize(n); + std::vector> chunk_location_vec; + chunk_location_vec.resize(n); bool valid = resolver.ResolveMany( - static_cast(n), logical_index_vec.data(), chunk_index_vec.data(), 0, - index_in_chunk_vec.data()); + static_cast(n), logical_index_vec.data(), chunk_location_vec.data(), 0); if (ARROW_PREDICT_FALSE(!valid)) { return Status::Invalid("index type doesn't fit possible chunk indexes"); } - std::vector locations; - locations.reserve(n); - for (size_t i = 0; i < n; i++) { - auto chunk_index = static_cast(chunk_index_vec[i]); - auto index_in_chunk = static_cast(index_in_chunk_vec[i]); - locations.emplace_back(chunk_index, index_in_chunk); + if constexpr (std::is_same::value) { + return chunk_location_vec; + } else { + std::vector locations; + locations.reserve(n); + for (size_t i = 0; i < n; i++) { + auto loc = chunk_location_vec[i]; + auto chunk_index = static_cast(loc.chunk_index); + auto index_in_chunk = static_cast(loc.index_in_chunk); + locations.emplace_back(chunk_index, index_in_chunk); + } + return locations; } - return locations; } void CheckResolveMany(const ChunkResolver& resolver, diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index e20b45897db95..aa2a2d4e9af0b 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -92,6 +92,7 @@ add_arrow_test(internals_test key_hash_test.cc row/compare_test.cc row/grouper_test.cc + row/row_encoder_internal_test.cc row/row_test.cc util_internal_test.cc) diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc index 33e5928c2865d..12fda5d58f3bf 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -23,6 +23,7 @@ #include #include "arrow/chunked_array.h" +#include "arrow/compute/api_aggregate.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/exec_internal.h" #include 
"arrow/compute/expression_internal.h" @@ -1242,6 +1243,72 @@ struct Inequality { /*insert_implicit_casts=*/false, &exec_context); } + /// Simplify an `is_in` call against an inequality guarantee. + /// + /// We avoid the complexity of fully simplifying EQUAL comparisons to true + /// literals (e.g., 'x is_in [1, 2, 3]' given the guarantee 'x = 2') due to + /// potential complications with null matching behavior. This is ok for the + /// predicate pushdown use case because the overall aim is to simplify to an + /// unsatisfiable expression. + /// + /// \pre `is_in_call` is a call to the `is_in` function + /// \return a simplified expression, or nullopt if no simplification occurred + static Result> SimplifyIsIn( + const Inequality& guarantee, const Expression::Call* is_in_call) { + DCHECK_EQ(is_in_call->function_name, "is_in"); + + auto options = checked_pointer_cast(is_in_call->options); + + const auto& lhs = Comparison::StripOrderPreservingCasts(is_in_call->arguments[0]); + if (!lhs.field_ref()) return std::nullopt; + if (*lhs.field_ref() != guarantee.target) return std::nullopt; + + FilterOptions::NullSelectionBehavior null_selection; + switch (options->null_matching_behavior) { + case SetLookupOptions::MATCH: + null_selection = + guarantee.nullable ? FilterOptions::EMIT_NULL : FilterOptions::DROP; + break; + case SetLookupOptions::SKIP: + null_selection = FilterOptions::DROP; + break; + case SetLookupOptions::EMIT_NULL: + if (guarantee.nullable) return std::nullopt; + null_selection = FilterOptions::DROP; + break; + case SetLookupOptions::INCONCLUSIVE: + if (guarantee.nullable) return std::nullopt; + ARROW_ASSIGN_OR_RAISE(Datum is_null, IsNull(options->value_set)); + ARROW_ASSIGN_OR_RAISE(Datum any_null, Any(is_null)); + if (any_null.scalar_as().value) return std::nullopt; + null_selection = FilterOptions::DROP; + break; + } + + std::string func_name = Comparison::GetName(guarantee.cmp); + DCHECK_NE(func_name, "na"); + std::vector args{options->value_set, guarantee.bound}; + ARROW_ASSIGN_OR_RAISE(Datum filter_mask, CallFunction(func_name, args)); + FilterOptions filter_options(null_selection); + ARROW_ASSIGN_OR_RAISE(Datum simplified_value_set, + Filter(options->value_set, filter_mask, filter_options)); + + if (simplified_value_set.length() == 0) return literal(false); + if (simplified_value_set.length() == options->value_set.length()) return std::nullopt; + + ExecContext exec_context; + Expression::Call simplified_call; + simplified_call.function_name = "is_in"; + simplified_call.arguments = is_in_call->arguments; + simplified_call.options = std::make_shared( + simplified_value_set, options->null_matching_behavior); + ARROW_ASSIGN_OR_RAISE( + Expression simplified_expr, + BindNonRecursive(std::move(simplified_call), + /*insert_implicit_casts=*/false, &exec_context)); + return simplified_expr; + } + /// \brief Simplify the given expression given this inequality as a guarantee. Result Simplify(Expression expr) { const auto& guarantee = *this; @@ -1258,6 +1325,12 @@ struct Inequality { return call->function_name == "is_valid" ? 
literal(true) : literal(false); } + if (call->function_name == "is_in") { + ARROW_ASSIGN_OR_RAISE(std::optional result, + SimplifyIsIn(guarantee, call)); + return result.value_or(expr); + } + auto cmp = Comparison::Get(expr); if (!cmp) return expr; diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index d94a17b6ffadf..0b7e8a9c23b13 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -27,6 +27,7 @@ #include #include +#include "arrow/array/builder_primitive.h" #include "arrow/compute/expression_internal.h" #include "arrow/compute/function_internal.h" #include "arrow/compute/registry.h" @@ -1616,6 +1617,144 @@ TEST(Expression, SimplifyWithComparisonAndNullableCaveat) { true_unless_null(field_ref("i32")))); // not satisfiable, will drop row group } +TEST(Expression, SimplifyIsIn) { + auto is_in = [](Expression field, std::shared_ptr value_set_type, + std::string json_array, + SetLookupOptions::NullMatchingBehavior null_matching_behavior) { + SetLookupOptions options{ArrayFromJSON(value_set_type, json_array), + null_matching_behavior}; + return call("is_in", {field}, options); + }; + + for (SetLookupOptions::NullMatchingBehavior null_matching : { + SetLookupOptions::MATCH, + SetLookupOptions::SKIP, + SetLookupOptions::EMIT_NULL, + SetLookupOptions::INCONCLUSIVE, + }) { + Simplify{is_in(field_ref("i32"), int32(), "[]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(equal(field_ref("i32"), literal(6))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(3))) + .Expect(is_in(field_ref("i32"), int32(), "[5,7,9]", null_matching)); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(9))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(less_equal(field_ref("i32"), literal(0))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(0))) + .ExpectUnchanged(); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(less_equal(field_ref("i32"), literal(9))) + .ExpectUnchanged(); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(and_(less_equal(field_ref("i32"), literal(7)), + greater(field_ref("i32"), literal(4)))) + .Expect(is_in(field_ref("i32"), int32(), "[5,7]", null_matching)); + + Simplify{is_in(field_ref("u32"), int8(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("u32"), literal(3))) + .Expect(is_in(field_ref("u32"), int8(), "[5,7,9]", null_matching)); + + Simplify{is_in(field_ref("u32"), int64(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("u32"), literal(3))) + .Expect(is_in(field_ref("u32"), int64(), "[5,7,9]", null_matching)); + } + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::MATCH), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::MATCH)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::MATCH), + } + .WithGuarantee(greater(field_ref("i32"), 
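Stripped of the null-matching cases, the simplification above works by applying the guarantee's comparison to the is_in value set, filtering out values that can never satisfy it, and collapsing the call to literal(false) when nothing survives or leaving it untouched when everything does. A standalone sketch of that idea with plain integers; the Arrow-specific null-selection handling is deliberately omitted:

#include <functional>
#include <iostream>
#include <optional>
#include <vector>

// Reduce an is_in value set under a guarantee such as "x > bound".
// Returns nullopt when no simplification applies (the set is unchanged);
// an empty result means the predicate is unsatisfiable and the caller can
// replace the whole call with a false literal.
std::optional<std::vector<int>> SimplifyValueSet(
    const std::vector<int>& value_set, const std::function<bool(int)>& guarantee) {
  std::vector<int> kept;
  for (int v : value_set) {
    if (guarantee(v)) kept.push_back(v);
  }
  if (kept.size() == value_set.size()) return std::nullopt;  // nothing to do
  return kept;
}

int main() {
  std::vector<int> set{1, 3, 5, 7, 9};
  // Guarantee x > 3 keeps {5, 7, 9}; guarantee x > 9 would keep nothing.
  auto reduced = SimplifyValueSet(set, [](int x) { return x > 3; });
  for (int v : *reduced) std::cout << v << " ";
  std::cout << "\n";
  return 0;
}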
literal(2))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::MATCH)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::MATCH), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3,null]", SetLookupOptions::MATCH)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::SKIP), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::SKIP), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::SKIP), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::EMIT_NULL), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::EMIT_NULL), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::EMIT_NULL)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::EMIT_NULL), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::INCONCLUSIVE), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::INCONCLUSIVE), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::INCONCLUSIVE), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); +} + TEST(Expression, SimplifyThenExecute) { auto filter = or_({equal(field_ref("f32"), literal(0)), @@ -1643,6 +1782,40 @@ TEST(Expression, SimplifyThenExecute) { AssertDatumsEqual(evaluated, simplified_evaluated, /*verbose=*/true); } +TEST(Expression, SimplifyIsInThenExecute) { + auto input = RecordBatchFromJSON(kBoringSchema, R"([ + {"i64": 2, "i32": 5}, + {"i64": 5, "i32": 6}, + {"i64": 3, "i32": 6}, + {"i64": 3, "i32": 5}, + {"i64": 4, "i32": 5}, + {"i64": 2, "i32": 7}, + {"i64": 5, "i32": 5} + ])"); + + std::vector guarantees{greater(field_ref("i64"), literal(1)), + greater_equal(field_ref("i32"), literal(5)), + less_equal(field_ref("i64"), literal(5))}; + + for (const Expression& guarantee : guarantees) { + auto filter = + call("is_in", {guarantee.call()->arguments[0]}, + compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2,3]"), true}); + ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(filter, guarantee)); + + Datum evaluated, simplified_evaluated; + ExpectExecute(filter, input, &evaluated); + ExpectExecute(simplified, input, &simplified_evaluated); + if (simplified_evaluated.is_scalar()) { + 
ASSERT_OK_AND_ASSIGN( + simplified_evaluated, + MakeArrayFromScalar(*simplified_evaluated.scalar(), evaluated.length())); + } + AssertDatumsEqual(evaluated, simplified_evaluated, /*verbose=*/true); + } +} + TEST(Expression, Filter) { auto ExpectFilter = [](Expression filter, std::string batch_json) { ASSERT_OK_AND_ASSIGN(auto s, kBoringSchema->AddField(0, field("in", boolean()))); diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index e1a2e8c5d8879..0478a3d1e801a 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -30,6 +30,7 @@ #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/registry.h" #include "arrow/datum.h" +#include "arrow/device_allocation_type_set.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" #include "arrow/util/tracing_internal.h" diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc index 5c87ef2cd0561..5e7461cc52d0e 100644 --- a/cpp/src/arrow/compute/kernel.cc +++ b/cpp/src/arrow/compute/kernel.cc @@ -24,6 +24,7 @@ #include "arrow/buffer.h" #include "arrow/compute/exec.h" +#include "arrow/device_allocation_type_set.h" #include "arrow/result.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 1adb3e96c97c8..cfb6265f12904 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -31,6 +31,7 @@ #include "arrow/buffer.h" #include "arrow/compute/exec.h" #include "arrow/datum.h" +#include "arrow/device_allocation_type_set.h" #include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/status.h" @@ -41,7 +42,7 @@ // macOS defines PREALLOCATE as a preprocessor macro in the header sys/vnode.h. // No other BSD seems to do so. The name is used as an identifier in MemAllocation enum. #if defined(__APPLE__) && defined(PREALLOCATE) -#undef PREALLOCATE +# undef PREALLOCATE #endif namespace arrow { diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 1fbcd6a249093..b545d8bcc1003 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -23,7 +23,9 @@ #include "arrow/util/cpu_info.h" #include "arrow/util/hashing.h" -#include +// Include templated definitions for aggregate kernels that must compiled here +// with the SIMD level configured for this compilation unit in the build. 
+#include "arrow/compute/kernels/aggregate_basic.inc.cc" // NOLINT(build/include) namespace arrow { namespace compute { @@ -276,11 +278,6 @@ struct SumImplDefault : public SumImpl { using SumImpl::SumImpl; }; -template -struct MeanImplDefault : public MeanImpl { - using MeanImpl::MeanImpl; -}; - Result> SumInit(KernelContext* ctx, const KernelInitArgs& args) { SumLikeInit visitor( @@ -289,6 +286,14 @@ Result> SumInit(KernelContext* ctx, return visitor.Create(); } +// ---------------------------------------------------------------------- +// Mean implementation + +template +struct MeanImplDefault : public MeanImpl { + using MeanImpl::MeanImpl; +}; + Result> MeanInit(KernelContext* ctx, const KernelInitArgs& args) { MeanKernelInit visitor( @@ -482,8 +487,8 @@ void AddFirstOrLastAggKernel(ScalarAggregateFunction* func, // ---------------------------------------------------------------------- // MinMax implementation -Result> MinMaxInit(KernelContext* ctx, - const KernelInitArgs& args) { +Result> MinMaxInitDefault(KernelContext* ctx, + const KernelInitArgs& args) { ARROW_ASSIGN_OR_RAISE(TypeHolder out_type, args.kernel->signature->out_type().Resolve(ctx, args.inputs)); MinMaxInitState visitor( @@ -532,13 +537,13 @@ struct BooleanAnyImpl : public ScalarAggregator { } if (batch[0].is_scalar()) { const Scalar& scalar = *batch[0].scalar; - this->has_nulls = !scalar.is_valid; - this->any = scalar.is_valid && checked_cast(scalar).value; - this->count += scalar.is_valid; + this->has_nulls |= !scalar.is_valid; + this->any |= scalar.is_valid && checked_cast(scalar).value; + this->count += scalar.is_valid * batch.length; return Status::OK(); } const ArraySpan& data = batch[0].array; - this->has_nulls = data.GetNullCount() > 0; + this->has_nulls |= data.GetNullCount() > 0; this->count += data.length - data.GetNullCount(); arrow::internal::OptionalBinaryBitBlockCounter counter( data.buffers[0].data, data.offset, data.buffers[1].data, data.offset, @@ -603,13 +608,13 @@ struct BooleanAllImpl : public ScalarAggregator { } if (batch[0].is_scalar()) { const Scalar& scalar = *batch[0].scalar; - this->has_nulls = !scalar.is_valid; - this->count += scalar.is_valid; - this->all = !scalar.is_valid || checked_cast(scalar).value; + this->has_nulls |= !scalar.is_valid; + this->count += scalar.is_valid * batch.length; + this->all &= !scalar.is_valid || checked_cast(scalar).value; return Status::OK(); } const ArraySpan& data = batch[0].array; - this->has_nulls = data.GetNullCount() > 0; + this->has_nulls |= data.GetNullCount() > 0; this->count += data.length - data.GetNullCount(); arrow::internal::OptionalBinaryBitBlockCounter counter( data.buffers[1].data, data.offset, data.buffers[0].data, data.offset, @@ -1114,14 +1119,14 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { // Add min max function func = std::make_shared("min_max", Arity::Unary(), min_max_doc, &default_scalar_aggregate_options); - AddMinMaxKernels(MinMaxInit, {null(), boolean()}, func.get()); - AddMinMaxKernels(MinMaxInit, NumericTypes(), func.get()); - AddMinMaxKernels(MinMaxInit, TemporalTypes(), func.get()); - AddMinMaxKernels(MinMaxInit, BaseBinaryTypes(), func.get()); - AddMinMaxKernel(MinMaxInit, Type::FIXED_SIZE_BINARY, func.get()); - AddMinMaxKernel(MinMaxInit, Type::INTERVAL_MONTHS, func.get()); - AddMinMaxKernel(MinMaxInit, Type::DECIMAL128, func.get()); - AddMinMaxKernel(MinMaxInit, Type::DECIMAL256, func.get()); + AddMinMaxKernels(MinMaxInitDefault, {null(), boolean()}, func.get()); + AddMinMaxKernels(MinMaxInitDefault, 
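The any/all changes above replace plain assignments with |=, &= and a batch-length-scaled count so that state accumulates across every Consume() call instead of reflecting only the most recent batch. A toy aggregator showing why the compound assignments matter when Consume runs more than once; the two-batch scenario and names are illustrative only:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Minimal "any" aggregator over batches of booleans with a separate validity
// vector standing in for nulls.
struct AnyState {
  bool any = false;
  bool has_nulls = false;
  int64_t count = 0;

  void Consume(const std::vector<bool>& values, const std::vector<bool>& valid) {
    for (std::size_t i = 0; i < values.size(); ++i) {
      has_nulls |= !valid[i];        // '=' would forget nulls seen in earlier batches
      any |= valid[i] && values[i];  // '=' would forget earlier true values
      count += valid[i];
    }
  }
};

int main() {
  AnyState state;
  state.Consume({true, false}, {true, true});    // batch 1 contains a true value
  state.Consume({false, false}, {true, false});  // batch 2 has no trues, one null
  assert(state.any);  // with plain '=', batch 2 would have reset this to false
  assert(state.has_nulls);
  assert(state.count == 3);
  return 0;
}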
NumericTypes(), func.get()); + AddMinMaxKernels(MinMaxInitDefault, TemporalTypes(), func.get()); + AddMinMaxKernels(MinMaxInitDefault, BaseBinaryTypes(), func.get()); + AddMinMaxKernel(MinMaxInitDefault, Type::FIXED_SIZE_BINARY, func.get()); + AddMinMaxKernel(MinMaxInitDefault, Type::INTERVAL_MONTHS, func.get()); + AddMinMaxKernel(MinMaxInitDefault, Type::DECIMAL128, func.get()); + AddMinMaxKernel(MinMaxInitDefault, Type::DECIMAL256, func.get()); // Add the SIMD variants for min max #if defined(ARROW_HAVE_RUNTIME_AVX2) if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) { diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc new file mode 100644 index 0000000000000..f2151e0a9e029 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc @@ -0,0 +1,1025 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// .inc.cc file to be included in compilation unit where kernels are meant to be +// compiled auto-vectorized by the compiler with different SIMD levels passed +// as compiler flags. +// +// It contains no includes to avoid double inclusion in the compilation unit +// that includes this .inc.cc file. 
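The .inc.cc file exists so the same kernel templates can be compiled several times, once per SIMD level, by textually including them into translation units built with different compiler flags. A compilable single-file sketch of that arrangement; in the real build each wrapper would live in its own .cc file, and the function names below are illustrative rather than the actual Arrow symbols:

#include <cstdint>

// ---- contents that would live in the shared .inc.cc -----------------------
// A plain summation loop the optimizer can auto-vectorize according to the
// flags of whichever translation unit includes this definition.
template <typename T>
T SumSpan(const T* values, int64_t n) {
  T sum = 0;
  for (int64_t i = 0; i < n; ++i) sum += values[i];
  return sum;
}

// ---- per-translation-unit wrappers -----------------------------------------
// Each wrapper would normally sit in its own .cc file compiled with its own
// SIMD flags (e.g. baseline vs -mavx2), yielding separately optimized object
// code behind distinct entry points that a runtime dispatcher chooses between.
int64_t SumInt64Baseline(const int64_t* values, int64_t n) { return SumSpan(values, n); }
int64_t SumInt64Avx2(const int64_t* values, int64_t n) { return SumSpan(values, n); }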
+ +#include +#include +#include +#include +#include + +#include "arrow/compute/api_aggregate.h" +#include "arrow/compute/kernels/aggregate_internal.h" +#include "arrow/compute/kernels/codegen_internal.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/align_util.h" +#include "arrow/util/bit_block_counter.h" +#include "arrow/util/decimal.h" + +namespace arrow::compute::internal { +namespace { + +// ---------------------------------------------------------------------- +// Sum implementation + +template ::Type> +struct SumImpl : public ScalarAggregator { + using ThisType = SumImpl; + using CType = typename TypeTraits::CType; + using SumType = ResultType; + using SumCType = typename TypeTraits::CType; + using OutputType = typename TypeTraits::ScalarType; + + SumImpl(std::shared_ptr out_type, ScalarAggregateOptions options_) + : out_type(std::move(out_type)), options(std::move(options_)) {} + + Status Consume(KernelContext*, const ExecSpan& batch) override { + if (batch[0].is_array()) { + const ArraySpan& data = batch[0].array; + this->count += data.length - data.GetNullCount(); + this->nulls_observed = this->nulls_observed || data.GetNullCount(); + + if (!options.skip_nulls && this->nulls_observed) { + // Short-circuit + return Status::OK(); + } + + if (is_boolean_type::value) { + this->sum += GetTrueCount(data); + } else { + this->sum += SumArray(data); + } + } else { + const Scalar& data = *batch[0].scalar; + this->count += data.is_valid * batch.length; + this->nulls_observed = this->nulls_observed || !data.is_valid; + if (data.is_valid) { + this->sum += internal::UnboxScalar::Unbox(data) * batch.length; + } + } + return Status::OK(); + } + + Status MergeFrom(KernelContext*, KernelState&& src) override { + const auto& other = checked_cast(src); + this->count += other.count; + this->sum += other.sum; + this->nulls_observed = this->nulls_observed || other.nulls_observed; + return Status::OK(); + } + + Status Finalize(KernelContext*, Datum* out) override { + if ((!options.skip_nulls && this->nulls_observed) || + (this->count < options.min_count)) { + out->value = std::make_shared(out_type); + } else { + out->value = std::make_shared(this->sum, out_type); + } + return Status::OK(); + } + + size_t count = 0; + bool nulls_observed = false; + SumCType sum = 0; + std::shared_ptr out_type; + ScalarAggregateOptions options; +}; + +template +struct NullImpl : public ScalarAggregator { + using ScalarType = typename TypeTraits::ScalarType; + + explicit NullImpl(const ScalarAggregateOptions& options_) : options(options_) {} + + Status Consume(KernelContext*, const ExecSpan& batch) override { + if (batch[0].is_scalar() || batch[0].array.GetNullCount() > 0) { + // If the batch is a scalar or an array with elements, set is_empty to false + is_empty = false; + } + return Status::OK(); + } + + Status MergeFrom(KernelContext*, KernelState&& src) override { + const auto& other = checked_cast(src); + this->is_empty &= other.is_empty; + return Status::OK(); + } + + Status Finalize(KernelContext*, Datum* out) override { + if ((options.skip_nulls || this->is_empty) && options.min_count == 0) { + // Return 0 if the remaining data is empty + out->value = output_empty(); + } else { + out->value = MakeNullScalar(TypeTraits::type_singleton()); + } + return Status::OK(); + } + + virtual std::shared_ptr output_empty() = 0; + + bool is_empty = true; + ScalarAggregateOptions options; +}; + +template +struct NullSumImpl : public NullImpl { + using ScalarType = 
typename TypeTraits::ScalarType; + + explicit NullSumImpl(const ScalarAggregateOptions& options_) + : NullImpl(options_) {} + + std::shared_ptr output_empty() override { + return std::make_shared(0); + } +}; + +template