# Skip to content
#
# GH-44815: [C++][Parquet] Add an example to dump statistics read as `a… #40682
#
# GH-44815: [C++][Parquet] Add an example to dump statistics read as `a…
#
# GH-44815: [C++][Parquet] Add an example to dump statistics read as `a… #40682
#
# Workflow file for this run

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Workflow that builds and tests the Arrow C++ libraries.
name: C++

# Triggers: pushes (every branch except dependabot/**, and every tag) and
# pull requests. Both triggers carry the same path filter, listing the files
# that can affect the C++ build or the CI tooling it relies on.
on:
  push:
    branches:
      - '**'
      - '!dependabot/**'
    tags:
      - '**'
    paths:
      - '.dockerignore'
      - '.github/workflows/cpp.yml'
      - 'ci/conda_env_*'
      - 'ci/docker/**'
      - 'ci/scripts/cpp_*'
      - 'ci/scripts/install_azurite.sh'
      - 'ci/scripts/install_gcs_testbench.sh'
      - 'ci/scripts/install_minio.sh'
      - 'ci/scripts/msys2_*'
      - 'ci/scripts/util_*'
      - 'cpp/**'
      - 'docker-compose.yml'
      - 'format/Flight.proto'
      - 'testing'
  pull_request:
    # Keep this list in sync with `push.paths` above.
    paths:
      - '.dockerignore'
      - '.github/workflows/cpp.yml'
      - 'ci/conda_env_*'
      - 'ci/docker/**'
      - 'ci/scripts/cpp_*'
      - 'ci/scripts/install_azurite.sh'
      - 'ci/scripts/install_gcs_testbench.sh'
      - 'ci/scripts/install_minio.sh'
      - 'ci/scripts/msys2_*'
      - 'ci/scripts/util_*'
      - 'cpp/**'
      - 'docker-compose.yml'
      - 'format/Flight.proto'
      - 'testing'
# Runs are grouped by repository, ref and workflow; starting a new run in a
# group cancels the one still in progress. The ref part of the key is
# `github.head_ref` when it is set and the commit SHA otherwise.
concurrency:
  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
  cancel-in-progress: true

# The workflow token only gets read access to repository contents.
permissions:
  contents: read

# Environment shared by every job in this workflow.
env:
  ARCHERY_DEBUG: 1
  # Quoted so that YAML 1.1 tooling does not read it as the boolean false.
  ARROW_ENABLE_TIMING_TESTS: "OFF"
  DOCKER_VOLUME_PREFIX: ".docker/"
jobs:
  # Produces the JSON build matrix consumed by the `docker` job below.
  docker-targets:
    name: Docker targets
    runs-on: ubuntu-latest
    outputs:
      targets: ${{ steps.detect-targets.outputs.targets }}
    steps:
      - name: Detect targets
        id: detect-targets
        run: |
          # Open a multi-line step output named "targets", delimited by the
          # word JSON, and start the JSON array.
          echo "targets<<JSON" >> "$GITHUB_OUTPUT"
          echo "[" >> "$GITHUB_OUTPUT"
          cat <<JSON >> "$GITHUB_OUTPUT"
          {
            "arch": "amd64",
            "clang-tools": "14",
            "image": "conda-cpp",
            "llvm": "14",
            "runs-on": "ubuntu-latest",
            "simd-level": "AVX2",
            "title": "AMD64 Conda C++ AVX2",
            "ubuntu": "22.04"
          },
          {
            "arch": "amd64",
            "clang-tools": "14",
            "image": "ubuntu-cpp-sanitizer",
            "llvm": "14",
            "runs-on": "ubuntu-latest",
            "title": "AMD64 Ubuntu 22.04 C++ ASAN UBSAN",
            "ubuntu": "22.04"
          }
          JSON
          # The ARM64 target runs on a self-hosted runner and is only added
          # when the repository owner is "apache" (presumably the only place
          # such a runner is registered -- confirm).
          # The heredoc terminator below must stay at the script's base
          # indentation because `<<JSON` does not strip leading whitespace.
          if [ "$GITHUB_REPOSITORY_OWNER" = "apache" ]; then
            echo "," >> "$GITHUB_OUTPUT"
            cat <<JSON >> "$GITHUB_OUTPUT"
            {
              "arch": "arm64v8",
              "clang-tools": "10",
              "image": "ubuntu-cpp",
              "llvm": "10",
              "runs-on": ["self-hosted", "arm", "linux"],
              "title": "ARM64 Ubuntu 20.04 C++",
              "ubuntu": "20.04"
            }
          JSON
          fi
          # Close the JSON array, then close the multi-line output.
          echo "]" >> "$GITHUB_OUTPUT"
          echo "JSON" >> "$GITHUB_OUTPUT"

  # Runs `archery docker run` for every target emitted by `docker-targets`,
  # and pushes the resulting image for pushes to main of apache/arrow.
  docker:
    name: ${{ matrix.title }}
    needs: docker-targets
    runs-on: ${{ matrix.runs-on }}
    # Skipped for pull requests whose title contains "WIP".
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 75
    strategy:
      fail-fast: false
      matrix:
        include: ${{ fromJson(needs.docker-targets.outputs.targets) }}
    env:
      ARCH: ${{ matrix.arch }}
      ARROW_SIMD_LEVEL: ${{ matrix.simd-level }}
      CLANG_TOOLS: ${{ matrix.clang-tools }}
      LLVM: ${{ matrix.llvm }}
      UBUNTU: ${{ matrix.ubuntu }}
    steps:
      - name: Checkout Arrow
        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac  # v4.0.0
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Cache Docker Volumes
        uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2  # v4.0.0
        with:
          path: .docker
          key: ${{ matrix.image }}-${{ hashFiles('cpp/**') }}
          restore-keys: ${{ matrix.image }}-
      - name: Setup Python on hosted runner
        if: |
          matrix.runs-on == 'ubuntu-latest'
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b  # v5.3.0
        with:
          python-version: "3"
      - name: Setup Python on self-hosted runner
        if: |
          contains(matrix.runs-on, 'self-hosted')
        run: |
          sudo apt update
          sudo apt install -y --no-install-recommends python3 python3-dev python3-pip
          python3 -m pip install -U pip
      - name: Setup Archery
        run: python3 -m pip install -e dev/archery[docker]
      - name: Execute Docker Build
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        run: |
          # GH-40558: reduce ASLR to avoid ASAN/LSAN crashes
          sudo sysctl -w vm.mmap_rnd_bits=28
          source ci/scripts/util_enable_core_dumps.sh
          archery docker run ${{ matrix.image }}
      - name: Docker Push
        if: >-
          success() &&
          github.event_name == 'push' &&
          github.repository == 'apache/arrow' &&
          github.ref_name == 'main'
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        # A failed push does not fail the job.
        continue-on-error: true
        run: archery docker push ${{ matrix.image }}

  # Lists the CMake presets and runs the minimal build example through
  # docker compose.
  build-example:
    name: C++ Minimal Build Example
    runs-on: ubuntu-latest
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 45
    steps:
      - name: Checkout Arrow
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Check CMake presets
        run: |
          cd cpp
          cmake --list-presets
      - name: Run minimal example
        run: |
          cd cpp/examples/minimal_build
          docker compose run --rm minimal

  # Native build and test on macOS 13 (AMD64) and macOS 14 (ARM64).
  macos:
    name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} C++
    runs-on: macos-${{ matrix.macos-version }}
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 75
    strategy:
      fail-fast: false
      matrix:
        include:
          - architecture: AMD64
            macos-version: "13"
          - architecture: ARM64
            macos-version: "14"
    # ON/OFF values are quoted so that YAML 1.1 tooling keeps them as strings.
    env:
      ARROW_AZURE: "ON"
      ARROW_BUILD_TESTS: "ON"
      ARROW_DATASET: "ON"
      ARROW_FLIGHT: "ON"
      ARROW_GANDIVA: "ON"
      ARROW_GCS: "ON"
      ARROW_HDFS: "ON"
      ARROW_HOME: /tmp/local
      ARROW_JEMALLOC: "ON"
      ARROW_ORC: "ON"
      ARROW_PARQUET: "ON"
      ARROW_S3: "ON"
      ARROW_SUBSTRAIT: "ON"
      ARROW_WITH_BROTLI: "ON"
      ARROW_WITH_BZ2: "ON"
      ARROW_WITH_LZ4: "ON"
      # GH-36013 disabling opentelemetry here because we can't
      # get the patched version from conda
      # ARROW_WITH_OPENTELEMETRY: ON
      ARROW_WITH_SNAPPY: "ON"
      ARROW_WITH_ZLIB: "ON"
      ARROW_WITH_ZSTD: "ON"
      GTest_SOURCE: BUNDLED
    steps:
      - name: CPU Info
        run: |
          sysctl -a | grep cpu
          sysctl -a | grep "hw.optional"
      - name: Checkout Arrow
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Install Dependencies
        run: |
          # pkg-config formula is deprecated but it's still installed
          # in GitHub Actions runner now. We can remove this once
          # pkg-config formula is removed from GitHub Actions runner.
          brew uninstall pkg-config || :
          # NOTE(review): the versioned formula name was garbled in this copy
          # of the file; reconstructed as pkg-config@0.29.2 -- confirm.
          brew uninstall pkg-config@0.29.2 || :
          brew bundle --file=cpp/Brewfile
      - name: Install MinIO
        run: |
          $(brew --prefix bash)/bin/bash \
            ci/scripts/install_minio.sh latest ${ARROW_HOME}
      - name: Set up Python
        # NOTE(review): the action ref was garbled in this copy of the file;
        # pinned to the same v5.3.0 commit the `docker` job uses -- confirm.
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b  # v5.3.0
        with:
          python-version: "3.12"
      - name: Install Google Cloud Storage Testbench
        run: ci/scripts/install_gcs_testbench.sh default
      - name: Install Azurite Storage Emulator
        run: ci/scripts/install_azurite.sh
      - name: Setup ccache
        run: |
          ci/scripts/ccache_setup.sh
      - name: ccache info
        id: ccache-info
        run: |
          echo "cache-dir=$(ccache --get-config cache_dir)" >> "$GITHUB_OUTPUT"
      - name: Cache ccache
        uses: actions/cache@v4
        with:
          path: ${{ steps.ccache-info.outputs.cache-dir }}
          key: cpp-ccache-macos-${{ matrix.macos-version }}-${{ hashFiles('cpp/**') }}
          restore-keys: cpp-ccache-macos-${{ matrix.macos-version }}-
      - name: Build
        run: |
          ci/scripts/cpp_build.sh $(pwd) $(pwd)/build
      - name: Test
        shell: bash
        run: |
          sudo sysctl -w kern.coredump=1
          sudo sysctl -w kern.corefile=/tmp/core.%N.%P
          ulimit -c unlimited # must enable within the same shell
          ci/scripts/cpp_test.sh $(pwd) $(pwd)/build

  # Build and test on windows-2019 inside the Visual Studio 2019 x64
  # environment, using the Ninja generator.
  windows:
    name: ${{ matrix.title }}
    runs-on: ${{ matrix.os }}
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        os:
          - windows-2019
        include:
          - os: windows-2019
            simd-level: AVX2
            title: AMD64 Windows 2019 C++17 AVX2
    # ON/OFF values are quoted so that YAML 1.1 tooling keeps them as strings.
    env:
      ARROW_BOOST_USE_SHARED: "OFF"
      ARROW_BUILD_BENCHMARKS: "ON"
      ARROW_BUILD_SHARED: "ON"
      ARROW_BUILD_STATIC: "OFF"
      ARROW_BUILD_TESTS: "ON"
      ARROW_DATASET: "ON"
      ARROW_FLIGHT: "OFF"
      ARROW_HDFS: "ON"
      ARROW_HOME: /usr
      ARROW_JEMALLOC: "OFF"
      ARROW_MIMALLOC: "ON"
      ARROW_ORC: "ON"
      ARROW_PARQUET: "ON"
      ARROW_SIMD_LEVEL: ${{ matrix.simd-level }}
      ARROW_SUBSTRAIT: "ON"
      ARROW_USE_GLOG: "OFF"
      ARROW_VERBOSE_THIRDPARTY_BUILD: "OFF"
      ARROW_WITH_BROTLI: "OFF"
      ARROW_WITH_BZ2: "OFF"
      ARROW_WITH_LZ4: "OFF"
      ARROW_WITH_OPENTELEMETRY: "OFF"
      ARROW_WITH_SNAPPY: "ON"
      ARROW_WITH_ZLIB: "ON"
      ARROW_WITH_ZSTD: "ON"
      BOOST_SOURCE: BUNDLED
      CMAKE_CXX_STANDARD: "17"
      CMAKE_GENERATOR: Ninja
      CMAKE_INSTALL_LIBDIR: bin
      CMAKE_INSTALL_PREFIX: /usr
      CMAKE_UNITY_BUILD: "ON"
    steps:
      - name: Disable Crash Dialogs
        run: |
          reg add `
            "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" `
            /v DontShowUI `
            /t REG_DWORD `
            /d 1 `
            /f
      - name: Checkout Arrow
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Download Timezone Database
        shell: bash
        run: ci/scripts/download_tz_database.sh
      - name: Install ccache
        shell: bash
        run: |
          ci/scripts/install_ccache.sh 4.6.3 /usr
      - name: Setup ccache
        shell: bash
        run: |
          ci/scripts/ccache_setup.sh
      - name: ccache info
        id: ccache-info
        shell: bash
        run: |
          echo "cache-dir=$(ccache --get-config cache_dir)" >> "$GITHUB_OUTPUT"
      - name: Cache ccache
        uses: actions/cache@v4
        with:
          path: ${{ steps.ccache-info.outputs.cache-dir }}
          key: cpp-ccache-windows-${{ env.CACHE_VERSION }}-${{ hashFiles('cpp/**') }}
          restore-keys: cpp-ccache-windows-${{ env.CACHE_VERSION }}-
        env:
          # We can invalidate the current cache by updating this.
          CACHE_VERSION: "2022-09-13"
      - name: Build
        shell: cmd
        run: |
          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          bash -c "ci/scripts/cpp_build.sh $(pwd) $(pwd)/build"
      - name: Test
        shell: bash
        run: |
          # For ORC
          export TZDIR=/c/msys64/usr/share/zoneinfo
          ci/scripts/cpp_test.sh $(pwd) $(pwd)/build

  # Build and test on windows-2019 under MSYS2, once for the MINGW64 and once
  # for the CLANG64 environment.
  windows-mingw:
    name: AMD64 Windows MinGW ${{ matrix.msystem_upper }} C++
    runs-on: windows-2019
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    # Build may take 1h+ without cache.
    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
        include:
          - msystem_lower: mingw64
            msystem_upper: MINGW64
          - msystem_lower: clang64
            msystem_upper: CLANG64
    # ON/OFF values are quoted so that YAML 1.1 tooling keeps them as strings.
    env:
      ARROW_BUILD_SHARED: "ON"
      ARROW_BUILD_STATIC: "OFF"
      ARROW_BUILD_TESTS: "ON"
      ARROW_BUILD_TYPE: release
      ARROW_DATASET: "ON"
      ARROW_FLIGHT: "ON"
      ARROW_FLIGHT_SQL: "ON"
      ARROW_GANDIVA: "ON"
      ARROW_GCS: "ON"
      ARROW_HDFS: "OFF"
      ARROW_HOME: /${{ matrix.msystem_lower}}
      ARROW_JEMALLOC: "OFF"
      ARROW_PARQUET: "ON"
      ARROW_S3: "ON"
      ARROW_SUBSTRAIT: "ON"
      ARROW_USE_GLOG: "OFF"
      ARROW_VERBOSE_THIRDPARTY_BUILD: "OFF"
      ARROW_WITH_BROTLI: "ON"
      ARROW_WITH_BZ2: "ON"
      ARROW_WITH_LZ4: "ON"
      ARROW_WITH_OPENTELEMETRY: "OFF"
      ARROW_WITH_SNAPPY: "ON"
      ARROW_WITH_ZLIB: "ON"
      ARROW_WITH_ZSTD: "ON"
      # Don't use preinstalled Boost by empty BOOST_ROOT
      BOOST_ROOT: ""
      ARROW_CMAKE_ARGS: >-
        -DARROW_PACKAGE_PREFIX=/${{ matrix.msystem_lower}}
        -DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON
      # We can't use unity build because we don't have enough memory on
      # GitHub Actions.
      # CMAKE_UNITY_BUILD: ON
      GTest_SOURCE: BUNDLED
    steps:
      - name: Disable Crash Dialogs
        run: |
          reg add `
            "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" `
            /v DontShowUI `
            /t REG_DWORD `
            /d 1 `
            /f
      - name: Checkout Arrow
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: recursive
      - uses: msys2/setup-msys2@v2
        with:
          msystem: ${{ matrix.msystem_upper }}
          update: true
      - name: Setup MSYS2
        shell: msys2 {0}
        run: ci/scripts/msys2_setup.sh cpp
      - name: Cache ccache
        uses: actions/cache@v4
        with:
          path: ccache
          key: cpp-ccache-${{ matrix.msystem_lower}}-${{ hashFiles('cpp/**') }}
          restore-keys: cpp-ccache-${{ matrix.msystem_lower}}-
      - name: Build
        shell: msys2 {0}
        run: |
          export CMAKE_BUILD_PARALLEL_LEVEL=$NUMBER_OF_PROCESSORS
          ci/scripts/cpp_build.sh "$(pwd)" "$(pwd)/build"
      - name: Download Timezone Database
        shell: bash
        run: ci/scripts/download_tz_database.sh
      - name: Download MinIO
        shell: msys2 {0}
        run: |
          mkdir -p /usr/local/bin
          wget \
            --output-document /usr/local/bin/minio.exe \
            https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z
          chmod +x /usr/local/bin/minio.exe
      - name: Set up Python
        # NOTE(review): the action ref was garbled in this copy of the file;
        # pinned to the same v5.3.0 commit the `docker` job uses -- confirm.
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b  # v5.3.0
        id: python-install
        with:
          python-version: "3.9"
      - name: Install Google Cloud Storage Testbench
        shell: msys2 {0}
        env:
          PIPX_BIN_DIR: /usr/local/bin
          PIPX_BASE_PYTHON: ${{ steps.python-install.outputs.python-path }}
        run: |
          ci/scripts/install_gcs_testbench.sh default
      - name: Test
        shell: msys2 {0}
        run: |
          ci/scripts/cpp_test.sh "$(pwd)" "$(pwd)/build"