From faef3fa9b2c34343eafa3a53a7216509e707aa9f Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Tue, 10 Dec 2024 19:39:10 +0800 Subject: [PATCH] [SPARK-50527][INFRA] Add a separate docker file for python 3.12 daily build ### What changes were proposed in this pull request? Add a separate docker file for python 3.12 daily build ### Why are the changes needed? to isolate the testing environment ### Does this PR introduce _any_ user-facing change? no, infra-only ### How was this patch tested? PR builder with `env`: ``` default: '{"PYSPARK_IMAGE_TO_TEST": "python-312", "PYTHON_TO_TEST": "python3.12"}' ``` https://github.com/zhengruifeng/spark/runs/34169304629 ### Was this patch authored or co-authored using generative AI tooling? no Closes #49122 from zhengruifeng/py_image_312. Authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- .../workflows/build_infra_images_cache.yml | 14 ++++ .github/workflows/build_python_3.12.yml | 1 + dev/spark-test-image/python-312/Dockerfile | 83 +++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100644 dev/spark-test-image/python-312/Dockerfile diff --git a/.github/workflows/build_infra_images_cache.yml b/.github/workflows/build_infra_images_cache.yml index 031a09af69541..b4e7a2cbd0b37 100644 --- a/.github/workflows/build_infra_images_cache.yml +++ b/.github/workflows/build_infra_images_cache.yml @@ -32,6 +32,7 @@ on: - 'dev/spark-test-image/sparkr/Dockerfile' - 'dev/spark-test-image/python-309/Dockerfile' - 'dev/spark-test-image/python-310/Dockerfile' + - 'dev/spark-test-image/python-312/Dockerfile' - '.github/workflows/build_infra_images_cache.yml' # Create infra image when cutting down branches/tags create: @@ -130,3 +131,16 @@ jobs: - name: Image digest (PySpark with Python 3.10) if: hashFiles('dev/spark-test-image/python-310/Dockerfile') != '' run: echo ${{ steps.docker_build_pyspark_python_310.outputs.digest }} + - name: Build and push (PySpark with Python 3.12) + if: 
hashFiles('dev/spark-test-image/python-312/Dockerfile') != '' + id: docker_build_pyspark_python_312 + uses: docker/build-push-action@v6 + with: + context: ./dev/spark-test-image/python-312/ + push: true + tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-312-cache:${{ github.ref_name }}-static + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-312-cache:${{ github.ref_name }} + cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-312-cache:${{ github.ref_name }},mode=max + - name: Image digest (PySpark with Python 3.12) + if: hashFiles('dev/spark-test-image/python-312/Dockerfile') != '' + run: echo ${{ steps.docker_build_pyspark_python_312.outputs.digest }} diff --git a/.github/workflows/build_python_3.12.yml b/.github/workflows/build_python_3.12.yml index e1fd45a7d8838..2503a2f158357 100644 --- a/.github/workflows/build_python_3.12.yml +++ b/.github/workflows/build_python_3.12.yml @@ -36,6 +36,7 @@ jobs: hadoop: hadoop3 envs: >- { + "PYSPARK_IMAGE_TO_TEST": "python-312", "PYTHON_TO_TEST": "python3.12" } jobs: >- diff --git a/dev/spark-test-image/python-312/Dockerfile b/dev/spark-test-image/python-312/Dockerfile new file mode 100644 index 0000000000000..ecfb1ab07123c --- /dev/null +++ b/dev/spark-test-image/python-312/Dockerfile @@ -0,0 +1,83 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Image for building and testing Spark branches. Based on Ubuntu 22.04.
+# See also in https://hub.docker.com/_/ubuntu
+FROM ubuntu:jammy-20240911.1
+LABEL org.opencontainers.image.authors="Apache Spark project "
+LABEL org.opencontainers.image.licenses="Apache-2.0"
+LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with Python 3.12"
+# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
+LABEL org.opencontainers.image.version=""
+
+ENV FULL_REFRESH_DATE=20241206
+
+ENV DEBIAN_FRONTEND=noninteractive DEBCONF_NONINTERACTIVE_SEEN=true
+
+# System packages for the test image; apt lists are removed in this same layer so they do not persist in it.
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    ca-certificates \
+    curl \
+    gfortran \
+    git \
+    gnupg \
+    libcurl4-openssl-dev \
+    libfontconfig1-dev \
+    libfreetype6-dev \
+    libfribidi-dev \
+    libgit2-dev \
+    libharfbuzz-dev \
+    libjpeg-dev \
+    liblapack-dev \
+    libopenblas-dev \
+    libpng-dev \
+    libpython3-dev \
+    libssl-dev \
+    libtiff5-dev \
+    libxml2-dev \
+    openjdk-17-jdk-headless \
+    pkg-config \
+    qpdf \
+    tzdata \
+    software-properties-common \
+    wget \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python 3.12 from the deadsnakes PPA; repository setup, install and apt cleanup share one layer.
+RUN add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update && apt-get install -y \
+    python3.12 \
+    && apt-get autoremove --purge -y \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG BASIC_PIP_PKGS="numpy pyarrow>=18.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
+# Python deps for Spark Connect
+ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.28.3 googleapis-common-protos==1.65.0 graphviz==0.20.3"
+
+# Bootstrap pip for Python 3.12; bash with pipefail makes a failed get-pip.py download fail the build.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python3.12
+# mlflow needs this. The requirement is quoted because an unquoted ">=" is parsed by the
+# shell as an output redirection, which would drop the version constraint.
+RUN python3.12 -m pip install --ignore-installed 'blinker>=1.6.2'
+RUN python3.12 -m pip install $BASIC_PIP_PKGS unittest-xml-reporting $CONNECT_PIP_PKGS lxml && \
+    python3.12 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \
+    python3.12 -m pip install torcheval && \
+    python3.12 -m pip cache purge
\