diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 4a3707404bccf..3117872e21680 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -62,6 +62,8 @@ jobs:
       image_docs_url_link: ${{ steps.infra-image-link.outputs.image_docs_url_link }}
       image_lint_url: ${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}
       image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }}
+      image_sparkr_url: ${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}
+      image_sparkr_url_link: ${{ steps.infra-image-link.outputs.image_sparkr_url_link }}
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v4
@@ -154,6 +156,14 @@ jobs:
         IMG_NAME="apache-spark-ci-image-lint:${{ inputs.branch }}-${{ github.run_id }}"
         IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
         echo "image_lint_url=$IMG_URL" >> $GITHUB_OUTPUT
+    - name: Generate infra image URL (SparkR)
+      id: infra-image-sparkr-outputs
+      run: |
+        # Convert to lowercase to meet Docker repo name requirement
+        REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
+        IMG_NAME="apache-spark-ci-image-sparkr:${{ inputs.branch }}-${{ github.run_id }}"
+        IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
+        echo "image_sparkr_url=$IMG_URL" >> $GITHUB_OUTPUT
     - name: Link the docker images
       id: infra-image-link
       run: |
@@ -162,9 +172,11 @@ jobs:
         if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
           echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
           echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
+          echo "image_sparkr_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
         else
           echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT
           echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT
+          echo "image_sparkr_url_link=${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}" >> $GITHUB_OUTPUT
         fi
 
   # Build: build Spark and run the tests for specified modules.
@@ -405,6 +417,17 @@ jobs:
             ${{ needs.precondition.outputs.image_lint_url }}
           # Use the infra image cache to speed up
           cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ inputs.branch }}
+      - name: Build and push (SparkR)
+        if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
+        id: docker_build_sparkr
+        uses: docker/build-push-action@v6
+        with:
+          context: ./dev/spark-test-image/sparkr/
+          push: true
+          tags: |
+            ${{ needs.precondition.outputs.image_sparkr_url }}
+          # Use the infra image cache to speed up
+          cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ inputs.branch }}
 
 
   pyspark:
@@ -564,7 +587,7 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 180
     container:
-      image: ${{ needs.precondition.outputs.image_url }}
+      image: ${{ needs.precondition.outputs.image_sparkr_url_link }}
     env:
       HADOOP_PROFILE: ${{ inputs.hadoop }}
       HIVE_PROFILE: hive2.3
diff --git a/.github/workflows/build_infra_images_cache.yml b/.github/workflows/build_infra_images_cache.yml
index b82d0633b0cee..a6beacedeebd4 100644
--- a/.github/workflows/build_infra_images_cache.yml
+++ b/.github/workflows/build_infra_images_cache.yml
@@ -29,6 +29,7 @@ on:
     - 'dev/infra/Dockerfile'
     - 'dev/spark-test-image/docs/Dockerfile'
     - 'dev/spark-test-image/lint/Dockerfile'
+    - 'dev/spark-test-image/sparkr/Dockerfile'
     - '.github/workflows/build_infra_images_cache.yml'
   # Create infra image when cutting down branches/tags
   create:
@@ -88,3 +89,16 @@ jobs:
       - name: Image digest (Linter)
         if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
         run: echo ${{ steps.docker_build_lint.outputs.digest }}
+      - name: Build and push (SparkR)
+        if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
+        id: docker_build_sparkr
+        uses: docker/build-push-action@v6
+        with:
+          context: ./dev/spark-test-image/sparkr/
+          push: true
+          tags: ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}-static
+          cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}
+          cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }},mode=max
+      - name: Image digest (SparkR)
+        if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
+        run: echo ${{ steps.docker_build_sparkr.outputs.digest }}
diff --git a/dev/spark-test-image/sparkr/Dockerfile b/dev/spark-test-image/sparkr/Dockerfile
new file mode 100644
index 0000000000000..43260c714a550
--- /dev/null
+++ b/dev/spark-test-image/sparkr/Dockerfile
@@ -0,0 +1,77 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Image for building and testing SparkR. Based on Ubuntu 22.04.
+# See also in https://hub.docker.com/_/ubuntu
+FROM ubuntu:jammy-20240911.1
+LABEL org.opencontainers.image.authors="Apache Spark project <dev@spark.apache.org>"
+LABEL org.opencontainers.image.licenses="Apache-2.0"
+LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image for SparkR"
+# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
+LABEL org.opencontainers.image.version=""
+
+ENV FULL_REFRESH_DATE=20241114
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    ca-certificates \
+    curl \
+    gfortran \
+    git \
+    gnupg \
+    libcurl4-openssl-dev \
+    libfontconfig1-dev \
+    libfreetype6-dev \
+    libfribidi-dev \
+    libgit2-dev \
+    libharfbuzz-dev \
+    libjpeg-dev \
+    liblapack-dev \
+    libopenblas-dev \
+    libpng-dev \
+    libpython3-dev \
+    libssl-dev \
+    libtiff5-dev \
+    libxml2-dev \
+    pandoc \
+    pkg-config \
+    qpdf \
+    r-base \
+    software-properties-common \
+    wget \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' >> /etc/apt/sources.list
+RUN gpg --keyserver hkps://keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9
+RUN gpg -a --export E298A3A825C0D65DFD57CBB651716619E084DAB9 | apt-key add -
+RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/'
+
+# See more in SPARK-39959, roxygen2 < 7.2.1
+RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', \
+    'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', \
+    'ggplot2', 'mvtnorm', 'statmod', 'xml2'), repos='https://cloud.r-project.org/')" && \
+    Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='https://cloud.r-project.org')" && \
+    Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" && \
+    Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" && \
+    Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"
+
+# See more in SPARK-39735
+ENV R_LIBS_SITE="/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library"