diff --git a/.github/workflows/spark_deployment/Dockerfile b/.github/workflows/spark_deployment/Dockerfile
index dab57200..7007022f 100644
--- a/.github/workflows/spark_deployment/Dockerfile
+++ b/.github/workflows/spark_deployment/Dockerfile
@@ -1,10 +1,13 @@
-FROM openjdk:11-jre-slim
+# Use a multi-arch base image
+FROM --platform=$BUILDPLATFORM openjdk:11-jre-slim
 
 # Set environment variables
-ENV SPARK_VERSION=3.5.1
-ENV HADOOP_VERSION=3.3.4
-ENV ICEBERG_VERSION=1.4.2
-ENV AWS_SDK_VERSION=1.12.581
+ARG BUILDPLATFORM
+ARG TARGETPLATFORM
+ARG SPARK_VERSION=3.5.1
+ARG HADOOP_VERSION=3.3.4
+ARG ICEBERG_VERSION=1.4.2
+ARG AWS_SDK_VERSION=1.12.581
 
 # Install necessary tools
 RUN apt-get update && apt-get install -y curl wget procps rsync ssh
@@ -12,23 +15,32 @@ RUN apt-get update && apt-get install -y curl wget procps rsync ssh
 # Download and install Spark
 RUN wget https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
     tar -xvzf spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
-    mv spark-${SPARK_VERSION}-bin-hadoop3 /spark && \
+    mv spark-${SPARK_VERSION}-bin-hadoop3 /opt/spark && \
     rm spark-${SPARK_VERSION}-bin-hadoop3.tgz
 
 # Set Spark environment variables
-ENV SPARK_HOME=/spark
+ENV SPARK_HOME=/opt/spark
 ENV PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
 
 # Download necessary JARs
-RUN mkdir -p /spark/jars && \
-    wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_2.12-${ICEBERG_VERSION}.jar -O /spark/jars/iceberg-spark-runtime.jar && \
-    wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar -O /spark/jars/iceberg-aws-bundle.jar && \
-    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -O /spark/jars/hadoop-aws.jar && \
-    wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -O /spark/jars/aws-java-sdk-bundle.jar
+RUN mkdir -p /opt/spark/jars && \
+    wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_2.12-${ICEBERG_VERSION}.jar -O /opt/spark/jars/iceberg-spark-runtime.jar && \
+    wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar -O /opt/spark/jars/iceberg-aws-bundle.jar && \
+    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -O /opt/spark/jars/hadoop-aws.jar && \
+    wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -O /opt/spark/jars/aws-java-sdk-bundle.jar
 
 # Create directory for Spark events
 RUN mkdir -p /tmp/spark-events
 
-WORKDIR /spark
+# Create a non-root user to run Spark
+RUN useradd -ms /bin/bash spark
+RUN chown -R spark:spark /opt/spark /tmp/spark-events
+
+# Switch to non-root user
+USER spark
+WORKDIR /opt/spark
+
+# Expose Spark ports
+EXPOSE 8080 7077 10000
 
 CMD ["bash"]
\ No newline at end of file
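Note that `FROM --platform=$BUILDPLATFORM` and the `BUILDPLATFORM`/`TARGETPLATFORM` build args are only populated when the image is built with BuildKit. A minimal sketch of the corresponding multi-arch build, assuming Docker Buildx is available and reusing the placeholder tag from the compose file below (not part of this diff):

```sh
# One-time: create and select a Buildx builder capable of cross-building.
docker buildx create --name spark-builder --use

# Build and push amd64 + arm64 variants in one invocation; BuildKit fills in
# BUILDPLATFORM/TARGETPLATFORM automatically for each target platform.
docker buildx build \
  --platform linux/amd64,linux/arm64 \
  -t yourdockerhubusername/spark-s3-iceberg:v2 \
  --push \
  .github/workflows/spark_deployment
```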
diff --git a/.github/workflows/spark_deployment/docker-compose.yml b/.github/workflows/spark_deployment/docker-compose.yml
index af8ff4da..6d7ae35c 100644
--- a/.github/workflows/spark_deployment/docker-compose.yml
+++ b/.github/workflows/spark_deployment/docker-compose.yml
@@ -6,8 +6,8 @@ networks:
 
 services:
   spark-master:
-    image: snowplow/spark-s3-iceberg:v1
-    command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
+    image: yourdockerhubusername/spark-s3-iceberg:v2
+    command: ["/bin/bash", "-c", "/opt/spark/sbin/start-master.sh -h spark-master --properties-file /opt/spark/conf/spark-defaults.conf && tail -f /opt/spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
     hostname: spark-master
     ports:
       - '8080:8080'
@@ -22,13 +22,13 @@ services:
       - AWS_REGION=eu-west-1
       - AWS_DEFAULT_REGION=eu-west-1
     volumes:
-      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
+      - ./spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
     networks:
      - spark-network
 
   spark-worker:
-    image: snowplow/spark-s3-iceberg:v1
-    command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
+    image: yourdockerhubusername/spark-s3-iceberg:v2
+    command: ["/bin/bash", "-c", "sleep 10 && /opt/spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /opt/spark/conf/spark-defaults.conf && tail -f /opt/spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
     depends_on:
       - spark-master
     environment:
@@ -42,13 +42,13 @@ services:
       - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
     volumes:
-      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
+      - ./spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
     networks:
       - spark-network
 
   thrift-server:
-    image: snowplow/spark-s3-iceberg:v1
-    command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
+    image: yourdockerhubusername/spark-s3-iceberg:v2
+    command: ["/bin/bash", "-c", "sleep 30 && /opt/spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /opt/spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
     ports:
       - '10000:10000'
     depends_on:
@@ -61,6 +61,6 @@
       - AWS_REGION=eu-west-1
       - AWS_DEFAULT_REGION=eu-west-1
     volumes:
-      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
+      - ./spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
     networks:
       - spark-network
\ No newline at end of file
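With these changes applied, the Thrift server should accept JDBC connections on the published port 10000 once the stack is up. A quick smoke test, sketched here rather than taken from this diff, using the Beeline client that ships in the Spark distribution inside the container (no authentication is configured, so the username is arbitrary):

```sh
# Start master, worker, and Thrift server as defined above.
docker compose up -d

# Connect over JDBC from inside the thrift-server container and run a
# trivial query to confirm the server is serving requests.
docker compose exec thrift-server /opt/spark/bin/beeline \
  -u jdbc:hive2://localhost:10000 -n spark \
  -e 'SHOW DATABASES;'
```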