fix(docker): fix spark 3 dockers (#383)
lyogev authored Sep 23, 2020
1 parent e0685fe commit 9ee8591
Showing 7 changed files with 187 additions and 29 deletions.
6 changes: 4 additions & 2 deletions .travis.yml
@@ -21,10 +21,12 @@ env:
- SCALA_MAJOR_VERSION=2.12
- SPARK2_SCALA_VERSION=2.11.12
- SPARK2_SCALA_MAJOR_VERSION=2.11
- SPARK2_HADOOP_MAJOR_VERSION=2.7
- HADOOP_MAJOR_VERSION=3.2
- SPARK2_HADOOP_VERSION=2.9.2
- SPARK2_HIVE_VERSION=2.3.3
- SPARK2_VERSION=2.4.5
- SPARK_VERSION=3.0.0
- SPARK2_VERSION=2.4.6
- SPARK_VERSION=3.0.1
- HIVE_VERSION=2.3.7
- HUDI_VERSION=0.5.3
- TARGET_CACHE=$HOME/target-cache/${TRAVIS_COMMIT}
2 changes: 1 addition & 1 deletion build.sbt
@@ -17,7 +17,7 @@ scalaVersion := Option(System.getenv("SCALA_VERSION")).getOrElse("2.12.11")
val sparkVersion: Def.Initialize[String] = Def.setting {
CrossVersion.partialVersion(scalaVersion.value) match {
case Some((2, scalaMajor)) if scalaMajor >= 12 => Option(System.getenv("SPARK_VERSION")).getOrElse("3.0.0")
case _ => Option(System.getenv("SPARK2_VERSION")).getOrElse("2.4.5")
case _ => Option(System.getenv("SPARK2_VERSION")).getOrElse("2.4.6")
}
}

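For context (not part of the diff): build.sbt selects the Spark version from the Scala version at build time. A minimal sketch of driving both builds from the shell, assuming the stock sbt package task (the project's actual assembly task name may differ):

# Spark 3 build: Scala 2.12 is the default, so SPARK_VERSION applies
SPARK_VERSION=3.0.1 sbt package
# Spark 2 build: selecting Scala 2.11 routes the match to SPARK2_VERSION
SCALA_VERSION=2.11.12 SPARK2_VERSION=2.4.6 sbt package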
50 changes: 29 additions & 21 deletions docker/spark/k8s/Dockerfile
@@ -1,40 +1,48 @@
ARG SPARK_VERSION=3.0.0
ARG SPARK_IMAGE=gcr.io/spark-operator/spark:v${SPARK_VERSION}-gcs-prometheus
FROM ${SPARK_IMAGE}

# Install Tools
RUN apt-get update \
&& apt-get install -y curl wget \
&& rm -rf /var/lib/apt/lists/*
ARG SPARK_VERSION=3.0.1
FROM metorikku/spark:base-${SPARK_VERSION}

ARG AWS_SDK_VERSION=1.11.853
ARG HADOOP_VERSION=3.2.1
ARG HADOOP_VERSION=3.2.0
ARG HTTPCLIENT_VERSION=4.5.11
ARG SCALA_MAJOR_VERSION=2.12
ARG SPARK_VERSION=3.0.0

RUN rm -f ${SPARK_HOME}/jars/spark-bigquery-latest.jar
USER root

RUN wget -q https://repo1.maven.org/maven2/net/logstash/log4j/jsonevent-layout/1.7/jsonevent-layout-1.7.jar -P $SPARK_HOME/jars/
RUN wget -q https://repo1.maven.org/maven2/net/minidev/json-smart/1.1.1/json-smart-1.1.1.jar -P $SPARK_HOME/jars/
RUN wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -P $SPARK_HOME/jars/
RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/${AWS_SDK_VERSION}/aws-java-sdk-${AWS_SDK_VERSION}.jar -P $SPARK_HOME/jars/
RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/${AWS_SDK_VERSION}/aws-java-sdk-core-${AWS_SDK_VERSION}.jar -P $SPARK_HOME/jars/
RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/${AWS_SDK_VERSION}/aws-java-sdk-s3-${AWS_SDK_VERSION}.jar -P $SPARK_HOME/jars/
ADD https://repo1.maven.org/maven2/net/logstash/log4j/jsonevent-layout/1.7/jsonevent-layout-1.7.jar $SPARK_HOME/jars/
ADD https://repo1.maven.org/maven2/net/minidev/json-smart/1.1.1/json-smart-1.1.1.jar $SPARK_HOME/jars/
ADD https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar $SPARK_HOME/jars/
ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/${AWS_SDK_VERSION}/aws-java-sdk-${AWS_SDK_VERSION}.jar $SPARK_HOME/jars/
ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/${AWS_SDK_VERSION}/aws-java-sdk-core-${AWS_SDK_VERSION}.jar $SPARK_HOME/jars/
ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/${AWS_SDK_VERSION}/aws-java-sdk-s3-${AWS_SDK_VERSION}.jar $SPARK_HOME/jars/
ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-dynamodb/${AWS_SDK_VERSION}/aws-java-sdk-dynamodb-${AWS_SDK_VERSION}.jar $SPARK_HOME/jars/
ADD https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-sql-kafka-0-10_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar $SPARK_HOME/jars/
ADD https://repo1.maven.org/maven2/org/apache/spark/spark-streaming-kafka-0-10-assembly_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-streaming-kafka-0-10-assembly_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar $SPARK_HOME/jars/
ADD https://repo1.maven.org/maven2/org/apache/spark/spark-avro_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-avro_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar $SPARK_HOME/jars/
ADD https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/2.6.2/commons-pool2-2.6.2.jar $SPARK_HOME/jars/
RUN rm -f $SPARK_HOME/jars/httpclient-*.jar
ADD https://repo1.maven.org/maven2/org/apache/httpcomponents/httpclient/${HTTPCLIENT_VERSION}/httpclient-${HTTPCLIENT_VERSION}.jar $SPARK_HOME/jars/

RUN wget -q https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-sql-kafka-0-10_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar -P $SPARK_HOME/jars/
RUN wget -q https://repo1.maven.org/maven2/org/apache/spark/spark-streaming-kafka-0-10-assembly_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-streaming-kafka-0-10-assembly_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar -P $SPARK_HOME/jars/
RUN wget -q https://repo1.maven.org/maven2/org/apache/spark/spark-avro_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-avro_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar -P $SPARK_HOME/jars/
RUN wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/2.6.2/commons-pool2-2.6.2.jar -P $SPARK_HOME/jars/
RUN chmod 644 $SPARK_HOME/jars/*

ADD https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.11.0/jmx_prometheus_javaagent-0.11.0.jar /prometheus/
RUN chmod 644 /prometheus/*.jar

#Python
RUN apt-get update \
&& apt-get install -y coreutils jq less inotify-tools python3 python3-setuptools \
&& apt-get install -y wget curl coreutils jq less inotify-tools python3 python3-setuptools \
&& curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py \
&& python3 get-pip.py 'pip==20.1.1' \
&& rm get-pip.py \
&& rm -rf /var/lib/apt/lists/*

#USER ${spark_uid}

ADD conf/* ${SPARK_HOME}/custom/conf/

RUN mkdir -p /etc/metrics/conf
ADD metrics/* /etc/metrics/conf/

RUN touch hadoop-metrics2.properties

ENV PYTHONHASHSEED 1
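For reference (not part of the diff), this Dockerfile is built by scripts/docker.sh further down; a minimal equivalent invocation, assuming the metorikku/spark:base-3.0.1 base image has already been produced by Spark's docker-image-tool.sh:

docker build -t metorikku/spark:k8s \
  --build-arg SCALA_MAJOR_VERSION=2.12 \
  --build-arg SPARK_VERSION=3.0.1 \
  -f docker/spark/k8s/Dockerfile docker/spark/k8s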
19 changes: 19 additions & 0 deletions docker/spark/k8s/metrics/metrics.properties
@@ -0,0 +1,19 @@
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink
driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource
executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource
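These are standard Spark metrics sink/source settings. A hedged sketch of pointing a job at this file via Spark's spark.metrics.conf property (the class name and jar below are hypothetical placeholders, not part of this commit):

spark-submit \
  --conf spark.metrics.conf=/etc/metrics/conf/metrics.properties \
  --class com.example.Main app.jar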
123 changes: 123 additions & 0 deletions docker/spark/k8s/metrics/prometheus.yaml
@@ -0,0 +1,123 @@
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

---
lowercaseOutputName: true
attrNameSnakeCase: true
rules:
# These come from the application driver if it's a streaming application
# Example: default/streaming.driver.com.example.ClassName.StreamingMetrics.streaming.lastCompletedBatch_schedulingDelay
- pattern: metrics<name=(\S+)\.(\S+)\.driver\.(\S+)\.StreamingMetrics\.streaming\.(\S+)><>Value
name: spark_streaming_driver_$4
labels:
app_namespace: "$1"
app_id: "$2"
# These come from the application driver if it's a structured streaming application
# Example: default/sstreaming.driver.spark.streaming.QueryName.inputRate-total
- pattern: metrics<name=(\S+)\.(\S+)\.driver\.spark\.streaming\.(\S+)\.(\S+)><>Value
name: spark_structured_streaming_driver_$4
labels:
app_namespace: "$1"
app_id: "$2"
query_name: "$3"
# These come from the application executors
# Example: default/spark-pi.0.executor.threadpool.activeTasks
- pattern: metrics<name=(\S+)\.(\S+)\.(\S+)\.executor\.(\S+)><>Value
name: spark_executor_$4
type: GAUGE
labels:
app_namespace: "$1"
app_id: "$2"
executor_id: "$3"
# These come from the application driver
# Example: default/spark-pi.driver.DAGScheduler.stage.failedStages
- pattern: metrics<name=(\S+)\.(\S+)\.driver\.(BlockManager|DAGScheduler|jvm)\.(\S+)><>Value
name: spark_driver_$3_$4
type: GAUGE
labels:
app_namespace: "$1"
app_id: "$2"
# These come from the application driver
# Emulate timers for DAGScheduler like messageProcessingTime
- pattern: metrics<name=(\S+)\.(\S+)\.driver\.DAGScheduler\.(.*)><>Count
name: spark_driver_DAGScheduler_$3_count
type: COUNTER
labels:
app_namespace: "$1"
app_id: "$2"
# HiveExternalCatalog is of type counter
- pattern: metrics<name=(\S+)\.(\S+)\.driver\.HiveExternalCatalog\.(.*)><>Count
name: spark_driver_HiveExternalCatalog_$3_count
type: COUNTER
labels:
app_namespace: "$1"
app_id: "$2"
# These come from the application driver
# Emulate histograms for CodeGenerator
- pattern: metrics<name=(\S+)\.(\S+)\.driver\.CodeGenerator\.(.*)><>Count
name: spark_driver_CodeGenerator_$3_count
type: COUNTER
labels:
app_namespace: "$1"
app_id: "$2"
# These come from the application driver
# Emulate timer (keep only count attribute) plus counters for LiveListenerBus
- pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(.*)><>Count
name: spark_driver_LiveListenerBus_$3_count
type: COUNTER
labels:
app_namespace: "$1"
app_id: "$2"
# Get Gauge type metrics for LiveListenerBus
- pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(.*)><>Value
name: spark_driver_LiveListenerBus_$3
type: GAUGE
labels:
app_namespace: "$1"
app_id: "$2"
# Executors counters
- pattern: metrics<name=(\S+)\.(\S+)\.(.*)\.executor\.(.*)><>Count
name: spark_executor_$4_count
type: COUNTER
labels:
app_namespace: "$1"
app_id: "$2"
executor_id: "$3"
# These come from the application executors
# Example: app-20160809000059-0000.0.jvm.threadpool.activeTasks
- pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.(jvm|NettyBlockTransfer)\.(.*)><>Value
name: spark_executor_$4_$5
type: GAUGE
labels:
app_namespace: "$1"
app_id: "$2"
executor_id: "$3"
- pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.HiveExternalCatalog\.(.*)><>Count
name: spark_executor_HiveExternalCatalog_$4_count
type: COUNTER
labels:
app_namespace: "$1"
app_id: "$2"
executor_id: "$3"
# These come from the application executors
# Emulate histograms for CodeGenerator
- pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.CodeGenerator\.(.*)><>Count
name: spark_executor_CodeGenerator_$4_count
type: COUNTER
labels:
app_namespace: "$1"
app_id: "$2"
executor_id: "$3"
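The rules above are read by the JMX Prometheus javaagent that the Dockerfile adds under /prometheus/. A sketch of attaching it to the driver; the 8090 scrape port, driver-only placement, and the class/jar names are assumptions, not part of this commit:

spark-submit \
  --conf "spark.driver.extraJavaOptions=-javaagent:/prometheus/jmx_prometheus_javaagent-0.11.0.jar=8090:/etc/metrics/conf/prometheus.yaml" \
  --class com.example.Main app.jar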
2 changes: 1 addition & 1 deletion examples/udf/build.sbt
@@ -8,7 +8,7 @@ scalaVersion := Option(System.getProperty("scalaVersion")).getOrElse("2.12.11")
val sparkVersion: Def.Initialize[String] = Def.setting {
CrossVersion.partialVersion(scalaVersion.value) match {
case Some((2, scalaMajor)) if scalaMajor >= 12 => Option(System.getProperty("sparkVersion")).getOrElse("3.0.0")
case _ => "2.4.5"
case _ => "2.4.6"
}
}

14 changes: 10 additions & 4 deletions scripts/docker.sh
@@ -1,25 +1,31 @@
#!/bin/bash

# Hack that helps with the cache
docker pull metorikku/metorikku:k8s
docker pull metorikku/metorikku:standalone
docker pull metorikku/metorikku:spark2_k8s
docker pull metorikku/metorikku:spark2_standalone
docker pull metorikku/hive

docker pull gcr.io/spark-operator/spark:v$SPARK_VERSION-gcs-prometheus
docker pull gcr.io/spark-operator/spark:v$SPARK2_VERSION-gcs-prometheus

set -e

# Latest spark
wget -q https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_MAJOR_VERSION.tgz
tar -xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_MAJOR_VERSION.tgz
(cd spark-$SPARK_VERSION-bin-hadoop$HADOOP_MAJOR_VERSION && bin/docker-image-tool.sh -r metorikku -t base-$SPARK_VERSION build)
rm -rf spark-$SPARK_VERSION-bin-hadoop$HADOOP_MAJOR_VERSION*

docker build -t metorikku/spark:k8s --cache-from metorikku/metorikku:k8s --build-arg SCALA_MAJOR_VERSION=$SCALA_MAJOR_VERSION --build-arg SPARK_VERSION=$SPARK_VERSION -f docker/spark/k8s/Dockerfile docker/spark/k8s
docker build -t metorikku/spark:standalone --cache-from metorikku/metorikku:standalone --build-arg IMAGE_NAME=metorikku/spark:k8s -f docker/spark/standalone/Dockerfile docker/spark/standalone
# Adding metorikku jar
docker build -t metorikku/metorikku:k8s --cache-from metorikku/metorikku:k8s --build-arg IMAGE_NAME=metorikku/spark:k8s -f docker/metorikku/Dockerfile .
docker build -t metorikku/metorikku:standalone --cache-from metorikku/metorikku:standalone --build-arg IMAGE_NAME=metorikku/spark:standalone -f docker/metorikku/Dockerfile .

# Spark 2
wget -q https://archive.apache.org/dist/spark/spark-$SPARK2_VERSION/spark-$SPARK2_VERSION-bin-hadoop$SPARK2_HADOOP_MAJOR_VERSION.tgz
tar -xzf spark-$SPARK2_VERSION-bin-hadoop$SPARK2_HADOOP_MAJOR_VERSION.tgz
(cd spark-$SPARK2_VERSION-bin-hadoop$SPARK2_HADOOP_MAJOR_VERSION && bin/docker-image-tool.sh -r metorikku -t base-$SPARK2_VERSION build)
rm -rf spark-$SPARK2_VERSION-bin-hadoop$SPARK2_HADOOP_MAJOR_VERSION*

docker build -t metorikku/spark:spark2_k8s --cache-from metorikku/metorikku:spark2_k8s --build-arg SCALA_MAJOR_VERSION=$SPARK2_SCALA_MAJOR_VERSION --build-arg SPARK_VERSION=$SPARK2_VERSION --build-arg HADOOP_VERSION=${SPARK2_HADOOP_VERSION} -f docker/spark/k8s/Dockerfile docker/spark/k8s
docker build -t metorikku/spark:spark2_hadoop --cache-from metorikku/metorikku:spark2_standalone --build-arg IMAGE_NAME=metorikku/spark:spark2_k8s --build-arg HIVE_VERSION=${SPARK2_HIVE_VERSION} --build-arg HADOOP_VERSION=${SPARK2_HADOOP_VERSION} -f docker/spark/custom-hadoop/Dockerfile docker/spark/custom-hadoop
docker build -t metorikku/spark:spark2_standalone --cache-from metorikku/metorikku:spark2_standalone --build-arg IMAGE_NAME=metorikku/spark:spark2_hadoop -f docker/spark/standalone/Dockerfile docker/spark/standalone
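A hedged local run of the script, exporting the environment it expects (values taken from the .travis.yml hunk above):

export SCALA_MAJOR_VERSION=2.12 SPARK_VERSION=3.0.1 HADOOP_MAJOR_VERSION=3.2
export SPARK2_SCALA_MAJOR_VERSION=2.11 SPARK2_VERSION=2.4.6 SPARK2_HADOOP_MAJOR_VERSION=2.7
export SPARK2_HADOOP_VERSION=2.9.2 SPARK2_HIVE_VERSION=2.3.3
./scripts/docker.sh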
