diff --git a/.travis.yml b/.travis.yml
index a66a29f12..c4080bd14 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,10 +21,12 @@ env:
   - SCALA_MAJOR_VERSION=2.12
   - SPARK2_SCALA_VERSION=2.11.12
   - SPARK2_SCALA_MAJOR_VERSION=2.11
+  - SPARK2_HADOOP_MAJOR_VERSION=2.7
+  - HADOOP_MAJOR_VERSION=3.2
   - SPARK2_HADOOP_VERSION=2.9.2
   - SPARK2_HIVE_VERSION=2.3.3
-  - SPARK2_VERSION=2.4.5
-  - SPARK_VERSION=3.0.0
+  - SPARK2_VERSION=2.4.6
+  - SPARK_VERSION=3.0.1
   - HIVE_VERSION=2.3.7
   - HUDI_VERSION=0.5.3
   - TARGET_CACHE=$HOME/target-cache/${TRAVIS_COMMIT}
diff --git a/build.sbt b/build.sbt
index f10a681ed..9c9a5f121 100644
--- a/build.sbt
+++ b/build.sbt
@@ -17,7 +17,7 @@ scalaVersion := Option(System.getenv("SCALA_VERSION")).getOrElse("2.12.11")
 val sparkVersion: Def.Initialize[String] = Def.setting {
   CrossVersion.partialVersion(scalaVersion.value) match {
     case Some((2, scalaMajor)) if scalaMajor >= 12 => Option(System.getenv("SPARK_VERSION")).getOrElse("3.0.0")
-    case _ => Option(System.getenv("SPARK2_VERSION")).getOrElse("2.4.5")
+    case _ => Option(System.getenv("SPARK2_VERSION")).getOrElse("2.4.6")
   }
 }
 
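The version bumps above only move the fallback defaults; both Spark lines remain overridable from the environment. A minimal local sanity check, assuming the project's usual `sbt assembly` entry point:

```bash
# Default Scala 2.12 build takes the Spark 3 branch of the match in build.sbt
# (SPARK_VERSION, falling back to 3.0.0 when unset).
sbt assembly

# A 2.11.x SCALA_VERSION routes the match to the Spark 2 branch, which now
# falls back to 2.4.6; SPARK2_VERSION stays available as an explicit override.
SCALA_VERSION=2.11.12 sbt assembly
```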
diff --git a/docker/spark/k8s/Dockerfile b/docker/spark/k8s/Dockerfile
index 8a1f82e39..c8a6c01d4 100644
--- a/docker/spark/k8s/Dockerfile
+++ b/docker/spark/k8s/Dockerfile
@@ -1,40 +1,48 @@
-ARG SPARK_VERSION=3.0.0
-ARG SPARK_IMAGE=gcr.io/spark-operator/spark:v${SPARK_VERSION}-gcs-prometheus
-FROM ${SPARK_IMAGE}
-
-# Install Tools
-RUN apt-get update \
-    && apt-get install -y curl wget \
-    && rm -rf /var/lib/apt/lists/*
+ARG SPARK_VERSION=3.0.1
+FROM metorikku/spark:base-${SPARK_VERSION}
 
 ARG AWS_SDK_VERSION=1.11.853
-ARG HADOOP_VERSION=3.2.1
+ARG HADOOP_VERSION=3.2.0
 ARG HTTPCLIENT_VERSION=4.5.11
 ARG SCALA_MAJOR_VERSION=2.12
 ARG SPARK_VERSION=3.0.0
 
-RUN rm -f ${SPARK_HOME}/jars/spark-bigquery-latest.jar
+USER root
 
-RUN wget -q https://repo1.maven.org/maven2/net/logstash/log4j/jsonevent-layout/1.7/jsonevent-layout-1.7.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/net/minidev/json-smart/1.1.1/json-smart-1.1.1.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/${AWS_SDK_VERSION}/aws-java-sdk-${AWS_SDK_VERSION}.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/${AWS_SDK_VERSION}/aws-java-sdk-core-${AWS_SDK_VERSION}.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/${AWS_SDK_VERSION}/aws-java-sdk-s3-${AWS_SDK_VERSION}.jar -P $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/net/logstash/log4j/jsonevent-layout/1.7/jsonevent-layout-1.7.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/net/minidev/json-smart/1.1.1/json-smart-1.1.1.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/${AWS_SDK_VERSION}/aws-java-sdk-${AWS_SDK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/${AWS_SDK_VERSION}/aws-java-sdk-core-${AWS_SDK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/${AWS_SDK_VERSION}/aws-java-sdk-s3-${AWS_SDK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-dynamodb/${AWS_SDK_VERSION}/aws-java-sdk-dynamodb-${AWS_SDK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-sql-kafka-0-10_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/org/apache/spark/spark-streaming-kafka-0-10-assembly_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-streaming-kafka-0-10-assembly_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/org/apache/spark/spark-avro_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-avro_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/2.6.2/commons-pool2-2.6.2.jar $SPARK_HOME/jars/
+RUN rm -f $SPARK_HOME/jars/httpclient-*.jar
+ADD https://repo1.maven.org/maven2/org/apache/httpcomponents/httpclient/${HTTPCLIENT_VERSION}/httpclient-${HTTPCLIENT_VERSION}.jar $SPARK_HOME/jars/
 
-RUN wget -q https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-sql-kafka-0-10_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/org/apache/spark/spark-streaming-kafka-0-10-assembly_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-streaming-kafka-0-10-assembly_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/org/apache/spark/spark-avro_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-avro_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/2.6.2/commons-pool2-2.6.2.jar -P $SPARK_HOME/jars/
+RUN chmod 644 $SPARK_HOME/jars/*
+
+ADD https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.11.0/jmx_prometheus_javaagent-0.11.0.jar /prometheus/
+RUN chmod 644 /prometheus/*.jar
 
 #Python
 RUN apt-get update \
-    && apt-get install -y coreutils jq less inotify-tools python3 python3-setuptools \
+    && apt-get install -y wget curl coreutils jq less inotify-tools python3 python3-setuptools \
     && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py \
     && python3 get-pip.py 'pip==20.1.1' \
     && rm get-pip.py \
     && rm -rf /var/lib/apt/lists/*
 
+#USER ${spark_uid}
+
 ADD conf/* ${SPARK_HOME}/custom/conf/
 
+RUN mkdir -p /etc/metrics/conf
+ADD metrics/* /etc/metrics/conf/
+
+RUN touch hadoop-metrics2.properties
+
 ENV PYTHONHASHSEED 1
\ No newline at end of file
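The k8s image now starts FROM the locally built metorikku/spark:base-${SPARK_VERSION} (produced by scripts/docker.sh below) instead of the gcr.io/spark-operator image, and bakes the jmx_prometheus_javaagent jar into /prometheus/. A hypothetical smoke test for the rebuilt image; overriding the entrypoint this way assumes the base image ships bash, which the apt-get steps suggest but this diff does not guarantee:

```bash
# Check that the Prometheus agent and the pinned jars landed where the
# metrics configs and Spark expect to find them.
docker run --rm --entrypoint ls metorikku/spark:k8s /prometheus/
docker run --rm --entrypoint bash metorikku/spark:k8s \
  -c 'ls "$SPARK_HOME/jars" | grep -E "hadoop-aws|aws-java-sdk-s3|httpclient"'
```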
diff --git a/docker/spark/k8s/metrics/metrics.properties b/docker/spark/k8s/metrics/metrics.properties
new file mode 100644
index 000000000..9640deb15
--- /dev/null
+++ b/docker/spark/k8s/metrics/metrics.properties
@@ -0,0 +1,19 @@
+#
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink
+driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource
+executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource
\ No newline at end of file
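These three properties turn on Spark's built-in JMX sink plus the JVM source for both driver and executors; the javaagent added in the Dockerfile is what then exposes those JMX beans over HTTP for Prometheus. A minimal sketch of wiring both files together at submit time; the agent port (8090) and the SparkPi example job are illustrative assumptions, not part of this change:

```bash
spark-submit \
  --conf "spark.metrics.conf=/etc/metrics/conf/metrics.properties" \
  --conf "spark.driver.extraJavaOptions=-javaagent:/prometheus/jmx_prometheus_javaagent-0.11.0.jar=8090:/etc/metrics/conf/prometheus.yaml" \
  --class org.apache.spark.examples.SparkPi \
  local:///opt/spark/examples/jars/spark-examples_2.12-3.0.1.jar
```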
diff --git a/docker/spark/k8s/metrics/prometheus.yaml b/docker/spark/k8s/metrics/prometheus.yaml
new file mode 100644
index 000000000..a33b633d9
--- /dev/null
+++ b/docker/spark/k8s/metrics/prometheus.yaml
@@ -0,0 +1,123 @@
+#
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+---
+lowercaseOutputName: true
+attrNameSnakeCase: true
+rules:
+  # These come from the application driver if it's a streaming application
+  # Example: default/streaming.driver.com.example.ClassName.StreamingMetrics.streaming.lastCompletedBatch_schedulingDelay
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.(\S+)\.StreamingMetrics\.streaming\.(\S+)><>Value
+    name: spark_streaming_driver_$4
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # These come from the application driver if it's a structured streaming application
+  # Example: default/sstreaming.driver.spark.streaming.QueryName.inputRate-total
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.spark\.streaming\.(\S+)\.(\S+)><>Value
+    name: spark_structured_streaming_driver_$4
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      query_name: "$3"
+  # These come from the application executors
+  # Example: default/spark-pi.0.executor.threadpool.activeTasks
+  - pattern: metrics<name=(\S+)\.(\S+)\.(\S+)\.executor\.(\S+)><>Value
+    name: spark_executor_$4
+    type: GAUGE
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
+  # These come from the application driver
+  # Example: default/spark-pi.driver.DAGScheduler.stage.failedStages
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.(BlockManager|DAGScheduler|jvm)\.(\S+)><>Value
+    name: spark_driver_$3_$4
+    type: GAUGE
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # These come from the application driver
+  # Emulate timers for DAGScheduler like messagePRocessingTime
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.DAGScheduler\.(.*)><>Count
+    name: spark_driver_DAGScheduler_$3_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # HiveExternalCatalog is of type counter
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.HiveExternalCatalog\.(.*)><>Count
+    name: spark_driver_HiveExternalCatalog_$3_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # These come from the application driver
+  # Emulate histograms for CodeGenerator
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.CodeGenerator\.(.*)><>Count
+    name: spark_driver_CodeGenerator_$3_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # These come from the application driver
+  # Emulate timer (keep only count attribute) plus counters for LiveListenerBus
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(.*)><>Count
+    name: spark_driver_LiveListenerBus_$3_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # Get Gauge type metrics for LiveListenerBus
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(.*)><>Value
+    name: spark_driver_LiveListenerBus_$3
+    type: GAUGE
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # Executors counters
+  - pattern: metrics<name=(\S+)\.(\S+)\.(.*)\.executor\.(.*)><>Count
+    name: spark_executor_$4_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
+  # These come from the application executors
+  # Example: app-20160809000059-0000.0.jvm.threadpool.activeTasks
+  - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.(jvm|NettyBlockTransfer)\.(.*)><>Value
+    name: spark_executor_$4_$5
+    type: GAUGE
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
+  - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.HiveExternalCatalog\.(.*)><>Count
+    name: spark_executor_HiveExternalCatalog_$4_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
+  # These come from the application driver
+  # Emulate histograms for CodeGenerator
+  - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.CodeGenerator\.(.*)><>Count
+    name: spark_executor_CodeGenerator_$4_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
\ No newline at end of file
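This is the stock jmx-exporter config shipped with the spark-operator images (hence the Google copyright header): it rewrites Spark's JMX bean names (namespace.app.driver|executor.component.metric) into stable Prometheus metric names with app_namespace, app_id and executor_id labels. A hypothetical end-to-end check against a running driver pod, assuming the agent was started on port 8090 as in the sketch above:

```bash
DRIVER_POD=example-app-driver   # placeholder pod name
kubectl exec "$DRIVER_POD" -- curl -s http://localhost:8090/metrics | grep '^spark_driver_'
```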
diff --git a/examples/udf/build.sbt b/examples/udf/build.sbt
index 0a4125e98..c0c0f55de 100644
--- a/examples/udf/build.sbt
+++ b/examples/udf/build.sbt
@@ -8,7 +8,7 @@ scalaVersion := Option(System.getProperty("scalaVersion")).getOrElse("2.12.11")
 val sparkVersion: Def.Initialize[String] = Def.setting {
   CrossVersion.partialVersion(scalaVersion.value) match {
     case Some((2, scalaMajor)) if scalaMajor >= 12 => Option(System.getProperty("sparkVersion")).getOrElse("3.0.0")
-    case _ => "2.4.5"
+    case _ => "2.4.6"
   }
 }
 
diff --git a/scripts/docker.sh b/scripts/docker.sh
index 1f269e79e..8f803916e 100755
--- a/scripts/docker.sh
+++ b/scripts/docker.sh
@@ -1,18 +1,19 @@
 #!/bin/bash
 
-# Hack that helps with the cache
 docker pull metorikku/metorikku:k8s
 docker pull metorikku/metorikku:standalone
 docker pull metorikku/metorikku:spark2_k8s
 docker pull metorikku/metorikku:spark2_standalone
 docker pull metorikku/hive
-docker pull gcr.io/spark-operator/spark:v$SPARK_VERSION-gcs-prometheus
-docker pull gcr.io/spark-operator/spark:v$SPARK2_VERSION-gcs-prometheus
-
 set -e
 
 # Latest spark
+wget -q https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_MAJOR_VERSION.tgz
+tar -xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_MAJOR_VERSION.tgz
+(cd spark-$SPARK_VERSION-bin-hadoop$HADOOP_MAJOR_VERSION && bin/docker-image-tool.sh -r metorikku -t base-$SPARK_VERSION build)
+rm -rf spark-$SPARK_VERSION-bin-hadoop$HADOOP_MAJOR_VERSION*
+
 docker build -t metorikku/spark:k8s --cache-from metorikku/metorikku:k8s --build-arg SCALA_MAJOR_VERSION=$SCALA_MAJOR_VERSION --build-arg SPARK_VERSION=$SPARK_VERSION -f docker/spark/k8s/Dockerfile docker/spark/k8s
 docker build -t metorikku/spark:standalone --cache-from metorikku/metorikku:standalone --build-arg IMAGE_NAME=metorikku/spark:k8s -f docker/spark/standalone/Dockerfile docker/spark/standalone
 
 # Adding metorikku jar
@@ -20,6 +21,11 @@ docker build -t metorikku/metorikku:k8s --cache-from metorikku/metorikku:k8s --b
 docker build -t metorikku/metorikku:standalone --cache-from metorikku/metorikku:standalone --build-arg IMAGE_NAME=metorikku/spark:standalone -f docker/metorikku/Dockerfile .
 
 # Spark 2
+wget -q https://archive.apache.org/dist/spark/spark-$SPARK2_VERSION/spark-$SPARK2_VERSION-bin-hadoop$SPARK2_HADOOP_MAJOR_VERSION.tgz
+tar -xzf spark-$SPARK2_VERSION-bin-hadoop$SPARK2_HADOOP_MAJOR_VERSION.tgz
+(cd spark-$SPARK2_VERSION-bin-hadoop$SPARK2_HADOOP_MAJOR_VERSION && bin/docker-image-tool.sh -r metorikku -t base-$SPARK2_VERSION build)
+rm -rf spark-$SPARK2_VERSION-bin-hadoop$SPARK2_HADOOP_MAJOR_VERSION
+
 docker build -t metorikku/spark:spark2_k8s --cache-from metorikku/metorikku:spark2_k8s --build-arg SCALA_MAJOR_VERSION=$SPARK2_SCALA_MAJOR_VERSION --build-arg SPARK_VERSION=$SPARK2_VERSION --build-arg HADOOP_VERSION=${SPARK2_HADOOP_VERSION} -f docker/spark/k8s/Dockerfile docker/spark/k8s
 docker build -t metorikku/spark:spark2_hadoop --cache-from metorikku/metorikku:spark2_standalone --build-arg IMAGE_NAME=metorikku/spark:spark2_k8s --build-arg HIVE_VERSION=${SPARK2_HIVE_VERSION} --build-arg HADOOP_VERSION=${SPARK2_HADOOP_VERSION} -f docker/spark/custom-hadoop/Dockerfile docker/spark/custom-hadoop
 docker build -t metorikku/spark:spark2_standalone --cache-from metorikku/metorikku:spark2_standalone --build-arg IMAGE_NAME=metorikku/spark:spark2_hadoop -f docker/spark/standalone/Dockerfile docker/spark/standalone
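docker-image-tool.sh, shipped inside each Spark distribution, builds images named <repo>/spark:<tag>, which is exactly what the k8s Dockerfile's `FROM metorikku/spark:base-${SPARK_VERSION}` line consumes; that is why the gcr.io/spark-operator pulls are gone. A quick post-run check that the base tags exist locally:

```bash
# Lists e.g. metorikku/spark:base-3.0.1 and metorikku/spark:base-2.4.6
docker images metorikku/spark --format '{{.Repository}}:{{.Tag}}' | grep ':base-'
```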