diff --git a/.env b/.env deleted file mode 100644 index 997507e85..000000000 --- a/.env +++ /dev/null @@ -1,3 +0,0 @@ -# version for opensearch & opensearch-dashboards docker image -VERSION=2.9.0 - diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index 6c1e246ef..000000000 --- a/docker-compose.yml +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright The OpenTelemetry Authors -# SPDX-License-Identifier: Apache-2.0 -version: '3.9' -x-default-logging: &logging - driver: "json-file" - options: - max-size: "5m" - max-file: "2" - -volumes: - opensearch-data: - -services: - spark-master: - image: our-own-apache-spark:3.4.0 - ports: - - "9090:8080" - - "7077:7077" - volumes: - - ./apps:/opt/spark-apps - - ./data:/opt/spark-data - environment: - - SPARK_LOCAL_IP=spark-master - - SPARK_WORKLOAD=master - spark-worker-1: - image: our-own-apache-spark:3.4.0 - ports: - - "9091:8080" - - "7000:7000" - depends_on: - - spark-master - environment: - - SPARK_MASTER=spark://spark-master:7077 - - SPARK_WORKER_CORES=1 - - SPARK_WORKER_MEMORY=1G - - SPARK_DRIVER_MEMORY=1G - - SPARK_EXECUTOR_MEMORY=1G - - SPARK_WORKLOAD=worker - - SPARK_LOCAL_IP=spark-worker-1 - volumes: - - ./apps:/opt/spark-apps - - ./data:/opt/spark-data - spark-worker-2: - image: our-own-apache-spark:3.4.0 - ports: - - "9092:8080" - - "7001:7000" - depends_on: - - spark-master - environment: - - SPARK_MASTER=spark://spark-master:7077 - - SPARK_WORKER_CORES=1 - - SPARK_WORKER_MEMORY=1G - - SPARK_DRIVER_MEMORY=1G - - SPARK_EXECUTOR_MEMORY=1G - - SPARK_WORKLOAD=worker - - SPARK_LOCAL_IP=spark-worker-2 - volumes: - - ./apps:/opt/spark-apps - - ./data:/opt/spark-data - - livy-server: - container_name: livy_server - build: ./docker/livy/ - command: ["sh", "-c", "/opt/bitnami/livy/bin/livy-server"] - user: root - volumes: - - type: bind - source: ./docker/livy/conf/ - target: /opt/bitnami/livy/conf/ - - type: bind - source: ./docker/livy/target/ - target: /target/ - - type: bind - source: ./docker/livy/data/ - target: /data/ - ports: - - '8998:8998' - networks: - - net - depends_on: - - spark-master - - spark-worker-1 - - spark-worker-2 - # OpenSearch store - node (not for production - no security - only for test purpose ) - opensearch: - image: opensearchstaging/opensearch:${VERSION} - container_name: opensearch - environment: - - cluster.name=opensearch-cluster - - node.name=opensearch - - discovery.seed_hosts=opensearch - - cluster.initial_cluster_manager_nodes=opensearch - - bootstrap.memory_lock=true - - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" - - "DISABLE_INSTALL_DEMO_CONFIG=true" - - "DISABLE_SECURITY_PLUGIN=true" - ulimits: - memlock: - soft: -1 - hard: -1 - nofile: - soft: 65536 # Maximum number of open files for the opensearch user - set to at least 65536 - hard: 65536 - volumes: - - opensearch-data:/usr/share/opensearch/data # Creates volume called opensearch-data1 and mounts it to the container - ports: - - 9200:9200 - - 9600:9600 - expose: - - "9200" - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9200/_cluster/health?wait_for_status=yellow"] - interval: 20s - timeout: 10s - retries: 10 - # OpenSearch store - dashboard - opensearch-dashboards: - image: opensearchproject/opensearch-dashboards:${VERSION} - container_name: opensearch-dashboards - - ports: - - 5601:5601 # Map host port 5601 to container port 5601 - expose: - - "5601" # Expose port 5601 for web access to OpenSearch Dashboards - environment: - OPENSEARCH_HOSTS: '["http://opensearch:9200"]' # Define the OpenSearch nodes that OpenSearch Dashboards will query - depends_on: - - opensearch - -networks: - net: - driver: bridge \ No newline at end of file diff --git a/docker/livy/Dockerfile b/docker/livy/Dockerfile deleted file mode 100644 index fbdc649e2..000000000 --- a/docker/livy/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM docker.io/bitnami/spark:2 - -USER root -ENV LIVY_HOME /opt/bitnami/livy -WORKDIR /opt/bitnami/ - -RUN install_packages unzip \ - && curl "https://downloads.apache.org/incubator/livy/0.7.1-incubating/apache-livy-0.7.1-incubating-bin.zip" -O \ - && unzip "apache-livy-0.7.1-incubating-bin" \ - && rm -rf "apache-livy-0.7.1-incubating-bin.zip" \ - && mv "apache-livy-0.7.1-incubating-bin" $LIVY_HOME \ - && mkdir $LIVY_HOME/logs \ - && chown -R 1001:1001 $LIVY_HOME - -USER 1001 \ No newline at end of file diff --git a/docker/livy/conf/livy-env.sh b/docker/livy/conf/livy-env.sh deleted file mode 100644 index c2cc3d092..000000000 --- a/docker/livy/conf/livy-env.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# LIVY ENVIRONMENT VARIABLES -# -# - JAVA_HOME Java runtime to use. By default use "java" from PATH. -# - HADOOP_CONF_DIR Directory containing the Hadoop / YARN configuration to use. -# - SPARK_HOME Spark which you would like to use in Livy. -# - SPARK_CONF_DIR Optional directory where the Spark configuration lives. -# (Default: $SPARK_HOME/conf) -# - LIVY_LOG_DIR Where log files are stored. (Default: ${LIVY_HOME}/logs) -# - LIVY_PID_DIR Where the pid file is stored. (Default: /tmp) -# - LIVY_SERVER_JAVA_OPTS Java Opts for running livy server (You can set jvm related setting here, -# like jvm memory/gc algorithm and etc.) -# - LIVY_IDENT_STRING A name that identifies the Livy server instance, used to generate log file -# names. (Default: name of the user starting Livy). -# - LIVY_MAX_LOG_FILES Max number of log file to keep in the log directory. (Default: 5.) -# - LIVY_NICENESS Niceness of the Livy server process when running in the background. (Default: 0.) - -export SPARK_HOME=/opt/bitnami/spark/ diff --git a/docker/livy/conf/livy.conf b/docker/livy/conf/livy.conf deleted file mode 100644 index f834bb677..000000000 --- a/docker/livy/conf/livy.conf +++ /dev/null @@ -1,167 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Use this keystore for the SSL certificate and key. -# livy.keystore = - -# Specify the keystore password. -# livy.keystore.password = -# -# Specify the key password. -# livy.key-password = - -# Hadoop Credential Provider Path to get "livy.keystore.password" and "livy.key-password". -# Credential Provider can be created using command as follow: -# hadoop credential create "livy.keystore.password" -value "secret" -provider jceks://hdfs/path/to/livy.jceks -# livy.hadoop.security.credential.provider.path = - -# What host address to start the server on. By default, Livy will bind to all network interfaces. -livy.server.host = 0.0.0.0 - -# What port to start the server on. -livy.server.port = 8998 - -# What base path ui should work on. By default UI is mounted on "/". -# E.g.: livy.ui.basePath = /my_livy - result in mounting UI on /my_livy/ -# livy.ui.basePath = "" - -# What spark master Livy sessions should use. -livy.spark.master = spark://spark-master:7077 - -# What spark deploy mode Livy sessions should use. -livy.spark.deploy-mode = client - -# Configure Livy server http request and response header size. -# livy.server.request-header.size = 131072 -# livy.server.response-header.size = 131072 - -# Enabled to check whether timeout Livy sessions should be stopped. -livy.server.session.timeout-check = true -# -# Whether or not to skip timeout check for a busy session -livy.server.session.timeout-check.skip-busy = false - -# Time in milliseconds on how long Livy will wait before timing out an inactive session. -# Note that the inactive session could be busy running jobs. -livy.server.session.timeout = 5m -# -# How long a finished session state should be kept in LivyServer for query. -livy.server.session.state-retain.sec = 60s - -# If livy should impersonate the requesting users when creating a new session. -# livy.impersonation.enabled = false - -# Logs size livy can cache for each session/batch. 0 means don't cache the logs. -# livy.cache-log.size = 200 - -# Comma-separated list of Livy RSC jars. By default Livy will upload jars from its installation -# directory every time a session is started. By caching these files in HDFS, for example, startup -# time of sessions on YARN can be reduced. -# livy.rsc.jars = - -# Comma-separated list of Livy REPL jars. By default Livy will upload jars from its installation -# directory every time a session is started. By caching these files in HDFS, for example, startup -# time of sessions on YARN can be reduced. Please list all the repl dependencies including -# Scala version-specific livy-repl jars, Livy will automatically pick the right dependencies -# during session creation. -# livy.repl.jars = - -# Location of PySpark archives. By default Livy will upload the file from SPARK_HOME, but -# by caching the file in HDFS, startup time of PySpark sessions on YARN can be reduced. -# livy.pyspark.archives = - -# Location of the SparkR package. By default Livy will upload the file from SPARK_HOME, but -# by caching the file in HDFS, startup time of R sessions on YARN can be reduced. -# livy.sparkr.package = - -# List of local directories from where files are allowed to be added to user sessions. By -# default it's empty, meaning users can only reference remote URIs when starting their -# sessions. -livy.file.local-dir-whitelist = /target/ - -# Whether to enable csrf protection, by default it is false. If it is enabled, client should add -# http-header "X-Requested-By" in request if the http method is POST/DELETE/PUT/PATCH. -# livy.server.csrf-protection.enabled = - -# Whether to enable HiveContext in livy interpreter, if it is true hive-site.xml will be detected -# on user request and then livy server classpath automatically. -# livy.repl.enable-hive-context = - -# Recovery mode of Livy. Possible values: -# off: Default. Turn off recovery. Every time Livy shuts down, it stops and forgets all sessions. -# recovery: Livy persists session info to the state store. When Livy restarts, it recovers -# previous sessions from the state store. -# Must set livy.server.recovery.state-store and livy.server.recovery.state-store.url to -# configure the state store. -# livy.server.recovery.mode = off - -# Where Livy should store state to for recovery. Possible values: -# : Default. State store disabled. -# filesystem: Store state on a file system. -# zookeeper: Store state in a Zookeeper instance. -# livy.server.recovery.state-store = - -# For filesystem state store, the path of the state store directory. Please don't use a filesystem -# that doesn't support atomic rename (e.g. S3). e.g. file:///tmp/livy or hdfs:///. -# For zookeeper, the address to the Zookeeper servers. e.g. host1:port1,host2:port2 -# livy.server.recovery.state-store.url = - -# If Livy can't find the yarn app within this time, consider it lost. -# livy.server.yarn.app-lookup-timeout = 120s -# When the cluster is busy, we may fail to launch yarn app in app-lookup-timeout, then it would -# cause session leakage, so we need to check session leakage. -# How long to check livy session leakage -# livy.server.yarn.app-leakage.check-timeout = 600s -# how often to check livy session leakage -# livy.server.yarn.app-leakage.check-interval = 60s - -# How often Livy polls YARN to refresh YARN app state. -# livy.server.yarn.poll-interval = 5s -# -# Days to keep Livy server request logs. -# livy.server.request-log-retain.days = 5 - -# If the Livy Web UI should be included in the Livy Server. Enabled by default. -# livy.ui.enabled = true - -# Whether to enable Livy server access control, if it is true then all the income requests will -# be checked if the requested user has permission. -# livy.server.access-control.enabled = false - -# Allowed users to access Livy, by default any user is allowed to access Livy. If user want to -# limit who could access Livy, user should list all the permitted users with comma separated. -# livy.server.access-control.allowed-users = * - -# A list of users with comma separated has the permission to change other user's submitted -# session, like submitting statements, deleting session. -# livy.server.access-control.modify-users = - -# A list of users with comma separated has the permission to view other user's infomation, like -# submitted session state, statement results. -# livy.server.access-control.view-users = -# -# Authentication support for Livy server -# Livy has a built-in SPnego authentication support for HTTP requests with below configurations. -# livy.server.auth.type = kerberos -# livy.server.auth.kerberos.principal = -# livy.server.auth.kerberos.keytab = -# livy.server.auth.kerberos.name-rules = DEFAULT -# -# If user wants to use custom authentication filter, configurations are: -# livy.server.auth.type = -# livy.server.auth..class = -# livy.server.auth..param. = -# livy.server.auth..param. = \ No newline at end of file diff --git a/docker/spark/Dockerfile b/docker/spark/Dockerfile deleted file mode 100644 index c85a6ab34..000000000 --- a/docker/spark/Dockerfile +++ /dev/null @@ -1,49 +0,0 @@ -# builder step used to download and configure spark environment -FROM openjdk:11.0.11-jre-slim-buster as builder - -# Add Dependencies for PySpark -RUN apt-get update && apt-get install -y curl vim wget software-properties-common ssh net-tools ca-certificates python3 python3-pip python3-numpy python3-matplotlib python3-scipy python3-pandas python3-simpy - -RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)" 1 - -# Fix the value of PYTHONHASHSEED -# Note: this is needed when you use Python 3.3 or greater -ENV SPARK_VERSION=3.4.0 \ -HADOOP_VERSION=3 \ -SPARK_HOME=/opt/spark \ -PYTHONHASHSEED=1 - -# Download and uncompress spark from the apache archive -RUN wget --no-verbose -O apache-spark.tgz "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" \ -&& mkdir -p /opt/spark \ -&& tar -xf apache-spark.tgz -C /opt/spark --strip-components=1 \ -&& rm apache-spark.tgz - - -# Apache spark environment -FROM builder as apache-spark - -WORKDIR /opt/spark - -ENV SPARK_MASTER_PORT=7077 \ -SPARK_MASTER_WEBUI_PORT=8080 \ -SPARK_LOG_DIR=/opt/spark/logs \ -SPARK_MASTER_LOG=/opt/spark/logs/spark-master.out \ -SPARK_CONNECT_LOG=/opt/spark/logs/spark-connect.out \ -SPARK_WORKER_LOG=/opt/spark/logs/spark-worker.out \ -SPARK_WORKER_WEBUI_PORT=8080 \ -SPARK_WORKER_PORT=7000 \ -SPARK_MASTER="spark://spark-master:7077" \ -SPARK_WORKLOAD="master" - -EXPOSE 8080 7077 6066 - -RUN mkdir -p $SPARK_LOG_DIR && \ -touch $SPARK_MASTER_LOG && \ -touch $SPARK_WORKER_LOG && \ -ln -sf /dev/stdout $SPARK_MASTER_LOG && \ -ln -sf /dev/stdout $SPARK_WORKER_LOG - -COPY start-spark.sh / - -CMD ["/bin/bash", "/start-spark.sh"] \ No newline at end of file diff --git a/docker/spark/start-spark.sh b/docker/spark/start-spark.sh deleted file mode 100644 index 2fad05d54..000000000 --- a/docker/spark/start-spark.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -. "/opt/spark/bin/load-spark-env.sh" - -# When the spark work_load is master, run class org.apache.spark.deploy.master.Master -if [ "$SPARK_WORKLOAD" == "master" ]; then - export SPARK_MASTER_HOST=`hostname` - cd /opt/spark/bin && ./spark-class org.apache.spark.deploy.master.Master --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG - # Start the connect server - cd /opt/spark/bin && ./start-connect-server.sh --packages org.apache.spark:spark-connect_2.12:$SPARK_VERSION >> $SPARK_CONNECT_LOG - -elif [ "$SPARK_WORKLOAD" == "worker" ]; then - # When the spark work_load is worker, run class org.apache.spark.deploy.master.Worker - cd /opt/spark/bin && ./spark-class org.apache.spark.deploy.worker.Worker --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG - -elif [ "$SPARK_WORKLOAD" == "submit" ]; then - echo "SPARK SUBMIT" -else - echo "Undefined Workload Type $SPARK_WORKLOAD, must specify: master, worker, submit" -fi -