
Commit

update docker-compose.yml and different docs relating to the local spark-flint tests

Signed-off-by: YANGDB <[email protected]>
YANG-DB committed Oct 4, 2023
1 parent 2ae65f4 commit 6e3437d
Showing 6 changed files with 215 additions and 56 deletions.
2 changes: 1 addition & 1 deletion .env
@@ -1,3 +1,3 @@
# version for opensearch & opensearch-dashboards docker image
VERSION=2.9.0
VERSION=2.10.0

118 changes: 64 additions & 54 deletions docker-compose.yml
@@ -8,57 +8,57 @@ x-default-logging: &logging
max-file: "2"

volumes:
opensearch-data:
opensearch-data1:
opensearch-data2:

services:
spark-master:
image: our-own-apache-spark:3.4.0
container_name: spark_master
image: docker.io/bitnami/spark:2
ports:
- "9090:8080"
- "7077:7077"
networks:
- net
volumes:
- ./docker/spark/conf/log4j.properties:/opt/bitnami/spark/conf/log4j.properties
- ./docker/spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
- ./apps:/opt/spark-apps
- ./data:/opt/spark-data
- ./jars:/opt/spark-jars
environment:
- HADOOP_USER_NAME=root
- SPARK_USER=root
- SPARK_LOCAL_IP=spark-master
- SPARK_WORKLOAD=master
- SPARK_MODE=master
- SPARK_CONF_DIR=/opt/spark-conf # Pointing to the directory with custom Spark configurations
- SPARK_DRIVER_EXTRA_CLASSPATH=/opt/spark-jars/* # Add the jar to classpath for driver
- SPARK_EXECUTOR_EXTRA_CLASSPATH=/opt/spark-jars/* # Add the jar to classpath for executor
- SPARK_SQL_EXTENSIONS=org.opensearch.flint.FlintPPLSparkExtensions # Set the Spark SQL Extensions
spark-worker-1:
image: our-own-apache-spark:3.4.0
image: docker.io/bitnami/spark:2
container_name: spark_worker1
ports:
- "9091:8080"
- "7000:7000"
networks:
- net
depends_on:
- spark-master
environment:
- SPARK_MASTER=spark://spark-master:7077
- SPARK_MASTER_URL=spark://spark-master:7077
- SPARK_WORKER_CORES=1
- SPARK_WORKER_MEMORY=1G
- SPARK_DRIVER_MEMORY=1G
- SPARK_EXECUTOR_MEMORY=1G
- SPARK_WORKLOAD=worker
- SPARK_MODE=worker
- SPARK_LOCAL_IP=spark-worker-1
- SPARK_DRIVER_EXTRA_CLASSPATH=/opt/spark-jars/*
- SPARK_EXECUTOR_EXTRA_CLASSPATH=/opt/spark-jars/*
volumes:
- ./apps:/opt/spark-apps
- ./data:/opt/spark-data
spark-worker-2:
image: our-own-apache-spark:3.4.0
ports:
- "9092:8080"
- "7001:7000"
depends_on:
- spark-master
environment:
- SPARK_MASTER=spark://spark-master:7077
- SPARK_WORKER_CORES=1
- SPARK_WORKER_MEMORY=1G
- SPARK_DRIVER_MEMORY=1G
- SPARK_EXECUTOR_MEMORY=1G
- SPARK_WORKLOAD=worker
- SPARK_LOCAL_IP=spark-worker-2
volumes:
- ./apps:/opt/spark-apps
- ./data:/opt/spark-data

- ./logs:/opt/spark-logs
livy-server:
container_name: livy_server
build: ./docker/livy/
@@ -81,52 +81,62 @@ services:
depends_on:
- spark-master
- spark-worker-1
- spark-worker-2
# OpenSearch store - node (not for production - no security - only for test purpose )
opensearch:
image: opensearchstaging/opensearch:${VERSION}
container_name: opensearch
opensearch-node1:
image: opensearchproject/opensearch:${VERSION}
container_name: opensearch-node1
environment:
- cluster.name=opensearch-cluster
- node.name=opensearch
- discovery.seed_hosts=opensearch
- cluster.initial_cluster_manager_nodes=opensearch
- bootstrap.memory_lock=true
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
- "DISABLE_INSTALL_DEMO_CONFIG=true"
- "DISABLE_SECURITY_PLUGIN=true"
- node.name=opensearch-node1
- discovery.seed_hosts=opensearch-node1,opensearch-node2
- cluster.initial_master_nodes=opensearch-node1,opensearch-node2
- bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
ulimits:
memlock:
soft: -1
hard: -1
nofile:
soft: 65536 # Maximum number of open files for the opensearch user - set to at least 65536
soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
hard: 65536
volumes:
- opensearch-data:/usr/share/opensearch/data # Creates volume called opensearch-data1 and mounts it to the container
- opensearch-data1:/usr/share/opensearch/data
ports:
- 9200:9200
- 9600:9600
expose:
- "9200"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9200/_cluster/health?wait_for_status=yellow"]
interval: 20s
timeout: 10s
retries: 10
# OpenSearch store - dashboard
- 9600:9600 # required for Performance Analyzer
networks:
- net
opensearch-node2:
image: opensearchproject/opensearch:${VERSION}
container_name: opensearch-node2
environment:
- cluster.name=opensearch-cluster
- node.name=opensearch-node2
- discovery.seed_hosts=opensearch-node1,opensearch-node2
- cluster.initial_master_nodes=opensearch-node1,opensearch-node2
- bootstrap.memory_lock=true
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
ulimits:
memlock:
soft: -1
hard: -1
nofile:
soft: 65536
hard: 65536
volumes:
- opensearch-data2:/usr/share/opensearch/data
networks:
- net
opensearch-dashboards:
image: opensearchproject/opensearch-dashboards:${VERSION}
container_name: opensearch-dashboards

ports:
- 5601:5601 # Map host port 5601 to container port 5601
- 5601:5601
expose:
- "5601" # Expose port 5601 for web access to OpenSearch Dashboards
- "5601"
environment:
OPENSEARCH_HOSTS: '["http://opensearch:9200"]' # Define the OpenSearch nodes that OpenSearch Dashboards will query
depends_on:
- opensearch
OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]' # must be a string with no spaces when specified as an environment variable
networks:
- net

networks:
net:
2 changes: 1 addition & 1 deletion docker/livy/conf/livy.conf
@@ -98,7 +98,7 @@ livy.file.local-dir-whitelist = /target/

# Whether to enable HiveContext in livy interpreter, if it is true hive-site.xml will be detected
# on user request and then livy server classpath automatically.
# livy.repl.enable-hive-context =
# livy.repl.enable-hive-context = true

# Recovery mode of Livy. Possible values:
# off: Default. Turn off recovery. Every time Livy shuts down, it stops and forgets all sessions.
31 changes: 31 additions & 0 deletions docker/spark/conf/log4j.properties
@@ -0,0 +1,31 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set everything to be logged to the console
log4j.rootCategory=DEBUG, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Set the default spark-shell log level to WARN. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=DEBUG

# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark_project.jetty=WARN
31 changes: 31 additions & 0 deletions docker/spark/spark-defaults.conf
@@ -0,0 +1,31 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Example:
# spark.master spark://master:7077
# spark.eventLog.enabled true
# spark.eventLog.dir hdfs://namenode:8021/directory
# spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 2g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"


#spark.jars.packages = org.postgresql:postgresql:9.4.1207.jar
#spark.driver.extraClassPath = /opt/bitnami/spark/jars/postgresql-9.4.1207.jar
87 changes: 87 additions & 0 deletions docs/Local-testing.md
@@ -0,0 +1,87 @@
# Testing Locally with Spark
This document reviews the local docker-compose-based environment in which the Flint/PPL Spark plugins can be tested and explored.

## Overview
The following components are part of this testing environment:

### Livy
Apache Livy is a service that enables easy interaction with a Spark cluster over a REST interface. It supports submitting Spark jobs or snippets of Spark code, retrieving results synchronously or asynchronously, and managing the Spark context, all via a simple REST interface or an RPC client library.
Livy provides a comprehensive [REST API](https://livy.apache.org/docs/latest/rest-api.html) to interact with the Spark cluster in a simplified way.
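
As a quick sanity check of the environment (the Livy server in this docker-compose setup is reachable on port 8998, as used in the tutorial below), the active sessions can be listed over the same REST API. A minimal sketch:

```
# list all Livy sessions (sketch; expects the livy-server container to be up)
curl --location --request GET 'http://localhost:8998/sessions'
```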

## Test Tutorial
First, we need to create a Livy session:
```
curl --location --request POST 'http://localhost:8998/sessions' \
--header 'Content-Type: application/json' \
--data-raw '{
"kind": "sql",
"proxyUser": "a_user"
}'
```
This call responds with a session ID in the following manner:
```json5
{
"id": 0,
"name": null,
"appId": null,
"owner": null,
"proxyUser": null,
"state": "starting",
"kind": "sql",
"appInfo": {
"driverLogUrl": null,
"sparkUiUrl": null
},
"log": [
"stdout: ",
"\nstderr: "
]
}
```
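
The session starts in the `starting` state. Before submitting statements, it is worth polling the session until its state becomes `idle`. A small sketch, reusing the session ID `0` returned above:

```
# poll the session state (sketch); repeat until it reports "idle"
curl --location --request GET 'http://localhost:8998/sessions/0' | jq '.state'
```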

Once the session is created, we can submit a SQL statement as follows:
```
curl --location --request POST 'http://localhost:8998/sessions/0/statements' \
--header 'Content-Type: application/json' \
--data-raw '{
"code": "spark.sql(\"CREATE TABLE test_table (id INT, name STRING)\")"
}'
```

This call responds with the following acknowledgment:
```json5
{"id":0,"code":"select 1","state":"waiting","output":null,"progress":0.0,"started":0,"completed":0}
```
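
Statement execution is asynchronous (note the `waiting` state above). A sketch for polling a statement until its state becomes `available`, here statement `0` of session `0`:

```
# poll the statement state (sketch); repeat until it reports "available"
curl --location --request GET 'http://localhost:8998/sessions/0/statements/0' | jq '.state'
```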

Next, we can insert some data into that table:
```
curl --location --request POST 'http://localhost:8998/sessions/0/statements' \
--header 'Content-Type: application/json' \
--data-raw '{
"code": "spark.sql(\"INSERT INTO test_table VALUES (1, 'John'), (2, 'Doe')\")"
}'
```

Now let's query the table using SQL:
```
curl --location --request POST 'http://localhost:8998/sessions/0/statements' \
--header 'Content-Type: application/json' \
--data-raw '{
"code": "spark.sql(\"SELECT * FROM test_table\").show()"
}'
```

We can now see the Livy session created with the execution running:

![Livy UI session Image]()

To get the response of this statement, use the following API call (note that the `application/json` key must be quoted inside the jq filter):
`curl --location --request GET http://localhost:8998/sessions/0/statements/0 | jq '.output.data."application/json".data'`

This responds with output similar to the following:
```text
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 298 100 298 0 0 6610 0 --:--:-- --:--:-- --:--:-- 7641
```
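
When the exploration is done, the session can be closed through the same REST API so that the cluster resources are released. A sketch, again assuming session ID `0`:

```
# delete the Livy session when finished (sketch)
curl --location --request DELETE 'http://localhost:8998/sessions/0'
```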
