From 6e3437d265f387ea8851c01c491c44b6c80f4421 Mon Sep 17 00:00:00 2001
From: YANGDB
Date: Wed, 4 Oct 2023 14:39:58 -0700
Subject: [PATCH] update docker-compose.yml and different docs relating to the local spark-flint tests

Signed-off-by: YANGDB
---
 .env                               |   2 +-
 docker-compose.yml                 | 118 ++++++++++++++++----------
 docker/livy/conf/livy.conf         |   2 +-
 docker/spark/conf/log4j.properties |  31 ++++
 docker/spark/spark-defaults.conf   |  31 ++++
 docs/Local-testing.md              |  87 +++++++++++++++++++++
 6 files changed, 215 insertions(+), 56 deletions(-)
 create mode 100644 docker/spark/conf/log4j.properties
 create mode 100644 docker/spark/spark-defaults.conf
 create mode 100644 docs/Local-testing.md

diff --git a/.env b/.env
index 997507e85..aad3e4baf 100644
--- a/.env
+++ b/.env
@@ -1,3 +1,3 @@
 # version for opensearch & opensearch-dashboards docker image
-VERSION=2.9.0
+VERSION=2.10.0
diff --git a/docker-compose.yml b/docker-compose.yml
index 6c1e246ef..892feac34 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -8,57 +8,57 @@ x-default-logging: &logging
     max-file: "2"
 volumes:
-  opensearch-data:
+  opensearch-data1:
+  opensearch-data2:
 services:
   spark-master:
-    image: our-own-apache-spark:3.4.0
+    container_name: spark_master
+    image: docker.io/bitnami/spark:2
     ports:
       - "9090:8080"
       - "7077:7077"
+    networks:
+      - net
     volumes:
+      - ./docker/spark/conf/log4j.properties:/opt/bitnami/spark/conf/log4j.properties
+      - ./docker/spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
       - ./apps:/opt/spark-apps
       - ./data:/opt/spark-data
+      - ./jars:/opt/spark-jars
     environment:
+      - HADOOP_USER_NAME=root
+      - SPARK_USER=root
       - SPARK_LOCAL_IP=spark-master
-      - SPARK_WORKLOAD=master
+      - SPARK_MODE=master
+      - SPARK_CONF_DIR=/opt/spark-conf # Pointing to the directory with custom Spark configurations
+      - SPARK_DRIVER_EXTRA_CLASSPATH=/opt/spark-jars/* # Add the jar to classpath for driver
+      - SPARK_EXECUTOR_EXTRA_CLASSPATH=/opt/spark-jars/* # Add the jar to classpath for executor
+      - SPARK_SQL_EXTENSIONS=org.opensearch.flint.FlintPPLSparkExtensions # Set the Spark SQL Extensions
   spark-worker-1:
-    image: our-own-apache-spark:3.4.0
+    image: docker.io/bitnami/spark:2
+    container_name: spark_worker1
     ports:
       - "9091:8080"
       - "7000:7000"
+    networks:
+      - net
     depends_on:
       - spark-master
     environment:
-      - SPARK_MASTER=spark://spark-master:7077
+      - SPARK_MASTER_URL=spark://spark-master:7077
       - SPARK_WORKER_CORES=1
       - SPARK_WORKER_MEMORY=1G
       - SPARK_DRIVER_MEMORY=1G
       - SPARK_EXECUTOR_MEMORY=1G
-      - SPARK_WORKLOAD=worker
+      - SPARK_MODE=worker
       - SPARK_LOCAL_IP=spark-worker-1
+      - SPARK_DRIVER_EXTRA_CLASSPATH=/opt/spark-jars/*
+      - SPARK_EXECUTOR_EXTRA_CLASSPATH=/opt/spark-jars/*
     volumes:
       - ./apps:/opt/spark-apps
       - ./data:/opt/spark-data
-  spark-worker-2:
-    image: our-own-apache-spark:3.4.0
-    ports:
-      - "9092:8080"
-      - "7001:7000"
-    depends_on:
-      - spark-master
-    environment:
-      - SPARK_MASTER=spark://spark-master:7077
-      - SPARK_WORKER_CORES=1
-      - SPARK_WORKER_MEMORY=1G
-      - SPARK_DRIVER_MEMORY=1G
-      - SPARK_EXECUTOR_MEMORY=1G
-      - SPARK_WORKLOAD=worker
-      - SPARK_LOCAL_IP=spark-worker-2
-    volumes:
-      - ./apps:/opt/spark-apps
-      - ./data:/opt/spark-data
-
+      - ./logs:/opt/spark-logs
   livy-server:
     container_name: livy_server
     build: ./docker/livy/
@@ -81,52 +81,62 @@ services:
     depends_on:
       - spark-master
       - spark-worker-1
-      - spark-worker-2
-  # OpenSearch store - node (not for production - no security - only for test purpose )
-  opensearch:
-    image: opensearchstaging/opensearch:${VERSION}
-    container_name: opensearch
+  opensearch-node1:
+    image: opensearchproject/opensearch:${VERSION}
+    container_name: opensearch-node1
     environment:
       - cluster.name=opensearch-cluster
-      - node.name=opensearch
-      - discovery.seed_hosts=opensearch
-      - cluster.initial_cluster_manager_nodes=opensearch
-      - bootstrap.memory_lock=true
-      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
-      - "DISABLE_INSTALL_DEMO_CONFIG=true"
-      - "DISABLE_SECURITY_PLUGIN=true"
+      - node.name=opensearch-node1
+      - discovery.seed_hosts=opensearch-node1,opensearch-node2
+      - cluster.initial_master_nodes=opensearch-node1,opensearch-node2
+      - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
+      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
     ulimits:
       memlock:
        soft: -1
        hard: -1
      nofile:
-        soft: 65536 # Maximum number of open files for the opensearch user - set to at least 65536
+        soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
        hard: 65536
     volumes:
-      - opensearch-data:/usr/share/opensearch/data # Creates volume called opensearch-data1 and mounts it to the container
+      - opensearch-data1:/usr/share/opensearch/data
     ports:
       - 9200:9200
-      - 9600:9600
-    expose:
-      - "9200"
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:9200/_cluster/health?wait_for_status=yellow"]
-      interval: 20s
-      timeout: 10s
-      retries: 10
-  # OpenSearch store - dashboard
+      - 9600:9600 # required for Performance Analyzer
+    networks:
+      - net
+  opensearch-node2:
+    image: opensearchproject/opensearch:${VERSION}
+    container_name: opensearch-node2
+    environment:
+      - cluster.name=opensearch-cluster
+      - node.name=opensearch-node2
+      - discovery.seed_hosts=opensearch-node1,opensearch-node2
+      - cluster.initial_master_nodes=opensearch-node1,opensearch-node2
+      - bootstrap.memory_lock=true
+      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+      nofile:
+        soft: 65536
+        hard: 65536
+    volumes:
+      - opensearch-data2:/usr/share/opensearch/data
+    networks:
+      - net
   opensearch-dashboards:
     image: opensearchproject/opensearch-dashboards:${VERSION}
     container_name: opensearch-dashboards
-
     ports:
-      - 5601:5601 # Map host port 5601 to container port 5601
+      - 5601:5601
     expose:
-      - "5601" # Expose port 5601 for web access to OpenSearch Dashboards
+      - "5601"
    environment:
-      OPENSEARCH_HOSTS: '["http://opensearch:9200"]' # Define the OpenSearch nodes that OpenSearch Dashboards will query
-    depends_on:
-      - opensearch
+      OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]' # must be a string with no spaces when specified as an environment variable
+    networks:
+      - net
 networks:
   net:
diff --git a/docker/livy/conf/livy.conf b/docker/livy/conf/livy.conf
index f834bb677..329906b34 100644
--- a/docker/livy/conf/livy.conf
+++ b/docker/livy/conf/livy.conf
@@ -98,7 +98,7 @@ livy.file.local-dir-whitelist = /target/
 
 # Whether to enable HiveContext in livy interpreter, if it is true hive-site.xml will be detected
 # on user request and then livy server classpath automatically.
-# livy.repl.enable-hive-context =
+# livy.repl.enable-hive-context = true
 
 # Recovery mode of Livy. Possible values:
 # off: Default. Turn off recovery. Every time Livy shuts down, it stops and forgets all sessions.
diff --git a/docker/spark/conf/log4j.properties b/docker/spark/conf/log4j.properties
new file mode 100644
index 000000000..5879bbed1
--- /dev/null
+++ b/docker/spark/conf/log4j.properties
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the console
+log4j.rootCategory=DEBUG, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Set the default spark-shell log level to WARN. When running the spark-shell, the
+# log level for this class is used to overwrite the root logger's log level, so that
+# the user can have different defaults for the shell and regular Spark apps.
+log4j.logger.org.apache.spark.repl.Main=DEBUG
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.spark_project.jetty=WARN
\ No newline at end of file
diff --git a/docker/spark/spark-defaults.conf b/docker/spark/spark-defaults.conf
new file mode 100644
index 000000000..729c91873
--- /dev/null
+++ b/docker/spark/spark-defaults.conf
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+# spark.master                     spark://master:7077
+# spark.eventLog.enabled           true
+# spark.eventLog.dir               hdfs://namenode:8021/directory
+# spark.serializer                 org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory              2g
+# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
+
+
+#spark.jars.packages = org.postgresql:postgresql:9.4.1207.jar
+#spark.driver.extraClassPath = /opt/bitnami/spark/jars/postgresql-9.4.1207.jar
diff --git a/docs/Local-testing.md b/docs/Local-testing.md
new file mode 100644
index 000000000..a53859bb6
--- /dev/null
+++ b/docs/Local-testing.md
@@ -0,0 +1,87 @@
+# Testing Locally with Spark
+This document describes the local docker-compose based environment in which the Flint/PPL Spark plugins can be tested and explored.
+
+## Overview
+The following components are part of this testing environment:
+
+### Livy
+Apache Livy is a service that enables easy interaction with a Spark cluster over a REST interface. It enables easy submission of Spark jobs or snippets of Spark code, synchronous or asynchronous result retrieval, as well as Spark Context management, all via a simple REST interface or an RPC client library.
+Livy provides a comprehensive [REST API](https://livy.apache.org/docs/latest/rest-api.html) to interact with the Spark cluster in a simplified way.
+
+## Test Tutorial
+First, we need to create a Livy session:
+```
+curl --location --request POST 'http://localhost:8998/sessions' \
+--header 'Content-Type: application/json' \
+--data-raw '{
+    "kind": "sql",
+    "proxyUser": "a_user"
+}'
+```
+This call responds with a session ID in the following manner:
+```json5
+{
+    "id": 0,
+    "name": null,
+    "appId": null,
+    "owner": null,
+    "proxyUser": null,
+    "state": "starting",
+    "kind": "sql",
+    "appInfo": {
+        "driverLogUrl": null,
+        "sparkUiUrl": null
+    },
+    "log": [
+        "stdout: ",
+        "\nstderr: "
+    ]
+}
+```
+
+Once a session is created, we can submit a SQL statement in the following way:
+```
+curl --location --request POST 'http://localhost:8998/sessions/0/statements' \
+--header 'Content-Type: application/json' \
+--data-raw '{
+    "code": "spark.sql(\"CREATE TABLE test_table (id INT, name STRING)\")"
+}'
+```
+
+This call responds with an acknowledgment similar to:
+```json5
+{"id":0,"code":"select 1","state":"waiting","output":null,"progress":0.0,"started":0,"completed":0}
+```
+
+Next, we can insert some data into that table:
+```
+curl --location --request POST 'http://localhost:8998/sessions/0/statements' \
+--header 'Content-Type: application/json' \
+--data-raw '{
+    "code": "spark.sql(\"INSERT INTO test_table VALUES (1, 'John'), (2, 'Doe')\")"
+}'
+```
+
+Now let's query the table using SQL:
+```
+curl --location --request POST 'http://localhost:8998/sessions/0/statements' \
+--header 'Content-Type: application/json' \
+--data-raw '{
+    "code": "spark.sql(\"SELECT * FROM test_table\").show()"
+}'
+```
+
+We can now see the created Livy session with the statement execution running:
+
+![Livy UI session Image]()
+
+To get the response of this statement, use the following API call:
+`curl --location --request GET http://localhost:8998/sessions/0/statements/0 | jq '.output.data["application/json"].data'`
+
+This responds with the following output:
+```text
+  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
+                                 Dload  Upload   Total   Spent    Left  Speed
+100   298  100   298    0     0   6610      0 --:--:-- --:--:-- --:--:--  7641
+
+```
\ No newline at end of file
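Statement execution in Livy is asynchronous: `GET /sessions/{sessionId}/statements/{statementId}` returns a `state` field, and the result payload only appears once the statement reaches the `available` state. Below is a minimal polling sketch for the tutorial above, assuming `jq` is installed and the session and statement IDs are both `0` as in the previous steps:

```
# Poll the statement until it leaves the waiting/running states, then print its output
STATEMENT_URL='http://localhost:8998/sessions/0/statements/0'

state=$(curl -s "$STATEMENT_URL" | jq -r '.state')
while [ "$state" = "waiting" ] || [ "$state" = "running" ]; do
  sleep 2
  state=$(curl -s "$STATEMENT_URL" | jq -r '.state')
done

# For a "sql" statement the result rows are returned under output.data["application/json"]
curl -s "$STATEMENT_URL" | jq '.output'
```

If the statement fails, its state ends up as `error` rather than `available`, and the failure details are reported in the `output` field instead of a result set.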
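The docker-compose file above wires the Flint PPL extension into the Spark master through the `SPARK_SQL_EXTENSIONS` environment variable. Livy's session-creation request also accepts an optional `conf` object of Spark properties, so a session can request the extension explicitly. The following is a hypothetical variant of the session-creation call; the extension class name is taken from the compose file, and whether a per-session override is needed (or the cluster-wide default already applies) depends on how the images are configured:

```
curl --location --request POST 'http://localhost:8998/sessions' \
--header 'Content-Type: application/json' \
--data-raw '{
    "kind": "sql",
    "proxyUser": "a_user",
    "conf": {
        "spark.sql.extensions": "org.opensearch.flint.FlintPPLSparkExtensions"
    }
}'
```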