
Commit

update docker-compose.yml and different docs relating to the local spark-flint tests

Signed-off-by: YANGDB <[email protected]>
YANG-DB committed Oct 4, 2023
1 parent 2ae65f4 commit 6e3437d
Showing 6 changed files with 215 additions and 56 deletions.
2 changes: 1 addition & 1 deletion .env
@@ -1,3 +1,3 @@
# version for opensearch & opensearch-dashboards docker image
VERSION=2.9.0
VERSION=2.10.0

118 changes: 64 additions & 54 deletions docker-compose.yml
@@ -8,57 +8,57 @@ x-default-logging: &logging
max-file: "2"

volumes:
opensearch-data:
opensearch-data1:
opensearch-data2:

services:
spark-master:
image: our-own-apache-spark:3.4.0
container_name: spark_master
image: docker.io/bitnami/spark:2
ports:
- "9090:8080"
- "7077:7077"
networks:
- net
volumes:
- ./docker/spark/conf/log4j.properties:/opt/bitnami/spark/conf/log4j.properties
- ./docker/spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
- ./apps:/opt/spark-apps
- ./data:/opt/spark-data
- ./jars:/opt/spark-jars
environment:
- HADOOP_USER_NAME=root
- SPARK_USER=root
- SPARK_LOCAL_IP=spark-master
- SPARK_WORKLOAD=master
- SPARK_MODE=master
- SPARK_CONF_DIR=/opt/spark-conf # Pointing to the directory with custom Spark configurations
- SPARK_DRIVER_EXTRA_CLASSPATH=/opt/spark-jars/* # Add the jar to classpath for driver
- SPARK_EXECUTOR_EXTRA_CLASSPATH=/opt/spark-jars/* # Add the jar to classpath for executor
- SPARK_SQL_EXTENSIONS=org.opensearch.flint.FlintPPLSparkExtensions # Set the Spark SQL Extensions
spark-worker-1:
image: our-own-apache-spark:3.4.0
image: docker.io/bitnami/spark:2
container_name: spark_worker1
ports:
- "9091:8080"
- "7000:7000"
networks:
- net
depends_on:
- spark-master
environment:
- SPARK_MASTER=spark://spark-master:7077
- SPARK_MASTER_URL=spark://spark-master:7077
- SPARK_WORKER_CORES=1
- SPARK_WORKER_MEMORY=1G
- SPARK_DRIVER_MEMORY=1G
- SPARK_EXECUTOR_MEMORY=1G
- SPARK_WORKLOAD=worker
- SPARK_MODE=worker
- SPARK_LOCAL_IP=spark-worker-1
- SPARK_DRIVER_EXTRA_CLASSPATH=/opt/spark-jars/*
- SPARK_EXECUTOR_EXTRA_CLASSPATH=/opt/spark-jars/*
volumes:
- ./apps:/opt/spark-apps
- ./data:/opt/spark-data
spark-worker-2:
image: our-own-apache-spark:3.4.0
ports:
- "9092:8080"
- "7001:7000"
depends_on:
- spark-master
environment:
- SPARK_MASTER=spark://spark-master:7077
- SPARK_WORKER_CORES=1
- SPARK_WORKER_MEMORY=1G
- SPARK_DRIVER_MEMORY=1G
- SPARK_EXECUTOR_MEMORY=1G
- SPARK_WORKLOAD=worker
- SPARK_LOCAL_IP=spark-worker-2
volumes:
- ./apps:/opt/spark-apps
- ./data:/opt/spark-data

- ./logs:/opt/spark-logs
livy-server:
container_name: livy_server
build: ./docker/livy/
@@ -81,52 +81,62 @@ services:
depends_on:
- spark-master
- spark-worker-1
- spark-worker-2
# OpenSearch store - node (not for production - no security - only for test purpose )
opensearch:
image: opensearchstaging/opensearch:${VERSION}
container_name: opensearch
opensearch-node1:
image: opensearchproject/opensearch:${VERSION}
container_name: opensearch-node1
environment:
- cluster.name=opensearch-cluster
- node.name=opensearch
- discovery.seed_hosts=opensearch
- cluster.initial_cluster_manager_nodes=opensearch
- bootstrap.memory_lock=true
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
- "DISABLE_INSTALL_DEMO_CONFIG=true"
- "DISABLE_SECURITY_PLUGIN=true"
- node.name=opensearch-node1
- discovery.seed_hosts=opensearch-node1,opensearch-node2
- cluster.initial_master_nodes=opensearch-node1,opensearch-node2
- bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
ulimits:
memlock:
soft: -1
hard: -1
nofile:
soft: 65536 # Maximum number of open files for the opensearch user - set to at least 65536
soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
hard: 65536
volumes:
- opensearch-data:/usr/share/opensearch/data # Creates volume called opensearch-data1 and mounts it to the container
- opensearch-data1:/usr/share/opensearch/data
ports:
- 9200:9200
- 9600:9600
expose:
- "9200"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9200/_cluster/health?wait_for_status=yellow"]
interval: 20s
timeout: 10s
retries: 10
# OpenSearch store - dashboard
- 9600:9600 # required for Performance Analyzer
networks:
- net
opensearch-node2:
image: opensearchproject/opensearch:${VERSION}
container_name: opensearch-node2
environment:
- cluster.name=opensearch-cluster
- node.name=opensearch-node2
- discovery.seed_hosts=opensearch-node1,opensearch-node2
- cluster.initial_master_nodes=opensearch-node1,opensearch-node2
- bootstrap.memory_lock=true
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
ulimits:
memlock:
soft: -1
hard: -1
nofile:
soft: 65536
hard: 65536
volumes:
- opensearch-data2:/usr/share/opensearch/data
networks:
- net
opensearch-dashboards:
image: opensearchproject/opensearch-dashboards:${VERSION}
container_name: opensearch-dashboards

ports:
- 5601:5601 # Map host port 5601 to container port 5601
- 5601:5601
expose:
- "5601" # Expose port 5601 for web access to OpenSearch Dashboards
- "5601"
environment:
OPENSEARCH_HOSTS: '["http://opensearch:9200"]' # Define the OpenSearch nodes that OpenSearch Dashboards will query
depends_on:
- opensearch
OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]' # must be a string with no spaces when specified as an environment variable
networks:
- net

networks:
net:
2 changes: 1 addition & 1 deletion docker/livy/conf/livy.conf
@@ -98,7 +98,7 @@ livy.file.local-dir-whitelist = /target/

# Whether to enable HiveContext in livy interpreter, if it is true hive-site.xml will be detected
# on user request and then livy server classpath automatically.
# livy.repl.enable-hive-context =
# livy.repl.enable-hive-context = true

# Recovery mode of Livy. Possible values:
# off: Default. Turn off recovery. Every time Livy shuts down, it stops and forgets all sessions.
31 changes: 31 additions & 0 deletions docker/spark/conf/log4j.properties
@@ -0,0 +1,31 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set everything to be logged to the console
log4j.rootCategory=DEBUG, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Set the default spark-shell log level to WARN. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=DEBUG

# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark_project.jetty=WARN
31 changes: 31 additions & 0 deletions docker/spark/spark-defaults.conf
@@ -0,0 +1,31 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Example:
# spark.master spark://master:7077
# spark.eventLog.enabled true
# spark.eventLog.dir hdfs://namenode:8021/directory
# spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 2g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"


#spark.jars.packages = org.postgresql:postgresql:9.4.1207.jar
#spark.driver.extraClassPath = /opt/bitnami/spark/jars/postgresql-9.4.1207.jar
87 changes: 87 additions & 0 deletions docs/Local-testing.md
@@ -0,0 +1,87 @@
# Testing Locally with Spark
This document reviews the local docker-compose-based environment in which the Flint/PPL Spark plugins can be tested and explored.

## Overview
The following components are part of this testing environment:

### Livy
Apache Livy is a service that enables easy interaction with a Spark cluster over a REST interface. It supports submitting Spark jobs or snippets of Spark code, retrieving results synchronously or asynchronously, and managing the Spark context, all via a simple REST interface or an RPC client library.
Livy provides a comprehensive [REST API](https://livy.apache.org/docs/latest/rest-api.html) to interact with the Spark cluster in a simplified way.
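
As a quick sanity check of the environment (the Livy server in this docker-compose setup is reachable on port 8998, as used in the tutorial below), the active sessions can be listed over the same REST API. A minimal sketch:

```
# list all Livy sessions (sketch; expects the livy-server container to be up)
curl --location --request GET 'http://localhost:8998/sessions'
```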

## Test Tutorial
First, we need to create a Livy session:
```
curl --location --request POST 'http://localhost:8998/sessions' \
--header 'Content-Type: application/json' \
--data-raw '{
"kind": "sql",
"proxyUser": "a_user"
}'
```
This call responds with a session ID in the following manner:
```json5
{
"id": 0,
"name": null,
"appId": null,
"owner": null,
"proxyUser": null,
"state": "starting",
"kind": "sql",
"appInfo": {
"driverLogUrl": null,
"sparkUiUrl": null
},
"log": [
"stdout: ",
"\nstderr: "
]
}
```
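
The session starts in the `starting` state. Before submitting statements, it is worth polling the session until its state becomes `idle`. A small sketch, reusing the session ID `0` returned above:

```
# poll the session state (sketch); repeat until it reports "idle"
curl --location --request GET 'http://localhost:8998/sessions/0' | jq '.state'
```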

Once the session is created, we can submit a SQL statement as follows:
```
curl --location --request POST 'http://localhost:8998/sessions/0/statements' \
--header 'Content-Type: application/json' \
--data-raw '{
"code": "spark.sql(\"CREATE TABLE test_table (id INT, name STRING)\")"
}'
```

This call responds with the following acknowledgment:
```json5
{"id":0,"code":"select 1","state":"waiting","output":null,"progress":0.0,"started":0,"completed":0}
```
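
Statement execution is asynchronous (note the `waiting` state above). A sketch for polling a statement until its state becomes `available`, here statement `0` of session `0`:

```
# poll the statement state (sketch); repeat until it reports "available"
curl --location --request GET 'http://localhost:8998/sessions/0/statements/0' | jq '.state'
```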

Next, we can insert some data into that table:
```
curl --location --request POST 'http://localhost:8998/sessions/0/statements' \
--header 'Content-Type: application/json' \
--data-raw '{
"code": "spark.sql(\"INSERT INTO test_table VALUES (1, 'John'), (2, 'Doe')\")"
}'
```

Now let's query the table using SQL:
```
curl --location --request POST 'http://localhost:8998/sessions/0/statements' \
--header 'Content-Type: application/json' \
--data-raw '{
"code": "spark.sql(\"SELECT * FROM test_table\").show()"
}'
```

We can now see the Livy session created with the execution running:

![Livy UI session Image]()

To get the response of this statement, use the following API call (note that the `application/json` key must be quoted inside the jq filter):
`curl --location --request GET http://localhost:8998/sessions/0/statements/0 | jq '.output.data."application/json".data'`

This responds with output similar to the following:
```text
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 298 100 298 0 0 6610 0 --:--:-- --:--:-- --:--:-- 7641
```
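
When the exploration is done, the session can be closed through the same REST API so that the cluster resources are released. A sketch, again assuming session ID `0`:

```
# delete the Livy session when finished (sketch)
curl --location --request DELETE 'http://localhost:8998/sessions/0'
```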
