diff --git a/docker/apache-spark-sample/.env b/docker/apache-spark-sample/.env new file mode 100644 index 000000000..a047df5ba --- /dev/null +++ b/docker/apache-spark-sample/.env @@ -0,0 +1,4 @@ +MASTER_UI_PORT=8080 +MASTER_PORT=7077 +UI_PORT=4040 +PPL_JAR=../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar diff --git a/docker/apache-spark-sample/docker-compose.yml b/docker/apache-spark-sample/docker-compose.yml index a94151813..df2da6d52 100644 --- a/docker/apache-spark-sample/docker-compose.yml +++ b/docker/apache-spark-sample/docker-compose.yml @@ -2,9 +2,9 @@ services: spark: image: bitnami/spark:3.5.3 ports: - - "8080:8080" - - "7077:7077" - - "4040:4040" + - "${MASTER_UI_PORT:-8080}:8080" + - "${MASTER_PORT:-7077}:7077" + - "${UI_PORT:-4040}:4040" environment: - SPARK_MODE=master - SPARK_RPC_AUTHENTICATION_ENABLED=no @@ -17,16 +17,16 @@ services: source: ./spark-defaults.conf target: /opt/bitnami/spark/conf/spark-defaults.conf - type: bind - source: ../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar - target: /opt/bitnami/spark/jars/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar + source: $PPL_JAR + target: /opt/bitnami/spark/jars/ppl-spark-integration.jar spark-worker: image: bitnami/spark:3.5.3 environment: - SPARK_MODE=worker - SPARK_MASTER_URL=spark://spark:7077 - - SPARK_WORKER_MEMORY=1G - - SPARK_WORKER_CORES=1 + - SPARK_WORKER_MEMORY=${WORKER_MEMORY:-1G} + - SPARK_WORKER_CORES=${WORKER_CORES:-1} - SPARK_RPC_AUTHENTICATION_ENABLED=no - SPARK_RPC_ENCRYPTION_ENABLED=no - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no @@ -37,5 +37,5 @@ services: source: ./spark-defaults.conf target: /opt/bitnami/spark/conf/spark-defaults.conf - type: bind - source: ../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar - target: /opt/bitnami/spark/jars/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar + source: $PPL_JAR + target: 
/opt/bitnami/spark/jars/ppl-spark-integration.jar diff --git a/docker/spark-emr-sample/.env b/docker/spark-emr-sample/.env new file mode 100644 index 000000000..a717532a4 --- /dev/null +++ b/docker/spark-emr-sample/.env @@ -0,0 +1 @@ +PPL_JAR=../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar diff --git a/docker/spark-emr-sample/docker-compose.yml b/docker/spark-emr-sample/docker-compose.yml new file mode 100644 index 000000000..d0da9f166 --- /dev/null +++ b/docker/spark-emr-sample/docker-compose.yml @@ -0,0 +1,17 @@ +services: + spark-emr: + image: public.ecr.aws/emr-serverless/spark/emr-7.5.0:20241125 + volumes: + - type: bind + source: ./logging-conf + target: /var/loggingConfiguration/spark + - type: bind + source: ../spark-sample-app/target/scala-2.12 + target: /app + - type: bind + source: ./spark-conf + target: /etc/spark/conf + - type: bind + source: ${PPL_JAR} + target: /usr/lib/spark/jars/ppl-spark-integration.jar + command: driver --class MyApp /app/myapp_2.12-1.0.jar diff --git a/docs/spark-docker.md b/docs/spark-docker.md index 8a84254e0..d1200e2b3 100644 --- a/docs/spark-docker.md +++ b/docs/spark-docker.md @@ -7,7 +7,67 @@ Spark PPL extension, the docker image can be used to test PPL commands. The Bitnami Apache Spark image can be used to run a Spark cluster and also to run `spark-shell` for running queries. -## Setup +## Prepare OpenSearch Spark PPL Extension + +Create a local build or copy of the OpenSearch Spark PPL extension. Make a note of the +location of the Jar file as well as the name of the Jar file. + +From the root of this repository, build the OpenSearch Spark PPL extension with: + +``` +sbt clean +sbt assembly +``` + +Refer to the [Developer Guide](../DEVELOPER_GUIDE.md) for more information. 
+ +## Using Docker Compose + +There are sample files in this repository at `docker/apache-spark-sample`. They can be used to +start up both nodes with the command: + +``` +docker compose up -d +``` + +The cluster can be stopped with: + +``` +docker compose down +``` + +### Configuration + +There is a file `docker/apache-spark-sample/.env` that can be edited to change some settings. + +| Variable Name | Description | +|----------------|---------------------------------------------------| +| MASTER_UI_PORT | Host port to bind to port 8080 of the master node | +| MASTER_PORT | Host port to bind to port 7077 of the master node | +| UI_PORT | Host port to bind to port 4040 of the master node | +| PPL_JAR | Path to the PPL Jar file | + +## Running Spark Shell + +`spark-shell` can be run on the master node. + +``` +docker exec -it apache-spark-sample-spark-1 /opt/bitnami/spark/bin/spark-shell +``` + +Within the Spark Shell, you can submit queries, including PPL queries. For example, a sample +table can be created, populated and finally queried using PPL. + +``` +spark.sql("CREATE TABLE test_table(id int, name varchar(100))") +spark.sql("INSERT INTO test_table (id, name) VALUES(1, 'Foo')") +spark.sql("INSERT INTO test_table (id, name) VALUES(2, 'Bar')") +spark.sql("source=test_table | eval x = id + 5 | fields x, name").show() +``` + +For further information, see the [Spark PPL Test Instructions](ppl-lang/local-spark-ppl-test-instruction.md). + +## Manual Setup ### spark-conf @@ -102,29 +162,3 @@ docker run \ Jar file. * `` Replace with the filename of the OpenSearch Spark PPL extension Jar file. - -## Running Spark Shell - -Can run `spark-shell` on the master node. - -``` -docker exec -it spark /opt/bitnami/spark/bin/spark-shell -``` - -Within the Spark Shell, you can submit queries, including PPL queries. 
- -## Docker Compose Sample - -There is a sample `docker-compose.yml` file in this repository at -`docker/apache-spark-sample/docker-compose.yml` It can be used to start up both nodes with -the command: - -``` -docker compose up -d -``` - -The cluster can be stopped with: - -``` -docker compose down -``` \ No newline at end of file diff --git a/docs/spark-emr-docker.md b/docs/spark-emr-docker.md index 186cba942..7eef4d250 100644 --- a/docs/spark-emr-docker.md +++ b/docs/spark-emr-docker.md @@ -6,7 +6,53 @@ extension, the docker image can be used to test PPL commands. The Spark EMR image will run an Apache Spark app if one was specified and then shutdown. -## Setup +## Prepare OpenSearch Spark PPL Extension + +Create a local build or copy of the OpenSearch Spark PPL extension. Make a note of the +location of the Jar file as well as the name of the Jar file. + +From the root of this repository, build the OpenSearch Spark PPL extension with: + +``` +sbt clean +sbt assembly +``` + +Refer to the [Developer Guide](../DEVELOPER_GUIDE.md) for more information. + +## Using Docker Compose + +There are sample files in this repository at `docker/spark-emr-sample`. They can be used to +run the Spark EMR container: + +``` +docker compose up +``` + +Remove the docker resources afterwards with: + +``` +docker compose down +``` + +### Configuration + +There is a file `docker/spark-emr-sample/.env` that can be edited to change some settings. + +| Variable Name | Description | +|----------------|---------------------------------------------------| +| PPL_JAR | Path to the PPL Jar file | + +## Logs + +The logs are available in `/var/log/spark` in the docker container. + +STDERR for the app run is available in `/var/log/spark/user/stderr`. + +STDOUT for the app +run is available in `/var/log/spark/user/stdout`. + +## Manual Setup Need to create two directories. These directories will be bound to the directories in the image. 
@@ -99,12 +145,3 @@ docker run \ * `/app/myapp_2.12-1.0.jar` The full path within the docker container where the Jar file of the Spark app is located. - -## Logs - -The logs are available in `/var/log/spark` in the docker container. - -STDERR for the app run is available in `/var/log/spark/user/stderr`. - -STDOUT for the app -run is available in `/var/log/spark/user/stdout`. \ No newline at end of file