Skip to content

Commit

Permalink
Merge branch 'main' into ppl-projection-command
Browse files Browse the repository at this point in the history
  • Loading branch information
YANG-DB committed Dec 21, 2024
2 parents e6bb6b2 + 20ef890 commit 2337358
Show file tree
Hide file tree
Showing 42 changed files with 3,062 additions and 510 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ bin/spark-shell --packages "org.opensearch:opensearch-spark-ppl_2.12:0.7.0-SNAPS
### PPL Run queries on a local spark cluster
See ppl usage sample on local spark cluster [PPL on local spark ](docs/ppl-lang/local-spark-ppl-test-instruction.md)

### Running integration tests on a local spark cluster
See integration test documentation [Docker Integration Tests](integ-test/script/README.md)

## Code of Conduct

Expand Down
13 changes: 13 additions & 0 deletions docker/integ-test/.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
SPARK_VERSION=3.5.3
OPENSEARCH_VERSION=latest
DASHBOARDS_VERSION=latest
MASTER_UI_PORT=8080
MASTER_PORT=7077
UI_PORT=4040
SPARK_CONNECT_PORT=15002
PPL_JAR=../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar
FLINT_JAR=../../flint-spark-integration/target/scala-2.12/flint-spark-integration-assembly-0.7.0-SNAPSHOT.jar
OPENSEARCH_NODE_MEMORY=512m
OPENSEARCH_ADMIN_PASSWORD=C0rrecthorsebatterystaple.
OPENSEARCH_PORT=9200
OPENSEARCH_DASHBOARDS_PORT=5601
143 changes: 143 additions & 0 deletions docker/integ-test/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
services:
spark:
image: bitnami/spark:${SPARK_VERSION:-3.5.3}
container_name: spark
ports:
- "${MASTER_UI_PORT:-8080}:8080"
- "${MASTER_PORT:-7077}:7077"
- "${UI_PORT:-4040}:4040"
- "${SPARK_CONNECT_PORT}:15002"
entrypoint: /opt/bitnami/scripts/spark/master-entrypoint.sh
environment:
- SPARK_MODE=master
- SPARK_RPC_AUTHENTICATION_ENABLED=no
- SPARK_RPC_ENCRYPTION_ENABLED=no
- SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
- SPARK_SSL_ENABLED=no
- SPARK_PUBLIC_DNS=localhost
volumes:
- type: bind
source: ./spark-master-entrypoint.sh
target: /opt/bitnami/scripts/spark/master-entrypoint.sh
- type: bind
source: ./spark-defaults.conf
target: /opt/bitnami/spark/conf/spark-defaults.conf
- type: bind
source: ./log4j2.properties
target: /opt/bitnami/spark/conf/log4j2.properties
- type: bind
source: $PPL_JAR
target: /opt/bitnami/spark/jars/ppl-spark-integration.jar
- type: bind
source: $FLINT_JAR
target: /opt/bitnami/spark/jars/flint-spark-integration.jar
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/"]
interval: 1m
timeout: 5s
retries: 3
start_period: 30s
start_interval: 5s
networks:
- opensearch-net

spark-worker:
image: bitnami/spark:${SPARK_VERSION:-3.5.3}
container_name: spark-worker
environment:
- SPARK_MODE=worker
- SPARK_MASTER_URL=spark://spark:7077
- SPARK_WORKER_MEMORY=${WORKER_MEMORY:-1G}
- SPARK_WORKER_CORES=${WORKER_CORES:-1}
- SPARK_RPC_AUTHENTICATION_ENABLED=no
- SPARK_RPC_ENCRYPTION_ENABLED=no
- SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
- SPARK_SSL_ENABLED=no
- SPARK_PUBLIC_DNS=localhost
volumes:
- type: bind
source: ./spark-defaults.conf
target: /opt/bitnami/spark/conf/spark-defaults.conf
- type: bind
source: ./log4j2.properties
target: /opt/bitnami/spark/conf/log4j2.properties
- type: bind
source: $PPL_JAR
target: /opt/bitnami/spark/jars/ppl-spark-integration.jar
- type: bind
source: $FLINT_JAR
target: /opt/bitnami/spark/jars/flint-spark-integration.jar
networks:
- opensearch-net
depends_on:
- spark

opensearch:
image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-latest}
container_name: opensearch
environment:
- cluster.name=opensearch-cluster
- node.name=opensearch
- discovery.seed_hosts=opensearch
- cluster.initial_cluster_manager_nodes=opensearch
- bootstrap.memory_lock=true
- plugins.security.ssl.http.enabled=false
- OPENSEARCH_JAVA_OPTS=-Xms${OPENSEARCH_NODE_MEMORY:-512m} -Xmx${OPENSEARCH_NODE_MEMORY:-512m}
- OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_ADMIN_PASSWORD}
ulimits:
memlock:
soft: -1
hard: -1
nofile:
soft: 65536
hard: 65536
volumes:
- opensearch-data:/usr/share/opensearch/data
ports:
- ${OPENSEARCH_PORT:-9200}:9200
- 9600:9600
expose:
- "${OPENSEARCH_PORT:-9200}"
healthcheck:
test: ["CMD", "curl", "-f", "-u", "admin:${OPENSEARCH_ADMIN_PASSWORD}", "http://localhost:9200/_cluster/health"]
interval: 1m
timeout: 5s
retries: 3
start_period: 30s
start_interval: 5s
networks:
- opensearch-net

opensearch-dashboards:
image: opensearchproject/opensearch-dashboards:${DASHBOARDS_VERSION}
container_name: opensearch-dashboards
ports:
- ${OPENSEARCH_DASHBOARDS_PORT:-5601}:5601
expose:
- "${OPENSEARCH_DASHBOARDS_PORT:-5601}"
environment:
OPENSEARCH_HOSTS: '["http://opensearch:9200"]'
networks:
- opensearch-net
depends_on:
- opensearch

minio:
image: minio/minio
container_name: minio-S3
# See original entrypoint/command under https://github.com/minio/minio/blob/master/Dockerfile
entrypoint: sh -c 'mkdir -p /data/test && minio server /data --console-address ":9001"'
ports:
- "9000:9000"
- "9001:9001"
volumes:
- minio-data:/data
networks:
- opensearch-net

volumes:
opensearch-data:
minio-data:

networks:
opensearch-net:
69 changes: 69 additions & 0 deletions docker/integ-test/log4j2.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set everything to be logged to the console
rootLogger.level = info
rootLogger.appenderRef.stdout.ref = console

# In the pattern layout configuration below, we specify an explicit `%ex` conversion
# pattern for logging Throwables. If this was omitted, then (by default) Log4J would
# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional
# class packaging information. That extra information can sometimes add a substantial
# performance overhead, so we disable it in our default logging config.
# For more information, see SPARK-39361.
appender.console.type = Console
appender.console.name = console
appender.console.target = SYSTEM_ERR
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex

# Set the default spark-shell/spark-sql log level to WARN. When running the
# spark-shell/spark-sql, the log level for these classes is used to overwrite
# the root logger's log level, so that the user can have different defaults
# for the shell and regular Spark apps.
logger.repl.name = org.apache.spark.repl.Main
logger.repl.level = warn

logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver
logger.thriftserver.level = warn

# Settings to quiet third party logs that are too verbose
logger.jetty1.name = org.sparkproject.jetty
logger.jetty1.level = warn
logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle
logger.jetty2.level = error
logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper
logger.replexprTyper.level = info
logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter
logger.replSparkILoopInterpreter.level = info
logger.parquet1.name = org.apache.parquet
logger.parquet1.level = error
logger.parquet2.name = parquet
logger.parquet2.level = error

# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler
logger.RetryingHMSHandler.level = fatal
logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry
logger.FunctionRegistry.level = error

# For deploying Spark ThriftServer
# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
appender.console.filter.1.type = RegexFilter
appender.console.filter.1.regex = .*Thrift error occurred during processing of message.*
appender.console.filter.1.onMatch = deny
appender.console.filter.1.onMismatch = neutral
23 changes: 23 additions & 0 deletions docker/integ-test/prepare_scala_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env python3

import csv

queries = None
with open('../../integ-test/script/test_cases.csv', 'r') as f:
reader = csv.DictReader(f)
queries = [(row['query'], i, row.get('expected_status', None)) for i, row in enumerate(reader, start=1) if row['query'].strip()]

print('try {')
for query in queries:
query_str = query[0].replace('\n', '').replace('"', '\\"')
if 'FAILED' == query[2]:
print(' try {')
print(f' spark.sql("{query_str}")')
print(' throw new Error')
print(' } catch {')
print(' case e: Exception => null')
print(' }\n')
else:
print(f' spark.sql("{query_str}")\n')
print('}')

Loading

0 comments on commit 2337358

Please sign in to comment.