diff --git a/scalardb-analytics-spark-sample/.gitignore b/scalardb-analytics-spark-sample/.gitignore
new file mode 100644
index 00000000..ff9c396b
--- /dev/null
+++ b/scalardb-analytics-spark-sample/.gitignore
@@ -0,0 +1 @@
+.scala_history
diff --git a/scalardb-analytics-spark-sample/cert.pem b/scalardb-analytics-spark-sample/cert.pem
deleted file mode 100644
index dd1956f5..00000000
--- a/scalardb-analytics-spark-sample/cert.pem
+++ /dev/null
@@ -1 +0,0 @@
-Please replace this file with the certificate for your license
diff --git a/scalardb-analytics-spark-sample/docker-compose.yml b/scalardb-analytics-spark-sample/docker-compose.yml
index 2261926f..e7050ffc 100644
--- a/scalardb-analytics-spark-sample/docker-compose.yml
+++ b/scalardb-analytics-spark-sample/docker-compose.yml
@@ -1,50 +1,50 @@
 services:
-  spark-shell:
+  spark-sql:
     build:
       context: ./docker
       dockerfile: Dockerfile.spark
     volumes:
       - ./scalardb.properties:/etc/scalardb.properties
-      - ./cert.pem:/etc/cert.pem
-      - .scala_history_jline3:/root/.scala_history_jline3
+      - ./spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
+      - .scala_history:/root/.scala_history
     networks:
       - scalar-network
     profiles:
       - dev
     depends_on:
-      - backend-postgres
-      - backend-cassandra
-      - backend-dynamodb
+      - scalardb-cassandra
+      - scalardb-mysql
+      - postgres
    command:
-      - "/opt/spark/bin/spark-shell"
+      - "/opt/spark/bin/spark-sql"
       - "--packages"
-      - "com.scalar-labs:scalardb-analytics-spark-3.5_2.12:3.12.0"
+      - "com.scalar-labs:scalardb-analytics-spark-all-3.5_2.12:3.14.0"
 
-  backend-postgres:
-    image: postgres:15.1
-    ports:
-      - "5432"
+  sample-data-loader:
+    build:
+      context: sample-data-loader
+      dockerfile: Dockerfile
     volumes:
-      - backend-postgres-data:/var/lib/postgresql/data
-    environment:
-      - POSTGRES_USER=postgres
-      - POSTGRES_PASSWORD=postgres
-      - POSTGRES_DB=test
+      - ./scalardb.properties:/etc/scalardb.properties
+      - ./schema.json:/etc/schema.json
+      - ./data:/data
+    working_dir: /sample-data-loader
     networks:
       - scalar-network
-    healthcheck:
-      test: ["CMD", "psql", "-U", "postgres", "-c", "select 1"]
-      interval: 1s
-      timeout: 1s
-      retries: 10
-      start_period: 1s
+    profiles:
+      - dev
+    depends_on:
+      - scalardb-cassandra
+      - scalardb-mysql
+      - postgres
+    command: ["java", "-jar", "/app.jar"]
 
-  backend-cassandra:
+  scalardb-cassandra:
     image: cassandra:3.11
     ports:
-      - "9042"
+      - 9042
     volumes:
-      - backend-cassandra-data:/var/lib/cassandra
+      - scalardb-cassandra-data:/var/lib/cassandra
     environment:
       - CASSANDRA_DC=dc1
       - CASSANDRA_ENDPOINT_SNITCH=GossipingPropertyFileSnitch
@@ -55,50 +55,52 @@ services:
       interval: 1s
       timeout: 1s
       retries: 10
-      start_period: 5s
+      start_period: 10s
 
-  backend-dynamodb:
-    image: amazon/dynamodb-local:1.21.0
+  scalardb-mysql:
+    image: mysql:8.0
     ports:
-      - "8000"
-    command:
-      [
-        "-jar",
-        "DynamoDBLocal.jar",
-        "-sharedDb",
-        "-dbPath",
-        "/home/dynamodblocal",
-        "-optimizeDbBeforeStartup",
-      ]
+      - 3306
     volumes:
-      - backend-dynamodb-data:/home/dynamodblocal
+      - scalardb-mysql-data:/var/lib/mysql
+    environment:
+      - MYSQL_ROOT_PASSWORD=mysql
+      - MYSQL_DATABASE=sampledb
     networks:
       - scalar-network
+    healthcheck:
+      test: ["CMD", "mysqladmin", "ping", "-h", "localhost", "-u", "root"]
+      interval: 1s
+      timeout: 1s
+      retries: 10
+      start_period: 5s
 
-  sample-data-loader:
-    build:
-      context: sample-data-loader
-      dockerfile: Dockerfile
+  postgres:
+    image: postgres:15.1
+    ports:
+      - 5432
     volumes:
-      - ./scalardb.properties:/etc/scalardb.properties
-      - ./schema.json:/etc/schema.json
-      - ./data:/data
-    working_dir: /sample-data-loader
+      - postgres-data:/var/lib/postgresql/data
+      - ./data/customer.csv:/opt/customer.csv
+      - ./sql/postgres_copy.sql:/docker-entrypoint-initdb.d/postgres_copy.sql
+    environment:
+      - POSTGRES_USER=postgres
+      - POSTGRES_PASSWORD=postgres
+      - POSTGRES_DB=sampledb
     networks:
       - scalar-network
-    profiles:
-      - dev
-    depends_on:
-      - backend-postgres
-      - backend-cassandra
-      - backend-dynamodb
-    command: ["java", "-jar", "/app.jar"]
+    healthcheck:
+      test: ["CMD", "psql", "-U", "postgres", "-c", "select 1"]
+      interval: 1s
+      timeout: 1s
+      retries: 10
+      start_period: 5s
 
 volumes:
   analytics-data: {}
-  backend-postgres-data: {}
-  backend-cassandra-data: {}
-  backend-dynamodb-data: {}
+  scalardb-cassandra-data: {}
+  scalardb-mysql-data: {}
+  postgres-data: {}
 
 networks:
   scalar-network: {}
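For orientation, the two JDBC backends defined above can be smoke-tested from any container attached to scalar-network. This is a minimal illustrative sketch, not part of the sample: the class name is made up, and it assumes the MySQL and PostgreSQL JDBC drivers are on the classpath. The hostnames, ports, and credentials simply mirror the compose file.

import java.sql.Connection;
import java.sql.DriverManager;

public class BackendSmokeTest {
  public static void main(String[] args) throws Exception {
    // scalardb-mysql exposes sampledb with root/mysql (see the environment section above)
    try (Connection c =
        DriverManager.getConnection(
            "jdbc:mysql://scalardb-mysql:3306/sampledb", "root", "mysql")) {
      System.out.println("MySQL reachable: " + c.isValid(1));
    }
    // postgres exposes sampledb with postgres/postgres
    try (Connection c =
        DriverManager.getConnection(
            "jdbc:postgresql://postgres:5432/sampledb", "postgres", "postgres")) {
      System.out.println("PostgreSQL reachable: " + c.isValid(1));
    }
  }
}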
diff --git a/scalardb-analytics-spark-sample/docker/Dockerfile.spark b/scalardb-analytics-spark-sample/docker/Dockerfile.spark
index 6bfdd58e..42f1c6df 100644
--- a/scalardb-analytics-spark-sample/docker/Dockerfile.spark
+++ b/scalardb-analytics-spark-sample/docker/Dockerfile.spark
@@ -3,7 +3,7 @@ FROM eclipse-temurin:17-jre-jammy
 
 WORKDIR /work
 
-ENV SPARK_VERSION 3.5.1
+ENV SPARK_VERSION 3.5.3
 
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
diff --git a/scalardb-analytics-spark-sample/sample-data-loader/build.gradle.kts b/scalardb-analytics-spark-sample/sample-data-loader/build.gradle.kts
index 9be38336..9c2b6d5b 100644
--- a/scalardb-analytics-spark-sample/sample-data-loader/build.gradle.kts
+++ b/scalardb-analytics-spark-sample/sample-data-loader/build.gradle.kts
@@ -1,6 +1,6 @@
 plugins {
     application
-    id("com.github.johnrengelman.shadow") version "7.1.2"
+    id("com.gradleup.shadow") version "8.3.5"
     id("com.diffplug.spotless") version "6.24.0"
 }
 
@@ -9,8 +9,8 @@ repositories {
 }
 
 dependencies {
-    implementation("com.scalar-labs:scalardb:3.12.1")
-    implementation("com.scalar-labs:scalardb-schema-loader:3.12.1")
+    implementation("com.scalar-labs:scalardb:3.14.0")
+    implementation("com.scalar-labs:scalardb-schema-loader:3.14.0")
     implementation("org.apache.commons:commons-csv:1.10.0")
 
     implementation("io.netty:netty-transport-native-epoll:4.1.99.Final:linux-x86_64")
diff --git a/scalardb-analytics-spark-sample/sample-data-loader/gradle/wrapper/gradle-wrapper.jar b/scalardb-analytics-spark-sample/sample-data-loader/gradle/wrapper/gradle-wrapper.jar
index d64cd491..a4b76b95 100644
Binary files a/scalardb-analytics-spark-sample/sample-data-loader/gradle/wrapper/gradle-wrapper.jar and b/scalardb-analytics-spark-sample/sample-data-loader/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/scalardb-analytics-spark-sample/sample-data-loader/gradle/wrapper/gradle-wrapper.properties b/scalardb-analytics-spark-sample/sample-data-loader/gradle/wrapper/gradle-wrapper.properties
index 1af9e093..9355b415 100644
--- a/scalardb-analytics-spark-sample/sample-data-loader/gradle/wrapper/gradle-wrapper.properties
+++ b/scalardb-analytics-spark-sample/sample-data-loader/gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,6 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.5-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip
 networkTimeout=10000
 validateDistributionUrl=true
 zipStoreBase=GRADLE_USER_HOME
diff --git a/scalardb-analytics-spark-sample/sample-data-loader/gradlew b/scalardb-analytics-spark-sample/sample-data-loader/gradlew
index 1aa94a42..f5feea6d 100755
--- a/scalardb-analytics-spark-sample/sample-data-loader/gradlew
+++ b/scalardb-analytics-spark-sample/sample-data-loader/gradlew
@@ -15,6 +15,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+# SPDX-License-Identifier: Apache-2.0
+#
 
 ##############################################################################
 #
@@ -55,7 +57,7 @@
 # Darwin, MinGW, and NonStop.
 #
 # (3) This script is generated from the Groovy template
-#       https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
+#       https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
 #       within the Gradle project.
 #
 # You can find Gradle at https://github.com/gradle/gradle/.
@@ -84,7 +86,8 @@ done
 # shellcheck disable=SC2034
 APP_BASE_NAME=${0##*/}
 # Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036)
-APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit
+APP_HOME=$( cd -P "${APP_HOME:-./}" > /dev/null && printf '%s
+' "$PWD" ) || exit
 
 # Use the maximum available, or set MAX_FD != -1 to use that value.
 MAX_FD=maximum
diff --git a/scalardb-analytics-spark-sample/sample-data-loader/gradlew.bat b/scalardb-analytics-spark-sample/sample-data-loader/gradlew.bat
index 93e3f59f..9d21a218 100644
--- a/scalardb-analytics-spark-sample/sample-data-loader/gradlew.bat
+++ b/scalardb-analytics-spark-sample/sample-data-loader/gradlew.bat
@@ -13,6 +13,8 @@
 @rem See the License for the specific language governing permissions and
 @rem limitations under the License.
 @rem
+@rem SPDX-License-Identifier: Apache-2.0
+@rem
 
 @if "%DEBUG%"=="" @echo off
 @rem ##########################################################################
@@ -43,11 +45,11 @@ set JAVA_EXE=java.exe
 %JAVA_EXE% -version >NUL 2>&1
 if %ERRORLEVEL% equ 0 goto execute
 
-echo.
-echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
+echo. 1>&2
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2
+echo. 1>&2
+echo Please set the JAVA_HOME variable in your environment to match the 1>&2
+echo location of your Java installation. 1>&2
 
 goto fail
 
@@ -57,11 +59,11 @@ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
 
 if exist "%JAVA_EXE%" goto execute
 
-echo.
-echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
+echo. 1>&2
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2
+echo. 1>&2
+echo Please set the JAVA_HOME variable in your environment to match the 1>&2
+echo location of your Java installation. 1>&2
 
 goto fail
diff --git a/scalardb-analytics-spark-sample/sample-data-loader/src/main/java/sample/data/Loader.java b/scalardb-analytics-spark-sample/sample-data-loader/src/main/java/sample/data/Loader.java
index de406d2b..a37075e6 100644
--- a/scalardb-analytics-spark-sample/sample-data-loader/src/main/java/sample/data/Loader.java
+++ b/scalardb-analytics-spark-sample/sample-data-loader/src/main/java/sample/data/Loader.java
@@ -2,6 +2,7 @@
 
 import com.scalar.db.api.DistributedTransaction;
 import com.scalar.db.api.DistributedTransactionManager;
+import com.scalar.db.api.Mutation;
 import com.scalar.db.api.Put;
 import com.scalar.db.exception.transaction.TransactionException;
 import com.scalar.db.io.Key;
@@ -14,29 +15,18 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.function.Function;
 import org.apache.commons.csv.CSVFormat;
 import org.apache.commons.csv.CSVRecord;
 
 public class Loader implements AutoCloseable {
-  private static final String CUSTOMER_DATA = "/data/customer.csv";
   private static final String ORDERS_DATA = "/data/orders.csv";
   private static final String LINEITEM_DATA = "/data/lineitem.csv";
 
   private static final String CONFIG_FILE_PATH = "/etc/scalardb.properties";
   private static final String SCHEMA_FILE_PATH = "/etc/schema.json";
 
-  private static final String[] CUSTOMER_COLUMNS = {
-    "c_custkey",
-    "c_name",
-    "c_address",
-    "c_nationkey",
-    "c_phone",
-    "c_acctbal",
-    "c_mktsegment",
-    "c_comment"
-  };
-
   private static final String[] ORDERS_COLUMNS = {
     "o_orderkey",
     "o_custkey",
@@ -82,8 +72,6 @@ public void close() {
 
   public void load() throws TransactionException, IOException, SchemaLoaderException {
     loadSchema();
-
-    loadData(this.manager, CUSTOMER_DATA, CUSTOMER_COLUMNS, this::buildPutCustomer);
     loadData(this.manager, ORDERS_DATA, ORDERS_COLUMNS, this::buildPutOrders);
     loadData(this.manager, LINEITEM_DATA, LINEITEM_COLUMNS, this::buildPutLineitem);
   }
@@ -101,25 +89,9 @@ private void loadSchema() throws SchemaLoaderException {
     SchemaLoader.load(configFilePath, schemaFilePath, options, createCoordinatorTables);
   }
 
-  private Put buildPutCustomer(CSVRecord record) {
-    return Put.newBuilder()
-        .namespace("dynamons")
-        .table("customer")
-        .partitionKey(Key.ofInt("c_custkey", intCol(record, "c_custkey")))
-        .textValue("c_name", stringCol(record, "c_name"))
-        .textValue("c_address", stringCol(record, "c_address"))
-        .intValue("c_nationkey", intCol(record, "c_nationkey"))
-        .textValue("c_phone", stringCol(record, "c_phone"))
-        .doubleValue("c_acctbal", doubleCol(record, "c_acctbal"))
-        .textValue("c_mktsegment", stringCol(record, "c_mktsegment"))
-        .textValue("c_comment", stringCol(record, "c_comment"))
-        .enableImplicitPreRead()
-        .build();
-  }
-
   private Put buildPutOrders(CSVRecord record) {
     return Put.newBuilder()
-        .namespace("postgresns")
+        .namespace("mysqlns")
         .table("orders")
         .partitionKey(Key.ofInt("o_orderkey", intCol(record, "o_orderkey")))
         .intValue("o_custkey", intCol(record, "o_custkey"))
@@ -175,7 +147,8 @@ private void loadData(
       transaction = manager.start();
       for (CSVRecord record : records) {
         Put put = putFunction.apply(record);
-        transaction.put(put);
+        List<Mutation> mutations = List.of(put);
+        transaction.mutate(mutations);
       }
       transaction.commit();
     } catch (TransactionException e) {
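The write path above now wraps each Put in a mutation list and calls DistributedTransaction.mutate instead of the per-record put call. A condensed, self-contained sketch of that pattern using the same builder API as Loader.java; the standalone class name and the literal key values are illustrative:

import com.scalar.db.api.DistributedTransaction;
import com.scalar.db.api.DistributedTransactionManager;
import com.scalar.db.api.Mutation;
import com.scalar.db.api.Put;
import com.scalar.db.io.Key;
import java.util.List;

public class MutateExample {
  // Writes one row to mysqlns.orders the same way the loader does.
  static void writeOrder(DistributedTransactionManager manager) throws Exception {
    DistributedTransaction transaction = manager.start();
    try {
      Put put =
          Put.newBuilder()
              .namespace("mysqlns")
              .table("orders")
              .partitionKey(Key.ofInt("o_orderkey", 1))
              .intValue("o_custkey", 370)
              .enableImplicitPreRead()
              .build();
      List<Mutation> mutations = List.of(put); // a Put is a Mutation
      transaction.mutate(mutations);
      transaction.commit();
    } catch (Exception e) {
      transaction.abort();
      throw e;
    }
  }
}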
diff --git a/scalardb-analytics-spark-sample/scalardb.properties b/scalardb-analytics-spark-sample/scalardb.properties
index bf68de4b..08209d52 100644
--- a/scalardb-analytics-spark-sample/scalardb.properties
+++ b/scalardb-analytics-spark-sample/scalardb.properties
@@ -1,29 +1,18 @@
 scalar.db.storage=multi-storage
-scalar.db.multi_storage.storages=cassandra,postgres,dynamodb
+scalar.db.multi_storage.storages=cassandra,mysql
 
 scalar.db.multi_storage.storages.cassandra.storage=cassandra
-scalar.db.multi_storage.storages.cassandra.contact_points=backend-cassandra
+scalar.db.multi_storage.storages.cassandra.contact_points=scalardb-cassandra
 scalar.db.multi_storage.storages.cassandra.contact_port=9042
 scalar.db.multi_storage.storages.cassandra.username=cassandra
 scalar.db.multi_storage.storages.cassandra.password=cassandra
 
-scalar.db.multi_storage.storages.postgres.storage=jdbc
-scalar.db.multi_storage.storages.postgres.contact_points=jdbc:postgresql://backend-postgres:5432/test
-scalar.db.multi_storage.storages.postgres.username=postgres
-scalar.db.multi_storage.storages.postgres.password=postgres
-scalar.db.multi_storage.storages.postgres.jdbc.connection_pool.min_idle=5
-scalar.db.multi_storage.storages.postgres.jdbc.connection_pool.max_idle=10
-scalar.db.multi_storage.storages.postgres.jdbc.connection_pool.max_total=25
+scalar.db.multi_storage.storages.mysql.storage=jdbc
+scalar.db.multi_storage.storages.mysql.contact_points=jdbc:mysql://scalardb-mysql:3306/sampledb
+scalar.db.multi_storage.storages.mysql.username=root
+scalar.db.multi_storage.storages.mysql.password=mysql
 
-scalar.db.multi_storage.storages.dynamodb.contact_points=ap-northeast-1
-scalar.db.multi_storage.storages.dynamodb.username=access_key_id
-scalar.db.multi_storage.storages.dynamodb.password=secret_access_key
-scalar.db.multi_storage.storages.dynamodb.storage=dynamo
-scalar.db.multi_storage.storages.dynamodb.dynamo.endpoint_override=http://backend-dynamodb:8000
-scalar.db.multi_storage.storages.dynamodb.dynamo.table_metadata.namespace=table_metadata
-scalar.db.multi_storage.storages.dynamodb.dynamo.namespace.prefix=scalar_
-
-scalar.db.multi_storage.namespace_mapping=cassandrans:cassandra,postgresns:postgres,dynamons:dynamodb
+scalar.db.multi_storage.namespace_mapping=cassandrans:cassandra,mysqlns:mysql
 scalar.db.multi_storage.default_storage=cassandra
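With this configuration, namespace_mapping routes cassandrans.* to Cassandra and mysqlns.* to MySQL behind a single transaction manager. A minimal cross-storage read sketch, assuming the properties file above is at /etc/scalardb.properties; TransactionFactory and the Get builder are standard ScalarDB APIs, and the key values are illustrative:

import com.scalar.db.api.DistributedTransaction;
import com.scalar.db.api.DistributedTransactionManager;
import com.scalar.db.api.Get;
import com.scalar.db.io.Key;
import com.scalar.db.service.TransactionFactory;

public class MultiStorageReadExample {
  public static void main(String[] args) throws Exception {
    // One manager; namespace_mapping decides which backend each namespace hits.
    DistributedTransactionManager manager =
        TransactionFactory.create("/etc/scalardb.properties").getTransactionManager();
    DistributedTransaction tx = manager.start();
    // mysqlns.* is served by the MySQL JDBC storage...
    tx.get(
        Get.newBuilder()
            .namespace("mysqlns")
            .table("orders")
            .partitionKey(Key.ofInt("o_orderkey", 1))
            .build());
    // ...and cassandrans.* by Cassandra, within the same transaction.
    tx.get(
        Get.newBuilder()
            .namespace("cassandrans")
            .table("lineitem")
            .partitionKey(
                Key.newBuilder().addInt("l_orderkey", 1).addInt("l_linenumber", 1).build())
            .build());
    tx.commit();
    manager.close();
  }
}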
diff --git a/scalardb-analytics-spark-sample/schema.json b/scalardb-analytics-spark-sample/schema.json
index c90e7630..c5857d59 100644
--- a/scalardb-analytics-spark-sample/schema.json
+++ b/scalardb-analytics-spark-sample/schema.json
@@ -1,25 +1,7 @@
 {
-  "dynamons.customer": {
+  "mysqlns.orders": {
     "transaction": true,
-    "partition-key": [
-      "c_custkey"
-    ],
-    "columns": {
-      "c_custkey": "INT",
-      "c_name": "TEXT",
-      "c_address": "TEXT",
-      "c_nationkey": "INT",
-      "c_phone": "TEXT",
-      "c_acctbal": "DOUBLE",
-      "c_mktsegment": "TEXT",
-      "c_comment": "TEXT"
-    }
-  },
-  "postgresns.orders": {
-    "transaction": true,
-    "partition-key": [
-      "o_orderkey"
-    ],
+    "partition-key": ["o_orderkey"],
     "columns": {
       "o_orderkey": "INT",
       "o_custkey": "INT",
@@ -34,10 +16,7 @@
   },
   "cassandrans.lineitem": {
     "transaction": true,
-    "partition-key": [
-      "l_orderkey",
-      "l_linenumber"
-    ],
+    "partition-key": ["l_orderkey", "l_linenumber"],
     "columns": {
       "l_orderkey": "INT",
       "l_partkey": "INT",
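The loader applies this schema at startup through the ScalarDB Schema Loader (the loadSchema method in the Loader.java diff above). A standalone equivalent of that call; the file paths follow the sample, while the empty options map and the createCoordinatorTables flag are illustrative assumptions:

import com.scalar.db.schemaloader.SchemaLoader;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.Map;

public class ApplySchemaExample {
  public static void main(String[] args) throws Exception {
    Path configFilePath = Paths.get("/etc/scalardb.properties");
    Path schemaFilePath = Paths.get("/etc/schema.json");
    Map<String, String> options = Collections.emptyMap();
    boolean createCoordinatorTables = true; // coordinator tables back the transactions
    SchemaLoader.load(configFilePath, schemaFilePath, options, createCoordinatorTables);
  }
}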
diff --git a/scalardb-analytics-spark-sample/spark-defaults.conf b/scalardb-analytics-spark-sample/spark-defaults.conf
new file mode 100644
index 00000000..ac239c94
--- /dev/null
+++ b/scalardb-analytics-spark-sample/spark-defaults.conf
@@ -0,0 +1,24 @@
+# Use the ScalarDB Analytics catalog as `test_catalog`
+spark.sql.catalog.test_catalog com.scalar.db.analytics.spark.ScalarDbAnalyticsCatalog
+
+# Enable the Spark extension for ScalarDB Analytics
+spark.sql.extensions com.scalar.db.analytics.spark.extension.ScalarDbAnalyticsExtensions
+
+# Set `test_catalog` as the default catalog
+spark.sql.defaultCatalog test_catalog
+
+# Configure the ScalarDB Analytics license. PLEASE REPLACE THESE VALUES WITH YOUR LICENSE KEY AND CERTIFICATE CONTENTS
+spark.sql.catalog.test_catalog.license.key
+spark.sql.catalog.test_catalog.license.cert_pem
+
+# Configure the ScalarDB Analytics catalog for ScalarDB
+spark.sql.catalog.test_catalog.data_source.scalardb.type scalardb
+spark.sql.catalog.test_catalog.data_source.scalardb.config_path /etc/scalardb.properties
+
+# Configure the ScalarDB Analytics catalog for PostgreSQL, which is not managed by ScalarDB
+spark.sql.catalog.test_catalog.data_source.postgresql.type postgresql
+spark.sql.catalog.test_catalog.data_source.postgresql.host postgres
+spark.sql.catalog.test_catalog.data_source.postgresql.port 5432
+spark.sql.catalog.test_catalog.data_source.postgresql.username postgres
+spark.sql.catalog.test_catalog.data_source.postgresql.password postgres
+spark.sql.catalog.test_catalog.data_source.postgresql.database sampledb
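With these settings, both the ScalarDB-managed namespaces and the plain PostgreSQL schema appear under test_catalog in Spark. A query sketch from Java, assuming the <catalog>.<data_source>.<namespace>.<table> identifier layout and a session that picks up the spark-defaults.conf above; the class name and the query itself are illustrative:

import org.apache.spark.sql.SparkSession;

public class AnalyticsQueryExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("sample-analytics").getOrCreate();
    // Joins a ScalarDB-managed table with a table read directly from PostgreSQL.
    spark
        .sql(
            "SELECT c.c_mktsegment, count(*) AS order_count "
                + "FROM test_catalog.scalardb.mysqlns.orders o "
                + "JOIN test_catalog.postgresql.sample_ns.customer c "
                + "  ON o.o_custkey = c.c_custkey "
                + "GROUP BY c.c_mktsegment")
        .show();
    spark.stop();
  }
}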
diff --git a/scalardb-analytics-spark-sample/sql/postgres_copy.sql b/scalardb-analytics-spark-sample/sql/postgres_copy.sql
new file mode 100644
index 00000000..bf233f5a
--- /dev/null
+++ b/scalardb-analytics-spark-sample/sql/postgres_copy.sql
@@ -0,0 +1,13 @@
+create schema sample_ns;
+create table sample_ns.customer (
+    c_custkey int,
+    c_name text,
+    c_address text,
+    c_nationkey int,
+    c_phone text,
+    c_acctbal double precision,
+    c_mktsegment text,
+    c_comment text,
+    PRIMARY KEY (c_custkey)
+);
+\copy sample_ns.customer from '/opt/customer.csv' delimiter ',' csv;