Skip to content

Commit

Permalink
add hive metastore docker service
Browse files Browse the repository at this point in the history
Signed-off-by: YANGDB <[email protected]>
  • Loading branch information
YANG-DB committed Oct 5, 2023
1 parent 6e3437d commit 767d1d6
Show file tree
Hide file tree
Showing 9 changed files with 143 additions and 82 deletions.
21 changes: 21 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,31 @@ services:
- SPARK_LOCAL_IP=spark-worker-1
- SPARK_DRIVER_EXTRA_CLASSPATH=/opt/spark-jars/*
- SPARK_EXECUTOR_EXTRA_CLASSPATH=/opt/spark-jars/*
- SPARK_HIVE_METASTORE_VERSION=2.3.7
- SPARK_HIVE_METASTORE_JARS=maven
- SPARK_HIVE_METASTORE_URI=thrift://hive-metastore:9083
volumes:
- ./apps:/opt/spark-apps
- ./data:/opt/spark-data
- ./logs:/opt/spark-logs
hive-metastore:
image: bde2020/hive:2.3.2-postgresql-metastore
container_name: hive-metastore
env_file:
- ./docker/hive/hive.env
- ./docker/hive/conf/metastore-site.xml:/opt/apache-hive-metastore-3.0.0-bin/conf/metastore-site.xml

environment:
HIVE_CORE_CONF_javax_jdo_option_ConnectionURL: jdbc:mysql://mysql/metastore?createDatabaseIfNotExist=true
HIVE_CORE_CONF_javax_jdo_option_ConnectionDriverName: com.mysql.jdbc.Driver
HIVE_CORE_CONF_javax_jdo_option_ConnectionUserName: hive
HIVE_CORE_CONF_javax_jdo_option_ConnectionPassword: hivepassword
ports:
- "9083:9083"
  # PostgreSQL database backing the Hive Metastore; referenced by host name
  # `hive-metastore-postgresql` in docker/hive/hive.env.
  hive-metastore-postgresql:
    image: bde2020/hive-metastore-postgresql:2.3.0
    container_name: hive-metastore-postgresql
livy-server:
container_name: livy_server
build: ./docker/livy/
Expand Down
9 changes: 9 additions & 0 deletions docker/hive/conf/hive-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?xml version="1.0"?>
<configuration>

    <!-- Enable the Hive engine integration for Apache Iceberg tables.
         NOTE(review): presumably consumed by the Hive/Spark services that
         mount this file - confirm which container loads hive-site.xml. -->
    <property>
        <name>iceberg.engine.hive.enabled</name>
        <value>true</value>
    </property>

</configuration>
39 changes: 39 additions & 0 deletions docker/hive/conf/metastore-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
<configuration>
    <!-- Standalone Hive Metastore configuration. The JDBC settings below
         must match the PostgreSQL backing store defined in docker-compose
         (service hive-metastore-postgresql) and docker/hive/hive.env. -->
    <property>
        <name>metastore.thrift.uris</name>
        <value>thrift://hive-metastore:9083</value>
        <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
    </property>
    <property>
        <name>metastore.task.threads.always</name>
        <value>org.apache.hadoop.hive.metastore.events.EventCleanerTask,org.apache.hadoop.hive.metastore.MaterializationsCacheCleanerTask</value>
    </property>
    <property>
        <name>metastore.expression.proxy</name>
        <value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
    </property>

    <!-- Fixed: previously com.mysql.cj.jdbc.Driver with
         jdbc:mysql://localhost/metastore, which contradicted the
         postgresql-metastore image and hive.env. -->
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>org.postgresql.Driver</value>
    </property>

    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:postgresql://hive-metastore-postgresql/metastore</value>
    </property>

    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>hive</value>
    </property>

    <!-- Aligned with hive.env ("hive"); was "hivepassword". -->
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>hive</value>
    </property>

    <!-- Allow column type changes that Hive would otherwise reject as
         incompatible (useful in a local test environment). -->
    <property>
        <name>hive.metastore.disallow.incompatible.col.type.changes</name>
        <value>false</value>
    </property>
</configuration>
6 changes: 6 additions & 0 deletions docker/hive/hive.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Hive Metastore JDO connection settings, targeting the
# hive-metastore-postgresql compose service. Keys use underscores in place
# of the dots of the corresponding hive-site.xml property names
# (e.g. javax.jdo.option.ConnectionURL).
HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
# Schema is pre-initialized; do not auto-create it at startup.
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083
2 changes: 1 addition & 1 deletion docker/livy/conf/livy.conf
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ livy.server.session.timeout-check.skip-busy = false

# Time in milliseconds on how long Livy will wait before timing out an inactive session.
# Note that the inactive session could be busy running jobs.
livy.server.session.timeout = 5m
livy.server.session.timeout = 30m
#
# How long a finished session state should be kept in LivyServer for query.
livy.server.session.state-retain.sec = 60s
Expand Down
49 changes: 0 additions & 49 deletions docker/spark/Dockerfile

This file was deleted.

1 change: 1 addition & 0 deletions docker/spark/spark-defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@

#spark.jars.packages = org.postgresql:postgresql:9.4.1207.jar
#spark.driver.extraClassPath = /opt/bitnami/spark/jars/postgresql-9.4.1207.jar
spark.sql.warehouse.dir=hdfs://localhost:9000/user/hive/warehouse
20 changes: 0 additions & 20 deletions docker/spark/start-spark.sh

This file was deleted.

78 changes: 66 additions & 12 deletions docs/Local-testing.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
# Testing locally With Spark
This document is intended to review the local docker-compose based environment in which the Flint/PPL Spark plugins can be tested and explored.

This document is intended to review the local docker-compose based environment in which the Flint/PPL Spark plugins
can be tested and explored.

## Overview

The following components are part of this testing environment

### Livy
Apache Livy is a service that enables easy interaction with a Spark cluster over a REST interface. It enables easy submission of Spark jobs or snippets of Spark code, synchronous or asynchronous result retrieval, as well as Spark Context management, all via a simple REST interface or an RPC client library.
Livy provides a comprehensive [REST API](https://livy.apache.org/docs/latest/rest-api.html) to interact with a Spark cluster in a simplified way.

Apache Livy is a service that enables easy interaction with a Spark cluster over a REST interface. It enables easy
submission of Spark jobs or snippets of Spark code, synchronous or asynchronous result retrieval, as well as Spark
Context management, all via a simple REST interface or an RPC client library.
Livy provides a comprehensive [REST API](https://livy.apache.org/docs/latest/rest-api.html) to interact with a Spark
cluster in a simplified way.

## Test Tutorial

First we need to create a livy session

```
curl --location --request POST 'http://localhost:8998/sessions' \
--header 'Content-Type: application/json' \
Expand All @@ -18,7 +27,9 @@ curl --location --request POST 'http://localhost:8998/sessions' \
"proxyUser": "a_user"
}'
```

This call will respond with a session Id in the following manner:

```json5
{
"id": 0,
Expand All @@ -40,34 +51,46 @@ This call will respond with a session Id in the following manner:
```

Once a session is created, we can submit a SQL query statement the following way:

```
curl --location --request POST 'http://localhost:8998/sessions/0/statements' \
--header 'Content-Type: application/json' \
--data-raw '{
"code": "spark.sql(\"CREATE TABLE test_table (id INT, name STRING)\")"
"code": "CREATE TABLE test_table (id INT, name STRING) USING parquet"
}'
```

This call responds with the following acknowledgment:

```json5
{"id":0,"code":"select 1","state":"waiting","output":null,"progress":0.0,"started":0,"completed":0}
{
"id": 0,
"code": "spark.sql(\"CREATE TABLE test_table (id INT, name STRING)\")",
"state": "waiting",
"output": null,
"progress": 0.0,
"started": 0,
"completed": 0
}
```

Next we can Insert some data into that table:

```
curl --location --request POST 'http://localhost:8998/sessions/0/statements' \
--header 'Content-Type: application/json' \
--data-raw '{
"code": "spark.sql(\"INSERT INTO test_table VALUES (1, 'John'), (2, 'Doe')\")"
"code": "INSERT INTO test_table VALUES (1, \"John\"), (2, \"Doe\")"
}'
```

Now lets query the table using SQL:

```
curl --location --request POST 'http://localhost:8998/sessions/0/statements' \
--header 'Content-Type: application/json' \
--data-raw '{
"code": "spark.sql(\"SELECT * FROM test_table\").show()"
"code": "SELECT * FROM test_table"
}'
```

Expand All @@ -76,12 +99,43 @@ We can now see the Livy session created with the execution running:
![Livy UI session Image]()

To get the response of this statement use the next API:
`curl --location --request GET http://localhost:8998/sessions/0/statements/0 | jq '.output.data.application/json.data'`
`curl --location --request GET http://localhost:8998/sessions/0/statements/2 `

This would respond with the next results
```text
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   298  100   298    0     0   6610      0 --:--:-- --:--:-- --:--:--  7641
```

The statement output is:
```json5
{
"id": 2,
"code": "SELECT * FROM test_table",
"state": "available",
"output": {
"status": "ok",
"execution_count": 2,
"data": {
"application/json": {
"schema": {
"type": "struct",
"fields": [
{
"name": "id",
"type": "integer",
"nullable": true,
"metadata": {}
},
{
"name": "name",
"type": "string",
"nullable": true,
"metadata": {}
}
]
},
"data": []
}
}
},
"progress": 1.0,
"started": 1696467890662,
"completed": 1696467890978
}
```

0 comments on commit 767d1d6

Please sign in to comment.