Add support for Spark SQL
jphalip committed Nov 1, 2023
1 parent 57dffdc commit 651fbb0
Showing 25 changed files with 910 additions and 186 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -2,6 +2,8 @@

## Next

* Added support for Hive 2.X.
* Added support for Spark SQL.
* Fixed case sensitivity bug with column names. This particularly affected pseudo columns like
`_PARTITIONTIME` and `_PARTITIONDATE` in time-ingestion partitioned BigQuery tables.
* **Backward-incompatible change:** The type of the `_PARTITION_TIME` pseudo-column in
22 changes: 22 additions & 0 deletions README.md
@@ -18,6 +18,7 @@ software versions:
* Hive 2.3.6, 2.3.9, 3.1.2, and 3.1.3.
* Hadoop 2.10.2, 3.2.3, and 3.3.3.
* Tez 0.9.2 on Hadoop 2, and Tez 0.10.1 on Hadoop 3.
* Spark SQL 3.4.1.

## Installation

@@ -474,6 +475,27 @@ session creation time (i.e. when the `SELECT` query is initiated).
Note that this consistency model currently only applies to the table data, not its metadata.
## Spark SQL integration
The connector supports versions of Spark SQL that vendor (i.e. bundle) Hive 2.X. Therefore, to
use Spark SQL, you must use the Hive 2 (not Hive 3) version of the connector. See the
[Installation](#installation) section for more information on how to install the correct
connector version in your environment.
Example (Java):
```java
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Enable Hive support so tables declared through the connector are visible.
SparkConf sparkConf = new SparkConf().setMaster("local");
SparkSession spark =
    SparkSession.builder()
        .appName("example")
        .config(sparkConf)
        .enableHiveSupport()
        .getOrCreate();

// From Java, use collectAsList(): collect() returns a Scala Array.
Dataset<Row> ds = spark.sql("SELECT * FROM mytable");
List<Row> rows = ds.collectAsList();
```
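
To run the same query from the Spark SQL command-line shell, pass the connector JAR to Spark
when starting the session. A minimal sketch (the JAR path and name are illustrative; use the
Hive 2 shaded connector JAR you actually installed):

```sh
# Launch the Spark SQL shell with the connector on the classpath.
# The JAR name below is an example -- substitute your installed JAR.
spark-sql \
  --jars /path/to/hive-2-bigquery-connector-shaded.jar \
  -e "SELECT * FROM mytable"
```
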
## BigLake integration
[BigLake](https://cloud.google.com/biglake) allows you to store your data in open formats
23 changes: 16 additions & 7 deletions cloudbuild/cloudbuild.yaml
@@ -1,10 +1,10 @@
steps:
# 1. Create a Docker image containing hadoop-connectors repo
# 0. Create a Docker image containing hadoop-connectors repo
- name: 'gcr.io/cloud-builders/docker'
id: 'docker-build'
args: ['build', '--tag=gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit', '-f', 'cloudbuild/Dockerfile', '.']

# 2. Build the connector and download dependencies without running tests.
# 1. Check the code style (Spotless) without running tests.
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'check'
waitFor: ['docker-build']
@@ -13,7 +13,7 @@ steps:
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 3. Build the connector and download dependencies without running tests.
# 2. Build the connector and download dependencies without running tests.
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'build'
waitFor: ['check']
@@ -22,7 +22,7 @@
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 4. Run unit tests for Hive 2
# 3. Run unit tests for Hive 2
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'unit-tests-hive2'
waitFor: ['build']
@@ -31,7 +31,7 @@
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 5. Run unit tests for Hive 3
# 4. Run unit tests for Hive 3
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'unit-tests-hive3'
waitFor: ['build']
@@ -40,7 +40,7 @@
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 6. Run integration tests for Hive 2
# 5. Run integration tests for Hive 2
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'integration-tests-hive2'
waitFor: ['unit-tests-hive2']
@@ -49,7 +49,7 @@
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 7. Run integration tests for Hive 3
# 6. Run integration tests for Hive 3
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'integration-tests-hive3'
waitFor: ['unit-tests-hive3']
@@ -58,6 +58,15 @@
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 7. Run integration tests for Spark SQL
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'integration-tests-sparksql'
waitFor: ['unit-tests-hive2']
entrypoint: 'bash'
args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest_sparksql']
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# Tests should take under 90 mins
timeout: 5400s

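For reference, this presubmit pipeline could be submitted manually with the gcloud CLI; a
minimal sketch, assuming a configured project and your own Codecov token:

```sh
# Submit the presubmit pipeline to Cloud Build from the repository root.
# The _CODECOV_TOKEN value below is a placeholder.
gcloud builds submit . \
  --config=cloudbuild/cloudbuild.yaml \
  --substitutions=_CODECOV_TOKEN="<your-codecov-token>"
```
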
24 changes: 18 additions & 6 deletions cloudbuild/presubmit.sh
@@ -26,6 +26,7 @@ readonly ACTION=$1

readonly HIVE2_PROFILE="hive2-generic"
readonly HIVE3_PROFILE="hive3-generic"
readonly HIVE3_SHADED_DEPS="shaded-deps-hive3.1.2-hadoop2.10.2"
readonly MVN="./mvnw -B -e -Dmaven.repo.local=/workspace/.repository"

export TEST_BUCKET=dataproc-integ-tests
@@ -37,16 +38,18 @@ cd /workspace
case "$ACTION" in
# Java code style check
check)
./mvnw spotless:check -P"${HIVE2_PROFILE}" && ./mvnw spotless:check -P"${HIVE3_PROFILE}"
$MVN spotless:check -P"${HIVE2_PROFILE}" && $MVN spotless:check -P"${HIVE3_PROFILE}"
exit
;;

# Download maven and all the dependencies
# Build the Maven packages and dependencies
build)
# Install all modules for Hive 2, including parent modules
$MVN install -DskipTests -P"${HIVE2_PROFILE}"
# Install the shaded deps for Hive 3 (all the other shaded & parent modules have already been installed with the previous command)
$MVN install -DskipTests -P"${HIVE3_PROFILE}" -pl shaded-deps-${HIVE3_PROFILE}
# Install shaded dependencies for Spark SQL
$MVN install -DskipTests -P sparksql -pl shaded-deps-sparksql
# Install all modules for Hive 2
$MVN install -DskipTests -P"${HIVE2_PROFILE},sparksql-integration"
# Install the shaded dependencies for Hive 3 (all the other shaded & parent modules have already been installed with the previous command)
$MVN install -DskipTests -P"${HIVE3_PROFILE}" -pl ${HIVE3_SHADED_DEPS}
exit
;;

@@ -84,6 +87,15 @@
exit
;;

# Run integration tests for Spark SQL
integrationtest_sparksql)
$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate \
-P${HIVE2_PROFILE},sparksql-integration,coverage
# Upload test coverage report to Codecov
bash <(curl -s https://codecov.io/bash) -K -F "${ACTION}"
exit
;;

*)
echo "Unknown action: $ACTION"
exit 1
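The new `integrationtest_sparksql` action can also be exercised locally, mirroring how Cloud
Build invokes the script (a sketch; it assumes the `build` action has already populated the
local Maven repository and that test credentials are configured):

```sh
# Build and install the modules first, then run the Spark SQL integration tests.
bash cloudbuild/presubmit.sh build
bash cloudbuild/presubmit.sh integrationtest_sparksql
```
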
22 changes: 14 additions & 8 deletions hive-2-bigquery-connector/pom.xml
@@ -36,14 +36,6 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-acceptance-tests-dependencies</artifactId>
<version>${project.version}</version>
<classifier>shaded</classifier>
<scope>test</scope>
</dependency>

<dependency>
<groupId>io.github.hiverunner</groupId>
<artifactId>hiverunner</artifactId>
@@ -53,6 +45,20 @@
</dependencies>

<profiles>
<profile>
<!-- Currently the same as "hive2.3.9-hadoop2.10.2" but could be changed later -->
<!-- Use this profile if you don't care about specific minor versions of Hive 2.X -->
<id>hive2-generic</id>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-deps-hive2.3.9-hadoop2.10.2</artifactId>
<version>${project.version}</version>
<classifier>shaded</classifier>
<scope>provided</scope>
</dependency>
</dependencies>
</profile>
<profile>
<id>hive2.3.6-hadoop2.7.0</id>
<properties>
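With the new `hive2-generic` profile, a build no longer needs to target a specific Hive 2
minor version. For example, mirroring the invocation in `presubmit.sh`:

```sh
# Build and install all Hive 2 modules against the generic profile.
./mvnw install -DskipTests -P hive2-generic
```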