Add Hive 2 module (#112)
jphalip authored Oct 25, 2023
1 parent ccc9dd5 commit 57dffdc
Showing 23 changed files with 768 additions and 77 deletions.
76 changes: 42 additions & 34 deletions README-template.md
@@ -15,13 +15,26 @@ This connector supports [Dataproc](https://cloud.google.com/dataproc) 2.0 and 2.
For Hadoop clusters other than Dataproc, the connector has been tested with the following
software versions:

* Hive 3.1.2 and 3.1.3.
* Hadoop 2.10.2, 3.2.3 and 3.3.3.
* Hive 2.3.6, 2.3.9, 3.1.2, and 3.1.3.
* Hadoop 2.10.2, 3.2.3, and 3.3.3.
* Tez 0.9.2 on Hadoop 2, and Tez 0.10.1 on Hadoop 3.

## Build
## Installation

### Prerequisite

Make sure you have the BigQuery Storage API enabled in your GCP project. Follow [these instructions](https://cloud.google.com/bigquery/docs/reference/storage/#enabling_the_api).
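If you prefer the command line, the API can also be enabled with `gcloud` (a minimal sketch; `my-gcp-project` is a placeholder for your project ID):

```sh
# Enable the BigQuery Storage API for the project. Replace "my-gcp-project"
# with your own project ID.
gcloud services enable bigquerystorage.googleapis.com --project my-gcp-project
```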

### Option 1: connectors init action

For Dataproc clusters, the most convenient way to install the Hive-BigQuery
connector is to use the [connectors init action](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/connectors).
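
As an illustration, a hedged sketch of creating a Dataproc cluster with that init action (the cluster name and region are placeholders, and the metadata key used to pin the connector version is an assumption; check the init action's README for the exact parameters):

```sh
# Hypothetical example: create a Dataproc cluster that installs the connector
# at startup via the connectors init action. The metadata key below is an
# assumption; consult the init action's documentation for the exact name.
REGION=us-central1
gcloud dataproc clusters create my-hive-bq-cluster \
    --region "${REGION}" \
    --initialization-actions "gs://goog-dataproc-initialization-actions-${REGION}/connectors/connectors.sh" \
    --metadata hive-bigquery-connector-version=2.0.0
```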

### Option 2: manual installation

You can also download an official release JAR from [Maven Central](https://mvnrepository.com/artifact/com.google.cloud.hive/hive-bigquery-connector).
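
For example, a sketch of fetching a release JAR directly from Maven Central with `curl` (the version shown is a placeholder; check Maven Central for the artifact and version that match your Hive version):

```sh
# Download a connector release from Maven Central. The version is a
# placeholder; pick the release that matches your Hive version.
VERSION=2.0.0
curl -fLO "https://repo1.maven.org/maven2/com/google/cloud/hive/hive-bigquery-connector/${VERSION}/hive-bigquery-connector-${VERSION}.jar"
```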

Alternately, you can build a JAR from source:

1. Clone this repository:
```sh
@@ -30,29 +43,24 @@ To build the connector jar:
```

2. Compile and package the jar:

* For Hive 2:

``` sh
./mvnw package -DskipTests -P hive2-generic
```

* For Hive 3:

``` sh
./mvnw package -DskipTests -P hive3-generic
```

The packaged jar is now available at: `connector/target/hive-bigquery-connector-<version>.jar`

Once you have the connector JAR, deploy the JAR to the classpath of all nodes in your Hive cluster.

You can also provide the JAR as a parameter when starting a Hive or Beeline session:

```sh
hive --auxpath <jar path>/hive-bigquery-connector-<version>.jar
@@ -80,7 +88,7 @@ Here's an example:
```sql
CREATE TABLE mytable (word_count BIGINT, word STRING)
STORED BY 'com.google.cloud.hive.bigquery.connector.BigQueryStorageHandler'
TBLPROPERTIES (
'bq.table'='myproject.mydataset.mytable'
);
@@ -179,7 +187,7 @@ Here's an example:
```sql
CREATE TABLE mytable (int_val BIGINT, ts TIMESTAMP)
STORED BY 'com.google.cloud.hive.bigquery.connector.BigQueryStorageHandler'
TBLPROPERTIES (
'bq.table'='myproject.mydataset.mytable',
'bq.time.partition.field'='ts',
@@ -210,7 +218,7 @@ Here's an example:

```sql
CREATE TABLE mytable (int_val BIGINT)
STORED BY 'com.google.cloud.hive.bigquery.connector.BigQueryStorageHandler'
TBLPROPERTIES (
'bq.table'='myproject.mydataset.mytable',
'bq.time.partition.type'='DAY'
@@ -233,7 +241,7 @@ Here's an example:

```sql
CREATE TABLE mytable (int_val BIGINT, text STRING, purchase_date DATE)
STORED BY 'com.google.cloud.hive.bigquery.connector.BigQueryStorageHandler'
TBLPROPERTIES (
'bq.table'='myproject.mydataset.mytable',
'bq.clustered.fields'='int_val,text'
@@ -448,7 +456,7 @@ To link a Hive table to a BigQuery table snapshot, simply specify the snapshot's
```sql
CREATE TABLE mytable (abc BIGINT, xyz STRING)
STORED BY 'com.google.cloud.hive.bigquery.connector.BigQueryStorageHandler'
TBLPROPERTIES (
'bq.table'='myproject.mydataset.mysnapshot'
);
@@ -614,19 +622,19 @@ There are multiple options to override the default behavior and to provide custo
for specific users, specific groups, or for all users that run the Hive query by default using
the below properties:
- `bq.impersonation.service.account.for.user.<USER_NAME>` (not set by default)

  The service account to be impersonated for a specific user. You can specify multiple
  properties using that pattern for multiple users.

- `bq.impersonation.service.account.for.group.<GROUP_NAME>` (not set by default)

  The service account to be impersonated for a specific group. You can specify multiple
  properties using that pattern for multiple groups.

- `bq.impersonation.service.account` (not set by default)

  Default service account to be impersonated for all users.
If any of the above properties are set, then the specified service account is impersonated by
generating short-lived credentials when accessing BigQuery.
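
For illustration, a hedged sketch of passing these properties when starting a Hive CLI session (the user name, group name, and service account addresses are placeholders; the same properties can also be set in `hive-site.xml`):

```sh
# Hypothetical values: impersonate different service accounts for the user
# "alice", the group "analysts", and everyone else by default.
hive \
  --hiveconf bq.impersonation.service.account.for.user.alice=etl-sa@my-gcp-project.iam.gserviceaccount.com \
  --hiveconf bq.impersonation.service.account.for.group.analysts=bi-sa@my-gcp-project.iam.gserviceaccount.com \
  --hiveconf bq.impersonation.service.account=default-sa@my-gcp-project.iam.gserviceaccount.com
```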
@@ -711,7 +719,7 @@ export PROJECT=my-gcp-project
export BIGLAKE_LOCATION=us
export BIGLAKE_REGION=us-central1
export BIGLAKE_CONNECTION=hive-integration-tests
export BIGLAKE_BUCKET=${USER}-biglake-test
export BIGLAKE_BUCKET=${PROJECT}-biglake-tests
```
Create the test BigLake connection:
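
The command itself is not shown in this view; as a hedged sketch, creating a BigLake (Cloud Resource) connection with the `bq` CLI might look like the following, reusing the environment variables above (treat the exact flags as assumptions and check the BigLake documentation):

```sh
# Sketch only: create a Cloud Resource connection for the BigLake tests,
# using the PROJECT, BIGLAKE_LOCATION, and BIGLAKE_CONNECTION variables
# exported above.
bq mk --connection \
  --project_id="${PROJECT}" \
  --location="${BIGLAKE_LOCATION}" \
  --connection_type=CLOUD_RESOURCE \
  "${BIGLAKE_CONNECTION}"
```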
54 changes: 32 additions & 22 deletions README.md
@@ -15,27 +15,10 @@ This connector supports [Dataproc](https://cloud.google.com/dataproc) 2.0 and 2.
For Hadoop clusters other than Dataproc, the connector has been tested with the following
software versions:

* Hive 3.1.2 and 3.1.3.
* Hadoop 2.10.2, 3.2.3 and 3.3.3.
* Hive 2.3.6, 2.3.9, 3.1.2, and 3.1.3.
* Hadoop 2.10.2, 3.2.3, and 3.3.3.
* Tez 0.9.2 on Hadoop 2, and Tez 0.10.1 on Hadoop 3.

## Build

To build the connector jar:

1. Clone this repository:
```sh
git clone https://github.com/GoogleCloudPlatform/hive-bigquery-connector
cd hive-bigquery-connector
```

2. Compile and package the jar:
``` sh
./mvnw package -DskipTests
```
The packaged jar is now available at: `connector/target/hive-bigquery-connector-2.0.0-SNAPSHOT.jar`


## Installation

### Prerequisite
@@ -49,10 +32,37 @@ connector is to use the [connectors init action](https://github.com/GoogleCloudD

### Option 2: manual installation

You can also download the released connector jar from [Maven Central](https://mvnrepository.com/artifact/com.google.cloud.hive/hive-bigquery-connector),
or build from the source, then install it in your cluster manually:
You can also download an official release JAR from [Maven Central](https://mvnrepository.com/artifact/com.google.cloud.hive/hive-bigquery-connector).

Alternately, you can build a JAR from source:

1. Clone this repository:
```sh
git clone https://github.com/GoogleCloudPlatform/hive-bigquery-connector
cd hive-bigquery-connector
```

2. Compile and package the jar:

* For Hive 2:

``` sh
./mvnw package -DskipTests -P hive2-generic
```

The packaged jar is now available at: `hive-2-bigquery-connector/target/hive-2-bigquery-connector-<version>.jar`

* For Hive 3:

``` sh
./mvnw package -DskipTests -P hive3-generic
```

The packaged jar is now available at: `hive-3-bigquery-connector/target/hive-3-bigquery-connector-<version>.jar`

Once you have the connector JAR, deploy the JAR to the classpath of all nodes in your Hive cluster.
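
As an illustration, one way to do this is to copy the JAR to a directory that is already on Hive's classpath on every node (a sketch only; the host names and the destination directory are placeholders, so adjust them to your cluster layout):

```sh
# Copy the packaged connector JAR to each Hive node. Host names and the
# destination directory are placeholders for your environment.
for node in hive-node-1 hive-node-2; do
  scp hive-3-bigquery-connector/target/hive-3-bigquery-connector-*.jar \
      "${node}:/usr/lib/hive/lib/"
done
```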

Try with Hive client session or Beeline client session:
You can also provide the JAR as a parameter when starting a Hive or Beeline session:

```sh
hive --auxpath <jar path>/hive-bigquery-connector-<version>.jar
37 changes: 28 additions & 9 deletions cloudbuild/cloudbuild.yaml
@@ -22,25 +22,44 @@ steps:
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 4. Run unit tests
# 4. Run unit tests for Hive 2
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'unit-tests'
id: 'unit-tests-hive2'
waitFor: ['build']
entrypoint: 'bash'
args: ['/workspace/cloudbuild/presubmit.sh', 'unittest']
args: ['/workspace/cloudbuild/presubmit.sh', 'unittest_hive2']
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'
# 5. Run integration tests

# 5. Run unit tests for Hive 3
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'unit-tests-hive3'
waitFor: ['build']
entrypoint: 'bash'
args: ['/workspace/cloudbuild/presubmit.sh', 'unittest_hive3']
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 6. Run integration tests for Hive 2
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'integration-tests-hive2'
waitFor: ['unit-tests-hive2']
entrypoint: 'bash'
args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest_hive2']
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 7. Run integration tests for Hive 3
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'integration-tests'
waitFor: ['unit-tests']
id: 'integration-tests-hive3'
waitFor: ['unit-tests-hive3']
entrypoint: 'bash'
args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest']
args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest_hive3']
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# Tests should take under 60 mins
timeout: 3600s
# Tests should take under 90 mins
timeout: 5400s

options:
machineType: 'N1_HIGHCPU_32'
39 changes: 30 additions & 9 deletions cloudbuild/presubmit.sh
@@ -24,7 +24,8 @@ fi

readonly ACTION=$1

readonly PROFILES="dataproc21"
readonly HIVE2_PROFILE="hive2-generic"
readonly HIVE3_PROFILE="hive3-generic"
readonly MVN="./mvnw -B -e -Dmaven.repo.local=/workspace/.repository"

export TEST_BUCKET=dataproc-integ-tests
@@ -36,28 +37,48 @@ cd /workspace
case "$ACTION" in
# Java code style check
check)
./mvnw spotless:check
./mvnw spotless:check -P"${HIVE2_PROFILE}" && ./mvnw spotless:check -P"${HIVE3_PROFILE}"
exit
;;

# Download maven and all the dependencies
build)
$MVN install -P"${PROFILES}" -DskipTests
# Install all modules for Hive 2, including parent modules
$MVN install -DskipTests -P"${HIVE2_PROFILE}"
# Install the shaded deps for Hive 3 (all the other shaded & parent modules have already been installed with the previous command)
$MVN install -DskipTests -P"${HIVE3_PROFILE}" -pl shaded-deps-${HIVE3_PROFILE}
exit
;;

# Run unit tests
unittest)
$MVN surefire:test jacoco:report jacoco:report-aggregate -P"${PROFILES}",coverage
# Run unit tests for Hive 2
unittest_hive2)
$MVN surefire:test jacoco:report jacoco:report-aggregate -P"${HIVE2_PROFILE}",coverage
# Upload test coverage report to Codecov
bash <(curl -s https://codecov.io/bash) -K -F "${ACTION}"
exit
;;

# Run integration tests
integrationtest)
# Run unit tests for Hive 3
unittest_hive3)
$MVN surefire:test jacoco:report jacoco:report-aggregate -P"${HIVE3_PROFILE}",coverage
# Upload test coverage report to Codecov
bash <(curl -s https://codecov.io/bash) -K -F "${ACTION}"
exit
;;

# Run integration tests for Hive 2
integrationtest_hive2)
$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate \
-P"${HIVE2_PROFILE}",coverage,integration
# Upload test coverage report to Codecov
bash <(curl -s https://codecov.io/bash) -K -F "${ACTION}"
exit
;;

# Run integration tests for Hive 3
integrationtest_hive3)
$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate \
-P"${PROFILES}",coverage,integration
-P"${HIVE3_PROFILE}",coverage,integration
# Upload test coverage report to Codecov
bash <(curl -s https://codecov.io/bash) -K -F "${ACTION}"
exit
