diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index b1902aab5f01..d7255d841afb 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -12,7 +12,7 @@ _If medium or high, explain what verification was done to mitigate the risks._ ### Documentation Update -_Describe any necessary documentation update if there is any new feature, config, or user-facing change_ +_Describe any necessary documentation update if there is any new feature, config, or user-facing change. If not, put "none"._ - _The config description must be updated if new configs are added or the default value of the configs are changed_ - _Any new feature or user-facing change requires updating the Hudi website. Please create a Jira ticket, attach the diff --git a/.github/workflows/azure_ci.js b/.github/workflows/azure_ci.js new file mode 100644 index 000000000000..737b8db9917d --- /dev/null +++ b/.github/workflows/azure_ci.js @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +async function checkAzureCiAndCreateCommitStatus({ github, context, prNumber, latestCommitHash }) { + console.log(`- Checking Azure CI status of PR: ${prNumber} ${latestCommitHash}`); + const botUsername = 'hudi-bot'; + + const comments = await github.paginate(github.rest.issues.listComments, { + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + sort: 'updated', + direction: 'desc', + per_page: 100 + }); + + // Find the latest comment from hudi-bot containing the Azure CI report + const botComments = comments.filter(comment => comment.user.login === botUsername); + + let status = 'pending'; + let message = 'In progress'; + let azureRunLink = ''; + + if (botComments.length > 0) { + const lastComment = botComments[0]; + const reportPrefix = `${latestCommitHash} Azure: ` + const successReportString = `${reportPrefix}[SUCCESS]` + const failureReportString = `${reportPrefix}[FAILURE]` + + if (lastComment.body.includes(reportPrefix)) { + if (lastComment.body.includes(successReportString)) { + message = 'Successful on the latest commit'; + status = 'success'; + } else if (lastComment.body.includes(failureReportString)) { + message = 'Failed on the latest commit'; + status = 'failure'; + } + } + + const linkRegex = /\[[a-zA-Z]+\]\((https?:\/\/[^\s]+)\)/; + const parts = lastComment.body.split(reportPrefix); + const secondPart = parts.length > 1 ? 
parts[1] : ''; + const match = secondPart.match(linkRegex); + + if (match) { + azureRunLink = match[1]; + } + } + + console.log(`Status: ${status}`); + console.log(`Azure Run Link: ${azureRunLink}`); + console.log(`${message}`); + + console.log(`- Create commit status of PR based on Azure CI status: ${prNumber} ${latestCommitHash}`); + // Create or update the commit status for Azure CI + await github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: latestCommitHash, + state: status, + target_url: azureRunLink, + description: message, + context: 'Azure CI' + }); + + return { status, message, azureRunLink }; +} + +module.exports = checkAzureCiAndCreateCommitStatus; diff --git a/.github/workflows/azure_ci_check.yml b/.github/workflows/azure_ci_check.yml new file mode 100644 index 000000000000..1e33e6b8fa50 --- /dev/null +++ b/.github/workflows/azure_ci_check.yml @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Azure CI + +on: + issue_comment: + types: [ created, edited, deleted ] + +permissions: + statuses: write + pull-requests: read + issues: read + +jobs: + check-azure-ci-and-add-commit-status: + if: | + github.event.issue.pull_request != null && + github.event.comment.user.login == 'hudi-bot' + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Check PR state + id: check_pr_state + uses: actions/github-script@v7 + with: + github-token: ${{secrets.GITHUB_TOKEN}} + script: | + const issueNumber = context.issue.number; + const { data: pullRequest } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: issueNumber + }); + + // Only check open PRs and a PR that is not a HOTFIX + const shouldSkip = (pullRequest.body.includes('HOTFIX: SKIP AZURE CI') + || pullRequest.state != 'open'); + + if (!shouldSkip) { + const commitHash = pullRequest.head.sha; + console.log(`Latest commit hash: ${commitHash}`); + // Set the output variable to be used in subsequent step + core.setOutput("latest_commit_hash", commitHash); + } + console.log(`Should skip Azure CI? 
${shouldSkip}`); + return shouldSkip; + + - name: Check Azure CI report and create commit status to PR + if: steps.check_pr_state.outputs.result != 'true' + uses: actions/github-script@v7 + with: + github-token: ${{secrets.GITHUB_TOKEN}} + script: | + const latestCommitHash = '${{ steps.check_pr_state.outputs.latest_commit_hash }}' + const issueNumber = context.issue.number; + const checkAzureCiAndCreateCommitStatus = require(`${process.env.GITHUB_WORKSPACE}/.github/workflows/azure_ci.js`); + + await checkAzureCiAndCreateCommitStatus({ + github, + context, + prNumber: issueNumber, + latestCommitHash: latestCommitHash + }); diff --git a/.github/workflows/bot.yml b/.github/workflows/bot.yml index a31c2e3ea35c..123660b119e3 100644 --- a/.github/workflows/bot.yml +++ b/.github/workflows/bot.yml @@ -5,6 +5,7 @@ on: branches: - master - 'release-*' + - branch-0.x pull_request: paths-ignore: - '**.bmp' @@ -20,10 +21,11 @@ on: branches: - master - 'release-*' + - branch-0.x concurrency: group: ${{ github.ref }} - cancel-in-progress: ${{ !contains(github.ref, 'master') }} + cancel-in-progress: ${{ !contains(github.ref, 'master') && !contains(github.ref, 'branch-0.x') }} env: MVN_ARGS: -e -ntp -B -V -Dgpg.skip -Djacoco.skip -Pwarn-log -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.shade=warn -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.dependency=warn -Dmaven.wagon.httpconnectionManager.ttlSeconds=25 -Dmaven.wagon.http.retryHandler.count=5 @@ -38,7 +40,7 @@ jobs: uses: actions/setup-java@v3 with: java-version: '8' - distribution: 'adopt' + distribution: 'temurin' architecture: x64 cache: maven - name: Check Binary Files @@ -51,7 +53,7 @@ jobs: - name: RAT check run: ./scripts/release/validate_source_rat.sh - test-spark: + test-spark-java-tests: runs-on: ubuntu-latest strategy: matrix: @@ -90,7 +92,7 @@ jobs: uses: actions/setup-java@v3 with: java-version: '8' - distribution: 'adopt' + distribution: 'temurin' architecture: x64 cache: maven - name: Build Project @@ -105,22 +107,87 @@ jobs: SPARK_PROFILE: ${{ matrix.sparkProfile }} run: mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl hudi-examples/hudi-examples-spark $MVN_ARGS - - name: UT - Common & Spark + - name: Java UT - Common & Spark env: SCALA_PROFILE: ${{ matrix.scalaProfile }} SPARK_PROFILE: ${{ matrix.sparkProfile }} SPARK_MODULES: ${{ matrix.sparkModules }} if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI run: - mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS - - name: FT - Spark + mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DwildcardSuites=skipScalaTests -DfailIfNoTests=false -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + - name: Java FT - Spark env: SCALA_PROFILE: ${{ matrix.scalaProfile }} SPARK_PROFILE: ${{ matrix.sparkProfile }} SPARK_MODULES: ${{ matrix.sparkModules }} if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI run: - mvn test -Pfunctional-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + mvn test -Pfunctional-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DwildcardSuites=skipScalaTests -DfailIfNoTests=false -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + + test-spark-scala-tests: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - scalaProfile: "scala-2.11" + sparkProfile: "spark2.4" + sparkModules: "hudi-spark-datasource/hudi-spark2" + + - 
scalaProfile: "scala-2.12" + sparkProfile: "spark3.0" + sparkModules: "hudi-spark-datasource/hudi-spark3.0.x" + + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.1" + sparkModules: "hudi-spark-datasource/hudi-spark3.1.x" + + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.2" + sparkModules: "hudi-spark-datasource/hudi-spark3.2.x" + + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.3" + sparkModules: "hudi-spark-datasource/hudi-spark3.3.x" + + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.4" + sparkModules: "hudi-spark-datasource/hudi-spark3.4.x" + + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.5" + sparkModules: "hudi-spark-datasource/hudi-spark3.5.x" + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: '8' + distribution: 'temurin' + architecture: x64 + cache: maven + - name: Build Project + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + run: + mvn clean install -T 2 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DskipTests=true $MVN_ARGS -am -pl "hudi-examples/hudi-examples-spark,$SPARK_COMMON_MODULES,$SPARK_MODULES" + - name: Scala UT - Common & Spark + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_MODULES: ${{ matrix.sparkModules }} + if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI + run: + mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -Dtest=skipJavaTests -DfailIfNoTests=false -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + - name: Scala FT - Spark + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_MODULES: ${{ matrix.sparkModules }} + if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI + run: + mvn test -Pfunctional-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -Dtest=skipJavaTests -DfailIfNoTests=false -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS test-hudi-hadoop-mr-and-hudi-java-client: runs-on: ubuntu-latest @@ -138,7 +205,7 @@ jobs: uses: actions/setup-java@v3 with: java-version: '8' - distribution: 'adopt' + distribution: 'temurin' architecture: x64 cache: maven - name: Generate Maven Wrapper @@ -159,7 +226,7 @@ jobs: run: ./mvnw test -Punit-tests -fae -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -D"FLINK_PROFILE" -pl hudi-hadoop-mr,hudi-client/hudi-java-client $MVN_ARGS - test-spark-java17: + test-spark-java17-java-tests: runs-on: ubuntu-latest strategy: matrix: @@ -180,7 +247,7 @@ jobs: uses: actions/setup-java@v3 with: java-version: '8' - distribution: 'adopt' + distribution: 'temurin' architecture: x64 cache: maven - name: Build Project @@ -193,7 +260,7 @@ jobs: uses: actions/setup-java@v3 with: java-version: '17' - distribution: 'adopt' + distribution: 'temurin' architecture: x64 cache: maven - name: Quickstart Test @@ -201,16 +268,16 @@ jobs: SCALA_PROFILE: ${{ matrix.scalaProfile }} SPARK_PROFILE: ${{ matrix.sparkProfile }} run: - mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl hudi-examples/hudi-examples-spark $MVN_ARGS - - name: UT - Common & Spark + mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DwildcardSuites=skipScalaTests -DfailIfNoTests=false -pl hudi-examples/hudi-examples-spark $MVN_ARGS + - name: Java UT - Common & Spark env: SCALA_PROFILE: ${{ matrix.scalaProfile }} SPARK_PROFILE: ${{ matrix.sparkProfile }} SPARK_MODULES: ${{ matrix.sparkModules }} if: ${{ 
!endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI run: - mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl "hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS - - name: FT - Spark + mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DwildcardSuites=skipScalaTests -DfailIfNoTests=false -pl "hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + - name: Java FT - Spark env: SCALA_PROFILE: ${{ matrix.scalaProfile }} SPARK_PROFILE: ${{ matrix.sparkProfile }} @@ -219,6 +286,60 @@ jobs: run: mvn test -Pfunctional-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + test-spark-java17-scala-tests: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.3" + sparkModules: "hudi-spark-datasource/hudi-spark3.3.x" + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.4" + sparkModules: "hudi-spark-datasource/hudi-spark3.4.x" + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.5" + sparkModules: "hudi-spark-datasource/hudi-spark3.5.x" + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: '8' + distribution: 'temurin' + architecture: x64 + cache: maven + - name: Build Project + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + run: + mvn clean install -T 2 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DskipTests=true $MVN_ARGS -am -pl "hudi-examples/hudi-examples-spark,hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" + - name: Set up JDK 17 + uses: actions/setup-java@v3 + with: + java-version: '17' + distribution: 'temurin' + architecture: x64 + cache: maven + - name: Scala UT - Common & Spark + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_MODULES: ${{ matrix.sparkModules }} + if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI + run: + mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -Dtest=skipJavaTests -DfailIfNoTests=false -pl "hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + - name: Scala FT - Spark + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_MODULES: ${{ matrix.sparkModules }} + if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI + run: + mvn test -Pfunctional-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -Dtest=skipJavaTests -DfailIfNoTests=false -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + test-flink: runs-on: ubuntu-latest strategy: @@ -235,7 +356,7 @@ jobs: uses: actions/setup-java@v3 with: java-version: '8' - distribution: 'adopt' + distribution: 'temurin' architecture: x64 cache: maven - name: Build Project @@ -277,7 +398,7 @@ jobs: uses: actions/setup-java@v3 with: java-version: '8' - distribution: 'adopt' + distribution: 'temurin' architecture: x64 cache: maven - name: UT/FT - Docker Test - OpenJDK 17 @@ -326,7 +447,7 @@ jobs: uses: actions/setup-java@v3 with: java-version: '8' - distribution: 'adopt' + distribution: 'temurin' architecture: x64 cache: maven - name: Build Project @@ -368,78 +489,6 @@ jobs: HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout) ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION openjdk17 - validate-release-candidate-bundles: - if: false - runs-on: ubuntu-latest - env: - 
HUDI_VERSION: 0.13.1-rcx - STAGING_REPO_NUM: 1123 - strategy: - matrix: - include: - - flinkProfile: 'flink1.18' - sparkProfile: 'spark3' - sparkRuntime: 'spark3.5.0' - - flinkProfile: 'flink1.18' - sparkProfile: 'spark3.5' - sparkRuntime: 'spark3.5.0' - - flinkProfile: 'flink1.18' - sparkProfile: 'spark3.4' - sparkRuntime: 'spark3.4.0' - - flinkProfile: 'flink1.17' - sparkProfile: 'spark3.3' - sparkRuntime: 'spark3.3.2' - - flinkProfile: 'flink1.16' - sparkProfile: 'spark3.3' - sparkRuntime: 'spark3.3.1' - - flinkProfile: 'flink1.15' - sparkProfile: 'spark3.2' - sparkRuntime: 'spark3.2.3' - - flinkProfile: 'flink1.14' - sparkProfile: 'spark3.1' - sparkRuntime: 'spark3.1.3' - - flinkProfile: 'flink1.14' - sparkProfile: 'spark3.0' - sparkRuntime: 'spark3.0.2' - - flinkProfile: 'flink1.14' - sparkProfile: 'spark' - sparkRuntime: 'spark2.4.8' - - flinkProfile: 'flink1.14' - sparkProfile: 'spark2.4' - sparkRuntime: 'spark2.4.8' - steps: - - uses: actions/checkout@v3 - - name: Set up JDK 8 - uses: actions/setup-java@v3 - with: - java-version: '8' - distribution: 'adopt' - architecture: x64 - cache: maven - - name: IT - Bundle Validation - OpenJDK 8 - env: - FLINK_PROFILE: ${{ matrix.flinkProfile }} - SPARK_PROFILE: ${{ matrix.sparkProfile }} - SPARK_RUNTIME: ${{ matrix.sparkRuntime }} - run: | - ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION openjdk8 $STAGING_REPO_NUM - - name: IT - Bundle Validation - OpenJDK 11 - env: - FLINK_PROFILE: ${{ matrix.flinkProfile }} - SPARK_PROFILE: ${{ matrix.sparkProfile }} - SPARK_RUNTIME: ${{ matrix.sparkRuntime }} - if: ${{ startsWith(env.SPARK_PROFILE, 'spark3') }} # Only Spark 3.x supports Java 11 as of now - run: | - ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION openjdk11 $STAGING_REPO_NUM - - name: IT - Bundle Validation - OpenJDK 17 - env: - FLINK_PROFILE: ${{ matrix.flinkProfile }} - SPARK_PROFILE: ${{ matrix.sparkProfile }} - SPARK_RUNTIME: ${{ matrix.sparkRuntime }} - if: ${{ endsWith(env.SPARK_PROFILE, '3.3') }} # Only Spark 3.3 supports Java 17 as of now - run: | - ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION openjdk17 $STAGING_REPO_NUM - integration-tests: runs-on: ubuntu-latest strategy: @@ -453,7 +502,7 @@ jobs: uses: actions/setup-java@v3 with: java-version: '8' - distribution: 'adopt' + distribution: 'temurin' architecture: x64 cache: maven - name: Build Project diff --git a/.github/workflows/labeler.js b/.github/workflows/labeler.js new file mode 100644 index 000000000000..77cd48337fb1 --- /dev/null +++ b/.github/workflows/labeler.js @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +async function labelDocsPr({ github, context, prNumber }) { + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + labels: ['docs'] + }); + + console.log(`- Labeled Docs PR: ${prNumber}`); +} + +async function labelPrWithSize({ github, context, prNumber, prData }) { + console.log(`Label PR based on size: ${prNumber} ${prData.html_url}`); + const additions = prData.additions; + const deletions = prData.deletions; + const totalChanges = additions + deletions; + + let newSizeLabel = ""; + + if (totalChanges <= 10) { + // size:XS : <= 10 LoC + newSizeLabel = "size:XS"; + } else if (totalChanges <= 100) { + // size:S : (10, 100] LoC + newSizeLabel = "size:S"; + } else if (totalChanges <= 300) { + // size:M : (100, 300] LoC + newSizeLabel = "size:M"; + } else if (totalChanges <= 1000) { + // size:L : (300, 1000] LoC + newSizeLabel = "size:L"; + } else { + // size:XL : > 1000 LoC + newSizeLabel = "size:XL"; + } + + // Check existing size label + const { data: labels } = await github.rest.issues.listLabelsOnIssue({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber + }); + + const existingSizeLabels = labels.filter(label => label.name.startsWith("size:") && label.name !== newSizeLabel); + const newSizeLabelInExisting = labels.filter(label => label.name === newSizeLabel); + + // Remove stale labels that do not match the new one + for (const label of existingSizeLabels) { + await github.rest.issues.removeLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + name: label.name, + }); + console.log(`Removed stale size label: ${label.name}`); + } + + console.log(`Total lines of changes: ${totalChanges}`); + + // Add the new size label if needed + if (newSizeLabelInExisting.length > 0) { + console.log(`Accurate size Label already exists: ${newSizeLabel}`); + } else { + // Add the new label + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + labels: [newSizeLabel] + }); + console.log(`Added size Label: ${newSizeLabel}`); + } +} + +module.exports = { + labelDocsPr, + labelPrWithSize +}; diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml deleted file mode 100644 index d0b809c29587..000000000000 --- a/.github/workflows/labeler.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: Label PR - -on: [ pull_request ] - -jobs: - labeler: - runs-on: ubuntu-latest - name: Label the PR size - steps: - - uses: codelytv/pr-size-labeler@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - xs_label: 'size-xs' - xs_max_size: '10' - s_label: 'size-s' - s_max_size: '100' - m_label: 'size-m' - m_max_size: '500' - l_label: 'size-l' - l_max_size: '1000' - xl_label: 'size-xl' - fail_if_xl: 'false' - github_api_url: 'api.github.com' - files_to_ignore: '' \ No newline at end of file diff --git a/.github/workflows/pr_compliance.yml b/.github/workflows/pr_compliance.yml index 3f58ceafcf3d..104a933db7d0 100644 --- a/.github/workflows/pr_compliance.yml +++ b/.github/workflows/pr_compliance.yml @@ -4,6 +4,7 @@ on: types: [opened, edited, reopened, synchronize] branches: - master + - branch-0.x jobs: validate-pr: diff --git a/.github/workflows/release_candidate_validation.yml b/.github/workflows/release_candidate_validation.yml new file mode 100644 index 000000000000..02a598888ea1 --- /dev/null +++ b/.github/workflows/release_candidate_validation.yml @@ -0,0 +1,100 @@ +name: Release Candidate Validation + +on: 
+ push: + branches: + - 'release-*' + pull_request: + paths-ignore: + - '**.bmp' + - '**.gif' + - '**.jpg' + - '**.jpeg' + - '**.md' + - '**.pdf' + - '**.png' + - '**.svg' + - '**.yaml' + - '.gitignore' + branches: + - 'release-*' + +concurrency: + group: ${{ github.ref }} + cancel-in-progress: ${{ !contains(github.ref, 'master') }} + +env: + MVN_ARGS: -e -ntp -B -V -Dgpg.skip -Djacoco.skip -Pwarn-log -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.shade=warn -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.dependency=warn -Dmaven.wagon.httpconnectionManager.ttlSeconds=25 -Dmaven.wagon.http.retryHandler.count=5 + SPARK_COMMON_MODULES: hudi-spark-datasource/hudi-spark,hudi-spark-datasource/hudi-spark-common + +jobs: + validate-release-candidate-bundles: + runs-on: ubuntu-latest + env: + HUDI_VERSION: 0.14.1 + STAGING_REPO_NUM: 1123 + strategy: + matrix: + include: + - flinkProfile: 'flink1.18' + sparkProfile: 'spark3' + sparkRuntime: 'spark3.5.0' + - flinkProfile: 'flink1.18' + sparkProfile: 'spark3.5' + sparkRuntime: 'spark3.5.0' + - flinkProfile: 'flink1.18' + sparkProfile: 'spark3.4' + sparkRuntime: 'spark3.4.0' + - flinkProfile: 'flink1.17' + sparkProfile: 'spark3.3' + sparkRuntime: 'spark3.3.2' + - flinkProfile: 'flink1.16' + sparkProfile: 'spark3.3' + sparkRuntime: 'spark3.3.1' + - flinkProfile: 'flink1.15' + sparkProfile: 'spark3.2' + sparkRuntime: 'spark3.2.3' + - flinkProfile: 'flink1.14' + sparkProfile: 'spark3.1' + sparkRuntime: 'spark3.1.3' + - flinkProfile: 'flink1.14' + sparkProfile: 'spark3.0' + sparkRuntime: 'spark3.0.2' + - flinkProfile: 'flink1.14' + sparkProfile: 'spark' + sparkRuntime: 'spark2.4.8' + - flinkProfile: 'flink1.14' + sparkProfile: 'spark2.4' + sparkRuntime: 'spark2.4.8' + steps: + - uses: actions/checkout@v3 + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: '8' + distribution: 'temurin' + architecture: x64 + cache: maven + - name: IT - Bundle Validation - OpenJDK 8 + env: + FLINK_PROFILE: ${{ matrix.flinkProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_RUNTIME: ${{ matrix.sparkRuntime }} + run: | + ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION openjdk8 $STAGING_REPO_NUM + - name: IT - Bundle Validation - OpenJDK 11 + env: + FLINK_PROFILE: ${{ matrix.flinkProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_RUNTIME: ${{ matrix.sparkRuntime }} + if: ${{ startsWith(env.SPARK_PROFILE, 'spark3') }} # Only Spark 3.x supports Java 11 as of now + run: | + ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION openjdk11 $STAGING_REPO_NUM + - name: IT - Bundle Validation - OpenJDK 17 + env: + FLINK_PROFILE: ${{ matrix.flinkProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_RUNTIME: ${{ matrix.sparkRuntime }} + if: ${{ endsWith(env.SPARK_PROFILE, '3.3') }} # Only Spark 3.3 supports Java 17 as of now + run: | + ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION openjdk17 $STAGING_REPO_NUM diff --git a/.github/workflows/scheduled_workflow.yml b/.github/workflows/scheduled_workflow.yml new file mode 100644 index 000000000000..e6992d6b3838 --- /dev/null +++ b/.github/workflows/scheduled_workflow.yml @@ -0,0 +1,105 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Scheduled Workflow + +on: + schedule: + # Runs every 5 minutes + - cron: '*/5 * * * *' + +permissions: + statuses: write + pull-requests: write + issues: read + +jobs: + process-new-and-updated-prs: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Process new and updated PRs + # We have to run any actions that require write permissions here + # since the workflow triggered by events from a PR in a fork + # (not apache/hudi but other_owner/hudi) does not run on a + # GITHUB_TOKEN with write permissions (this is prohibited by + # Apache). + uses: actions/github-script@v7 + with: + github-token: ${{secrets.GITHUB_TOKEN}} + script: | + // Cron schedule may not be reliable so giving buffer time to avoid missing recent PRs + const since = new Date(new Date().getTime() - (900 * 1000)).toISOString(); + const query = `repo:${context.repo.owner}/${context.repo.repo} type:pr state:open updated:>=${since}`; + const openPrs = await github.paginate(github.rest.search.issuesAndPullRequests, { + q: query, + sort: 'updated', + order: 'desc', + per_page: 100 + }); + + const { labelDocsPr, labelPrWithSize } = require(`${process.env.GITHUB_WORKSPACE}/.github/workflows/labeler.js`); + const checkAzureCiAndCreateCommitStatus = require(`${process.env.GITHUB_WORKSPACE}/.github/workflows/azure_ci.js`); + + console.log(`Number of PRs to process: ${openPrs.length}`); + + for (const pr of openPrs) { + console.log(`*** Processing PR: ${pr.title}, URL: ${pr.html_url}`); + + const { data: pullRequest } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: pr.number + }); + + const targetBase = pullRequest.base.ref; + console.log(`Target base branch: ${targetBase}`); + + // Label docs PR (targeting "asf-site" branch) + if (targetBase === 'asf-site') { + await labelDocsPr({ + github, + context, + prNumber: pr.number + }); + } + + // Label PR size + await labelPrWithSize({ + github, + context, + prNumber: pr.number, + prData: pullRequest + }); + + // Check Azure CI and create commit status (targeting "master", "release*", or "branch-0.x" branch) + const targetBaseRegex = /^(master|release.*|branch-0\.x)$/; + if (targetBaseRegex.test(targetBase) + && !pr.body.includes('HOTFIX: SKIP AZURE CI')) { + const latestCommitHash = pullRequest.head.sha; + + // Create commit status based on Azure CI report to PR + await checkAzureCiAndCreateCommitStatus({ + github, + context, + prNumber: pr.number, + latestCommitHash: latestCommitHash + }); + } + } diff --git a/.gitignore b/.gitignore index 6c77bdab59de..3f72a1fced51 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,7 @@ local.properties .out .idea/* !.idea/vcs.xml +!.idea/icon.png *.ipr *.iws *.iml diff --git a/.idea/icon.png b/.idea/icon.png new file mode 100644 index 000000000000..94e623516d86 Binary files /dev/null and b/.idea/icon.png differ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000000..71b2f1077a09 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# 
contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Use a home made image as the base, which includes: +# utuntu:latest +# git +# thrift +# maven +# java8 +FROM apachehudi/hudi-ci-bundle-validation-base:azure_ci_test_base_new + +CMD ["java", "-version"] + +# Set the working directory to /app +WORKDIR /hudi + +# Copy git repo into the working directory +COPY . /hudi diff --git a/azure-pipelines-20230430.yml b/azure-pipelines-20230430.yml index 0767d179b243..e61057a4649d 100644 --- a/azure-pipelines-20230430.yml +++ b/azure-pipelines-20230430.yml @@ -30,6 +30,10 @@ parameters: type: object default: - 'hudi-common' + - 'hudi-client/hudi-spark-client' + - name: job2UTModules + type: object + default: - 'hudi-flink-datasource' - 'hudi-flink-datasource/hudi-flink' - 'hudi-flink-datasource/hudi-flink1.14.x' @@ -37,12 +41,20 @@ parameters: - 'hudi-flink-datasource/hudi-flink1.16.x' - 'hudi-flink-datasource/hudi-flink1.17.x' - 'hudi-flink-datasource/hudi-flink1.18.x' - - name: job2Modules + - name: job2FTModules type: object default: + - 'hudi-common' + - 'hudi-flink-datasource' + - 'hudi-flink-datasource/hudi-flink' + - 'hudi-flink-datasource/hudi-flink1.14.x' + - 'hudi-flink-datasource/hudi-flink1.15.x' + - 'hudi-flink-datasource/hudi-flink1.16.x' + - 'hudi-flink-datasource/hudi-flink1.17.x' + - 'hudi-flink-datasource/hudi-flink1.18.x' - 'hudi-client/hudi-spark-client' - 'hudi-spark-datasource/hudi-spark' - - name: job3UTModules + - name: job34UTModules type: object default: - 'hudi-spark-datasource' @@ -51,12 +63,13 @@ parameters: - 'hudi-spark-datasource/hudi-spark3.2plus-common' - 'hudi-spark-datasource/hudi-spark3-common' - 'hudi-spark-datasource/hudi-spark-common' - - name: job4UTModules + - name: job6UTModules type: object default: - '!hudi-hadoop-mr' - '!hudi-client/hudi-java-client' - '!hudi-client/hudi-spark-client' + - '!hudi-cli' - '!hudi-common' - '!hudi-examples' - '!hudi-examples/hudi-examples-common' @@ -76,10 +89,11 @@ parameters: - '!hudi-spark-datasource/hudi-spark3.2plus-common' - '!hudi-spark-datasource/hudi-spark3-common' - '!hudi-spark-datasource/hudi-spark-common' - - name: job4FTModules + - name: job6FTModules type: object default: - '!hudi-client/hudi-spark-client' + - '!hudi-cli' - '!hudi-common' - '!hudi-examples' - '!hudi-examples/hudi-examples-common' @@ -94,50 +108,59 @@ parameters: - '!hudi-flink-datasource/hudi-flink1.17.x' - '!hudi-flink-datasource/hudi-flink1.18.x' - '!hudi-spark-datasource/hudi-spark' + - name: job4HudiSparkDmlOthersWildcardSuites + type: object + default: + - 'org.apache.hudi' + - 'org.apache.spark.hudi' + - 'org.apache.spark.sql.avro' + - 'org.apache.spark.sql.execution' + - 'org.apache.spark.sql.hudi.analysis' + - 'org.apache.spark.sql.hudi.command' + - 'org.apache.spark.sql.hudi.common' + - 'org.apache.spark.sql.hudi.dml' variables: BUILD_PROFILES: '-Dscala-2.12 -Dspark3.2 -Dflink1.18' 
PLUGIN_OPTS: '-Dcheckstyle.skip=true -Drat.skip=true -Djacoco.skip=true -ntp -B -V -Pwarn-log -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.shade=warn -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.dependency=warn' - MVN_OPTS_INSTALL: '-Phudi-platform-service -DskipTests $(BUILD_PROFILES) $(PLUGIN_OPTS) -Dmaven.wagon.httpconnectionManager.ttlSeconds=25 -Dmaven.wagon.http.retryHandler.count=5' + MVN_OPTS_INSTALL: '-T 3 -Phudi-platform-service -DskipTests $(BUILD_PROFILES) $(PLUGIN_OPTS) -Dmaven.wagon.httpconnectionManager.ttlSeconds=25 -Dmaven.wagon.http.retryHandler.count=5' MVN_OPTS_TEST: '-fae -Pwarn-log $(BUILD_PROFILES) $(PLUGIN_OPTS)' + JAVA_MVN_TEST_FILTER: '-DwildcardSuites=skipScalaTests -DfailIfNoTests=false' + SCALA_MVN_TEST_FILTER: '-Dtest=skipJavaTests -DfailIfNoTests=false' JOB1_MODULES: ${{ join(',',parameters.job1Modules) }} - JOB2_MODULES: ${{ join(',',parameters.job2Modules) }} - JOB3_MODULES: ${{ join(',',parameters.job3UTModules) }} - JOB4_UT_MODULES: ${{ join(',',parameters.job4UTModules) }} - JOB4_FT_MODULES: ${{ join(',',parameters.job4FTModules) }} + JOB2_UT_MODULES: ${{ join(',',parameters.job2UTModules) }} + JOB2_FT_MODULES: ${{ join(',',parameters.job2FTModules) }} + JOB34_MODULES: ${{ join(',',parameters.job34UTModules) }} + JOB3_SPARK_DDL_WILDCARD_SUITES: 'org.apache.spark.sql.hudi.ddl' + JOB6_SPARK_PROCEDURE_WILDCARD_SUITES: 'org.apache.spark.sql.hudi.procedure' + JOB4_SPARK_DML_OTHERS_WILDCARD_SUITES: ${{ join(',',parameters.job4HudiSparkDmlOthersWildcardSuites) }} + JOB6_UT_MODULES: ${{ join(',',parameters.job6UTModules) }} + JOB6_FT_MODULES: ${{ join(',',parameters.job6FTModules) }} stages: - stage: test + variables: + - name: DOCKER_BUILDKIT + value: 1 jobs: - job: UT_FT_1 - displayName: UT FT common & flink & UT client/spark-client - timeoutInMinutes: '150' + displayName: UT common & client/spark-client + timeoutInMinutes: '90' steps: - task: Maven@4 displayName: maven install inputs: mavenPomFile: 'pom.xml' goals: 'clean install' - options: $(MVN_OPTS_INSTALL) - publishJUnitResults: true - testResultsFiles: '**/surefire-reports/TEST-*.xml' + options: $(MVN_OPTS_INSTALL) -pl $(JOB1_MODULES) -am + publishJUnitResults: false jdkVersionOption: '1.8' - task: Maven@4 - displayName: UT common flink client/spark-client + displayName: UT common & client/spark-client inputs: mavenPomFile: 'pom.xml' goals: 'test' - options: $(MVN_OPTS_TEST) -Punit-tests -pl $(JOB1_MODULES),hudi-client/hudi-spark-client - publishJUnitResults: true - testResultsFiles: '**/surefire-reports/TEST-*.xml' - jdkVersionOption: '1.8' - mavenOptions: '-Xmx4g' - - task: Maven@4 - displayName: FT common flink - inputs: - mavenPomFile: 'pom.xml' - goals: 'test' - options: $(MVN_OPTS_TEST) -Pfunctional-tests -pl $(JOB1_MODULES) + options: $(MVN_OPTS_TEST) -Punit-tests -pl $(JOB1_MODULES) publishJUnitResults: true testResultsFiles: '**/surefire-reports/TEST-*.xml' jdkVersionOption: '1.8' @@ -146,24 +169,32 @@ stages: grep "testcase" */target/surefire-reports/*.xml */*/target/surefire-reports/*.xml | awk -F'"' ' { print $6,$4,$2 } ' | sort -nr | head -n 100 displayName: Top 100 long-running testcases - job: UT_FT_2 - displayName: FT client/spark-client & hudi-spark-datasource/hudi-spark - timeoutInMinutes: '150' + displayName: UT flink & FT common & flink & spark-client & hudi-spark + timeoutInMinutes: '90' steps: - task: Maven@4 displayName: maven install inputs: mavenPomFile: 'pom.xml' goals: 'clean install' - options: $(MVN_OPTS_INSTALL) - publishJUnitResults: true - 
testResultsFiles: '**/surefire-reports/TEST-*.xml' + options: $(MVN_OPTS_INSTALL) -pl $(JOB2_FT_MODULES) -am + publishJUnitResults: false jdkVersionOption: '1.8' - task: Maven@4 - displayName: FT client/spark-client & hudi-spark-datasource/hudi-spark + displayName: UT flink inputs: mavenPomFile: 'pom.xml' goals: 'test' - options: $(MVN_OPTS_TEST) -Pfunctional-tests -pl $(JOB2_MODULES) + options: $(MVN_OPTS_TEST) -Punit-tests -pl $(JOB2_UT_MODULES) + publishJUnitResults: false + jdkVersionOption: '1.8' + mavenOptions: '-Xmx4g' + - task: Maven@4 + displayName: FT common & flink & client/spark-client & hudi-spark-datasource/hudi-spark + inputs: + mavenPomFile: 'pom.xml' + goals: 'test' + options: $(MVN_OPTS_TEST) -Pfunctional-tests -pl $(JOB2_FT_MODULES) publishJUnitResults: true testResultsFiles: '**/surefire-reports/TEST-*.xml' jdkVersionOption: '1.8' @@ -172,24 +203,32 @@ stages: grep "testcase" */target/surefire-reports/*.xml */*/target/surefire-reports/*.xml | awk -F'"' ' { print $6,$4,$2 } ' | sort -nr | head -n 100 displayName: Top 100 long-running testcases - job: UT_FT_3 - displayName: UT spark-datasource - timeoutInMinutes: '240' + displayName: UT spark-datasource Java Tests & DDL + timeoutInMinutes: '90' steps: - task: Maven@4 displayName: maven install inputs: mavenPomFile: 'pom.xml' goals: 'clean install' - options: $(MVN_OPTS_INSTALL) - publishJUnitResults: true - testResultsFiles: '**/surefire-reports/TEST-*.xml' + options: $(MVN_OPTS_INSTALL) -pl $(JOB34_MODULES) -am + publishJUnitResults: false jdkVersionOption: '1.8' - task: Maven@4 - displayName: UT spark-datasource + displayName: Java UT spark-datasource inputs: mavenPomFile: 'pom.xml' goals: 'test' - options: $(MVN_OPTS_TEST) -Punit-tests -pl $(JOB3_MODULES) + options: $(MVN_OPTS_TEST) -Punit-tests $(JAVA_MVN_TEST_FILTER) -pl $(JOB34_MODULES) + publishJUnitResults: false + jdkVersionOption: '1.8' + mavenOptions: '-Xmx4g' + - task: Maven@4 + displayName: Scala UT spark-datasource DDL + inputs: + mavenPomFile: 'pom.xml' + goals: 'test' + options: $(MVN_OPTS_TEST) -Punit-tests $(SCALA_MVN_TEST_FILTER) -DwildcardSuites="$(JOB3_SPARK_DDL_WILDCARD_SUITES)" -pl $(JOB34_MODULES) publishJUnitResults: true testResultsFiles: '**/surefire-reports/TEST-*.xml' jdkVersionOption: '1.8' @@ -198,38 +237,116 @@ stages: grep "testcase" */target/surefire-reports/*.xml */*/target/surefire-reports/*.xml | awk -F'"' ' { print $6,$4,$2 } ' | sort -nr | head -n 100 displayName: Top 100 long-running testcases - job: UT_FT_4 - displayName: UT FT other modules - timeoutInMinutes: '240' + displayName: UT spark-datasource DML & others + timeoutInMinutes: '90' steps: - task: Maven@4 displayName: maven install inputs: mavenPomFile: 'pom.xml' goals: 'clean install' - options: $(MVN_OPTS_INSTALL) - publishJUnitResults: true - testResultsFiles: '**/surefire-reports/TEST-*.xml' + options: $(MVN_OPTS_INSTALL) -pl $(JOB34_MODULES) -am + publishJUnitResults: false jdkVersionOption: '1.8' - task: Maven@4 - displayName: UT other modules + displayName: Scala UT spark-datasource DML & others inputs: mavenPomFile: 'pom.xml' goals: 'test' - options: $(MVN_OPTS_TEST) -Punit-tests -pl $(JOB4_UT_MODULES) + options: $(MVN_OPTS_TEST) -Punit-tests $(SCALA_MVN_TEST_FILTER) -DwildcardSuites="$(JOB4_SPARK_DML_OTHERS_WILDCARD_SUITES)" -pl $(JOB34_MODULES) publishJUnitResults: true testResultsFiles: '**/surefire-reports/TEST-*.xml' jdkVersionOption: '1.8' mavenOptions: '-Xmx4g' - - task: Maven@4 - displayName: FT other modules + - script: | + grep "testcase" 
*/target/surefire-reports/*.xml */*/target/surefire-reports/*.xml | awk -F'"' ' { print $6,$4,$2 } ' | sort -nr | head -n 100 + displayName: Top 100 long-running testcases + - job: UT_FT_5 + displayName: UT FT Hudi Streamer + timeoutInMinutes: '90' + steps: + - task: Docker@2 + displayName: "login to docker hub" inputs: - mavenPomFile: 'pom.xml' - goals: 'test' - options: $(MVN_OPTS_TEST) -Pfunctional-tests -pl $(JOB4_FT_MODULES) - publishJUnitResults: true + command: "login" + containerRegistry: "apachehudi-docker-hub" + - task: Docker@2 + displayName: "load repo into image" + inputs: + containerRegistry: 'apachehudi-docker-hub' + repository: 'apachehudi/hudi-ci-bundle-validation-base' + command: 'build' + Dockerfile: '**/Dockerfile' + ImageName: $(Build.BuildId) + - task: Docker@2 + displayName: "UT FT other modules" + inputs: + containerRegistry: 'apachehudi-docker-hub' + repository: 'apachehudi/hudi-ci-bundle-validation-base' + command: 'run' + arguments: > + -v $(Build.SourcesDirectory):/hudi + -i docker.io/apachehudi/hudi-ci-bundle-validation-base:$(Build.BuildId) + /bin/bash -c "pwd + && rm -rf /hudi/scripts/ci/results + && mvn clean install $(MVN_OPTS_INSTALL) -Phudi-platform-service -Pthrift-gen-source -pl hudi-utilities -am + && mvn test $(MVN_OPTS_TEST) -Punit-tests -Dtest="Test*DeltaStreamer*" -DfailIfNoTests=false -pl hudi-utilities + && mvn test $(MVN_OPTS_TEST) -Pfunctional-tests -Dtest="Test*DeltaStreamer*" -DfailIfNoTests=false -pl hudi-utilities + && ./scripts/ci/move_surefire_reports.sh /hudi /hudi/scripts/ci/results + && echo 'All surefire report files:' + && find . -type f -name \"TEST-*.xml\"" + - task: PublishTestResults@2 + displayName: 'Publish Test Results' + inputs: + testResultsFormat: 'JUnit' testResultsFiles: '**/surefire-reports/TEST-*.xml' - jdkVersionOption: '1.8' - mavenOptions: '-Xmx4g' + searchFolder: '$(Build.SourcesDirectory)/scripts/ci/results' + failTaskOnFailedTests: true - script: | - grep "testcase" */target/surefire-reports/*.xml */*/target/surefire-reports/*.xml | awk -F'"' ' { print $6,$4,$2 } ' | sort -nr | head -n 100 + grep "testcase" scripts/ci/results/*/target/surefire-reports/*.xml scripts/ci/results/*/*/target/surefire-reports/*.xml | awk -F'"' ' { print $6,$4,$2 } ' | sort -nr | head -n 100 + displayName: Top 100 long-running testcases + - job: UT_FT_6 + displayName: UT FT other modules + timeoutInMinutes: '90' + steps: + - task: Docker@2 + displayName: "login to docker hub" + inputs: + command: "login" + containerRegistry: "apachehudi-docker-hub" + - task: Docker@2 + displayName: "load repo into image" + inputs: + containerRegistry: 'apachehudi-docker-hub' + repository: 'apachehudi/hudi-ci-bundle-validation-base' + command: 'build' + Dockerfile: '**/Dockerfile' + ImageName: $(Build.BuildId) + - task: Docker@2 + displayName: "UT FT other modules" + inputs: + containerRegistry: 'apachehudi-docker-hub' + repository: 'apachehudi/hudi-ci-bundle-validation-base' + command: 'run' + arguments: > + -v $(Build.SourcesDirectory):/hudi + -i docker.io/apachehudi/hudi-ci-bundle-validation-base:$(Build.BuildId) + /bin/bash -c "pwd + && rm -rf /hudi/scripts/ci/results + && mvn clean install $(MVN_OPTS_INSTALL) -Phudi-platform-service -Pthrift-gen-source + && mvn test $(MVN_OPTS_TEST) -Punit-tests $(SCALA_MVN_TEST_FILTER) -DwildcardSuites="$(JOB6_SPARK_PROCEDURE_WILDCARD_SUITES)" -pl $(JOB34_MODULES) + && mvn test $(MVN_OPTS_TEST) -Punit-tests -Dtest="!Test*DeltaStreamer*" -DfailIfNoTests=false -pl $(JOB6_UT_MODULES) + && mvn test $(MVN_OPTS_TEST) 
-Pfunctional-tests -Dtest="!Test*DeltaStreamer*" -DfailIfNoTests=false -pl $(JOB6_FT_MODULES) + && ./scripts/ci/move_surefire_reports.sh /hudi /hudi/scripts/ci/results + && echo 'All surefire report files:' + && find . -type f -name \"TEST-*.xml\"" + - task: PublishTestResults@2 + displayName: 'Publish Test Results' + inputs: + testResultsFormat: 'JUnit' + testResultsFiles: '**/surefire-reports/TEST-*.xml' + searchFolder: '$(Build.SourcesDirectory)/scripts/ci/results' + failTaskOnFailedTests: true + - script: | + grep "testcase" scripts/ci/results/*/target/surefire-reports/*.xml scripts/ci/results/*/*/target/surefire-reports/*.xml | awk -F'"' ' { print $6,$4,$2 } ' | sort -nr | head -n 100 displayName: Top 100 long-running testcases diff --git a/hudi-aws/pom.xml b/hudi-aws/pom.xml index 9da22ff32c2a..5aadd6b1cca9 100644 --- a/hudi-aws/pom.xml +++ b/hudi-aws/pom.xml @@ -299,9 +299,6 @@ src/main/resources - - src/test/resources - diff --git a/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogSyncClient.java b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogSyncClient.java index 1e19b44a4992..2050e4f60ffe 100644 --- a/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogSyncClient.java +++ b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogSyncClient.java @@ -22,6 +22,8 @@ import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.CustomizedThreadFactory; +import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.GlueCatalogSyncClientConfig; import org.apache.hudi.hive.HiveSyncConfig; @@ -37,6 +39,8 @@ import software.amazon.awssdk.services.glue.model.BatchCreatePartitionResponse; import software.amazon.awssdk.services.glue.model.BatchDeletePartitionRequest; import software.amazon.awssdk.services.glue.model.BatchDeletePartitionResponse; +import software.amazon.awssdk.services.glue.model.BatchGetPartitionRequest; +import software.amazon.awssdk.services.glue.model.BatchGetPartitionResponse; import software.amazon.awssdk.services.glue.model.BatchUpdatePartitionRequest; import software.amazon.awssdk.services.glue.model.BatchUpdatePartitionRequestEntry; import software.amazon.awssdk.services.glue.model.BatchUpdatePartitionResponse; @@ -59,6 +63,7 @@ import software.amazon.awssdk.services.glue.model.PartitionIndexDescriptor; import software.amazon.awssdk.services.glue.model.PartitionInput; import software.amazon.awssdk.services.glue.model.PartitionValueList; +import software.amazon.awssdk.services.glue.model.Segment; import software.amazon.awssdk.services.glue.model.SerDeInfo; import software.amazon.awssdk.services.glue.model.StorageDescriptor; import software.amazon.awssdk.services.glue.model.Table; @@ -81,14 +86,21 @@ import java.util.Objects; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.function.Consumer; import java.util.stream.Collectors; import static org.apache.hudi.aws.utils.S3Utils.s3aToS3; import static org.apache.hudi.common.util.MapUtils.containsAll; import static org.apache.hudi.common.util.MapUtils.isNullOrEmpty; -import static org.apache.hudi.config.GlueCatalogSyncClientConfig.GLUE_METADATA_FILE_LISTING; +import static 
org.apache.hudi.config.GlueCatalogSyncClientConfig.CHANGED_PARTITIONS_READ_PARALLELISM; import static org.apache.hudi.config.GlueCatalogSyncClientConfig.META_SYNC_PARTITION_INDEX_FIELDS; import static org.apache.hudi.config.GlueCatalogSyncClientConfig.META_SYNC_PARTITION_INDEX_FIELDS_ENABLE; +import static org.apache.hudi.config.GlueCatalogSyncClientConfig.PARTITION_CHANGE_PARALLELISM; +import static org.apache.hudi.config.GlueCatalogSyncClientConfig.GLUE_METADATA_FILE_LISTING; +import static org.apache.hudi.config.GlueCatalogSyncClientConfig.ALL_PARTITIONS_READ_PARALLELISM; import static org.apache.hudi.config.HoodieAWSConfig.AWS_GLUE_ENDPOINT; import static org.apache.hudi.config.HoodieAWSConfig.AWS_GLUE_REGION; import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE; @@ -109,12 +121,12 @@ public class AWSGlueCatalogSyncClient extends HoodieSyncClient { private static final Logger LOG = LoggerFactory.getLogger(AWSGlueCatalogSyncClient.class); - private static final int MAX_PARTITIONS_PER_REQUEST = 100; + private static final int MAX_PARTITIONS_PER_CHANGE_REQUEST = 100; + private static final int MAX_PARTITIONS_PER_READ_REQUEST = 1000; private static final int MAX_DELETE_PARTITIONS_PER_REQUEST = 25; protected final GlueAsyncClient awsGlue; private static final String GLUE_PARTITION_INDEX_ENABLE = "partition_filtering.enabled"; private static final int PARTITION_INDEX_MAX_NUMBER = 3; - private static final int GLUE_EXPRESSION_MAX_CHARS = 2048; /** * athena v2/v3 table property * see https://docs.aws.amazon.com/athena/latest/ug/querying-hudi.html @@ -124,6 +136,9 @@ public class AWSGlueCatalogSyncClient extends HoodieSyncClient { private final Boolean skipTableArchive; private final String enableMetadataTable; + private final int allPartitionsReadParallelism; + private final int changedPartitionsReadParallelism; + private final int changeParallelism; public AWSGlueCatalogSyncClient(HiveSyncConfig config) { super(config); @@ -141,105 +156,196 @@ public AWSGlueCatalogSyncClient(HiveSyncConfig config) { this.databaseName = config.getStringOrDefault(META_SYNC_DATABASE_NAME); this.skipTableArchive = config.getBooleanOrDefault(GlueCatalogSyncClientConfig.GLUE_SKIP_TABLE_ARCHIVE); this.enableMetadataTable = Boolean.toString(config.getBoolean(GLUE_METADATA_FILE_LISTING)).toUpperCase(); + this.allPartitionsReadParallelism = config.getIntOrDefault(ALL_PARTITIONS_READ_PARALLELISM); + this.changedPartitionsReadParallelism = config.getIntOrDefault(CHANGED_PARTITIONS_READ_PARALLELISM); + this.changeParallelism = config.getIntOrDefault(PARTITION_CHANGE_PARALLELISM); + } + + private List getPartitionsSegment(Segment segment, String tableName) { + try { + List partitions = new ArrayList<>(); + String nextToken = null; + do { + GetPartitionsResponse result = awsGlue.getPartitions(GetPartitionsRequest.builder() + .databaseName(databaseName) + .tableName(tableName) + .segment(segment) + .nextToken(nextToken) + .build()).get(); + partitions.addAll(result.partitions().stream() + .map(p -> new Partition(p.values(), p.storageDescriptor().location())) + .collect(Collectors.toList())); + nextToken = result.nextToken(); + } while (nextToken != null); + return partitions; + } catch (Exception e) { + throw new HoodieGlueSyncException("Failed to get all partitions for table " + tableId(databaseName, tableName), e); + } } @Override public List getAllPartitions(String tableName) { + ExecutorService executorService = Executors.newFixedThreadPool(this.allPartitionsReadParallelism, new 
CustomizedThreadFactory("glue-sync-all-partitions", true)); try { - return getPartitions(GetPartitionsRequest.builder() - .databaseName(databaseName) - .tableName(tableName)); + List segments = new ArrayList<>(); + for (int i = 0; i < allPartitionsReadParallelism; i++) { + segments.add(Segment.builder() + .segmentNumber(i) + .totalSegments(allPartitionsReadParallelism).build()); + } + List>> futures = segments.stream() + .map(segment -> executorService.submit(() -> this.getPartitionsSegment(segment, tableName))) + .collect(Collectors.toList()); + + List partitions = new ArrayList<>(); + for (Future> future : futures) { + partitions.addAll(future.get()); + } + + return partitions; } catch (Exception e) { throw new HoodieGlueSyncException("Failed to get all partitions for table " + tableId(databaseName, tableName), e); + } finally { + executorService.shutdownNow(); } } @Override - public List getPartitionsByFilter(String tableName, String filter) { + public List getPartitionsFromList(String tableName, List partitionList) { + if (partitionList.isEmpty()) { + LOG.info("No partitions to read for " + tableId(this.databaseName, tableName)); + return Collections.emptyList(); + } + HoodieTimer timer = HoodieTimer.start(); + List> batches = CollectionUtils.batches(partitionList, MAX_PARTITIONS_PER_READ_REQUEST); + ExecutorService executorService = Executors.newFixedThreadPool( + Math.min(this.changedPartitionsReadParallelism, batches.size()), + new CustomizedThreadFactory("glue-sync-get-partitions-" + tableName, true) + ); try { - if (filter.length() <= GLUE_EXPRESSION_MAX_CHARS) { - LOG.info("Pushdown filters: {}", filter); - return getPartitions(GetPartitionsRequest.builder() - .databaseName(databaseName) - .tableName(tableName) - .expression(filter)); - } else { - LOG.warn("Falling back to listing all partition since expression filter length > {}", GLUE_EXPRESSION_MAX_CHARS); - return getAllPartitions(tableName); + List>> futures = batches + .stream() + .map(batch -> executorService.submit(() -> this.getChangedPartitions(batch, tableName))) + .collect(Collectors.toList()); + + List partitions = new ArrayList<>(); + for (Future> future : futures) { + partitions.addAll(future.get()); } + LOG.info( + "Requested {} partitions, found existing {} partitions, new {} partitions", + partitionList.size(), + partitions.size(), + partitionList.size() - partitions.size()); + + return partitions; } catch (Exception e) { - throw new HoodieGlueSyncException("Failed to get partitions for table " + tableId(databaseName, tableName) + " from expression: " + filter, e); + throw new HoodieGlueSyncException("Failed to get all partitions for table " + tableId(this.databaseName, tableName), e); + } finally { + executorService.shutdownNow(); + LOG.info("Took {} ms to get {} partitions for table {}", timer.endTimer(), partitionList.size(), tableId(this.databaseName, tableName)); } } - private List getPartitions(GetPartitionsRequest.Builder partitionRequestBuilder) throws InterruptedException, ExecutionException { - List partitions = new ArrayList<>(); - String nextToken = null; - do { - GetPartitionsResponse result = awsGlue.getPartitions(partitionRequestBuilder - .excludeColumnSchema(true) - .nextToken(nextToken) - .build()).get(); - partitions.addAll(result.partitions().stream() - .map(p -> new Partition(p.values(), p.storageDescriptor().location())) - .collect(Collectors.toList())); - nextToken = result.nextToken(); - } while (nextToken != null); - return partitions; + private List getChangedPartitions(List 
changedPartitions, String tableName) throws ExecutionException, InterruptedException { + List partitionValueList = changedPartitions.stream().map(str -> + PartitionValueList.builder().values(partitionValueExtractor.extractPartitionValuesInPath(str)).build() + ).collect(Collectors.toList()); + BatchGetPartitionRequest request = BatchGetPartitionRequest.builder() + .databaseName(this.databaseName) + .tableName(tableName) + .partitionsToGet(partitionValueList) + .build(); + BatchGetPartitionResponse callResult = awsGlue.batchGetPartition(request).get(); + List result = callResult + .partitions() + .stream() + .map(p -> new Partition(p.values(), p.storageDescriptor().location())) + .collect(Collectors.toList()); + + return result; } @Override public void addPartitionsToTable(String tableName, List partitionsToAdd) { - if (partitionsToAdd.isEmpty()) { - LOG.info("No partitions to add for " + tableId(databaseName, tableName)); - return; - } - LOG.info("Adding " + partitionsToAdd.size() + " partition(s) in table " + tableId(databaseName, tableName)); + HoodieTimer timer = HoodieTimer.start(); try { + if (partitionsToAdd.isEmpty()) { + LOG.info("No partitions to add for " + tableId(this.databaseName, tableName)); + return; + } Table table = getTable(awsGlue, databaseName, tableName); + parallelizeChange(partitionsToAdd, this.changeParallelism, partitions -> this.addPartitionsToTableInternal(table, partitions), MAX_PARTITIONS_PER_CHANGE_REQUEST); + } finally { + LOG.info("Added {} partitions to table {} in {} ms", partitionsToAdd.size(), tableId(this.databaseName, tableName), timer.endTimer()); + } + } + + private void parallelizeChange(List items, int parallelism, Consumer> consumer, int sliceSize) { + List> batches = CollectionUtils.batches(items, sliceSize); + ExecutorService executorService = Executors.newFixedThreadPool(Math.min(parallelism, batches.size()), new CustomizedThreadFactory("glue-sync", true)); + try { + List> futures = batches.stream() + .map(item -> executorService.submit(() -> { + consumer.accept(item); + })) + .collect(Collectors.toList()); + for (Future future : futures) { + future.get(); + } + } catch (Exception e) { + throw new HoodieGlueSyncException("Failed to parallelize operation", e); + } finally { + executorService.shutdownNow(); + } + } + + private void addPartitionsToTableInternal(Table table, List partitionsToAdd) { + try { StorageDescriptor sd = table.storageDescriptor(); - List partitionInputs = partitionsToAdd.stream().map(partition -> { + List partitionInputList = partitionsToAdd.stream().map(partition -> { String fullPartitionPath = FSUtils.getPartitionPath(s3aToS3(getBasePath()), partition).toString(); List partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); StorageDescriptor partitionSD = sd.copy(copySd -> copySd.location(fullPartitionPath)); return PartitionInput.builder().values(partitionValues).storageDescriptor(partitionSD).build(); }).collect(Collectors.toList()); - List> futures = new ArrayList<>(); - - for (List batch : CollectionUtils.batches(partitionInputs, MAX_PARTITIONS_PER_REQUEST)) { - BatchCreatePartitionRequest request = BatchCreatePartitionRequest.builder() - .databaseName(databaseName).tableName(tableName).partitionInputList(batch).build(); - futures.add(awsGlue.batchCreatePartition(request)); - } - - for (CompletableFuture future : futures) { - BatchCreatePartitionResponse response = future.get(); - if (CollectionUtils.nonEmpty(response.errors())) { - if (response.errors().stream() - .allMatch( - (error) -> 
"AlreadyExistsException".equals(error.errorDetail().errorCode()))) { - LOG.warn("Partitions already exist in glue: " + response.errors()); - } else { - throw new HoodieGlueSyncException("Fail to add partitions to " + tableId(databaseName, tableName) + BatchCreatePartitionRequest request = BatchCreatePartitionRequest.builder() + .databaseName(databaseName).tableName(table.name()).partitionInputList(partitionInputList).build(); + CompletableFuture future = awsGlue.batchCreatePartition(request); + BatchCreatePartitionResponse response = future.get(); + if (CollectionUtils.nonEmpty(response.errors())) { + if (response.errors().stream() + .allMatch( + (error) -> "AlreadyExistsException".equals(error.errorDetail().errorCode()))) { + LOG.warn("Partitions already exist in glue: " + response.errors()); + } else { + throw new HoodieGlueSyncException("Fail to add partitions to " + tableId(databaseName, table.name()) + " with error(s): " + response.errors()); - } } } } catch (Exception e) { - throw new HoodieGlueSyncException("Fail to add partitions to " + tableId(databaseName, tableName), e); + throw new HoodieGlueSyncException("Fail to add partitions to " + tableId(databaseName, table.name()), e); } } @Override public void updatePartitionsToTable(String tableName, List changedPartitions) { - if (changedPartitions.isEmpty()) { - LOG.info("No partitions to change for " + tableName); - return; - } - LOG.info("Updating " + changedPartitions.size() + "partition(s) in table " + tableId(databaseName, tableName)); + HoodieTimer timer = HoodieTimer.start(); try { + if (changedPartitions.isEmpty()) { + LOG.info("No partitions to update for " + tableId(this.databaseName, tableName)); + return; + } Table table = getTable(awsGlue, databaseName, tableName); + parallelizeChange(changedPartitions, this.changeParallelism, partitions -> this.updatePartitionsToTableInternal(table, partitions), MAX_PARTITIONS_PER_CHANGE_REQUEST); + } finally { + LOG.info("Updated {} partitions to table {} in {} ms", changedPartitions.size(), tableId(this.databaseName, tableName), timer.endTimer()); + } + } + + private void updatePartitionsToTableInternal(Table table, List changedPartitions) { + try { StorageDescriptor sd = table.storageDescriptor(); List updatePartitionEntries = changedPartitions.stream().map(partition -> { String fullPartitionPath = FSUtils.getPartitionPath(s3aToS3(getBasePath()), partition).toString(); @@ -249,57 +355,52 @@ public void updatePartitionsToTable(String tableName, List changedPartit return BatchUpdatePartitionRequestEntry.builder().partitionInput(partitionInput).partitionValueList(partitionValues).build(); }).collect(Collectors.toList()); - List> futures = new ArrayList<>(); - for (List batch : CollectionUtils.batches(updatePartitionEntries, MAX_PARTITIONS_PER_REQUEST)) { - BatchUpdatePartitionRequest request = BatchUpdatePartitionRequest.builder() - .databaseName(databaseName).tableName(tableName).entries(batch).build(); - futures.add(awsGlue.batchUpdatePartition(request)); - } + BatchUpdatePartitionRequest request = BatchUpdatePartitionRequest.builder() + .databaseName(databaseName).tableName(table.name()).entries(updatePartitionEntries).build(); + CompletableFuture future = awsGlue.batchUpdatePartition(request); - for (CompletableFuture future : futures) { - BatchUpdatePartitionResponse response = future.get(); - if (CollectionUtils.nonEmpty(response.errors())) { - throw new HoodieGlueSyncException("Fail to update partitions to " + tableId(databaseName, tableName) - + " with error(s): " + 
response.errors()); - } + BatchUpdatePartitionResponse response = future.get(); + if (CollectionUtils.nonEmpty(response.errors())) { + throw new HoodieGlueSyncException("Fail to update partitions to " + tableId(databaseName, table.name()) + + " with error(s): " + response.errors()); } } catch (Exception e) { - throw new HoodieGlueSyncException("Fail to update partitions to " + tableId(databaseName, tableName), e); + throw new HoodieGlueSyncException("Fail to update partitions to " + tableId(databaseName, table.name()), e); } } @Override public void dropPartitions(String tableName, List partitionsToDrop) { - if (CollectionUtils.isNullOrEmpty(partitionsToDrop)) { - LOG.info("No partitions to drop for " + tableName); - return; - } - LOG.info("Drop " + partitionsToDrop.size() + "partition(s) in table " + tableId(databaseName, tableName)); + HoodieTimer timer = HoodieTimer.start(); try { - List> futures = new ArrayList<>(); - for (List batch : CollectionUtils.batches(partitionsToDrop, MAX_DELETE_PARTITIONS_PER_REQUEST)) { + if (partitionsToDrop.isEmpty()) { + LOG.info("No partitions to drop for " + tableId(this.databaseName, tableName)); + return; + } + parallelizeChange(partitionsToDrop, this.changeParallelism, partitions -> this.dropPartitionsInternal(tableName, partitions), MAX_DELETE_PARTITIONS_PER_REQUEST); + } finally { + LOG.info("Deleted {} partitions to table {} in {} ms", partitionsToDrop.size(), tableId(this.databaseName, tableName), timer.endTimer()); + } + } - List partitionValueLists = batch.stream().map(partition -> { - PartitionValueList partitionValueList = PartitionValueList.builder() - .values(partitionValueExtractor.extractPartitionValuesInPath(partition)) - .build(); - return partitionValueList; - }).collect(Collectors.toList()); + private void dropPartitionsInternal(String tableName, List partitionsToDrop) { + try { + List partitionValueLists = partitionsToDrop.stream().map(partition -> PartitionValueList.builder() + .values(partitionValueExtractor.extractPartitionValuesInPath(partition)) + .build() + ).collect(Collectors.toList()); - BatchDeletePartitionRequest batchDeletePartitionRequest = BatchDeletePartitionRequest.builder() - .databaseName(databaseName) - .tableName(tableName) - .partitionsToDelete(partitionValueLists) - .build(); - futures.add(awsGlue.batchDeletePartition(batchDeletePartitionRequest)); - } + BatchDeletePartitionRequest batchDeletePartitionRequest = BatchDeletePartitionRequest.builder() + .databaseName(databaseName) + .tableName(tableName) + .partitionsToDelete(partitionValueLists) + .build(); + CompletableFuture future = awsGlue.batchDeletePartition(batchDeletePartitionRequest); - for (CompletableFuture future : futures) { - BatchDeletePartitionResponse response = future.get(); - if (CollectionUtils.nonEmpty(response.errors())) { - throw new HoodieGlueSyncException("Fail to drop partitions to " + tableId(databaseName, tableName) - + " with error(s): " + response.errors()); - } + BatchDeletePartitionResponse response = future.get(); + if (CollectionUtils.nonEmpty(response.errors())) { + throw new HoodieGlueSyncException("Fail to drop partitions to " + tableId(databaseName, tableName) + + " with error(s): " + response.errors()); } } catch (Exception e) { throw new HoodieGlueSyncException("Fail to drop partitions to " + tableId(databaseName, tableName), e); diff --git a/hudi-aws/src/main/java/org/apache/hudi/config/GlueCatalogSyncClientConfig.java b/hudi-aws/src/main/java/org/apache/hudi/config/GlueCatalogSyncClientConfig.java index 
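/*
 * A minimal sketch (not part of the patch, illustrative class/variable names) of the
 * segment-parallel listing pattern the new getAllPartitions()/getPartitionsSegment code
 * above relies on, written against the AWS SDK v2 Glue async client. The parallelism
 * value corresponds to the new all_partitions_read_parallelism config (bounded to 1..10)
 * introduced just below, and the same fixed-thread-pool-plus-batching idea drives the
 * add/update/drop paths via parallelizeChange().
 */
import software.amazon.awssdk.services.glue.GlueAsyncClient;
import software.amazon.awssdk.services.glue.model.GetPartitionsRequest;
import software.amazon.awssdk.services.glue.model.GetPartitionsResponse;
import software.amazon.awssdk.services.glue.model.Partition;
import software.amazon.awssdk.services.glue.model.Segment;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class GlueSegmentedListingSketch {

  // Lists one segment of a table's partitions, paging through nextToken as before.
  static List<Partition> listSegment(GlueAsyncClient glue, String db, String table, Segment segment) throws Exception {
    List<Partition> partitions = new ArrayList<>();
    String nextToken = null;
    do {
      GetPartitionsResponse resp = glue.getPartitions(GetPartitionsRequest.builder()
          .databaseName(db)
          .tableName(table)
          .segment(segment)
          .excludeColumnSchema(true)
          .nextToken(nextToken)
          .build()).get();
      partitions.addAll(resp.partitions());
      nextToken = resp.nextToken();
    } while (nextToken != null);
    return partitions;
  }

  // Fans the full listing out over N disjoint segments, mirroring getAllPartitions() above.
  static List<Partition> listAllPartitions(GlueAsyncClient glue, String db, String table, int parallelism) throws Exception {
    ExecutorService pool = Executors.newFixedThreadPool(parallelism);
    try {
      List<Future<List<Partition>>> futures = new ArrayList<>();
      for (int i = 0; i < parallelism; i++) {
        Segment segment = Segment.builder().segmentNumber(i).totalSegments(parallelism).build();
        futures.add(pool.submit(() -> listSegment(glue, db, table, segment)));
      }
      List<Partition> all = new ArrayList<>();
      for (Future<List<Partition>> f : futures) {
        all.addAll(f.get());
      }
      return all;
    } finally {
      pool.shutdownNow();
    }
  }
}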
21244e651547..0f6ac76a166e 100644 --- a/hudi-aws/src/main/java/org/apache/hudi/config/GlueCatalogSyncClientConfig.java +++ b/hudi-aws/src/main/java/org/apache/hudi/config/GlueCatalogSyncClientConfig.java @@ -26,6 +26,8 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import java.util.stream.IntStream; + /** * Hoodie Configs for Glue. */ @@ -43,6 +45,28 @@ public class GlueCatalogSyncClientConfig extends HoodieConfig { .sinceVersion("0.14.0") .withDocumentation("Glue catalog sync based client will skip archiving the table version if this config is set to true"); + public static final ConfigProperty ALL_PARTITIONS_READ_PARALLELISM = ConfigProperty + .key(GLUE_CLIENT_PROPERTY_PREFIX + "all_partitions_read_parallelism") + .defaultValue(1) + .markAdvanced() + .withValidValues(IntStream.rangeClosed(1, 10).mapToObj(Integer::toString).toArray(String[]::new)) + .sinceVersion("1.0.0") + .withDocumentation("Parallelism for listing all partitions(first time sync). Should be in interval [1, 10]."); + + public static final ConfigProperty CHANGED_PARTITIONS_READ_PARALLELISM = ConfigProperty + .key(GLUE_CLIENT_PROPERTY_PREFIX + "changed_partitions_read_parallelism") + .defaultValue(1) + .markAdvanced() + .sinceVersion("1.0.0") + .withDocumentation("Parallelism for listing changed partitions(second and subsequent syncs)."); + + public static final ConfigProperty PARTITION_CHANGE_PARALLELISM = ConfigProperty + .key(GLUE_CLIENT_PROPERTY_PREFIX + "partition_change_parallelism") + .defaultValue(1) + .markAdvanced() + .sinceVersion("1.0.0") + .withDocumentation("Parallelism for change operations - such as create/update/delete."); + public static final ConfigProperty GLUE_METADATA_FILE_LISTING = ConfigProperty .key(GLUE_CLIENT_PROPERTY_PREFIX + "metadata_file_listing") .defaultValue(false) diff --git a/hudi-aws/src/test/java/org/apache/hudi/aws/sync/ITTestGluePartitionPushdown.java b/hudi-aws/src/test/java/org/apache/hudi/aws/sync/ITTestGluePartitionPushdown.java index b0aa34bdfce1..9601482b65af 100644 --- a/hudi-aws/src/test/java/org/apache/hudi/aws/sync/ITTestGluePartitionPushdown.java +++ b/hudi-aws/src/test/java/org/apache/hudi/aws/sync/ITTestGluePartitionPushdown.java @@ -31,6 +31,7 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import software.amazon.awssdk.services.glue.model.Column; import software.amazon.awssdk.services.glue.model.CreateDatabaseRequest; @@ -47,15 +48,14 @@ import java.io.IOException; import java.nio.file.Files; import java.time.Instant; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.concurrent.ExecutionException; -import static org.apache.hudi.hive.HiveSyncConfig.HIVE_SYNC_FILTER_PUSHDOWN_MAX_SIZE; import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH; import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME; +@Disabled("HUDI-7475 The tests do not work. 
Disabling them to unblock Azure CI") public class ITTestGluePartitionPushdown { private static final String MOTO_ENDPOINT = "http://localhost:5000"; @@ -125,35 +125,14 @@ private void createPartitions(String...partitions) throws ExecutionException, In @Test public void testEmptyPartitionShouldReturnEmpty() { - Assertions.assertEquals(0, glueSync.getPartitionsByFilter(TABLE_NAME, - glueSync.generatePushDownFilter(Arrays.asList("1/bar"), partitionsFieldSchema)).size()); + Assertions.assertEquals(0, glueSync.getPartitionsFromList(TABLE_NAME, + Arrays.asList("1/bar")).size()); } @Test public void testPresentPartitionShouldReturnIt() throws ExecutionException, InterruptedException { createPartitions("1", "b'ar"); - Assertions.assertEquals(1, glueSync.getPartitionsByFilter(TABLE_NAME, - glueSync.generatePushDownFilter(Arrays.asList("1/b'ar", "2/foo", "1/b''ar"), partitionsFieldSchema)).size()); - } - - @Test - public void testPresentPartitionShouldReturnAllWhenExpressionFilterLengthTooLong() throws ExecutionException, InterruptedException { - createPartitions("1", "b'ar"); - - // this will generate an expression larger than GLUE_EXPRESSION_MAX_CHARS - List tooLargePartitionPredicate = new ArrayList<>(); - for (int i = 0; i < 500; i++) { - tooLargePartitionPredicate.add(i + "/foo"); - } - Assertions.assertEquals(1, glueSync.getPartitionsByFilter(TABLE_NAME, - glueSync.generatePushDownFilter(tooLargePartitionPredicate, partitionsFieldSchema)).size(), - "Should fallback to listing all existing partitions"); - - // now set the pushdown max size to a low value to transform the expression in lower/upper bound - hiveSyncProps.setProperty(HIVE_SYNC_FILTER_PUSHDOWN_MAX_SIZE.key(), "10"); - glueSync = new AWSGlueCatalogSyncClient(new HiveSyncConfig(hiveSyncProps)); - Assertions.assertEquals(0, glueSync.getPartitionsByFilter(TABLE_NAME, - glueSync.generatePushDownFilter(tooLargePartitionPredicate, partitionsFieldSchema)).size(), - "No partitions should match"); + Assertions.assertEquals(1, glueSync.getPartitionsFromList(TABLE_NAME, + Arrays.asList("1/b'ar", "2/foo", "1/b''ar")).size()); } } diff --git a/hudi-aws/src/test/java/org/apache/hudi/aws/transaction/integ/ITTestDynamoDBBasedLockProvider.java b/hudi-aws/src/test/java/org/apache/hudi/aws/transaction/integ/ITTestDynamoDBBasedLockProvider.java index 473861712595..b874f4f3c3cc 100644 --- a/hudi-aws/src/test/java/org/apache/hudi/aws/transaction/integ/ITTestDynamoDBBasedLockProvider.java +++ b/hudi-aws/src/test/java/org/apache/hudi/aws/transaction/integ/ITTestDynamoDBBasedLockProvider.java @@ -18,6 +18,7 @@ package org.apache.hudi.aws.transaction.integ; +import org.junit.jupiter.api.Disabled; import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; @@ -44,6 +45,7 @@ * Test for {@link DynamoDBBasedLockProvider}. * Set it as integration test because it requires setting up docker environment. */ +@Disabled("HUDI-7475 The tests do not work. 
Disabling them to unblock Azure CI") public class ITTestDynamoDBBasedLockProvider { private static LockConfiguration lockConfiguration; diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieCLI.java b/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieCLI.java index 79adad368450..46a1b715ca3c 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieCLI.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/HoodieCLI.java @@ -39,6 +39,7 @@ public class HoodieCLI { public static Configuration conf; public static ConsistencyGuardConfig consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().build(); + public static HoodieTimeGeneratorConfig timeGeneratorConfig; public static FileSystem fs; public static CLIState state = CLIState.INIT; public static String basePath; @@ -58,6 +59,10 @@ public static void setConsistencyGuardConfig(ConsistencyGuardConfig config) { consistencyGuardConfig = config; } + public static void setTimeGeneratorConfig(HoodieTimeGeneratorConfig config) { + timeGeneratorConfig = config; + } + private static void setTableMetaClient(HoodieTableMetaClient tableMetadata) { HoodieCLI.tableMetadata = tableMetadata; } @@ -88,8 +93,7 @@ public static void initFS(boolean force) throws IOException { public static void refreshTableMetadata() { setTableMetaClient(HoodieTableMetaClient.builder().setConf(HoodieCLI.conf).setBasePath(basePath).setLoadActiveTimelineOnLoad(false) .setConsistencyGuardConfig(HoodieCLI.consistencyGuardConfig) - // TODO [HUDI-6884] Generate HoodieTimeGeneratorConfig from props user set - .setTimeGeneratorConfig(HoodieTimeGeneratorConfig.defaultConfig(basePath)) + .setTimeGeneratorConfig(timeGeneratorConfig == null ? HoodieTimeGeneratorConfig.defaultConfig(basePath) : timeGeneratorConfig) .setLayoutVersion(Option.of(layoutVersion)).build()); } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MetadataCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MetadataCommand.java index d106d8375e7a..7cebf43db029 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MetadataCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MetadataCommand.java @@ -151,7 +151,7 @@ public String delete(@ShellOption(value = "--backup", help = "Backup the metadat public String deleteRecordIndex(@ShellOption(value = "--backup", help = "Backup the record index before delete", defaultValue = "true", arity = 1) final boolean backup) throws Exception { HoodieTableMetaClient dataMetaClient = HoodieCLI.getTableMetaClient(); String backupPath = HoodieTableMetadataUtil.deleteMetadataTablePartition(dataMetaClient, new HoodieSparkEngineContext(jsc), - MetadataPartitionType.RECORD_INDEX, backup); + MetadataPartitionType.RECORD_INDEX.getPartitionPath(), backup); if (backup) { return "Record Index has been deleted from the Metadata Table and backed up to " + backupPath; } else { diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TableCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TableCommand.java index 158c79f52a74..e3c3e810c5cb 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TableCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/TableCommand.java @@ -22,6 +22,7 @@ import org.apache.hudi.cli.HoodiePrintHelper; import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.TableHeader; +import org.apache.hudi.common.config.HoodieTimeGeneratorConfig; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import 
org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableConfig; @@ -30,6 +31,7 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimeGeneratorType; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.exception.HoodieException; @@ -85,13 +87,25 @@ public String connect( @ShellOption(value = {"--maxWaitIntervalMs"}, defaultValue = "300000", help = "Max wait time for eventual consistency") final Integer maxConsistencyIntervalMs, @ShellOption(value = {"--maxCheckIntervalMs"}, defaultValue = "7", - help = "Max checks for eventual consistency") final Integer maxConsistencyChecks) + help = "Max checks for eventual consistency") final Integer maxConsistencyChecks, + @ShellOption(value = {"--timeGeneratorType"}, defaultValue = "WAIT_TO_ADJUST_SKEW", + help = "Time generator type, which is used to generate globally monotonically increasing timestamp") final String timeGeneratorType, + @ShellOption(value = {"--maxExpectedClockSkewMs"}, defaultValue = "200", + help = "The max expected clock skew time for WaitBasedTimeGenerator in ms") final Long maxExpectedClockSkewMs, + @ShellOption(value = {"--useDefaultLockProvider"}, defaultValue = "false", + help = "Use org.apache.hudi.client.transaction.lock.InProcessLockProvider") final boolean useDefaultLockProvider) throws IOException { HoodieCLI .setConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(eventuallyConsistent) .withInitialConsistencyCheckIntervalMs(initialConsistencyIntervalMs) .withMaxConsistencyCheckIntervalMs(maxConsistencyIntervalMs).withMaxConsistencyChecks(maxConsistencyChecks) .build()); + HoodieCLI + .setTimeGeneratorConfig(HoodieTimeGeneratorConfig.newBuilder().withPath(path) + .withTimeGeneratorType(TimeGeneratorType.valueOf(timeGeneratorType)) + .withMaxExpectedClockSkewMs(maxExpectedClockSkewMs) + .withDefaultLockProvider(useDefaultLockProvider) + .build()); HoodieCLI.initConf(); HoodieCLI.connectTo(path, layoutVersion); HoodieCLI.initFS(true); @@ -144,7 +158,8 @@ public String createTable( .setTimelineLayoutVersion(layoutVersion) .initTable(HoodieCLI.conf, path); // Now connect to ensure loading works - return connect(path, layoutVersion, false, 0, 0, 0); + return connect(path, layoutVersion, false, 0, 0, 0, + "WAIT_TO_ADJUST_SKEW", 200L, true); } /** diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestFileSystemViewCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestFileSystemViewCommand.java index 80cf9cd34ed3..7bafe2f432f9 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestFileSystemViewCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestFileSystemViewCommand.java @@ -101,11 +101,11 @@ private void createNonpartitionedTable() throws IOException { // Write date files and log file String testWriteToken = "2-0-2"; Files.createFile(Paths.get(nonpartitionedTablePath, FSUtils - .makeBaseFileName(commitTime1, testWriteToken, fileId1))); + .makeBaseFileName(commitTime1, testWriteToken, fileId1, BASE_FILE_EXTENSION))); Files.createFile(Paths.get(nonpartitionedTablePath, FSUtils .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0, testWriteToken))); Files.createFile(Paths.get(nonpartitionedTablePath, FSUtils - 
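/*
 * A minimal sketch (illustrative class/method names, not part of the patch) of the time
 * generator wiring that the new --timeGeneratorType, --maxExpectedClockSkewMs and
 * --useDefaultLockProvider options feed into HoodieCLI.setTimeGeneratorConfig. The values
 * shown are the CLI defaults from the patch above.
 */
import org.apache.hudi.common.config.HoodieTimeGeneratorConfig;
import org.apache.hudi.common.table.timeline.TimeGeneratorType;

public class CliTimeGeneratorSketch {

  // Builds the config that TableCommand#connect now hands to HoodieCLI before the
  // table meta client is refreshed.
  static HoodieTimeGeneratorConfig defaultCliTimeGeneratorConfig(String basePath) {
    return HoodieTimeGeneratorConfig.newBuilder()
        .withPath(basePath)
        .withTimeGeneratorType(TimeGeneratorType.valueOf("WAIT_TO_ADJUST_SKEW"))
        .withMaxExpectedClockSkewMs(200L)
        .withDefaultLockProvider(true) // resolves to InProcessLockProvider per the TestTableCommand assertions
        .build();
  }
}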
.makeBaseFileName(commitTime2, testWriteToken, fileId1))); + .makeBaseFileName(commitTime2, testWriteToken, fileId1, BASE_FILE_EXTENSION))); Files.createFile(Paths.get(nonpartitionedTablePath, FSUtils .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime2, 0, testWriteToken))); @@ -144,11 +144,11 @@ private void createPartitionedTable() throws IOException { // Write date files and log file String testWriteToken = "1-0-1"; Files.createFile(Paths.get(fullPartitionPath, FSUtils - .makeBaseFileName(commitTime1, testWriteToken, fileId1))); + .makeBaseFileName(commitTime1, testWriteToken, fileId1, BASE_FILE_EXTENSION))); Files.createFile(Paths.get(fullPartitionPath, FSUtils .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0, testWriteToken))); Files.createFile(Paths.get(fullPartitionPath, FSUtils - .makeBaseFileName(commitTime2, testWriteToken, fileId1))); + .makeBaseFileName(commitTime2, testWriteToken, fileId1, BASE_FILE_EXTENSION))); Files.createFile(Paths.get(fullPartitionPath, FSUtils .makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime2, 0, testWriteToken))); @@ -323,7 +323,8 @@ public void testShowLatestFileSlices() throws IOException { .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_FILES_UNSCHEDULED); // Test show with partition path '2016/03/15' - new TableCommand().connect(partitionedTablePath, null, false, 0, 0, 0); + new TableCommand().connect(partitionedTablePath, null, false, 0, 0, 0, + "WAIT_TO_ADJUST_SKEW", 200L, false); Object partitionedTable = shell.evaluate(() -> "show fsview latest --partitionPath " + partitionPath); assertTrue(ShellEvaluationResultUtil.isSuccess(partitionedTable)); @@ -336,7 +337,8 @@ public void testShowLatestFileSlices() throws IOException { assertEquals(partitionedExpected, partitionedResults); // Test show for non-partitioned table - new TableCommand().connect(nonpartitionedTablePath, null, false, 0, 0, 0); + new TableCommand().connect(nonpartitionedTablePath, null, false, 0, 0, 0, + "WAIT_TO_ADJUST_SKEW", 200L, false); Object nonpartitionedTable = shell.evaluate(() -> "show fsview latest"); assertTrue(ShellEvaluationResultUtil.isSuccess(nonpartitionedTable)); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestMetadataCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestMetadataCommand.java index 3214bb2cfccd..5dbc1c59b9b5 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestMetadataCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestMetadataCommand.java @@ -96,7 +96,8 @@ public void testMetadataDelete() throws Exception { HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf()).setBasePath(tablePath).build(); assertFalse(metaClient.getTableConfig().getMetadataPartitions().isEmpty()); - new TableCommand().connect(tablePath, null, false, 0, 0, 0); + new TableCommand().connect(tablePath, null, false, 0, 0, 0, + "WAIT_TO_ADJUST_SKEW", 200L, false); Object result = shell.evaluate(() -> "metadata delete"); assertTrue(ShellEvaluationResultUtil.isSuccess(result)); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTableCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTableCommand.java index 5b6abf25f60d..8f6442379661 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTableCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestTableCommand.java @@ -23,12 +23,14 @@ import 
org.apache.hudi.cli.functional.CLIFunctionalTestHarness; import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; import org.apache.hudi.cli.testutils.ShellEvaluationResultUtil; +import org.apache.hudi.common.config.HoodieTimeGeneratorConfig; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimeGeneratorType; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.util.Option; @@ -105,7 +107,7 @@ public void testConnectTable() { // Test connect with specified values Object result = shell.evaluate(() -> "connect --path " + tablePath + " --initialCheckIntervalMs 3000 " - + "--maxWaitIntervalMs 40000 --maxCheckIntervalMs 8"); + + "--maxWaitIntervalMs 40000 --maxCheckIntervalMs 8 --maxExpectedClockSkewMs 888 --useDefaultLockProvider true"); assertTrue(ShellEvaluationResultUtil.isSuccess(result)); // Check specified values @@ -113,10 +115,16 @@ public void testConnectTable() { assertEquals(3000, conf.getInitialConsistencyCheckIntervalMs()); assertEquals(40000, conf.getMaxConsistencyCheckIntervalMs()); assertEquals(8, conf.getMaxConsistencyChecks()); + HoodieTimeGeneratorConfig timeGeneratorConfig = HoodieCLI.timeGeneratorConfig; + assertEquals(tablePath, timeGeneratorConfig.getBasePath()); + assertEquals(888L, timeGeneratorConfig.getMaxExpectedClockSkewMs()); + assertEquals("org.apache.hudi.client.transaction.lock.InProcessLockProvider", + timeGeneratorConfig.getLockConfiguration().getConfig().getString(HoodieTimeGeneratorConfig.LOCK_PROVIDER_KEY)); // Check default values assertFalse(conf.isConsistencyCheckEnabled()); assertEquals(new Integer(1), HoodieCLI.layoutVersion.getVersion()); + assertEquals(TimeGeneratorType.valueOf("WAIT_TO_ADJUST_SKEW"), timeGeneratorConfig.getTimeGeneratorType()); } /** @@ -134,6 +142,13 @@ public void testDefaultCreate() { assertEquals(metaPath, client.getMetaPath()); assertEquals(HoodieTableType.COPY_ON_WRITE, client.getTableType()); assertEquals(new Integer(1), client.getTimelineLayoutVersion().getVersion()); + + HoodieTimeGeneratorConfig timeGeneratorConfig = HoodieCLI.timeGeneratorConfig; + assertEquals(tablePath, timeGeneratorConfig.getBasePath()); + assertEquals(200L, timeGeneratorConfig.getMaxExpectedClockSkewMs()); + assertEquals("org.apache.hudi.client.transaction.lock.InProcessLockProvider", + timeGeneratorConfig.getLockConfiguration().getConfig().getString(HoodieTimeGeneratorConfig.LOCK_PROVIDER_KEY)); + assertEquals(TimeGeneratorType.valueOf("WAIT_TO_ADJUST_SKEW"), timeGeneratorConfig.getTimeGeneratorType()); } /** diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/functional/CLIFunctionalTestHarness.java b/hudi-cli/src/test/java/org/apache/hudi/cli/functional/CLIFunctionalTestHarness.java index 6d6335ab0fb1..7c72417504bc 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/functional/CLIFunctionalTestHarness.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/functional/CLIFunctionalTestHarness.java @@ -21,6 +21,7 @@ import org.apache.hudi.client.SparkRDDReadClient; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import 
org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.testutils.HoodieClientTestUtils; import org.apache.hudi.testutils.providers.SparkProvider; @@ -40,6 +41,8 @@ public class CLIFunctionalTestHarness implements SparkProvider { + protected static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); + protected static int timelineServicePort = FileSystemViewStorageConfig.REMOTE_PORT_NUM.defaultValue(); protected static transient TimelineService timelineService; diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java index 2d73eb02e46d..271885df9bce 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java @@ -91,7 +91,8 @@ public void testBootstrapRunCommand() throws IOException { assertTrue(ShellEvaluationResultUtil.isSuccess(resultForBootstrapRun)); // Connect & check Hudi table exist - new TableCommand().connect(tablePath, TimelineLayoutVersion.VERSION_1, false, 2000, 300000, 7); + new TableCommand().connect(tablePath, TimelineLayoutVersion.VERSION_1, false, 2000, 300000, 7, + "WAIT_TO_ADJUST_SKEW", 200L, false); metaClient = HoodieCLI.getTableMetaClient(); assertEquals(1, metaClient.getActiveTimeline().getCommitsTimeline().countInstants(), "Should have 1 commit."); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestHDFSParquetImportCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestHDFSParquetImportCommand.java index 3575b85344e0..7786a57896e4 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestHDFSParquetImportCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestHDFSParquetImportCommand.java @@ -113,7 +113,8 @@ public void testConvertWithInsert() throws IOException { assertTrue(Files.exists(Paths.get(metaPath)), "Hoodie table not exist."); // Load meta data - new TableCommand().connect(targetPath.toString(), TimelineLayoutVersion.VERSION_1, false, 2000, 300000, 7); + new TableCommand().connect(targetPath.toString(), TimelineLayoutVersion.VERSION_1, false, 2000, 300000, 7, + "WAIT_TO_ADJUST_SKEW", 200L, false); metaClient = HoodieCLI.getTableMetaClient(); assertEquals(1, metaClient.getActiveTimeline().getCommitsTimeline().countInstants(), "Should only 1 commit."); @@ -137,7 +138,8 @@ public void testConvertWithUpsert() throws IOException, ParseException { dataImporter.dataImport(jsc, 0); // Load meta data - new TableCommand().connect(targetPath.toString(), TimelineLayoutVersion.VERSION_1, false, 2000, 300000, 7); + new TableCommand().connect(targetPath.toString(), TimelineLayoutVersion.VERSION_1, false, 2000, 300000, 7, + "WAIT_TO_ADJUST_SKEW", 200L, false); metaClient = HoodieCLI.getTableMetaClient(); // check if insert instant exist diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java index a95ed9ff7787..d7e40c8f2f65 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java @@ -320,7 +320,8 @@ public void testDeduplicateWithReal(HoodieTableType tableType) throws IOExceptio } private void connectTableAndReloadMetaClient(String 
tablePath) throws IOException { - new TableCommand().connect(tablePath, TimelineLayoutVersion.VERSION_1, false, 0, 0, 0); + new TableCommand().connect(tablePath, TimelineLayoutVersion.VERSION_1, false, 0, 0, 0, + "WAIT_TO_ADJUST_SKEW", 200L, false); metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); } diff --git a/hudi-client/hudi-client-common/pom.xml b/hudi-client/hudi-client-common/pom.xml index 67705edb316f..9a4a7f2104b9 100644 --- a/hudi-client/hudi-client-common/pom.xml +++ b/hudi-client/hudi-client-common/pom.xml @@ -120,6 +120,16 @@ io.prometheus simpleclient_pushgateway + + com.uber.m3 + tally-m3 + ${tally.version} + + + com.uber.m3 + tally-core + ${tally.version} + @@ -236,9 +246,6 @@ src/main/resources - - src/test/resources - diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieClient.java index a88030c9a2c1..009fbf140f04 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieClient.java @@ -24,6 +24,7 @@ import org.apache.hudi.client.heartbeat.HoodieHeartbeatClient; import org.apache.hudi.client.transaction.TransactionManager; import org.apache.hudi.client.utils.TransactionUtils; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieWriteStat; @@ -41,6 +42,7 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieWriteConflictException; import org.apache.hudi.hadoop.fs.HadoopFSUtils; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.hudi.metrics.HoodieMetrics; import org.apache.hudi.table.HoodieTable; @@ -258,7 +260,7 @@ protected void finalizeWrite(HoodieTable table, String instantTime, List durationInMs = Option.of(metrics.getDurationInMs(finalizeCtx.stop())); durationInMs.ifPresent(duration -> { - LOG.info("Finalize write elapsed time (milliseconds): " + duration); + LOG.info("Finalize write elapsed time (milliseconds): {}", duration); metrics.updateFinalizeWriteMetrics(duration, stats.size()); }); } @@ -266,4 +268,28 @@ protected void finalizeWrite(HoodieTable table, String instantTime, List writeStatuses) { + context.setJobStatus(this.getClass().getSimpleName(), "Committing to metadata table: " + config.getTableName()); + Option metadataWriterOpt = table.getMetadataWriter(instantTime); + if (metadataWriterOpt.isPresent()) { + try (HoodieTableMetadataWriter metadataWriter = metadataWriterOpt.get()) { + metadataWriter.updateFromWriteStatuses(metadata, writeStatuses, instantTime); + } catch (Exception e) { + if (e instanceof HoodieException) { + throw (HoodieException) e; + } else { + throw new HoodieException("Failed to update metadata", e); + } + } + } + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieTableServiceClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieTableServiceClient.java index 967aaa4f68e4..c472be33f3d3 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieTableServiceClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieTableServiceClient.java @@ -58,7 +58,6 @@ import 
org.apache.hudi.exception.HoodieLogCompactException; import org.apache.hudi.exception.HoodieRollbackException; import org.apache.hudi.metadata.HoodieTableMetadataUtil; -import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.compact.CompactHelpers; @@ -248,7 +247,7 @@ protected Option inlineLogCompact(Option> extraMetad protected void runAnyPendingCompactions(HoodieTable table) { table.getActiveTimeline().getWriteTimeline().filterPendingCompactionTimeline().getInstants() .forEach(instant -> { - LOG.info("Running previously failed inflight compaction at instant " + instant); + LOG.info("Running previously failed inflight compaction at instant {}", instant); compact(instant.getTimestamp(), true); }); } @@ -256,7 +255,7 @@ protected void runAnyPendingCompactions(HoodieTable table) { protected void runAnyPendingLogCompactions(HoodieTable table) { table.getActiveTimeline().getWriteTimeline().filterPendingLogCompactionTimeline().getInstantsAsStream() .forEach(instant -> { - LOG.info("Running previously failed inflight log compaction at instant " + instant); + LOG.info("Running previously failed inflight log compaction at instant {}", instant); logCompact(instant.getTimestamp(), true); }); } @@ -329,7 +328,7 @@ protected void completeCompaction(HoodieCommitMetadata metadata, HoodieTable tab finalizeWrite(table, compactionCommitTime, writeStats); // commit to data table after committing to metadata table. writeTableMetadata(table, compactionCommitTime, metadata, context.emptyHoodieData()); - LOG.info("Committing Compaction " + compactionCommitTime + ". Finished with result " + metadata); + LOG.info("Committing Compaction {}. Finished with result {}", compactionCommitTime, metadata); CompactHelpers.getInstance().completeInflightCompaction(table, compactionCommitTime, metadata); } finally { this.txnManager.endTransaction(Option.of(compactionInstant)); @@ -342,7 +341,7 @@ protected void completeCompaction(HoodieCommitMetadata metadata, HoodieTable tab metrics.updateCommitMetrics(parsedInstant.getTime(), durationInMs, metadata, COMPACTION_ACTION) ); } - LOG.info("Compacted successfully on commit " + compactionCommitTime); + LOG.info("Compacted successfully on commit {}", compactionCommitTime); } /** @@ -389,7 +388,7 @@ protected void completeLogCompaction(HoodieCommitMetadata metadata, HoodieTable finalizeWrite(table, logCompactionCommitTime, writeStats); // commit to data table after committing to metadata table. writeTableMetadata(table, logCompactionCommitTime, metadata, context.emptyHoodieData()); - LOG.info("Committing Log Compaction " + logCompactionCommitTime + ". Finished with result " + metadata); + LOG.info("Committing Log Compaction {}. 
Finished with result {}", logCompactionCommitTime, metadata); CompactHelpers.getInstance().completeInflightLogCompaction(table, logCompactionCommitTime, metadata); } finally { this.txnManager.endTransaction(Option.of(logCompactionInstant)); @@ -402,7 +401,7 @@ protected void completeLogCompaction(HoodieCommitMetadata metadata, HoodieTable metrics.updateCommitMetrics(parsedInstant.getTime(), durationInMs, metadata, HoodieActiveTimeline.LOG_COMPACTION_ACTION) ); } - LOG.info("Log Compacted successfully on commit " + logCompactionCommitTime); + LOG.info("Log Compacted successfully on commit {}", logCompactionCommitTime); } /** @@ -450,7 +449,7 @@ public HoodieWriteMetadata cluster(String clusteringInstant, boolean shouldCo table.getMetaClient().reloadActiveTimeline(); } clusteringTimer = metrics.getClusteringCtx(); - LOG.info("Starting clustering at " + clusteringInstant); + LOG.info("Starting clustering at {}", clusteringInstant); HoodieWriteMetadata writeMetadata = table.cluster(context, clusteringInstant); HoodieWriteMetadata clusteringMetadata = convertToOutputMetadata(writeMetadata); // Validation has to be done after cloning. if not, it could result in referencing the write status twice which means clustering could get executed twice. @@ -484,6 +483,17 @@ public boolean purgePendingClustering(String clusteringInstant) { return false; } + /** + * Delete expired partition by config. + * + * @param instantTime Instant Time for the action + * @return HoodieWriteMetadata + */ + public HoodieWriteMetadata managePartitionTTL(String instantTime) { + HoodieTable table = createTable(config, context.getHadoopConf().get()); + return table.managePartitionTTL(context, instantTime); + } + protected abstract void validateClusteringCommit(HoodieWriteMetadata clusteringMetadata, String clusteringCommitTime, HoodieTable table); protected abstract HoodieWriteMetadata convertToOutputMetadata(HoodieWriteMetadata writeMetadata); @@ -509,7 +519,7 @@ private void completeClustering(HoodieReplaceCommitMetadata metadata, // Update table's metadata (table) writeTableMetadata(table, clusteringInstant.getTimestamp(), metadata, writeStatuses.orElseGet(context::emptyHoodieData)); - LOG.info("Committing Clustering " + clusteringCommitTime + ". Finished with result " + metadata); + LOG.info("Committing Clustering {}. 
Finished with result {}", clusteringCommitTime, metadata); table.getActiveTimeline().transitionReplaceInflightToComplete( false, @@ -528,7 +538,7 @@ private void completeClustering(HoodieReplaceCommitMetadata metadata, metrics.updateCommitMetrics(parsedInstant.getTime(), durationInMs, metadata, HoodieActiveTimeline.REPLACE_COMMIT_ACTION) ); } - LOG.info("Clustering successfully on commit " + clusteringCommitTime); + LOG.info("Clustering successfully on commit {}", clusteringCommitTime); } protected void runTableServicesInline(HoodieTable table, HoodieCommitMetadata metadata, Option> extraMetadata) { @@ -583,6 +593,12 @@ protected void runTableServicesInline(HoodieTable table, HoodieCommitMetadata me metadata.addMetadata(HoodieClusteringConfig.SCHEDULE_INLINE_CLUSTERING.key(), "true"); inlineScheduleClustering(extraMetadata); } + + // Do an inline partition ttl management if enabled + if (config.isInlinePartitionTTLEnable()) { + String instantTime = createNewInstantTime(); + table.managePartitionTTL(table.getContext(), instantTime); + } } /** @@ -599,7 +615,7 @@ public Option scheduleTableService(String instantTime, Option scheduleTableServiceInternal(String instantTime, Option LOG.info("Scheduling archiving is not supported. Skipping."); break; case CLUSTER: - LOG.info("Scheduling clustering at instant time :" + instantTime); + LOG.info("Scheduling clustering at instant time: {}", instantTime); Option clusteringPlan = table .scheduleClustering(context, instantTime, extraMetadata); option = clusteringPlan.isPresent() ? Option.of(instantTime) : Option.empty(); break; case COMPACT: - LOG.info("Scheduling compaction at instant time :" + instantTime); + LOG.info("Scheduling compaction at instant time: {}", instantTime); Option compactionPlan = table .scheduleCompaction(context, instantTime, extraMetadata); option = compactionPlan.isPresent() ? Option.of(instantTime) : Option.empty(); break; case LOG_COMPACT: - LOG.info("Scheduling log compaction at instant time :" + instantTime); + LOG.info("Scheduling log compaction at instant time: {}", instantTime); Option logCompactionPlan = table .scheduleLogCompaction(context, instantTime, extraMetadata); option = logCompactionPlan.isPresent() ? Option.of(instantTime) : Option.empty(); break; case CLEAN: - LOG.info("Scheduling cleaning at instant time :" + instantTime); + LOG.info("Scheduling cleaning at instant time: {}", instantTime); Option cleanerPlan = table .scheduleCleaning(context, instantTime, extraMetadata); option = cleanerPlan.isPresent() ? Option.of(instantTime) : Option.empty(); @@ -649,7 +665,7 @@ protected Option scheduleTableServiceInternal(String instantTime, Option Option instantRange = delegateToTableServiceManager(tableServiceType, table); if (instantRange.isPresent()) { - LOG.info("Delegate instant [" + instantRange.get() + "] to table service manager"); + LOG.info("Delegate instant [{}] to table service manager", instantRange.get()); } return option; @@ -693,36 +709,12 @@ protected void runAnyPendingClustering(HoodieTable table) { table.getActiveTimeline().filterPendingReplaceTimeline().getInstants().forEach(instant -> { Option> instantPlan = ClusteringUtils.getClusteringPlan(table.getMetaClient(), instant); if (instantPlan.isPresent()) { - LOG.info("Running pending clustering at instant " + instantPlan.get().getLeft()); + LOG.info("Running pending clustering at instant {}", instantPlan.get().getLeft()); cluster(instant.getTimestamp(), true); } }); } - /** - * Write the HoodieCommitMetadata to metadata table if available. 
- * - * @param table {@link HoodieTable} of interest. - * @param instantTime instant time of the commit. - * @param metadata instance of {@link HoodieCommitMetadata}. - * @param writeStatuses Write statuses of the commit - */ - protected void writeTableMetadata(HoodieTable table, String instantTime, HoodieCommitMetadata metadata, HoodieData writeStatuses) { - context.setJobStatus(this.getClass().getSimpleName(), "Committing to metadata table: " + config.getTableName()); - Option metadataWriterOpt = table.getMetadataWriter(instantTime); - if (metadataWriterOpt.isPresent()) { - try (HoodieTableMetadataWriter metadataWriter = metadataWriterOpt.get()) { - metadataWriter.updateFromWriteStatuses(metadata, writeStatuses, instantTime); - } catch (Exception e) { - if (e instanceof HoodieException) { - throw (HoodieException) e; - } else { - throw new HoodieException("Failed to update metadata", e); - } - } - } - } - /** * Clean up any stale/old files/data lying around (either on file storage or index storage) based on the * configurations and CleaningPolicy used. (typically files that no longer can be used by a running query can be diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java index 9b69d819e712..9ade694d3409 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java @@ -347,30 +347,6 @@ protected void preCommit(HoodieInstant inflightInstant, HoodieCommitMetadata met resolveWriteConflict(table, metadata, this.pendingInflightAndRequestedInstants); } - /** - * Write the HoodieCommitMetadata to metadata table if available. - * - * @param table {@link HoodieTable} of interest. - * @param instantTime instant time of the commit. - * @param metadata instance of {@link HoodieCommitMetadata}. - * @param writeStatuses WriteStatuses for the completed action. - */ - protected void writeTableMetadata(HoodieTable table, String instantTime, HoodieCommitMetadata metadata, HoodieData writeStatuses) { - context.setJobStatus(this.getClass().getSimpleName(), "Committing to metadata table: " + config.getTableName()); - Option metadataWriterOpt = table.getMetadataWriter(instantTime); - if (metadataWriterOpt.isPresent()) { - try (HoodieTableMetadataWriter metadataWriter = metadataWriterOpt.get()) { - metadataWriter.updateFromWriteStatuses(metadata, writeStatuses, instantTime); - } catch (Exception e) { - if (e instanceof HoodieException) { - throw (HoodieException) e; - } else { - throw new HoodieException("Failed to update metadata", e); - } - } - } - } - /** * Filter out HoodieRecords that already exists in the output folder. This is useful in deduplication. * @@ -987,10 +963,10 @@ public boolean scheduleCompactionAtInstant(String instantTime, Option scheduleIndexing(List partitionTypes) { + public Option scheduleIndexing(List partitionTypes, List partitionPaths) { String instantTime = createNewInstantTime(); Option indexPlan = createTable(config, hadoopConf) - .scheduleIndexing(context, instantTime, partitionTypes); + .scheduleIndexing(context, instantTime, partitionTypes, partitionPaths); return indexPlan.isPresent() ? 
Option.of(instantTime) : Option.empty(); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/LSMTimelineWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/LSMTimelineWriter.java index 4f487410a8c9..e5acf775a193 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/LSMTimelineWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/LSMTimelineWriter.java @@ -20,6 +20,7 @@ package org.apache.hudi.client.timeline; import org.apache.hudi.avro.model.HoodieLSMTimelineInstant; +import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.table.timeline.MetadataConversionUtils; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; @@ -72,21 +73,29 @@ public class LSMTimelineWriter { public static final long MAX_FILE_SIZE_IN_BYTES = 1024 * 1024 * 1000; private final HoodieWriteConfig config; - private final HoodieTable table; + private final TaskContextSupplier taskContextSupplier; private final HoodieTableMetaClient metaClient; private HoodieWriteConfig writeConfig; private LSMTimelineWriter(HoodieWriteConfig config, HoodieTable table) { + this(config, table.getTaskContextSupplier(), table.getMetaClient()); + } + + private LSMTimelineWriter(HoodieWriteConfig config, TaskContextSupplier taskContextSupplier, HoodieTableMetaClient metaClient) { this.config = config; - this.table = table; - this.metaClient = table.getMetaClient(); + this.taskContextSupplier = taskContextSupplier; + this.metaClient = metaClient; } public static LSMTimelineWriter getInstance(HoodieWriteConfig config, HoodieTable table) { return new LSMTimelineWriter(config, table); } + public static LSMTimelineWriter getInstance(HoodieWriteConfig config, TaskContextSupplier taskContextSupplier, HoodieTableMetaClient metaClient) { + return new LSMTimelineWriter(config, taskContextSupplier, metaClient); + } + /** * Writes the list of active actions into the timeline. 
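/*
 * A small sketch (illustrative names, not part of the patch) of the new LSMTimelineWriter
 * factory overload above, which decouples the archived-timeline writer from HoodieTable:
 * callers holding only a write config, a TaskContextSupplier and a meta client can now
 * obtain a writer directly.
 */
import org.apache.hudi.client.timeline.LSMTimelineWriter;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.config.HoodieWriteConfig;

public class ArchivedTimelineWriterSketch {

  // Equivalent to LSMTimelineWriter.getInstance(config, table) when the task context
  // supplier and meta client are already at hand without a HoodieTable instance.
  static LSMTimelineWriter writerFor(HoodieWriteConfig config,
                                     TaskContextSupplier taskContextSupplier,
                                     HoodieTableMetaClient metaClient) {
    return LSMTimelineWriter.getInstance(config, taskContextSupplier, metaClient);
  }
}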
* @@ -115,10 +124,14 @@ public void write( exceptionHandler.ifPresent(handler -> handler.accept(e)); } } - updateManifest(filePath.getName()); } catch (Exception e) { throw new HoodieCommitException("Failed to write commits", e); } + try { + updateManifest(filePath.getName()); + } catch (Exception e) { + throw new HoodieCommitException("Failed to update archiving manifest", e); + } } /** @@ -366,7 +379,7 @@ private HoodieWriteConfig getOrCreateWriterConfig() { private HoodieFileWriter openWriter(Path filePath) { try { return HoodieFileWriterFactory.getFileWriter("", filePath, metaClient.getHadoopConf(), getOrCreateWriterConfig(), - HoodieLSMTimelineInstant.getClassSchema(), table.getTaskContextSupplier(), HoodieRecord.HoodieRecordType.AVRO); + HoodieLSMTimelineInstant.getClassSchema(), taskContextSupplier, HoodieRecord.HoodieRecordType.AVRO); } catch (IOException e) { throw new HoodieException("Unable to initialize archiving writer", e); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/PreferWriterConflictResolutionStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/PreferWriterConflictResolutionStrategy.java index 61ed673fc62f..62fbf64a7f9c 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/PreferWriterConflictResolutionStrategy.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/PreferWriterConflictResolutionStrategy.java @@ -55,7 +55,7 @@ public Stream getCandidateInstants(HoodieTableMetaClient metaClie Option lastSuccessfulInstant) { HoodieActiveTimeline activeTimeline = metaClient.reloadActiveTimeline(); if ((REPLACE_COMMIT_ACTION.equals(currentInstant.getAction()) - && ClusteringUtils.isClusteringCommit(metaClient, currentInstant)) + && ClusteringUtils.isClusteringInstant(activeTimeline, currentInstant)) || COMPACTION_ACTION.equals(currentInstant.getAction())) { return getCandidateInstantsForTableServicesCommits(activeTimeline, currentInstant); } else { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodiePayloadConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodiePayloadConfig.java index 3929dcba0471..5c70000bd6c7 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodiePayloadConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodiePayloadConfig.java @@ -22,7 +22,7 @@ import org.apache.hudi.common.config.ConfigGroups; import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.config.HoodieConfig; -import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; +import org.apache.hudi.common.model.DefaultHoodieRecordPayload; import java.io.File; import java.io.FileReader; @@ -50,7 +50,7 @@ public class HoodiePayloadConfig extends HoodieConfig { public static final ConfigProperty PAYLOAD_CLASS_NAME = ConfigProperty .key("hoodie.compaction.payload.class") - .defaultValue(OverwriteWithLatestAvroPayload.class.getName()) + .defaultValue(DefaultHoodieRecordPayload.class.getName()) .markAdvanced() .withDocumentation("This needs to be same as class used during insert/upserts. 
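/*
 * A hedged sketch (not part of the patch) related to the default-payload change above:
 * jobs that depended on the old OverwriteWithLatestAvroPayload merge semantics can pin it
 * explicitly. The compaction-side key is the one shown above; the write-side key
 * "hoodie.datasource.write.payload.class" has its default switched to
 * DefaultHoodieRecordPayload further down in this diff.
 */
import java.util.Properties;

import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;

public class LegacyPayloadPropsSketch {

  // Returns writer/compaction properties that keep the pre-change merge behavior.
  static Properties legacyPayloadProps() {
    Properties props = new Properties();
    props.setProperty("hoodie.datasource.write.payload.class", OverwriteWithLatestAvroPayload.class.getName());
    props.setProperty("hoodie.compaction.payload.class", OverwriteWithLatestAvroPayload.class.getName());
    return props;
  }
}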
Just like writing, compaction also uses " + "the record payload class to merge records in the log against each other, merge again with the base file and " diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieTTLConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieTTLConfig.java new file mode 100644 index 000000000000..1f9a4e40e983 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieTTLConfig.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.table.action.ttl.strategy.PartitionTTLStrategyType; + +import javax.annotation.concurrent.Immutable; + +import java.util.Properties; + +/** + * Hoodie Configs for partition/record level ttl management. 
+ */
+@Immutable
+@ConfigClassProperty(name = "TTL management Configs",
+    groupName = ConfigGroups.Names.WRITE_CLIENT,
+    description = "Data ttl management")
+public class HoodieTTLConfig extends HoodieConfig {
+
+  public static final String PARTITION_TTL_STRATEGY_PARAM_PREFIX = "hoodie.partition.ttl.strategy.";
+
+  public static final String KEEP_BY_TIME_PARTITION_TTL_STRATEGY =
+      "org.apache.hudi.table.action.ttl.strategy.KeepByTimeStrategy";
+  public static final ConfigProperty<Boolean> INLINE_PARTITION_TTL = ConfigProperty
+      .key("hoodie.partition.ttl.inline")
+      .defaultValue(false)
+      .sinceVersion("1.0.0")
+      .markAdvanced()
+      .withDocumentation("When enabled, the partition ttl management service is invoked immediately after each commit, "
+          + "to delete expired partitions");
+
+  public static final ConfigProperty<String> PARTITION_TTL_STRATEGY_CLASS_NAME = ConfigProperty
+      .key("hoodie.partition.ttl.strategy.class")
+      .noDefaultValue()
+      .sinceVersion("1.0.0")
+      .markAdvanced()
+      .withDocumentation("Config to provide a strategy class (subclass of PartitionTTLStrategy) to get the expired partitions");
+
+  public static final ConfigProperty<String> PARTITION_TTL_STRATEGY_TYPE = ConfigProperty
+      .key("hoodie.partition.ttl.management.strategy.type")
+      .defaultValue(PartitionTTLStrategyType.KEEP_BY_TIME.name())
+      .sinceVersion("1.0.0")
+      .markAdvanced()
+      .withDocumentation("Partition ttl management strategy type to determine the strategy class");
+
+  public static final ConfigProperty<Integer> DAYS_RETAIN = ConfigProperty
+      .key(PARTITION_TTL_STRATEGY_PARAM_PREFIX + "days.retain")
+      .defaultValue(-1)
+      .sinceVersion("1.0.0")
+      .markAdvanced()
+      .withDocumentation("Partition ttl management KEEP_BY_TIME strategy days retain");
+
+  public static final ConfigProperty<String> PARTITION_SELECTED = ConfigProperty
+      .key(PARTITION_TTL_STRATEGY_PARAM_PREFIX + "partition.selected")
+      .noDefaultValue()
+      .markAdvanced()
+      .sinceVersion("1.0.0")
+      .withDocumentation("Partitions to manage ttl");
+
+  public static final ConfigProperty<Integer> MAX_PARTITION_TO_DELETE = ConfigProperty
+      .key(PARTITION_TTL_STRATEGY_PARAM_PREFIX + "max.delete.partitions")
+      .defaultValue(1000)
+      .markAdvanced()
+      .sinceVersion("1.0.0")
+      .withDocumentation("Max partitions to delete in partition ttl management");
+
+  public static class Builder {
+    private final HoodieTTLConfig ttlConfig = new HoodieTTLConfig();
+
+    public HoodieTTLConfig.Builder withTTLPartitionSelected(String partitionSelected) {
+      ttlConfig.setValue(PARTITION_SELECTED, partitionSelected);
+      return this;
+    }
+
+    public HoodieTTLConfig.Builder withTTLDaysRetain(Integer daysRetain) {
+      ttlConfig.setValue(DAYS_RETAIN, daysRetain.toString());
+      return this;
+    }
+
+    public HoodieTTLConfig.Builder enableInlinePartitionTTL(Boolean enable) {
+      ttlConfig.setValue(INLINE_PARTITION_TTL, enable.toString());
+      return this;
+    }
+
+    public HoodieTTLConfig.Builder withTTLStrategyClass(String clazz) {
+      ttlConfig.setValue(PARTITION_TTL_STRATEGY_CLASS_NAME, clazz);
+      return this;
+    }
+
+    public HoodieTTLConfig.Builder withTTLStrategyType(PartitionTTLStrategyType ttlStrategyType) {
+      ttlConfig.setValue(PARTITION_TTL_STRATEGY_TYPE, ttlStrategyType.name());
+      return this;
+    }
+
+    public HoodieTTLConfig.Builder fromProperties(Properties props) {
+      this.ttlConfig.getProps().putAll(props);
+      return this;
+    }
+
+    public HoodieTTLConfig build() {
+      ttlConfig.setDefaults(HoodieTTLConfig.class.getName());
+      return ttlConfig;
+    }
+  }
+
+  public static Builder newBuilder() {
+    return new Builder();
+  }
+
+}
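/*
 * A minimal usage sketch (not part of the patch) for the TTL config above. It assumes the
 * HoodieWriteConfig.Builder#withTTLConfig hook added later in this diff; the 30-day
 * retention value is illustrative, since DAYS_RETAIN defaults to -1 and must be set.
 */
import org.apache.hudi.config.HoodieTTLConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.action.ttl.strategy.PartitionTTLStrategyType;

public class PartitionTtlConfigSketch {

  // Builds a writer config that drops partitions older than 30 days right after each commit.
  static HoodieWriteConfig.Builder withPartitionTtl(HoodieWriteConfig.Builder builder) {
    return builder.withTTLConfig(HoodieTTLConfig.newBuilder()
        .enableInlinePartitionTTL(true)
        .withTTLStrategyType(PartitionTTLStrategyType.KEEP_BY_TIME)
        .withTTLDaysRetain(30)
        .build());
  }
}
diff --git 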
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index 8fd3546671e5..cca3836ec6e8 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -39,13 +39,13 @@ import org.apache.hudi.common.engine.EngineType; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.fs.FileSystemRetryConfig; +import org.apache.hudi.common.model.DefaultHoodieRecordPayload; import org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.model.RecordPayloadType; import org.apache.hudi.common.model.WriteConcurrencyMode; import org.apache.hudi.common.model.WriteOperationType; @@ -68,6 +68,7 @@ import org.apache.hudi.config.metrics.HoodieMetricsDatadogConfig; import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig; import org.apache.hudi.config.metrics.HoodieMetricsJmxConfig; +import org.apache.hudi.config.metrics.HoodieMetricsM3Config; import org.apache.hudi.config.metrics.HoodieMetricsPrometheusConfig; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode; @@ -153,14 +154,14 @@ public class HoodieWriteConfig extends HoodieConfig { public static final ConfigProperty WRITE_PAYLOAD_CLASS_NAME = ConfigProperty .key("hoodie.datasource.write.payload.class") - .defaultValue(OverwriteWithLatestAvroPayload.class.getName()) + .defaultValue(DefaultHoodieRecordPayload.class.getName()) .markAdvanced() .withDocumentation("Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. " + "This will render any value set for PRECOMBINE_FIELD_OPT_VAL in-effective"); public static final ConfigProperty WRITE_PAYLOAD_TYPE = ConfigProperty .key("hoodie.datasource.write.payload.type") - .defaultValue(RecordPayloadType.OVERWRITE_LATEST_AVRO.name()) + .defaultValue(RecordPayloadType.HOODIE_AVRO_DEFAULT.name()) .markAdvanced() .withDocumentation(RecordPayloadType.class); @@ -562,7 +563,7 @@ public class HoodieWriteConfig extends HoodieConfig { public static final ConfigProperty MERGE_ALLOW_DUPLICATE_ON_INSERTS_ENABLE = ConfigProperty .key("hoodie.merge.allow.duplicate.on.inserts") - .defaultValue("false") + .defaultValue("true") .markAdvanced() .withDocumentation("When enabled, we allow duplicate keys even if inserts are routed to merge with an existing file (for ensuring file sizing)." 
+ " This is only relevant for insert operation, since upsert, delete operations will ensure unique key constraints are maintained."); @@ -2238,6 +2239,26 @@ public int getGraphiteReportPeriodSeconds() { return getInt(HoodieMetricsGraphiteConfig.GRAPHITE_REPORT_PERIOD_IN_SECONDS); } + public String getM3ServerHost() { + return getString(HoodieMetricsM3Config.M3_SERVER_HOST_NAME); + } + + public int getM3ServerPort() { + return getInt(HoodieMetricsM3Config.M3_SERVER_PORT_NUM); + } + + public String getM3Tags() { + return getString(HoodieMetricsM3Config.M3_TAGS); + } + + public String getM3Env() { + return getString(HoodieMetricsM3Config.M3_ENV); + } + + public String getM3Service() { + return getString(HoodieMetricsM3Config.M3_SERVICE); + } + public String getJmxHost() { return getString(HoodieMetricsJmxConfig.JMX_HOST_NAME); } @@ -2697,6 +2718,29 @@ public boolean isNonBlockingConcurrencyControl() { return getWriteConcurrencyMode().isNonBlockingConcurrencyControl(); } + /** + * TTL configs. + */ + public boolean isInlinePartitionTTLEnable() { + return getBoolean(HoodieTTLConfig.INLINE_PARTITION_TTL); + } + + public String getPartitionTTLStrategyClassName() { + return getString(HoodieTTLConfig.PARTITION_TTL_STRATEGY_CLASS_NAME); + } + + public Integer getPartitionTTLStrategyDaysRetain() { + return getInt(HoodieTTLConfig.DAYS_RETAIN); + } + + public String getPartitionTTLPartitionSelected() { + return getString(HoodieTTLConfig.PARTITION_SELECTED); + } + + public Integer getPartitionTTLMaxPartitionsToDelete() { + return getInt(HoodieTTLConfig.MAX_PARTITION_TO_DELETE); + } + public static class Builder { protected final HoodieWriteConfig writeConfig = new HoodieWriteConfig(); @@ -2716,10 +2760,13 @@ public static class Builder { private boolean isCallbackConfigSet = false; private boolean isPayloadConfigSet = false; private boolean isMetadataConfigSet = false; + + private boolean isTTLConfigSet = false; private boolean isLockConfigSet = false; private boolean isPreCommitValidationConfigSet = false; private boolean isMetricsJmxConfigSet = false; private boolean isMetricsGraphiteConfigSet = false; + private boolean isMetricsM3ConfigSet = false; private boolean isLayoutConfigSet = false; public Builder withEngineType(EngineType engineType) { @@ -2959,6 +3006,12 @@ public Builder withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig mericsGraph return this; } + public Builder withMetricsM3Config(HoodieMetricsM3Config metricsM3Config) { + writeConfig.getProps().putAll(metricsM3Config.getProps()); + isMetricsM3ConfigSet = true; + return this; + } + public Builder withPreCommitValidatorConfig(HoodiePreCommitValidatorConfig validatorConfig) { writeConfig.getProps().putAll(validatorConfig.getProps()); isPreCommitValidationConfigSet = true; @@ -2995,6 +3048,12 @@ public Builder withMetadataConfig(HoodieMetadataConfig metadataConfig) { return this; } + public Builder withTTLConfig(HoodieTTLConfig ttlConfig) { + writeConfig.getProps().putAll(ttlConfig.getProps()); + isTTLConfigSet = true; + return this; + } + public Builder withAutoCommit(boolean autoCommit) { writeConfig.setValue(AUTO_COMMIT_ENABLE, String.valueOf(autoCommit)); return this; @@ -3262,6 +3321,8 @@ protected void setDefaults() { final boolean isLockProviderPropertySet = writeConfigProperties.containsKey(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key()); writeConfig.setDefaultOnCondition(!isLockConfigSet, HoodieLockConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); + writeConfig.setDefaultOnCondition(!isTTLConfigSet, + 
HoodieTTLConfig.newBuilder().fromProperties(writeConfig.getProps()).build()); autoAdjustConfigsForConcurrencyMode(isLockProviderPropertySet); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsConfig.java index e1d0afeb6fa4..328619f5e9c8 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsConfig.java @@ -220,6 +220,8 @@ public HoodieMetricsConfig build() { HoodieMetricsGraphiteConfig.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); hoodieMetricsConfig.setDefaultOnCondition(reporterType == MetricsReporterType.CLOUDWATCH, HoodieMetricsCloudWatchConfig.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); + hoodieMetricsConfig.setDefaultOnCondition(reporterType == MetricsReporterType.M3, + HoodieMetricsM3Config.newBuilder().fromProperties(hoodieMetricsConfig.getProps()).build()); return hoodieMetricsConfig; } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsM3Config.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsM3Config.java new file mode 100644 index 000000000000..cc675eebfbbf --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/metrics/HoodieMetricsM3Config.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config.metrics; + +import static org.apache.hudi.config.metrics.HoodieMetricsConfig.METRIC_PREFIX; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +/** + * Configs for M3 reporter type. + *

+ * {@link org.apache.hudi.metrics.MetricsReporterType#M3} + */ +@ConfigClassProperty(name = "Metrics Configurations for M3", + groupName = ConfigGroups.Names.METRICS, + description = "Enables reporting on Hudi metrics using M3. " + + " Hudi publishes metrics on every commit, clean, rollback etc.") +public class HoodieMetricsM3Config extends HoodieConfig { + + public static final String M3_PREFIX = METRIC_PREFIX + ".m3"; + + public static final ConfigProperty M3_SERVER_HOST_NAME = ConfigProperty + .key(M3_PREFIX + ".host") + .defaultValue("localhost") + .withDocumentation("M3 host to connect to."); + + public static final ConfigProperty M3_SERVER_PORT_NUM = ConfigProperty + .key(M3_PREFIX + ".port") + .defaultValue(9052) + .withDocumentation("M3 port to connect to."); + + public static final ConfigProperty M3_TAGS = ConfigProperty + .key(M3_PREFIX + ".tags") + .defaultValue("") + .withDocumentation("Optional M3 tags applied to all metrics."); + + public static final ConfigProperty M3_ENV = ConfigProperty + .key(M3_PREFIX + ".env") + .defaultValue("production") + .withDocumentation("M3 tag to label the environment (defaults to 'production'), " + + "applied to all metrics."); + + public static final ConfigProperty M3_SERVICE = ConfigProperty + .key(M3_PREFIX + ".service") + .defaultValue("hoodie") + .withDocumentation("M3 tag to label the service name (defaults to 'hoodie'), " + + "applied to all metrics."); + + private HoodieMetricsM3Config() { + super(); + } + + public static HoodieMetricsM3Config.Builder newBuilder() { + return new HoodieMetricsM3Config.Builder(); + } + + public static class Builder { + + private final HoodieMetricsM3Config hoodieMetricsM3Config = new HoodieMetricsM3Config(); + + public HoodieMetricsM3Config.Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + this.hoodieMetricsM3Config.getProps().load(reader); + return this; + } + } + + public HoodieMetricsM3Config.Builder fromProperties(Properties props) { + this.hoodieMetricsM3Config.getProps().putAll(props); + return this; + } + + public HoodieMetricsM3Config.Builder toM3Host(String host) { + hoodieMetricsM3Config.setValue(M3_SERVER_HOST_NAME, host); + return this; + } + + public HoodieMetricsM3Config.Builder onM3Port(int port) { + hoodieMetricsM3Config.setValue(M3_SERVER_PORT_NUM, String.valueOf(port)); + return this; + } + + public HoodieMetricsM3Config.Builder useM3Tags(String tags) { + hoodieMetricsM3Config.setValue(M3_TAGS, tags); + return this; + } + + public HoodieMetricsM3Config.Builder useM3Env(String env) { + hoodieMetricsM3Config.setValue(M3_ENV, env); + return this; + } + + public HoodieMetricsM3Config.Builder useM3Service(String service) { + hoodieMetricsM3Config.setValue(M3_SERVICE, service); + return this; + } + + public HoodieMetricsM3Config build() { + hoodieMetricsM3Config.setDefaults(HoodieMetricsM3Config.class.getName()); + return hoodieMetricsM3Config; + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/ConsistentBucketIndexUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/ConsistentBucketIndexUtils.java index d22e4b21a5ec..0e47d0a688ab 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/ConsistentBucketIndexUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/ConsistentBucketIndexUtils.java @@ -210,7 +210,16 @@ private static void createCommitMarker(HoodieTable table, Path 
fileStatus, Path if (fs.exists(fullPath)) { return; } - FileIOUtils.createFileInPath(fs, fullPath, Option.of(getUTF8Bytes(StringUtils.EMPTY_STRING))); + //prevent exception from race condition. We are ok with the file being created in another thread, so we should + // check for the marker after catching the exception and we don't need to fail if the file exists + try { + FileIOUtils.createFileInPath(fs, fullPath, Option.of(getUTF8Bytes(StringUtils.EMPTY_STRING))); + } catch (HoodieIOException e) { + if (!fs.exists(fullPath)) { + throw e; + } + LOG.warn("Failed to create marker but " + fullPath + " exists", e); + } } /*** diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java index f8cc77274c2e..1301f046ae29 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java @@ -89,7 +89,6 @@ public class HoodieAppendHandle extends HoodieWriteHandle recordList = new ArrayList<>(); @@ -158,7 +157,6 @@ public HoodieAppendHandle(HoodieWriteConfig config, String instantTime, HoodieTa ? Option.of(new Schema.Parser().parse(config.getPartialUpdateSchema())) : Option.empty(), taskContextSupplier); - this.fileId = fileId; this.recordItr = recordItr; this.sizeEstimator = new DefaultSizeEstimator(); this.statuses = new ArrayList<>(); @@ -173,50 +171,52 @@ public HoodieAppendHandle(HoodieWriteConfig config, String instantTime, HoodieTa } private void init(HoodieRecord record) { - if (doInit) { - String prevCommit = instantTime; - String baseFile = ""; - List logFiles = new ArrayList<>(); - if (config.isCDCEnabled()) { - // the cdc reader needs the base file metadata to have deterministic update sequence. - TableFileSystemView.SliceView rtView = hoodieTable.getSliceView(); - Option fileSlice = rtView.getLatestFileSlice(partitionPath, fileId); - if (fileSlice.isPresent()) { - prevCommit = fileSlice.get().getBaseInstantTime(); - baseFile = fileSlice.get().getBaseFile().map(BaseFile::getFileName).orElse(""); - logFiles = fileSlice.get().getLogFiles().map(HoodieLogFile::getFileName).collect(Collectors.toList()); - } + if (!doInit) { + return; + } + + String prevCommit = instantTime; + String baseFile = ""; + List logFiles = new ArrayList<>(); + if (config.isCDCEnabled()) { + // the cdc reader needs the base file metadata to have deterministic update sequence. 
+ TableFileSystemView.SliceView rtView = hoodieTable.getSliceView(); + Option fileSlice = rtView.getLatestFileSlice(partitionPath, fileId); + if (fileSlice.isPresent()) { + prevCommit = fileSlice.get().getBaseInstantTime(); + baseFile = fileSlice.get().getBaseFile().map(BaseFile::getFileName).orElse(""); + logFiles = fileSlice.get().getLogFiles().map(HoodieLogFile::getFileName).collect(Collectors.toList()); } + } - // Prepare the first write status - HoodieDeltaWriteStat deltaWriteStat = new HoodieDeltaWriteStat(); - writeStatus.setStat(deltaWriteStat); - writeStatus.setFileId(fileId); - writeStatus.setPartitionPath(partitionPath); - averageRecordSize = sizeEstimator.sizeEstimate(record); + // Prepare the first write status + HoodieDeltaWriteStat deltaWriteStat = new HoodieDeltaWriteStat(); + writeStatus.setStat(deltaWriteStat); + writeStatus.setFileId(fileId); + writeStatus.setPartitionPath(partitionPath); + averageRecordSize = sizeEstimator.sizeEstimate(record); - deltaWriteStat.setPrevCommit(prevCommit); - deltaWriteStat.setPartitionPath(partitionPath); - deltaWriteStat.setFileId(fileId); - deltaWriteStat.setBaseFile(baseFile); - deltaWriteStat.setLogFiles(logFiles); + deltaWriteStat.setPrevCommit(prevCommit); + deltaWriteStat.setPartitionPath(partitionPath); + deltaWriteStat.setFileId(fileId); + deltaWriteStat.setBaseFile(baseFile); + deltaWriteStat.setLogFiles(logFiles); - try { - // Save hoodie partition meta in the partition path - HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, instantTime, - new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath), - hoodieTable.getPartitionMetafileFormat()); - partitionMetadata.trySave(getPartitionId()); - - this.writer = createLogWriter(getFileInstant(record)); - } catch (Exception e) { - LOG.error("Error in update task at commit " + instantTime, e); - writeStatus.setGlobalError(e); - throw new HoodieUpsertException("Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit " - + instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePathV2() + "/" + partitionPath, e); - } - doInit = false; + try { + // Save hoodie partition meta in the partition path + HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, instantTime, + new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath), + hoodieTable.getPartitionMetafileFormat()); + partitionMetadata.trySave(getPartitionId()); + + this.writer = createLogWriter(getFileInstant(record)); + } catch (Exception e) { + LOG.error("Error in update task at commit " + instantTime, e); + writeStatus.setGlobalError(e); + throw new HoodieUpsertException("Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit " + + instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePathV2() + "/" + partitionPath, e); } + doInit = false; } /** @@ -324,12 +324,7 @@ private MetadataValues populateMetadataFields(HoodieRecord hoodieRecord) { private void initNewStatus() { HoodieDeltaWriteStat prevStat = (HoodieDeltaWriteStat) this.writeStatus.getStat(); // Make a new write status and copy basic fields over. 
- HoodieDeltaWriteStat stat = new HoodieDeltaWriteStat(); - stat.setFileId(fileId); - stat.setPartitionPath(partitionPath); - stat.setPrevCommit(prevStat.getPrevCommit()); - stat.setBaseFile(prevStat.getBaseFile()); - stat.setLogFiles(new ArrayList<>(prevStat.getLogFiles())); + HoodieDeltaWriteStat stat = prevStat.copy(); this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(), hoodieTable.shouldTrackSuccessRecords(), config.getWriteStatusFailureFraction()); @@ -567,7 +562,7 @@ public IOType getIOType() { return IOType.APPEND; } - public List writeStatuses() { + public List getWriteStatuses() { return statuses; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java index bdb35641f268..0a0f3352069a 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java @@ -34,7 +34,6 @@ import org.apache.hudi.exception.HoodieInsertException; import org.apache.hudi.io.storage.HoodieFileWriter; import org.apache.hudi.io.storage.HoodieFileWriterFactory; -import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.table.HoodieTable; import org.apache.avro.Schema; @@ -115,8 +114,7 @@ public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTa public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, String partitionPath, String fileId, Map> recordMap, TaskContextSupplier taskContextSupplier) { - // preserveMetadata is disabled by default for MDT but enabled otherwise - this(config, instantTime, hoodieTable, partitionPath, fileId, taskContextSupplier, !HoodieTableMetadata.isMetadataTable(config.getBasePath())); + this(config, instantTime, hoodieTable, partitionPath, fileId, taskContextSupplier, true); this.recordMap = recordMap; this.useWriterSchema = true; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java index a9b22d083326..4f5f240c4fd0 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java @@ -47,7 +47,6 @@ import org.apache.hudi.io.storage.HoodieFileWriter; import org.apache.hudi.io.storage.HoodieFileWriterFactory; import org.apache.hudi.keygen.BaseKeyGenerator; -import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.table.HoodieTable; import org.apache.avro.Schema; @@ -145,8 +144,7 @@ public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTab super(config, instantTime, partitionPath, fileId, hoodieTable, taskContextSupplier); this.keyToNewRecords = keyToNewRecords; this.useWriterSchemaForCompaction = true; - // preserveMetadata is disabled by default for MDT but enabled otherwise - this.preserveMetadata = !HoodieTableMetadata.isMetadataTable(config.getBasePath()); + this.preserveMetadata = true; init(fileId, this.partitionPath, dataFileToBeMerged); validateAndSetAndKeyGenProps(keyGeneratorOpt, config.populateMetaFields()); } @@ -484,6 +482,15 @@ public void performMergeDataValidationCheck(WriteStatus writeStatus) { } } + public Iterator> getWriteStatusesAsIterator() { + List statuses = getWriteStatuses(); + // 
TODO(vc): This needs to be revisited + if (getPartitionPath() == null) { + LOG.info("Upsert Handle has partition path as null {}, {}", getOldFilePath(), statuses); + } + return Collections.singletonList(statuses).iterator(); + } + public Path getOldFilePath() { return oldFilePath; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java index 9d1bb6d511e8..ab80629c941b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java @@ -190,7 +190,7 @@ protected void markClosed() { public abstract List close(); - public List writeStatuses() { + public List getWriteStatuses() { return Collections.singletonList(writeStatus); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java index 7b88a0ab979b..4d7c83a7794d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java @@ -146,21 +146,24 @@ public static String[] extractRecordKeysByFields(String recordKey, List public static String getRecordKey(GenericRecord record, List recordKeyFields, boolean consistentLogicalTimestampEnabled) { boolean keyIsNullEmpty = true; StringBuilder recordKey = new StringBuilder(); - for (String recordKeyField : recordKeyFields) { + for (int i = 0; i < recordKeyFields.size(); i++) { + String recordKeyField = recordKeyFields.get(i); String recordKeyValue = HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyField, true, consistentLogicalTimestampEnabled); if (recordKeyValue == null) { - recordKey.append(recordKeyField + DEFAULT_COMPOSITE_KEY_FILED_VALUE + NULL_RECORDKEY_PLACEHOLDER + DEFAULT_RECORD_KEY_PARTS_SEPARATOR); + recordKey.append(recordKeyField).append(DEFAULT_COMPOSITE_KEY_FILED_VALUE).append(NULL_RECORDKEY_PLACEHOLDER); } else if (recordKeyValue.isEmpty()) { - recordKey.append(recordKeyField + DEFAULT_COMPOSITE_KEY_FILED_VALUE + EMPTY_RECORDKEY_PLACEHOLDER + DEFAULT_RECORD_KEY_PARTS_SEPARATOR); + recordKey.append(recordKeyField).append(DEFAULT_COMPOSITE_KEY_FILED_VALUE).append(EMPTY_RECORDKEY_PLACEHOLDER); } else { - recordKey.append(recordKeyField + DEFAULT_COMPOSITE_KEY_FILED_VALUE + recordKeyValue + DEFAULT_RECORD_KEY_PARTS_SEPARATOR); + recordKey.append(recordKeyField).append(DEFAULT_COMPOSITE_KEY_FILED_VALUE).append(recordKeyValue); keyIsNullEmpty = false; } + if (i != recordKeyFields.size() - 1) { + recordKey.append(DEFAULT_RECORD_KEY_PARTS_SEPARATOR); + } } - recordKey.deleteCharAt(recordKey.length() - 1); if (keyIsNullEmpty) { throw new HoodieKeyException("recordKey values: \"" + recordKey + "\" for fields: " - + recordKeyFields.toString() + " cannot be entirely null or empty."); + + recordKeyFields + " cannot be entirely null or empty."); } return recordKey.toString(); } @@ -172,20 +175,27 @@ public static String getRecordPartitionPath(GenericRecord record, List p } StringBuilder partitionPath = new StringBuilder(); - for (String partitionPathField : partitionPathFields) { + for (int i = 0; i < partitionPathFields.size(); i++) { + String partitionPathField = partitionPathFields.get(i); String fieldVal = HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathField, 
true, consistentLogicalTimestampEnabled); if (fieldVal == null || fieldVal.isEmpty()) { - partitionPath.append(hiveStylePartitioning ? partitionPathField + "=" + HUDI_DEFAULT_PARTITION_PATH - : HUDI_DEFAULT_PARTITION_PATH); + if (hiveStylePartitioning) { + partitionPath.append(partitionPathField).append("="); + } + partitionPath.append(HUDI_DEFAULT_PARTITION_PATH); } else { if (encodePartitionPath) { fieldVal = PartitionPathEncodeUtils.escapePathName(fieldVal); } - partitionPath.append(hiveStylePartitioning ? partitionPathField + "=" + fieldVal : fieldVal); + if (hiveStylePartitioning) { + partitionPath.append(partitionPathField).append("="); + } + partitionPath.append(fieldVal); + } + if (i != partitionPathFields.size() - 1) { + partitionPath.append(DEFAULT_PARTITION_PATH_SEPARATOR); } - partitionPath.append(DEFAULT_PARTITION_PATH_SEPARATOR); } - partitionPath.deleteCharAt(partitionPath.length() - 1); return partitionPath.toString(); } @@ -257,7 +267,7 @@ public static List getRecordKeyFields(TypedProperties props) { * @param props props of interest. * @return true if record keys need to be auto generated. false otherwise. */ - public static boolean enableAutoGenerateRecordKeys(TypedProperties props) { + public static boolean isAutoGeneratedRecordKeysEnabled(TypedProperties props) { return !props.containsKey(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/factory/HoodieAvroKeyGeneratorFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/factory/HoodieAvroKeyGeneratorFactory.java index f375095122da..f68e3232753a 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/factory/HoodieAvroKeyGeneratorFactory.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/factory/HoodieAvroKeyGeneratorFactory.java @@ -98,7 +98,7 @@ public static KeyGenerator createAvroKeyGeneratorByType(TypedProperties props) t throw new HoodieKeyGeneratorException("Unsupported keyGenerator Type " + keyGeneratorType); } - if (KeyGenUtils.enableAutoGenerateRecordKeys(props)) { + if (KeyGenUtils.isAutoGeneratedRecordKeysEnabled(props)) { return new AutoRecordGenWrapperAvroKeyGenerator(props, keyGenerator); } else { return keyGenerator; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index 8d40fc240952..99739947077c 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -101,7 +101,7 @@ import static org.apache.hudi.common.config.HoodieMetadataConfig.DEFAULT_METADATA_POPULATE_META_FIELDS; import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER; import static org.apache.hudi.common.table.timeline.HoodieInstant.State.REQUESTED; -import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMPACTION_ACTION; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION; import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS; import static org.apache.hudi.common.table.timeline.HoodieTimeline.getIndexInflightInstant; import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.deserializeIndexPlan; @@ -392,7 +392,6 
@@ private boolean initializeFromFilesystem(String initializationTime, List dataMetaClient.getTableConfig().isMetadataPartitionAvailable((metadataPartition))); - // Get a complete list of files and partitions from the file system or from already initialized FILES partition of MDT List partitionInfoList; if (filesPartitionAvailable) { @@ -462,7 +461,9 @@ private boolean initializeFromFilesystem(String initializationTime, List records = fileGroupCountAndRecordsPair.getValue(); bulkCommit(commitTimeForPartition, partitionType, records, fileGroupCount); metadataMetaClient.reloadActiveTimeline(); - dataMetaClient.getTableConfig().setMetadataPartitionState(dataMetaClient, partitionType, true); + String partitionPath = (partitionType == FUNCTIONAL_INDEX) ? dataWriteConfig.getFunctionalIndexConfig().getIndexName() : partitionType.getPartitionPath(); + + dataMetaClient.getTableConfig().setMetadataPartitionState(dataMetaClient, partitionPath, true); // initialize the metadata reader again so the MDT partition can be read after initialization initMetadataReader(); long totalInitTime = partitionInitTimer.endTimer(); @@ -795,7 +796,7 @@ public void dropMetadataPartitions(List metadataPartition for (MetadataPartitionType partitionType : metadataPartitions) { String partitionPath = partitionType.getPartitionPath(); // first update table config - dataMetaClient.getTableConfig().setMetadataPartitionState(dataMetaClient, partitionType, false); + dataMetaClient.getTableConfig().setMetadataPartitionState(dataMetaClient, partitionPath, false); LOG.warn("Deleting Metadata Table partition: " + partitionPath); dataMetaClient.getFs().delete(new Path(metadataWriteConfig.getBasePath(), partitionPath), true); // delete corresponding pending indexing instant file in the timeline @@ -829,7 +830,7 @@ private static void deletePendingIndexingInstant(HoodieTableMetaClient metaClien protected static void checkNumDeltaCommits(HoodieTableMetaClient metaClient, int maxNumDeltaCommitsWhenPending) { final HoodieActiveTimeline activeTimeline = metaClient.reloadActiveTimeline(); Option lastCompaction = activeTimeline.filterCompletedInstants() - .filter(s -> s.getAction().equals(COMPACTION_ACTION)).lastInstant(); + .filter(s -> s.getAction().equals(COMMIT_ACTION)).lastInstant(); int numDeltaCommits = lastCompaction.isPresent() ? 
activeTimeline.getDeltaCommitTimeline().findInstantsAfter(lastCompaction.get().getTimestamp()).countInstants() : activeTimeline.getDeltaCommitTimeline().countInstants(); @@ -900,6 +901,7 @@ public void buildMetadataPartitions(HoodieEngineContext engineContext, List partitionPaths = new ArrayList<>(); List partitionTypes = new ArrayList<>(); indexPartitionInfos.forEach(indexPartitionInfo -> { String relativePartitionPath = indexPartitionInfo.getMetadataPartitionPath(); @@ -913,10 +915,11 @@ public void buildMetadataPartitions(HoodieEngineContext engineContext, List HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, dataMetaClient, rollbackMetadata, instantTime)); - + // The deltacommit that will be rolled back + HoodieInstant deltaCommitInstant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, commitToRollbackInstantTime); if (deltacommitsSinceCompaction.containsInstant(deltaCommitInstant)) { LOG.info("Rolling back MDT deltacommit " + commitToRollbackInstantTime); + String rollbackInstantTime = createRollbackTimestamp(instantTime); if (!getWriteClient().rollback(commitToRollbackInstantTime, rollbackInstantTime)) { throw new HoodieMetadataException("Failed to rollback deltacommit at " + commitToRollbackInstantTime); } @@ -1131,8 +1126,9 @@ protected void validateRollback( String commitToRollbackInstantTime, HoodieInstant compactionInstant, HoodieTimeline deltacommitsSinceCompaction) { - // The commit being rolled back should not be earlier than the latest compaction on the MDT. Compaction on MDT only occurs when all actions - // are completed on the dataset. Hence, this case implies a rollback of completed commit which should actually be handled using restore. + // The commit being rolled back should not be earlier than the latest compaction on the MDT because the latest file slice does not change after all. + // Compaction on MDT only occurs when all actions are completed on the dataset. + // Hence, this case implies a rollback of completed commit which should actually be handled using restore. if (compactionInstant.getAction().equals(HoodieTimeline.COMMIT_ACTION)) { final String compactionInstantTime = compactionInstant.getTimestamp(); if (commitToRollbackInstantTime.length() == compactionInstantTime.length() && HoodieTimeline.LESSER_THAN_OR_EQUALS.test(commitToRollbackInstantTime, compactionInstantTime)) { @@ -1316,9 +1312,8 @@ public void performTableServices(Option inFlightInstantTimestamp) { .getTimestamp(); LOG.info("Latest deltacommit time found is " + latestDeltacommitTime + ", running clean operations."); cleanIfNecessary(writeClient, latestDeltacommitTime); - // Do timeline validation before scheduling compaction/logCompaction operations. - if (validateTimelineBeforeSchedulingCompaction(inFlightInstantTimestamp, latestDeltacommitTime)) { + if (validateCompactionScheduling()) { compactIfNecessary(writeClient, latestDeltacommitTime); } writeClient.archive(); @@ -1355,10 +1350,12 @@ private void runPendingTableServicesOperations(BaseHoodieWriteClient writeClient * deltacommit. */ protected void compactIfNecessary(BaseHoodieWriteClient writeClient, String latestDeltacommitTime) { - // Trigger compaction with suffixes based on the same instant time. This ensures that any future - // delta commits synced over will not have an instant time lesser than the last completed instant on the - // metadata table. 
- final String compactionInstantTime = writeClient.createNewInstantTime(false); + // IMPORTANT: Trigger compaction with max instant time that is smaller than(or equals) the earliest pending instant from DT. + // The compaction planner will manage to filter out the log files that finished with greater completion time. + // see BaseHoodieCompactionPlanGenerator.generateCompactionPlan for more details. + final String compactionInstantTime = dataMetaClient.reloadActiveTimeline().filterInflightsAndRequested() + .findInstantsBeforeOrEquals(latestDeltacommitTime).firstInstant().map(HoodieInstant::getTimestamp) + .orElse(writeClient.createNewInstantTime(false)); // we need to avoid checking compaction w/ same instant again. // let's say we trigger compaction after C5 in MDT and so compaction completes with C4001. but C5 crashed before completing in MDT. @@ -1407,35 +1404,19 @@ protected void cleanIfNecessary(BaseHoodieWriteClient writeClient, String instan /** * Validates the timeline for both main and metadata tables to ensure compaction on MDT can be scheduled. */ - protected boolean validateTimelineBeforeSchedulingCompaction(Option inFlightInstantTimestamp, String latestDeltaCommitTimeInMetadataTable) { - // we need to find if there are any inflights in data table timeline before or equal to the latest delta commit in metadata table. - // Whenever you want to change this logic, please ensure all below scenarios are considered. - // a. There could be a chance that latest delta commit in MDT is committed in MDT, but failed in DT. And so findInstantsBeforeOrEquals() should be employed - // b. There could be DT inflights after latest delta commit in MDT and we are ok with it. bcoz, the contract is, the latest compaction instant time in MDT represents - // any instants before that is already synced with metadata table. - // c. Do consider out of order commits. For eg, c4 from DT could complete before c3. and we can't trigger compaction in MDT with c4 as base instant time, until every - // instant before c4 is synced with metadata table. - List pendingInstants = dataMetaClient.reloadActiveTimeline().filterInflightsAndRequested() - .findInstantsBeforeOrEquals(latestDeltaCommitTimeInMetadataTable).getInstants(); - - if (!pendingInstants.isEmpty()) { - checkNumDeltaCommits(metadataMetaClient, dataWriteConfig.getMetadataConfig().getMaxNumDeltacommitsWhenPending()); - LOG.info(String.format( - "Cannot compact metadata table as there are %d inflight instants in data table before latest deltacommit in metadata table: %s. Inflight instants in data table: %s", - pendingInstants.size(), latestDeltaCommitTimeInMetadataTable, Arrays.toString(pendingInstants.toArray()))); - return false; - } - - // Check if there are any pending compaction or log compaction instants in the timeline. - // If pending compact/logCompaction operations are found abort scheduling new compaction/logCompaction operations. 
- Option pendingLogCompactionInstant = - metadataMetaClient.getActiveTimeline().filterPendingLogCompactionTimeline().firstInstant(); - Option pendingCompactionInstant = - metadataMetaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant(); - if (pendingLogCompactionInstant.isPresent() || pendingCompactionInstant.isPresent()) { - LOG.warn(String.format("Not scheduling compaction or logCompaction, since a pending compaction instant %s or logCompaction %s instant is present", - pendingCompactionInstant, pendingLogCompactionInstant)); - return false; + protected boolean validateCompactionScheduling() { + // Under the log compaction scope, the sequence of the log-compaction and compaction needs to be ensured because metadata items such as RLI + // only has proc-time ordering semantics. For "ensured", it means the completion sequence of the log-compaction/compaction is the same as the start sequence. + if (metadataWriteConfig.isLogCompactionEnabled()) { + Option pendingLogCompactionInstant = + metadataMetaClient.getActiveTimeline().filterPendingLogCompactionTimeline().firstInstant(); + Option pendingCompactionInstant = + metadataMetaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant(); + if (pendingLogCompactionInstant.isPresent() || pendingCompactionInstant.isPresent()) { + LOG.warn(String.format("Not scheduling compaction or logCompaction, since a pending compaction instant %s or logCompaction %s instant is present", + pendingCompactionInstant, pendingLogCompactionInstant)); + return false; + } } return true; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataWriteUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataWriteUtils.java index 7c42ccf50161..48cfb46b49f2 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataWriteUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataWriteUtils.java @@ -37,6 +37,7 @@ import org.apache.hudi.config.metrics.HoodieMetricsConfig; import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig; import org.apache.hudi.config.metrics.HoodieMetricsJmxConfig; +import org.apache.hudi.config.metrics.HoodieMetricsM3Config; import org.apache.hudi.config.metrics.HoodieMetricsPrometheusConfig; import org.apache.hudi.config.metrics.HoodieMetricsDatadogConfig; import org.apache.hudi.exception.HoodieMetadataException; @@ -81,11 +82,31 @@ public static HoodieWriteConfig createMetadataWriteConfig( String tableName = writeConfig.getTableName() + METADATA_TABLE_NAME_SUFFIX; final long maxLogFileSizeBytes = writeConfig.getMetadataConfig().getMaxLogFileSize(); + // Borrow the cleaner policy from the main table and adjust the cleaner policy based on the main table's cleaner policy + HoodieCleaningPolicy dataTableCleaningPolicy = writeConfig.getCleanerPolicy(); + HoodieCleanConfig.Builder cleanConfigBuilder = HoodieCleanConfig.newBuilder() + .withAsyncClean(DEFAULT_METADATA_ASYNC_CLEAN) + .withAutoClean(false) + .withCleanerParallelism(MDT_DEFAULT_PARALLELISM) + .withFailedWritesCleaningPolicy(failedWritesCleaningPolicy) + .withCleanerPolicy(dataTableCleaningPolicy); + + if (HoodieCleaningPolicy.KEEP_LATEST_COMMITS.equals(dataTableCleaningPolicy)) { + int retainCommits = (int) Math.max(DEFAULT_METADATA_CLEANER_COMMITS_RETAINED, writeConfig.getCleanerCommitsRetained() * 1.2); + cleanConfigBuilder.retainCommits(retainCommits); + } else if 
(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS.equals(dataTableCleaningPolicy)) { + int retainFileVersions = (int) Math.ceil(writeConfig.getCleanerFileVersionsRetained() * 1.2); + cleanConfigBuilder.retainFileVersions(retainFileVersions); + } else if (HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS.equals(dataTableCleaningPolicy)) { + int numHoursRetained = (int) Math.ceil(writeConfig.getCleanerHoursRetained() * 1.2); + cleanConfigBuilder.cleanerNumHoursRetained(numHoursRetained); + } // Create the write config for the metadata table by borrowing options from the main write config. HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder() .withEngineType(writeConfig.getEngineType()) .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION) + .withMergeAllowDuplicateOnInserts(false) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder() .withConsistencyCheckEnabled(writeConfig.getConsistencyGuardConfig().isConsistencyCheckEnabled()) .withInitialConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getInitialConsistencyCheckIntervalMs()) @@ -103,14 +124,7 @@ public static HoodieWriteConfig createMetadataWriteConfig( .withSchema(HoodieMetadataRecord.getClassSchema().toString()) .forTable(tableName) // we will trigger cleaning manually, to control the instant times - .withCleanConfig(HoodieCleanConfig.newBuilder() - .withAsyncClean(DEFAULT_METADATA_ASYNC_CLEAN) - .withAutoClean(false) - .withCleanerParallelism(MDT_DEFAULT_PARALLELISM) - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) - .withFailedWritesCleaningPolicy(failedWritesCleaningPolicy) - .retainCommits(DEFAULT_METADATA_CLEANER_COMMITS_RETAINED) - .build()) + .withCleanConfig(cleanConfigBuilder.build()) // we will trigger archive manually, to ensure only regular writer invokes it .withArchivalConfig(HoodieArchivalConfig.newBuilder() .archiveCommitsWith( @@ -182,6 +196,15 @@ public static HoodieWriteConfig createMetadataWriteConfig( .withPushgatewayPortNum(writeConfig.getPushGatewayPort()).build(); builder.withProperties(prometheusConfig.getProps()); break; + case M3: + HoodieMetricsM3Config m3Config = HoodieMetricsM3Config.newBuilder() + .onM3Port(writeConfig.getM3ServerPort()) + .toM3Host(writeConfig.getM3ServerHost()) + .useM3Tags(writeConfig.getM3Tags()) + .useM3Service(writeConfig.getM3Service()) + .useM3Env(writeConfig.getM3Env()).build(); + builder.withProperties(m3Config.getProps()); + break; case DATADOG: HoodieMetricsDatadogConfig.Builder datadogConfig = HoodieMetricsDatadogConfig.newBuilder() .withDatadogApiKey(writeConfig.getDatadogApiKey()) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterFactory.java index 27034735a040..0d20337fa5c5 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterFactory.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterFactory.java @@ -27,6 +27,7 @@ import org.apache.hudi.metrics.cloudwatch.CloudWatchMetricsReporter; import org.apache.hudi.metrics.custom.CustomizableMetricsReporter; import org.apache.hudi.metrics.datadog.DatadogMetricsReporter; +import org.apache.hudi.metrics.m3.M3MetricsReporter; import org.apache.hudi.metrics.prometheus.PrometheusReporter; import org.apache.hudi.metrics.prometheus.PushGatewayMetricsReporter; @@ -89,6 +90,9 @@ public static Option createReporter(HoodieWriteConfig config, M case CLOUDWATCH: 
reporter = new CloudWatchMetricsReporter(config, registry); break; + case M3: + reporter = new M3MetricsReporter(config, registry); + break; default: LOG.error("Reporter type[" + type + "] is not supported."); break; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterType.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterType.java index 3c8600159287..6d05e443e6b9 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterType.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterType.java @@ -22,5 +22,5 @@ * Types of the reporter supported, hudi also supports user defined reporter. */ public enum MetricsReporterType { - GRAPHITE, INMEMORY, JMX, DATADOG, CONSOLE, PROMETHEUS_PUSHGATEWAY, PROMETHEUS, CLOUDWATCH + GRAPHITE, INMEMORY, JMX, DATADOG, CONSOLE, PROMETHEUS_PUSHGATEWAY, PROMETHEUS, CLOUDWATCH, M3 } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/m3/M3MetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/m3/M3MetricsReporter.java new file mode 100644 index 000000000000..a658476ef754 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/m3/M3MetricsReporter.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.metrics.m3; + +import com.codahale.metrics.MetricRegistry; +import com.uber.m3.tally.m3.M3Reporter; +import com.uber.m3.util.Duration; +import com.uber.m3.util.ImmutableMap; +import com.uber.m3.tally.RootScopeBuilder; +import com.uber.m3.tally.Scope; +import java.net.InetSocketAddress; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metrics.MetricsReporter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Implementation of M3 Metrics reporter, which can report metrics to a https://m3db.io/ service + */ +public class M3MetricsReporter extends MetricsReporter { + + private static final Logger LOG = LoggerFactory.getLogger(M3MetricsReporter.class); + private final HoodieWriteConfig config; + private final MetricRegistry registry; + private final ImmutableMap tags; + + public M3MetricsReporter(HoodieWriteConfig config, MetricRegistry registry) { + this.config = config; + this.registry = registry; + + ImmutableMap.Builder tagBuilder = new ImmutableMap.Builder<>(); + tagBuilder.putAll(parseOptionalTags(config.getM3Tags())); + tagBuilder.put("service", config.getM3Service()); + tagBuilder.put("env", config.getM3Env()); + this.tags = tagBuilder.build(); + LOG.info(String.format("Building M3 Reporter with M3 tags mapping: %s", tags)); + } + + private static Map parseOptionalTags(String tagValueString) { + Map parsedTags = new HashMap(); + if (!tagValueString.isEmpty()) { + Arrays.stream(tagValueString.split(",")).forEach((tagValuePair) -> { + String[] parsedTagValuePair = Arrays.stream(tagValuePair.split("=")) + .map((tagOrValue) -> tagOrValue.trim()).filter((tagOrValue) -> !tagOrValue.isEmpty()) + .toArray(String[]::new); + if (parsedTagValuePair.length != 2) { + throw new RuntimeException(String.format( + "M3 Reporter tags cannot be initialized with tags [%s] due to not being in format `tag=value, . . .`.", + tagValuePair)); + } + parsedTags.put(parsedTagValuePair[0], parsedTagValuePair[1]); + }); + } + return parsedTags; + } + + @Override + public void start() {} + + @Override + public void report() { + /* + Although com.uber.m3.tally.Scope supports automatically submitting metrics in an interval + via a background task, it does not seem to support + - an API for explicitly flushing/emitting all metrics + - Taking in an external com.codahale.metrics.MetricRegistry metrics registry and automatically + adding any new counters/gauges whenever they are added to the registry + Due to this, this implementation emits metrics by creating a Scope, adding all metrics from + the HUDI metircs registry as counters/gauges to the scope, and then closing the Scope. Since + closing this Scope will implicitly flush all M3 metrics, the reporting intervals + are configured to be Integer.MAX_VALUE. 
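As a rough illustration (not part of this patch) of switching the new reporter type on: the generic hoodie.metrics switches are the pre-existing metrics configs, the hoodie.metrics.m3.* keys are the ones defined in HoodieMetricsM3Config above, and the host, env, tag and path values are placeholders.

    Properties props = new Properties();
    props.setProperty("hoodie.metrics.on", "true");
    props.setProperty("hoodie.metrics.reporter.type", "M3");
    props.setProperty("hoodie.metrics.m3.host", "m3-agent.internal");     // placeholder host
    props.setProperty("hoodie.metrics.m3.port", "9052");
    props.setProperty("hoodie.metrics.m3.env", "staging");
    props.setProperty("hoodie.metrics.m3.tags", "team=data,pipeline=ingest");
    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath("/tmp/hudi_table")                                      // placeholder base path
        .withProperties(props)
        .build();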
+ */ + synchronized (this) { + try (Scope scope = new RootScopeBuilder() + .reporter(new M3Reporter.Builder( + new InetSocketAddress(config.getM3ServerHost(), config.getM3ServerPort())) + .includeHost(true).commonTags(tags) + .build()) + .reportEvery(Duration.ofSeconds(Integer.MAX_VALUE)) + .tagged(tags)) { + + M3ScopeReporterAdaptor scopeReporter = new M3ScopeReporterAdaptor(registry, scope); + scopeReporter.start(Integer.MAX_VALUE, TimeUnit.SECONDS); + scopeReporter.report(); + scopeReporter.stop(); + } catch (Exception e) { + LOG.error(String.format("Error reporting metrics to M3: %s", e)); + } + } + } + + @Override + public void stop() {} +} + + + + + + diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/m3/M3ScopeReporterAdaptor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/m3/M3ScopeReporterAdaptor.java new file mode 100644 index 000000000000..ae66914400b9 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/m3/M3ScopeReporterAdaptor.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metrics.m3; + +import com.codahale.metrics.Counter; +import com.codahale.metrics.Gauge; +import com.codahale.metrics.Histogram; +import com.codahale.metrics.Meter; +import com.codahale.metrics.Metered; +import com.codahale.metrics.MetricFilter; +import com.codahale.metrics.MetricRegistry; +import com.codahale.metrics.ScheduledReporter; +import com.codahale.metrics.Snapshot; +import com.codahale.metrics.Timer; +import com.uber.m3.tally.Scope; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.SortedMap; +import java.util.concurrent.TimeUnit; +import org.apache.hudi.common.util.collection.Pair; + +/** + * Implementation of com.codahale.metrics.ScheduledReporter, to emit metrics from + * com.codahale.metrics.MetricRegistry to M3 + */ +public class M3ScopeReporterAdaptor extends ScheduledReporter { + private final Scope scope; + + protected M3ScopeReporterAdaptor(MetricRegistry registry, Scope scope) { + super(registry, "hudi-m3-reporter", MetricFilter.ALL, TimeUnit.SECONDS, TimeUnit.SECONDS); + this.scope = scope; + } + + @Override + public void start(long period, TimeUnit unit) { + } + + @Override + public void stop() { + } + + @Override + public void report(SortedMap gauges, SortedMap counters, + SortedMap histograms, SortedMap meters, + SortedMap timers) { + /* + When reporting, process each com.codahale.metrics metric and add counters & gauges to + the passed-in com.uber.m3.tally.Scope with the same name and value. 
This is needed + for the Scope to register these metrics + */ + report(scope, + gauges, + counters, + histograms, + meters, + timers); + } + + private void report(Scope scope, + Map gauges, + Map counters, + Map histograms, + Map meters, + Map timers) { + + for (Entry entry : gauges.entrySet()) { + scope.gauge(entry.getKey()).update( + ((Number) entry.getValue().getValue()).doubleValue()); + } + + for (Entry entry : counters.entrySet()) { + scope.counter(entry.getKey()).inc( + ((Number) entry.getValue().getCount()).longValue()); + } + + for (Entry entry : histograms.entrySet()) { + scope.gauge(MetricRegistry.name(entry.getKey(), "count")).update( + entry.getValue().getCount()); + reportSnapshot(entry.getKey(), entry.getValue().getSnapshot()); + } + + for (Entry entry : meters.entrySet()) { + reportMetered(entry.getKey(), entry.getValue()); + } + + for (Entry entry : timers.entrySet()) { + reportTimer(entry.getKey(), entry.getValue()); + } + } + + private void reportMetered(String name, Metered meter) { + scope.counter(MetricRegistry.name(name, "count")).inc(meter.getCount()); + List> meterGauges = Arrays.asList( + Pair.of("m1_rate", meter.getOneMinuteRate()), + Pair.of("m5_rate", meter.getFiveMinuteRate()), + Pair.of("m15_rate", meter.getFifteenMinuteRate()), + Pair.of("mean_rate", meter.getMeanRate()) + ); + for (Pair pair : meterGauges) { + scope.gauge(MetricRegistry.name(name, pair.getLeft())).update(pair.getRight()); + } + } + + private void reportSnapshot(String name, Snapshot snapshot) { + List> snapshotGauges = Arrays.asList( + Pair.of("max", snapshot.getMax()), + Pair.of("mean", snapshot.getMean()), + Pair.of("min", snapshot.getMin()), + Pair.of("stddev", snapshot.getStdDev()), + Pair.of("p50", snapshot.getMedian()), + Pair.of("p75", snapshot.get75thPercentile()), + Pair.of("p95", snapshot.get95thPercentile()), + Pair.of("p98", snapshot.get98thPercentile()), + Pair.of("p99", snapshot.get99thPercentile()), + Pair.of("p999", snapshot.get999thPercentile()) + ); + for (Pair pair : snapshotGauges) { + scope.gauge(MetricRegistry.name(name, pair.getLeft())).update(pair.getRight().doubleValue()); + } + } + + private void reportTimer(String name, Timer timer) { + reportMetered(name, timer); + reportSnapshot(name, timer.getSnapshot()); + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java index 080fe5f357d6..aadb0d486857 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -67,14 +67,17 @@ import org.apache.hudi.exception.HoodieInsertException; import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.exception.HoodieUpsertException; +import org.apache.hudi.exception.SchemaCompatibilityException; import org.apache.hudi.hadoop.fs.ConsistencyGuard; import org.apache.hudi.hadoop.fs.ConsistencyGuard.FileVisibility; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.hudi.metadata.MetadataPartitionType; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; +import org.apache.hudi.table.action.commit.HoodieMergeHelper; import 
org.apache.hudi.table.marker.WriteMarkers; import org.apache.hudi.table.marker.WriteMarkersFactory; import org.apache.hudi.table.storage.HoodieLayoutFactory; @@ -121,13 +124,12 @@ * @param Type of outputs */ public abstract class HoodieTable implements Serializable { - private static final Logger LOG = LoggerFactory.getLogger(HoodieTable.class); protected final HoodieWriteConfig config; protected final HoodieTableMetaClient metaClient; protected final HoodieIndex index; - private SerializableConfiguration hadoopConfiguration; + private final SerializableConfiguration hadoopConfiguration; protected final TaskContextSupplier taskContextSupplier; private final HoodieTableMetadata metadata; private final HoodieStorageLayout storageLayout; @@ -146,7 +148,7 @@ protected HoodieTable(HoodieWriteConfig config, HoodieEngineContext context, Hoo .build(); this.metadata = HoodieTableMetadata.create(context, metadataConfig, config.getBasePath()); - this.viewManager = FileSystemViewManager.createViewManager(context, config.getMetadataConfig(), config.getViewStorageConfig(), config.getCommonConfig(), unused -> metadata); + this.viewManager = getViewManager(); this.metaClient = metaClient; this.index = getIndex(config, context); this.storageLayout = getStorageLayout(config); @@ -165,7 +167,7 @@ protected HoodieStorageLayout getStorageLayout(HoodieWriteConfig config) { private synchronized FileSystemViewManager getViewManager() { if (null == viewManager) { - viewManager = FileSystemViewManager.createViewManager(getContext(), config.getMetadataConfig(), config.getViewStorageConfig(), config.getCommonConfig(), unused -> metadata); + viewManager = FileSystemViewManager.createViewManager(getContext(), config.getViewStorageConfig(), config.getCommonConfig(), unused -> metadata); } return viewManager; } @@ -177,8 +179,7 @@ private synchronized FileSystemViewManager getViewManager() { * @param records hoodieRecords to upsert * @return HoodieWriteMetadata */ - public abstract HoodieWriteMetadata upsert(HoodieEngineContext context, String instantTime, - I records); + public abstract HoodieWriteMetadata upsert(HoodieEngineContext context, String instantTime, I records); /** * Insert a batch of new records into Hoodie table at the supplied instantTime. @@ -187,8 +188,7 @@ public abstract HoodieWriteMetadata upsert(HoodieEngineContext context, Strin * @param records hoodieRecords to upsert * @return HoodieWriteMetadata */ - public abstract HoodieWriteMetadata insert(HoodieEngineContext context, String instantTime, - I records); + public abstract HoodieWriteMetadata insert(HoodieEngineContext context, String instantTime, I records); /** * Bulk Insert a batch of new records into Hoodie table at the supplied instantTime. 
@@ -267,7 +267,7 @@ public abstract HoodieWriteMetadata insertPrepped(HoodieEngineContext context * @return HoodieWriteMetadata */ public abstract HoodieWriteMetadata bulkInsertPrepped(HoodieEngineContext context, String instantTime, - I preppedRecords, Option bulkInsertPartitioner); + I preppedRecords, Option bulkInsertPartitioner); /** * Replaces all the existing records and inserts the specified new records into Hoodie table at the supplied instantTime, @@ -291,6 +291,14 @@ public abstract HoodieWriteMetadata bulkInsertPrepped(HoodieEngineContext con */ public abstract HoodieWriteMetadata insertOverwriteTable(HoodieEngineContext context, String instantTime, I records); + /** + * Delete expired partition by config + * @param context HoodieEngineContext + * @param instantTime Instant Time for the action + * @return HoodieWriteMetadata + */ + public abstract HoodieWriteMetadata managePartitionTTL(HoodieEngineContext context, String instantTime); + public HoodieWriteConfig getConfig() { return config; } @@ -564,7 +572,9 @@ public abstract HoodieRollbackMetadata rollback(HoodieEngineContext context, * @param partitionsToIndex List of {@link MetadataPartitionType} that should be indexed. * @return HoodieIndexPlan containing metadata partitions and instant upto which they should be indexed. */ - public abstract Option scheduleIndexing(HoodieEngineContext context, String indexInstantTime, List partitionsToIndex); + public abstract Option scheduleIndexing(HoodieEngineContext context, String indexInstantTime, + List partitionsToIndex, + List partitionPaths); /** * Execute requested index action. @@ -859,8 +869,10 @@ private void validateSchema() throws HoodieUpsertException, HoodieInsertExceptio Schema writerSchema = HoodieAvroUtils.createHoodieWriteSchema(config.getSchema()); Schema tableSchema = HoodieAvroUtils.createHoodieWriteSchema(existingTableSchema.get()); AvroSchemaUtils.checkSchemaCompatible(tableSchema, writerSchema, shouldValidate, allowProjection, getDropPartitionColNames()); + } catch (SchemaCompatibilityException e) { + throw e; } catch (Exception e) { - throw new HoodieException("Failed to read schema/check compatibility for base path " + metaClient.getBasePath(), e); + throw new SchemaCompatibilityException("Failed to read schema/check compatibility for base path " + metaClient.getBasePath(), e); } } @@ -984,8 +996,8 @@ public void deleteMetadataIndexIfNecessary() { if (shouldDeleteMetadataPartition(partitionType)) { try { LOG.info("Deleting metadata partition because it is disabled in writer: " + partitionType.name()); - if (metadataPartitionExists(metaClient.getBasePath(), context, partitionType)) { - deleteMetadataPartition(metaClient.getBasePath(), context, partitionType); + if (metadataPartitionExists(metaClient.getBasePath(), context, partitionType.getPartitionPath())) { + deleteMetadataPartition(metaClient.getBasePath(), context, partitionType.getPartitionPath()); } clearMetadataTablePartitionsConfig(Option.of(partitionType), false); } catch (HoodieMetadataException e) { @@ -1083,4 +1095,12 @@ private Set getDropPartitionColNames() { } return new HashSet<>(Arrays.asList(partitionFields.get())); } + + public void runMerge(HoodieMergeHandle upsertHandle, String instantTime, String fileId) throws IOException { + if (upsertHandle.getOldFilePath() == null) { + throw new HoodieUpsertException("Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); + } else { + HoodieMergeHelper.newInstance().runMerge(this, upsertHandle); + } + } } diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java index 61c0eeeffb0f..c7e294410e3d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java @@ -234,7 +234,7 @@ private HoodieCleanMetadata runClean(HoodieTable table, HoodieInstan throw new HoodieIOException("Failed to clean up after commit", e); } finally { if (!skipLocking) { - this.txnManager.endTransaction(Option.of(inflightInstant)); + this.txnManager.endTransaction(Option.ofNullable(inflightInstant)); } } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanActionExecutor.java index 723a95bb2181..77c96b47f057 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanActionExecutor.java @@ -48,6 +48,7 @@ import java.util.Map; import java.util.stream.Collectors; +import static org.apache.hudi.client.utils.MetadataTableUtils.shouldUseBatchLookup; import static org.apache.hudi.common.util.MapUtils.nonEmpty; import static org.apache.hudi.table.action.clean.CleanPlanner.SAVEPOINTED_TIMESTAMPS; @@ -122,10 +123,15 @@ HoodieCleanerPlan requestClean(HoodieEngineContext context) { Map> cleanOps = new HashMap<>(); List partitionsToDelete = new ArrayList<>(); + boolean shouldUseBatchLookup = shouldUseBatchLookup(table.getMetaClient().getTableConfig(), config); for (int i = 0; i < partitionsToClean.size(); i += cleanerParallelism) { // Handles at most 'cleanerParallelism' number of partitions once at a time to avoid overlarge memory pressure to the timeline server // (remote or local embedded), thus to reduce the risk of an OOM exception. List subPartitionsToClean = partitionsToClean.subList(i, Math.min(i + cleanerParallelism, partitionsToClean.size())); + if (shouldUseBatchLookup) { + LOG.info("Load partitions and files into file system view in advance. Paths: {}", subPartitionsToClean); + table.getHoodieView().loadPartitions(subPartitionsToClean); + } Map>> cleanOpsWithPartitionMeta = context .map(subPartitionsToClean, partitionPathToClean -> Pair.of(partitionPathToClean, planner.getDeletePaths(partitionPathToClean, earliestInstant)), cleanerParallelism) .stream() diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java index 19cbe0f91a73..753f8c8253d5 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java @@ -64,8 +64,6 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.apache.hudi.client.utils.MetadataTableUtils.shouldUseBatchLookup; - /** * Cleaner is responsible for garbage collecting older files in a given partition path. Such that *

@@ -108,14 +106,9 @@ public CleanPlanner(HoodieEngineContext context, HoodieTable hoodieT .map(entry -> Pair.of(new HoodieFileGroupId(entry.getValue().getPartitionPath(), entry.getValue().getFileId()), entry.getValue())) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); - // load all partitions in advance if necessary. - if (shouldUseBatchLookup(hoodieTable.getMetaClient().getTableConfig(), config)) { - LOG.info("Load all partitions and files into file system view in advance."); - fileSystemView.loadAllPartitions(); - } - // collect savepointed timestamps to be assist with incremental cleaning. For non-partitioned and metadata table, we may not need this. - this.savepointedTimestamps = hoodieTable.isMetadataTable() ? Collections.EMPTY_LIST : (hoodieTable.isPartitioned() ? hoodieTable.getSavepointTimestamps().stream().collect(Collectors.toList()) - : Collections.EMPTY_LIST); + // collect savepointed timestamps to assist with incremental cleaning. For non-partitioned and metadata table, we may not need this. + this.savepointedTimestamps = hoodieTable.isMetadataTable() ? Collections.emptyList() : (hoodieTable.isPartitioned() ? new ArrayList<>(hoodieTable.getSavepointTimestamps()) + : Collections.emptyList()); } /** @@ -234,8 +227,8 @@ private List getPartitionPathsForIncrementalCleaning(HoodieCleanMetadata } private List getPartitionsFromDeletedSavepoint(HoodieCleanMetadata cleanMetadata) { - List savepointedTimestampsFromLastClean = Arrays.stream(cleanMetadata.getExtraMetadata() - .getOrDefault(SAVEPOINTED_TIMESTAMPS, StringUtils.EMPTY_STRING).split(",")) + List savepointedTimestampsFromLastClean = cleanMetadata.getExtraMetadata() == null ? Collections.emptyList() + : Arrays.stream(cleanMetadata.getExtraMetadata().getOrDefault(SAVEPOINTED_TIMESTAMPS, StringUtils.EMPTY_STRING).split(",")) .filter(partition -> !StringUtils.isNullOrEmpty(partition)).collect(Collectors.toList()); if (savepointedTimestampsFromLastClean.isEmpty()) { return Collections.emptyList(); @@ -252,6 +245,7 @@ private List getPartitionsFromDeletedSavepoint(HoodieCleanMetadata clean Option instantOption = hoodieTable.getCompletedCommitsTimeline().filter(instant -> instant.getTimestamp().equals(savepointCommit)).firstInstant(); if (!instantOption.isPresent()) { LOG.warn("Skipping to process a commit for which savepoint was removed as the instant moved to archived timeline already"); + return Stream.empty(); } HoodieInstant instant = instantOption.get(); return getPartitionsForInstants(instant); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java index 18c98d377f6c..6cb8f023ba83 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java @@ -57,7 +57,8 @@ public ClusteringPlanActionExecutor(HoodieEngineContext context, protected Option createClusteringPlan() { LOG.info("Checking if clustering needs to be run on " + config.getBasePath()); - Option lastClusteringInstant = table.getActiveTimeline().getLastClusterCommit(); + Option lastClusteringInstant = + table.getActiveTimeline().getLastClusteringInstant(); int commitsSinceLastClustering = table.getActiveTimeline().getCommitsTimeline().filterCompletedInstants() 
.findInstantsAfter(lastClusteringInstant.map(HoodieInstant::getTimestamp).orElse("0"), Integer.MAX_VALUE) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringPlanStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringPlanStrategy.java index 0d07bed531a4..a6894388f6d2 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringPlanStrategy.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ClusteringPlanStrategy.java @@ -54,7 +54,7 @@ public abstract class ClusteringPlanStrategy implements Serializable { public static final int CLUSTERING_PLAN_VERSION_1 = 1; - private final HoodieTable hoodieTable; + protected final HoodieTable hoodieTable; private final transient HoodieEngineContext engineContext; private final HoodieWriteConfig writeConfig; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java index c5f5273fbad9..36f75b6a5b07 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java @@ -263,6 +263,7 @@ protected abstract Iterator> handleUpdate(String partitionPath Iterator> recordItr) throws IOException; protected HoodieWriteMetadata> executeClustering(HoodieClusteringPlan clusteringPlan) { + context.setJobStatus(this.getClass().getSimpleName(), "Clustering records for " + config.getTableName()); HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime); // Mark instant as clustering inflight table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty()); @@ -285,6 +286,7 @@ protected HoodieWriteMetadata> executeClustering(HoodieC writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(clusteringPlan, writeMetadata)); commitOnAutoCommit(writeMetadata); if (!writeMetadata.getCommitMetadata().isPresent()) { + LOG.info("Found empty commit metadata for clustering with instant time " + instantTime); HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(), extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); writeMetadata.setCommitMetadata(Option.of(commitMetadata)); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/ScheduleCompactionActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/ScheduleCompactionActionExecutor.java index 2aac6f0db8ee..941d93fd3506 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/ScheduleCompactionActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/ScheduleCompactionActionExecutor.java @@ -45,7 +45,6 @@ import javax.annotation.Nullable; import java.io.IOException; -import java.text.ParseException; import java.util.Map; import static org.apache.hudi.common.util.CollectionUtils.nonEmpty; @@ -211,12 +210,7 @@ private boolean needCompact(CompactionTriggerStrategy compactionTriggerStrategy) } private Long 
parsedToSeconds(String time) { - long timestamp; - try { - timestamp = HoodieActiveTimeline.parseDateFromInstantTime(time).getTime() / 1000; - } catch (ParseException e) { - throw new HoodieCompactionException(e.getMessage(), e); - } - return timestamp; + return HoodieActiveTimeline.parseDateFromInstantTimeSafely(time).orElseThrow(() -> new HoodieCompactionException("Failed to parse timestamp " + time)) + .getTime() / 1000; } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/RunIndexActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/RunIndexActionExecutor.java index fc0c320b4406..94c4296e470e 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/RunIndexActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/RunIndexActionExecutor.java @@ -51,7 +51,6 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.Locale; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -72,7 +71,6 @@ import static org.apache.hudi.common.table.timeline.HoodieTimeline.ROLLBACK_ACTION; import static org.apache.hudi.config.HoodieWriteConfig.WRITE_CONCURRENCY_MODE; import static org.apache.hudi.metadata.HoodieTableMetadata.getMetadataTableBasePath; -import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_FUNCTIONAL_INDEX_PREFIX; import static org.apache.hudi.metadata.HoodieTableMetadataUtil.deleteMetadataPartition; import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getInflightAndCompletedMetadataPartitions; import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getInflightMetadataPartitions; @@ -220,9 +218,8 @@ private void abort(HoodieInstant indexInstant, Set requestedPartitions) // delete metadata partition requestedPartitions.forEach(partition -> { - MetadataPartitionType partitionType = MetadataPartitionType.valueOf(partition.toUpperCase(Locale.ROOT)); - if (metadataPartitionExists(table.getMetaClient().getBasePathV2().toString(), context, partitionType)) { - deleteMetadataPartition(table.getMetaClient().getBasePathV2().toString(), context, partitionType); + if (metadataPartitionExists(table.getMetaClient().getBasePathV2().toString(), context, partition)) { + deleteMetadataPartition(table.getMetaClient().getBasePathV2().toString(), context, partition); } }); @@ -320,9 +317,7 @@ private static List getCompletedArchivedAndActiveInstantsAfter(St private void updateMetadataPartitionsTableConfig(HoodieTableMetaClient metaClient, Set metadataPartitions) { metadataPartitions.forEach(metadataPartition -> { - MetadataPartitionType partitionType = metadataPartition.startsWith(PARTITION_NAME_FUNCTIONAL_INDEX_PREFIX) ? 
MetadataPartitionType.FUNCTIONAL_INDEX : - MetadataPartitionType.valueOf(metadataPartition.toUpperCase(Locale.ROOT)); - metaClient.getTableConfig().setMetadataPartitionState(metaClient, partitionType, true); + metaClient.getTableConfig().setMetadataPartitionState(metaClient, metadataPartition, true); }); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/ScheduleIndexActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/ScheduleIndexActionExecutor.java index 7b27d7ef6e1c..da85fc4d6340 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/ScheduleIndexActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/ScheduleIndexActionExecutor.java @@ -67,15 +67,20 @@ public class ScheduleIndexActionExecutor extends BaseActionExecutor< private static final Integer LATEST_INDEX_PLAN_VERSION = INDEX_PLAN_VERSION_1; private final List partitionIndexTypes; + + private final List partitionPaths; + private final TransactionManager txnManager; public ScheduleIndexActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, - List partitionIndexTypes) { + List partitionIndexTypes, + List partitionPaths) { super(context, config, table, instantTime); this.partitionIndexTypes = partitionIndexTypes; + this.partitionPaths = partitionPaths; this.txnManager = new TransactionManager(config, table.getMetaClient().getFs()); } @@ -84,8 +89,11 @@ public Option execute() { validateBeforeScheduling(); // make sure that it is idempotent, check with previously pending index operations. Set indexesInflightOrCompleted = getInflightAndCompletedMetadataPartitions(table.getMetaClient().getTableConfig()); + Set requestedPartitions = partitionIndexTypes.stream().map(MetadataPartitionType::getPartitionPath).collect(Collectors.toSet()); + requestedPartitions.addAll(partitionPaths); requestedPartitions.removeAll(indexesInflightOrCompleted); + if (!requestedPartitions.isEmpty()) { LOG.warn(String.format("Following partitions already exist or inflight: %s. 
Going to schedule indexing of only these partitions: %s", indexesInflightOrCompleted, requestedPartitions)); @@ -142,8 +150,8 @@ private void validateBeforeScheduling() { private void abort(HoodieInstant indexInstant) { // delete metadata partition partitionIndexTypes.forEach(partitionType -> { - if (metadataPartitionExists(table.getMetaClient().getBasePath(), context, partitionType)) { - deleteMetadataPartition(table.getMetaClient().getBasePath(), context, partitionType); + if (metadataPartitionExists(table.getMetaClient().getBasePath(), context, partitionType.getPartitionPath())) { + deleteMetadataPartition(table.getMetaClient().getBasePath(), context, partitionType.getPartitionPath()); } }); // delete requested instant diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java index 077f956eb7be..51159d3d5c3a 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java @@ -188,7 +188,7 @@ private void validateRollbackCommitSequence() { if (!instant.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) { return true; } - return !ClusteringUtils.isPendingClusteringInstant(table.getMetaClient(), instant); + return !ClusteringUtils.isClusteringInstant(table.getActiveTimeline(), instant); }).map(HoodieInstant::getTimestamp) .collect(Collectors.toList()); if ((instantTimeToRollback != null) && !inflights.isEmpty() diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RestorePlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RestorePlanActionExecutor.java index b3ee11b9836e..2f9e96859ff6 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RestorePlanActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RestorePlanActionExecutor.java @@ -71,7 +71,7 @@ public Option execute() { // rollback pending clustering instants first before other instants (See HUDI-3362) List pendingClusteringInstantsToRollback = table.getActiveTimeline().filterPendingReplaceTimeline() // filter only clustering related replacecommits (Not insert_overwrite related commits) - .filter(instant -> ClusteringUtils.isPendingClusteringInstant(table.getMetaClient(), instant)) + .filter(instant -> ClusteringUtils.isClusteringInstant(table.getActiveTimeline(), instant)) .getReverseOrderedInstants() .filter(instant -> HoodieActiveTimeline.GREATER_THAN.test(instant.getTimestamp(), savepointToRestoreTimestamp)) .collect(Collectors.toList()); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/HoodiePartitionTTLStrategyFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/HoodiePartitionTTLStrategyFactory.java new file mode 100644 index 000000000000..26bcaa9fe51e --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/HoodiePartitionTTLStrategyFactory.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.ttl.strategy; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieTTLConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.HoodieTable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Locale; + +import static org.apache.hudi.config.HoodieTTLConfig.PARTITION_TTL_STRATEGY_CLASS_NAME; +import static org.apache.hudi.config.HoodieTTLConfig.PARTITION_TTL_STRATEGY_TYPE; + +/** + * Factory help to create {@link PartitionTTLStrategy}. + *

+ * This factory will try {@link HoodieTTLConfig#PARTITION_TTL_STRATEGY_CLASS_NAME} firstly, + * this ensures the class prop will not be overwritten by {@link PartitionTTLStrategyType} + */ +public class HoodiePartitionTTLStrategyFactory { + + private static final Logger LOG = LoggerFactory.getLogger(HoodiePartitionTTLStrategyFactory.class); + + public static PartitionTTLStrategy createStrategy(HoodieTable hoodieTable, TypedProperties props, String instantTime) throws IOException { + String strategyClassName = getPartitionTTLStrategyClassName(props); + try { + return (PartitionTTLStrategy) ReflectionUtils.loadClass(strategyClassName, + new Class[] {HoodieTable.class, String.class}, hoodieTable, instantTime); + } catch (Throwable e) { + throw new IOException("Could not load partition ttl management strategy class " + strategyClassName, e); + } + } + + private static String getPartitionTTLStrategyClassName(TypedProperties props) { + String strategyClassName = + props.getString(PARTITION_TTL_STRATEGY_CLASS_NAME.key(), null); + if (StringUtils.isNullOrEmpty(strategyClassName)) { + String strategyType = props.getString(PARTITION_TTL_STRATEGY_TYPE.key(), + PARTITION_TTL_STRATEGY_TYPE.defaultValue()); + PartitionTTLStrategyType strategyTypeEnum; + try { + strategyTypeEnum = PartitionTTLStrategyType.valueOf(strategyType.toUpperCase(Locale.ROOT)); + } catch (IllegalArgumentException e) { + throw new HoodieException("Unsupported PartitionTTLStrategy Type " + strategyType); + } + strategyClassName = getPartitionTTLStrategyFromType(strategyTypeEnum); + } + return strategyClassName; + } + + /** + * @param type {@link PartitionTTLStrategyType} enum. + * @return The partition ttl management strategy class name based on the {@link PartitionTTLStrategyType}. + */ + public static String getPartitionTTLStrategyFromType(PartitionTTLStrategyType type) { + switch (type) { + case KEEP_BY_TIME: + return KeepByTimeStrategy.class.getName(); + case KEEP_BY_CREATION_TIME: + return KeepByCreationTimeStrategy.class.getName(); + default: + throw new HoodieException("Unsupported PartitionTTLStrategy Type " + type); + } + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/KeepByCreationTimeStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/KeepByCreationTimeStrategy.java new file mode 100644 index 000000000000..a350086f2dcb --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/KeepByCreationTimeStrategy.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.ttl.strategy; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodiePartitionMetadata; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.table.HoodieTable; + +import java.util.List; +import java.util.stream.Collectors; + +/** + * KeepByTimeStrategy will return expired partitions by their lastCommitTime. + */ +public class KeepByCreationTimeStrategy extends KeepByTimeStrategy { + + public KeepByCreationTimeStrategy(HoodieTable hoodieTable, String instantTime) { + super(hoodieTable, instantTime); + } + + @Override + protected List getExpiredPartitionsForTimeStrategy(List partitionPathsForTTL) { + HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); + return partitionPathsForTTL.stream().parallel().filter(part -> { + HoodiePartitionMetadata hoodiePartitionMetadata = + new HoodiePartitionMetadata(metaClient.getFs(), FSUtils.getPartitionPath(metaClient.getBasePath(), part)); + Option instantOption = hoodiePartitionMetadata.readPartitionCreatedCommitTime(); + if (instantOption.isPresent()) { + String instantTime = instantOption.get(); + return isPartitionExpired(instantTime); + } + return false; + }).collect(Collectors.toList()); + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/KeepByTimeStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/KeepByTimeStrategy.java new file mode 100644 index 000000000000..b6d67bb9e8a3 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/KeepByTimeStrategy.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.ttl.strategy; + +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.table.HoodieTable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator.fixInstantTimeCompatibility; +import static org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator.instantTimePlusMillis; + +/** + * KeepByTimeStrategy will return expired partitions by their lastCommitTime. 
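Taken together, a minimal sketch of how these TTL classes are intended to be wired up (illustrative only, not code from this patch): hoodieTable and instantTime are placeholders for a real table handle and action instant, and createStrategy declares IOException.

TypedProperties props = new TypedProperties();
props.setProperty(HoodieTTLConfig.PARTITION_TTL_STRATEGY_TYPE.key(),
    PartitionTTLStrategyType.KEEP_BY_TIME.name());

// Resolves KeepByTimeStrategy via HoodiePartitionTTLStrategyFactory, honoring an
// explicit PARTITION_TTL_STRATEGY_CLASS_NAME first if one is configured.
PartitionTTLStrategy strategy =
    HoodiePartitionTTLStrategyFactory.createStrategy(hoodieTable, props, instantTime);
List<String> expiredPartitions = strategy.getExpiredPartitionPaths();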
+ */ +public class KeepByTimeStrategy extends PartitionTTLStrategy { + + private static final Logger LOG = LoggerFactory.getLogger(KeepByTimeStrategy.class); + + protected final long ttlInMilis; + + public KeepByTimeStrategy(HoodieTable hoodieTable, String instantTime) { + super(hoodieTable, instantTime); + this.ttlInMilis = writeConfig.getPartitionTTLStrategyDaysRetain() * 1000 * 3600 * 24; + } + + @Override + public List getExpiredPartitionPaths() { + Option lastCompletedInstant = hoodieTable.getActiveTimeline().filterCompletedInstants().lastInstant(); + if (!lastCompletedInstant.isPresent() || ttlInMilis <= 0 + || !hoodieTable.getMetaClient().getTableConfig().getPartitionFields().isPresent()) { + return Collections.emptyList(); + } + List expiredPartitions = getExpiredPartitionsForTimeStrategy(getPartitionPathsForTTL()); + int limit = writeConfig.getPartitionTTLMaxPartitionsToDelete(); + LOG.info("Total expired partitions count {}, limit {}", expiredPartitions.size(), limit); + return expiredPartitions.stream() + .limit(limit) // Avoid a single replace commit too large + .collect(Collectors.toList()); + } + + protected List getExpiredPartitionsForTimeStrategy(List partitionsForTTLManagement) { + HoodieTimer timer = HoodieTimer.start(); + Map> lastCommitTimeForPartitions = getLastCommitTimeForPartitions(partitionsForTTLManagement); + LOG.info("Collect last commit time for partitions cost {} ms", timer.endTimer()); + return lastCommitTimeForPartitions.entrySet() + .stream() + .filter(entry -> entry.getValue().isPresent()) + .filter(entry -> isPartitionExpired(entry.getValue().get())) + .map(Map.Entry::getKey) + .collect(Collectors.toList()); + } + + /** + * @param partitionPaths Partitions to collect stats. + */ + private Map> getLastCommitTimeForPartitions(List partitionPaths) { + int statsParallelism = Math.min(partitionPaths.size(), 200); + return hoodieTable.getContext().map(partitionPaths, partitionPath -> { + Option partitionLastModifiedTime = hoodieTable.getHoodieView() + .getLatestFileSlicesBeforeOrOn(partitionPath, instantTime, true) + .map(FileSlice::getBaseInstantTime) + .max(Comparator.naturalOrder()) + .map(Option::ofNullable) + .orElse(Option.empty()); + return Pair.of(partitionPath, partitionLastModifiedTime); + }, statsParallelism).stream().collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + } + + /** + * Determines if a partition's reference time has exceeded its time-to-live (TTL). + *

+ * This method checks if the current time has passed the TTL threshold based on a + * reference time, which could be the creation time or the last commit time of the partition. + * + * @param referenceTime last commit time or creation time for partition + */ + protected boolean isPartitionExpired(String referenceTime) { + String expiredTime = instantTimePlusMillis(fixInstantTimeCompatibility(referenceTime), ttlInMilis); + return fixInstantTimeCompatibility(instantTime).compareTo(expiredTime) > 0; + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/PartitionTTLStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/PartitionTTLStrategy.java new file mode 100644 index 000000000000..477688709303 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/PartitionTTLStrategy.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.ttl.strategy; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; + +/** + * Strategy for partition-level ttl management. + */ +public abstract class PartitionTTLStrategy implements TTLStrategy, Serializable { + + private static final Logger LOG = LoggerFactory.getLogger(PartitionTTLStrategy.class); + + protected final HoodieTable hoodieTable; + protected final HoodieWriteConfig writeConfig; + protected final String instantTime; + + public PartitionTTLStrategy(HoodieTable hoodieTable, String instantTime) { + this.writeConfig = hoodieTable.getConfig(); + this.hoodieTable = hoodieTable; + this.instantTime = instantTime; + } + + /** + * Get expired partition paths for a specific partition ttl strategy. + * + * @return Expired partition paths. + */ + public abstract List getExpiredPartitionPaths(); + + /** + * Scan and list all partitions for partition ttl management. + * + * @return all partitions paths for the dataset. 
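To make the expiry check in KeepByTimeStrategy.isPartitionExpired above concrete, a small worked sketch with made-up values (currentInstantTime stands in for the strategy's instantTime; the helpers are the statically imported ones above):

long ttlInMillis = 10L * 24 * 3600 * 1000;               // 10-day retention
String referenceTime = "20231201000000000";              // partition's last commit time
// Expiry boundary = reference time + TTL, i.e. roughly 20231211000000000 here.
String expiredTime = instantTimePlusMillis(fixInstantTimeCompatibility(referenceTime), ttlInMillis);
// The partition is expired once the current action's instant passes that boundary.
boolean expired = fixInstantTimeCompatibility(currentInstantTime).compareTo(expiredTime) > 0;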
+ */ + protected List getPartitionPathsForTTL() { + String partitionSelected = writeConfig.getClusteringPartitionSelected(); + HoodieTimer timer = HoodieTimer.start(); + List partitionsForTTL; + if (StringUtils.isNullOrEmpty(partitionSelected)) { + // Return All partition paths + partitionsForTTL = FSUtils.getAllPartitionPaths(hoodieTable.getContext(), writeConfig.getMetadataConfig(), writeConfig.getBasePath()); + } else { + partitionsForTTL = Arrays.asList(partitionSelected.split(",")); + } + LOG.info("Get partitions for ttl cost {} ms", timer.endTimer()); + return partitionsForTTL; + } + +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/PartitionTTLStrategyType.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/PartitionTTLStrategyType.java new file mode 100644 index 000000000000..6dfbcd6d0e59 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/PartitionTTLStrategyType.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.ttl.strategy; + +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.keygen.constant.KeyGeneratorType; + +import javax.annotation.Nullable; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.apache.hudi.config.HoodieTTLConfig.PARTITION_TTL_STRATEGY_CLASS_NAME; +import static org.apache.hudi.config.HoodieTTLConfig.PARTITION_TTL_STRATEGY_TYPE; + +/** + * Types of {@link PartitionTTLStrategy}. 
+ */ +public enum PartitionTTLStrategyType { + KEEP_BY_TIME("org.apache.hudi.table.action.ttl.strategy.KeepByTimeStrategy"), + KEEP_BY_CREATION_TIME("org.apache.hudi.table.action.ttl.strategy.KeepByCreationTimeStrategy"); + + private final String className; + + PartitionTTLStrategyType(String className) { + this.className = className; + } + + public String getClassName() { + return className; + } + + public static PartitionTTLStrategyType fromClassName(String className) { + for (PartitionTTLStrategyType type : PartitionTTLStrategyType.values()) { + if (type.getClassName().equals(className)) { + return type; + } + } + throw new IllegalArgumentException("No PartitionTTLStrategyType found for class name: " + className); + } + + public static List getPartitionTTLStrategyNames() { + List names = new ArrayList<>(PartitionTTLStrategyType.values().length); + Arrays.stream(PartitionTTLStrategyType.values()) + .forEach(x -> names.add(x.name())); + return names; + } + + @Nullable + public static String getPartitionTTLStrategyClassName(HoodieConfig config) { + if (config.contains(PARTITION_TTL_STRATEGY_CLASS_NAME)) { + return config.getString(PARTITION_TTL_STRATEGY_CLASS_NAME); + } else if (config.contains(PARTITION_TTL_STRATEGY_TYPE)) { + return KeyGeneratorType.valueOf(config.getString(PARTITION_TTL_STRATEGY_TYPE)).getClassName(); + } + return null; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIncompatibleSchemaException.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/TTLStrategy.java similarity index 71% rename from hudi-common/src/main/java/org/apache/hudi/exception/HoodieIncompatibleSchemaException.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/TTLStrategy.java index a739af67909b..ad41f95fba27 100644 --- a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieIncompatibleSchemaException.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/ttl/strategy/TTLStrategy.java @@ -16,18 +16,10 @@ * limitations under the License. */ -package org.apache.hudi.exception; +package org.apache.hudi.table.action.ttl.strategy; /** - * Exception for incompatible schema. + * Strategy for ttl management. 
*/ -public class HoodieIncompatibleSchemaException extends RuntimeException { - - public HoodieIncompatibleSchemaException(String msg, Throwable e) { - super(msg, e); - } - - public HoodieIncompatibleSchemaException(String msg) { - super(msg); - } +public interface TTLStrategy { } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToFourUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToFourUpgradeHandler.java index c7cb544aec94..edc2d19cf4bc 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToFourUpgradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToFourUpgradeHandler.java @@ -49,7 +49,7 @@ public Map upgrade(HoodieWriteConfig config, HoodieEngin tablePropsToAdd.put(TABLE_CHECKSUM, String.valueOf(HoodieTableConfig.generateChecksum(config.getProps()))); // if metadata is enabled and files partition exist then update TABLE_METADATA_INDEX_COMPLETED // schema for the files partition is same between the two versions - if (config.isMetadataTableEnabled() && metadataPartitionExists(config.getBasePath(), context, MetadataPartitionType.FILES)) { + if (config.isMetadataTableEnabled() && metadataPartitionExists(config.getBasePath(), context, MetadataPartitionType.FILES.getPartitionPath())) { tablePropsToAdd.put(TABLE_METADATA_PARTITIONS, MetadataPartitionType.FILES.getPartitionPath()); } return tablePropsToAdd; diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/HoodieTestCommitGenerator.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/HoodieTestCommitGenerator.java index edb3617ea9ef..296cb162a424 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/HoodieTestCommitGenerator.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/HoodieTestCommitGenerator.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.testutils.InProcessTimeGenerator; @@ -111,7 +112,7 @@ public static void setupTimelineInFS( } public static String getBaseFilename(String instantTime, String fileId) { - return FSUtils.makeBaseFileName(instantTime, BASE_FILE_WRITE_TOKEN, fileId); + return FSUtils.makeBaseFileName(instantTime, BASE_FILE_WRITE_TOKEN, fileId, HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension()); } public static String getLogFilename(String instantTime, String fileId) { diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieArchivedTimeline.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieArchivedTimeline.java new file mode 100644 index 000000000000..664385f9ae06 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieArchivedTimeline.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.table.timeline; + +import org.apache.hudi.DummyActiveAction; +import org.apache.hudi.client.timeline.LSMTimelineWriter; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.engine.LocalTaskContextSupplier; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; + +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +/** + * Test cases for {@link HoodieArchivedTimeline}. + */ +public class TestHoodieArchivedTimeline extends HoodieCommonTestHarness { + + @BeforeEach + public void setUp() throws Exception { + initMetaClient(); + } + + @AfterEach + public void tearDown() throws Exception { + cleanMetaClient(); + } + + @Test + public void testLoadingInstantsIncrementally() throws Exception { + writeArchivedTimeline(10, 10000000); + // now we got 500 instants spread in 5 parquets. 
+ HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline("10000043"); + assertThat(archivedTimeline.firstInstant().map(HoodieInstant::getTimestamp).orElse(""), is("10000043")); + assertThat(archivedTimeline.lastInstant().map(HoodieInstant::getTimestamp).orElse(""), is("10000050")); + // load incrementally + archivedTimeline.reload("10000034"); + assertThat(archivedTimeline.firstInstant().map(HoodieInstant::getTimestamp).orElse(""), is("10000034")); + archivedTimeline.reload("10000011"); + assertThat(archivedTimeline.firstInstant().map(HoodieInstant::getTimestamp).orElse(""), is("10000011")); + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private void writeArchivedTimeline(int batchSize, long startTs) throws Exception { + HoodieTestTable testTable = HoodieTestTable.of(this.metaClient); + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath(this.metaClient.getBasePathV2().toString()) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()) + .withMarkersType("DIRECT") + .build(); + HoodieEngineContext engineContext = new HoodieLocalEngineContext(new Configuration()); + LSMTimelineWriter writer = LSMTimelineWriter.getInstance(writeConfig, new LocalTaskContextSupplier(), metaClient); + List instantBuffer = new ArrayList<>(); + for (int i = 1; i <= 50; i++) { + long instantTimeTs = startTs + i; + String instantTime = String.valueOf(instantTimeTs); + String completionTime = String.valueOf(instantTimeTs + 10); + HoodieInstant instant = new HoodieInstant(HoodieInstant.State.COMPLETED, "commit", instantTime, completionTime); + HoodieCommitMetadata metadata = testTable.createCommitMetadata(instantTime, WriteOperationType.INSERT, Arrays.asList("par1", "par2"), 10, false); + byte[] serializedMetadata = TimelineMetadataUtils.serializeCommitMetadata(metadata).get(); + instantBuffer.add(new DummyActiveAction(instant, serializedMetadata)); + if (i % batchSize == 0) { + // archive 10 instants each time + writer.write(instantBuffer, org.apache.hudi.common.util.Option.empty(), org.apache.hudi.common.util.Option.empty()); + writer.compactAndClean(engineContext); + instantBuffer.clear(); + } + } + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/testutils/HoodieMetadataTestTable.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/testutils/HoodieMetadataTestTable.java index 3bcba72eb684..5024a9f59d2c 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/testutils/HoodieMetadataTestTable.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/common/testutils/HoodieMetadataTestTable.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.testutils; import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; @@ -134,6 +135,16 @@ public HoodieCleanMetadata doClean(String commitTime, Map parti return cleanMetadata; } + @Override + public void repeatClean(String cleanCommitTime, + HoodieCleanerPlan cleanerPlan, + HoodieCleanMetadata cleanMetadata) throws IOException { + super.repeatClean(cleanCommitTime, cleanerPlan, cleanMetadata); + if (writer != null) { + 
writer.update(cleanMetadata, cleanCommitTime); + } + } + public HoodieTestTable addCompaction(String instantTime, HoodieCommitMetadata commitMetadata) throws Exception { super.addCompaction(instantTime, commitMetadata); if (writer != null) { @@ -148,7 +159,6 @@ public HoodieTestTable addRollback(String instantTime, HoodieRollbackMetadata ro if (writer != null) { writer.update(rollbackMetadata, instantTime); } - super.addRollbackCompleted(instantTime, rollbackMetadata, false); return this; } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java index 5c93f924ecef..90fcfd4fd7ae 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java @@ -89,6 +89,7 @@ public void testPropertyLoading(boolean withAlternative) throws IOException { assertEquals(5, config.getMaxCommitsToKeep()); assertEquals(2, config.getMinCommitsToKeep()); assertTrue(config.shouldUseExternalSchemaTransformation()); + assertTrue(config.allowDuplicateInserts()); } @Test diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataWriteUtils.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataWriteUtils.java new file mode 100644 index 000000000000..529d2ddfc7ff --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataWriteUtils.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.metadata; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieWriteConfig; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; + +public class TestHoodieMetadataWriteUtils { + + @Test + public void testCreateMetadataWriteConfigForCleaner() { + HoodieWriteConfig writeConfig1 = HoodieWriteConfig.newBuilder() + .withPath("/tmp") + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) + .retainCommits(5).build()) + .build(); + + HoodieWriteConfig metadataWriteConfig1 = HoodieMetadataWriteUtils.createMetadataWriteConfig(writeConfig1, HoodieFailedWritesCleaningPolicy.EAGER); + assertEquals(HoodieFailedWritesCleaningPolicy.EAGER, metadataWriteConfig1.getFailedWritesCleanPolicy()); + assertEquals(HoodieCleaningPolicy.KEEP_LATEST_COMMITS, metadataWriteConfig1.getCleanerPolicy()); + // default value already greater than data cleaner commits retained * 1.2 + assertEquals(HoodieMetadataConfig.DEFAULT_METADATA_CLEANER_COMMITS_RETAINED, metadataWriteConfig1.getCleanerCommitsRetained()); + + assertNotEquals(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, metadataWriteConfig1.getCleanerPolicy()); + assertNotEquals(HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS, metadataWriteConfig1.getCleanerPolicy()); + + HoodieWriteConfig writeConfig2 = HoodieWriteConfig.newBuilder() + .withPath("/tmp") + .withCleanConfig(HoodieCleanConfig.newBuilder() + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) + .retainCommits(20).build()) + .build(); + HoodieWriteConfig metadataWriteConfig2 = HoodieMetadataWriteUtils.createMetadataWriteConfig(writeConfig2, HoodieFailedWritesCleaningPolicy.EAGER); + assertEquals(HoodieFailedWritesCleaningPolicy.EAGER, metadataWriteConfig2.getFailedWritesCleanPolicy()); + assertEquals(HoodieCleaningPolicy.KEEP_LATEST_COMMITS, metadataWriteConfig2.getCleanerPolicy()); + // data cleaner commits retained * 1.2 is greater than default + assertEquals(24, metadataWriteConfig2.getCleanerCommitsRetained()); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/m3/TestM3Metrics.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/m3/TestM3Metrics.java new file mode 100644 index 000000000000..e7299d706b89 --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/m3/TestM3Metrics.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metrics.m3; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.mockito.Mockito.when; + +import java.util.UUID; +import org.apache.hudi.common.testutils.NetworkTestUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metrics.HoodieMetrics; +import org.apache.hudi.metrics.Metrics; +import org.apache.hudi.metrics.MetricsReporterType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +public class TestM3Metrics { + + @Mock + HoodieWriteConfig config; + HoodieMetrics hoodieMetrics; + Metrics metrics; + + @BeforeEach + public void start() { + when(config.isMetricsOn()).thenReturn(true); + when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.M3); + when(config.getBasePath()).thenReturn("s3://test" + UUID.randomUUID()); + } + + @Test + public void testRegisterGauge() { + when(config.getM3ServerHost()).thenReturn("localhost"); + when(config.getM3ServerPort()).thenReturn(NetworkTestUtils.nextFreePort()); + when(config.getTableName()).thenReturn("raw_table"); + when(config.getM3Env()).thenReturn("dev"); + when(config.getM3Service()).thenReturn("hoodie"); + when(config.getM3Tags()).thenReturn("tag1=value1,tag2=value2"); + when(config.getMetricReporterMetricsNamePrefix()).thenReturn(""); + hoodieMetrics = new HoodieMetrics(config); + metrics = hoodieMetrics.getMetrics(); + metrics.registerGauge("metric1", 123L); + assertEquals("123", metrics.getRegistry().getGauges().get("metric1").getValue().toString()); + metrics.shutdown(); + } + + @Test + public void testEmptyM3Tags() { + when(config.getM3ServerHost()).thenReturn("localhost"); + when(config.getM3ServerPort()).thenReturn(NetworkTestUtils.nextFreePort()); + when(config.getTableName()).thenReturn("raw_table"); + when(config.getM3Env()).thenReturn("dev"); + when(config.getM3Service()).thenReturn("hoodie"); + when(config.getM3Tags()).thenReturn(""); + when(config.getMetricReporterMetricsNamePrefix()).thenReturn(""); + hoodieMetrics = new HoodieMetrics(config); + metrics = hoodieMetrics.getMetrics(); + metrics.registerGauge("metric1", 123L); + assertEquals("123", metrics.getRegistry().getGauges().get("metric1").getValue().toString()); + metrics.shutdown(); + } + + @Test + public void testInvalidM3Tags() { + when(config.getTableName()).thenReturn("raw_table"); + when(config.getMetricReporterMetricsNamePrefix()).thenReturn(""); + assertThrows(RuntimeException.class, () -> { + hoodieMetrics = new HoodieMetrics(config); + }); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/action/TestCleanPlanner.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/action/TestCleanPlanner.java index 4268cc36d474..9989273b723f 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/action/TestCleanPlanner.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/table/action/TestCleanPlanner.java @@ -138,14 +138,14 @@ void testGetDeletePaths(HoodieWriteConfig config, String earliestInstant, List partitionsInLastClean, Map> savepointsTrackedInLastClean, Map> activeInstantsPartitions, - Map> savepoints, List 
expectedPartitions) throws IOException { + Map> savepoints, List expectedPartitions, boolean areCommitsForSavepointsRemoved) throws IOException { HoodieActiveTimeline activeTimeline = mock(HoodieActiveTimeline.class); when(mockHoodieTable.getActiveTimeline()).thenReturn(activeTimeline); // setup savepoint mocks Set savepointTimestamps = savepoints.keySet().stream().collect(Collectors.toSet()); when(mockHoodieTable.getSavepointTimestamps()).thenReturn(savepointTimestamps); if (!savepoints.isEmpty()) { - for (Map.Entry> entry: savepoints.entrySet()) { + for (Map.Entry> entry : savepoints.entrySet()) { Pair> savepointMetadataOptionPair = getSavepointMetadata(entry.getValue()); HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, entry.getKey()); when(activeTimeline.getInstantDetails(instant)).thenReturn(savepointMetadataOptionPair.getRight()); @@ -156,7 +156,7 @@ void testPartitionsForIncrCleaning(HoodieWriteConfig config, String earliestInst Pair> cleanMetadataOptionPair = getCleanCommitMetadata(partitionsInLastClean, lastCleanInstant, earliestInstantsInLastClean, lastCompletedTimeInLastClean, savepointsTrackedInLastClean.keySet()); mockLastCleanCommit(mockHoodieTable, lastCleanInstant, earliestInstantsInLastClean, activeTimeline, cleanMetadataOptionPair); - mockFewActiveInstants(mockHoodieTable, activeInstantsPartitions, savepointsTrackedInLastClean); + mockFewActiveInstants(mockHoodieTable, activeInstantsPartitions, savepointsTrackedInLastClean, areCommitsForSavepointsRemoved); // Trigger clean and validate partitions to clean. CleanPlanner cleanPlanner = new CleanPlanner<>(context, mockHoodieTable, config); @@ -332,7 +332,7 @@ static Stream keepLatestByHoursOrCommitsArgs() { static Stream keepLatestByHoursOrCommitsArgsIncrCleanPartitions() { String earliestInstant = "20231204194919610"; - String earliestInstantPlusTwoDays = "20231206194919610"; + String earliestInstantPlusTwoDays = "20231206194919610"; String lastCleanInstant = earliestInstantPlusTwoDays; String earliestInstantMinusThreeDays = "20231201194919610"; String earliestInstantMinusFourDays = "20231130194919610"; @@ -340,9 +340,9 @@ static Stream keepLatestByHoursOrCommitsArgsIncrCleanPartitions() { String earliestInstantMinusSixDays = "20231128194919610"; String earliestInstantInLastClean = earliestInstantMinusSixDays; String lastCompletedInLastClean = earliestInstantMinusSixDays; - String earliestInstantMinusOneWeek = "20231127194919610"; + String earliestInstantMinusOneWeek = "20231127194919610"; String savepoint2 = earliestInstantMinusOneWeek; - String earliestInstantMinusOneMonth = "20231104194919610"; + String earliestInstantMinusOneMonth = "20231104194919610"; String savepoint3 = earliestInstantMinusOneMonth; List threePartitionsInActiveTimeline = Arrays.asList(PARTITION1, PARTITION2, PARTITION3); @@ -360,66 +360,74 @@ static Stream keepLatestByHoursOrCommitsArgsIncrCleanPartitions() { List arguments = new ArrayList<>(); // no savepoints tracked in last clean and no additional savepoints. 
all partitions in uncleaned instants should be expected - arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanParitionsCases( + arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanPartitionsCases( earliestInstant, lastCompletedInLastClean, lastCleanInstant, earliestInstantInLastClean, Collections.singletonList(PARTITION1), Collections.emptyMap(), - activeInstantsPartitionsMap3, Collections.emptyMap(), threePartitionsInActiveTimeline)); + activeInstantsPartitionsMap3, Collections.emptyMap(), threePartitionsInActiveTimeline, false)); // a new savepoint is added after last clean. but rest of uncleaned touches all partitions, and so all partitions are expected - arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanParitionsCases( + arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanPartitionsCases( earliestInstant, lastCompletedInLastClean, lastCleanInstant, earliestInstantInLastClean, Collections.singletonList(PARTITION1), Collections.emptyMap(), - activeInstantsPartitionsMap3, Collections.singletonMap(savepoint2, Collections.singletonList(PARTITION1)), threePartitionsInActiveTimeline)); + activeInstantsPartitionsMap3, Collections.singletonMap(savepoint2, Collections.singletonList(PARTITION1)), threePartitionsInActiveTimeline, false)); // previous clean tracks a savepoint which exists in timeline still. only 2 partitions are touched by uncleaned instants. only 2 partitions are expected - arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanParitionsCases( + arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanPartitionsCases( earliestInstant, lastCompletedInLastClean, lastCleanInstant, earliestInstantInLastClean, Collections.singletonList(PARTITION1), Collections.singletonMap(savepoint2, Collections.singletonList(PARTITION1)), - activeInstantsPartitionsMap2, Collections.singletonMap(savepoint2, Collections.singletonList(PARTITION1)), twoPartitionsInActiveTimeline)); + activeInstantsPartitionsMap2, Collections.singletonMap(savepoint2, Collections.singletonList(PARTITION1)), twoPartitionsInActiveTimeline, false)); // savepoint tracked in previous clean was removed(touching partition1). latest uncleaned touched 2 other partitions. So, in total 3 partitions are expected. - arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanParitionsCases( + arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanPartitionsCases( earliestInstant, lastCompletedInLastClean, lastCleanInstant, earliestInstantInLastClean, Collections.singletonList(PARTITION1), Collections.singletonMap(savepoint2, Collections.singletonList(PARTITION1)), - activeInstantsPartitionsMap2, Collections.emptyMap(), threePartitionsInActiveTimeline)); + activeInstantsPartitionsMap2, Collections.emptyMap(), threePartitionsInActiveTimeline, false)); // previous savepoint still exists and touches partition1. uncleaned touches only partition2 and partition3. expected partition2 and partition3. 
- arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanParitionsCases( + arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanPartitionsCases( earliestInstant, lastCompletedInLastClean, lastCleanInstant, earliestInstantInLastClean, Collections.singletonList(PARTITION1), Collections.singletonMap(savepoint2, Collections.singletonList(PARTITION1)), - activeInstantsPartitionsMap2, Collections.singletonMap(savepoint2, Collections.singletonList(PARTITION1)), twoPartitionsInActiveTimeline)); + activeInstantsPartitionsMap2, Collections.singletonMap(savepoint2, Collections.singletonList(PARTITION1)), twoPartitionsInActiveTimeline, false)); // a new savepoint was added compared to previous clean. all 2 partitions are expected since uncleaned commits touched just 2 partitions. Map> latestSavepoints = new HashMap<>(); latestSavepoints.put(savepoint2, Collections.singletonList(PARTITION1)); latestSavepoints.put(savepoint3, Collections.singletonList(PARTITION1)); - arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanParitionsCases( + arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanPartitionsCases( earliestInstant, lastCompletedInLastClean, lastCleanInstant, earliestInstantInLastClean, Collections.singletonList(PARTITION1), Collections.singletonMap(savepoint2, Collections.singletonList(PARTITION1)), - activeInstantsPartitionsMap2, latestSavepoints, twoPartitionsInActiveTimeline)); + activeInstantsPartitionsMap2, latestSavepoints, twoPartitionsInActiveTimeline, false)); // 2 savepoints were tracked in previous clean. one of them is removed in latest. A partition which was part of the removed savepoint should be added in final // list of partitions to clean Map> previousSavepoints = new HashMap<>(); latestSavepoints.put(savepoint2, Collections.singletonList(PARTITION1)); latestSavepoints.put(savepoint3, Collections.singletonList(PARTITION2)); - arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanParitionsCases( + arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanPartitionsCases( earliestInstant, lastCompletedInLastClean, lastCleanInstant, earliestInstantInLastClean, Collections.singletonList(PARTITION1), - previousSavepoints, activeInstantsPartitionsMap2, Collections.singletonMap(savepoint3, Collections.singletonList(PARTITION2)), twoPartitionsInActiveTimeline)); + previousSavepoints, activeInstantsPartitionsMap2, Collections.singletonMap(savepoint3, Collections.singletonList(PARTITION2)), twoPartitionsInActiveTimeline, false)); // 2 savepoints were tracked in previous clean. one of them is removed in latest. But a partition part of removed savepoint is already touched by uncleaned commits. // so we expect all 3 partitions to be in final list. - arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanParitionsCases( + arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanPartitionsCases( earliestInstant, lastCompletedInLastClean, lastCleanInstant, earliestInstantInLastClean, Collections.singletonList(PARTITION1), - previousSavepoints, activeInstantsPartitionsMap3, Collections.singletonMap(savepoint3, Collections.singletonList(PARTITION2)), threePartitionsInActiveTimeline)); + previousSavepoints, activeInstantsPartitionsMap3, Collections.singletonMap(savepoint3, Collections.singletonList(PARTITION2)), threePartitionsInActiveTimeline, false)); // unpartitioned test case. savepoint removed. 
List unPartitionsInActiveTimeline = Arrays.asList(StringUtils.EMPTY_STRING); Map> activeInstantsUnPartitionsMap = new HashMap<>(); activeInstantsUnPartitionsMap.put(earliestInstantMinusThreeDays, unPartitionsInActiveTimeline); - arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanParitionsCases( + arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanPartitionsCases( earliestInstant, lastCompletedInLastClean, lastCleanInstant, earliestInstantInLastClean, Collections.singletonList(StringUtils.EMPTY_STRING), Collections.singletonMap(savepoint2, Collections.singletonList(StringUtils.EMPTY_STRING)), - activeInstantsUnPartitionsMap, Collections.emptyMap(), unPartitionsInActiveTimeline)); + activeInstantsUnPartitionsMap, Collections.emptyMap(), unPartitionsInActiveTimeline, false)); + + // savepoint tracked in previous clean was removed(touching partition1). active instants does not have the instant corresponding to the savepoint. + // latest uncleaned touched 2 other partitions. So, in total 2 partitions are expected. + activeInstantsPartitionsMap2.remove(earliestInstantMinusOneWeek); + arguments.addAll(buildArgumentsForCleanByHoursAndCommitsIncrCleanPartitionsCases( + earliestInstant, lastCompletedInLastClean, lastCleanInstant, earliestInstantInLastClean, Collections.singletonList(PARTITION1), + Collections.singletonMap(savepoint2, Collections.singletonList(PARTITION1)), + activeInstantsPartitionsMap2, Collections.emptyMap(), twoPartitionsInActiveTimeline, true)); return arguments.stream(); } @@ -450,19 +458,20 @@ private static List buildArgumentsForCleanByHoursAndCommitsCases(Stri } // helper to build common cases for the two policies - private static List buildArgumentsForCleanByHoursAndCommitsIncrCleanParitionsCases(String earliestInstant, - String latestCompletedInLastClean, - String lastKnownCleanInstantTime, - String earliestInstantInLastClean, - List partitionsInLastClean, - Map> savepointsTrackedInLastClean, - Map> activeInstantsToPartitionsMap, - Map> savepoints, - List expectedPartitions) { + private static List buildArgumentsForCleanByHoursAndCommitsIncrCleanPartitionsCases(String earliestInstant, + String latestCompletedInLastClean, + String lastKnownCleanInstantTime, + String earliestInstantInLastClean, + List partitionsInLastClean, + Map> savepointsTrackedInLastClean, + Map> activeInstantsToPartitionsMap, + Map> savepoints, + List expectedPartitions, + boolean areCommitsForSavepointsRemoved) { return Arrays.asList(Arguments.of(getCleanByHoursConfig(), earliestInstant, latestCompletedInLastClean, lastKnownCleanInstantTime, - earliestInstantInLastClean, partitionsInLastClean, savepointsTrackedInLastClean, activeInstantsToPartitionsMap, savepoints, expectedPartitions), + earliestInstantInLastClean, partitionsInLastClean, savepointsTrackedInLastClean, activeInstantsToPartitionsMap, savepoints, expectedPartitions, areCommitsForSavepointsRemoved), Arguments.of(getCleanByCommitsConfig(), earliestInstant, latestCompletedInLastClean, lastKnownCleanInstantTime, - earliestInstantInLastClean, partitionsInLastClean, savepointsTrackedInLastClean, activeInstantsToPartitionsMap, savepoints, expectedPartitions)); + earliestInstantInLastClean, partitionsInLastClean, savepointsTrackedInLastClean, activeInstantsToPartitionsMap, savepoints, expectedPartitions, areCommitsForSavepointsRemoved)); } private static HoodieFileGroup buildFileGroup(List baseFileCommitTimes) { @@ -507,7 +516,7 @@ private static Pair> getCleanCommitMetadata( extraMetadata.put(SAVEPOINTED_TIMESTAMPS, 
savepointsToTrack.stream().collect(Collectors.joining(","))); } HoodieCleanMetadata cleanMetadata = new HoodieCleanMetadata(instantTime, 100L, 10, earliestCommitToRetain, lastCompletedTime, partitionMetadata, - CLEAN_METADATA_VERSION_2, Collections.EMPTY_MAP, extraMetadata); + CLEAN_METADATA_VERSION_2, Collections.EMPTY_MAP, extraMetadata.isEmpty() ? null : extraMetadata); return Pair.of(cleanMetadata, TimelineMetadataUtils.serializeCleanMetadata(cleanMetadata)); } catch (IOException ex) { throw new UncheckedIOException(ex); @@ -548,14 +557,16 @@ private static void mockLastCleanCommit(HoodieTable hoodieTable, String timestam } private static void mockFewActiveInstants(HoodieTable hoodieTable, Map> activeInstantsToPartitions, - Map> savepointedCommitsToAdd) + Map> savepointedCommitsToAdd, boolean areCommitsForSavepointsRemoved) throws IOException { HoodieDefaultTimeline commitsTimeline = new HoodieDefaultTimeline(); List instants = new ArrayList<>(); Map> instantstoProcess = new HashMap<>(); instantstoProcess.putAll(activeInstantsToPartitions); - instantstoProcess.putAll(savepointedCommitsToAdd); - instantstoProcess.forEach((k,v) -> { + if (!areCommitsForSavepointsRemoved) { + instantstoProcess.putAll(savepointedCommitsToAdd); + } + instantstoProcess.forEach((k, v) -> { HoodieInstant hoodieInstant = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, k); instants.add(hoodieInstant); Map> partitionToWriteStats = new HashMap<>(); diff --git a/hudi-client/hudi-flink-client/pom.xml b/hudi-client/hudi-flink-client/pom.xml index 99a66498c15b..d774078d5d68 100644 --- a/hudi-client/hudi-flink-client/pom.xml +++ b/hudi-client/hudi-flink-client/pom.xml @@ -262,9 +262,6 @@ src/main/resources - - src/test/resources - diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/FlinkSizeBasedClusteringPlanStrategyRecently.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/FlinkSizeBasedClusteringPlanStrategyRecently.java new file mode 100644 index 000000000000..234bd7a90908 --- /dev/null +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/FlinkSizeBasedClusteringPlanStrategyRecently.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
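The TestCleanPlanner changes above exercise the incremental clean path: the partitions to clean are those touched by commits since the last clean, plus the partitions of any savepoint that was tracked by the previous clean but has since been removed; the new areCommitsForSavepointsRemoved flag additionally covers the case where the savepointed commit itself is gone from the active timeline. A small sketch of that selection rule, under the simplifying assumption that the inputs are already resolved to partition lists (the real planner derives them from clean and savepoint metadata):

import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class IncrementalCleanPartitionsSketch {

  // Union of partitions written by uncleaned commits with partitions of savepoints that the
  // previous clean tracked but that no longer exist (their data becomes cleanable again).
  static Set<String> partitionsToClean(Map<String, List<String>> uncleanedInstantToPartitions,
                                       Map<String, List<String>> savepointsTrackedInLastClean,
                                       Set<String> currentSavepoints) {
    Set<String> partitions = new HashSet<>();
    uncleanedInstantToPartitions.values().forEach(partitions::addAll);
    savepointsTrackedInLastClean.forEach((savepointTime, savepointPartitions) -> {
      if (!currentSavepoints.contains(savepointTime)) {
        partitions.addAll(savepointPartitions);
      }
    });
    return partitions;
  }
}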
+ */ + +package org.apache.hudi.client.clustering.plan.strategy; + +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; +import org.apache.hudi.avro.model.HoodieClusteringStrategy; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION; + +/** + * Only take care of partitions related to active timeline, instead of do full partition listing. + */ +public class FlinkSizeBasedClusteringPlanStrategyRecently extends FlinkSizeBasedClusteringPlanStrategy { + private static final Logger LOG = LoggerFactory.getLogger(FlinkSizeBasedClusteringPlanStrategy.class); + public FlinkSizeBasedClusteringPlanStrategyRecently(HoodieTable table, + HoodieEngineContext engineContext, + HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + if (!table.getConfig().getTableType().equals(HoodieTableType.COPY_ON_WRITE)) { + throw new UnsupportedOperationException("FlinkSizeBasedClusteringPlanStrategyRecently only support cow table for now."); + } + } + + @Override + public Option generateClusteringPlan() { + if (!checkPrecondition()) { + return Option.empty(); + } + + HoodieTableMetaClient metaClient = getHoodieTable().getMetaClient(); + LOG.info("Scheduling clustering for " + metaClient.getBasePath()); + + List partitionPaths = getPartitionPathInActiveTimeline(hoodieTable); + + partitionPaths = filterPartitionPaths(partitionPaths); + + if (partitionPaths.isEmpty()) { + // In case no partitions could be picked, return no clustering plan + return Option.empty(); + } + + List clusteringGroups = getEngineContext() + .flatMap( + partitionPaths, partitionPath -> { + List fileSlicesEligible = getFileSlicesEligibleForClustering(partitionPath).collect(Collectors.toList()); + return buildClusteringGroupsForPartition(partitionPath, fileSlicesEligible).limit(getWriteConfig().getClusteringMaxNumGroups()); + }, + partitionPaths.size()) + .stream() + .limit(getWriteConfig().getClusteringMaxNumGroups()) + .collect(Collectors.toList()); + + if (clusteringGroups.isEmpty()) { + LOG.info("No data available to cluster"); + return Option.empty(); + } + + HoodieClusteringStrategy strategy = HoodieClusteringStrategy.newBuilder() + .setStrategyClassName(getWriteConfig().getClusteringExecutionStrategyClass()) + .setStrategyParams(getStrategyParams()) + .build(); + + return Option.of(HoodieClusteringPlan.newBuilder() + .setStrategy(strategy) + .setInputGroups(clusteringGroups) + .setExtraMetadata(getExtraMetadata()) + .setVersion(getPlanVersion()) + .setPreserveHoodieMetadata(true) + .build()); + } + + /** + * Only take care of partitions related to active 
timeline, instead of doing a full partition listing. + @param hoodieTable + @return + */ + private List getPartitionPathInActiveTimeline(HoodieTable>, List, List> hoodieTable) { + HashSet partitions = new HashSet<>(); + HoodieTimeline cowCommitTimeline = hoodieTable.getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(COMMIT_ACTION)).filterCompletedInstants(); + cowCommitTimeline.getInstants().forEach(instant -> { + try { + HoodieCommitMetadata metadata = + HoodieCommitMetadata.fromBytes(cowCommitTimeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class); + partitions.addAll(metadata.getWritePartitionPaths()); + } catch (IOException e) { + // ignore Exception here + LOG.warn("Exception while getting instant details from commit metadata.", e); + } + }); + + LOG.info("Partitions related to active timeline: " + partitions); + return new ArrayList<>(partitions); + } +} \ No newline at end of file diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java index 9ce17cbb8c68..94a127ae9263 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java @@ -32,7 +32,6 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; @@ -172,24 +171,6 @@ protected void commitInternal(String instantTime, Map m.updateSizeMetrics(metadataMetaClient, metadata, dataMetaClient.getTableConfig().getMetadataPartitions())); } - /** - * Validates the timeline for both main and metadata tables to ensure compaction on MDT can be scheduled. - */ - @Override - protected boolean validateTimelineBeforeSchedulingCompaction(Option inFlightInstantTimestamp, String latestDeltaCommitTimeInMetadataTable) { - // Allows compaction of the metadata table to run regardless of inflight instants - return true; - } - - @Override - protected void validateRollback(String commitToRollbackInstantTime, HoodieInstant compactionInstant, HoodieTimeline deltacommitsSinceCompaction) { - // ignore, flink has more radical compression strategy, it is very probably that - // the latest compaction instant has greater timestamp than the instant to roll back. - - // The limitation can be relaxed because the log reader of MDT only accepts valid instants - // based on the DT timeline, so the base file of MDT does not include un-committed instants.
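The new FlinkSizeBasedClusteringPlanStrategyRecently above narrows clustering candidates to partitions written by completed commits on the active timeline instead of listing every partition under the base path. Stripped of the Hudi types, getPartitionPathInActiveTimeline boils down to a union over the write partition paths recorded in each completed commit's metadata, roughly:

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Function;

public class ActiveTimelinePartitionsSketch {

  // completedCommits: metadata of completed COMMIT actions on the active timeline.
  // writePartitionPaths: extracts the partitions each commit wrote to
  // (HoodieCommitMetadata#getWritePartitionPaths in the patch).
  static <M> List<String> partitionsInActiveTimeline(List<M> completedCommits,
                                                     Function<M, List<String>> writePartitionPaths) {
    Set<String> partitions = new HashSet<>();
    for (M commitMetadata : completedCommits) {
      partitions.addAll(writePartitionPaths.apply(commitMetadata));
    }
    return new ArrayList<>(partitions);
  }
}

The trade-off is that only partitions with recent write activity are ever considered, which is exactly what the class-level comment promises.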
- } - @Override public void deletePartitions(String instantTime, List partitions) { throw new HoodieNotSupportedException("Dropping metadata index not supported for Flink metadata table yet."); diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java index 0f73b0bce05d..05779339780c 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java @@ -41,7 +41,6 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieNotSupportedException; -import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.io.HoodieCreateHandle; import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.io.HoodieMergeHandleFactory; @@ -64,7 +63,6 @@ import org.apache.hudi.table.action.commit.FlinkInsertPreppedCommitActionExecutor; import org.apache.hudi.table.action.commit.FlinkUpsertCommitActionExecutor; import org.apache.hudi.table.action.commit.FlinkUpsertPreppedCommitActionExecutor; -import org.apache.hudi.table.action.commit.HoodieMergeHelper; import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor; import org.slf4j.Logger; @@ -378,7 +376,7 @@ public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollb } @Override - public Option scheduleIndexing(HoodieEngineContext context, String indexInstantTime, List partitionsToIndex) { + public Option scheduleIndexing(HoodieEngineContext context, String indexInstantTime, List partitionsToIndex, List partitionPaths) { throw new HoodieNotSupportedException("Metadata indexing is not supported for a Flink table yet."); } @@ -397,6 +395,11 @@ public Option scheduleRestore(HoodieEngineContext context, St throw new HoodieNotSupportedException("Restore is not supported yet"); } + @Override + public HoodieWriteMetadata> managePartitionTTL(HoodieEngineContext context, String instantTime) { + throw new HoodieNotSupportedException("Manage partition ttl is not supported yet"); + } + @Override public HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTimestamp, String savepointToRestoreTimestamp) { throw new HoodieNotSupportedException("Savepoint and restore is not supported yet"); @@ -416,20 +419,8 @@ public Iterator> handleUpdate( protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String instantTime, String fileId) throws IOException { - if (upsertHandle.getOldFilePath() == null) { - throw new HoodieUpsertException( - "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); - } else { - HoodieMergeHelper.newInstance().runMerge(this, upsertHandle); - } - - // TODO(vc): This needs to be revisited - if (upsertHandle.getPartitionPath() == null) { - LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " - + upsertHandle.writeStatuses()); - } - - return Collections.singletonList(upsertHandle.writeStatuses()).iterator(); + runMerge(upsertHandle, instantTime, fileId); + return upsertHandle.getWriteStatusesAsIterator(); } protected HoodieMergeHandle getUpdateHandle(String instantTime, String partitionPath, String fileId, diff --git 
a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java index 732832ae9112..750ef71bc37b 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java @@ -184,20 +184,8 @@ public Iterator> handleUpdate(String partitionPath, String fil protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String fileId) throws IOException { - if (upsertHandle.getOldFilePath() == null) { - throw new HoodieUpsertException( - "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); - } else { - HoodieMergeHelper.newInstance().runMerge(table, upsertHandle); - } - - // TODO(vc): This needs to be revisited - if (upsertHandle.getPartitionPath() == null) { - LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " - + upsertHandle.writeStatuses()); - } - - return Collections.singletonList(upsertHandle.writeStatuses()).iterator(); + table.runMerge(upsertHandle, instantTime, fileId); + return upsertHandle.getWriteStatusesAsIterator(); } @Override diff --git a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestFlinkSizeBasedClusteringPlanStrategy.java b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestFlinkSizeBasedClusteringPlanStrategy.java index 97f12abf322b..50a3233bf370 100644 --- a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestFlinkSizeBasedClusteringPlanStrategy.java +++ b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestFlinkSizeBasedClusteringPlanStrategy.java @@ -25,6 +25,7 @@ import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieFlinkCopyOnWriteTable; @@ -90,7 +91,8 @@ public void testBuildClusteringGroupsForPartitionOnlyOneFile() { private FileSlice generateFileSlice(String partitionPath, String fileId, String baseInstant) { FileSlice fs = new FileSlice(new HoodieFileGroupId(partitionPath, fileId), baseInstant); - fs.setBaseFile(new HoodieBaseFile(FSUtils.makeBaseFileName(baseInstant, "1-0-1", fileId))); + fs.setBaseFile(new HoodieBaseFile(FSUtils.makeBaseFileName(baseInstant, "1-0-1", fileId, + HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension()))); return fs; } } diff --git a/hudi-client/hudi-java-client/pom.xml b/hudi-client/hudi-java-client/pom.xml index 37e4ecbec36d..3fb62e2fa504 100644 --- a/hudi-client/hudi-java-client/pom.xml +++ b/hudi-client/hudi-java-client/pom.xml @@ -151,9 +151,6 @@ src/main/resources - - src/test/resources - diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java index af503e15c608..9a906c7e7e00 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java +++ 
b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java @@ -208,28 +208,6 @@ public List deletePrepped(List> preppedRecords, fin return postWrite(result, instantTime, table); } - @Override - public List postWrite(HoodieWriteMetadata> result, - String instantTime, - HoodieTable hoodieTable) { - if (result.getIndexLookupDuration().isPresent()) { - metrics.updateIndexMetrics(getOperationType().name(), result.getIndexUpdateDuration().get().toMillis()); - } - if (result.isCommitted()) { - // Perform post commit operations. - if (result.getFinalizeDuration().isPresent()) { - metrics.updateFinalizeWriteMetrics(result.getFinalizeDuration().get().toMillis(), - result.getWriteStats().get().size()); - } - - postCommit(hoodieTable, result.getCommitMetadata().get(), instantTime, Option.empty()); - mayBeCleanAndArchive(hoodieTable); - - emitCommitMetrics(instantTime, result.getCommitMetadata().get(), hoodieTable.getMetaClient().getCommitActionType()); - } - return result.getWriteStatuses(); - } - @Override protected void initMetadataTable(Option instantTime) { // Initialize Metadata Table to make sure it's bootstrapped _before_ the operation, diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java index 4c080f2f6635..6e111f67da27 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java @@ -42,7 +42,6 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieNotSupportedException; -import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.io.HoodieCreateHandle; import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.io.HoodieMergeHandleFactory; @@ -55,7 +54,6 @@ import org.apache.hudi.table.action.clean.CleanPlanActionExecutor; import org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor; import org.apache.hudi.table.action.cluster.JavaExecuteClusteringCommitActionExecutor; -import org.apache.hudi.table.action.commit.HoodieMergeHelper; import org.apache.hudi.table.action.commit.JavaBulkInsertCommitActionExecutor; import org.apache.hudi.table.action.commit.JavaBulkInsertPreppedCommitActionExecutor; import org.apache.hudi.table.action.commit.JavaDeleteCommitActionExecutor; @@ -178,6 +176,11 @@ public HoodieWriteMetadata> insertOverwriteTable(HoodieEngineC context, config, this, instantTime, records).execute(); } + @Override + public HoodieWriteMetadata> managePartitionTTL(HoodieEngineContext context, String instantTime) { + throw new HoodieNotSupportedException("Manage partition ttl is not supported yet"); + } + @Override public Option scheduleCompaction(HoodieEngineContext context, String instantTime, @@ -242,8 +245,8 @@ public HoodieRollbackMetadata rollback(HoodieEngineContext context, } @Override - public Option scheduleIndexing(HoodieEngineContext context, String indexInstantTime, List partitionsToIndex) { - return new ScheduleIndexActionExecutor<>(context, config, this, indexInstantTime, partitionsToIndex).execute(); + public Option scheduleIndexing(HoodieEngineContext context, String indexInstantTime, List partitionsToIndex, List partitionPaths) { + return new ScheduleIndexActionExecutor<>(context, config, this, indexInstantTime, 
partitionsToIndex, partitionPaths).execute(); } @Override @@ -285,20 +288,8 @@ public Iterator> handleUpdate( protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String instantTime, String fileId) throws IOException { - if (upsertHandle.getOldFilePath() == null) { - throw new HoodieUpsertException( - "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); - } else { - HoodieMergeHelper.newInstance().runMerge(this, upsertHandle); - } - - // TODO(yihua): This needs to be revisited - if (upsertHandle.getPartitionPath() == null) { - LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " - + upsertHandle.writeStatuses()); - } - - return Collections.singletonList(upsertHandle.writeStatuses()).iterator(); + runMerge(upsertHandle, instantTime, fileId); + return upsertHandle.getWriteStatusesAsIterator(); } protected HoodieMergeHandle getUpdateHandle(String instantTime, String partitionPath, String fileId, diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java index 5ce6bcccef57..84e4316d164d 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java @@ -243,18 +243,8 @@ public Iterator> handleUpdate(String partitionPath, String fil protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String fileId) throws IOException { - if (upsertHandle.getOldFilePath() == null) { - throw new HoodieUpsertException( - "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); - } else { - HoodieMergeHelper.newInstance().runMerge(table, upsertHandle); - } - - List statuses = upsertHandle.writeStatuses(); - if (upsertHandle.getPartitionPath() == null) { - LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " + statuses); - } - return Collections.singletonList(statuses).iterator(); + table.runMerge(upsertHandle, instantTime, fileId); + return upsertHandle.getWriteStatusesAsIterator(); } protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, Iterator> recordItr) { diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/TestJavaHoodieBackedMetadata.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/TestJavaHoodieBackedMetadata.java index d4df65270a86..cc0478d73f53 100644 --- a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/TestJavaHoodieBackedMetadata.java +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/TestJavaHoodieBackedMetadata.java @@ -59,6 +59,7 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; @@ -69,6 +70,7 @@ import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestTable; import 
org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.testutils.InProcessTimeGenerator; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.JsonUtils; import org.apache.hudi.common.util.Option; @@ -110,7 +112,6 @@ import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.Time; import org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.schema.MessageType; import org.junit.jupiter.api.AfterEach; @@ -167,7 +168,6 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; @@ -578,8 +578,8 @@ public void testMetadataTableCompactionWithPendingInstants() throws Exception { doWriteOperation(testTable, "0000007", INSERT); tableMetadata = metadata(writeConfig, context); - // verify that compaction of metadata table does not kick in. - assertFalse(tableMetadata.getLatestCompactionTime().isPresent()); + // verify that compaction of metadata table should kick in. + assertTrue(tableMetadata.getLatestCompactionTime().isPresent(), "Compaction of metadata table does not kick in"); // move inflight to completed testTable.moveInflightCommitToComplete("0000003", inflightCommitMeta); @@ -1051,7 +1051,6 @@ public void testRollbackOperationsNonPartitioned() throws Exception { */ @Test public void testManualRollbacks() throws Exception { - boolean populateMateFields = false; init(COPY_ON_WRITE, false); // Setting to archive more aggressively on the Metadata Table than the Dataset final int maxDeltaCommitsBeforeCompaction = 4; @@ -1082,23 +1081,17 @@ public void testManualRollbacks() throws Exception { } validateMetadata(testTable); - // We can only rollback those commits whose deltacommit have not been archived yet. - int numRollbacks = 0; - boolean exceptionRaised = false; + // We can only roll back those commits whose deltacommit have not been archived yet. List allInstants = metaClient.reloadActiveTimeline().getCommitsTimeline().getReverseOrderedInstants().collect(Collectors.toList()); for (HoodieInstant instantToRollback : allInstants) { try { - testTable.doRollback(instantToRollback.getTimestamp(), String.valueOf(Time.now())); + testTable.doRollback(instantToRollback.getTimestamp(), metaClient.createNewInstantTime()); validateMetadata(testTable); - ++numRollbacks; } catch (HoodieMetadataException e) { // This is expected since we are rolling back commits that are older than the latest compaction on the MDT break; } } - // Since each rollback also creates a deltacommit, we can only support rolling back of half of the original - // instants present before rollback started. 
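One detail in testManualRollbacks above: String.valueOf(Time.now()) is replaced with metaClient.createNewInstantTime() because Hudi instant times are formatted timestamp strings rather than raw epoch millis, and generating them through the meta client keeps them comparable with the rest of the timeline. A rough sketch of the difference, assuming the yyyyMMddHHmmssSSS format seen in the timestamps elsewhere in this patch (e.g. 20231204194919610); the real generator also guards against collisions:

import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;

public class InstantTimeSketch {

  // Assumed instant format, matching the 17-digit timestamps used in the tests above.
  private static final DateTimeFormatter INSTANT_FORMAT =
      DateTimeFormatter.ofPattern("yyyyMMddHHmmssSSS");

  static String newInstantTime() {
    return LocalDateTime.now().format(INSTANT_FORMAT);
  }

  public static void main(String[] args) {
    System.out.println(newInstantTime());            // e.g. 20231204194919610
    System.out.println(System.currentTimeMillis());  // epoch millis, not a valid instant time
  }
}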
- // assertTrue(numRollbacks >= minArchiveCommitsDataset / 2, "Rollbacks of non archived instants should work"); } /** @@ -1178,7 +1171,7 @@ public void testMetadataBootstrapLargeCommitList(HoodieTableType tableType, bool doCompaction(testTable, instantTime5, nonPartitionedDataset); } // added 60s to commitTime6 to make sure it is greater than compaction instant triggered by previous commit - String commitTime6 = metaClient.createNewInstantTime() + + 60000L; + String commitTime6 = HoodieInstantTimeGenerator.instantTimePlusMillis(InProcessTimeGenerator.createNewInstantTime(), 60000L); doWriteOperation(testTable, commitTime6, UPSERT, nonPartitionedDataset); String instantTime7 = metaClient.createNewInstantTime(); doRollback(testTable, commitTime6, instantTime7); @@ -2301,22 +2294,21 @@ public void testErrorCases() throws Exception { @Test public void testMetadataTableWithLongLog() throws Exception { init(COPY_ON_WRITE, false); - final int maxNumDeltacommits = 3; + final int maxNumDeltaCommits = 3; writeConfig = getWriteConfigBuilder(true, true, false) .withMetadataConfig(HoodieMetadataConfig.newBuilder() .enable(true) .enableMetrics(false) - .withMaxNumDeltaCommitsBeforeCompaction(maxNumDeltacommits + 100) - .withMaxNumDeltacommitsWhenPending(maxNumDeltacommits) + .withMaxNumDeltaCommitsBeforeCompaction(maxNumDeltaCommits + 100) + .withMaxNumDeltacommitsWhenPending(maxNumDeltaCommits) .build()).build(); initWriteConfigAndMetatableWriter(writeConfig, true); testTable.addRequestedCommit(String.format("%016d", 0)); - for (int i = 1; i <= maxNumDeltacommits; i++) { + for (int i = 1; i <= maxNumDeltaCommits; i++) { doWriteOperation(testTable, String.format("%016d", i)); } - int instant = maxNumDeltacommits + 1; - Throwable t = assertThrows(HoodieMetadataException.class, () -> doWriteOperation(testTable, String.format("%016d", instant))); - assertTrue(t.getMessage().startsWith(String.format("Metadata table's deltacommits exceeded %d: ", maxNumDeltacommits))); + int instant = maxNumDeltaCommits + 1; + assertDoesNotThrow(() -> doWriteOperation(testTable, String.format("%016d", instant))); } @Test diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/functional/TestHoodieJavaClientOnCopyOnWriteStorage.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/functional/TestHoodieJavaClientOnCopyOnWriteStorage.java index a591134517fc..9749eb55e1ea 100644 --- a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/functional/TestHoodieJavaClientOnCopyOnWriteStorage.java +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/client/functional/TestHoodieJavaClientOnCopyOnWriteStorage.java @@ -50,6 +50,7 @@ import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.IOType; import org.apache.hudi.common.model.WriteConcurrencyMode; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.marker.MarkerType; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; @@ -1526,7 +1527,7 @@ private Pair> testConsistencyCheck(HoodieTableMetaClient Option markerFilePath = WriteMarkersFactory.get( cfg.getMarkersType(), getHoodieTable(metaClient, cfg), instantTime) .create(partitionPath, - FSUtils.makeBaseFileName(instantTime, "1-0-1", UUID.randomUUID().toString()), + FSUtils.makeBaseFileName(instantTime, "1-0-1", UUID.randomUUID().toString(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension()), 
IOType.MERGE); if (!enableOptimisticConsistencyGuard) { Exception e = assertThrows(HoodieCommitException.class, () -> { diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java index bda362931c7d..87e690bfca0c 100644 --- a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java @@ -28,6 +28,7 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestUtils; @@ -105,7 +106,7 @@ public void testMakeNewPath() { }).collect(Collectors.toList()).get(0); assertEquals(newPathWithWriteToken.getKey().toString(), Paths.get(this.basePath, partitionPath, - FSUtils.makeBaseFileName(instantTime, newPathWithWriteToken.getRight(), fileName)).toString()); + FSUtils.makeBaseFileName(instantTime, newPathWithWriteToken.getRight(), fileName, HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension())).toString()); } private HoodieWriteConfig makeHoodieClientConfig() { diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/TestHoodieMetadataBase.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/TestHoodieMetadataBase.java index 336309deb235..8de67be6bf7d 100644 --- a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/TestHoodieMetadataBase.java +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/TestHoodieMetadataBase.java @@ -107,7 +107,7 @@ public void init(HoodieTableType tableType, Option writeConfi initWriteConfigAndMetatableWriter(this.writeConfig, enableMetadataTable); } - protected void initWriteConfigAndMetatableWriter(HoodieWriteConfig writeConfig, boolean enableMetadataTable) throws IOException { + protected void initWriteConfigAndMetatableWriter(HoodieWriteConfig writeConfig, boolean enableMetadataTable) { this.writeConfig = writeConfig; if (enableMetadataTable) { metadataWriter = JavaHoodieBackedTableMetadataWriter.create(hadoopConf, writeConfig, context, Option.empty()); diff --git a/hudi-client/hudi-spark-client/pom.xml b/hudi-client/hudi-spark-client/pom.xml index 34b5ab6e06e6..4e7361d651a8 100644 --- a/hudi-client/hudi-spark-client/pom.xml +++ b/hudi-client/hudi-spark-client/pom.xml @@ -259,9 +259,6 @@ src/main/resources - - src/test/resources - diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java index 6fdfee16bbe0..0308649dbf61 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java @@ -265,6 +265,14 @@ public HoodieWriteResult deletePartitions(List partitions, String instan return new HoodieWriteResult(postWrite(resultRDD, instantTime, table), result.getPartitionToReplaceFileIds()); } + public HoodieWriteResult 
managePartitionTTL(String instantTime) { + HoodieTable>, HoodieData, HoodieData> table = initTable(WriteOperationType.DELETE_PARTITION, Option.ofNullable(instantTime)); + preWrite(instantTime, WriteOperationType.DELETE_PARTITION, table.getMetaClient()); + HoodieWriteMetadata> result = table.managePartitionTTL(context, instantTime); + HoodieWriteMetadata> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses())); + return new HoodieWriteResult(postWrite(resultRDD, instantTime, table), result.getPartitionToReplaceFileIds()); + } + @Override protected void initMetadataTable(Option instantTime) { // Initialize Metadata Table to make sure it's bootstrapped _before_ the operation, diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java index 097e3decc2fb..acbdbd2413c0 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java @@ -551,7 +551,7 @@ public int getBatchSize(int numRegionServersForTable, int maxQpsPerRegionServer, int maxReqPerSec = getMaxReqPerSec(numRSAlive, maxQpsPerRegionServer, qpsFraction); int numTasks = numTasksDuringPut; int maxParallelPutsTask = Math.max(1, Math.min(numTasks, maxExecutors)); - int multiPutBatchSizePerSecPerTask = Math.max(1, (int) Math.ceil(maxReqPerSec / maxParallelPutsTask)); + int multiPutBatchSizePerSecPerTask = Math.max(1, (int) Math.ceil((double) maxReqPerSec / maxParallelPutsTask)); LOG.info("HbaseIndexThrottling: qpsFraction :" + qpsFraction); LOG.info("HbaseIndexThrottling: numRSAlive :" + numRSAlive); LOG.info("HbaseIndexThrottling: maxReqPerSec :" + maxReqPerSec); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileReaderFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileReaderFactory.java index 83fe4e88737b..d06b69139059 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileReaderFactory.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileReaderFactory.java @@ -18,6 +18,7 @@ package org.apache.hudi.io.storage; +import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieIOException; @@ -30,6 +31,7 @@ public class HoodieSparkFileReaderFactory extends HoodieFileReaderFactory { + @Override public HoodieFileReader newParquetFileReader(Configuration conf, Path path) { conf.setIfUnset(SQLConf.PARQUET_BINARY_AS_STRING().key(), SQLConf.PARQUET_BINARY_AS_STRING().defaultValueString()); conf.setIfUnset(SQLConf.PARQUET_INT96_AS_TIMESTAMP().key(), SQLConf.PARQUET_INT96_AS_TIMESTAMP().defaultValueString()); @@ -42,12 +44,15 @@ public HoodieFileReader newParquetFileReader(Configuration conf, Path path) { return new HoodieSparkParquetReader(conf, path); } - protected HoodieFileReader newHFileFileReader(Configuration conf, + @Override + protected HoodieFileReader newHFileFileReader(HoodieConfig hoodieConfig, + Configuration conf, Path path, Option schemaOption) throws IOException { throw new HoodieIOException("Not support read HFile"); } + @Override protected HoodieFileReader newOrcFileReader(Configuration conf, Path path) { throw new HoodieIOException("Not support read orc 
file"); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java index 5feefa3bee2b..7091c2b240f8 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java @@ -34,6 +34,7 @@ import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.spark.sql.HoodieInternalRowUtils; +import org.apache.spark.sql.types.StructType; import java.io.IOException; @@ -44,15 +45,13 @@ protected HoodieFileWriter newParquetFileWriter( String instantTime, Path path, Configuration conf, HoodieConfig config, Schema schema, TaskContextSupplier taskContextSupplier) throws IOException { boolean populateMetaFields = config.getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS); - Option filter = enableBloomFilter(populateMetaFields, config) ? Option.of(createBloomFilter(config)) : Option.empty(); String compressionCodecName = config.getStringOrDefault(HoodieStorageConfig.PARQUET_COMPRESSION_CODEC_NAME); // Support PARQUET_COMPRESSION_CODEC_NAME is "" if (compressionCodecName.isEmpty()) { compressionCodecName = null; } - HoodieRowParquetWriteSupport writeSupport = new HoodieRowParquetWriteSupport(conf, - HoodieInternalRowUtils.getCachedSchema(schema), filter, - HoodieStorageConfig.newBuilder().fromProperties(config.getProps()).build()); + HoodieRowParquetWriteSupport writeSupport = getHoodieRowParquetWriteSupport(conf, schema, + config, enableBloomFilter(populateMetaFields, config)); HoodieRowParquetConfig parquetConfig = new HoodieRowParquetConfig(writeSupport, CompressionCodecName.fromConf(compressionCodecName), config.getIntOrDefault(HoodieStorageConfig.PARQUET_BLOCK_SIZE), @@ -69,10 +68,7 @@ protected HoodieFileWriter newParquetFileWriter( protected HoodieFileWriter newParquetFileWriter( FSDataOutputStream outputStream, Configuration conf, HoodieConfig config, Schema schema) throws IOException { boolean enableBloomFilter = false; - Option filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty(); - HoodieRowParquetWriteSupport writeSupport = new HoodieRowParquetWriteSupport(conf, - HoodieInternalRowUtils.getCachedSchema(schema), filter, - HoodieStorageConfig.newBuilder().fromProperties(config.getProps()).build()); + HoodieRowParquetWriteSupport writeSupport = getHoodieRowParquetWriteSupport(conf, schema, config, enableBloomFilter); String compressionCodecName = config.getStringOrDefault(HoodieStorageConfig.PARQUET_COMPRESSION_CODEC_NAME); // Support PARQUET_COMPRESSION_CODEC_NAME is "" if (compressionCodecName.isEmpty()) { @@ -100,4 +96,11 @@ protected HoodieFileWriter newOrcFileWriter(String instantTime, Path path, Confi TaskContextSupplier taskContextSupplier) throws IOException { throw new HoodieIOException("Not support write to Orc file"); } + + private static HoodieRowParquetWriteSupport getHoodieRowParquetWriteSupport(Configuration conf, Schema schema, + HoodieConfig config, boolean enableBloomFilter) { + Option filter = enableBloomFilter ? 
Option.of(createBloomFilter(config)) : Option.empty(); + StructType structType = HoodieInternalRowUtils.getCachedSchema(schema); + return HoodieRowParquetWriteSupport.getHoodieRowParquetWriteSupport(conf, structType, filter, config); + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java index 8a61c7c44d90..ad362d170142 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieInternalRowFileWriterFactory.java @@ -66,8 +66,8 @@ private static HoodieInternalRowFileWriter newParquetInternalRowFileWriter(Path Option bloomFilterOpt ) throws IOException { - HoodieRowParquetWriteSupport writeSupport = - new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, bloomFilterOpt, writeConfig.getStorageConfig()); + HoodieRowParquetWriteSupport writeSupport = HoodieRowParquetWriteSupport + .getHoodieRowParquetWriteSupport(table.getHadoopConf(), structType, bloomFilterOpt, writeConfig); return new HoodieInternalRowParquetWriter( path, diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java index 3a1b6d000bec..99102c309223 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java @@ -21,8 +21,11 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hudi.avro.HoodieBloomFilterWriteSupport; import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; + import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport; import org.apache.spark.sql.types.StructType; @@ -41,11 +44,11 @@ public class HoodieRowParquetWriteSupport extends ParquetWriteSupport { private final Configuration hadoopConf; private final Option> bloomFilterWriteSupportOpt; - public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, Option bloomFilterOpt, HoodieStorageConfig config) { + public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, Option bloomFilterOpt, HoodieConfig config) { Configuration hadoopConf = new Configuration(conf); - hadoopConf.set("spark.sql.parquet.writeLegacyFormat", config.getString(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED)); - hadoopConf.set("spark.sql.parquet.outputTimestampType", config.getString(HoodieStorageConfig.PARQUET_OUTPUT_TIMESTAMP_TYPE)); - hadoopConf.set("spark.sql.parquet.fieldId.write.enabled", config.getString(PARQUET_FIELD_ID_WRITE_ENABLED)); + hadoopConf.set("spark.sql.parquet.writeLegacyFormat", config.getStringOrDefault(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED)); + hadoopConf.set("spark.sql.parquet.outputTimestampType", config.getStringOrDefault(HoodieStorageConfig.PARQUET_OUTPUT_TIMESTAMP_TYPE)); + 
hadoopConf.set("spark.sql.parquet.fieldId.write.enabled", config.getStringOrDefault(PARQUET_FIELD_ID_WRITE_ENABLED)); setSchema(structType, hadoopConf); this.hadoopConf = hadoopConf; @@ -89,4 +92,12 @@ protected UTF8String dereference(UTF8String key) { } } + public static HoodieRowParquetWriteSupport getHoodieRowParquetWriteSupport(Configuration conf, StructType structType, + Option bloomFilterOpt, HoodieConfig config) { + return (HoodieRowParquetWriteSupport) ReflectionUtils.loadClass( + config.getStringOrDefault(HoodieStorageConfig.HOODIE_PARQUET_SPARK_ROW_WRITE_SUPPORT_CLASS), + new Class[] {Configuration.class, StructType.class, Option.class, HoodieConfig.class}, + conf, structType, bloomFilterOpt, config); + } + } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/factory/HoodieSparkKeyGeneratorFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/factory/HoodieSparkKeyGeneratorFactory.java index 34d22000fb2b..1ea5adcd6b49 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/factory/HoodieSparkKeyGeneratorFactory.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/factory/HoodieSparkKeyGeneratorFactory.java @@ -79,7 +79,7 @@ public class HoodieSparkKeyGeneratorFactory { public static KeyGenerator createKeyGenerator(TypedProperties props) throws IOException { String keyGeneratorClass = getKeyGeneratorClassName(props); - boolean autoRecordKeyGen = KeyGenUtils.enableAutoGenerateRecordKeys(props) + boolean autoRecordKeyGen = KeyGenUtils.isAutoGeneratedRecordKeysEnabled(props) //Need to prevent overwriting the keygen for spark sql merge into because we need to extract //the recordkey from the meta cols if it exists. Sql keygen will use pkless keygen if needed. 
&& !props.getBoolean(SPARK_SQL_MERGE_INTO_PREPPED_KEY, false); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java index e9d21350c212..0a533e659125 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java @@ -30,7 +30,6 @@ import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.utils.SparkPartitionUtils; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.data.HoodieData; @@ -47,7 +46,6 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.exception.HoodieNotSupportedException; -import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.io.HoodieCreateHandle; import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.io.HoodieMergeHandleFactory; @@ -61,7 +59,6 @@ import org.apache.hudi.table.action.clean.CleanPlanActionExecutor; import org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor; import org.apache.hudi.table.action.cluster.SparkExecuteClusteringCommitActionExecutor; -import org.apache.hudi.table.action.commit.HoodieMergeHelper; import org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkBulkInsertPreppedCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkDeleteCommitActionExecutor; @@ -71,6 +68,7 @@ import org.apache.hudi.table.action.commit.SparkInsertOverwriteCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkInsertOverwriteTableCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkInsertPreppedCommitActionExecutor; +import org.apache.hudi.table.action.commit.SparkPartitionTTLActionExecutor; import org.apache.hudi.table.action.commit.SparkUpsertCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkUpsertPreppedCommitActionExecutor; import org.apache.hudi.table.action.index.RunIndexActionExecutor; @@ -226,6 +224,16 @@ public Option scheduleRollback(HoodieEngineContext context, shouldRollbackUsingMarkers, isRestore).execute(); } + /** + * Delete expired partition by config + * @param context HoodieEngineContext + * @param instantTime Instant Time for the action + * @return HoodieWriteMetadata + */ + public HoodieWriteMetadata> managePartitionTTL(HoodieEngineContext context, String instantTime) { + return new SparkPartitionTTLActionExecutor<>(context, config, this, instantTime).execute(); + } + @Override public Iterator> handleUpdate( String instantTime, String partitionPath, String fileId, @@ -237,28 +245,8 @@ public Iterator> handleUpdate( protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String instantTime, String fileId) throws IOException { - if (upsertHandle.getOldFilePath() == null) { - throw new HoodieUpsertException( - "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); - } else { - if (upsertHandle.baseFileForMerge().getBootstrapBaseFile().isPresent()) { - Option partitionFields = 
getMetaClient().getTableConfig().getPartitionFields(); - Object[] partitionValues = SparkPartitionUtils.getPartitionFieldVals(partitionFields, upsertHandle.getPartitionPath(), - getMetaClient().getTableConfig().getBootstrapBasePath().get(), - upsertHandle.getWriterSchema(), getHadoopConf()); - upsertHandle.setPartitionFields(partitionFields); - upsertHandle.setPartitionValues(partitionValues); - } - HoodieMergeHelper.newInstance().runMerge(this, upsertHandle); - } - - // TODO(vc): This needs to be revisited - if (upsertHandle.getPartitionPath() == null) { - LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " - + upsertHandle.writeStatuses()); - } - - return Collections.singletonList(upsertHandle.writeStatuses()).iterator(); + runMerge(upsertHandle, instantTime, fileId); + return upsertHandle.getWriteStatusesAsIterator(); } protected HoodieMergeHandle getUpdateHandle(String instantTime, String partitionPath, String fileId, @@ -299,8 +287,8 @@ public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollb } @Override - public Option scheduleIndexing(HoodieEngineContext context, String indexInstantTime, List partitionsToIndex) { - return new ScheduleIndexActionExecutor<>(context, config, this, indexInstantTime, partitionsToIndex).execute(); + public Option scheduleIndexing(HoodieEngineContext context, String indexInstantTime, List partitionsToIndex, List partitionPaths) { + return new ScheduleIndexActionExecutor<>(context, config, this, indexInstantTime, partitionsToIndex, partitionPaths).execute(); } @Override @@ -322,4 +310,5 @@ public HoodieRestoreMetadata restore(HoodieEngineContext context, String restore public Option scheduleRestore(HoodieEngineContext context, String restoreInstantTimestamp, String savepointToRestoreTimestamp) { return new RestorePlanActionExecutor<>(context, config, this, restoreInstantTimestamp, savepointToRestoreTimestamp).execute(); } + } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java index 08d8a88ae1bf..fc3aba63740e 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java @@ -19,6 +19,7 @@ package org.apache.hudi.table; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.utils.SparkPartitionUtils; import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; @@ -30,12 +31,15 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.SparkHoodieIndexFactory; +import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hadoop.fs.Path; +import org.apache.hudi.table.action.commit.HoodieMergeHelper; import org.apache.spark.TaskContext; import org.apache.spark.TaskContext$; @@ -125,4 +129,22 @@ public Runnable getPreExecuteRunnable() { final TaskContext taskContext = TaskContext.get(); return () 
-> TaskContext$.MODULE$.setTaskContext(taskContext); } + + @Override + public void runMerge(HoodieMergeHandle upsertHandle, String instantTime, String fileId) throws IOException { + if (upsertHandle.getOldFilePath() == null) { + throw new HoodieUpsertException("Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); + } else { + if (upsertHandle.baseFileForMerge().getBootstrapBaseFile().isPresent()) { + Option partitionFields = getMetaClient().getTableConfig().getPartitionFields(); + Object[] partitionValues = SparkPartitionUtils.getPartitionFieldVals(partitionFields, upsertHandle.getPartitionPath(), + getMetaClient().getTableConfig().getBootstrapBasePath().get(), + upsertHandle.getWriterSchema(), getHadoopConf()); + upsertHandle.setPartitionFields(partitionFields); + upsertHandle.setPartitionValues(partitionValues); + } + HoodieMergeHelper.newInstance().runMerge(this, upsertHandle); + } + } + } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BaseBootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BaseBootstrapMetadataHandler.java index 4d6d07c9e498..ffda89d5b7fd 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BaseBootstrapMetadataHandler.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BaseBootstrapMetadataHandler.java @@ -70,7 +70,7 @@ public BootstrapWriteStatus runMetadataBootstrap(String srcPartitionPath, String throw new HoodieException(e.getMessage(), e); } - BootstrapWriteStatus writeStatus = (BootstrapWriteStatus) bootstrapHandle.writeStatuses().get(0); + BootstrapWriteStatus writeStatus = (BootstrapWriteStatus) bootstrapHandle.getWriteStatuses().get(0); BootstrapFileMapping bootstrapFileMapping = new BootstrapFileMapping( config.getBootstrapSourceBasePath(), srcPartitionPath, partitionPath, srcFileStatus, writeStatus.getFileId()); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java index 5e379d3e9561..0fcb2359cdfc 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java @@ -20,7 +20,6 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.clustering.update.strategy.SparkAllowUpdateStrategy; -import org.apache.hudi.client.utils.SparkPartitionUtils; import org.apache.hudi.client.utils.SparkValidatorUtils; import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.data.HoodieData.HoodieDataCacheKey; @@ -349,29 +348,8 @@ public Iterator> handleUpdate(String partitionPath, String fil protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String fileId) throws IOException { - if (upsertHandle.getOldFilePath() == null) { - throw new HoodieUpsertException( - "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId); - } else { - if (upsertHandle.baseFileForMerge().getBootstrapBaseFile().isPresent()) { - Option partitionFields = table.getMetaClient().getTableConfig().getPartitionFields(); - Object[] partitionValues = SparkPartitionUtils.getPartitionFieldVals(partitionFields, 
upsertHandle.getPartitionPath(), - table.getMetaClient().getTableConfig().getBootstrapBasePath().get(), - upsertHandle.getWriterSchema(), table.getHadoopConf()); - upsertHandle.setPartitionFields(partitionFields); - upsertHandle.setPartitionValues(partitionValues); - } - - HoodieMergeHelper.newInstance().runMerge(table, upsertHandle); - } - - // TODO(vc): This needs to be revisited - if (upsertHandle.getPartitionPath() == null) { - LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", " - + upsertHandle.writeStatuses()); - } - - return Collections.singletonList(upsertHandle.writeStatuses()).iterator(); + table.runMerge(upsertHandle, instantTime, fileId); + return upsertHandle.getWriteStatusesAsIterator(); } protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, Iterator> recordItr) { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkPartitionTTLActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkPartitionTTLActionExecutor.java new file mode 100644 index 000000000000..166fc3672fa0 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkPartitionTTLActionExecutor.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.ttl.strategy.HoodiePartitionTTLStrategyFactory; +import org.apache.hudi.table.action.ttl.strategy.PartitionTTLStrategy; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; + +public class SparkPartitionTTLActionExecutor + extends BaseSparkCommitActionExecutor { + + private static final Logger LOG = LoggerFactory.getLogger(ConsistentBucketBulkInsertDataInternalWriterHelper.class); + + public SparkPartitionTTLActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, + String instantTime) { + super(context, config, table, instantTime, WriteOperationType.DELETE_PARTITION); + } + + @Override + public HoodieWriteMetadata> execute() { + try { + PartitionTTLStrategy strategy = HoodiePartitionTTLStrategyFactory.createStrategy(table, config.getProps(), instantTime); + List expiredPartitions = strategy.getExpiredPartitionPaths(); + LOG.info("Partition ttl find the following expired partitions to delete: " + String.join(",", expiredPartitions)); + return new SparkDeletePartitionCommitActionExecutor<>(context, config, table, instantTime, expiredPartitions).execute(); + } catch (IOException e) { + throw new HoodieIOException("Error executing hoodie partition ttl: ", e); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java index edd6d981d185..2b78df96765e 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java @@ -97,7 +97,8 @@ public UpsertPartitioner(WorkloadProfile profile, HoodieEngineContext context, H assignUpdates(profile); assignInserts(profile, context); - LOG.info("Total Buckets: " + totalBuckets); + LOG.info("Total Buckets: {}, bucketInfoMap size: {}, partitionPathToInsertBucketInfos size: {}, updateLocationToBucket size: {}", + totalBuckets, bucketInfoMap.size(), partitionPathToInsertBucketInfos.size(), updateLocationToBucket.size()); if (LOG.isDebugEnabled()) { LOG.debug("Buckets info => " + bucketInfoMap + ", \n" + "Partition to insert buckets => " + partitionPathToInsertBucketInfos + ", \n" @@ -189,6 +190,7 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) this.smallFiles.addAll(smallFiles); + LOG.info("For partitionPath : " + partitionPath + " Total Small Files => " + smallFiles.size()); LOG.debug("For partitionPath : " + partitionPath + " Small Files => " + smallFiles); long totalUnassignedInserts = pStat.getNumInserts(); @@ -230,7 +232,7 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) } int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket); - LOG.debug("After small file assignment: unassignedInserts => " + totalUnassignedInserts + LOG.info("After 
small file assignment: unassignedInserts => " + totalUnassignedInserts + ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " + insertRecordsPerBucket); for (int b = 0; b < insertBuckets; b++) { bucketNumbers.add(totalBuckets); @@ -258,7 +260,7 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) currentCumulativeWeight += bkt.weight; insertBuckets.add(new InsertBucketCumulativeWeightPair(bkt, currentCumulativeWeight)); } - LOG.debug("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets); + LOG.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets); partitionPathToInsertBucketInfos.put(partitionPath, insertBuckets); } if (profile.hasOutputWorkLoadStats()) { diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala index 55877938f8cb..95962d1ca443 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala @@ -23,6 +23,7 @@ import org.apache.avro.generic.GenericRecord import org.apache.avro.{JsonProperties, Schema} import org.apache.hudi.HoodieSparkUtils.sparkAdapter import org.apache.hudi.avro.AvroSchemaUtils +import org.apache.hudi.exception.SchemaCompatibilityException import org.apache.hudi.internal.schema.HoodieSchemaException import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -58,9 +59,16 @@ object AvroConversionUtils { */ def createInternalRowToAvroConverter(rootCatalystType: StructType, rootAvroType: Schema, nullable: Boolean): InternalRow => GenericRecord = { val serializer = sparkAdapter.createAvroSerializer(rootCatalystType, rootAvroType, nullable) - row => serializer - .serialize(row) - .asInstanceOf[GenericRecord] + row => { + try { + serializer + .serialize(row) + .asInstanceOf[GenericRecord] + } catch { + case e: HoodieSchemaException => throw e + case e => throw new SchemaCompatibilityException("Failed to convert spark record into avro record", e) + } + } } /** diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala index 0214b0a10302..3c30d825ebf8 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala @@ -17,6 +17,7 @@ package org.apache.hudi +import org.apache.hudi.HoodieSparkUtils.injectSQLConf import org.apache.hudi.client.WriteStatus import org.apache.hudi.client.model.HoodieInternalRow import org.apache.hudi.common.config.TypedProperties @@ -25,6 +26,7 @@ import org.apache.hudi.common.engine.TaskContextSupplier import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.common.util.ReflectionUtils import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.data.HoodieJavaRDD import org.apache.hudi.exception.HoodieException import org.apache.hudi.index.HoodieIndex.BucketIndexEngineType import org.apache.hudi.index.{HoodieIndex, SparkHoodieIndexFactory} @@ -40,11 +42,14 @@ import org.apache.spark.sql.HoodieUnsafeUtils.getNumPartitions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Alias, 
Literal} import org.apache.spark.sql.catalyst.plans.logical.Project +import org.apache.spark.sql.execution.SQLConfInjectingRDD +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Dataset, HoodieUnsafeUtils, Row} import org.apache.spark.unsafe.types.UTF8String import scala.collection.JavaConverters.{asScalaBufferConverter, seqAsJavaListConverter} +import scala.reflect.ClassTag object HoodieDatasetBulkInsertHelper extends ParallelismHelper[DataFrame](toJavaSerializableFunctionUnchecked(df => getNumPartitions(df))) with Logging { @@ -65,7 +70,7 @@ object HoodieDatasetBulkInsertHelper instantTime: String): Dataset[Row] = { val populateMetaFields = config.populateMetaFields() val schema = df.schema - val autoGenerateRecordKeys = KeyGenUtils.enableAutoGenerateRecordKeys(config.getProps) + val autoGenerateRecordKeys = KeyGenUtils.isAutoGeneratedRecordKeysEnabled(config.getProps) val metaFields = Seq( StructField(HoodieRecord.COMMIT_TIME_METADATA_FIELD, StringType), @@ -83,8 +88,8 @@ object HoodieDatasetBulkInsertHelper val keyGeneratorClassName = config.getStringOrThrow(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME, "Key-generator class name is required") - val prependedRdd: RDD[InternalRow] = - df.queryExecution.toRdd.mapPartitions { iter => + val prependedRdd: RDD[InternalRow] = { + injectSQLConf(df.queryExecution.toRdd.mapPartitions { iter => val typedProps = new TypedProperties(config.getProps) if (autoGenerateRecordKeys) { typedProps.setProperty(KeyGenUtils.RECORD_KEY_GEN_PARTITION_ID_CONFIG, String.valueOf(TaskContext.getPartitionId())) @@ -110,7 +115,8 @@ object HoodieDatasetBulkInsertHelper // TODO use mutable row, avoid re-allocating new HoodieInternalRow(commitTimestamp, commitSeqNo, recordKey, partitionPath, filename, row, false) } - } + }, SQLConf.get) + } val dedupedRdd = if (config.shouldCombineBeforeInsert) { dedupeRows(prependedRdd, updatedSchema, config.getPreCombineField, SparkHoodieIndexFactory.isGlobalIndex(config), targetParallelism) @@ -144,53 +150,53 @@ object HoodieDatasetBulkInsertHelper arePartitionRecordsSorted: Boolean, shouldPreserveHoodieMetadata: Boolean): HoodieData[WriteStatus] = { val schema = dataset.schema - val writeStatuses = dataset.queryExecution.toRdd.mapPartitions(iter => { - val taskContextSupplier: TaskContextSupplier = table.getTaskContextSupplier - val taskPartitionId = taskContextSupplier.getPartitionIdSupplier.get - val taskId = taskContextSupplier.getStageIdSupplier.get.toLong - val taskEpochId = taskContextSupplier.getAttemptIdSupplier.get - - val writer = writeConfig.getIndexType match { - case HoodieIndex.IndexType.BUCKET if writeConfig.getBucketIndexEngineType - == BucketIndexEngineType.CONSISTENT_HASHING => - new ConsistentBucketBulkInsertDataInternalWriterHelper( - table, - writeConfig, - instantTime, - taskPartitionId, - taskId, - taskEpochId, - schema, - writeConfig.populateMetaFields, - arePartitionRecordsSorted, - shouldPreserveHoodieMetadata) - case _ => - new BulkInsertDataInternalWriterHelper( - table, - writeConfig, - instantTime, - taskPartitionId, - taskId, - taskEpochId, - schema, - writeConfig.populateMetaFields, - arePartitionRecordsSorted, - shouldPreserveHoodieMetadata) - } + HoodieJavaRDD.of( + injectSQLConf(dataset.queryExecution.toRdd.mapPartitions(iter => { + val taskContextSupplier: TaskContextSupplier = table.getTaskContextSupplier + val taskPartitionId = taskContextSupplier.getPartitionIdSupplier.get + val taskId = 
taskContextSupplier.getStageIdSupplier.get.toLong + val taskEpochId = taskContextSupplier.getAttemptIdSupplier.get - try { - iter.foreach(writer.write) - } catch { - case t: Throwable => - writer.abort() - throw t - } finally { - writer.close() - } + val writer = writeConfig.getIndexType match { + case HoodieIndex.IndexType.BUCKET if writeConfig.getBucketIndexEngineType + == BucketIndexEngineType.CONSISTENT_HASHING => + new ConsistentBucketBulkInsertDataInternalWriterHelper( + table, + writeConfig, + instantTime, + taskPartitionId, + taskId, + taskEpochId, + schema, + writeConfig.populateMetaFields, + arePartitionRecordsSorted, + shouldPreserveHoodieMetadata) + case _ => + new BulkInsertDataInternalWriterHelper( + table, + writeConfig, + instantTime, + taskPartitionId, + taskId, + taskEpochId, + schema, + writeConfig.populateMetaFields, + arePartitionRecordsSorted, + shouldPreserveHoodieMetadata) + } + + try { + iter.foreach(writer.write) + } catch { + case t: Throwable => + writer.abort() + throw t + } finally { + writer.close() + } - writer.getWriteStatuses.asScala.iterator - }).collect() - table.getContext.parallelize(writeStatuses.toList.asJava) + writer.getWriteStatuses.asScala.iterator + }), SQLConf.get).toJavaRDD()) } private def dedupeRows(rdd: RDD[InternalRow], schema: StructType, preCombineFieldRef: String, isGlobalIndex: Boolean, targetParallelism: Int): RDD[InternalRow] = { diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala index 975135c13d58..3393da6bd83c 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala @@ -18,25 +18,25 @@ package org.apache.hudi +import org.apache.avro.Schema +import org.apache.avro.generic.GenericRecord +import org.apache.hadoop.fs.Path import org.apache.hudi.HoodieConversionUtils.toScalaOption import org.apache.hudi.avro.{AvroSchemaUtils, HoodieAvroUtils} import org.apache.hudi.client.utils.SparkRowSerDe import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.hadoop.fs.CachingPath - -import org.apache.avro.Schema -import org.apache.avro.generic.GenericRecord -import org.apache.hadoop.fs.Path +import org.apache.hudi.util.ExceptionWrappingIterator import org.apache.spark.SPARK_VERSION import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD -import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils.getTimeZone import org.apache.spark.sql.execution.SQLConfInjectingRDD import org.apache.spark.sql.execution.datasources.SparkParsePartitionUtil import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, HoodieUnsafeUtils} import org.apache.spark.unsafe.types.UTF8String import scala.collection.JavaConverters._ @@ -128,9 +128,19 @@ object HoodieSparkUtils extends SparkAdapterSupport with SparkVersionsSupport wi }, SQLConf.get) } - private def injectSQLConf[T: ClassTag](rdd: RDD[T], conf: SQLConf): RDD[T] = + def injectSQLConf[T: ClassTag](rdd: RDD[T], conf: SQLConf): RDD[T] = new SQLConfInjectingRDD(rdd, conf) + def maybeWrapDataFrameWithException(df: DataFrame, exceptionClass: String, msg: String, shouldWrap: Boolean): DataFrame = { + if (shouldWrap) { + 
HoodieUnsafeUtils.createDataFrameFromRDD(df.sparkSession, injectSQLConf(df.queryExecution.toRdd.mapPartitions { + rows => new ExceptionWrappingIterator[InternalRow](rows, exceptionClass, msg) + }, SQLConf.get), df.schema) + } else { + df + } + } + def safeCreateRDD(df: DataFrame, structName: String, recordNamespace: String, reconcileToLatestSchema: Boolean, latestTableSchema: org.apache.hudi.common.util.Option[Schema] = org.apache.hudi.common.util.Option.empty()): Tuple2[RDD[GenericRecord], RDD[String]] = { @@ -213,7 +223,7 @@ object HoodieSparkUtils extends SparkAdapterSupport with SparkVersionsSupport wi val transform: GenericRecord => Either[GenericRecord, String] = record => try { Left(HoodieAvroUtils.rewriteRecordDeep(record, schema, true)) } catch { - case _: Throwable => Right(HoodieAvroUtils.avroToJsonString(record, false)) + case _: Throwable => Right(HoodieAvroUtils.safeAvroToJsonString(record)) } recs.map(transform) } diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/ExceptionWrappingIterator.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/ExceptionWrappingIterator.scala new file mode 100644 index 000000000000..994e6f0eea2d --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/ExceptionWrappingIterator.scala @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.util + +import org.apache.hudi.common.util.ReflectionUtils + +/** + * Used to catch exceptions from an iterator + * @param in iterator to catch exceptions from + * @param exceptionClass name of exception class to throw when an exception is thrown during iteration + * @param msg message the thrown exception should have + */ +class ExceptionWrappingIterator[T](val in: Iterator[T], val exceptionClass: String, val msg: String) extends Iterator[T] { + override def hasNext: Boolean = try in.hasNext + catch { + case e: Throwable => throw createException(e) + } + + override def next: T = try in.next + catch { + case e: Throwable => throw createException(e) + } + + private def createException(e: Throwable): Throwable = { + ReflectionUtils.loadClass(exceptionClass, Array(classOf[String], classOf[Throwable]).asInstanceOf[Array[Class[_]]], msg, e).asInstanceOf[Throwable] + } +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystPlansUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystPlansUtils.scala index b9110f1ed93b..40e62ddd0efc 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystPlansUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystPlansUtils.scala @@ -144,7 +144,7 @@ trait HoodieCatalystPlansUtils { def createMITJoin(left: LogicalPlan, right: LogicalPlan, joinType: JoinType, condition: Option[Expression], hint: String): LogicalPlan /** - * true if both plans produce the same attributes in the the same order + * true if both plans produce the same attributes in the same order */ def produceSameOutput(a: LogicalPlan, b: LogicalPlan): Boolean } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestPartitionTTLManagement.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestPartitionTTLManagement.java new file mode 100644 index 000000000000..46677c9aaa75 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestPartitionTTLManagement.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.client; + +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageType; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieTTLConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.table.action.ttl.strategy.PartitionTTLStrategyType; +import org.apache.hudi.testutils.HoodieClientTestBase; +import org.apache.hudi.testutils.HoodieMergeOnReadTestUtils; + +import com.github.davidmoten.guavamini.Sets; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.mapred.JobConf; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.getCommitTimeAtUTC; + +/** + * Test Cases for partition ttl management. + */ +public class TestPartitionTTLManagement extends HoodieClientTestBase { + + protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) { + return HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2) + .withAutoCommit(autoCommit) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder() + .hfileMaxFileSize(1024 * 1024 * 1024).parquetMaxFileSize(1024 * 1024 * 1024).orcMaxFileSize(1024 * 1024 * 1024).build()) + .forTable("test-trip-table") + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build()); + } + + @Test + public void testKeepByCreationTime() { + final HoodieWriteConfig cfg = getConfigBuilder(true) + .withPath(metaClient.getBasePathV2().toString()) + .withTTLConfig(HoodieTTLConfig + .newBuilder() + .withTTLDaysRetain(10) + .withTTLStrategyType(PartitionTTLStrategyType.KEEP_BY_CREATION_TIME) + .build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().build()) + .build(); + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(0xDEED); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + String partitionPath0 = dataGen.getPartitionPaths()[0]; + String instant0 = getCommitTimeAtUTC(0); + writeRecordsForPartition(client, dataGen, partitionPath0, instant0); + + String instant1 = getCommitTimeAtUTC(1000); + String partitionPath1 = dataGen.getPartitionPaths()[1]; + writeRecordsForPartition(client, dataGen, partitionPath1, instant1); + + String currentInstant = client.createNewInstantTime(); + String partitionPath2 = dataGen.getPartitionPaths()[2]; + 
writeRecordsForPartition(client, dataGen, partitionPath2, currentInstant); + + HoodieWriteResult result = client.managePartitionTTL(client.createNewInstantTime()); + + Assertions.assertEquals(Sets.newHashSet(partitionPath0, partitionPath1), result.getPartitionToReplaceFileIds().keySet()); + Assertions.assertEquals(10, readRecords(new String[] {partitionPath0, partitionPath1, partitionPath2}).size()); + } + } + + @Test + public void testKeepByTime() { + final HoodieWriteConfig cfg = getConfigBuilder(true) + .withPath(metaClient.getBasePathV2().toString()) + .withTTLConfig(HoodieTTLConfig + .newBuilder() + .withTTLDaysRetain(10) + .withTTLStrategyType(PartitionTTLStrategyType.KEEP_BY_TIME) + .build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().build()) + .build(); + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(0xDEED); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + String partitionPath0 = dataGen.getPartitionPaths()[0]; + String instant0 = getCommitTimeAtUTC(0); + writeRecordsForPartition(client, dataGen, partitionPath0, instant0); + + String instant1 = getCommitTimeAtUTC(1000); + String partitionPath1 = dataGen.getPartitionPaths()[1]; + writeRecordsForPartition(client, dataGen, partitionPath1, instant1); + + String currentInstant = client.createNewInstantTime(); + String partitionPath2 = dataGen.getPartitionPaths()[2]; + writeRecordsForPartition(client, dataGen, partitionPath2, currentInstant); + + HoodieWriteResult result = client.managePartitionTTL(client.createNewInstantTime()); + + Assertions.assertEquals(Sets.newHashSet(partitionPath0, partitionPath1), result.getPartitionToReplaceFileIds().keySet()); + + // remain 10 rows + Assertions.assertEquals(10, readRecords(new String[] {partitionPath0, partitionPath1, partitionPath2}).size()); + } + } + + @Test + public void testInlinePartitionTTL() { + final HoodieWriteConfig cfg = getConfigBuilder(true) + .withPath(metaClient.getBasePathV2().toString()) + .withTTLConfig(HoodieTTLConfig + .newBuilder() + .withTTLDaysRetain(10) + .withTTLStrategyType(PartitionTTLStrategyType.KEEP_BY_TIME) + .enableInlinePartitionTTL(true) + .build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().build()) + .build(); + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(0xDEED); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + String partitionPath0 = dataGen.getPartitionPaths()[0]; + String instant0 = getCommitTimeAtUTC(0); + writeRecordsForPartition(client, dataGen, partitionPath0, instant0); + + // All records will be deleted + Assertions.assertEquals(0, readRecords(new String[] {partitionPath0}).size()); + + String instant1 = getCommitTimeAtUTC(1000); + String partitionPath1 = dataGen.getPartitionPaths()[1]; + writeRecordsForPartition(client, dataGen, partitionPath1, instant1); + + // All records will be deleted + Assertions.assertEquals(0, readRecords(new String[] {partitionPath1}).size()); + + String currentInstant = client.createNewInstantTime(); + String partitionPath2 = dataGen.getPartitionPaths()[2]; + writeRecordsForPartition(client, dataGen, partitionPath2, currentInstant); + + // remain 10 rows + Assertions.assertEquals(10, readRecords(new String[] {partitionPath2}).size()); + } + } + + private void writeRecordsForPartition(SparkRDDWriteClient client, HoodieTestDataGenerator dataGen, String partition, String instantTime) { + List records = dataGen.generateInsertsForPartition(instantTime, 10, partition); + client.startCommitWithTime(instantTime); + JavaRDD 
writeStatuses = client.insert(jsc.parallelize(records, 1), instantTime); + client.commit(instantTime, writeStatuses); + } + + private List readRecords(String[] partitions) { + return HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, + Arrays.stream(partitions).map(p -> Paths.get(basePath, p).toString()).collect(Collectors.toList()), + basePath, new JobConf(hadoopConf), true, false); + } + +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java index 32264bbf35fa..a5e4e6e2ee96 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java @@ -62,6 +62,7 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; @@ -119,7 +120,6 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.Time; import org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.schema.MessageType; import org.apache.spark.api.java.JavaRDD; @@ -167,6 +167,7 @@ import static org.apache.hudi.common.model.WriteOperationType.INSERT; import static org.apache.hudi.common.model.WriteOperationType.UPSERT; import static org.apache.hudi.common.table.HoodieTableMetaClient.METAFOLDER_NAME; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION; import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_EXTENSION; import static org.apache.hudi.common.table.timeline.HoodieTimeline.DELTA_COMMIT_EXTENSION; import static org.apache.hudi.common.table.timeline.HoodieTimeline.INFLIGHT_EXTENSION; @@ -824,6 +825,43 @@ public void testVirtualKeysInBaseFiles() throws Exception { }); } + /** + * Test MDT compaction with delayed pending instants on DT(induced by multi-writer or async table services). + * + *

A demo: + *

+   *   Time t1
+   *
+   *   Main                 Metadata
+   *   c1.commit    ->   c1.deltacommit
+   *   c2.commit.   ->   c2.deltacommit
+   *   c3.inflight  ->
+   *   c4.commit    ->   c4.deltacommit
+   *
+   *   c5.requested ->   c6.compaction
+   *
+   *
+   *   MDT files:
+   *   F1.c1 (base) -> log(c2) -> log(c4)
+   *   F1.c6 (base)
+   *
+   *
+   *   Time t2 (Now c3 is completed)
+   *
+   *   Main                 Metadata
+   *   c1.commit    ->   c1.deltacommit
+   *   c2.commit.   ->   c2.deltacommit
+   *   c3.commit    ->   c3.deltacommit
+   *   c4.commit    ->   c4.deltacommit
+   *
+   *   c5.requested -> c6.compaction
+   *
+   *
+   *   MDT files:
+   *   F1.c1 (base) -> log(c2) -> log(c4)
+   *   F1.c6 (base) -> log(c3)
+   * 
+ */ @Test public void testMetadataTableCompactionWithPendingInstants() throws Exception { init(COPY_ON_WRITE, false); @@ -853,8 +891,8 @@ public void testMetadataTableCompactionWithPendingInstants() throws Exception { doWriteOperation(testTable, "0000007", INSERT); tableMetadata = metadata(writeConfig, context); - // verify that compaction of metadata table does not kick in. - assertFalse(tableMetadata.getLatestCompactionTime().isPresent()); + // verify that compaction of metadata table should kick in. + assertTrue(tableMetadata.getLatestCompactionTime().isPresent(), "Compaction of metadata table should kick in"); // move inflight to completed testTable.moveInflightCommitToComplete("0000003", inflightCommitMeta); @@ -1098,7 +1136,7 @@ private void revertTableToInflightState(HoodieWriteConfig writeConfig) throws IO // Transition the second init commit for record_index partition to inflight in MDT deleteMetaFile(metaClient.getFs(), mdtBasePath, mdtInitCommit2, DELTA_COMMIT_EXTENSION); metaClient.getTableConfig().setMetadataPartitionState( - metaClient, MetadataPartitionType.RECORD_INDEX, false); + metaClient, MetadataPartitionType.RECORD_INDEX.getPartitionPath(), false); metaClient.getTableConfig().setMetadataPartitionsInflight( metaClient, MetadataPartitionType.RECORD_INDEX); timeline = metaClient.getActiveTimeline().reload(); @@ -1445,7 +1483,6 @@ public void testRollbackOperationsNonPartitioned() throws Exception { */ @Test public void testManualRollbacks() throws Exception { - boolean populateMateFields = false; init(COPY_ON_WRITE, false); // Setting to archive more aggressively on the Metadata Table than the Dataset final int maxDeltaCommitsBeforeCompaction = 4; @@ -1476,23 +1513,17 @@ public void testManualRollbacks() throws Exception { } validateMetadata(testTable); - // We can only rollback those commits whose deltacommit have not been archived yet. - int numRollbacks = 0; - boolean exceptionRaised = false; + // We can only roll back those commits whose deltacommit have not been archived yet. List allInstants = metaClient.reloadActiveTimeline().getCommitsTimeline().getReverseOrderedInstants().collect(Collectors.toList()); for (HoodieInstant instantToRollback : allInstants) { try { - testTable.doRollback(instantToRollback.getTimestamp(), String.valueOf(Time.now())); + testTable.doRollback(instantToRollback.getTimestamp(), metaClient.createNewInstantTime()); validateMetadata(testTable); - ++numRollbacks; } catch (HoodieMetadataException e) { // This is expected since we are rolling back commits that are older than the latest compaction on the MDT break; } } - // Since each rollback also creates a deltacommit, we can only support rolling back of half of the original - // instants present before rollback started. 
- // assertTrue(numRollbacks >= minArchiveCommitsDataset / 2, "Rollbacks of non archived instants should work"); } /** @@ -1559,23 +1590,18 @@ public void testMetadataBootstrapLargeCommitList(HoodieTableType tableType, bool testTable.setNonPartitioned(); } for (int i = 1; i < 25; i += 7) { - long commitTime1 = Long.parseLong(InProcessTimeGenerator.createNewInstantTime()); - long commitTime2 = Long.parseLong(InProcessTimeGenerator.createNewInstantTime()); - long commitTime3 = Long.parseLong(InProcessTimeGenerator.createNewInstantTime()); - long commitTime4 = Long.parseLong(InProcessTimeGenerator.createNewInstantTime()); - doWriteOperation(testTable, Long.toString(commitTime1), INSERT, nonPartitionedDataset); - doWriteOperation(testTable, Long.toString(commitTime2), UPSERT, nonPartitionedDataset); - doClean(testTable, Long.toString(commitTime3), Arrays.asList(Long.toString(commitTime1))); - doWriteOperation(testTable, Long.toString(commitTime4), UPSERT, nonPartitionedDataset); + String commitTime1 = InProcessTimeGenerator.createNewInstantTime(); + doWriteOperation(testTable, commitTime1, INSERT, nonPartitionedDataset); + doWriteOperation(testTable, InProcessTimeGenerator.createNewInstantTime(), UPSERT, nonPartitionedDataset); + doClean(testTable, InProcessTimeGenerator.createNewInstantTime(), Collections.singletonList(commitTime1)); + doWriteOperation(testTable, InProcessTimeGenerator.createNewInstantTime(), UPSERT, nonPartitionedDataset); if (tableType == MERGE_ON_READ) { - long commitTime5 = Long.parseLong(InProcessTimeGenerator.createNewInstantTime()); - doCompaction(testTable, Long.toString(commitTime5), nonPartitionedDataset); + doCompaction(testTable, InProcessTimeGenerator.createNewInstantTime(), nonPartitionedDataset); } // added 60s to commitTime6 to make sure it is greater than compaction instant triggered by previous commit - long commitTime6 = Long.parseLong(InProcessTimeGenerator.createNewInstantTime()) + 60000L; - doWriteOperation(testTable, Long.toString(commitTime6), UPSERT, nonPartitionedDataset); - long commitTime7 = Long.parseLong(InProcessTimeGenerator.createNewInstantTime()); - doRollback(testTable, Long.toString(commitTime6), Long.toString(commitTime7)); + String commitTime6 = HoodieInstantTimeGenerator.instantTimePlusMillis(InProcessTimeGenerator.createNewInstantTime(), 60000L); + doWriteOperation(testTable, commitTime6, UPSERT, nonPartitionedDataset); + doRollback(testTable, commitTime6, InProcessTimeGenerator.createNewInstantTime()); } validateMetadata(testTable, emptyList(), nonPartitionedDataset); } @@ -2883,22 +2909,54 @@ public void testErrorCases() throws Exception { @Test public void testMetadataTableWithLongLog() throws Exception { init(COPY_ON_WRITE, false); - final int maxNumDeltacommits = 3; + final int maxNumDeltaCommits = 3; writeConfig = getWriteConfigBuilder(true, true, false) .withMetadataConfig(HoodieMetadataConfig.newBuilder() .enable(true) .enableMetrics(false) - .withMaxNumDeltaCommitsBeforeCompaction(maxNumDeltacommits + 100) - .withMaxNumDeltacommitsWhenPending(maxNumDeltacommits) + .withMaxNumDeltaCommitsBeforeCompaction(maxNumDeltaCommits + 100) + .withMaxNumDeltacommitsWhenPending(maxNumDeltaCommits) .build()).build(); initWriteConfigAndMetatableWriter(writeConfig, true); testTable.addRequestedCommit(String.format("%016d", 0)); - for (int i = 1; i <= maxNumDeltacommits; i++) { + for (int i = 1; i <= maxNumDeltaCommits; i++) { doWriteOperation(testTable, String.format("%016d", i)); } - int instant = maxNumDeltacommits + 1; - Throwable t = 
assertThrows(HoodieMetadataException.class, () -> doWriteOperation(testTable, String.format("%016d", instant))); - assertTrue(t.getMessage().startsWith(String.format("Metadata table's deltacommits exceeded %d: ", maxNumDeltacommits))); + int instant = maxNumDeltaCommits + 1; + assertDoesNotThrow(() -> doWriteOperation(testTable, String.format("%016d", instant))); + } + + @Test + public void testMORCheckNumDeltaCommits() throws Exception { + init(MERGE_ON_READ, true); + final int maxNumDeltaCommits = 3; + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .enableMetrics(false) + .withMaxNumDeltaCommitsBeforeCompaction(maxNumDeltaCommits - 1) + .withMaxNumDeltacommitsWhenPending(maxNumDeltaCommits) + .build()) + .build(); + initWriteConfigAndMetatableWriter(writeConfig, true); + // write deltacommits to data-table and do compaction in metadata-table (with commit-instant) + doWriteOperation(testTable, InProcessTimeGenerator.createNewInstantTime(1)); + doWriteOperation(testTable, InProcessTimeGenerator.createNewInstantTime(1)); + // ensure the compaction is triggered and executed + try (HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(context, writeConfig.getMetadataConfig(), writeConfig.getBasePath(), true)) { + HoodieTableMetaClient metadataMetaClient = metadata.getMetadataMetaClient(); + final HoodieActiveTimeline activeTimeline = metadataMetaClient.reloadActiveTimeline(); + Option lastCompaction = activeTimeline.filterCompletedInstants() + .filter(s -> s.getAction().equals(COMMIT_ACTION)).lastInstant(); + assertTrue(lastCompaction.isPresent()); + // create pending instant in data table + testTable.addRequestedCommit(InProcessTimeGenerator.createNewInstantTime(1)); + // continue writing + for (int i = 0; i <= maxNumDeltaCommits; i++) { + doWriteOperation(testTable, InProcessTimeGenerator.createNewInstantTime(1)); + } + assertDoesNotThrow(() -> doWriteOperation(testTable, InProcessTimeGenerator.createNewInstantTime(1))); + } } @Test diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java index 1a268675ac75..8ca0d4e16a96 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java @@ -19,10 +19,14 @@ package org.apache.hudi.client.functional; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieCleanMetadata; +import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.avro.model.HoodieMetadataRecord; import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; @@ -32,8 +36,12 @@ import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.block.HoodieDataBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import 
org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.view.TableFileSystemView; import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.CleanerUtils; import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.config.HoodieWriteConfig; @@ -43,7 +51,6 @@ import org.apache.hudi.metadata.HoodieMetadataLogRecordReader; import org.apache.hudi.metadata.HoodieMetadataPayload; import org.apache.hudi.metadata.HoodieTableMetadataKeyGenerator; -import org.apache.hudi.metadata.MetadataPartitionType; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; @@ -66,6 +73,7 @@ import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -76,8 +84,12 @@ import static java.util.Arrays.asList; import static java.util.Collections.emptyList; +import static org.apache.hudi.common.model.WriteOperationType.BULK_INSERT; +import static org.apache.hudi.common.model.WriteOperationType.COMPACT; import static org.apache.hudi.common.model.WriteOperationType.INSERT; import static org.apache.hudi.common.model.WriteOperationType.UPSERT; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.CLEAN_ACTION; +import static org.apache.hudi.metadata.MetadataPartitionType.FILES; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -285,6 +297,112 @@ public void testMetadataRecordKeyExcludeFromPayload(final HoodieTableType tableT validateMetadata(testTable); } + /** + * This tests the case where the two clean actions delete the same file and commit + * to the metadata table. The metadata table should not contain the deleted file afterwards. + * A new cleaner plan may contain the same file to delete if the previous cleaner + * plan has not been successfully executed before the new one is scheduled. 
+   */
+  @ParameterizedTest
+  @EnumSource(HoodieTableType.class)
+  public void testRepeatedCleanActionsWithMetadataTableEnabled(final HoodieTableType tableType) throws Exception {
+    initPath();
+    writeConfig = getWriteConfigBuilder(true, true, false)
+        .withMetadataConfig(HoodieMetadataConfig.newBuilder()
+            .enable(true)
+            .withMaxNumDeltaCommitsBeforeCompaction(4)
+            .build())
+        .build();
+    init(tableType, writeConfig);
+    String partition = "p1";
+    // Simulate two bulk insert operations adding two data files in partition "p1"
+    String instant1 = metaClient.createNewInstantTime();
+    HoodieCommitMetadata commitMetadata1 =
+        testTable.doWriteOperation(instant1, BULK_INSERT, emptyList(), asList(partition), 1);
+    String instant2 = metaClient.createNewInstantTime();
+    HoodieCommitMetadata commitMetadata2 =
+        testTable.doWriteOperation(instant2, BULK_INSERT, emptyList(), asList(partition), 1);
+
+    final HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder()
+        .setConf(hadoopConf)
+        .setBasePath(metadataTableBasePath)
+        .build();
+    while (getNumCompactions(metadataMetaClient) == 0) {
+      // Write until the compaction happens in the metadata table
+      testTable.doWriteOperation(
+          metaClient.createNewInstantTime(), BULK_INSERT, emptyList(), asList(partition), 1);
+      metadataMetaClient.reloadActiveTimeline();
+    }
+
+    assertEquals(1, getNumCompactions(metadataMetaClient));
+
+    List<String> fileIdsToReplace = new ArrayList<>();
+    fileIdsToReplace.addAll(commitMetadata1.getFileIdAndRelativePaths().keySet());
+    fileIdsToReplace.addAll(commitMetadata2.getFileIdAndRelativePaths().keySet());
+    // Simulate clustering operation replacing two data files with a new data file
+    testTable.doCluster(
+        metaClient.createNewInstantTime(),
+        Collections.singletonMap(partition, fileIdsToReplace), asList(partition), 1);
+    Set<String> fileSetBeforeCleaning = getFilePathsInPartition(partition);
+
+    // Simulate two clean actions deleting the same set of data files
+    // based on the first two commits
+    String cleanInstant = metaClient.createNewInstantTime();
+    HoodieCleanMetadata cleanMetadata = testTable.doCleanBasedOnCommits(cleanInstant, asList(instant1, instant2));
+    List<String> deleteFileList = cleanMetadata.getPartitionMetadata().get(partition).getDeletePathPatterns();
+    assertTrue(deleteFileList.size() > 0);
+
+    Set<String> fileSetAfterFirstCleaning = getFilePathsInPartition(partition);
+    validateFilesAfterCleaning(deleteFileList, fileSetBeforeCleaning, fileSetAfterFirstCleaning);
+
+    metaClient.reloadActiveTimeline();
+    HoodieCleanerPlan cleanerPlan = CleanerUtils.getCleanerPlan(
+        metaClient, new HoodieInstant(HoodieInstant.State.REQUESTED, CLEAN_ACTION, cleanInstant));
+    testTable.repeatClean(metaClient.createNewInstantTime(), cleanerPlan, cleanMetadata);
+
+    // Compaction should not happen after the first compaction in this test case
+    assertEquals(1, getNumCompactions(metadataMetaClient));
+    Set<String> fileSetAfterSecondCleaning = getFilePathsInPartition(partition);
+    validateFilesAfterCleaning(deleteFileList, fileSetBeforeCleaning, fileSetAfterSecondCleaning);
+  }
+
+  private int getNumCompactions(HoodieTableMetaClient metaClient) {
+    HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
+    return timeline
+        .filter(s -> {
+          try {
+            return s.getAction().equals(HoodieTimeline.COMMIT_ACTION)
+                && HoodieCommitMetadata.fromBytes(
+                    timeline.getInstantDetails(s).get(), HoodieCommitMetadata.class)
+                .getOperationType().equals(COMPACT);
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+        })
+        .countInstants();
+  }
+
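The repeated-clean scenario above hinges on file deletions in the metadata table's FILES partition being idempotent: replaying a cleaner plan that deletes already-deleted files must leave the listing unchanged. The following is a minimal, illustrative sketch of that invariant using plain collections; it models only the semantics the test verifies and does not use Hudi's actual HoodieMetadataPayload merge logic.

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class IdempotentCleanSketch {
  // Removing an absent element from a set is a no-op, so applying the same
  // delete list twice converges on the same listing.
  static Set<String> applyClean(Set<String> filesInPartition, List<String> deletePathPatterns) {
    Set<String> result = new HashSet<>(filesInPartition);
    result.removeAll(deletePathPatterns);
    return result;
  }

  public static void main(String[] args) {
    Set<String> before = new HashSet<>(Arrays.asList("f1.parquet", "f2.parquet", "f3.parquet"));
    List<String> deletes = Arrays.asList("f1.parquet", "f2.parquet");
    Set<String> afterFirstClean = applyClean(before, deletes);
    Set<String> afterSecondClean = applyClean(afterFirstClean, deletes);
    // Both cleans leave only f3.parquet, mirroring validateFilesAfterCleaning.
    System.out.println(afterFirstClean.equals(afterSecondClean)); // true
  }
}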
+ private Set getFilePathsInPartition(String partition) throws IOException { + HoodieBackedTableMetadata tableMetadata = new HoodieBackedTableMetadata( + new HoodieLocalEngineContext(hadoopConf), + HoodieMetadataConfig.newBuilder().enable(true).build(), + basePath); + return Arrays.stream(tableMetadata.getAllFilesInPartition(new Path(basePath, partition))) + .map(status -> status.getPath().getName()).collect(Collectors.toSet()); + } + + private void validateFilesAfterCleaning(List deleteFileList, + Set fileSetBeforeCleaning, + Set fileSetAfterCleaning) { + assertEquals(deleteFileList.size(), fileSetBeforeCleaning.size() - fileSetAfterCleaning.size()); + for (String deleteFile : deleteFileList) { + assertFalse(fileSetAfterCleaning.contains(deleteFile)); + } + for (String file : fileSetAfterCleaning) { + assertTrue(fileSetBeforeCleaning.contains(file)); + } + } + /** * Verify the metadata table log files for the record field correctness. On disk format * should be based on meta fields and key deduplication config. And the in-memory merged @@ -302,7 +420,7 @@ private void verifyMetadataRecordKeyExcludeFromPayloadLogFiles(HoodieTable table // Compaction should not be triggered yet. Let's verify no base file // and few log files available. List fileSlices = table.getSliceView() - .getLatestFileSlices(MetadataPartitionType.FILES.getPartitionPath()).collect(Collectors.toList()); + .getLatestFileSlices(FILES.getPartitionPath()).collect(Collectors.toList()); if (fileSlices.isEmpty()) { throw new IllegalStateException("LogFile slices are not available!"); } @@ -377,7 +495,7 @@ private void verifyMetadataMergedRecords(HoodieTableMetaClient metadataMetaClien .withBasePath(metadataMetaClient.getBasePath()) .withLogFilePaths(logFilePaths) .withLatestInstantTime(latestCommitTimestamp) - .withPartition(MetadataPartitionType.FILES.getPartitionPath()) + .withPartition(FILES.getPartitionPath()) .withReaderSchema(schema) .withMaxMemorySizeInBytes(100000L) .withBufferSize(4096) @@ -401,7 +519,7 @@ private void verifyMetadataMergedRecords(HoodieTableMetaClient metadataMetaClien private void verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(HoodieTable table) throws IOException { table.getHoodieView().sync(); List fileSlices = table.getSliceView() - .getLatestFileSlices(MetadataPartitionType.FILES.getPartitionPath()).collect(Collectors.toList()); + .getLatestFileSlices(FILES.getPartitionPath()).collect(Collectors.toList()); if (!fileSlices.get(0).getBaseFile().isPresent()) { throw new IllegalStateException("Base file not available!"); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java index 44105a419834..cff783ebd53c 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java @@ -204,7 +204,6 @@ private static Stream populateMetaFieldsParams() { private static Stream rollbackFailedCommitsParams() { return Stream.of( Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, true), - Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, false), Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, true), Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, false) ); @@ -240,10 +239,9 @@ public void 
testAutoCommitOnInsert(boolean populateMetaFields) throws Exception /** * Test Auto Commit behavior for HoodieWriteClient insertPrepped API. */ - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testAutoCommitOnInsertPrepped(boolean populateMetaFields) throws Exception { - testAutoCommit(SparkRDDWriteClient::insertPreppedRecords, true, populateMetaFields); + @Test + public void testAutoCommitOnInsertPrepped() throws Exception { + testAutoCommit(SparkRDDWriteClient::insertPreppedRecords, true, true); } /** @@ -276,11 +274,10 @@ public void testAutoCommitOnBulkInsert(boolean populateMetaFields) throws Except /** * Test Auto Commit behavior for HoodieWriteClient bulk-insert prepped API. */ - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testAutoCommitOnBulkInsertPrepped(boolean populateMetaFields) throws Exception { + @Test + public void testAutoCommitOnBulkInsertPrepped() throws Exception { testAutoCommit((writeClient, recordRDD, instantTime) -> writeClient.bulkInsertPreppedRecords(recordRDD, instantTime, - Option.empty()), true, populateMetaFields); + Option.empty()), true, true); } /** @@ -440,10 +437,9 @@ public void testDeduplicationOnBulkInsert(boolean populateMetaFields) throws Exc /** * Test De-duplication behavior for HoodieWriteClient upsert API. */ - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testDeduplicationOnUpsert(boolean populateMetaFields) throws Exception { - testDeduplication(SparkRDDWriteClient::upsert, populateMetaFields); + @Test + public void testDeduplicationOnUpsert() throws Exception { + testDeduplication(SparkRDDWriteClient::upsert, true); } /** @@ -598,11 +594,10 @@ public void testUpserts(boolean populateMetaFields) throws Exception { /** * Test UpsertPrepped API. */ - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testUpsertsPrepped(boolean populateMetaFields) throws Exception { + @Test + public void testUpsertsPrepped() throws Exception { HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withRollbackUsingMarkers(true); - addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + addConfigsForPopulateMetaFields(cfgBuilder, true); testUpsertsInternal(cfgBuilder.build(), SparkRDDWriteClient::upsertPreppedRecords, true); } @@ -837,11 +832,10 @@ public void testInsertsWithHoodieConcatHandle(boolean populateMetaFields) throws /** * Test InsertPrepped API for HoodieConcatHandle. */ - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testInsertsPreppedWithHoodieConcatHandle(boolean populateMetaFields) throws Exception { + @Test + public void testInsertsPreppedWithHoodieConcatHandle() throws Exception { HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(); - addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + addConfigsForPopulateMetaFields(cfgBuilder, true); testHoodieConcatHandle(cfgBuilder.build(), true); } @@ -994,11 +988,10 @@ public void testPendingRestore() throws IOException { /** * Tests deletion of records. 
*/ - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testDeletes(boolean populateMetaFields) throws Exception { + @Test + public void testDeletes() throws Exception { HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY); - addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + addConfigsForPopulateMetaFields(cfgBuilder, true); SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build()); /** * Write 1 (inserts and deletes) Write actual 200 insert records and ignore 100 delete records @@ -1019,7 +1012,7 @@ public void testDeletes(boolean populateMetaFields) throws Exception { writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, // unused as genFn uses hard-coded number of inserts/updates/deletes -1, recordGenFunction, SparkRDDWriteClient::upsert, true, 200, 200, 1, false, - populateMetaFields); + true); /** * Write 2 (deletes+writes). @@ -1037,7 +1030,7 @@ public void testDeletes(boolean populateMetaFields) throws Exception { }; writeBatch(client, newCommitTime, prevCommitTime, Option.empty(), initCommitTime, 100, recordGenFunction, SparkRDDWriteClient::upsert, true, 50, 150, 2, false, - populateMetaFields); + true); } /** @@ -1046,11 +1039,10 @@ public void testDeletes(boolean populateMetaFields) throws Exception { * * @throws Exception */ - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testDeletesForInsertsInSameBatch(boolean populateMetaFields) throws Exception { + @Test + public void testDeletesForInsertsInSameBatch() throws Exception { HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY); - addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + addConfigsForPopulateMetaFields(cfgBuilder, true); SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build()); /** * Write 200 inserts and issue deletes to a subset(50) of inserts. @@ -1071,7 +1063,7 @@ public void testDeletesForInsertsInSameBatch(boolean populateMetaFields) throws writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, -1, recordGenFunction, SparkRDDWriteClient::upsert, true, 150, 150, 1, false, - populateMetaFields); + true); } private void assertPartitionPathRecordKeys(List> expectedPartitionPathRecKeyPairs, String[] fullPartitionPaths) { @@ -1898,19 +1890,17 @@ public void testInsertOverwritePartitionHandlingWithMoreRecords(boolean populate /** * Test scenario of writing fewer file groups than existing number of file groups in partition. */ - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testInsertOverwritePartitionHandlingWithFewerRecords(boolean populateMetaFields) throws Exception { - verifyInsertOverwritePartitionHandling(3000, 1000, populateMetaFields); + @Test + public void testInsertOverwritePartitionHandlingWithFewerRecords() throws Exception { + verifyInsertOverwritePartitionHandling(3000, 1000, true); } /** * Test scenario of writing similar number file groups in partition. 
*/ - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testInsertOverwritePartitionHandlingWithSimilarNumberOfRecords(boolean populateMetaFields) throws Exception { - verifyInsertOverwritePartitionHandling(3000, 3000, populateMetaFields); + @Test + public void testInsertOverwritePartitionHandlingWithSimilarNumberOfRecords() throws Exception { + verifyInsertOverwritePartitionHandling(3000, 3000, true); } /** @@ -1963,19 +1953,17 @@ public void verifyDeletePartitionsHandlingWithFewerRecordsFirstPartition(boolean /** * Test scenario of writing similar number file groups in partition. */ - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void verifyDeletePartitionsHandlingWithSimilarNumberOfRecords(boolean populateMetaFields) throws Exception { - verifyDeletePartitionsHandling(3000, 3000, 3000, populateMetaFields); + @Test + public void verifyDeletePartitionsHandlingWithSimilarNumberOfRecords() throws Exception { + verifyDeletePartitionsHandling(3000, 3000, 3000, true); } /** * Test scenario of writing more file groups for first partition than second and third partition. */ - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void verifyDeletePartitionsHandlingHandlingWithFewerRecordsSecondThirdPartition(boolean populateMetaFields) throws Exception { - verifyDeletePartitionsHandling(3000, 1000, 1000, populateMetaFields); + @Test + public void verifyDeletePartitionsHandlingHandlingWithFewerRecordsSecondThirdPartition() throws Exception { + verifyDeletePartitionsHandling(3000, 1000, 1000, true); } private Set insertPartitionRecordsWithCommit(SparkRDDWriteClient client, int recordsCount, String commitTime1, String partitionPath) throws IOException { @@ -2217,11 +2205,11 @@ public void testDeletesWithoutInserts(boolean populateMetaFields) { /** * Test to ensure commit metadata points to valid files. 
*/ - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testCommitWritesRelativePaths(boolean populateMetaFields) throws Exception { + @Test + public void testCommitWritesRelativePaths() throws Exception { + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false); - addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + addConfigsForPopulateMetaFields(cfgBuilder, true); try (SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build())) { HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); HoodieSparkTable table = HoodieSparkTable.create(cfgBuilder.build(), context, metaClient); @@ -2393,9 +2381,9 @@ private void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean rollb } @ParameterizedTest - @MethodSource("rollbackAfterConsistencyCheckFailureParams") - public void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean enableOptimisticConsistencyGuard, boolean populateMetCols) throws Exception { - testRollbackAfterConsistencyCheckFailureUsingFileList(false, enableOptimisticConsistencyGuard, populateMetCols); + @ValueSource(booleans = {true, false}) + public void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean enableOptimisticConsistencyGuard) throws Exception { + testRollbackAfterConsistencyCheckFailureUsingFileList(false, enableOptimisticConsistencyGuard, true); } @ParameterizedTest @@ -2486,9 +2474,9 @@ public void testRollbackFailedCommits() throws Exception { } } - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testRollbackFailedCommitsToggleCleaningPolicy(boolean populateMetaFields) throws Exception { + @Test + public void testRollbackFailedCommitsToggleCleaningPolicy() throws Exception { + boolean populateMetaFields = true; HoodieTestUtils.init(hadoopConf, basePath); HoodieFailedWritesCleaningPolicy cleaningPolicy = EAGER; SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)); @@ -2641,7 +2629,7 @@ private Pair> testConsistencyCheck(HoodieTableMetaCli Option markerFilePath = WriteMarkersFactory.get( cfg.getMarkersType(), getHoodieTable(metaClient, cfg), instantTime) .create(partitionPath, - FSUtils.makeBaseFileName(instantTime, "1-0-1", UUID.randomUUID().toString()), + FSUtils.makeBaseFileName(instantTime, "1-0-1", UUID.randomUUID().toString(), BASE_FILE_EXTENSION), IOType.MERGE); if (!enableOptimisticConsistencyGuard) { Exception e = assertThrows(HoodieCommitException.class, () -> { @@ -2655,12 +2643,11 @@ private Pair> testConsistencyCheck(HoodieTableMetaCli return Pair.of(markerFilePath.get(), result); } - @ParameterizedTest - @MethodSource("populateMetaFieldsParams") - public void testMultiOperationsPerCommit(boolean populateMetaFields) throws IOException { + @Test + public void testMultiOperationsPerCommit() throws IOException { HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false) .withAllowMultiWriteOnSameInstant(true); - addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + addConfigsForPopulateMetaFields(cfgBuilder, true); HoodieWriteConfig cfg = cfgBuilder.build(); SparkRDDWriteClient client = getHoodieWriteClient(cfg); String firstInstantTime = "0000"; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java index 
709572325f8c..557905ae85a8 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java @@ -329,11 +329,11 @@ public void testLookupIndexWithAndWithoutColumnStats() throws Exception { // check column_stats partition exists metaClient = HoodieTableMetaClient.reload(metaClient); - assertTrue(metadataPartitionExists(metaClient.getBasePath(), context, COLUMN_STATS)); + assertTrue(metadataPartitionExists(metaClient.getBasePath(), context, COLUMN_STATS.getPartitionPath())); assertTrue(metaClient.getTableConfig().getMetadataPartitions().contains(COLUMN_STATS.getPartitionPath())); // delete the column_stats partition - deleteMetadataPartition(metaClient.getBasePath(), context, COLUMN_STATS); + deleteMetadataPartition(metaClient.getBasePath(), context, COLUMN_STATS.getPartitionPath()); // Now tagLocation for these records, they should be tagged correctly despite column_stats being enabled but not present hoodieTable = HoodieSparkTable.create(config, context, metaClient); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestRemoteFileSystemViewWithMetadataTable.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestRemoteFileSystemViewWithMetadataTable.java index 86cc078e9894..3d2e018c3a06 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestRemoteFileSystemViewWithMetadataTable.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestRemoteFileSystemViewWithMetadataTable.java @@ -116,7 +116,7 @@ public void initTimelineService() { .serverPort(config.getViewStorageConfig().getRemoteViewServerPort()).build(), FileSystem.get(new Configuration()), FileSystemViewManager.createViewManager( - context, config.getMetadataConfig(), config.getViewStorageConfig(), + context, config.getViewStorageConfig(), config.getCommonConfig(), metaClient -> new HoodieBackedTestDelayedTableMetadata( context, config.getMetadataConfig(), metaClient.getBasePathV2().toString(), true))); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBasePutBatchSizeCalculator.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBasePutBatchSizeCalculator.java index a6068e6a8f9c..b20bc979b12b 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBasePutBatchSizeCalculator.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestHBasePutBatchSizeCalculator.java @@ -34,28 +34,28 @@ public void testPutBatchSizeCalculation() { int putBatchSize = batchSizeCalculator.getBatchSize(10, 16667, 1200, 200, 0.1f); // Total puts that can be sent in 1 second = (10 * 16667 * 0.1) = 16,667 // Total puts per batch will be (16,667 / parallelism) = 83.335, where 200 is the maxExecutors - assertEquals(putBatchSize, 83); + assertEquals(putBatchSize, 84); // Number of Region Servers are halved, total requests sent in a second are also halved, so batchSize is also halved int putBatchSize2 = batchSizeCalculator.getBatchSize(5, 16667, 1200, 200, 0.1f); - assertEquals(putBatchSize2, 41); + assertEquals(putBatchSize2, 42); // If the parallelism is halved, batchSize has to double int putBatchSize3 = batchSizeCalculator.getBatchSize(10, 16667, 1200, 100, 0.1f); - assertEquals(putBatchSize3, 166); + assertEquals(putBatchSize3, 167); 
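// Illustrative note (not from the original test): the updated expected values are consistent with the
// calculator rounding the per-batch size up rather than truncating it. For the first case above:
//   total puts per second = 10 region servers * 16667 QPS * 0.1 fraction = 16667
//   per-batch size        = 16667 / 200 (parallelism)                    = 83.335, rounded up to 84
// A minimal sketch of that rounding, assuming ceiling is the only change to the formula:
//   assertEquals(84,  (int) Math.ceil(16667 / 200.0));
//   assertEquals(167, (int) Math.ceil(16667 / 100.0));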
// If the parallelism is halved, batchSize has to double. // This time parallelism is driven by numTasks rather than numExecutors int putBatchSize4 = batchSizeCalculator.getBatchSize(10, 16667, 100, 200, 0.1f); - assertEquals(putBatchSize4, 166); + assertEquals(putBatchSize4, 167); // If sleepTimeMs is halved, batchSize has to halve int putBatchSize5 = batchSizeCalculator.getBatchSize(10, 16667, 1200, 200, 0.05f); - assertEquals(putBatchSize5, 41); + assertEquals(putBatchSize5, 42); // If maxQPSPerRegionServer is doubled, batchSize also doubles int putBatchSize6 = batchSizeCalculator.getBatchSize(10, 33334, 1200, 200, 0.1f); - assertEquals(putBatchSize6, 166); + assertEquals(putBatchSize6, 167); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java index aec9b6314000..b496db5165ba 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java @@ -58,6 +58,7 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.metrics.HoodieMetricsConfig; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.hadoop.fs.HoodieWrapperFileSystem; import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.metadata.HoodieTableMetadataWriter; @@ -115,6 +116,8 @@ import static org.apache.hudi.metrics.HoodieMetrics.ARCHIVE_ACTION; import static org.apache.hudi.metrics.HoodieMetrics.DELETE_INSTANTS_NUM_STR; import static org.apache.hudi.metrics.HoodieMetrics.DURATION_STR; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -825,35 +828,48 @@ public void testPendingClusteringWillBlockArchival(boolean enableMetadata) throw public void testArchiveRollbacksTestTable(boolean enableMetadata) throws Exception { HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 4, 5, 2); - for (int i = 1; i < 13; i += 2) { + List> instants = new ArrayList<>(); + boolean hasArchivedInstants = false; + for (int i = 1; i < 8; i += 3) { + String commitInstant1 = metaClient.createNewInstantTime(); + instants.add(Pair.of(commitInstant1, HoodieTimeline.COMMIT_ACTION)); testTable.doWriteOperation( - "000000" + String.format("%02d", i), + commitInstant1, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); - testTable.doRollback( - "000000" + String.format("%02d", i), "000000" + String.format("%02d", i + 1)); + try { + String rollbackInstant = metaClient.createNewInstantTime(); + testTable.doRollback(commitInstant1, rollbackInstant); + instants.add(Pair.of(rollbackInstant, HoodieTimeline.ROLLBACK_ACTION)); + } catch (HoodieMetadataException e) { + // The instant that triggers compaction can not be rolled back + // because it's instant time is less than the latest compaction instant + // on the MDT timeline. + // ignore + } + + // we need enough delta commits to trigger archival on MDT. 
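// Sketch of the guard described in the catch block above (simplified and assumed; Hudi instant times are
// timestamps that compare lexicographically, so a rollback target older than the latest completed
// compaction on the MDT timeline is rejected):
//   String latestMdtCompactionTime = ...; // latest completed compaction instant on the MDT timeline
//   if (commitInstant1.compareTo(latestMdtCompactionTime) < 0) {
//     throw new HoodieMetadataException("Cannot rollback instant " + commitInstant1
//         + " earlier than the latest metadata table compaction " + latestMdtCompactionTime);
//   }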
+ String commitInstant2 = metaClient.createNewInstantTime(); + instants.add(Pair.of(commitInstant2, HoodieTimeline.COMMIT_ACTION)); + testTable.doWriteOperation(commitInstant2, WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); // trigger archival Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); List originalCommits = commitsList.getKey(); List commitsAfterArchival = commitsList.getValue(); - if (i != 11) { - assertEquals(originalCommits, commitsAfterArchival); - } else { - // only time when archival will kick in - List expectedArchivedInstants = new ArrayList<>(); - expectedArchivedInstants.addAll(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000003"))); - expectedArchivedInstants.addAll(getAllArchivedCommitInstants(Collections.singletonList("00000002"), HoodieTimeline.ROLLBACK_ACTION)); - List expectedActiveInstants = new ArrayList<>(); - expectedActiveInstants.addAll(getActiveCommitInstants( - Arrays.asList("00000005", "00000007", "00000009", "00000011"))); - expectedActiveInstants.addAll(getActiveCommitInstants( - Arrays.asList("00000004", "00000006", "00000008", "00000010", "00000012"), HoodieTimeline.ROLLBACK_ACTION)); + int numArchivedInstants = originalCommits.size() - commitsAfterArchival.size(); + if ((originalCommits.size() - commitsAfterArchival.size()) > 0) { + hasArchivedInstants = true; + List expectedArchivedInstants = instants.subList(0, numArchivedInstants).stream() + .map(p -> new HoodieInstant(State.COMPLETED, p.getValue(), p.getKey())).collect(Collectors.toList()); + List expectedActiveInstants = instants.subList(numArchivedInstants, instants.size()).stream() + .map(p -> new HoodieInstant(State.COMPLETED, p.getValue(), p.getKey())).collect(Collectors.toList()); verifyArchival(expectedArchivedInstants, expectedActiveInstants, commitsAfterArchival); } } + assertTrue(hasArchivedInstants, "Some instants should be archived"); } @ParameterizedTest @@ -1071,37 +1087,53 @@ public void testArchiveRollbacksAndCleanTestTable() throws Exception { int maxArchiveCommits = 4; HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, minArchiveCommits, maxArchiveCommits, 2); + List> instants = new ArrayList<>(); + String instant1 = metaClient.createNewInstantTime(); + instants.add(Pair.of(instant1, HoodieTimeline.COMMIT_ACTION)); // trigger 1 commit to add a lot of files so that future cleans can clean them up - testTable.doWriteOperation(String.format("%08d", 1), WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), Arrays.asList("p1", "p2"), 20); + testTable.doWriteOperation(instant1, WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), Arrays.asList("p1", "p2"), 20); Map partitionToFileDeleteCount = new HashMap<>(); partitionToFileDeleteCount.put("p1", 1); partitionToFileDeleteCount.put("p2", 1); for (int i = 2; i < 5; i++) { - testTable.doClean(String.format("%08d", i), partitionToFileDeleteCount); + String cleanInstant = metaClient.createNewInstantTime(); + instants.add(Pair.of(cleanInstant, HoodieTimeline.CLEAN_ACTION)); + testTable.doClean(cleanInstant, partitionToFileDeleteCount); } - for (int i = 5; i <= 11; i += 2) { - testTable.doWriteOperation(String.format("%08d", i), WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); - testTable.doRollback(String.format("%08d", i), String.format("%08d", i + 1)); + for (int i = 5; i <= 13; i += 3) { + String commitInstant1 = metaClient.createNewInstantTime(); + instants.add(Pair.of(commitInstant1, HoodieTimeline.COMMIT_ACTION)); + 
testTable.doWriteOperation(commitInstant1, WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); + try { + String rollbackInstant = metaClient.createNewInstantTime(); + testTable.doRollback(commitInstant1, rollbackInstant); + instants.add(Pair.of(rollbackInstant, HoodieTimeline.ROLLBACK_ACTION)); + } catch (HoodieMetadataException e) { + // The instant that triggers compaction can not be rolled back + // because it's instant time is less than the latest compaction instant + // on the MDT timeline. + // ignore + } + // write more commits than rollback so that the MDT archival can be triggered, + // then the DT archival can be triggered. + String commitInstant2 = metaClient.createNewInstantTime(); + instants.add(Pair.of(commitInstant2, HoodieTimeline.COMMIT_ACTION)); + testTable.doWriteOperation(commitInstant2, WriteOperationType.UPSERT, Collections.emptyList(), Arrays.asList("p1", "p2"), 2); } // trigger archival: - // clean: 2,3: after archival -> null - // write: 1,5,7,9,11: after archival -> 9,11 - // rollback: 6,8,10,12: after archival -> 8,10,12 Pair, List> commitsList = archiveAndGetCommitsList(writeConfig); + List allCommits = commitsList.getKey(); List commitsAfterArchival = commitsList.getValue(); + assertThat("The archived commits number is not as expected", allCommits.size() - commitsAfterArchival.size(), is(5)); - List expectedActiveInstants = new ArrayList<>(); - expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000008", "00000010", "00000012"), HoodieTimeline.ROLLBACK_ACTION)); - expectedActiveInstants.addAll(getActiveCommitInstants(Arrays.asList("00000009", "00000011"))); - - List expectedArchiveInstants = new ArrayList<>(); - expectedArchiveInstants.addAll(getAllArchivedCommitInstants(Arrays.asList("00000001", "00000005", "00000007"))); - expectedArchiveInstants.addAll(getAllArchivedCommitInstants(Arrays.asList("00000002", "00000003", "00000004"), HoodieTimeline.CLEAN_ACTION)); - expectedArchiveInstants.addAll(getAllArchivedCommitInstants(Collections.singletonList("00000006"), HoodieTimeline.ROLLBACK_ACTION)); + List expectedArchiveInstants = instants.subList(0, 5).stream() + .map(p -> new HoodieInstant(State.COMPLETED, p.getValue(), p.getKey())).collect(Collectors.toList()); + List expectedActiveInstants = instants.subList(5, instants.size()).stream() + .map(p -> new HoodieInstant(State.COMPLETED, p.getValue(), p.getKey())).collect(Collectors.toList()); verifyArchival(expectedArchiveInstants, expectedActiveInstants, commitsAfterArchival); } @@ -1469,7 +1501,7 @@ public void testArchivalAndCompactionInMetadataTable() throws Exception { .setLoadActiveTimelineOnLoad(true).build(); List instants = new ArrayList<>(); - for (int i = 1; i <= 18; i++) { + for (int i = 1; i <= 19; i++) { String instant = metaClient.createNewInstantTime(); instants.add(instant); if (i != 2) { @@ -1495,65 +1527,71 @@ public void testArchivalAndCompactionInMetadataTable() throws Exception { new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, SOLO_COMMIT_TIMESTAMP + "010"))); assertTrue(metadataTableInstants.contains( new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, instants.get(0)))); - } else if (i <= 8) { + } else if (i == 2) { + assertEquals(i - 1, metadataTableInstants.size()); + assertTrue(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, SOLO_COMMIT_TIMESTAMP + "010"))); + assertFalse(metadataTableInstants.contains( + new HoodieInstant(State.COMPLETED, 
HoodieTimeline.DELTA_COMMIT_ACTION, instants.get(1)))); + } else if (i <= 9) { // In the metadata table timeline, the first delta commit is "00000000000000" - // from metadata table init, delta commits 1 till 7 are added + // from metadata table init, delta commits 1 till 8 are added // later on without archival or compaction // rollback in DT will also trigger rollback in MDT - assertEquals(i, metadataTableInstants.size()); + assertEquals(i - 1, metadataTableInstants.size()); assertTrue(metadataTableInstants.contains( new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, SOLO_COMMIT_TIMESTAMP + "010"))); // rolled back commits may not be present in MDT timeline [1] - IntStream.range(2, i).forEach(j -> + IntStream.range(3, i).forEach(j -> assertTrue(metadataTableInstants.contains( new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, instants.get(j - 1))))); - } else if (i == 9) { - // i == 9 + } else if (i == 10) { + // i == 10 // The instant "00000000000010" was archived since it's less than // the earliest commit on the dataset active timeline, // the dataset active timeline has instants: - // 2.rollback, 7.commit, 8.commit + // [7.commit, 8.commit, 9.commit, 10.commit] assertEquals(9, metadataTableInstants.size()); - // mdt timeline 2, 3,..., 8, a completed compaction commit, 9 - IntStream.range(2, i).forEach(j -> + // mdt timeline 3,..., 10, a completed compaction commit + IntStream.range(3, i).forEach(j -> assertTrue(metadataTableInstants.contains( new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, instants.get(j - 1))))); - } else if (i <= 12) { - // In the metadata table timeline, the first delta commit is 6 + } else if (i <= 13) { + // In the metadata table timeline, the first delta commit is 7 // because it equals with the earliest commit on the dataset timeline, after archival, - // delta commits 6 till 10 are added later on without archival or compaction - // mdt timeline [6, 7, 8, a completed compaction commit, 9, 10] for i = 10 - assertEquals(i - 4, metadataTableInstants.size()); + // delta commits 11 till 12 are added later on without archival or compaction + // mdt timeline [7, 8, 9, 10, a completed compaction commit] for i = 10 + assertEquals(i - 5, metadataTableInstants.size()); assertEquals(1, metadataTableMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().countInstants()); - IntStream.range(6, i).forEach(j -> + IntStream.range(7, i).forEach(j -> assertTrue(metadataTableInstants.contains( new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, instants.get(j - 1))))); - } else if (i <= 16) { - // In the metadata table timeline, the first delta commit is a compaction commit - // from metadata table compaction, after archival, delta commits 9 - // till 16 are added later on without archival or compaction - // mdt timeline: [a completed compaction commit, 9, ... 13] - assertEquals(i - 7, metadataTableInstants.size()); + } else if (i <= 17) { + // In the metadata table timeline, the second commit is a compaction commit + // from metadata table compaction, after archival, delta commits 14 + // till 17 are added later on without archival or compaction + // mdt timeline: [10, a completed compaction commit, 11, ... 14, 15, ... 
17] + assertEquals(i - 8, metadataTableInstants.size()); assertEquals(1, metadataTableMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().countInstants()); - IntStream.range(9, i).forEach(j -> + IntStream.range(10, i).forEach(j -> assertTrue(metadataTableInstants.contains( new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, instants.get(j - 1))))); - } else if (i == 17) { - // i == 17 - // commits in MDT [a completed compaction commit, 9, ... 16, 17, a completed compaction commit] - // another compaction is triggered by this commit so everything upto 16 is compacted. + } else if (i == 18) { + // i == 18 + // commits in MDT [10, a completed compaction commit, 11, ... 17, 18, a completed compaction commit] + // another compaction is triggered by this commit so everything upto 18 is compacted. assertEquals(11, metadataTableInstants.size()); assertEquals(2, metadataTableMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().countInstants()); - IntStream.range(9, i).forEach(j -> + IntStream.range(10, i).forEach(j -> assertTrue(metadataTableInstants.contains( new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, instants.get(j - 1))))); } else { - // i == 18 + // i == 19 // compaction happened in last commit, and archival is triggered with latest compaction retained plus maxInstantToKeep = 6 // commits in MDT [14, .... 17, a completed compaction commit, 18] assertEquals(6, metadataTableInstants.size()); assertTrue(metadata(writeConfig, context).getLatestCompactionTime().isPresent()); - IntStream.range(14, i).forEach(j -> + IntStream.range(15, i).forEach(j -> assertTrue(metadataTableInstants.contains( new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, instants.get(j - 1))))); } @@ -1582,6 +1620,39 @@ public void testPendingClusteringAfterArchiveCommit(boolean enableMetadata) thro "Since we have a pending clustering instant at 00000002, we should never archive any commit after 00000000"); } + @Test + public void testRetryArchivalAfterPreviousFailedDeletion() throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 4, 2); + for (int i = 0; i <= 5; i++) { + testTable.doWriteOperation("10" + i, WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), 1); + } + HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); + + HoodieTimeline timeline = metaClient.getActiveTimeline().getWriteTimeline(); + assertEquals(6, timeline.countInstants(), "Loaded 6 commits and the count should match"); + assertTrue(archiver.archiveIfRequired(context) > 0); + // Simulate archival failing to delete by re-adding the .commit instant files + // (101.commit, 102.commit, and 103.commit instant files) + HoodieTestDataGenerator.createOnlyCompletedCommitFile(basePath, "101_1001", wrapperFs.getConf()); + HoodieTestDataGenerator.createOnlyCompletedCommitFile(basePath, "102_1021", wrapperFs.getConf()); + HoodieTestDataGenerator.createOnlyCompletedCommitFile(basePath, "103_1031", wrapperFs.getConf()); + timeline = metaClient.getActiveTimeline().reload().getWriteTimeline(); + assertEquals(5, timeline.countInstants(), "Due to simulating partial archival deletion, there should" + + "be 5 instants (as instant times 101-103 .commit files should remain in timeline)"); + // Re-running archival again should archive and delete the 101.commit, 102.commit, and 103.commit instant files + 
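// The timeline is reloaded and the table/archiver are re-created below so that the second archival run
// sees the re-added completed-commit files; otherwise it would operate on the stale in-memory timeline
// captured before the simulated deletion failure.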
table.getMetaClient().reloadActiveTimeline(); + table = HoodieSparkTable.create(writeConfig, context, metaClient); + archiver = new HoodieTimelineArchiver(writeConfig, table); + assertTrue(archiver.archiveIfRequired(context) > 0); + timeline = metaClient.getActiveTimeline().reload().getWriteTimeline(); + assertEquals(2, timeline.countInstants(), "The instants from prior archival should " + + "be deleted now"); + } + + /** + * IMPORTANT: this method is only suitable for one time trigger of archival validation. + */ private Pair, List> archiveAndGetCommitsList(HoodieWriteConfig writeConfig) throws IOException { return archiveAndGetCommitsList(writeConfig, false); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestSparkBuildClusteringGroupsForPartition.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestSparkBuildClusteringGroupsForPartition.java index cb2fd4eebb5b..ada5f4954ab1 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestSparkBuildClusteringGroupsForPartition.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/cluster/strategy/TestSparkBuildClusteringGroupsForPartition.java @@ -25,6 +25,7 @@ import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; @@ -41,6 +42,9 @@ import static org.junit.jupiter.api.Assertions.assertEquals; public class TestSparkBuildClusteringGroupsForPartition { + + protected static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); + @Mock HoodieSparkCopyOnWriteTable table; @Mock @@ -109,13 +113,13 @@ public void testBuildClusteringGroupsWithLimitScan() { private FileSlice generateFileSlice(String partitionPath, String fileId, String baseInstant) { FileSlice fs = new FileSlice(new HoodieFileGroupId(partitionPath, fileId), baseInstant); - fs.setBaseFile(new HoodieBaseFile(FSUtils.makeBaseFileName(baseInstant, "1-0-1", fileId))); + fs.setBaseFile(new HoodieBaseFile(FSUtils.makeBaseFileName(baseInstant, "1-0-1", fileId, BASE_FILE_EXTENSION))); return fs; } private FileSlice generateFileSliceWithLen(String partitionPath, String fileId, String baseInstant, long fileLen) { FileSlice fs = new FileSlice(new HoodieFileGroupId(partitionPath, fileId), baseInstant); - HoodieBaseFile hoodieBaseFile = new HoodieBaseFile(FSUtils.makeBaseFileName(baseInstant, "1-0-1", fileId)); + HoodieBaseFile hoodieBaseFile = new HoodieBaseFile(FSUtils.makeBaseFileName(baseInstant, "1-0-1", fileId, BASE_FILE_EXTENSION)); hoodieBaseFile.setFileLen(fileLen); fs.setBaseFile(hoodieBaseFile); return fs; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java index 4574b34393d5..f40ba3be02e7 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java @@ -130,7 +130,7 @@ public 
void testMakeNewPath() { }).collect().get(0); assertEquals(newPathWithWriteToken.getKey().toString(), Paths.get(this.basePath, partitionPath, - FSUtils.makeBaseFileName(instantTime, newPathWithWriteToken.getRight(), fileName)).toString()); + FSUtils.makeBaseFileName(instantTime, newPathWithWriteToken.getRight(), fileName, BASE_FILE_EXTENSION)).toString()); } private HoodieWriteConfig makeHoodieClientConfig() { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java index 98e94b1dd5e9..e425c4c4352f 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/CompactionTestBase.java @@ -24,6 +24,7 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieFileGroupId; @@ -40,7 +41,6 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; -import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestTimelineServerBasedWriteMarkers.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestTimelineServerBasedWriteMarkers.java index b27f40e2addd..367229b18da4 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestTimelineServerBasedWriteMarkers.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestTimelineServerBasedWriteMarkers.java @@ -20,7 +20,6 @@ import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.config.HoodieCommonConfig; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.engine.HoodieLocalEngineContext; import org.apache.hudi.common.table.marker.MarkerType; import org.apache.hudi.common.table.view.FileSystemViewManager; @@ -66,15 +65,13 @@ public void setup() throws IOException { FileSystemViewStorageConfig storageConf = FileSystemViewStorageConfig.newBuilder().withStorageType(FileSystemViewStorageType.SPILLABLE_DISK).build(); - HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder().build(); HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf()); try { timelineService = new TimelineService(localEngineContext, new Configuration(), TimelineService.Config.builder().serverPort(0).enableMarkerRequests(true).build(), FileSystem.get(new Configuration()), - FileSystemViewManager.createViewManager( - localEngineContext, metadataConfig, storageConf, HoodieCommonConfig.newBuilder().build())); + FileSystemViewManager.createViewManager(localEngineContext, storageConf, HoodieCommonConfig.newBuilder().build())); timelineService.startService(); } catch (Exception ex) { throw new RuntimeException(ex); diff --git 
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java index ca26cad75d10..0fa28f2ef422 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java @@ -540,7 +540,7 @@ public void testDowngradeSixToFiveShouldDeleteRecordIndexPartition() throws Exce .withEnableRecordIndex(true).build()) .build(); for (MetadataPartitionType partitionType : MetadataPartitionType.values()) { - metaClient.getTableConfig().setMetadataPartitionState(metaClient, partitionType, true); + metaClient.getTableConfig().setMetadataPartitionState(metaClient, partitionType.getPartitionPath(), true); } metaClient.getTableConfig().setMetadataPartitionsInflight(metaClient, MetadataPartitionType.values()); String metadataTableBasePath = Paths.get(basePath, METADATA_TABLE_FOLDER_PATH).toString(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java index b59b1ea8d670..57a2793f0f66 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java @@ -87,7 +87,7 @@ public class HoodieClientTestUtils { */ public static SparkConf getSparkConfForTest(String appName) { SparkConf sparkConf = new SparkConf().setAppName(appName) - .setMaster("local[4]") + .setMaster("local[8]") .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .set("spark.kryo.registrator", "org.apache.spark.HoodieSparkKryoRegistrar") .set("spark.sql.shuffle.partitions", "4") @@ -281,8 +281,7 @@ public static TimelineService initTimelineService( TimelineService.Config.builder().enableMarkerRequests(true) .serverPort(config.getViewStorageConfig().getRemoteViewServerPort()).build(), FileSystem.get(new Configuration()), - FileSystemViewManager.createViewManager(context, config.getMetadataConfig(), - config.getViewStorageConfig(), config.getCommonConfig())); + FileSystemViewManager.createViewManager(context, config.getViewStorageConfig(), config.getCommonConfig())); timelineService.startService(); LOG.info("Timeline service server port: " + timelineServicePort); return timelineService; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/providers/SparkProvider.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/providers/SparkProvider.java index 3a8bb1a300f1..91045034e5f3 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/providers/SparkProvider.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/providers/SparkProvider.java @@ -38,7 +38,7 @@ public interface SparkProvider extends org.apache.hudi.testutils.providers.Hoodi default SparkConf conf(Map overwritingConfigs) { SparkConf sparkConf = new SparkConf(); sparkConf.set("spark.app.name", getClass().getName()); - sparkConf.set("spark.master", "local[*]"); + sparkConf.set("spark.master", "local[8]"); sparkConf.set("spark.default.parallelism", "4"); sparkConf.set("spark.sql.shuffle.partitions", "4"); sparkConf.set("spark.driver.maxResultSize", "2g"); diff --git 
a/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaCompatibility.java b/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaCompatibility.java index 63629b31dddc..b7bf072b218b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaCompatibility.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaCompatibility.java @@ -36,6 +36,7 @@ import java.util.Collections; import java.util.Deque; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; @@ -283,6 +284,35 @@ private SchemaCompatibilityResult getCompatibility(final Schema reader, return result; } + private static String getLocationName(final Deque locations, Type readerType) { + StringBuilder sb = new StringBuilder(); + Iterator locationInfoIterator = locations.iterator(); + boolean addDot = false; + while (locationInfoIterator.hasNext()) { + if (addDot) { + sb.append("."); + } else { + addDot = true; + } + LocationInfo next = locationInfoIterator.next(); + sb.append(next.name); + //we check the reader type if we are at the last location. This is because + //if the type is array/map, that means the problem is that the field type + //of the writer is not array/map. If the type is something else, the problem + //is between the array element/map value of the reader and writer schemas + if (next.type.equals(Type.MAP)) { + if (locationInfoIterator.hasNext() || !readerType.equals(Type.MAP)) { + sb.append(".value"); + } + } else if (next.type.equals(Type.ARRAY)) { + if (locationInfoIterator.hasNext() || !readerType.equals(Type.ARRAY)) { + sb.append(".element"); + } + } + } + return sb.toString(); + } + /** * Calculates the compatibility of a reader/writer schema pair. * @@ -335,7 +365,7 @@ private SchemaCompatibilityResult calculateCompatibility(final Schema reader, fi for (final Schema writerBranch : writer.getTypes()) { SchemaCompatibilityResult compatibility = getCompatibility(reader, writerBranch, locations); if (compatibility.getCompatibility() == SchemaCompatibilityType.INCOMPATIBLE) { - String message = String.format("reader union lacking writer type: %s", writerBranch.getType()); + String message = String.format("reader union lacking writer type: %s for field: '%s'", writerBranch.getType(), getLocationName(locations, reader.getType())); result = result.mergedWith(SchemaCompatibilityResult.incompatible( SchemaIncompatibilityType.MISSING_UNION_BRANCH, reader, writer, message, asList(locations))); } @@ -407,7 +437,7 @@ private SchemaCompatibilityResult calculateCompatibility(final Schema reader, fi } // No branch in the reader union has been found compatible with the writer // schema: - String message = String.format("reader union lacking writer type: %s", writer.getType()); + String message = String.format("reader union lacking writer type: %s for field: '%s'", writer.getType(), getLocationName(locations, reader.getType())); return result.mergedWith(SchemaCompatibilityResult .incompatible(SchemaIncompatibilityType.MISSING_UNION_BRANCH, reader, writer, message, asList(locations))); } @@ -433,9 +463,10 @@ private SchemaCompatibilityResult checkReaderWriterRecordFields(final Schema rea // reader field must have a default value. 
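// Example of the improved diagnostic (illustrative; the exact rendering depends on the schemas involved):
// a reader field added without a default now produces a message roughly of the form
//   "Field '<dotted.location.path>.<fieldName>' has no default value"
// where the dotted prefix comes from getLocationName(...), instead of reporting only the bare field name.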
if (defaultValueAccessor.getDefaultValue(readerField) == null) { // reader field has no default value + String message = String.format("Field '%s.%s' has no default value", getLocationName(locations, readerField.schema().getType()), readerField.name()); result = result.mergedWith( SchemaCompatibilityResult.incompatible(SchemaIncompatibilityType.READER_FIELD_MISSING_DEFAULT_VALUE, - reader, writer, readerField.name(), asList(locations))); + reader, writer, message, asList(locations))); } } else { locations.addLast(new LocationInfo(readerField.name(), readerField.schema().getType())); @@ -482,8 +513,9 @@ private SchemaCompatibilityResult checkReaderEnumContainsAllWriterEnumSymbols(fi final Set symbols = new TreeSet<>(writer.getEnumSymbols()); symbols.removeAll(reader.getEnumSymbols()); if (!symbols.isEmpty()) { + String message = String.format("Field '%s' missing enum symbols: %s", getLocationName(locations, reader.getType()), symbols); result = SchemaCompatibilityResult.incompatible(SchemaIncompatibilityType.MISSING_ENUM_SYMBOLS, reader, - writer, symbols.toString(), asList(locations)); + writer, message, asList(locations)); } return result; } @@ -494,7 +526,7 @@ private SchemaCompatibilityResult checkFixedSize(final Schema reader, final Sche int actual = reader.getFixedSize(); int expected = writer.getFixedSize(); if (actual != expected) { - String message = String.format("expected: %d, found: %d", expected, actual); + String message = String.format("Fixed size field '%s' expected: %d, found: %d", getLocationName(locations, reader.getType()), expected, actual); result = SchemaCompatibilityResult.incompatible(SchemaIncompatibilityType.FIXED_SIZE_MISMATCH, reader, writer, message, asList(locations)); } @@ -511,7 +543,7 @@ private SchemaCompatibilityResult checkSchemaNames(final Schema reader, final Sc boolean shouldCheckNames = checkNaming && (locations.size() == 1 || locations.peekLast().type == Type.UNION); SchemaCompatibilityResult result = SchemaCompatibilityResult.compatible(); if (shouldCheckNames && !Objects.equals(reader.getFullName(), writer.getFullName())) { - String message = String.format("expected: %s", writer.getFullName()); + String message = String.format("Reader schema name: '%s' is not compatible with writer schema name: '%s'", reader.getFullName(), writer.getFullName()); result = SchemaCompatibilityResult.incompatible(SchemaIncompatibilityType.NAME_MISMATCH, reader, writer, message, asList(locations)); } @@ -520,8 +552,8 @@ private SchemaCompatibilityResult checkSchemaNames(final Schema reader, final Sc private SchemaCompatibilityResult typeMismatch(final Schema reader, final Schema writer, final Deque locations) { - String message = String.format("reader type: %s not compatible with writer type: %s", reader.getType(), - writer.getType()); + String message = String.format("reader type '%s' not compatible with writer type '%s' for field '%s'", reader.getType(), + writer.getType(), getLocationName(locations, reader.getType())); return SchemaCompatibilityResult.incompatible(SchemaIncompatibilityType.TYPE_MISMATCH, reader, writer, message, asList(locations)); } diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java index b0489d75ae01..ba747a63cbc0 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java @@ -19,14 +19,19 @@ package org.apache.hudi.avro; import 
org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieAvroSchemaException; +import org.apache.hudi.exception.InvalidUnionTypeException; +import org.apache.hudi.exception.MissingSchemaFieldException; +import org.apache.hudi.exception.SchemaBackwardsCompatibilityException; import org.apache.hudi.exception.SchemaCompatibilityException; -import org.apache.avro.AvroRuntimeException; import org.apache.avro.Schema; import org.apache.avro.SchemaCompatibility; +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collections; +import java.util.Deque; import java.util.List; import java.util.Objects; import java.util.Set; @@ -59,7 +64,7 @@ public static boolean isSchemaCompatible(Schema prevSchema, Schema newSchema, bo /** * Establishes whether {@code newSchema} is compatible w/ {@code prevSchema}, as * defined by Avro's {@link AvroSchemaCompatibility}. - * From avro's compatability standpoint, prevSchema is writer schema and new schema is reader schema. + * From avro's compatibility standpoint, prevSchema is writer schema and new schema is reader schema. * {@code newSchema} is considered compatible to {@code prevSchema}, iff data written using {@code prevSchema} * could be read by {@code newSchema} * @@ -92,20 +97,20 @@ public static boolean isSchemaCompatible(Schema prevSchema, Schema newSchema, bo * @return true if prev schema is a projection of new schema. */ public static boolean canProject(Schema prevSchema, Schema newSchema) { - return canProject(prevSchema, newSchema, Collections.emptySet()); + return findMissingFields(prevSchema, newSchema, Collections.emptySet()).isEmpty(); } /** - * Check that each field in the prevSchema can be populated in the newSchema except specified columns + * Check that each top level field in the prevSchema can be populated in the newSchema except specified columns * @param prevSchema prev schema. * @param newSchema new schema - * @return true if prev schema is a projection of new schema. + * @return List of fields that should be in the new schema */ - public static boolean canProject(Schema prevSchema, Schema newSchema, Set exceptCols) { + private static List findMissingFields(Schema prevSchema, Schema newSchema, Set exceptCols) { return prevSchema.getFields().stream() .filter(f -> !exceptCols.contains(f.name())) - .map(oldSchemaField -> SchemaCompatibility.lookupWriterField(newSchema, oldSchemaField)) - .noneMatch(Objects::isNull); + .filter(oldSchemaField -> SchemaCompatibility.lookupWriterField(newSchema, oldSchemaField) == null) + .collect(Collectors.toList()); } /** @@ -121,31 +126,6 @@ public static String getAvroRecordQualifiedName(String tableName) { return "hoodie." + sanitizedTableName + "." + sanitizedTableName + "_record"; } - /** - * Validate whether the {@code targetSchema} is a valid evolution of {@code sourceSchema}. - * Basically {@link #isCompatibleProjectionOf(Schema, Schema)} but type promotion in the - * opposite direction - */ - public static boolean isValidEvolutionOf(Schema sourceSchema, Schema targetSchema) { - return (sourceSchema.getType() == Schema.Type.NULL) || isProjectionOfInternal(sourceSchema, targetSchema, - AvroSchemaUtils::isAtomicSchemasCompatibleEvolution); - } - - /** - * Establishes whether {@code newReaderSchema} is compatible w/ {@code prevWriterSchema}, as - * defined by Avro's {@link AvroSchemaCompatibility}. 
- * {@code newReaderSchema} is considered compatible to {@code prevWriterSchema}, iff data written using {@code prevWriterSchema} - * could be read by {@code newReaderSchema} - * @param newReaderSchema new reader schema instance. - * @param prevWriterSchema prev writer schema instance. - * @return true if its compatible. else false. - */ - private static boolean isAtomicSchemasCompatibleEvolution(Schema newReaderSchema, Schema prevWriterSchema) { - // NOTE: Checking for compatibility of atomic types, we should ignore their - // corresponding fully-qualified names (as irrelevant) - return isSchemaCompatible(prevWriterSchema, newReaderSchema, false, true); - } - /** * Validate whether the {@code targetSchema} is a "compatible" projection of {@code sourceSchema}. * Only difference of this method from {@link #isStrictProjectionOf(Schema, Schema)} is @@ -337,7 +317,7 @@ public static Schema resolveUnionSchema(Schema schema, String fieldSchemaFullNam .orElse(null); if (nonNullType == null) { - throw new AvroRuntimeException( + throw new HoodieAvroSchemaException( String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema)); } @@ -369,14 +349,14 @@ public static Schema resolveNullableSchema(Schema schema) { List innerTypes = schema.getTypes(); if (innerTypes.size() != 2) { - throw new AvroRuntimeException( + throw new HoodieAvroSchemaException( String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema)); } Schema firstInnerType = innerTypes.get(0); Schema secondInnerType = innerTypes.get(1); if ((firstInnerType.getType() != Schema.Type.NULL && secondInnerType.getType() != Schema.Type.NULL) || (firstInnerType.getType() == Schema.Type.NULL && secondInnerType.getType() == Schema.Type.NULL)) { - throw new AvroRuntimeException( + throw new HoodieAvroSchemaException( String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema)); } return firstInnerType.getType() == Schema.Type.NULL ? 
secondInnerType : firstInnerType; @@ -428,25 +408,118 @@ public static void checkSchemaCompatible( boolean allowProjection, Set dropPartitionColNames) throws SchemaCompatibilityException { - String errorMessage = null; - - if (!allowProjection && !canProject(tableSchema, writerSchema, dropPartitionColNames)) { - errorMessage = "Column dropping is not allowed"; + if (!allowProjection) { + List missingFields = findMissingFields(tableSchema, writerSchema, dropPartitionColNames); + if (!missingFields.isEmpty()) { + throw new MissingSchemaFieldException(missingFields.stream().map(Schema.Field::name).collect(Collectors.toList()), writerSchema, tableSchema); + } } // TODO(HUDI-4772) re-enable validations in case partition columns // being dropped from the data-file after fixing the write schema - if (dropPartitionColNames.isEmpty() && shouldValidate && !isSchemaCompatible(tableSchema, writerSchema)) { - errorMessage = "Failed schema compatibility check"; + if (dropPartitionColNames.isEmpty() && shouldValidate) { + AvroSchemaCompatibility.SchemaPairCompatibility result = + AvroSchemaCompatibility.checkReaderWriterCompatibility(writerSchema, tableSchema, true); + if (result.getType() != AvroSchemaCompatibility.SchemaCompatibilityType.COMPATIBLE) { + throw new SchemaBackwardsCompatibilityException(result, writerSchema, tableSchema); + } } + } - if (errorMessage != null) { - String errorDetails = String.format( - "%s\nwriterSchema: %s\ntableSchema: %s", - errorMessage, - writerSchema, - tableSchema); - throw new SchemaCompatibilityException(errorDetails); + /** + * Validate whether the {@code incomingSchema} is a valid evolution of {@code tableSchema}. + * + * @param incomingSchema schema of the incoming dataset + * @param tableSchema latest table schema + */ + public static void checkValidEvolution(Schema incomingSchema, Schema tableSchema) { + if (incomingSchema.getType() == Schema.Type.NULL) { + return; } + + //not really needed for `hoodie.write.set.null.for.missing.columns` but good to check anyway + List missingFields = new ArrayList<>(); + findAnyMissingFields(incomingSchema, tableSchema, new ArrayDeque<>(), missingFields); + if (!missingFields.isEmpty()) { + throw new MissingSchemaFieldException(missingFields, incomingSchema, tableSchema); + } + + //make sure that the table schema can be read using the incoming schema + AvroSchemaCompatibility.SchemaPairCompatibility result = + AvroSchemaCompatibility.checkReaderWriterCompatibility(incomingSchema, tableSchema, false); + if (result.getType() != AvroSchemaCompatibility.SchemaCompatibilityType.COMPATIBLE) { + throw new SchemaBackwardsCompatibilityException(result, incomingSchema, tableSchema); + } + } + + /** + * Find all fields in the latest table schema that are not in + * the incoming schema. 
+ */ + private static void findAnyMissingFields(Schema incomingSchema, + Schema latestTableSchema, + Deque visited, + List missingFields) { + findAnyMissingFieldsRec(incomingSchema, latestTableSchema, visited, + missingFields, incomingSchema, latestTableSchema); + } + + /** + * We want to pass the full schemas so that the error message has the entire schema to print from + */ + private static void findAnyMissingFieldsRec(Schema incomingSchema, + Schema latestTableSchema, + Deque visited, + List missingFields, + Schema fullIncomingSchema, + Schema fullTableSchema) { + if (incomingSchema.getType() == latestTableSchema.getType()) { + if (incomingSchema.getType() == Schema.Type.RECORD) { + visited.addLast(latestTableSchema.getName()); + for (Schema.Field targetField : latestTableSchema.getFields()) { + visited.addLast(targetField.name()); + Schema.Field sourceField = incomingSchema.getField(targetField.name()); + if (sourceField == null) { + missingFields.add(String.join(".", visited)); + } else { + findAnyMissingFieldsRec(sourceField.schema(), targetField.schema(), visited, + missingFields, fullIncomingSchema, fullTableSchema); + } + visited.removeLast(); + } + visited.removeLast(); + } else if (incomingSchema.getType() == Schema.Type.ARRAY) { + visited.addLast("element"); + findAnyMissingFieldsRec(incomingSchema.getElementType(), latestTableSchema.getElementType(), + visited, missingFields, fullIncomingSchema, fullTableSchema); + visited.removeLast(); + } else if (incomingSchema.getType() == Schema.Type.MAP) { + visited.addLast("value"); + findAnyMissingFieldsRec(incomingSchema.getValueType(), latestTableSchema.getValueType(), + visited, missingFields, fullIncomingSchema, fullTableSchema); + visited.removeLast(); + } else if (incomingSchema.getType() == Schema.Type.UNION) { + List incomingNestedSchemas = incomingSchema.getTypes(); + List latestTableNestedSchemas = latestTableSchema.getTypes(); + if (incomingNestedSchemas.size() != latestTableNestedSchemas.size()) { + throw new InvalidUnionTypeException(createSchemaErrorString( + String.format("Incoming batch field '%s' has union with %d types, while the table schema has %d types", + String.join(".", visited), incomingNestedSchemas.size(), latestTableNestedSchemas.size()), fullIncomingSchema, fullTableSchema)); + } + if (incomingNestedSchemas.size() > 2) { + throw new InvalidUnionTypeException(createSchemaErrorString( + String.format("Union for incoming batch field '%s' should not have more than 2 types but has %d", + String.join(".", visited), incomingNestedSchemas.size()), fullIncomingSchema, fullTableSchema)); + } + for (int i = 0; i < incomingNestedSchemas.size(); ++i) { + findAnyMissingFieldsRec(incomingNestedSchemas.get(i), latestTableNestedSchemas.get(i), visited, + missingFields, fullIncomingSchema, fullTableSchema); + } + } + } + } + + public static String createSchemaErrorString(String errorMessage, Schema writerSchema, Schema tableSchema) { + return String.format("%s\nwriterSchema: %s\ntableSchema: %s", errorMessage, writerSchema, tableSchema); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java index b352099cb1e1..2172c7b1ae0f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java @@ -36,6 +36,7 @@ import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; import 
org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieAvroSchemaException; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.SchemaCompatibilityException; @@ -196,6 +197,20 @@ public static String avroToJsonString(GenericRecord record, boolean pretty) thro return avroToJsonHelper(record, pretty).toString(); } + /** + * Convert a given avro record to a JSON string. If the record contents are invalid, return the record.toString(). + * Use this method over {@link HoodieAvroUtils#avroToJsonString} when simply trying to print the record contents without any guarantees around their correctness. + * @param record The GenericRecord to convert + * @return a JSON string + */ + public static String safeAvroToJsonString(GenericRecord record) { + try { + return avroToJsonString(record, false); + } catch (Exception e) { + return record.toString(); + } + } + /** * Convert a given avro record to json and return the encoded bytes. * @@ -931,7 +946,9 @@ private static Object rewriteRecordWithNewSchema(Object oldRecord, Schema oldAvr private static Object rewriteRecordWithNewSchemaInternal(Object oldRecord, Schema oldSchema, Schema newSchema, Map renameCols, Deque fieldNames) { switch (newSchema.getType()) { case RECORD: - ValidationUtils.checkArgument(oldRecord instanceof IndexedRecord, "cannot rewrite record with different type"); + if (!(oldRecord instanceof IndexedRecord)) { + throw new SchemaCompatibilityException("cannot rewrite record with different type"); + } IndexedRecord indexedRecord = (IndexedRecord) oldRecord; List fields = newSchema.getFields(); GenericData.Record newRecord = new GenericData.Record(newSchema); @@ -963,15 +980,17 @@ private static Object rewriteRecordWithNewSchemaInternal(Object oldRecord, Schem } return newRecord; case ENUM: - ValidationUtils.checkArgument( - oldSchema.getType() == Schema.Type.STRING || oldSchema.getType() == Schema.Type.ENUM, - "Only ENUM or STRING type can be converted ENUM type"); + if (oldSchema.getType() != Schema.Type.STRING && oldSchema.getType() != Schema.Type.ENUM) { + throw new SchemaCompatibilityException(String.format("Only ENUM or STRING type can be converted ENUM type. 
Schema type was %s", oldSchema.getType().getName())); + } if (oldSchema.getType() == Schema.Type.STRING) { return new GenericData.EnumSymbol(newSchema, oldRecord); } return oldRecord; case ARRAY: - ValidationUtils.checkArgument(oldRecord instanceof Collection, "cannot rewrite record with different type"); + if (!(oldRecord instanceof Collection)) { + throw new SchemaCompatibilityException(String.format("Cannot rewrite %s as an array", oldRecord.getClass().getName())); + } Collection array = (Collection) oldRecord; List newArray = new ArrayList<>(array.size()); fieldNames.push("element"); @@ -981,7 +1000,9 @@ private static Object rewriteRecordWithNewSchemaInternal(Object oldRecord, Schem fieldNames.pop(); return newArray; case MAP: - ValidationUtils.checkArgument(oldRecord instanceof Map, "cannot rewrite record with different type"); + if (!(oldRecord instanceof Map)) { + throw new SchemaCompatibilityException(String.format("Cannot rewrite %s as a map", oldRecord.getClass().getName())); + } Map map = (Map) oldRecord; Map newMap = new HashMap<>(map.size(), 1.0f); fieldNames.push("value"); @@ -1029,7 +1050,7 @@ private static Object rewritePrimaryType(Object oldValue, Schema oldSchema, Sche BigDecimal bd = new BigDecimal(new BigInteger(bytes), decimal.getScale()).setScale(((Decimal) newSchema.getLogicalType()).getScale()); return DECIMAL_CONVERSION.toFixed(bd, newSchema, newSchema.getLogicalType()); } else { - throw new UnsupportedOperationException("Fixed type size change is not currently supported"); + throw new HoodieAvroSchemaException("Fixed type size change is not currently supported"); } } @@ -1045,7 +1066,7 @@ private static Object rewritePrimaryType(Object oldValue, Schema oldSchema, Sche } default: - throw new AvroRuntimeException("Unknown schema type: " + newSchema.getType()); + throw new HoodieAvroSchemaException("Unknown schema type: " + newSchema.getType()); } } else { return rewritePrimaryTypeWithDiffSchemaType(oldValue, oldSchema, newSchema); @@ -1130,7 +1151,7 @@ private static Object rewritePrimaryTypeWithDiffSchemaType(Object oldValue, Sche break; default: } - throw new AvroRuntimeException(String.format("cannot support rewrite value for schema type: %s since the old schema type is: %s", newSchema, oldSchema)); + throw new HoodieAvroSchemaException(String.format("cannot support rewrite value for schema type: %s since the old schema type is: %s", newSchema, oldSchema)); } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bloom/InternalBloomFilter.java b/hudi-common/src/main/java/org/apache/hudi/common/bloom/InternalBloomFilter.java index ac93de2d58fb..7ef766a2a3c5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/bloom/InternalBloomFilter.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/bloom/InternalBloomFilter.java @@ -199,7 +199,7 @@ public String toString() { } /** - * @return size of the the bloomfilter + * @return size of the bloomfilter */ public int getVectorSize() { return this.vectorSize; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bloom/Key.java b/hudi-common/src/main/java/org/apache/hudi/common/bloom/Key.java index 37ae6e68f73a..f14d301ae3b3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/bloom/Key.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/bloom/Key.java @@ -136,7 +136,7 @@ public int hashCode() { /** * Serialize the fields of this object to out. * - * @param out DataOuput to serialize this object into. + * @param out DataOutput to serialize this object into. 
* @throws IOException */ public void write(DataOutput out) throws IOException { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieCommonConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieCommonConfig.java index 8a1dbf04b3b5..1a4c2e317807 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieCommonConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieCommonConfig.java @@ -72,15 +72,6 @@ public class HoodieCommonConfig extends HoodieConfig { + "This enables us, to always extend the table's schema during evolution and never lose the data (when, for " + "ex, existing column is being dropped in a new batch)"); - public static final ConfigProperty MAKE_NEW_COLUMNS_NULLABLE = ConfigProperty - .key("hoodie.datasource.write.new.columns.nullable") - .defaultValue(false) - .markAdvanced() - .sinceVersion("0.14.0") - .withDocumentation("When a non-nullable column is added to datasource during a write operation, the write " - + " operation will fail schema compatibility check. Set this option to true will make the newly added " - + " column nullable to successfully complete the write operation."); - public static final ConfigProperty SET_NULL_FOR_MISSING_COLUMNS = ConfigProperty .key("hoodie.write.set.null.for.missing.columns") .defaultValue("false") diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java index d68b8326ca8c..f3ad183def43 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java @@ -226,6 +226,17 @@ public class HoodieStorageConfig extends HoodieConfig { + "and it is loaded at runtime. This is only required when trying to " + "override the existing write context."); + public static final ConfigProperty HOODIE_PARQUET_SPARK_ROW_WRITE_SUPPORT_CLASS = ConfigProperty + .key("hoodie.parquet.spark.row.write.support.class") + .defaultValue("org.apache.hudi.io.storage.row.HoodieRowParquetWriteSupport") + .markAdvanced() + .sinceVersion("0.15.0") + .withDocumentation("Provided write support class should extend HoodieRowParquetWriteSupport class " + + "and it is loaded at runtime. 
This is only required when trying to " + + "override the existing write context when `hoodie.datasource.write.row.writer.enable=true`."); + + + /** * @deprecated Use {@link #PARQUET_MAX_FILE_SIZE} and its methods instead */ diff --git a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieBaseListData.java b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieBaseListData.java index 7bc276b36e67..6f3dbfcef993 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieBaseListData.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/data/HoodieBaseListData.java @@ -53,7 +53,7 @@ protected Stream asStream() { protected boolean isEmpty() { if (lazy) { - return data.asLeft().findAny().isPresent(); + return !data.asLeft().findAny().isPresent(); } else { return data.asRight().isEmpty(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java index 9c556e84f82c..f230afed30f8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/engine/HoodieReaderContext.java @@ -245,4 +245,11 @@ public long extractRecordPosition(T record, Schema schema, String fieldName, lon } return providedPositionIfNeeded; } + + /** + * Constructs engine specific delete record. + */ + public T constructRawDeleteRecord(Map metadata) { + return null; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java index d84c677e3418..44e153dcce0a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java @@ -142,12 +142,6 @@ public static String makeWriteToken(int taskPartitionId, int stageId, long taskA return String.format("%d-%d-%d", taskPartitionId, stageId, taskAttemptId); } - // TODO: this should be removed - public static String makeBaseFileName(String instantTime, String writeToken, String fileId) { - return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, - HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension()); - } - public static String makeBaseFileName(String instantTime, String writeToken, String fileId, String fileExtension) { return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, fileExtension); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/SizeAwareDataOutputStream.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/SizeAwareDataOutputStream.java index 1cc3da6fe3cb..350665d2521c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/SizeAwareDataOutputStream.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/SizeAwareDataOutputStream.java @@ -55,7 +55,7 @@ public void write(byte[] v) throws IOException { } public void write(byte[] v, int offset, int len) throws IOException { - size.addAndGet(len + offset); + size.addAndGet((long) len + offset); outputStream.write(v, offset, len); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java index daa1dcb0207f..a3e6ce1f1331 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java @@ -37,9 
+37,11 @@ import java.util.concurrent.atomic.AtomicBoolean; /** + * Default payload. * {@link HoodieRecordPayload} impl that honors ordering field in both preCombine and combineAndGetUpdateValue. *

- * 1. preCombine - Picks the latest delta record for a key, based on an ordering field 2. combineAndGetUpdateValue/getInsertValue - Chooses the latest record based on ordering field value. + * 1. preCombine - Picks the latest delta record for a key, based on an ordering field + * 2. combineAndGetUpdateValue/getInsertValue - Chooses the latest record based on ordering field value. */ public class DefaultHoodieRecordPayload extends OverwriteWithLatestAvroPayload { public static final String METADATA_EVENT_TIME_KEY = "metadata.event_time.key"; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/FirstValueAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/FirstValueAvroPayload.java new file mode 100644 index 000000000000..33da44e3bccd --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/FirstValueAvroPayload.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model; + +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.util.ConfigUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; + +import java.util.Properties; + +/** + * Payload clazz that is used for Hudi Table. + * + *

Simplified FirstValueAvroPayload Logic: + *

+ *
+ *  Illustration with simple data.
+ *  the ordering field is 'ts', the record key is 'id', and the schema is:
+ *  {
+ *    [
+ *      {"name":"id","type":"string"},
+ *      {"name":"ts","type":"long"},
+ *      {"name":"name","type":"string"},
+ *      {"name":"price","type":"string"}
+ *    ]
+ *  }
+ *
+ *  case 1
+ *  Current data:
+ *      id      ts      name    price
+ *      1       1       name_1  price_1
+ *  Insert data:
+ *      id      ts      name    price
+ *      1       1       name_2  price_2
+ *
+ *  Result data after #preCombine or #combineAndGetUpdateValue:
+ *      id      ts      name    price
+ *      1       1       name_1  price_1
+ *
+ *  If the ordering (precombine) values are equal, the first record is kept.
+ *
+ *  case 2
+ *  Current data:
+ *      id      ts      name    price
+ *      1       1       name_1  price_1
+ *  Insert data:
+ *      id      ts      name    price
+ *      1       2       name_2  price_2
+ *
+ *  Result data after #preCombine or #combineAndGetUpdateValue:
+ *      id      ts      name    price
+ *      1       2       name_2  price_2
+ *
+ *  All other behavior is inherited from DefaultHoodieRecordPayload.
+ * 
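+ *  A minimal sketch of the tie-breaking rule above, kept here for illustration only
+ *  (record1 and record2 are hypothetical GenericRecords, not part of this change):
+ *
+ *    FirstValueAvroPayload first  = new FirstValueAvroPayload(record1, 1L);
+ *    FirstValueAvroPayload second = new FirstValueAvroPayload(record2, 1L);
+ *    // equal ordering values: preCombine keeps the payload seen first
+ *    second.preCombine(first);   // returns 'first'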
+ */ +public class FirstValueAvroPayload extends DefaultHoodieRecordPayload { + + public FirstValueAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public FirstValueAvroPayload(Option record) { + super(record); + } + + @Override + public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload oldValue) { + if (oldValue.recordBytes.length == 0) { + // use natural order for delete record + return this; + } + if (oldValue.orderingVal.compareTo(orderingVal) >= 0) { + // pick the payload with greatest ordering value + return oldValue; + } else { + return this; + } + } + + @Override + protected boolean needUpdatingPersistedRecord(IndexedRecord currentValue, + IndexedRecord incomingRecord, Properties properties) { + /* + * Combining strategy here returns currentValue on disk if incoming record is older absolutely. + * The incoming record can be either a delete (sent as an upsert with _hoodie_is_deleted set to true) + * or an insert/update record. In any case, if it is older absolutely than the record in disk, the currentValue + * in disk is returned (to be rewritten with new commit time). + */ + String orderField = ConfigUtils.getOrderingField(properties); + if (orderField == null) { + return true; + } + boolean consistentLogicalTimestampEnabled = Boolean.parseBoolean(properties.getProperty( + KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), + KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())); + Object persistedOrderingVal = HoodieAvroUtils.getNestedFieldVal((GenericRecord) currentValue, + orderField, + true, consistentLogicalTimestampEnabled); + Comparable incomingOrderingVal = (Comparable) HoodieAvroUtils.getNestedFieldVal((GenericRecord) incomingRecord, + orderField, + true, consistentLogicalTimestampEnabled); + return persistedOrderingVal == null || ((Comparable) persistedOrderingVal).compareTo(incomingOrderingVal) < 0; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieDeltaWriteStat.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieDeltaWriteStat.java index 4fee7cdcb6ea..0593e280e6f9 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieDeltaWriteStat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieDeltaWriteStat.java @@ -94,6 +94,20 @@ public Option>> getColumnStats return recordsStats; } + /** + * Make a new write status and copy basic fields from current object + * @return copy write status + */ + public HoodieDeltaWriteStat copy() { + HoodieDeltaWriteStat copy = new HoodieDeltaWriteStat(); + copy.setFileId(getFileId()); + copy.setPartitionPath(getPartitionPath()); + copy.setPrevCommit(getPrevCommit()); + copy.setBaseFile(getBaseFile()); + copy.setLogFiles(new ArrayList<>(getLogFiles())); + return copy; + } + private static Map> mergeRecordsStats( Map> stats1, Map> stats2) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java index d9fbd4cba05c..dac9b8288969 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java @@ -30,8 +30,6 @@ import java.util.Objects; /** - * Default payload. - * *
    *
  1. preCombine - Picks the latest delta record for a key, based on an ordering field; *
  2. combineAndGetUpdateValue/getInsertValue - Simply overwrites storage with latest delta record diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/RecordPayloadType.java b/hudi-common/src/main/java/org/apache/hudi/common/model/RecordPayloadType.java index d1eae004dc51..953a79348b7d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/RecordPayloadType.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/RecordPayloadType.java @@ -49,7 +49,7 @@ public enum RecordPayloadType { @EnumFieldDescription("Subclass of OVERWRITE_LATEST_AVRO used for delta streamer.") OVERWRITE_NON_DEF_LATEST_AVRO(OverwriteNonDefaultsWithLatestAvroPayload.class.getName()), - @EnumFieldDescription("Default payload used for delta streamer.") + @EnumFieldDescription("Honors ordering field in preCombine and overwrites storage with latest delta record in combineAndGetUpdateValue") OVERWRITE_LATEST_AVRO(OverwriteWithLatestAvroPayload.class.getName()), @EnumFieldDescription("Used for partial update to Hudi Table.") diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java index f70f9456a868..3cac820d4a81 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java @@ -24,12 +24,12 @@ import org.apache.hudi.common.config.OrderedProperties; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.BootstrapIndexType; +import org.apache.hudi.common.model.DefaultHoodieRecordPayload; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordMerger; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieTimelineTimeZone; -import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.model.RecordPayloadType; import org.apache.hudi.common.table.cdc.HoodieCDCSupplementalLoggingMode; import org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator; @@ -167,14 +167,14 @@ public class HoodieTableConfig extends HoodieConfig { public static final ConfigProperty PAYLOAD_CLASS_NAME = ConfigProperty .key("hoodie.compaction.payload.class") - .defaultValue(OverwriteWithLatestAvroPayload.class.getName()) + .defaultValue(DefaultHoodieRecordPayload.class.getName()) .deprecatedAfter("1.0.0") .withDocumentation("Payload class to use for performing compactions, i.e merge delta logs with current base file and then " + " produce a new base file."); public static final ConfigProperty PAYLOAD_TYPE = ConfigProperty .key("hoodie.compaction.payload.type") - .defaultValue(RecordPayloadType.OVERWRITE_LATEST_AVRO.name()) + .defaultValue(RecordPayloadType.HOODIE_AVRO_DEFAULT.name()) .sinceVersion("1.0.0") .withDocumentation(RecordPayloadType.class); @@ -747,52 +747,52 @@ public boolean isMetadataPartitionAvailable(MetadataPartitionType partition) { /** * Enables or disables the specified metadata table partition. 
* - * @param partitionType The partition + * @param partitionPath The partition * @param enabled If true, the partition is enabled, else disabled */ - public void setMetadataPartitionState(HoodieTableMetaClient metaClient, MetadataPartitionType partitionType, boolean enabled) { - ValidationUtils.checkArgument(!partitionType.getPartitionPath().contains(CONFIG_VALUES_DELIMITER), - "Metadata Table partition path cannot contain a comma: " + partitionType.getPartitionPath()); + public void setMetadataPartitionState(HoodieTableMetaClient metaClient, String partitionPath, boolean enabled) { + ValidationUtils.checkArgument(!partitionPath.contains(CONFIG_VALUES_DELIMITER), + "Metadata Table partition path cannot contain a comma: " + partitionPath); Set partitions = getMetadataPartitions(); Set partitionsInflight = getMetadataPartitionsInflight(); if (enabled) { - partitions.add(partitionType.getPartitionPath()); - partitionsInflight.remove(partitionType.getPartitionPath()); - } else if (partitionType.equals(MetadataPartitionType.FILES)) { + partitions.add(partitionPath); + partitionsInflight.remove(partitionPath); + } else if (partitionPath.equals(MetadataPartitionType.FILES.getPartitionPath())) { // file listing partition is required for all other partitions to work // Disabling file partition will also disable all partitions partitions.clear(); partitionsInflight.clear(); } else { - partitions.remove(partitionType.getPartitionPath()); - partitionsInflight.remove(partitionType.getPartitionPath()); + partitions.remove(partitionPath); + partitionsInflight.remove(partitionPath); } setValue(TABLE_METADATA_PARTITIONS, partitions.stream().sorted().collect(Collectors.joining(CONFIG_VALUES_DELIMITER))); setValue(TABLE_METADATA_PARTITIONS_INFLIGHT, partitionsInflight.stream().sorted().collect(Collectors.joining(CONFIG_VALUES_DELIMITER))); update(metaClient.getFs(), new Path(metaClient.getMetaPath()), getProps()); - LOG.info(String.format("MDT %s partition %s has been %s", metaClient.getBasePathV2(), partitionType.name(), enabled ? "enabled" : "disabled")); + LOG.info(String.format("MDT %s partition %s has been %s", metaClient.getBasePathV2(), partitionPath, enabled ? "enabled" : "disabled")); } /** * Enables the specified metadata table partition as inflight. * - * @param partitionTypes The list of partitions to enable as inflight. + * @param partitionPaths The list of partitions to enable as inflight. 
*/ - public void setMetadataPartitionsInflight(HoodieTableMetaClient metaClient, List partitionTypes) { + public void setMetadataPartitionsInflight(HoodieTableMetaClient metaClient, List partitionPaths) { Set partitionsInflight = getMetadataPartitionsInflight(); - partitionTypes.forEach(t -> { - ValidationUtils.checkArgument(!t.getPartitionPath().contains(CONFIG_VALUES_DELIMITER), - "Metadata Table partition path cannot contain a comma: " + t.getPartitionPath()); - partitionsInflight.add(t.getPartitionPath()); + partitionPaths.forEach(partitionPath -> { + ValidationUtils.checkArgument(!partitionPath.contains(CONFIG_VALUES_DELIMITER), + "Metadata Table partition path cannot contain a comma: " + partitionPath); + partitionsInflight.add(partitionPath); }); setValue(TABLE_METADATA_PARTITIONS_INFLIGHT, partitionsInflight.stream().sorted().collect(Collectors.joining(CONFIG_VALUES_DELIMITER))); update(metaClient.getFs(), new Path(metaClient.getMetaPath()), getProps()); - LOG.info(String.format("MDT %s partitions %s have been set to inflight", metaClient.getBasePathV2(), partitionTypes)); + LOG.info(String.format("MDT %s partitions %s have been set to inflight", metaClient.getBasePathV2(), partitionPaths)); } public void setMetadataPartitionsInflight(HoodieTableMetaClient metaClient, MetadataPartitionType... partitionTypes) { - setMetadataPartitionsInflight(metaClient, Arrays.stream(partitionTypes).collect(Collectors.toList())); + setMetadataPartitionsInflight(metaClient, Arrays.stream(partitionTypes).map(MetadataPartitionType::getPartitionPath).collect(Collectors.toList())); } /** @@ -800,7 +800,7 @@ public void setMetadataPartitionsInflight(HoodieTableMetaClient metaClient, Meta * {@link HoodieTableConfig#TABLE_METADATA_PARTITIONS_INFLIGHT}. */ public void clearMetadataPartitions(HoodieTableMetaClient metaClient) { - setMetadataPartitionState(metaClient, MetadataPartitionType.FILES, false); + setMetadataPartitionState(metaClient, MetadataPartitionType.FILES.getPartitionPath(), false); } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java index a2455e08356b..6468d165568d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java @@ -636,11 +636,11 @@ private static void initTableMetaClient(Configuration hadoopConf, String basePat fs.mkdirs(auxiliaryFolder); } - initializeBootstrapDirsIfNotExists(hadoopConf, basePath, fs); + initializeBootstrapDirsIfNotExists(basePath, fs); HoodieTableConfig.create(fs, metaPathDir, props); } - public static void initializeBootstrapDirsIfNotExists(Configuration hadoopConf, String basePath, FileSystem fs) throws IOException { + public static void initializeBootstrapDirsIfNotExists(String basePath, FileSystem fs) throws IOException { // Create bootstrap index by partition folder if it does not exist final Path bootstrap_index_folder_by_partition = @@ -801,7 +801,7 @@ public String toString() { } public void initializeBootstrapDirsIfNotExists() throws IOException { - initializeBootstrapDirsIfNotExists(getHadoopConf(), basePath.toString(), getFs()); + initializeBootstrapDirsIfNotExists(basePath.toString(), getFs()); } private static HoodieTableMetaClient newMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad, diff --git 
a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java index a8f46c416f9d..9ab8ef52d19d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java @@ -37,8 +37,8 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.HoodieIncompatibleSchemaException; import org.apache.hudi.exception.InvalidTableException; +import org.apache.hudi.internal.schema.HoodieSchemaException; import org.apache.hudi.internal.schema.InternalSchema; import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager; import org.apache.hudi.internal.schema.utils.SerDeHelper; @@ -571,7 +571,7 @@ public static Schema appendPartitionColumns(Schema dataSchema, Option boolean hasPartitionColNotInSchema = Arrays.stream(partitionFields.get()).anyMatch(pf -> !containsFieldInSchema(dataSchema, pf)); boolean hasPartitionColInSchema = Arrays.stream(partitionFields.get()).anyMatch(pf -> containsFieldInSchema(dataSchema, pf)); if (hasPartitionColNotInSchema && hasPartitionColInSchema) { - throw new HoodieIncompatibleSchemaException("Partition columns could not be partially contained w/in the data schema"); + throw new HoodieSchemaException("Partition columns could not be partially contained w/in the data schema"); } if (hasPartitionColNotInSchema) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieFileGroupRecordBuffer.java b/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieFileGroupRecordBuffer.java index ccc001e79c91..d9ba8bcd90eb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieFileGroupRecordBuffer.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieFileGroupRecordBuffer.java @@ -34,8 +34,12 @@ public interface HoodieFileGroupRecordBuffer { enum BufferType { - KEY_BASED, - POSITION_BASED + // Merging based on record key. + KEY_BASED_MERGE, + // Merging based on record position. + POSITION_BASED_MERGE, + // No Merging at all. 
+ UNMERGED } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieKeyBasedFileGroupRecordBuffer.java b/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieKeyBasedFileGroupRecordBuffer.java index b4e32be8c656..0430a42e8639 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieKeyBasedFileGroupRecordBuffer.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieKeyBasedFileGroupRecordBuffer.java @@ -65,7 +65,7 @@ public HoodieKeyBasedFileGroupRecordBuffer(HoodieReaderContext readerContext, @Override public BufferType getBufferType() { - return BufferType.KEY_BASED; + return BufferType.KEY_BASED_MERGE; } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodiePositionBasedFileGroupRecordBuffer.java b/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodiePositionBasedFileGroupRecordBuffer.java index 4412713928ff..50e969343e15 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodiePositionBasedFileGroupRecordBuffer.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodiePositionBasedFileGroupRecordBuffer.java @@ -72,7 +72,7 @@ public HoodiePositionBasedFileGroupRecordBuffer(HoodieReaderContext readerCon @Override public BufferType getBufferType() { - return BufferType.POSITION_BASED; + return BufferType.POSITION_BASED_MERGE; } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieUnmergedFileGroupRecordBuffer.java b/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieUnmergedFileGroupRecordBuffer.java new file mode 100644 index 000000000000..76aa28308c44 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/read/HoodieUnmergedFileGroupRecordBuffer.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.common.table.read; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieReaderContext; +import org.apache.hudi.common.model.DeleteRecord; +import org.apache.hudi.common.model.HoodieRecordMerger; +import org.apache.hudi.common.table.log.KeySpec; +import org.apache.hudi.common.table.log.block.HoodieDataBlock; +import org.apache.hudi.common.table.log.block.HoodieDeleteBlock; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.ClosableIterator; +import org.apache.hudi.common.util.collection.ExternalSpillableMap; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; + +import org.apache.avro.Schema; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Iterator; +import java.util.Map; + +public class HoodieUnmergedFileGroupRecordBuffer extends HoodieBaseFileGroupRecordBuffer { + // Used to order the records in the record map. + private Long putIndex = 0L; + private Long getIndex = 0L; + + public HoodieUnmergedFileGroupRecordBuffer( + HoodieReaderContext readerContext, + Schema readerSchema, + Schema baseFileSchema, + Option partitionNameOverrideOpt, + Option partitionPathFieldOpt, + HoodieRecordMerger recordMerger, + TypedProperties payloadProps, + long maxMemorySizeInBytes, + String spillableMapBasePath, + ExternalSpillableMap.DiskMapType diskMapType, + boolean isBitCaskDiskMapCompressionEnabled) { + super(readerContext, readerSchema, baseFileSchema, partitionNameOverrideOpt, partitionPathFieldOpt, + recordMerger, payloadProps, maxMemorySizeInBytes, spillableMapBasePath, diskMapType, isBitCaskDiskMapCompressionEnabled); + } + + @Override + protected boolean doHasNext() throws IOException { + ValidationUtils.checkState(baseFileIterator != null, "Base file iterator has not been set yet"); + + // Output from base file first. + if (baseFileIterator.hasNext()) { + nextRecord = baseFileIterator.next(); + return true; + } + + // Output records based on the index to preserve the order. 
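+ // Descriptive note: log records were buffered under a monotonically increasing putIndex
+ // (see processNextDataRecord/processNextDeletedRecord), so draining the map by getIndex
+ // below replays them in log order without any merging.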
+ if (!records.isEmpty()) { + Pair, Map> nextRecordInfo = records.remove(getIndex++); + + if (nextRecordInfo == null) { + throw new HoodieException("Row index should be continuous!"); + } + + if (nextRecordInfo.getLeft().isPresent()) { + nextRecord = nextRecordInfo.getKey().get(); + } else { + nextRecord = readerContext.constructRawDeleteRecord(nextRecordInfo.getRight()); + } + return true; + } + + return false; + } + + @Override + public Iterator, Map>> getLogRecordIterator() { + return records.values().iterator(); + } + + @Override + public BufferType getBufferType() { + return BufferType.UNMERGED; + } + + @Override + public void processDataBlock(HoodieDataBlock dataBlock, Option keySpecOpt) { + Pair, Schema> recordsIteratorSchemaPair = + getRecordsIterator(dataBlock, keySpecOpt); + if (dataBlock.containsPartialUpdates()) { + throw new HoodieException("Partial update is not supported for unmerged record read"); + } + + try (ClosableIterator recordIterator = recordsIteratorSchemaPair.getLeft()) { + while (recordIterator.hasNext()) { + T nextRecord = recordIterator.next(); + Map metadata = readerContext.generateMetadataForRecord( + nextRecord, recordsIteratorSchemaPair.getRight()); + processNextDataRecord(nextRecord, metadata, putIndex++); + } + } + } + + @Override + public void processNextDataRecord(T record, Map metadata, Serializable index) { + records.put(index, Pair.of(Option.ofNullable(readerContext.seal(record)), metadata)); + } + + @Override + public void processDeleteBlock(HoodieDeleteBlock deleteBlock) { + Iterator it = Arrays.stream(deleteBlock.getRecordsToDelete()).iterator(); + while (it.hasNext()) { + DeleteRecord record = it.next(); + processNextDeletedRecord(record, putIndex++); + } + } + + @Override + public void processNextDeletedRecord(DeleteRecord deleteRecord, Serializable index) { + records.put(index, Pair.of(Option.empty(), readerContext.generateMetadataForRecord( + deleteRecord.getRecordKey(), deleteRecord.getPartitionPath(), deleteRecord.getOrderingValue()))); + } + + @Override + public boolean containsLogRecord(String recordKey) { + return records.containsKey(recordKey); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/read/IncrementalQueryAnalyzer.java b/hudi-common/src/main/java/org/apache/hudi/common/table/read/IncrementalQueryAnalyzer.java index 03596354bb1c..ca8ae575898f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/read/IncrementalQueryAnalyzer.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/read/IncrementalQueryAnalyzer.java @@ -111,6 +111,7 @@ public class IncrementalQueryAnalyzer { private final InstantRange.RangeType rangeType; private final boolean skipCompaction; private final boolean skipClustering; + private final boolean skipInsertOverwrite; private final int limit; private IncrementalQueryAnalyzer( @@ -120,6 +121,7 @@ private IncrementalQueryAnalyzer( InstantRange.RangeType rangeType, boolean skipCompaction, boolean skipClustering, + boolean skipInsertOverwrite, int limit) { this.metaClient = metaClient; this.startTime = Option.ofNullable(startTime); @@ -127,6 +129,7 @@ private IncrementalQueryAnalyzer( this.rangeType = rangeType; this.skipCompaction = skipCompaction; this.skipClustering = skipClustering; + this.skipInsertOverwrite = skipInsertOverwrite; this.limit = limit; } @@ -206,13 +209,13 @@ private static Pair, List> splitInstantByActiveness(List !instant.getAction().equals(HoodieTimeline.COMMIT_ACTION)); } if (skipClustering) { - timeline = timeline.filter(instant -> 
!ClusteringUtils.isClusteringInstant(instant, oriTimeline)); + timeline = timeline.filter(instant -> !ClusteringUtils.isCompletedClusteringInstant(instant, oriTimeline)); + } + if (skipInsertOverwrite) { + timeline = timeline.filter(instant -> !ClusteringUtils.isInsertOverwriteInstant(instant, oriTimeline)); } return timeline; } @@ -254,6 +260,7 @@ public static class Builder { private HoodieTableMetaClient metaClient; private boolean skipCompaction = false; private boolean skipClustering = false; + private boolean skipInsertOverwrite = false; /** * Maximum number of instants to read per run. */ @@ -292,6 +299,11 @@ public Builder skipClustering(boolean skipClustering) { return this; } + public Builder skipInsertOverwrite(boolean skipInsertOverwrite) { + this.skipInsertOverwrite = skipInsertOverwrite; + return this; + } + public Builder limit(int limit) { this.limit = limit; return this; @@ -299,7 +311,7 @@ public Builder limit(int limit) { public IncrementalQueryAnalyzer build() { return new IncrementalQueryAnalyzer(Objects.requireNonNull(this.metaClient), this.startTime, this.endTime, - Objects.requireNonNull(this.rangeType), this.skipCompaction, this.skipClustering, this.limit); + Objects.requireNonNull(this.rangeType), this.skipCompaction, this.skipClustering, this.skipInsertOverwrite, this.limit); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java index c9aba9ebc0dd..d6167ea61e7d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java @@ -72,6 +72,13 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline { private static final Logger LOG = LoggerFactory.getLogger(HoodieArchivedTimeline.class); + /** + * Used for loading the archived timeline incrementally, the earliest loaded instant time get memorized + * each time the timeline is loaded. The instant time is then used as the end boundary + * of the next loading. + */ + private String cursorInstant; + /** * Loads all the archived instants. * Note that there is no lazy loading, so this may not work if the archived timeline range is really long. 
@@ -80,6 +87,7 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline { public HoodieArchivedTimeline(HoodieTableMetaClient metaClient) { this.metaClient = metaClient; setInstants(this.loadInstants()); + this.cursorInstant = firstInstant().map(HoodieInstant::getTimestamp).orElse(null); // multiple casts will make this lambda serializable - // http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 this.details = (Function> & Serializable) this::getInstantDetails; @@ -92,6 +100,7 @@ public HoodieArchivedTimeline(HoodieTableMetaClient metaClient) { public HoodieArchivedTimeline(HoodieTableMetaClient metaClient, String startTs) { this.metaClient = metaClient; setInstants(loadInstants(new StartTsFilter(startTs), LoadMode.METADATA)); + this.cursorInstant = startTs; // multiple casts will make this lambda serializable - // http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 this.details = (Function> & Serializable) this::getInstantDetails; @@ -152,6 +161,26 @@ public HoodieArchivedTimeline reload() { return new HoodieArchivedTimeline(metaClient); } + /** + * Reloads the archived timeline incrementally with given beginning timestamp {@code startTs}. + * This method is not thread safe. + * + *

    IMPORTANT: this is for multiple loading of one static snapshot of the timeline, if there is new instants got archived, + * use {@link #reload()} instead. + */ + public HoodieArchivedTimeline reload(String startTs) { + if (this.cursorInstant != null) { + if (HoodieTimeline.compareTimestamps(startTs, LESSER_THAN, this.cursorInstant)) { + appendInstants(loadInstants(new ClosedOpenTimeRangeFilter(startTs, this.cursorInstant), LoadMode.METADATA)); + this.cursorInstant = startTs; + } + return this; + } else { + // a null cursor instant indicates an empty timeline + return new HoodieArchivedTimeline(metaClient, startTs); + } + } + private HoodieInstant readCommit(String instantTime, GenericRecord record, Option> instantDetailsConsumer) { final String action = record.get(ACTION_ARCHIVED_META_FIELD).toString(); final String completionTime = record.get(COMPLETION_TIME_ARCHIVED_META_FIELD).toString(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java index d08194f266c9..9b231b4ee855 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieDefaultTimeline.java @@ -18,23 +18,22 @@ package org.apache.hudi.common.table.timeline; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.timeline.HoodieInstant.State; import org.apache.hudi.common.util.ClusteringUtils; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.exception.HoodieException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.io.Serializable; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; +import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Set; @@ -75,15 +74,23 @@ public HoodieDefaultTimeline(Stream instants, Function instants) { this.instants = instants; - final MessageDigest md; - try { - md = MessageDigest.getInstance(HASHING_ALGORITHM); - this.instants.forEach(i -> md - .update(getUTF8Bytes(StringUtils.joinUsingDelim("_", i.getTimestamp(), i.getAction(), i.getState().name())))); - } catch (NoSuchAlgorithmException nse) { - throw new HoodieException(nse); + this.timelineHash = computeTimelineHash(this.instants); + clearState(); + } + + public void appendInstants(List newInstants) { + if (newInstants.isEmpty()) { + // the new instants is empty, nothing to do. + return; + } + if (this.instants.isEmpty()) { + // the existing instants is empty, set up the new ones directly. 
+ setInstants(newInstants); + return; } - this.timelineHash = StringUtils.toHexString(md.digest()); + this.instants = mergeInstants(newInstants, this.instants); + this.timelineHash = computeTimelineHash(this.instants); + clearState(); } /** @@ -502,25 +509,18 @@ public Option getFirstNonSavepointCommit() { } @Override - public Option getLastClusterCommit() { - return Option.fromJavaOptional(getCommitsTimeline().filter(s -> s.getAction().equalsIgnoreCase(HoodieTimeline.REPLACE_COMMIT_ACTION)) + public Option getLastClusteringInstant() { + return Option.fromJavaOptional(getCommitsTimeline().filter(s -> s.getAction().equalsIgnoreCase(HoodieTimeline.REPLACE_COMMIT_ACTION)) .getReverseOrderedInstants() - .filter(i -> { - try { - HoodieCommitMetadata metadata = TimelineUtils.getCommitMetadata(i, this); - return metadata.getOperationType().equals(WriteOperationType.CLUSTER); - } catch (IOException e) { - LOG.warn("Unable to read commit metadata for " + i + " due to " + e.getMessage()); - return false; - } - }).findFirst()); + .filter(i -> ClusteringUtils.isClusteringInstant(this, i)) + .findFirst()); } @Override public Option getLastPendingClusterInstant() { return Option.fromJavaOptional(filterPendingReplaceTimeline() .getReverseOrderedInstants() - .filter(i -> ClusteringUtils.isPendingClusteringInstant(this, i)).findFirst()); + .filter(i -> ClusteringUtils.isClusteringInstant(this, i)).findFirst()); } @Override @@ -567,6 +567,11 @@ private static Option findFirstNonSavepointCommit(List instants) { + final MessageDigest md; + try { + md = MessageDigest.getInstance(HASHING_ALGORITHM); + instants.forEach(i -> md + .update(getUTF8Bytes(StringUtils.joinUsingDelim("_", i.getTimestamp(), i.getAction(), i.getState().name())))); + } catch (NoSuchAlgorithmException nse) { + throw new HoodieException(nse); + } + return StringUtils.toHexString(md.digest()); + } + + /** + * Merges the given instant list into one and keep the sequence. + */ + private static List mergeInstants(List instants1, List instants2) { + ValidationUtils.checkArgument(!instants1.isEmpty() && !instants2.isEmpty(), "The instants to merge can not be empty"); + // some optimizations are based on the assumption all the instant lists are already sorted. + // skip when one list contains all the instants of the other one. 
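+ // Caller-side sketch (assumed usage, not part of this patch; `metaClient` and the
+ // timestamps are placeholders): the incremental reload introduced above only reads the
+ // archived instants in [newStartTs, previousStartTs) and appends them to the already
+ // loaded ones, instead of re-reading the whole archived timeline.
+ //   HoodieArchivedTimeline archived = new HoodieArchivedTimeline(metaClient, "20240201000000000");
+ //   archived = archived.reload("20240101000000000"); // loads only the older slice and moves the cursor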
+ final List merged; + if (HoodieTimeline.compareTimestamps(instants1.get(instants1.size() - 1).getTimestamp(), LESSER_THAN_OR_EQUALS, instants2.get(0).getTimestamp())) { + merged = new ArrayList<>(instants1); + merged.addAll(instants2); + } else if (HoodieTimeline.compareTimestamps(instants2.get(instants2.size() - 1).getTimestamp(), LESSER_THAN_OR_EQUALS, instants1.get(0).getTimestamp())) { + merged = new ArrayList<>(instants2); + merged.addAll(instants1); + } else { + merged = new ArrayList<>(instants1); + merged.addAll(instants2); + // sort the instants explicitly + Collections.sort(merged); + } + return merged; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstantTimeGenerator.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstantTimeGenerator.java index a4f4b2cdf24f..efa9c6f120a8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstantTimeGenerator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstantTimeGenerator.java @@ -120,7 +120,7 @@ public static String instantTimeMinusMillis(String timestamp, long milliseconds) } } - private static String fixInstantTimeCompatibility(String instantTime) { + public static String fixInstantTimeCompatibility(String instantTime) { // Enables backwards compatibility with non-millisecond granularity instants if (isSecondGranularity(instantTime)) { // Add milliseconds to the instant in order to parse successfully diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java index ac77e1eb6060..b02e797f23d5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java @@ -397,9 +397,8 @@ public interface HoodieTimeline extends Serializable { /** * get the most recent cluster commit if present - * */ - public Option getLastClusterCommit(); + public Option getLastClusteringInstant(); /** * get the most recent pending cluster commit if present diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimeGeneratorBase.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimeGeneratorBase.java index 4acb8d2af54b..c8e56c6b6d87 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimeGeneratorBase.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimeGeneratorBase.java @@ -50,15 +50,7 @@ public abstract class TimeGeneratorBase implements TimeGenerator, Serializable { /** * The lock provider. */ - private volatile LockProvider lockProvider; - /** - * The maximum times to retry in case there are failures. - */ - private final int maxRetries; - /** - * The maximum time to wait for each time generation to resolve the clock skew issue on distributed hosts. - */ - private final long maxWaitTimeInMs; + private volatile LockProvider lockProvider; /** * The maximum time to block for acquiring a lock. */ @@ -79,24 +71,26 @@ public TimeGeneratorBase(HoodieTimeGeneratorConfig config, SerializableConfigura this.lockConfiguration = config.getLockConfiguration(); this.hadoopConf = hadoopConf; - maxRetries = lockConfiguration.getConfig().getInteger(LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, + // The maximum times to retry in case there are failures. 
+ int maxRetries = lockConfiguration.getConfig().getInteger(LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, Integer.parseInt(DEFAULT_LOCK_ACQUIRE_NUM_RETRIES)); lockAcquireWaitTimeInMs = lockConfiguration.getConfig().getInteger(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, DEFAULT_LOCK_ACQUIRE_WAIT_TIMEOUT_MS); - maxWaitTimeInMs = lockConfiguration.getConfig().getLong(LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, + // The maximum time to wait for each time generation to resolve the clock skew issue on distributed hosts. + long maxWaitTimeInMs = lockConfiguration.getConfig().getLong(LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, Long.parseLong(DEFAULT_LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS)); lockRetryHelper = new RetryHelper<>(maxWaitTimeInMs, maxRetries, maxWaitTimeInMs, Arrays.asList(HoodieLockException.class, InterruptedException.class), "acquire timeGenerator lock"); } - protected LockProvider getLockProvider() { + protected LockProvider getLockProvider() { // Perform lazy initialization of lock provider only if needed if (lockProvider == null) { synchronized (this) { if (lockProvider == null) { String lockProviderClass = lockConfiguration.getConfig().getString("hoodie.write.lock.provider"); LOG.info("LockProvider for TimeGenerator: " + lockProviderClass); - lockProvider = (LockProvider) ReflectionUtils.loadClass(lockProviderClass, + lockProvider = (LockProvider) ReflectionUtils.loadClass(lockProviderClass, lockConfiguration, hadoopConf.get()); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimeGenerators.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimeGenerators.java index 3f394a47a6d9..4ae2f1054a03 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimeGenerators.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimeGenerators.java @@ -29,7 +29,7 @@ import static org.apache.hudi.common.config.HoodieCommonConfig.BASE_PATH; /** - * Holds all different {@link TimeGenerator} implementations, use {@link HoodieCommonConfig.BASE_PATH} + * Holds all different {@link TimeGenerator} implementations, use {@link org.apache.hudi.common.config.HoodieCommonConfig#BASE_PATH} * to cache the existing instances. */ public class TimeGenerators { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineDiffHelper.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineDiffHelper.java index aa7e2a30754d..a98b71aa5711 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineDiffHelper.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineDiffHelper.java @@ -37,8 +37,11 @@ public class TimelineDiffHelper { private static final Logger LOG = LoggerFactory.getLogger(TimelineDiffHelper.class); + private TimelineDiffHelper() { + } + public static TimelineDiffResult getNewInstantsForIncrementalSync(HoodieTimeline oldTimeline, - HoodieTimeline newTimeline) { + HoodieTimeline newTimeline) { HoodieTimeline oldT = oldTimeline.filterCompletedAndCompactionInstants(); HoodieTimeline newT = newTimeline.filterCompletedAndCompactionInstants(); @@ -57,14 +60,14 @@ public static TimelineDiffResult getNewInstantsForIncrementalSync(HoodieTimeline List newInstants = new ArrayList<>(); // Check If any pending compaction is lost. 
If so, do not allow incremental timeline sync - List> compactionInstants = getPendingCompactionTransitions(oldT, newT); + List> compactionInstants = getPendingActionTransitions(oldT.filterPendingCompactionTimeline(), + newT, HoodieTimeline.COMMIT_ACTION, HoodieTimeline.COMPACTION_ACTION); List lostPendingCompactions = compactionInstants.stream() .filter(instantPair -> instantPair.getValue() == null).map(Pair::getKey).collect(Collectors.toList()); if (!lostPendingCompactions.isEmpty()) { // If a compaction is unscheduled, fall back to complete refresh of fs view since some log files could have been // moved. Its unsafe to incrementally sync in that case. - LOG.warn("Some pending compactions are no longer in new timeline (unscheduled ?). They are :" - + lostPendingCompactions); + LOG.warn("Some pending compactions are no longer in new timeline (unscheduled ?). They are: {}", lostPendingCompactions); return TimelineDiffResult.UNSAFE_SYNC_RESULT; } List finishedCompactionInstants = compactionInstants.stream() @@ -74,7 +77,8 @@ public static TimelineDiffResult getNewInstantsForIncrementalSync(HoodieTimeline newTimeline.getInstantsAsStream().filter(instant -> !oldTimelineInstants.contains(instant)).forEach(newInstants::add); - List> logCompactionInstants = getPendingLogCompactionTransitions(oldTimeline, newTimeline); + List> logCompactionInstants = getPendingActionTransitions(oldTimeline.filterPendingLogCompactionTimeline(), + newTimeline, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieTimeline.LOG_COMPACTION_ACTION); List finishedOrRemovedLogCompactionInstants = logCompactionInstants.stream() .filter(instantPair -> !instantPair.getKey().isCompleted() && (instantPair.getValue() == null || instantPair.getValue().isCompleted())) @@ -87,52 +91,24 @@ public static TimelineDiffResult getNewInstantsForIncrementalSync(HoodieTimeline } } - /** - * Getting pending log compaction transitions. - */ - private static List> getPendingLogCompactionTransitions(HoodieTimeline oldTimeline, - HoodieTimeline newTimeline) { - Set newTimelineInstants = newTimeline.getInstantsAsStream().collect(Collectors.toSet()); - - return oldTimeline.filterPendingLogCompactionTimeline().getInstantsAsStream().map(instant -> { - if (newTimelineInstants.contains(instant)) { - return Pair.of(instant, instant); - } else { - HoodieInstant logCompacted = - new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, instant.getTimestamp()); - if (newTimelineInstants.contains(logCompacted)) { - return Pair.of(instant, logCompacted); - } - HoodieInstant inflightLogCompacted = - new HoodieInstant(State.INFLIGHT, HoodieTimeline.LOG_COMPACTION_ACTION, instant.getTimestamp()); - if (newTimelineInstants.contains(inflightLogCompacted)) { - return Pair.of(instant, inflightLogCompacted); - } - return Pair.of(instant, null); - } - }).collect(Collectors.toList()); - } - - /** - * Getting pending compaction transitions. 
- */ - private static List> getPendingCompactionTransitions(HoodieTimeline oldTimeline, - HoodieTimeline newTimeline) { + private static List> getPendingActionTransitions(HoodieTimeline pendingActionTimelineFromOld, + HoodieTimeline newTimeline, + String completedAction, String pendingAction) { Set newTimelineInstants = newTimeline.getInstantsAsStream().collect(Collectors.toSet()); - return oldTimeline.filterPendingCompactionTimeline().getInstantsAsStream().map(instant -> { + return pendingActionTimelineFromOld.getInstantsAsStream().map(instant -> { if (newTimelineInstants.contains(instant)) { return Pair.of(instant, instant); } else { - HoodieInstant compacted = - new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, instant.getTimestamp()); - if (newTimelineInstants.contains(compacted)) { - return Pair.of(instant, compacted); + HoodieInstant completedInstant = + new HoodieInstant(State.COMPLETED, completedAction, instant.getTimestamp()); + if (newTimelineInstants.contains(completedInstant)) { + return Pair.of(instant, completedInstant); } - HoodieInstant inflightCompacted = - new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, instant.getTimestamp()); - if (newTimelineInstants.contains(inflightCompacted)) { - return Pair.of(instant, inflightCompacted); + HoodieInstant inflightInstant = + new HoodieInstant(State.INFLIGHT, pendingAction, instant.getTimestamp()); + if (newTimelineInstants.contains(inflightInstant)) { + return Pair.of(instant, inflightInstant); } return Pair.of(instant, null); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineUtils.java index 5e710800d6f4..dbe8f83fdbe1 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineUtils.java @@ -81,13 +81,15 @@ public static List getWrittenPartitions(HoodieTimeline timeline) { } /** - * Returns partitions that have been deleted or marked for deletion in the given timeline. + * Returns partitions that have been deleted or marked for deletion in the timeline between given commit time range. * Does not include internal operations such as clean in the timeline. */ - public static List getDroppedPartitions(HoodieTimeline timeline) { + public static List getDroppedPartitions(HoodieTableMetaClient metaClient, Option lastCommitTimeSynced, Option lastCommitCompletionTimeSynced) { + HoodieTimeline timeline = lastCommitTimeSynced.isPresent() + ? 
TimelineUtils.getCommitsTimelineAfter(metaClient, lastCommitTimeSynced.get(), lastCommitCompletionTimeSynced) + : metaClient.getActiveTimeline(); HoodieTimeline completedTimeline = timeline.getWriteTimeline().filterCompletedInstants(); HoodieTimeline replaceCommitTimeline = completedTimeline.getCompletedReplaceTimeline(); - Map partitionToLatestDeleteTimestamp = replaceCommitTimeline.getInstantsAsStream() .map(instant -> { try { @@ -102,6 +104,21 @@ public static List getDroppedPartitions(HoodieTimeline timeline) { .flatMap(pair -> pair.getRight().getPartitionToReplaceFileIds().keySet().stream() .map(partition -> new AbstractMap.SimpleEntry<>(partition, pair.getLeft().getTimestamp())) ).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (existing, replace) -> replace)); + // cleaner could delete a partition when there are no active filegroups in the partition + HoodieTimeline cleanerTimeline = metaClient.getActiveTimeline().getCleanerTimeline().filterCompletedInstants(); + cleanerTimeline.getInstantsAsStream() + .forEach(instant -> { + try { + HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils.deserializeHoodieCleanMetadata(cleanerTimeline.getInstantDetails(instant).get()); + cleanMetadata.getPartitionMetadata().forEach((partition, partitionMetadata) -> { + if (partitionMetadata.getIsPartitionDeleted()) { + partitionToLatestDeleteTimestamp.put(partition, instant.getTimestamp()); + } + }); + } catch (IOException e) { + throw new HoodieIOException("Failed to get partitions cleaned at " + instant, e); + } + }); if (partitionToLatestDeleteTimestamp.isEmpty()) { // There is no dropped partitions @@ -244,7 +261,7 @@ public static boolean isClusteringCommit(HoodieTableMetaClient metaClient, Hoodi return false; } catch (IOException e) { - throw new HoodieIOException("Unable to read instant information: " + instant + " for " + metaClient.getBasePath(), e); + throw new HoodieIOException("Unable to read instant information: " + instant + " for " + metaClient.getBasePathV2().toString(), e); } } @@ -440,7 +457,7 @@ public static HoodieTimeline handleHollowCommitIfNeeded(HoodieTimeline completed } public enum HollowCommitHandling { - FAIL, BLOCK, USE_TRANSITION_TIME; + FAIL, BLOCK, USE_TRANSITION_TIME } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java index 76750171fb55..cdac0eeeb200 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java @@ -801,11 +801,20 @@ public final Stream getLatestBaseFilesInRange(List commi } @Override - public Void loadAllPartitions() { + public void loadAllPartitions() { try { readLock.lock(); ensureAllPartitionsLoadedCorrectly(); - return null; + } finally { + readLock.unlock(); + } + } + + @Override + public void loadPartitions(List partitionPaths) { + try { + readLock.lock(); + ensurePartitionsLoadedCorrectly(partitionPaths); } finally { readLock.unlock(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java index d5697e83eeba..172b5e41af77 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java +++ 
b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewManager.java @@ -31,7 +31,6 @@ import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.metadata.HoodieMetadataFileSystemView; import org.apache.hudi.metadata.HoodieTableMetadata; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -66,17 +65,19 @@ public class FileSystemViewManager { private final SerializableConfiguration conf; // The View Storage config used to store file-system views private final FileSystemViewStorageConfig viewStorageConfig; - // Map from Base-Path to View - private final ConcurrentHashMap globalViewMap; // Factory Map to create file-system views private final Function2 viewCreator; + // Map from Base-Path to View + private final ConcurrentHashMap globalViewMap; - private FileSystemViewManager(HoodieEngineContext context, FileSystemViewStorageConfig viewStorageConfig, + private FileSystemViewManager( + HoodieEngineContext context, + FileSystemViewStorageConfig viewStorageConfig, Function2 viewCreator) { this.conf = context.getHadoopConf(); this.viewStorageConfig = viewStorageConfig; - this.globalViewMap = new ConcurrentHashMap<>(); this.viewCreator = viewCreator; + this.globalViewMap = new ConcurrentHashMap<>(); } /** @@ -95,7 +96,7 @@ public void clearFileSystemView(String basePath) { * Main API to get the file-system view for the base-path. * * @param basePath Hoodie table base path - * @return + * @return {@link SyncableFileSystemView} */ public SyncableFileSystemView getFileSystemView(String basePath) { return globalViewMap.computeIfAbsent(basePath, (path) -> { @@ -108,10 +109,10 @@ public SyncableFileSystemView getFileSystemView(String basePath) { * Main API to get the file-system view for the base-path. * * @param metaClient HoodieTableMetaClient - * @return + * @return {@link SyncableFileSystemView} */ public SyncableFileSystemView getFileSystemView(HoodieTableMetaClient metaClient) { - return globalViewMap.computeIfAbsent(metaClient.getBasePath(), + return globalViewMap.computeIfAbsent(metaClient.getBasePathV2().toString(), (path) -> viewCreator.apply(metaClient, viewStorageConfig)); } @@ -130,12 +131,12 @@ public void close() { /** * Create RocksDB based file System view for a table. * - * @param viewConf View Storage Configuration + * @param viewConf View Storage Configuration * @param metaClient HoodieTableMetaClient - * @return + * @return {@link RocksDbBasedFileSystemView} */ private static RocksDbBasedFileSystemView createRocksDBBasedFileSystemView(FileSystemViewStorageConfig viewConf, - HoodieTableMetaClient metaClient) { + HoodieTableMetaClient metaClient) { HoodieTimeline timeline = metaClient.getActiveTimeline().filterCompletedAndCompactionInstants(); return new RocksDbBasedFileSystemView(metaClient, timeline, viewConf); } @@ -143,24 +144,25 @@ private static RocksDbBasedFileSystemView createRocksDBBasedFileSystemView(FileS /** * Create a spillable Map based file System view for a table. 
* - * @param viewConf View Storage Configuration + * @param viewConf View Storage Configuration * @param metaClient HoodieTableMetaClient - * @return + * @return {@link SpillableMapBasedFileSystemView} */ - private static SpillableMapBasedFileSystemView createSpillableMapBasedFileSystemView(FileSystemViewStorageConfig viewConf, - HoodieTableMetaClient metaClient, HoodieCommonConfig commonConfig) { - LOG.info("Creating SpillableMap based view for basePath " + metaClient.getBasePath()); + private static SpillableMapBasedFileSystemView createSpillableMapBasedFileSystemView( + FileSystemViewStorageConfig viewConf, HoodieTableMetaClient metaClient, HoodieCommonConfig commonConfig) { + LOG.info("Creating SpillableMap based view for basePath {}.", metaClient.getBasePathV2()); HoodieTimeline timeline = metaClient.getActiveTimeline().filterCompletedAndCompactionInstants(); return new SpillableMapBasedFileSystemView(metaClient, timeline, viewConf, commonConfig); } /** * Create an in-memory file System view for a table. - * */ - private static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieMetadataConfig metadataConfig, FileSystemViewStorageConfig viewConf, - HoodieTableMetaClient metaClient, SerializableFunctionUnchecked metadataCreator) { - LOG.info("Creating InMemory based view for basePath " + metaClient.getBasePathV2()); + private static HoodieTableFileSystemView createInMemoryFileSystemView( + FileSystemViewStorageConfig viewConf, + HoodieTableMetaClient metaClient, + SerializableFunctionUnchecked metadataCreator) { + LOG.info("Creating InMemory based view for basePath {}.", metaClient.getBasePathV2()); HoodieTimeline timeline = metaClient.getActiveTimeline().filterCompletedAndCompactionInstants(); if (metaClient.getTableConfig().isMetadataTableAvailable()) { ValidationUtils.checkArgument(metadataCreator != null, "Metadata supplier is null. 
Cannot instantiate metadata file system view"); @@ -168,31 +170,30 @@ private static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieMeta } if (metaClient.getMetaserverConfig().isMetaserverEnabled()) { return (HoodieTableFileSystemView) ReflectionUtils.loadClass(HOODIE_METASERVER_FILE_SYSTEM_VIEW_CLASS, - new Class[] {HoodieTableMetaClient.class, HoodieTimeline.class, HoodieMetaserverConfig.class}, + new Class[]{HoodieTableMetaClient.class, HoodieTimeline.class, HoodieMetaserverConfig.class}, metaClient, timeline, metaClient.getMetaserverConfig()); } return new HoodieTableFileSystemView(metaClient, timeline, viewConf.isIncrementalTimelineSyncEnabled()); } - public static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieEngineContext engineContext, HoodieTableMetaClient metaClient, - HoodieMetadataConfig metadataConfig) { - + public static HoodieTableFileSystemView createInMemoryFileSystemView( + HoodieEngineContext engineContext, HoodieTableMetaClient metaClient, HoodieMetadataConfig metadataConfig) { return createInMemoryFileSystemViewWithTimeline(engineContext, metaClient, metadataConfig, metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants()); - } - public static HoodieTableFileSystemView createInMemoryFileSystemViewWithTimeline(HoodieEngineContext engineContext, - HoodieTableMetaClient metaClient, - HoodieMetadataConfig metadataConfig, - HoodieTimeline timeline) { - LOG.info("Creating InMemory based view for basePath " + metaClient.getBasePath()); + public static HoodieTableFileSystemView createInMemoryFileSystemViewWithTimeline( + HoodieEngineContext engineContext, + HoodieTableMetaClient metaClient, + HoodieMetadataConfig metadataConfig, + HoodieTimeline timeline) { + LOG.info("Creating InMemory based view for basePath {}.", metaClient.getBasePathV2()); if (metaClient.getTableConfig().isMetadataTableAvailable()) { return new HoodieMetadataFileSystemView(engineContext, metaClient, timeline, metadataConfig); } if (metaClient.getMetaserverConfig().isMetaserverEnabled()) { return (HoodieTableFileSystemView) ReflectionUtils.loadClass(HOODIE_METASERVER_FILE_SYSTEM_VIEW_CLASS, - new Class[] {HoodieTableMetaClient.class, HoodieTimeline.class, HoodieMetadataConfig.class}, + new Class[]{HoodieTableMetaClient.class, HoodieTimeline.class, HoodieMetadataConfig.class}, metaClient, timeline, metaClient.getMetaserverConfig()); } return new HoodieTableFileSystemView(metaClient, timeline); @@ -201,43 +202,40 @@ public static HoodieTableFileSystemView createInMemoryFileSystemViewWithTimeline /** * Create a remote file System view for a table. * - * @param viewConf View Storage Configuration + * @param viewConf View Storage Configuration * @param metaClient Hoodie Table MetaClient for the table. - * @return + * @return {@link RemoteHoodieTableFileSystemView} */ private static RemoteHoodieTableFileSystemView createRemoteFileSystemView(FileSystemViewStorageConfig viewConf, - HoodieTableMetaClient metaClient) { - LOG.info("Creating remote view for basePath " + metaClient.getBasePath() + ". Server=" - + viewConf.getRemoteViewServerHost() + ":" + viewConf.getRemoteViewServerPort() + ", Timeout=" - + viewConf.getRemoteTimelineClientTimeoutSecs()); + HoodieTableMetaClient metaClient) { + LOG.info("Creating remote view for basePath {}. 
Server={}:{}, Timeout={}", metaClient.getBasePathV2(), + viewConf.getRemoteViewServerHost(), viewConf.getRemoteViewServerPort(), viewConf.getRemoteTimelineClientTimeoutSecs()); return new RemoteHoodieTableFileSystemView(metaClient, viewConf); } + public static FileSystemViewManager createViewManagerWithTableMetadata( + final HoodieEngineContext context, + final HoodieMetadataConfig metadataConfig, + final FileSystemViewStorageConfig config, + final HoodieCommonConfig commonConfig) { + return createViewManager(context, config, commonConfig, + metaClient -> HoodieTableMetadata.create(context, metadataConfig, metaClient.getBasePathV2().toString(), true)); + } + public static FileSystemViewManager createViewManager(final HoodieEngineContext context, - final HoodieMetadataConfig metadataConfig, final FileSystemViewStorageConfig config, final HoodieCommonConfig commonConfig) { - return createViewManager(context, metadataConfig, config, commonConfig, null); - } - - public static FileSystemViewManager createViewManagerWithTableMetadata(final HoodieEngineContext context, - final HoodieMetadataConfig metadataConfig, - final FileSystemViewStorageConfig config, - final HoodieCommonConfig commonConfig) { - return createViewManager(context, metadataConfig, config, commonConfig, - metaClient -> HoodieTableMetadata.create(context, metadataConfig, metaClient.getBasePathV2().toString(), true)); + return createViewManager(context, config, commonConfig, null); } /** * Main Factory method for building file-system views. - * */ public static FileSystemViewManager createViewManager(final HoodieEngineContext context, - final HoodieMetadataConfig metadataConfig, final FileSystemViewStorageConfig config, final HoodieCommonConfig commonConfig, final SerializableFunctionUnchecked metadataCreator) { - LOG.info("Creating View Manager with storage type :" + config.getStorageType()); + LOG.info("Creating View Manager with storage type {}.", config.getStorageType()); switch (config.getStorageType()) { case EMBEDDED_KV_STORE: LOG.info("Creating embedded rocks-db based Table View"); @@ -250,7 +248,7 @@ public static FileSystemViewManager createViewManager(final HoodieEngineContext case MEMORY: LOG.info("Creating in-memory based Table View"); return new FileSystemViewManager(context, config, - (metaClient, viewConfig) -> createInMemoryFileSystemView(metadataConfig, viewConfig, metaClient, metadataCreator)); + (metaClient, viewConfig) -> createInMemoryFileSystemView(viewConfig, metaClient, metadataCreator)); case REMOTE_ONLY: LOG.info("Creating remote only table view"); return new FileSystemViewManager(context, config, (metaClient, viewConfig) -> createRemoteFileSystemView(viewConfig, @@ -263,7 +261,7 @@ public static FileSystemViewManager createViewManager(final HoodieEngineContext SyncableFileSystemView secondaryView; switch (viewConfig.getSecondaryStorageType()) { case MEMORY: - secondaryView = createInMemoryFileSystemView(metadataConfig, viewConfig, metaClient, metadataCreator); + secondaryView = createInMemoryFileSystemView(viewConfig, metaClient, metadataCreator); break; case EMBEDDED_KV_STORE: secondaryView = createRocksDBBasedFileSystemView(viewConfig, metaClient); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java index 56d7c7cc25cf..1e4b1852d1b2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java +++ 
b/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java @@ -168,8 +168,29 @@ public Stream getLatestBaseFilesInRange(List commitsToRe } @Override - public Void loadAllPartitions() { - return execute(preferredView::loadAllPartitions, secondaryView::loadAllPartitions); + public void loadAllPartitions() { + execute( + () -> { + preferredView.loadAllPartitions(); + return null; + }, + () -> { + secondaryView.loadAllPartitions(); + return null; + }); + } + + @Override + public void loadPartitions(List partitionPaths) { + execute( + () -> { + preferredView.loadPartitions(partitionPaths); + return null; + }, + () -> { + secondaryView.loadPartitions(partitionPaths); + return null; + }); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java index 4363a7daf271..61c90c6eb020 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java @@ -127,8 +127,10 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView, // POST Requests public static final String REFRESH_TABLE = String.format("%s/%s", BASE_URL, "refresh/"); public static final String LOAD_ALL_PARTITIONS_URL = String.format("%s/%s", BASE_URL, "loadallpartitions/"); + public static final String LOAD_PARTITIONS_URL = String.format("%s/%s", BASE_URL, "loadpartitions/"); public static final String PARTITION_PARAM = "partition"; + public static final String PARTITIONS_PARAM = "partitions"; public static final String BASEPATH_PARAM = "basepath"; public static final String INSTANT_PARAM = "instant"; public static final String MAX_INSTANT_PARAM = "maxinstant"; @@ -526,11 +528,21 @@ public boolean refresh() { } @Override - public Void loadAllPartitions() { + public void loadAllPartitions() { Map paramsMap = getParams(); try { executeRequest(LOAD_ALL_PARTITIONS_URL, paramsMap, BOOLEAN_TYPE_REFERENCE, RequestMethod.POST); - return null; + } catch (IOException e) { + throw new HoodieRemoteException(e); + } + } + + @Override + public void loadPartitions(List partitionPaths) { + try { + Map paramsMap = getParams(); + paramsMap.put(PARTITIONS_PARAM, OBJECT_MAPPER.writeValueAsString(partitionPaths)); + executeRequest(LOAD_PARTITIONS_URL, paramsMap, BOOLEAN_TYPE_REFERENCE, RequestMethod.POST); } catch (IOException e) { throw new HoodieRemoteException(e); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/TableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/TableFileSystemView.java index 1bcd1de61bc5..87b3db142e67 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/TableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/TableFileSystemView.java @@ -246,5 +246,11 @@ interface SliceView extends SliceViewWithLatestSlice { /** * Load all partition and file slices into view */ - Void loadAllPartitions(); + void loadAllPartitions(); + + /** + * Load all partition and file slices into view for the provided partition paths + * @param partitionPaths List of partition paths to load + */ + void loadPartitions(List partitionPaths); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java 
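+ // Illustrative sketch (assumed usage, not part of this patch; `fsView` is a placeholder
+ // SyncableFileSystemView and the partition paths are examples): the new loadPartitions
+ // API warms only the requested partitions instead of eagerly loading every partition.
+ //   fsView.loadPartitions(Arrays.asList("2024/01/01", "2024/01/02"));
+ //   fsView.loadAllPartitions(); // still available when the full view is really needed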
b/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java index e9ae9cfbdfb6..64eb27453b3b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java @@ -75,15 +75,22 @@ public static Stream> getAllPendingClu } /** - * Checks if the replacecommit is clustering commit. + * Checks if the requested, inflight, or completed instant of replacecommit action + * is a clustering operation, by checking whether the requested instant contains + * a clustering plan. + * + * @param timeline Hudi timeline. + * @param replaceInstant the instant of replacecommit action to check. + * @return whether the instant is a clustering operation. */ - public static boolean isClusteringCommit(HoodieTableMetaClient metaClient, HoodieInstant pendingReplaceInstant) { - return getClusteringPlan(metaClient, pendingReplaceInstant).isPresent(); + public static boolean isClusteringInstant(HoodieTimeline timeline, HoodieInstant replaceInstant) { + return getClusteringPlan(timeline, replaceInstant).isPresent(); } /** * Get requested replace metadata from timeline. - * @param timeline used to get the bytes stored in the requested replace instant in the timeline + * + * @param timeline used to get the bytes stored in the requested replace instant in the timeline * @param pendingReplaceInstant can be in any state, because it will always be converted to requested state * @return option of the replace metadata if present, else empty * @throws IOException @@ -238,16 +245,8 @@ private static Map buildMetrics(List fileSlices) { public static List getPendingClusteringInstantTimes(HoodieTableMetaClient metaClient) { return metaClient.getActiveTimeline().filterPendingReplaceTimeline().getInstantsAsStream() - .filter(instant -> isPendingClusteringInstant(metaClient, instant)) - .collect(Collectors.toList()); - } - - public static boolean isPendingClusteringInstant(HoodieTableMetaClient metaClient, HoodieInstant instant) { - return getClusteringPlan(metaClient, instant).isPresent(); - } - - public static boolean isPendingClusteringInstant(HoodieTimeline timeline, HoodieInstant instant) { - return getClusteringPlan(timeline, instant).isPresent(); + .filter(instant -> isClusteringInstant(metaClient.getActiveTimeline(), instant)) + .collect(Collectors.toList()); } /** @@ -301,9 +300,11 @@ public static Option getEarliestInstantToRetainForClustering( } /** - * Returns whether the given instant {@code instant} is with clustering operation. + * @param instant Hudi instant to check. + * @param timeline Hudi timeline. + * @return whether the given {@code instant} is a completed clustering operation. */ - public static boolean isClusteringInstant(HoodieInstant instant, HoodieTimeline timeline) { + public static boolean isCompletedClusteringInstant(HoodieInstant instant, HoodieTimeline timeline) { if (!instant.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) { return false; } @@ -313,4 +314,19 @@ public static boolean isClusteringInstant(HoodieInstant instant, HoodieTimeline throw new HoodieException("Resolve replace commit metadata error for instant: " + instant, e); } } + + /** + * Returns whether the given instant {@code instant} is with insert overwrite operation. 
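+ // Illustrative sketch (assumed usage, not part of this patch; `timeline` and `instant`
+ // are placeholders): after the rename, the plan-based check and the completed-metadata
+ // check are distinct helpers with the argument orders shown in this patch.
+ //   boolean isClustering = ClusteringUtils.isClusteringInstant(timeline, instant);                    // any state, via clustering plan
+ //   boolean isCompleted = ClusteringUtils.isCompletedClusteringInstant(instant, timeline);            // completed replacecommit with CLUSTER op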
+ */ + public static boolean isInsertOverwriteInstant(HoodieInstant instant, HoodieTimeline timeline) { + if (!instant.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) { + return false; + } + try { + WriteOperationType opType = TimelineUtils.getCommitMetadata(instant, timeline).getOperationType(); + return opType.equals(WriteOperationType.INSERT_OVERWRITE) || opType.equals(WriteOperationType.INSERT_OVERWRITE_TABLE); + } catch (IOException e) { + throw new HoodieException("Resolve replace commit metadata error for instant: " + instant, e); + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java index 5df8f97d4b96..44b7a8020357 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java @@ -284,7 +284,10 @@ public Collection values() { } List result = new ArrayList<>(inMemoryMap.size() + diskBasedMap.size()); result.addAll(inMemoryMap.values()); - result.addAll(diskBasedMap.values()); + Iterator iterator = diskBasedMap.iterator(); + while (iterator.hasNext()) { + result.add(iterator.next()); + } return result; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BaseHoodieQueueBasedExecutor.java b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BaseHoodieQueueBasedExecutor.java index 20b9c802f605..f2843c56b031 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BaseHoodieQueueBasedExecutor.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BaseHoodieQueueBasedExecutor.java @@ -215,7 +215,7 @@ public E execute() { // to be interrupted as well Thread.currentThread().interrupt(); } - // throw if we have any other exception seen already. There is a chance that cancellation/closing of producers with CompeletableFuture wins before the actual exception + // throw if we have any other exception seen already. There is a chance that cancellation/closing of producers with CompletableFuture wins before the actual exception // is thrown. if (this.queue.getThrowable() != null) { throw new HoodieException(queue.getThrowable()); diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieAvroSchemaException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieAvroSchemaException.java new file mode 100644 index 000000000000..c19c88c15c8b --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieAvroSchemaException.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.exception; + +/** + * Thrown when we detect in Hudi code that a record schema + * violates Avro rules. This can happen even when using Spark + * because we use Avro schema internally + */ +public class HoodieAvroSchemaException extends SchemaCompatibilityException { + public HoodieAvroSchemaException(String message) { + super(message); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieNullSchemaTypeException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieNullSchemaTypeException.java new file mode 100644 index 000000000000..ff4abadcde9e --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieNullSchemaTypeException.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.exception; + +import org.apache.hudi.internal.schema.HoodieSchemaException; + +/** + * Thrown if a schema is null or empty. Or if a field has type null + * (null is ok if it is in a union with 1 (one) other type) + */ +public class HoodieNullSchemaTypeException extends HoodieSchemaException { + public HoodieNullSchemaTypeException(String message) { + super(message); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieRecordCreationException.java b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieRecordCreationException.java new file mode 100644 index 000000000000..dec70b369dae --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieRecordCreationException.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.exception; + +/** + * Exception thrown during HoodieRecord construction for any failure + * that is not a KeyGeneration failure. An example of a failure would be if the + * record is malformed. 
+ */ +public class HoodieRecordCreationException extends HoodieException { + + public HoodieRecordCreationException(String message, Throwable t) { + super(message, t); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/InvalidUnionTypeException.java b/hudi-common/src/main/java/org/apache/hudi/exception/InvalidUnionTypeException.java new file mode 100644 index 000000000000..370ad9438cc4 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/InvalidUnionTypeException.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.exception; + +/** + * Thrown when a field is a union and at least one of the following is true: + *

+ * <ul>
+ *   <li>the incoming union and the latest table union have differing numbers of types</li>
+ *   <li>the incoming union has more than two types</li>
+ * </ul>
    + */ +public class InvalidUnionTypeException extends SchemaCompatibilityException { + public InvalidUnionTypeException(String message) { + super(message); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/MissingSchemaFieldException.java b/hudi-common/src/main/java/org/apache/hudi/exception/MissingSchemaFieldException.java new file mode 100644 index 000000000000..4727ff814f10 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/MissingSchemaFieldException.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.exception; + +import org.apache.hudi.avro.AvroSchemaUtils; + +import org.apache.avro.Schema; + +import java.util.List; + +/** + * Thrown when the schema of the incoming data is missing fields that are in the table schema. + */ +public class MissingSchemaFieldException extends SchemaCompatibilityException { + + public MissingSchemaFieldException(List missingFields, Schema writerSchema, Schema tableSchema) { + super(constructExceptionMessage(missingFields, writerSchema, tableSchema)); + } + + private static String constructExceptionMessage(List missingFields, Schema writerSchema, Schema tableSchema) { + return AvroSchemaUtils.createSchemaErrorString( + "Schema validation failed due to missing field. Fields missing from incoming schema: {" + + String.join(", ", missingFields) + "}", writerSchema, tableSchema); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/SchemaBackwardsCompatibilityException.java b/hudi-common/src/main/java/org/apache/hudi/exception/SchemaBackwardsCompatibilityException.java new file mode 100644 index 000000000000..c38d13c9e292 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/SchemaBackwardsCompatibilityException.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.exception; + +import org.apache.hudi.avro.AvroSchemaCompatibility; +import org.apache.hudi.avro.AvroSchemaUtils; + +import org.apache.avro.Schema; + +import java.util.stream.Collectors; + +/** + * Thrown when there is a backwards compatibility issue with the incoming schema. + * i.e. when the incoming schema cannot be used to read older data files + */ +public class SchemaBackwardsCompatibilityException extends SchemaCompatibilityException { + + public SchemaBackwardsCompatibilityException(AvroSchemaCompatibility.SchemaPairCompatibility compatibility, Schema writerSchema, Schema tableSchema) { + super(constructExceptionMessage(compatibility, writerSchema, tableSchema)); + } + + private static String constructExceptionMessage(AvroSchemaCompatibility.SchemaPairCompatibility compatibility, Schema writerSchema, Schema tableSchema) { + return AvroSchemaUtils.createSchemaErrorString("Schema validation backwards compatibility check failed with the following issues: {" + + compatibility.getResult().getIncompatibilities().stream() + .map(incompatibility -> incompatibility.getType().name() + ": " + incompatibility.getMessage()) + .collect(Collectors.joining(", ")) + "}", writerSchema, tableSchema); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/SchemaCompatibilityException.java b/hudi-common/src/main/java/org/apache/hudi/exception/SchemaCompatibilityException.java index 478ec0d42697..92d2f6744c14 100644 --- a/hudi-common/src/main/java/org/apache/hudi/exception/SchemaCompatibilityException.java +++ b/hudi-common/src/main/java/org/apache/hudi/exception/SchemaCompatibilityException.java @@ -18,10 +18,12 @@ package org.apache.hudi.exception; +import org.apache.hudi.internal.schema.HoodieSchemaException; + /** * An exception thrown when schema has compatibility problems. */ -public class SchemaCompatibilityException extends HoodieException { +public class SchemaCompatibilityException extends HoodieSchemaException { public SchemaCompatibilityException(String message) { super(message); diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/convert/AvroInternalSchemaConverter.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/convert/AvroInternalSchemaConverter.java index 69977563e85f..0f8d1606f2ad 100644 --- a/hudi-common/src/main/java/org/apache/hudi/internal/schema/convert/AvroInternalSchemaConverter.java +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/convert/AvroInternalSchemaConverter.java @@ -19,6 +19,7 @@ package org.apache.hudi.internal.schema.convert; import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieNullSchemaTypeException; import org.apache.hudi.internal.schema.HoodieSchemaException; import org.apache.hudi.internal.schema.InternalSchema; import org.apache.hudi.internal.schema.Type; @@ -32,6 +33,7 @@ import java.util.ArrayList; import java.util.Deque; import java.util.HashMap; +import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -79,7 +81,7 @@ public static Schema convert(InternalSchema internalSchema, String name) { * but for the metadata table HoodieMetadata.avsc uses a trick where we have a bunch of * different types wrapped in record for col stats. * - * @param Schema avro schema. + * @param schema avro schema. * @return an avro Schema where null is the first. 
*/ public static Schema fixNullOrdering(Schema schema) { @@ -156,6 +158,29 @@ public static Type buildTypeFromAvroSchema(Schema schema) { return visitAvroSchemaToBuildType(schema, visited, true, nextId); } + private static void checkNullType(Type fieldType, String fieldName, Deque visited) { + if (fieldType == null) { + StringBuilder sb = new StringBuilder(); + sb.append("Field '"); + Iterator visitedIterator = visited.descendingIterator(); + while (visitedIterator.hasNext()) { + sb.append(visitedIterator.next()); + sb.append("."); + } + sb.append(fieldName); + sb.append("' has type null"); + throw new HoodieNullSchemaTypeException(sb.toString()); + } else if (fieldType.typeId() == Type.TypeID.ARRAY) { + visited.push(fieldName); + checkNullType(((Types.ArrayType) fieldType).elementType(), "element", visited); + visited.pop(); + } else if (fieldType.typeId() == Type.TypeID.MAP) { + visited.push(fieldName); + checkNullType(((Types.MapType) fieldType).valueType(), "value", visited); + visited.pop(); + } + } + /** * Converts an avro schema into hudi type. * @@ -182,7 +207,9 @@ private static Type visitAvroSchemaToBuildType(Schema schema, Deque visi } nextId.set(nextAssignId + fields.size()); fields.stream().forEach(field -> { - fieldTypes.add(visitAvroSchemaToBuildType(field.schema(), visited, false, nextId)); + Type fieldType = visitAvroSchemaToBuildType(field.schema(), visited, false, nextId); + checkNullType(fieldType, field.name(), visited); + fieldTypes.add(fieldType); }); visited.pop(); List internalFields = new ArrayList<>(fields.size()); diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/AvroSchemaEvolutionUtils.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/AvroSchemaEvolutionUtils.java index e714d99f0e0e..cf7e87f457b8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/AvroSchemaEvolutionUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/AvroSchemaEvolutionUtils.java @@ -25,11 +25,9 @@ import java.util.ArrayList; import java.util.List; -import java.util.Map; import java.util.TreeMap; import java.util.stream.Collectors; -import static org.apache.hudi.common.config.HoodieCommonConfig.MAKE_NEW_COLUMNS_NULLABLE; import static org.apache.hudi.common.util.CollectionUtils.reduce; import static org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter.convert; @@ -136,10 +134,9 @@ public static Schema reconcileSchema(Schema incomingSchema, Schema oldTableSchem * * @param sourceSchema source schema that needs reconciliation * @param targetSchema target schema that source schema will be reconciled against - * @param opts config options * @return schema (based off {@code source} one) that has nullability constraints and datatypes reconciled */ - public static Schema reconcileSchemaRequirements(Schema sourceSchema, Schema targetSchema, Map opts) { + public static Schema reconcileSchemaRequirements(Schema sourceSchema, Schema targetSchema) { if (targetSchema.getType() == Schema.Type.NULL || targetSchema.getFields().isEmpty()) { return sourceSchema; } @@ -153,14 +150,12 @@ public static Schema reconcileSchemaRequirements(Schema sourceSchema, Schema tar List colNamesSourceSchema = sourceInternalSchema.getAllColsFullName(); List colNamesTargetSchema = targetInternalSchema.getAllColsFullName(); - boolean makeNewColsNullable = "true".equals(opts.get(MAKE_NEW_COLUMNS_NULLABLE.key())); List nullableUpdateColsInSource = new ArrayList<>(); List typeUpdateColsInSource = new ArrayList<>(); 
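+ // Illustrative sketch (assumed usage, not part of this patch; `incomingSchema` and
+ // `tableSchema` are placeholder Avro Schemas): the reconciliation entry point now
+ // takes only the two schemas, with no options map.
+ //   Schema reconciled = AvroSchemaEvolutionUtils.reconcileSchemaRequirements(incomingSchema, tableSchema);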
colNamesSourceSchema.forEach(field -> { // handle columns that needs to be made nullable - if ((makeNewColsNullable && !colNamesTargetSchema.contains(field)) - || colNamesTargetSchema.contains(field) && sourceInternalSchema.findField(field).isOptional() != targetInternalSchema.findField(field).isOptional()) { + if (colNamesTargetSchema.contains(field) && sourceInternalSchema.findField(field).isOptional() != targetInternalSchema.findField(field).isOptional()) { nullableUpdateColsInSource.add(field); } // handle columns that needs type to be updated diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReaderFactory.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReaderFactory.java index 0a511d10b031..84aed905a4d1 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReaderFactory.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReaderFactory.java @@ -18,6 +18,7 @@ package org.apache.hudi.io.storage; +import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.util.Option; import org.apache.avro.Schema; @@ -29,15 +30,18 @@ import java.io.IOException; public class HoodieAvroFileReaderFactory extends HoodieFileReaderFactory { + + @Override protected HoodieFileReader newParquetFileReader(Configuration conf, Path path) { return new HoodieAvroParquetReader(conf, path); } - protected HoodieFileReader newHFileFileReader(boolean useNativeHFileReader, + @Override + protected HoodieFileReader newHFileFileReader(HoodieConfig hoodieConfig, Configuration conf, Path path, Option schemaOption) throws IOException { - if (useNativeHFileReader) { + if (isUseNativeHFileReaderEnabled(hoodieConfig)) { return new HoodieNativeAvroHFileReader(conf, path, schemaOption); } CacheConfig cacheConfig = new CacheConfig(conf); @@ -47,14 +51,15 @@ protected HoodieFileReader newHFileFileReader(boolean useNativeHFileReader, return new HoodieHBaseAvroHFileReader(conf, path, cacheConfig); } - protected HoodieFileReader newHFileFileReader(boolean useNativeHFileReader, + @Override + protected HoodieFileReader newHFileFileReader(HoodieConfig hoodieConfig, Configuration conf, Path path, FileSystem fs, byte[] content, Option schemaOption) throws IOException { - if (useNativeHFileReader) { + if (isUseNativeHFileReaderEnabled(hoodieConfig)) { return new HoodieNativeAvroHFileReader(conf, content, schemaOption); } CacheConfig cacheConfig = new CacheConfig(conf); diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReaderFactory.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReaderFactory.java index f4b4bedc468b..ac2736f8829a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReaderFactory.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReaderFactory.java @@ -84,11 +84,9 @@ public HoodieFileReader getFileReader(HoodieConfig hoodieConfig, Option schemaOption) throws IOException { switch (format) { case PARQUET: - return this.newParquetFileReader(conf, path); + return newParquetFileReader(conf, path); case HFILE: - boolean useNativeHFileReader = - hoodieConfig.getBooleanOrDefault(HoodieReaderConfig.USE_NATIVE_HFILE_READER); - return newHFileFileReader(useNativeHFileReader, conf, path, schemaOption); + return newHFileFileReader(hoodieConfig, conf, path, schemaOption); case ORC: return newOrcFileReader(conf, path); default: @@ -96,15 +94,13 @@ public HoodieFileReader getFileReader(HoodieConfig hoodieConfig, } } - public 
HoodieFileReader getContentReader(HoodieConfig config, + public HoodieFileReader getContentReader(HoodieConfig hoodieConfig, Configuration conf, Path path, HoodieFileFormat format, FileSystem fs, byte[] content, Option schemaOption) throws IOException { switch (format) { case HFILE: - boolean useNativeHFileReader = - config.getBooleanOrDefault(HoodieReaderConfig.USE_NATIVE_HFILE_READER); - return newHFileFileReader(useNativeHFileReader, conf, path, fs, content, schemaOption); + return newHFileFileReader(hoodieConfig, conf, path, fs, content, schemaOption); default: throw new UnsupportedOperationException(format + " format not supported yet."); } @@ -114,13 +110,13 @@ protected HoodieFileReader newParquetFileReader(Configuration conf, Path path) { throw new UnsupportedOperationException(); } - protected HoodieFileReader newHFileFileReader(boolean useNativeHFileReader, + protected HoodieFileReader newHFileFileReader(HoodieConfig hoodieConfig, Configuration conf, Path path, Option schemaOption) throws IOException { throw new UnsupportedOperationException(); } - protected HoodieFileReader newHFileFileReader(boolean useNativeHFileReader, + protected HoodieFileReader newHFileFileReader(HoodieConfig hoodieConfig, Configuration conf, Path path, FileSystem fs, byte[] content, Option schemaOption) @@ -138,4 +134,8 @@ public HoodieFileReader newBootstrapFileReader(HoodieFileReader skeletonFileRead Object[] partitionValues) { throw new UnsupportedOperationException(); } + + protected static boolean isUseNativeHFileReaderEnabled(HoodieConfig hoodieConfig) { + return hoodieConfig.getBooleanOrDefault(HoodieReaderConfig.USE_NATIVE_HFILE_READER); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java index 10c094fdfb68..4d0b8a5d662f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java @@ -49,6 +49,8 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nullable; @@ -99,7 +101,7 @@ * During compaction on the table, the deletions are merged with additions and hence records are pruned. */ public class HoodieMetadataPayload implements HoodieRecordPayload { - + private static final Logger LOG = LoggerFactory.getLogger(HoodieMetadataPayload.class); /** * Type of the record. 
This can be an enum in the schema but Avro1.8 * has a bug - https://issues.apache.org/jira/browse/AVRO-1810 @@ -217,7 +219,7 @@ public HoodieMetadataPayload(Option recordOpt) { // Otherwise, it has to be present or the record would be considered invalid if (bloomFilterRecord == null) { checkArgument(record.getSchema().getField(SCHEMA_FIELD_ID_BLOOM_FILTER) == null, - String.format("Valid %s record expected for type: %s", SCHEMA_FIELD_ID_BLOOM_FILTER, METADATA_TYPE_COLUMN_STATS)); + String.format("Valid %s record expected for type: %s", SCHEMA_FIELD_ID_BLOOM_FILTER, METADATA_TYPE_BLOOM_FILTER)); } else { bloomFilterMetadata = new HoodieMetadataBloomFilter( (String) bloomFilterRecord.get(BLOOM_FILTER_FIELD_TYPE), @@ -550,27 +552,34 @@ private Map combineFileSystemMetadata(HoodieMeta // - First we merge records from all of the delta log-files // - Then we merge records from base-files with the delta ones (coming as a result // of the previous step) - (oldFileInfo, newFileInfo) -> - // NOTE: We can’t assume that MT update records will be ordered the same way as actual - // FS operations (since they are not atomic), therefore MT record merging should be a - // _commutative_ & _associative_ operation (ie one that would work even in case records - // will get re-ordered), which is - // - Possible for file-sizes (since file-sizes will ever grow, we can simply - // take max of the old and new records) - // - Not possible for is-deleted flags* - // - // *However, we’re assuming that the case of concurrent write and deletion of the same - // file is _impossible_ -- it would only be possible with concurrent upsert and - // rollback operation (affecting the same log-file), which is implausible, b/c either - // of the following have to be true: - // - We’re appending to failed log-file (then the other writer is trying to - // rollback it concurrently, before it’s own write) - // - Rollback (of completed instant) is running concurrently with append (meaning - // that restore is running concurrently with a write, which is also nut supported - // currently) - newFileInfo.getIsDeleted() - ? 
null - : new HoodieMetadataFileInfo(Math.max(newFileInfo.getSize(), oldFileInfo.getSize()), false)); + (oldFileInfo, newFileInfo) -> { + // NOTE: We can’t assume that MT update records will be ordered the same way as actual + // FS operations (since they are not atomic), therefore MT record merging should be a + // _commutative_ & _associative_ operation (ie one that would work even in case records + // will get re-ordered), which is + // - Possible for file-sizes (since file-sizes will ever grow, we can simply + // take max of the old and new records) + // - Not possible for is-deleted flags* + // + // *However, we’re assuming that the case of concurrent write and deletion of the same + // file is _impossible_ -- it would only be possible with concurrent upsert and + // rollback operation (affecting the same log-file), which is implausible, b/c either + // of the following have to be true: + // - We’re appending to failed log-file (then the other writer is trying to + // rollback it concurrently, before its own write) + // - Rollback (of completed instant) is running concurrently with append (meaning + // that restore is running concurrently with a write, which is also not supported + // currently) + if (newFileInfo.getIsDeleted()) { + if (oldFileInfo.getIsDeleted()) { + LOG.warn("A file is repeatedly deleted in the files partition of the metadata table: " + key); + return newFileInfo; + } + return null; + } + return new HoodieMetadataFileInfo( + Math.max(newFileInfo.getSize(), oldFileInfo.getSize()), false); + }); }); } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index e19570443054..e81cd63f66e8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -68,6 +68,7 @@ import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.ExternalFilePathUtil; import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.common.util.HoodieRecordUtils; import org.apache.hudi.common.util.Option; @@ -328,11 +329,11 @@ public static void deleteMetadataTable(String basePath, HoodieEngineContext cont * * @param basePath - base path of the dataset * @param context - instance of {@link HoodieEngineContext} - * @param partitionType - {@link MetadataPartitionType} of the partition to delete + * @param partitionPath - Partition path of the partition to delete */ - public static void deleteMetadataPartition(String basePath, HoodieEngineContext context, MetadataPartitionType partitionType) { + public static void deleteMetadataPartition(String basePath, HoodieEngineContext context, String partitionPath) { HoodieTableMetaClient dataMetaClient = HoodieTableMetaClient.builder().setBasePath(basePath).setConf(context.getHadoopConf().get()).build(); - deleteMetadataTablePartition(dataMetaClient, context, partitionType, false); + deleteMetadataTablePartition(dataMetaClient, context, partitionPath, false); } /** @@ -341,13 +342,13 @@ public static void deleteMetadataPartition(String basePath, HoodieEngineContext * @param basePath base path of the dataset * @param context instance of {@link HoodieEngineContext}. 
*/ - public static boolean metadataPartitionExists(String basePath, HoodieEngineContext context, MetadataPartitionType partitionType) { + public static boolean metadataPartitionExists(String basePath, HoodieEngineContext context, String partitionPath) { final String metadataTablePath = HoodieTableMetadata.getMetadataTableBasePath(basePath); FileSystem fs = HadoopFSUtils.getFs(metadataTablePath, context.getHadoopConf().get()); try { - return fs.exists(new Path(metadataTablePath, partitionType.getPartitionPath())); + return fs.exists(new Path(metadataTablePath, partitionPath)); } catch (Exception e) { - throw new HoodieIOException(String.format("Failed to check metadata partition %s exists.", partitionType.getPartitionPath())); + throw new HoodieIOException(String.format("Failed to check metadata partition %s exists.", partitionPath)); } } @@ -455,6 +456,19 @@ private static List getPartitionsAdded(HoodieCommitMetadata commitMetada .collect(Collectors.toList()); } + /** + * Returns all the incremental write partition paths as a set with the given commits metadata. + * + * @param metadataList The commits metadata + * @return the partition path set + */ + public static Set getWritePartitionPaths(List metadataList) { + return metadataList.stream() + .map(HoodieCommitMetadata::getWritePartitionPaths) + .flatMap(Collection::stream) + .collect(Collectors.toSet()); + } + /** * Convert commit action metadata to bloom filter records. * @@ -684,7 +698,7 @@ public static HoodieData convertMetadataToColumnStatsRecords(Hoodi String partitionPath = deleteFileInfoPair.getLeft(); String filePath = deleteFileInfoPair.getRight(); - if (filePath.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + if (filePath.endsWith(HoodieFileFormat.PARQUET.getFileExtension()) || ExternalFilePathUtil.isExternallyCreatedFile(filePath)) { return getColumnStatsRecords(partitionPath, filePath, dataTableMetaClient, columnsToIndex, true).iterator(); } return Collections.emptyListIterator(); @@ -1500,41 +1514,41 @@ public static String deleteMetadataTable(HoodieTableMetaClient dataMetaClient, H * @param context instance of {@code HoodieEngineContext}. * @param backup Whether metadata table should be backed up before deletion. If true, the table is backed up to the * directory with name metadata_. 
- * @param partitionType The partition to delete + * @param partitionPath The partition to delete * @return The backup directory if backup was requested, null otherwise */ public static String deleteMetadataTablePartition(HoodieTableMetaClient dataMetaClient, HoodieEngineContext context, - MetadataPartitionType partitionType, boolean backup) { - if (partitionType.equals(MetadataPartitionType.FILES)) { + String partitionPath, boolean backup) { + if (partitionPath.equals(MetadataPartitionType.FILES.getPartitionPath())) { return deleteMetadataTable(dataMetaClient, context, backup); } - final Path metadataTablePartitionPath = new Path(HoodieTableMetadata.getMetadataTableBasePath(dataMetaClient.getBasePath()), partitionType.getPartitionPath()); + final Path metadataTablePartitionPath = new Path(HoodieTableMetadata.getMetadataTableBasePath(dataMetaClient.getBasePath()), partitionPath); FileSystem fs = HadoopFSUtils.getFs(metadataTablePartitionPath.toString(), context.getHadoopConf().get()); - dataMetaClient.getTableConfig().setMetadataPartitionState(dataMetaClient, partitionType, false); + dataMetaClient.getTableConfig().setMetadataPartitionState(dataMetaClient, partitionPath, false); try { if (!fs.exists(metadataTablePartitionPath)) { return null; } } catch (FileNotFoundException e) { // Ignoring exception as metadata table already does not exist - LOG.debug("Metadata table partition " + partitionType + " not found at path " + metadataTablePartitionPath); + LOG.debug("Metadata table partition " + partitionPath + " not found at path " + metadataTablePartitionPath); return null; } catch (Exception e) { - throw new HoodieMetadataException(String.format("Failed to check existence of MDT partition %s at path %s: ", partitionType, metadataTablePartitionPath), e); + throw new HoodieMetadataException(String.format("Failed to check existence of MDT partition %s at path %s: ", partitionPath, metadataTablePartitionPath), e); } if (backup) { final Path metadataPartitionBackupPath = new Path(metadataTablePartitionPath.getParent().getParent(), - String.format(".metadata_%s_%s", partitionType.getPartitionPath(), dataMetaClient.createNewInstantTime(false))); - LOG.info(String.format("Backing up MDT partition %s to %s before deletion", partitionType, metadataPartitionBackupPath)); + String.format(".metadata_%s_%s", partitionPath, dataMetaClient.createNewInstantTime(false))); + LOG.info(String.format("Backing up MDT partition %s to %s before deletion", partitionPath, metadataPartitionBackupPath)); try { if (fs.rename(metadataTablePartitionPath, metadataPartitionBackupPath)) { return metadataPartitionBackupPath.toString(); } } catch (Exception e) { // If rename fails, we will try to delete the table instead - LOG.error(String.format("Failed to backup MDT partition %s using rename", partitionType), e); + LOG.error(String.format("Failed to backup MDT partition %s using rename", partitionPath), e); } } else { LOG.info("Deleting metadata table partition from " + metadataTablePartitionPath); @@ -1640,7 +1654,7 @@ public static String createLogCompactionTimestamp(String timestamp) { * * @param partitionType Type of the partition for which the file group count is to be estimated. * @param recordCount The number of records expected to be written. - * @param averageRecordSize Average size of each record to be writen. + * @param averageRecordSize Average size of each record to be written. * @param minFileGroupCount Minimum number of file groups to use. * @param maxFileGroupCount Maximum number of file groups to use. 
* @param growthFactor By what factor are the records (recordCount) expected to grow? diff --git a/hudi-common/src/test/java/org/apache/hudi/avro/TestAvroSchemaUtils.java b/hudi-common/src/test/java/org/apache/hudi/avro/TestAvroSchemaUtils.java index 6d8fa651e51b..1417f2f67dcc 100644 --- a/hudi-common/src/test/java/org/apache/hudi/avro/TestAvroSchemaUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/avro/TestAvroSchemaUtils.java @@ -18,6 +18,7 @@ package org.apache.hudi.avro; +import org.apache.hudi.exception.SchemaBackwardsCompatibilityException; import org.apache.hudi.exception.SchemaCompatibilityException; import org.apache.avro.Schema; @@ -231,6 +232,30 @@ public void testIsCompatiblePartitionDropCols(boolean shouldValidate) { AvroSchemaUtils.checkSchemaCompatible(FULL_SCHEMA, SHORT_SCHEMA, shouldValidate, false, Collections.singleton("c")); } + private static final Schema BROKEN_SCHEMA = new Schema.Parser().parse("{\n" + + " \"type\" : \"record\",\n" + + " \"name\" : \"broken\",\n" + + " \"fields\" : [ {\n" + + " \"name\" : \"a\",\n" + + " \"type\" : [ \"null\", \"int\" ],\n" + + " \"default\" : null\n" + + " }, {\n" + + " \"name\" : \"b\",\n" + + " \"type\" : [ \"null\", \"int\" ],\n" + + " \"default\" : null\n" + + " }, {\n" + + " \"name\" : \"c\",\n" + + " \"type\" : [ \"null\", \"boolean\" ],\n" + + " \"default\" : null\n" + + " } ]\n" + + "}"); + + @Test + public void testBrokenSchema() { + assertThrows(SchemaBackwardsCompatibilityException.class, + () -> AvroSchemaUtils.checkSchemaCompatible(FULL_SCHEMA, BROKEN_SCHEMA, true, false, Collections.emptySet())); + } + /* [HUDI-7045] should uncomment this test @Test public void testAppendFieldsToSchemaDedupNested() { diff --git a/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java b/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java index eb20081475ff..f1e5f606602c 100644 --- a/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java @@ -629,4 +629,27 @@ public void testAddMetadataFields() { assertEquals("custom_schema_property_value", schemaWithMetadata.getProp("custom_schema_property")); assertEquals("value", originalFieldsInUpdatedSchema.get(0).getProp("custom_field_property")); } + + @Test + void testSafeAvroToJsonStringMissingRequiredField() { + Schema schema = new Schema.Parser().parse(EXAMPLE_SCHEMA); + GenericRecord record = new GenericData.Record(schema); + record.put("non_pii_col", "val1"); + record.put("pii_col", "val2"); + record.put("timestamp", 3.5); + String jsonString = HoodieAvroUtils.safeAvroToJsonString(record); + assertEquals("{\"timestamp\": 3.5, \"_row_key\": null, \"non_pii_col\": \"val1\", \"pii_col\": \"val2\"}", jsonString); + } + + @Test + void testSafeAvroToJsonStringBadDataType() { + Schema schema = new Schema.Parser().parse(EXAMPLE_SCHEMA); + GenericRecord record = new GenericData.Record(schema); + record.put("non_pii_col", "val1"); + record.put("_row_key", "key"); + record.put("pii_col", "val2"); + record.put("timestamp", "foo"); + String jsonString = HoodieAvroUtils.safeAvroToJsonString(record); + assertEquals("{\"timestamp\": \"foo\", \"_row_key\": \"key\", \"non_pii_col\": \"val1\", \"pii_col\": \"val2\"}", jsonString); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/client/transaction/lock/TestInProcessLockProvider.java b/hudi-common/src/test/java/org/apache/hudi/client/transaction/lock/TestInProcessLockProvider.java index 60f74bb79963..6e7dcd7e3fa2 
100644 --- a/hudi-common/src/test/java/org/apache/hudi/client/transaction/lock/TestInProcessLockProvider.java +++ b/hudi-common/src/test/java/org/apache/hudi/client/transaction/lock/TestInProcessLockProvider.java @@ -166,6 +166,9 @@ public void testLockIdentity() throws InterruptedException { Assertions.assertTrue(writer3Completed.get()); Assertions.assertEquals(lockProviderList.get(0).getLock(), lockProviderList.get(1).getLock()); Assertions.assertEquals(lockProviderList.get(1).getLock(), lockProviderList.get(2).getLock()); + + writer2.interrupt(); + writer3.interrupt(); } @Test @@ -254,6 +257,8 @@ public void run() { // } Assertions.assertTrue(writer2Completed.get()); + + writer2.interrupt(); } @Test @@ -317,6 +322,9 @@ public void run() { } Assertions.assertTrue(writer2Stream1Completed.get()); Assertions.assertTrue(writer2Stream2Completed.get()); + + writer2Stream1.interrupt(); + writer2Stream2.interrupt(); } @Test @@ -373,6 +381,8 @@ public void testTryLockReAcquisitionByDifferentThread() { assertDoesNotThrow(() -> { inProcessLockProvider.unlock(); }); + + writer2.interrupt(); } @Test @@ -414,6 +424,9 @@ public void testTryUnLockByDifferentThread() { // unlock by main thread should succeed. inProcessLockProvider.unlock(); }); + + writer2.interrupt(); + writer3.interrupt(); } @Test @@ -472,6 +485,9 @@ public void testTryLockAcquisitionBeforeTimeOutFromTwoThreads() { // Make sure both writers actually completed good Assertions.assertTrue(writer1Completed.get()); Assertions.assertTrue(writer2Completed.get()); + + writer1.interrupt(); + writer2.interrupt(); } @Test diff --git a/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieListData.java b/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieListData.java index ea19f128d1a9..795318f5e01b 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieListData.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/data/TestHoodieListData.java @@ -27,12 +27,15 @@ import org.junit.jupiter.params.provider.MethodSource; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; class TestHoodieListData { @@ -72,4 +75,23 @@ public void testGetNumPartitions() { IntStream.rangeClosed(0, 100).boxed().collect(Collectors.toList())); assertEquals(1, listData.getNumPartitions()); } + + @Test + public void testIsEmpty() { + // HoodieListData bearing eager execution semantic + HoodieData listData = HoodieListData.eager( + IntStream.rangeClosed(0, 100).boxed().collect(Collectors.toList())); + assertFalse(listData.isEmpty()); + + HoodieData emptyListData = HoodieListData.eager(Collections.emptyList()); + assertTrue(emptyListData.isEmpty()); + + // HoodieListData bearing lazy execution semantic + listData = HoodieListData.lazy( + IntStream.rangeClosed(0, 100).boxed().collect(Collectors.toList())); + assertFalse(listData.isEmpty()); + + emptyListData = HoodieListData.lazy(Collections.emptyList()); + assertTrue(emptyListData.isEmpty()); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java index 350d1a02072c..7d5e1de93a9b 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java +++ 
b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java @@ -21,7 +21,6 @@ import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.engine.HoodieLocalEngineContext; import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.cdc.HoodieCDCUtils; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; @@ -75,7 +74,6 @@ public class TestFSUtils extends HoodieCommonTestHarness { private static final String TEST_WRITE_TOKEN = "1-0-1"; - private static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); @Rule public final EnvironmentVariables environmentVariables = new EnvironmentVariables(); @@ -94,7 +92,8 @@ public void tearDown() throws Exception { public void testMakeDataFileName() { String instantTime = HoodieActiveTimeline.formatDate(new Date()); String fileName = UUID.randomUUID().toString(); - assertEquals(FSUtils.makeBaseFileName(instantTime, TEST_WRITE_TOKEN, fileName), fileName + "_" + TEST_WRITE_TOKEN + "_" + instantTime + BASE_FILE_EXTENSION); + assertEquals(FSUtils.makeBaseFileName(instantTime, TEST_WRITE_TOKEN, fileName, BASE_FILE_EXTENSION), + fileName + "_" + TEST_WRITE_TOKEN + "_" + instantTime + BASE_FILE_EXTENSION); } @Test @@ -169,7 +168,7 @@ public void testProcessFiles() throws Exception { public void testGetCommitTime() { String instantTime = HoodieActiveTimeline.formatDate(new Date()); String fileName = UUID.randomUUID().toString(); - String fullFileName = FSUtils.makeBaseFileName(instantTime, TEST_WRITE_TOKEN, fileName); + String fullFileName = FSUtils.makeBaseFileName(instantTime, TEST_WRITE_TOKEN, fileName, BASE_FILE_EXTENSION); assertEquals(instantTime, FSUtils.getCommitTime(fullFileName)); // test log file name fullFileName = FSUtils.makeLogFileName(fileName, HOODIE_LOG.getFileExtension(), instantTime, 1, TEST_WRITE_TOKEN); @@ -180,7 +179,7 @@ public void testGetCommitTime() { public void testGetFileNameWithoutMeta() { String instantTime = HoodieActiveTimeline.formatDate(new Date()); String fileName = UUID.randomUUID().toString(); - String fullFileName = FSUtils.makeBaseFileName(instantTime, TEST_WRITE_TOKEN, fileName); + String fullFileName = FSUtils.makeBaseFileName(instantTime, TEST_WRITE_TOKEN, fileName, BASE_FILE_EXTENSION); assertEquals(fileName, FSUtils.getFileId(fullFileName)); } @@ -375,7 +374,7 @@ public void testFileNameRelatedFunctions() throws Exception { final String LOG_EXTENSION = "." 
+ LOG_STR; // data file name - String dataFileName = FSUtils.makeBaseFileName(instantTime, writeToken, fileId); + String dataFileName = FSUtils.makeBaseFileName(instantTime, writeToken, fileId, BASE_FILE_EXTENSION); assertEquals(instantTime, FSUtils.getCommitTime(dataFileName)); assertEquals(fileId, FSUtils.getFileId(dataFileName)); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestStorageSchemes.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestStorageSchemes.java index 7adcb1052375..e718bc21ed6b 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestStorageSchemes.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestStorageSchemes.java @@ -56,6 +56,7 @@ public void testStorageSchemes() { assertFalse(StorageSchemes.isAtomicCreationSupported("jfs")); assertFalse(StorageSchemes.isAtomicCreationSupported("bos")); assertFalse(StorageSchemes.isAtomicCreationSupported("ks3")); + assertFalse(StorageSchemes.isAtomicCreationSupported("nos")); assertFalse(StorageSchemes.isAtomicCreationSupported("ofs")); assertFalse(StorageSchemes.isAtomicCreationSupported("oci")); assertFalse(StorageSchemes.isAtomicCreationSupported("tos")); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestFirstValueAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestFirstValueAvroPayload.java new file mode 100644 index 000000000000..a0b7eb86b488 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestFirstValueAvroPayload.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.model; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.common.testutils.PreCombineTestUtils; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Properties; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestFirstValueAvroPayload { + + private Schema schema; + private Properties props; + + @BeforeEach + public void setUp() throws Exception { + schema = Schema.createRecord(Arrays.asList( + new Schema.Field("id", Schema.create(Schema.Type.STRING), "", null), + new Schema.Field("partition", Schema.create(Schema.Type.STRING), "", null), + new Schema.Field("ts", Schema.create(Schema.Type.LONG), "", null), + new Schema.Field("_hoodie_is_deleted", Schema.create(Schema.Type.BOOLEAN), "", false) + )); + props = new Properties(); + props.setProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY, "ts"); + props.setProperty(HoodiePayloadProps.PAYLOAD_EVENT_TIME_FIELD_PROP_KEY, "ts"); + } + + @ParameterizedTest + @MethodSource("org.apache.hudi.common.testutils.PreCombineTestUtils#configurePreCombine") + public void testActiveRecordsForFirstValueAvroPayload(String key) throws IOException { + PreCombineTestUtils.setPreCombineConfig(props, key, "ts"); + GenericRecord record1 = new GenericData.Record(schema); + record1.put("id", "0"); + record1.put("partition", "partition0"); + record1.put("ts", 0L); + record1.put("_hoodie_is_deleted", false); + + GenericRecord record2 = new GenericData.Record(schema); + record2.put("id", "0"); + record2.put("partition", "partition0"); + record2.put("ts", 0L); + record2.put("_hoodie_is_deleted", false); + + DefaultHoodieRecordPayload payload1 = new FirstValueAvroPayload(record1, 1); + DefaultHoodieRecordPayload payload2 = new FirstValueAvroPayload(record2, 1); + assertEquals(payload1.preCombine(payload2, props), payload2); + assertEquals(payload2.preCombine(payload1, props), payload1); + + assertEquals(record1, payload1.getInsertValue(schema, props).get()); + assertEquals(record2, payload2.getInsertValue(schema, props).get()); + + assertEquals(payload1.combineAndGetUpdateValue(record2, schema, props).get(), record2); + assertEquals(payload2.combineAndGetUpdateValue(record1, schema, props).get(), record1); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieDeltaWriteStat.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieDeltaWriteStat.java index b774e06cea6d..a09bf539febc 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieDeltaWriteStat.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieDeltaWriteStat.java @@ -23,6 +23,8 @@ import org.junit.jupiter.api.Test; import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -49,4 +51,27 @@ public void testBaseFileAndLogFiles() { writeStat.setLogFiles(new ArrayList<>()); assertTrue(writeStat.getLogFiles().isEmpty()); } + + @Test + void testGetHoodieDeltaWriteStatFromPreviousStat() { + HoodieDeltaWriteStat prevStat = createDeltaWriteStat("part", "fileId", "888", + "base", Collections.singletonList("log1")); + HoodieDeltaWriteStat stat = 
prevStat.copy(); + assertEquals(prevStat.getPartitionPath(), stat.getPartitionPath()); + assertEquals(prevStat.getFileId(), stat.getFileId()); + assertEquals(prevStat.getPrevCommit(), stat.getPrevCommit()); + assertEquals(prevStat.getBaseFile(), stat.getBaseFile()); + assertEquals(1, stat.getLogFiles().size()); + assertEquals(prevStat.getLogFiles().get(0), stat.getLogFiles().get(0)); + } + + private HoodieDeltaWriteStat createDeltaWriteStat(String partition, String fileId, String prevCommit, String baseFile, List logFiles) { + HoodieDeltaWriteStat writeStat1 = new HoodieDeltaWriteStat(); + writeStat1.setPartitionPath(partition); + writeStat1.setFileId(fileId); + writeStat1.setPrevCommit(prevCommit); + writeStat1.setBaseFile(baseFile); + writeStat1.setLogFiles(logFiles); + return writeStat1; + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieWriteStat.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieWriteStat.java index e8a7205f769e..d6c3cf7fbb02 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieWriteStat.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieWriteStat.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.model; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hadoop.fs.Path; @@ -46,7 +47,8 @@ public void testSetPaths() { Path basePath = new Path(basePathString); Path partitionPath = new Path(basePath, partitionPathString); - Path finalizeFilePath = new Path(partitionPath, FSUtils.makeBaseFileName(instantTime, writeToken, fileName)); + Path finalizeFilePath = new Path(partitionPath, FSUtils.makeBaseFileName(instantTime, writeToken, fileName, + HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension())); HoodieWriteStat writeStat = new HoodieWriteStat(); writeStat.setPath(basePath, finalizeFilePath); assertEquals(finalizeFilePath, new Path(basePath, writeStat.getPath())); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/TestTableSchemaResolver.java b/hudi-common/src/test/java/org/apache/hudi/common/table/TestTableSchemaResolver.java index 3ac42b9d3b7c..b7f0ba8eba77 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/TestTableSchemaResolver.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/TestTableSchemaResolver.java @@ -21,7 +21,7 @@ import org.apache.hudi.avro.AvroSchemaUtils; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.HoodieIncompatibleSchemaException; +import org.apache.hudi.internal.schema.HoodieSchemaException; import org.apache.avro.Schema; import org.junit.jupiter.api.Test; @@ -61,7 +61,7 @@ public void testRecreateSchemaWhenDropPartitionColumns() { String[] pts4 = {"user_partition", "partition_path"}; try { TableSchemaResolver.appendPartitionColumns(originSchema, Option.of(pts3)); - } catch (HoodieIncompatibleSchemaException e) { + } catch (HoodieSchemaException e) { assertTrue(e.getMessage().contains("Partial partition fields are still in the schema")); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/TestTimelineUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/table/TestTimelineUtils.java index c81a05b4c20a..d258753c3a85 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/TestTimelineUtils.java +++ 
b/hudi-common/src/test/java/org/apache/hudi/common/table/TestTimelineUtils.java @@ -158,7 +158,7 @@ public void testGetPartitions() throws IOException { HoodieInstant cleanInstant = new HoodieInstant(true, CLEAN_ACTION, ts); activeTimeline.createNewInstant(cleanInstant); - activeTimeline.saveAsComplete(cleanInstant, getCleanMetadata(olderPartition, ts)); + activeTimeline.saveAsComplete(cleanInstant, getCleanMetadata(olderPartition, ts, false)); } metaClient.reloadActiveTimeline(); @@ -197,7 +197,7 @@ public void testGetPartitionsUnPartitioned() throws IOException { HoodieInstant cleanInstant = new HoodieInstant(true, CLEAN_ACTION, ts); activeTimeline.createNewInstant(cleanInstant); - activeTimeline.saveAsComplete(cleanInstant, getCleanMetadata(partitionPath, ts)); + activeTimeline.saveAsComplete(cleanInstant, getCleanMetadata(partitionPath, ts, false)); } metaClient.reloadActiveTimeline(); @@ -553,7 +553,7 @@ private byte[] getReplaceCommitMetadata(String basePath, String commitTs, String return serializeCommitMetadata(commit).get(); } - private Option getCleanMetadata(String partition, String time) throws IOException { + private Option getCleanMetadata(String partition, String time, boolean isPartitionDeleted) throws IOException { Map partitionToFilesCleaned = new HashMap<>(); List filesDeleted = new ArrayList<>(); filesDeleted.add("file-" + partition + "-" + time + "1"); @@ -564,6 +564,7 @@ private Option getCleanMetadata(String partition, String time) throws IO .setFailedDeleteFiles(Collections.emptyList()) .setDeletePathPatterns(Collections.emptyList()) .setSuccessDeleteFiles(filesDeleted) + .setIsPartitionDeleted(isPartitionDeleted) .build(); partitionToFilesCleaned.putIfAbsent(partition, partitionMetadata); HoodieCleanMetadata cleanMetadata = HoodieCleanMetadata.newBuilder() @@ -611,4 +612,43 @@ public void testHandleHollowCommitIfNeeded(HollowCommitHandling handlingMode) th fail("should cover all handling mode."); } } + + @Test + public void testGetDroppedPartitions() throws Exception { + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + HoodieTimeline activeCommitTimeline = activeTimeline.getCommitTimeline(); + assertTrue(activeCommitTimeline.empty()); + + String olderPartition = "p1"; // older partitions that will be deleted by clean commit + // first insert to the older partition + HoodieInstant instant1 = new HoodieInstant(true, COMMIT_ACTION, "00001"); + activeTimeline.createNewInstant(instant1); + activeTimeline.saveAsComplete(instant1, Option.of(getCommitMetadata(basePath, olderPartition, "00001", 2, Collections.emptyMap()))); + + metaClient.reloadActiveTimeline(); + List droppedPartitions = TimelineUtils.getDroppedPartitions(metaClient, Option.empty(), Option.empty()); + // no dropped partitions + assertEquals(0, droppedPartitions.size()); + + // another commit inserts to new partition + HoodieInstant instant2 = new HoodieInstant(true, COMMIT_ACTION, "00002"); + activeTimeline.createNewInstant(instant2); + activeTimeline.saveAsComplete(instant2, Option.of(getCommitMetadata(basePath, "p2", "00002", 2, Collections.emptyMap()))); + + metaClient.reloadActiveTimeline(); + droppedPartitions = TimelineUtils.getDroppedPartitions(metaClient, Option.empty(), Option.empty()); + // no dropped partitions + assertEquals(0, droppedPartitions.size()); + + // clean commit deletes older partition + HoodieInstant cleanInstant = new HoodieInstant(true, CLEAN_ACTION, "00003"); + activeTimeline.createNewInstant(cleanInstant); + activeTimeline.saveAsComplete(cleanInstant, 
getCleanMetadata(olderPartition, "00003", true)); + + metaClient.reloadActiveTimeline(); + droppedPartitions = TimelineUtils.getDroppedPartitions(metaClient, Option.empty(), Option.empty()); + // older partition is in the list dropped partitions + assertEquals(1, droppedPartitions.size()); + assertEquals(olderPartition, droppedPartitions.get(0)); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderBase.java b/hudi-common/src/test/java/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderBase.java index 5fc1895ef781..8001cbe45d37 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderBase.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderBase.java @@ -150,7 +150,7 @@ private void validateOutputFromFileGroupReader(Configuration hadoopConf, HoodieEngineContext engineContext = new HoodieLocalEngineContext(hadoopConf); HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder().build(); FileSystemViewManager viewManager = FileSystemViewManager.createViewManager( - engineContext, metadataConfig, FileSystemViewStorageConfig.newBuilder().build(), + engineContext, FileSystemViewStorageConfig.newBuilder().build(), HoodieCommonConfig.newBuilder().build(), mc -> HoodieTableMetadata.create( engineContext, metadataConfig, mc.getBasePathV2().toString())); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFSViewWithClustering.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFSViewWithClustering.java index e9546f8f9d19..26f9c3f07611 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFSViewWithClustering.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFSViewWithClustering.java @@ -124,10 +124,10 @@ public void testReplaceFileIdIsExcludedInView() throws IOException { // Only one commit String commitTime1 = "1"; - String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); - String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); - String fileName3 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId3); - String fileName4 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId4); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION); + String fileName3 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION); + String fileName4 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId4, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath1 + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName2).createNewFile(); new File(basePath + "/" + partitionPath2 + "/" + fileName3).createNewFile(); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java index 1ab824724aab..f5cee136d2e2 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java @@ -313,8 +313,8 @@ public void 
testViewForFileSlicesWithPartitionMetadataFile() throws Exception { String fileId1 = UUID.randomUUID().toString(); String fileId2 = UUID.randomUUID().toString(); String commitTime1 = "1"; - String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); - String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); @@ -343,8 +343,8 @@ public void testViewForGetAllFileGroupsStateless() throws Exception { String fileId1 = UUID.randomUUID().toString(); String fileId2 = UUID.randomUUID().toString(); String commitTime1 = "1"; - String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); - String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath1 + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName2).createNewFile(); @@ -356,8 +356,8 @@ public void testViewForGetAllFileGroupsStateless() throws Exception { String fileId3 = UUID.randomUUID().toString(); String fileId4 = UUID.randomUUID().toString(); String commitTime2 = "2"; - String fileName3 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId3); - String fileName4 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId4); + String fileName3 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION); + String fileName4 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId4, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath2 + "/" + fileName3).createNewFile(); new File(basePath + "/" + partitionPath2 + "/" + fileName4).createNewFile(); @@ -394,8 +394,8 @@ public void testViewForGetLatestFileSlicesStateless() throws Exception { String fileId1 = UUID.randomUUID().toString(); String fileId2 = UUID.randomUUID().toString(); String commitTime1 = "1"; - String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); - String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath1 + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName2).createNewFile(); @@ -407,8 +407,8 @@ public void testViewForGetLatestFileSlicesStateless() throws Exception { String fileId3 = UUID.randomUUID().toString(); String fileId4 = UUID.randomUUID().toString(); String commitTime2 = "2"; - String fileName3 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId3); - String fileName4 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId4); + String fileName3 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId3, 
BASE_FILE_EXTENSION); + String fileName4 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId4, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath2 + "/" + fileName3).createNewFile(); new File(basePath + "/" + partitionPath2 + "/" + fileName4).createNewFile(); @@ -515,7 +515,7 @@ void testFileSlicingWithMultipleDeltaWriters() throws Exception { String deltaInstantTime2 = "30"; // 30 -> 50 String deltaInstantTime3 = "35"; // 35 -> 90 - String baseFile1 = FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId); + String baseFile1 = FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId, BASE_FILE_EXTENSION); String deltaFile1 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, deltaInstantTime1, 0, TEST_WRITE_TOKEN); String deltaFile2 = @@ -556,7 +556,7 @@ void testFileSlicingWithMultipleDeltaWriters() throws Exception { // schedules a compaction String compactionInstantTime1 = metaClient.createNewInstantTime(); // 60 -> 80 - String compactionFile1 = FSUtils.makeBaseFileName(compactionInstantTime1, TEST_WRITE_TOKEN, fileId); + String compactionFile1 = FSUtils.makeBaseFileName(compactionInstantTime1, TEST_WRITE_TOKEN, fileId, BASE_FILE_EXTENSION); List> partitionFileSlicesPairs = new ArrayList<>(); partitionFileSlicesPairs.add(Pair.of(partitionPath, fileSlices.get(0))); HoodieCompactionPlan compactionPlan = @@ -633,6 +633,61 @@ void testFileSlicingWithMultipleDeltaWriters() throws Exception { assertEquals(deltaFile3, logFiles.get(0).getFileName(), "Log File Order check"); } + @Test + void testLoadPartitions_unPartitioned() throws Exception { + String partitionPath = ""; + Paths.get(basePath, partitionPath).toFile().mkdirs(); + String fileId = UUID.randomUUID().toString(); + + String instantTime1 = "1"; + String fileName1 = + FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN); + + Paths.get(basePath, partitionPath, fileName1).toFile().createNewFile(); + HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); + HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, instantTime1); + + saveAsComplete(commitTimeline, instant1, Option.empty()); + refreshFsView(); + + // Assert that no base files are returned without the partitions being loaded + assertEquals(0, fsView.getLatestFileSliceInRange(Collections.singletonList("1")).count()); + // Assert that load does not fail for un-partitioned tables + fsView.loadPartitions(Collections.singletonList(partitionPath)); + // Assert that base files are returned after the empty-string partition is loaded + assertEquals(1, fsView.getLatestFileSliceInRange(Collections.singletonList("1")).count()); + } + + @Test + void testLoadPartitions_partitioned() throws Exception { + String partitionPath1 = "2016/05/01"; + String partitionPath2 = "2016/05/02"; + Paths.get(basePath, partitionPath1).toFile().mkdirs(); + Paths.get(basePath, partitionPath2).toFile().mkdirs(); + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + String instantTime1 = "1"; + String fileName1 = + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN); + String fileName2 = + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN); + + Paths.get(basePath, partitionPath1, fileName1).toFile().createNewFile(); + Paths.get(basePath, partitionPath2, fileName2).toFile().createNewFile(); + HoodieActiveTimeline commitTimeline = 
metaClient.getActiveTimeline(); + HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, instantTime1); + + saveAsComplete(commitTimeline, instant1, Option.empty()); + refreshFsView(); + + // Assert that no base files are returned without the partitions being loaded + assertEquals(0, fsView.getLatestFileSliceInRange(Collections.singletonList("1")).count()); + // Only load a single partition path + fsView.loadPartitions(Collections.singletonList(partitionPath1)); + // Assert that base file is returned for partitionPath1 only + assertEquals(1, fsView.getLatestFileSliceInRange(Collections.singletonList("1")).count()); + } + /** * Returns all file-slices including uncommitted ones. * @@ -726,7 +781,7 @@ protected void testViewForFileSlicesWithAsyncCompaction(boolean skipCreatingData String dataFileName = null; if (!skipCreatingDataFile) { - dataFileName = FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId); + dataFileName = FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath + "/" + dataFileName).createNewFile(); } String fileName1 = @@ -765,7 +820,7 @@ protected void testViewForFileSlicesWithAsyncCompaction(boolean skipCreatingData checkExternalFile(srcFileStatus, fileSlice.getBaseFile().get().getBootstrapBaseFile(), testBootstrap); } String compactionRequestedTime = "4"; - String compactDataFileName = FSUtils.makeBaseFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId); + String compactDataFileName = FSUtils.makeBaseFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId, BASE_FILE_EXTENSION); List> partitionFileSlicesPairs = new ArrayList<>(); partitionFileSlicesPairs.add(Pair.of(partitionPath, fileSlices.get(0))); HoodieCompactionPlan compactionPlan = @@ -900,12 +955,12 @@ protected void testViewForFileSlicesWithAsyncCompaction(boolean skipCreatingData final String orphanFileId2 = UUID.randomUUID().toString(); final String invalidInstantId = "INVALIDTIME"; String inflightDeltaInstantTime = "7"; - String orphanDataFileName = FSUtils.makeBaseFileName(invalidInstantId, TEST_WRITE_TOKEN, orphanFileId1); + String orphanDataFileName = FSUtils.makeBaseFileName(invalidInstantId, TEST_WRITE_TOKEN, orphanFileId1, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath + "/" + orphanDataFileName).createNewFile(); String orphanLogFileName = FSUtils.makeLogFileName(orphanFileId2, HoodieLogFile.DELTA_EXTENSION, invalidInstantId, 0, TEST_WRITE_TOKEN); new File(basePath + "/" + partitionPath + "/" + orphanLogFileName).createNewFile(); - String inflightDataFileName = FSUtils.makeBaseFileName(inflightDeltaInstantTime, TEST_WRITE_TOKEN, inflightFileId1); + String inflightDataFileName = FSUtils.makeBaseFileName(inflightDeltaInstantTime, TEST_WRITE_TOKEN, inflightFileId1, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath + "/" + inflightDataFileName).createNewFile(); String inflightLogFileName = FSUtils.makeLogFileName(inflightFileId2, HoodieLogFile.DELTA_EXTENSION, inflightDeltaInstantTime, 0, TEST_WRITE_TOKEN); @@ -1060,7 +1115,7 @@ public void testGetLatestDataFilesForFileId() throws IOException { // Only one commit, but is not safe String commitTime1 = "1"; - String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); refreshFsView(); 
assertFalse(roView.getLatestBaseFiles(partitionPath).anyMatch(dfile -> dfile.getFileId().equals(fileId)), @@ -1076,7 +1131,7 @@ public void testGetLatestDataFilesForFileId() throws IOException { // Do another commit, but not safe String commitTime2 = "2"; - String fileName2 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId); + String fileName2 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); refreshFsView(); assertEquals(fileName1, roView.getLatestBaseFiles(partitionPath) @@ -1110,22 +1165,22 @@ public void testStreamLatestVersionInPartition(boolean isLatestFileSliceOnly) th String fileId3 = UUID.randomUUID().toString(); String fileId4 = UUID.randomUUID().toString(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION)).createNewFile(); new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)) .createNewFile(); new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 1, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)).createNewFile(); new File(fullPartitionPath + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION)).createNewFile(); new File(fullPartitionPath + FSUtils.makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)) .createNewFile(); @@ -1178,9 +1233,9 @@ private void testStreamLatestVersionInPartition(boolean isLatestFileSliceOnly, S for (HoodieBaseFile status : dataFileList) { filenames.add(status.getFileName()); } - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1))); - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); - 
assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION))); filenames = new HashSet<>(); List logFilesList = rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime4, true) @@ -1207,12 +1262,12 @@ private void testStreamLatestVersionInPartition(boolean isLatestFileSliceOnly, S } if (!isLatestFileSliceOnly) { assertEquals(3, dataFiles.size()); - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1))); - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION))); } else { assertEquals(1, dataFiles.size()); - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION))); } logFilesList = rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime3, true) @@ -1238,13 +1293,13 @@ protected void testStreamEveryVersionInPartition(boolean isLatestFileSliceOnly) String fileId2 = UUID.randomUUID().toString(); String fileId3 = UUID.randomUUID().toString(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3, 
BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION)).createNewFile(); new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); @@ -1269,22 +1324,22 @@ protected void testStreamEveryVersionInPartition(boolean isLatestFileSliceOnly) Set expFileNames = new HashSet<>(); if (fileId.equals(fileId1)) { if (!isLatestFileSliceOnly) { - expFileNames.add(FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION)); } - expFileNames.add(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION)); assertEquals(expFileNames, filenames); } else if (fileId.equals(fileId2)) { if (!isLatestFileSliceOnly) { - expFileNames.add(FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)); - expFileNames.add(FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)); } - expFileNames.add(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)); assertEquals(expFileNames, filenames); } else { if (!isLatestFileSliceOnly) { - expFileNames.add(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION)); } - expFileNames.add(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)); + expFileNames.add(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION)); assertEquals(expFileNames, filenames); } } @@ -1307,21 +1362,21 @@ protected void testStreamLatestVersionInRange(boolean isLatestFileSliceOnly) thr String fileId2 = UUID.randomUUID().toString(); String fileId3 = UUID.randomUUID().toString(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION)).createNewFile(); new File(fullPartitionPath + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId1)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2, 
BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)).createNewFile(); new File(fullPartitionPath + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION)).createNewFile(); new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); @@ -1344,10 +1399,10 @@ protected void testStreamLatestVersionInRange(boolean isLatestFileSliceOnly) thr filenames.add(status.getFileName()); } - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId1))); - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION))); if (!isLatestFileSliceOnly) { - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION))); } List slices = @@ -1388,13 +1443,13 @@ protected void testStreamLatestVersionsBefore(boolean isLatestFileSliceOnly) thr String fileId2 = UUID.randomUUID().toString(); String fileId3 = UUID.randomUUID().toString(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)).createNewFile(); - new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + 
FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION)).createNewFile(); + new File(fullPartitionPath + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION)).createNewFile(); new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); new File(basePath + "/.hoodie/" + commitTime2 + ".commit").createNewFile(); @@ -1414,8 +1469,8 @@ protected void testStreamLatestVersionsBefore(boolean isLatestFileSliceOnly) thr for (HoodieBaseFile status : dataFiles) { filenames.add(status.getFileName()); } - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1))); - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION))); } else { assertEquals(0, dataFiles.size()); } @@ -1439,30 +1494,30 @@ protected void testStreamLatestVersions(boolean isLatestFileSliceOnly) throws IO String fileId2 = UUID.randomUUID().toString(); String fileId3 = UUID.randomUUID().toString(); - new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION)) .createNewFile(); new File(fullPartitionPath + "/" + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime1, 0, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION)) .createNewFile(); new File(fullPartitionPath + "/" + FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)) .createNewFile(); - new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)) .createNewFile(); new File(fullPartitionPath + "/" + FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime2, 0, TEST_WRITE_TOKEN)) .createNewFile(); - new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION)) .createNewFile(); - new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION)) .createNewFile(); - new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)) + new File(fullPartitionPath + "/" + FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION)) .createNewFile(); new File(basePath + "/.hoodie/" + commitTime1 + ".commit").createNewFile(); @@ -1509,9 +1564,9 @@ protected void 
testStreamLatestVersions(boolean isLatestFileSliceOnly) throws IO for (HoodieBaseFile status : statuses1) { filenames.add(status.getFileName()); } - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1))); - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2))); - assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION))); + assertTrue(filenames.contains(FSUtils.makeBaseFileName(commitTime4, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION))); } @Test @@ -1532,15 +1587,15 @@ public void testPendingCompactionWithDuplicateFileIdsAcrossPartitions() throws E String deltaInstantTime2 = "4"; String fileId = UUID.randomUUID().toString(); - String dataFileName = FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId); + String dataFileName = FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId, BASE_FILE_EXTENSION); new File(fullPartitionPath1 + dataFileName).createNewFile(); String fileName1 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN); new File(fullPartitionPath1 + fileName1).createNewFile(); - new File(fullPartitionPath2 + FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId)).createNewFile(); + new File(fullPartitionPath2 + FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId, BASE_FILE_EXTENSION)).createNewFile(); new File(fullPartitionPath2 + fileName1).createNewFile(); - new File(fullPartitionPath3 + FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId)).createNewFile(); + new File(fullPartitionPath3 + FSUtils.makeBaseFileName(instantTime1, TEST_WRITE_TOKEN, fileId, BASE_FILE_EXTENSION)).createNewFile(); new File(fullPartitionPath3 + fileName1).createNewFile(); HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); @@ -1579,7 +1634,7 @@ public void testPendingCompactionWithDuplicateFileIdsAcrossPartitions() throws E partitionFileSlicesPairs.add(Pair.of(partitionPath3, fileSlices.get(0))); String compactionRequestedTime = "2"; - String compactDataFileName = FSUtils.makeBaseFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId); + String compactDataFileName = FSUtils.makeBaseFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId, BASE_FILE_EXTENSION); HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicesPairs, Option.empty(), Option.empty()); @@ -1696,8 +1751,8 @@ public void testReplaceWithTimeTravel() throws IOException { "No commit, should not find any data file"); // Only one commit String commitTime1 = "1"; - String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); - String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath1 + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName2).createNewFile(); @@ -1713,8 +1768,8 @@ public void testReplaceWithTimeTravel() throws IOException { // create commit2 - fileId1 is replaced. 
new file groups fileId3,fileId4 are created. String fileId3 = UUID.randomUUID().toString(); String fileId4 = UUID.randomUUID().toString(); - String fileName3 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId3); - String fileName4 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId4); + String fileName3 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION); + String fileName4 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId4, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath1 + "/" + fileName3).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName4).createNewFile(); @@ -1795,10 +1850,10 @@ public void testReplaceFileIdIsExcludedInView() throws IOException { // Only one commit String commitTime1 = "1"; - String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); - String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); - String fileName3 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId3); - String fileName4 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId4); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION); + String fileName3 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION); + String fileName4 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId4, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath1 + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName2).createNewFile(); new File(basePath + "/" + partitionPath2 + "/" + fileName3).createNewFile(); @@ -1858,9 +1913,9 @@ public void testPendingClusteringOperations() throws IOException { "No commit, should not find any data file"); // Only one commit String commitTime1 = "1"; - String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); - String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); - String fileName3 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId3); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION); + String fileName3 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath1 + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName2).createNewFile(); new File(basePath + "/" + partitionPath1 + "/" + fileName3).createNewFile(); @@ -1972,8 +2027,8 @@ public void testHoodieTableFileSystemViewWithPendingClustering() throws IOExcept // first insert commit String commitTime1 = "1"; - String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); - String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); + String fileName1 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId1, BASE_FILE_EXTENSION); + String fileName2 = FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, fileId2, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); @@ -1997,7 +2052,7 @@ public void 
testHoodieTableFileSystemViewWithPendingClustering() throws IOExcept // replace commit String commitTime2 = "2"; - String fileName3 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId3); + String fileName3 = FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, fileId3, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath + "/" + fileName3).createNewFile(); HoodieInstant instant2 = new HoodieInstant(true, HoodieTimeline.REPLACE_COMMIT_ACTION, commitTime2); @@ -2022,7 +2077,7 @@ public void testHoodieTableFileSystemViewWithPendingClustering() throws IOExcept // another insert commit String commitTime3 = "3"; - String fileName4 = FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId4); + String fileName4 = FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, fileId4, BASE_FILE_EXTENSION); new File(basePath + "/" + partitionPath + "/" + fileName4).createNewFile(); HoodieInstant instant3 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime3); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestIncrementalFSViewSync.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestIncrementalFSViewSync.java index 852f916c1a4b..91fcad237206 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestIncrementalFSViewSync.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestIncrementalFSViewSync.java @@ -967,7 +967,7 @@ private List> generateDataForInstant(String instan try { java.nio.file.Path filePath = Paths.get(basePath, p, deltaCommit ? FSUtils.makeLogFileName(f, ".log", instant, 0, TEST_WRITE_TOKEN) - : FSUtils.makeBaseFileName(instant, TEST_WRITE_TOKEN, f)); + : FSUtils.makeBaseFileName(instant, TEST_WRITE_TOKEN, f, BASE_FILE_EXTENSION)); Files.createFile(filePath); HoodieWriteStat w = new HoodieWriteStat(); w.setFileId(f); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestPriorityBasedFileSystemView.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestPriorityBasedFileSystemView.java index b297d320c7a6..1e2b8e0c35e5 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestPriorityBasedFileSystemView.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestPriorityBasedFileSystemView.java @@ -53,6 +53,9 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.never; import static org.mockito.Mockito.reset; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; @@ -698,6 +701,27 @@ public void testGetLatestFileSlice() { }); } + @Test + public void testLoadPartitions() { + String partitionPath = "/table2"; + + fsView.loadPartitions(Collections.singletonList(partitionPath)); + verify(primary, times(1)).loadPartitions(Collections.singletonList(partitionPath)); + verify(secondary, never()).loadPartitions(any()); + + resetMocks(); + doThrow(new RuntimeException()).when(primary).loadPartitions(Collections.singletonList(partitionPath)); + fsView.loadPartitions(Collections.singletonList(partitionPath)); + verify(primary, times(1)).loadPartitions(Collections.singletonList(partitionPath)); + verify(secondary, times(1)).loadPartitions(Collections.singletonList(partitionPath)); + + resetMocks(); + doThrow(new 
RuntimeException()).when(secondary).loadPartitions(Collections.singletonList(partitionPath)); + assertThrows(RuntimeException.class, () -> { + fsView.loadPartitions(Collections.singletonList(partitionPath)); + }); + } + @Test public void testGetPreferredView() { assertEquals(primary, fsView.getPreferredView()); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java index a1a3864a6a98..bda5b38c5178 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.testutils; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; @@ -35,6 +36,8 @@ */ public class HoodieCommonTestHarness { + protected static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); + protected String tableName; protected String basePath; protected URI baseUri; diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java index fe2583b87af3..5adbb57edfa8 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java @@ -91,6 +91,13 @@ */ public class HoodieTestDataGenerator implements AutoCloseable { + /** + * You may get a different result after upgrading to Spark 3.0: reading dates before 1582-10-15 or timestamps before 1900-01-01T00:00:00Z from Parquet INT96 files can be ambiguous, + * as the files may be written by Spark 2.x or legacy versions of Hive, which use a legacy hybrid calendar that is different from the Proleptic Gregorian calendar in Spark 3.0+. + * See more details in SPARK-31404. + */ + private boolean makeDatesAmbiguous = false; + // based on examination of sample file, the schema produces the following per record size public static final int BYTES_PER_RECORD = (int) (1.2 * 1024); // with default bloom filter with 60,000 entries and 0.000000001 FPRate @@ -208,6 +215,11 @@ public HoodieTestDataGenerator() { this(DEFAULT_PARTITION_PATHS); } + public HoodieTestDataGenerator(boolean makeDatesAmbiguous) { + this(); + this.makeDatesAmbiguous = makeDatesAmbiguous; + } + @Deprecated public HoodieTestDataGenerator(String[] partitionPaths, Map keyPartitionMap) { // NOTE: This used as a workaround to make sure that new instantiations of the generator @@ -392,7 +404,8 @@ private void generateExtraSchemaValues(GenericRecord rec) { rec.put("nation", ByteBuffer.wrap(bytes)); long randomMillis = genRandomTimeMillis(rand); Instant instant = Instant.ofEpochMilli(randomMillis); - rec.put("current_date", (int) LocalDateTime.ofInstant(instant, ZoneOffset.UTC).toLocalDate().toEpochDay()); + rec.put("current_date", makeDatesAmbiguous ?
-1000000 : + (int) LocalDateTime.ofInstant(instant, ZoneOffset.UTC).toLocalDate().toEpochDay()); rec.put("current_ts", randomMillis); BigDecimal bigDecimal = new BigDecimal(String.format(Locale.ENGLISH, "%5f", rand.nextFloat())); @@ -524,6 +537,15 @@ private static void createCommitFile(String basePath, String instantTime, Config .forEach(f -> createMetadataFile(f, basePath, configuration, commitMetadata)); } + public static void createOnlyCompletedCommitFile(String basePath, String instantTime, Configuration configuration) { + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + createOnlyCompletedCommitFile(basePath, instantTime, configuration, commitMetadata); + } + + public static void createOnlyCompletedCommitFile(String basePath, String instantTime, Configuration configuration, HoodieCommitMetadata commitMetadata) { + createMetadataFile(HoodieTimeline.makeCommitFileName(instantTime), basePath, configuration, commitMetadata); + } + public static void createDeltaCommitFile(String basePath, String instantTime, Configuration configuration) { HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); createDeltaCommitFile(basePath, instantTime, configuration, commitMetadata); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java index ca13ff79c521..94060d999af2 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java @@ -120,6 +120,7 @@ import static org.apache.hudi.common.testutils.FileCreateUtils.createSavepointCommit; import static org.apache.hudi.common.testutils.FileCreateUtils.deleteSavepointCommit; import static org.apache.hudi.common.testutils.FileCreateUtils.logFileName; +import static org.apache.hudi.common.testutils.HoodieCommonTestHarness.BASE_FILE_EXTENSION; import static org.apache.hudi.common.util.CleanerUtils.convertCleanMetadata; import static org.apache.hudi.common.util.CommitUtils.buildMetadata; import static org.apache.hudi.common.util.CommitUtils.getCommitActionType; @@ -555,7 +556,7 @@ private Pair genera if (newFileId.isPresent() && !StringUtils.isNullOrEmpty(newFileId.get())) { HoodieWriteStat writeStat = new HoodieWriteStat(); writeStat.setPartitionPath(partition); - writeStat.setPath(partition + "/" + FSUtils.makeBaseFileName(instantTime, "1-0-1", newFileId.get())); + writeStat.setPath(partition + "/" + FSUtils.makeBaseFileName(instantTime, "1-0-1", newFileId.get(), BASE_FILE_EXTENSION)); writeStat.setFileId(newFileId.get()); writeStat.setTotalWriteBytes(1); writeStat.setFileSizeInBytes(1); @@ -962,6 +963,19 @@ public HoodieCleanMetadata doClean(String commitTime, Map parti return cleanerMeta.getValue(); } + /** + * Repeats the same cleaning based on the cleaner plan and clean commit metadata. + * + * @param cleanCommitTime new clean commit time to use. + * @param cleanerPlan cleaner plan to write to the metadata. + * @param cleanMetadata clean metadata in data table to use. 
+ */ + public void repeatClean(String cleanCommitTime, + HoodieCleanerPlan cleanerPlan, + HoodieCleanMetadata cleanMetadata) throws IOException { + addClean(cleanCommitTime, cleanerPlan, cleanMetadata); + } + public HoodieCleanMetadata doCleanBasedOnCommits(String cleanCommitTime, List commitsToClean) throws IOException { Map partitionFileCountsToDelete = new HashMap<>(); for (String commitTime : commitsToClean) { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/reader/HoodieTestReaderContext.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/reader/HoodieTestReaderContext.java index 38108cf15784..92344ba39ab2 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/reader/HoodieTestReaderContext.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/reader/HoodieTestReaderContext.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.testutils.reader; +import org.apache.hudi.avro.model.HoodieDeleteRecord; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.engine.HoodieReaderContext; import org.apache.hudi.common.model.DefaultHoodieRecordPayload; @@ -207,6 +208,14 @@ public UnaryOperator projectRecord(Schema from, Schema to) { }; } + @Override + public IndexedRecord constructRawDeleteRecord(Map metadata) { + return new HoodieDeleteRecord( + (String) metadata.get(INTERNAL_META_RECORD_KEY), + (String) metadata.get(INTERNAL_META_PARTITION_PATH), + metadata.get(INTERNAL_META_ORDERING_FIELD)); + } + private Object getFieldValueFromIndexedRecord( IndexedRecord record, Schema recordSchema, diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestClusteringUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestClusteringUtils.java index d3375fe5e8af..72630787c13f 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestClusteringUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestClusteringUtils.java @@ -133,7 +133,9 @@ public void testClusteringPlanInflight() throws Exception { String clusterTime1 = "1"; HoodieInstant requestedInstant = createRequestedReplaceInstant(partitionPath1, clusterTime1, fileIds1); HoodieInstant inflightInstant = metaClient.getActiveTimeline().transitionReplaceRequestedToInflight(requestedInstant, Option.empty()); + assertTrue(ClusteringUtils.isClusteringInstant(metaClient.getActiveTimeline(), requestedInstant)); HoodieClusteringPlan requestedClusteringPlan = ClusteringUtils.getClusteringPlan(metaClient, requestedInstant).get().getRight(); + assertTrue(ClusteringUtils.isClusteringInstant(metaClient.getActiveTimeline(), inflightInstant)); HoodieClusteringPlan inflightClusteringPlan = ClusteringUtils.getClusteringPlan(metaClient, inflightInstant).get().getRight(); assertEquals(requestedClusteringPlan, inflightClusteringPlan); } @@ -261,7 +263,7 @@ private HoodieInstant createRequestedReplaceInstant(String partitionPath1, Strin private FileSlice generateFileSlice(String partitionPath, String fileId, String baseInstant) { FileSlice fs = new FileSlice(new HoodieFileGroupId(partitionPath, fileId), baseInstant); - fs.setBaseFile(new HoodieBaseFile(FSUtils.makeBaseFileName(baseInstant, "1-0-1", fileId))); + fs.setBaseFile(new HoodieBaseFile(FSUtils.makeBaseFileName(baseInstant, "1-0-1", fileId, BASE_FILE_EXTENSION))); return fs; } diff --git a/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestAvroSchemaEvolutionUtils.java 
b/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestAvroSchemaEvolutionUtils.java index 0be0a5f89c52..4027bd28178f 100644 --- a/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestAvroSchemaEvolutionUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestAvroSchemaEvolutionUtils.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.testutils.SchemaTestUtil; +import org.apache.hudi.exception.HoodieNullSchemaTypeException; import org.apache.hudi.internal.schema.InternalSchema; import org.apache.hudi.internal.schema.InternalSchemaBuilder; import org.apache.hudi.internal.schema.Type; @@ -46,6 +47,9 @@ import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + /** * Tests {@link AvroSchemaEvolutionUtils}. */ @@ -184,6 +188,37 @@ public void testComplexConvert() { Assertions.assertEquals(schema, AvroInternalSchemaConverter.convert(internalSchema, "newTableName")); } + @Test + public void testNullFieldType() { + Schema schema = create("t1", + new Schema.Field("nullField", Schema.create(Schema.Type.NULL), null, JsonProperties.NULL_VALUE)); + Throwable t = assertThrows(HoodieNullSchemaTypeException.class, + () -> AvroInternalSchemaConverter.convert(schema)); + assertTrue(t.getMessage().contains("'t1.nullField'")); + + Schema schemaArray = create("t2", + new Schema.Field("nullArray", Schema.createArray(Schema.create(Schema.Type.NULL)), null, null)); + t = assertThrows(HoodieNullSchemaTypeException.class, + () -> AvroInternalSchemaConverter.convert(schemaArray)); + assertTrue(t.getMessage().contains("'t2.nullArray.element'")); + + Schema schemaMap = create("t3", + new Schema.Field("nullMap", Schema.createMap(Schema.create(Schema.Type.NULL)), null, null)); + t = assertThrows(HoodieNullSchemaTypeException.class, + () -> AvroInternalSchemaConverter.convert(schemaMap)); + assertTrue(t.getMessage().contains("'t3.nullMap.value'")); + + + Schema schemaComplex = create("t4", + new Schema.Field("complexField", Schema.createMap( + create("nestedStruct", + new Schema.Field("nestedArray", Schema.createArray(Schema.createMap(Schema.create(Schema.Type.NULL))), + null, null))), null, null)); + t = assertThrows(HoodieNullSchemaTypeException.class, + () -> AvroInternalSchemaConverter.convert(schemaComplex)); + assertTrue(t.getMessage().contains("'t4.nestedStruct.nestedArray.element.value'")); + } + @Test public void testRefreshNewId() { Types.RecordType record = Types.RecordType.get(Types.Field.get(0, false, "id", Types.IntType.get()), diff --git a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java index cde9341f5cdf..941587531a50 100644 --- a/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/metadata/TestHoodieMetadataPayload.java @@ -28,6 +28,7 @@ import org.junit.jupiter.api.Test; import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; @@ -39,11 +40,10 @@ * Tests {@link HoodieMetadataPayload}. 
*/ public class TestHoodieMetadataPayload extends HoodieCommonTestHarness { + public static final String PARTITION_NAME = "2022/10/01"; @Test public void testFileSystemMetadataPayloadMerging() { - String partitionName = "2022/10/01"; - Map firstCommitAddedFiles = createImmutableMap( Pair.of("file1.parquet", 1000L), Pair.of("file2.parquet", 2000L), @@ -51,7 +51,7 @@ public void testFileSystemMetadataPayloadMerging() { ); HoodieRecord firstPartitionFilesRecord = - HoodieMetadataPayload.createPartitionFilesRecord(partitionName, firstCommitAddedFiles, Collections.emptyList()); + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, firstCommitAddedFiles, Collections.emptyList()); Map secondCommitAddedFiles = createImmutableMap( // NOTE: This is an append @@ -63,13 +63,13 @@ public void testFileSystemMetadataPayloadMerging() { List secondCommitDeletedFiles = Collections.singletonList("file1.parquet"); HoodieRecord secondPartitionFilesRecord = - HoodieMetadataPayload.createPartitionFilesRecord(partitionName, secondCommitAddedFiles, secondCommitDeletedFiles); + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, secondCommitAddedFiles, secondCommitDeletedFiles); HoodieMetadataPayload combinedPartitionFilesRecordPayload = secondPartitionFilesRecord.getData().preCombine(firstPartitionFilesRecord.getData()); HoodieMetadataPayload expectedCombinedPartitionedFilesRecordPayload = - HoodieMetadataPayload.createPartitionFilesRecord(partitionName, + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, createImmutableMap( Pair.of("file2.parquet", 2000L), Pair.of("file3.parquet", 3333L), @@ -82,9 +82,76 @@ public void testFileSystemMetadataPayloadMerging() { assertEquals(expectedCombinedPartitionedFilesRecordPayload, combinedPartitionFilesRecordPayload); } + @Test + public void testFileSystemMetadataPayloadMergingWithDeletions() { + Map addedFileMap = createImmutableMap( + Pair.of("file1.parquet", 1000L), + Pair.of("file2.parquet", 2000L), + Pair.of("file3.parquet", 3000L), + Pair.of("file4.parquet", 4000L) + ); + HoodieRecord additionRecord = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, addedFileMap, Collections.emptyList()); + + List deletedFileList1 = new ArrayList<>(); + deletedFileList1.add("file1.parquet"); + deletedFileList1.add("file3.parquet"); + HoodieRecord deletionRecord1 = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), deletedFileList1); + + List deletedFileList2 = new ArrayList<>(); + deletedFileList2.add("file1.parquet"); + deletedFileList2.add("file4.parquet"); + HoodieRecord deletionRecord2 = + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, Collections.emptyMap(), deletedFileList2); + + assertEquals( + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, + createImmutableMap( + Pair.of("file2.parquet", 2000L), + Pair.of("file4.parquet", 4000L) + ), + Collections.emptyList() + ).getData(), + deletionRecord1.getData().preCombine(additionRecord.getData()) + ); + + List expectedDeleteFileList = new ArrayList<>(); + expectedDeleteFileList.add("file1.parquet"); + expectedDeleteFileList.add("file3.parquet"); + expectedDeleteFileList.add("file4.parquet"); + + assertEquals( + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, + Collections.emptyMap(), + expectedDeleteFileList + ).getData(), + deletionRecord2.getData().preCombine(deletionRecord1.getData()) + ); + + assertEquals( + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, + 
createImmutableMap( + Pair.of("file2.parquet", 2000L) + ), + Collections.emptyList() + ).getData(), + deletionRecord2.getData().preCombine(deletionRecord1.getData()).preCombine(additionRecord.getData()) + ); + + assertEquals( + HoodieMetadataPayload.createPartitionFilesRecord(PARTITION_NAME, + createImmutableMap( + Pair.of("file2.parquet", 2000L) + ), + Collections.singletonList("file1.parquet") + ).getData(), + deletionRecord2.getData().preCombine(deletionRecord1.getData().preCombine(additionRecord.getData())) + ); + } + @Test public void testColumnStatsPayloadMerging() throws IOException { - String partitionPath = "2022/10/01"; String fileName = "file.parquet"; String targetColName = "c1"; @@ -92,7 +159,7 @@ public void testColumnStatsPayloadMerging() throws IOException { HoodieColumnRangeMetadata.create(fileName, targetColName, 100, 1000, 5, 1000, 123456, 123456); HoodieRecord columnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(c1Metadata), false) + HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1Metadata), false) .findFirst().get(); //////////////////////////////////////////////////////////////////////// @@ -105,7 +172,7 @@ public void testColumnStatsPayloadMerging() throws IOException { HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 500, 0, 100, 12345, 12345); HoodieRecord updatedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(c1AppendedBlockMetadata), false) + HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1AppendedBlockMetadata), false) .findFirst().get(); HoodieMetadataPayload combinedMetadataPayload = @@ -115,7 +182,7 @@ public void testColumnStatsPayloadMerging() throws IOException { HoodieColumnRangeMetadata.create(fileName, targetColName, 0, 1000, 5, 1100, 135801, 135801); HoodieRecord expectedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(expectedColumnRangeMetadata), false) + HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(expectedColumnRangeMetadata), false) .findFirst().get(); // Assert combined payload @@ -135,7 +202,7 @@ public void testColumnStatsPayloadMerging() throws IOException { HoodieColumnRangeMetadata.stub(fileName, targetColName); HoodieRecord deletedColumnStatsRecord = - HoodieMetadataPayload.createColumnStatsRecords(partitionPath, Collections.singletonList(c1StubbedMetadata), true) + HoodieMetadataPayload.createColumnStatsRecords(PARTITION_NAME, Collections.singletonList(c1StubbedMetadata), true) .findFirst().get(); // NOTE: In this case, deleted (or tombstone) record will be therefore deleting diff --git a/hudi-examples/hudi-examples-flink/pom.xml b/hudi-examples/hudi-examples-flink/pom.xml index c2697372e2a9..884d8f161fc9 100644 --- a/hudi-examples/hudi-examples-flink/pom.xml +++ b/hudi-examples/hudi-examples-flink/pom.xml @@ -69,9 +69,6 @@ src/main/resources - - src/test/resources - diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java index a15ac0efe07a..7565ad153002 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java @@ -290,7 +290,7 
@@ private FlinkOptions() { + " log file records(combines the two records with same key for base and log file records), then read the left log file records"); @AdvancedConfig - public static final ConfigOption UTC_TIMEZONE = ConfigOptions + public static final ConfigOption READ_UTC_TIMEZONE = ConfigOptions .key("read.utc-timezone") .booleanType() .defaultValue(true) @@ -316,7 +316,7 @@ private FlinkOptions() { public static final ConfigOption READ_STREAMING_SKIP_COMPACT = ConfigOptions .key("read.streaming.skip_compaction") .booleanType() - .defaultValue(false)// default read as batch + .defaultValue(true) .withDescription("Whether to skip compaction instants and avoid reading compacted base files for streaming read to improve read performance.\n" + "This option can be used to avoid reading duplicates when changelog mode is enabled, it is a solution to keep data integrity\n"); @@ -325,10 +325,18 @@ private FlinkOptions() { public static final ConfigOption READ_STREAMING_SKIP_CLUSTERING = ConfigOptions .key("read.streaming.skip_clustering") .booleanType() - .defaultValue(false) + .defaultValue(true) .withDescription("Whether to skip clustering instants to avoid reading base files of clustering operations for streaming read " + "to improve read performance."); + // this option is experimental + public static final ConfigOption READ_STREAMING_SKIP_INSERT_OVERWRITE = ConfigOptions + .key("read.streaming.skip_insertoverwrite") + .booleanType() + .defaultValue(false) + .withDescription("Whether to skip insert overwrite instants to avoid reading base files of insert overwrite operations for streaming read. " + + "In streaming scenarios, insert overwrite is usually used to repair data; this option controls the visibility of insert overwrite results to downstream streaming reads."); + public static final String START_COMMIT_EARLIEST = "earliest"; public static final ConfigOption READ_START_COMMIT = ConfigOptions .key("read.start-commit") @@ -489,6 +497,15 @@ private FlinkOptions() { public static final String PARTITION_FORMAT_HOUR = "yyyyMMddHH"; public static final String PARTITION_FORMAT_DAY = "yyyyMMdd"; public static final String PARTITION_FORMAT_DASHED_DAY = "yyyy-MM-dd"; + + @AdvancedConfig + public static final ConfigOption WRITE_UTC_TIMEZONE = ConfigOptions + .key("write.utc-timezone") + .booleanType() + .defaultValue(true) + .withDescription("Use UTC timezone or local timezone for the conversion between epoch" + + " time and LocalDateTime. Default value is UTC timezone for forward compatibility."); + @AdvancedConfig public static final ConfigOption PARTITION_FORMAT = ConfigOptions .key("write.partition.format") diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java index ae1a86f36cc4..f73c5d28c5b9 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java @@ -77,6 +77,14 @@ public static boolean isInsertOperation(Configuration conf) { return operationType == WriteOperationType.INSERT; } + /** + * Returns whether the table operation is 'upsert'.
+ */ + public static boolean isUpsertOperation(Configuration conf) { + WriteOperationType operationType = WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)); + return operationType == WriteOperationType.UPSERT; + } + /** * Returns whether the table operation is 'bulk_insert'. */ @@ -142,10 +150,20 @@ public static boolean isPartitionedTable(Configuration conf) { return FilePathUtils.extractPartitionKeys(conf).length > 0; } + /** + * Returns whether the table index is bucket index. + */ public static boolean isBucketIndexType(Configuration conf) { return conf.getString(FlinkOptions.INDEX_TYPE).equalsIgnoreCase(HoodieIndex.IndexType.BUCKET.name()); } + /** + * Returns whether the table is MERGE_ON_READ and is updated with the 'upsert' operation on a bucket index. + */ + public static boolean isMorWithBucketIndexUpsert(Configuration conf) { + return isMorTable(conf) && isUpsertOperation(conf) && isBucketIndexType(conf); + } + public static HoodieIndex.BucketIndexEngineType getBucketEngineType(Configuration conf) { String bucketEngineType = conf.get(FlinkOptions.BUCKET_INDEX_ENGINE_TYPE); return HoodieIndex.BucketIndexEngineType.valueOf(bucketEngineType); } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java index f98f1bde8c8a..26ca245f8bee 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java @@ -437,6 +437,18 @@ private void handleBootstrapEvent(WriteMetadataEvent event) { .filter(evt -> evt.getWriteStatuses().size() > 0) .findFirst().map(WriteMetadataEvent::getInstantTime) .orElse(WriteMetadataEvent.BOOTSTRAP_INSTANT); + + // if the current instant is still pending and the bootstrap event carries no instant, + // reuse the current instant and reject the bootstrap + if (this.metaClient.reloadActiveTimeline().filterInflightsAndRequested().containsInstant(this.instant) + && instant.equals(WriteMetadataEvent.BOOTSTRAP_INSTANT) + && this.tableState.operationType == WriteOperationType.INSERT) { + LOG.warn("Reuse current pending Instant {} with {} operationType, " + + "ignoring empty bootstrap event.", this.instant, WriteOperationType.INSERT.value()); + reset(); + return; + } + initInstant(instant); } } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/sort/SortOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/sort/SortOperator.java index e91535a24736..357bc07160d3 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/sort/SortOperator.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bulk/sort/SortOperator.java @@ -100,7 +100,7 @@ public void open() throws Exception { collector = new StreamRecordCollector<>(output); - // register the the metrics. + // register the metrics.
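The FlinkOptions changes above rename read.utc-timezone (UTC_TIMEZONE becomes READ_UTC_TIMEZONE), add the write-side counterpart write.utc-timezone, flip the defaults of read.streaming.skip_compaction and read.streaming.skip_clustering to true, and introduce the experimental read.streaming.skip_insertoverwrite. Below is a minimal sketch of how a streaming-read job could set these options; the option constants come from this patch, while the class name, table path, and job wiring are illustrative assumptions, not part of the patch.

    import org.apache.flink.configuration.Configuration;
    import org.apache.hudi.configuration.FlinkOptions;

    // Sketch only: option constants are from this patch; everything else is illustrative.
    public class StreamingReadOptionsSketch {
      public static Configuration streamingReadConf(String tablePath) {
        Configuration conf = new Configuration();
        conf.setString(FlinkOptions.PATH, tablePath);                        // pre-existing option, for context
        conf.setBoolean(FlinkOptions.READ_AS_STREAMING, true);               // pre-existing option, for context
        conf.setBoolean(FlinkOptions.READ_UTC_TIMEZONE, true);               // renamed from UTC_TIMEZONE
        conf.setBoolean(FlinkOptions.WRITE_UTC_TIMEZONE, true);              // new write-side counterpart
        conf.setBoolean(FlinkOptions.READ_STREAMING_SKIP_COMPACT, true);     // default changed to true
        conf.setBoolean(FlinkOptions.READ_STREAMING_SKIP_CLUSTERING, true);  // default changed to true
        // Experimental: hide insert-overwrite results (often used for data repair) from downstream streaming reads.
        conf.setBoolean(FlinkOptions.READ_STREAMING_SKIP_INSERT_OVERWRITE, true);
        return conf;
      }
    }

Note that flipping the skip_compaction and skip_clustering defaults is a behavior change for existing streaming readers that relied on the previous false defaults.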
getMetricGroup().gauge("memoryUsedSizeInBytes", (Gauge) sorter::getUsedMemoryInBytes); getMetricGroup().gauge("numSpillFiles", (Gauge) sorter::getNumSpillFiles); getMetricGroup().gauge("spillInBytes", (Gauge) sorter::getSpillInBytes); diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java index bfc7d7d62ad4..0a13bea513d4 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java @@ -23,6 +23,7 @@ import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.keygen.KeyGenerator; import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory; import org.apache.hudi.sink.utils.PayloadCreation; @@ -84,7 +85,7 @@ public RowDataToHoodieFunction(RowType rowType, Configuration config) { public void open(Configuration parameters) throws Exception { super.open(parameters); this.avroSchema = StreamerUtil.getSourceSchema(this.config); - this.converter = RowDataToAvroConverters.createConverter(this.rowType); + this.converter = RowDataToAvroConverters.createConverter(this.rowType, this.config.getBoolean(FlinkOptions.WRITE_UTC_TIMEZONE)); this.keyGenerator = HoodieAvroKeyGeneratorFactory .createKeyGenerator(flinkConf2TypedProperties(this.config)); diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java index c1cd5874d960..ddd7fbbb0a88 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java @@ -35,7 +35,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.configuration.OptionsResolver; -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.metadata.HoodieTableMetadataUtil; import org.apache.hudi.sink.partitioner.profile.WriteProfiles; import org.apache.hudi.source.prune.PartitionPruners; import org.apache.hudi.table.format.cdc.CdcInputSplit; @@ -91,6 +91,8 @@ public class IncrementalInputSplits implements Serializable { private final boolean skipCompaction; // skip clustering private final boolean skipClustering; + // skip insert overwrite + private final boolean skipInsertOverwrite; private IncrementalInputSplits( Configuration conf, @@ -99,7 +101,8 @@ private IncrementalInputSplits( long maxCompactionMemoryInBytes, @Nullable PartitionPruners.PartitionPruner partitionPruner, boolean skipCompaction, - boolean skipClustering) { + boolean skipClustering, + boolean skipInsertOverwrite) { this.conf = conf; this.path = path; this.rowType = rowType; @@ -107,6 +110,7 @@ private IncrementalInputSplits( this.partitionPruner = partitionPruner; this.skipCompaction = skipCompaction; this.skipClustering = skipClustering; + this.skipInsertOverwrite = skipInsertOverwrite; } /** @@ -135,6 +139,7 @@ public Result inputSplits( .rangeType(InstantRange.RangeType.CLOSED_CLOSED) 
.skipCompaction(skipCompaction) .skipClustering(skipClustering) + .skipInsertOverwrite(skipInsertOverwrite) .build(); IncrementalQueryAnalyzer.QueryContext analyzingResult = analyzer.analyze(); @@ -241,6 +246,7 @@ public Result inputSplits( .rangeType(issuedOffset != null ? InstantRange.RangeType.OPEN_CLOSED : InstantRange.RangeType.CLOSED_CLOSED) .skipCompaction(skipCompaction) .skipClustering(skipClustering) + .skipInsertOverwrite(skipInsertOverwrite) .limit(OptionsResolver.getReadCommitsLimit(conf)) .build(); @@ -412,7 +418,7 @@ private FileIndex getFileIndex() { * @return the set of read partitions */ private Set getReadPartitions(List metadataList) { - Set partitions = HoodieInputFormatUtils.getWritePartitionPaths(metadataList); + Set partitions = HoodieTableMetadataUtil.getWritePartitionPaths(metadataList); // apply partition push down if (this.partitionPruner != null) { Set selectedPartitions = this.partitionPruner.filter(partitions); @@ -498,6 +504,8 @@ public static class Builder { private boolean skipCompaction = false; // skip clustering private boolean skipClustering = false; + // skip insert overwrite + private boolean skipInsertOverwrite = false; public Builder() { } @@ -537,10 +545,15 @@ public Builder skipClustering(boolean skipClustering) { return this; } + public Builder skipInsertOverwrite(boolean skipInsertOverwrite) { + this.skipInsertOverwrite = skipInsertOverwrite; + return this; + } + public IncrementalInputSplits build() { return new IncrementalInputSplits( Objects.requireNonNull(this.conf), Objects.requireNonNull(this.path), Objects.requireNonNull(this.rowType), - this.maxCompactionMemoryInBytes, this.partitionPruner, this.skipCompaction, this.skipClustering); + this.maxCompactionMemoryInBytes, this.partitionPruner, this.skipCompaction, this.skipClustering, this.skipInsertOverwrite); } } } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadMonitoringFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadMonitoringFunction.java index fa911cadb0e1..0e3b1f0ce58e 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadMonitoringFunction.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadMonitoringFunction.java @@ -124,6 +124,7 @@ public StreamReadMonitoringFunction( .partitionPruner(partitionPruner) .skipCompaction(conf.getBoolean(FlinkOptions.READ_STREAMING_SKIP_COMPACT)) .skipClustering(conf.getBoolean(FlinkOptions.READ_STREAMING_SKIP_CLUSTERING)) + .skipInsertOverwrite(conf.getBoolean(FlinkOptions.READ_STREAMING_SKIP_INSERT_OVERWRITE)) .build(); } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/filedistribution/partitioner/StreamReadAppendPartitioner.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/filedistribution/partitioner/StreamReadAppendPartitioner.java new file mode 100644 index 000000000000..67bd9f9e324f --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/filedistribution/partitioner/StreamReadAppendPartitioner.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.source.filedistribution.partitioner; + +import org.apache.flink.api.common.functions.Partitioner; + +public class StreamReadAppendPartitioner implements Partitioner { + + private final int parallNum; + + public StreamReadAppendPartitioner(int parallNum) { + this.parallNum = parallNum; + } + + @Override + public int partition(Integer splitNum, int maxParallelism) { + return splitNum % parallNum; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/filedistribution/partitioner/StreamReadBucketIndexPartitioner.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/filedistribution/partitioner/StreamReadBucketIndexPartitioner.java new file mode 100644 index 000000000000..4b5531b67ba9 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/filedistribution/partitioner/StreamReadBucketIndexPartitioner.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.source.filedistribution.partitioner; + +import org.apache.hudi.index.bucket.BucketIdentifier; + +import org.apache.flink.api.common.functions.Partitioner; + +public class StreamReadBucketIndexPartitioner implements Partitioner { + + private final int parallNum; + + public StreamReadBucketIndexPartitioner(int parallNum) { + this.parallNum = parallNum; + } + + @Override + public int partition(String fileName, int maxParallelism) { + return BucketIdentifier.bucketIdFromFileId(fileName) % parallNum; + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/filedistribution/selector/StreamReadAppendKeySelector.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/filedistribution/selector/StreamReadAppendKeySelector.java new file mode 100644 index 000000000000..de4a5f85f9c2 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/filedistribution/selector/StreamReadAppendKeySelector.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.source.filedistribution.selector; + +import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; + +import org.apache.flink.api.java.functions.KeySelector; + +public class StreamReadAppendKeySelector implements KeySelector { + + @Override + public Integer getKey(MergeOnReadInputSplit mergeOnReadInputSplit) throws Exception { + return mergeOnReadInputSplit.getSplitNumber(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/filedistribution/selector/StreamReadBucketIndexKeySelector.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/filedistribution/selector/StreamReadBucketIndexKeySelector.java new file mode 100644 index 000000000000..d1db65596598 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/filedistribution/selector/StreamReadBucketIndexKeySelector.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.source.filedistribution.selector; + +import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; + +import org.apache.flink.api.java.functions.KeySelector; + +public class StreamReadBucketIndexKeySelector implements KeySelector { + + @Override + public String getKey(MergeOnReadInputSplit mergeOnReadInputSplit) throws Exception { + return mergeOnReadInputSplit.getFileId(); + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java index e8050d157618..25ba73f97d3d 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java @@ -455,7 +455,7 @@ public static org.apache.flink.configuration.Configuration toFlinkConfig(FlinkSt conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH, config.sourceAvroSchemaPath); } conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA, config.sourceAvroSchema); - conf.setBoolean(FlinkOptions.UTC_TIMEZONE, config.utcTimezone); + conf.setBoolean(FlinkOptions.READ_UTC_TIMEZONE, config.utcTimezone); conf.setBoolean(FlinkOptions.URL_ENCODE_PARTITIONING, config.writePartitionUrlEncode); conf.setBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING, config.hiveStylePartitioning); conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, config.writeTaskMaxSize); diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java index 68642b39da89..65f0199ae809 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java @@ -28,7 +28,6 @@ import org.apache.hudi.configuration.OptionsResolver; import org.apache.hudi.exception.HoodieValidationException; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.keygen.ComplexAvroKeyGenerator; import org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator; import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator; import org.apache.hudi.util.AvroSchemaConverter; @@ -318,11 +317,7 @@ private static void setupHoodieKeyOptions(Configuration conf, CatalogTable table } } boolean complexHoodieKey = pks.length > 1 || partitions.length > 1; - if (complexHoodieKey && FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.KEYGEN_CLASS_NAME)) { - conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, ComplexAvroKeyGenerator.class.getName()); - LOG.info("Table option [{}] is reset to {} because record key or partition path has two or more fields", - FlinkOptions.KEYGEN_CLASS_NAME.key(), ComplexAvroKeyGenerator.class.getName()); - } + StreamerUtil.checkKeygenGenerator(complexHoodieKey, conf); } /** diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java index 47692c141cf7..b4cfdbb0c9fa 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java @@ -46,6 +46,10 @@ import org.apache.hudi.source.IncrementalInputSplits; import org.apache.hudi.source.StreamReadMonitoringFunction; 
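The four classes above come in Partitioner/KeySelector pairs: the key selector extracts either the split number (append mode) or the file id (bucket-index upsert) from a MergeOnReadInputSplit, and the partitioner maps that key onto a read task with a plain modulo, so splits spread evenly instead of relying on keyBy hashing, which can pile the splits of hot buckets onto a few readers; HoodieTableSource below wires the pairs up via partitionCustom. A minimal, self-contained sketch of the routing math only (parseBucketId is a hypothetical stand-in for BucketIdentifier.bucketIdFromFileId, assuming the bucket number is the zero-padded leading token of the file id):

    // Illustrative only: mirrors how the new partitioners spread splits across read tasks.
    public final class SplitRoutingSketch {

      // Append mode: consecutive split numbers are round-robined across the read tasks.
      static int routeAppendSplit(int splitNumber, int readTasks) {
        return splitNumber % readTasks;
      }

      // Bucket-index upsert: all splits of one bucket land on the same read task.
      static int routeBucketSplit(String fileId, int readTasks) {
        return parseBucketId(fileId) % readTasks;
      }

      // Hypothetical parser; the real code delegates to BucketIdentifier.bucketIdFromFileId.
      static int parseBucketId(String fileId) {
        return Integer.parseInt(fileId.substring(0, 8));
      }

      public static void main(String[] args) {
        System.out.println(routeAppendSplit(7, 4));                // 3
        System.out.println(routeBucketSplit("00000003-a1b2", 4));  // 3
      }
    }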
import org.apache.hudi.source.StreamReadOperator; +import org.apache.hudi.source.filedistribution.partitioner.StreamReadAppendPartitioner; +import org.apache.hudi.source.filedistribution.partitioner.StreamReadBucketIndexPartitioner; +import org.apache.hudi.source.filedistribution.selector.StreamReadAppendKeySelector; +import org.apache.hudi.source.filedistribution.selector.StreamReadBucketIndexKeySelector; import org.apache.hudi.source.prune.DataPruner; import org.apache.hudi.source.prune.PartitionPruners; import org.apache.hudi.source.prune.PrimaryKeyPruners; @@ -204,24 +208,18 @@ public DataStream produceDataStream(StreamExecutionEnvironment execEnv) conf, FilePathUtils.toFlinkPath(path), tableRowType, maxCompactionMemoryInBytes, partitionPruner); InputFormat inputFormat = getInputFormat(true); OneInputStreamOperatorFactory factory = StreamReadOperator.factory((MergeOnReadInputFormat) inputFormat); - DataStream monitorOperatorStream = execEnv.addSource(monitoringFunction, getSourceOperatorName("split_monitor")) + SingleOutputStreamOperator monitorOperatorStream = execEnv.addSource(monitoringFunction, getSourceOperatorName("split_monitor")) .uid(Pipelines.opUID("split_monitor", conf)) .setParallelism(1) .setMaxParallelism(1); - SingleOutputStreamOperator source; - if (OptionsResolver.isAppendMode(HoodieTableSource.this.conf)) { - source = monitorOperatorStream - .transform("split_reader", typeInfo, factory) - .uid(Pipelines.opUID("split_reader", conf)) - .setParallelism(conf.getInteger(FlinkOptions.READ_TASKS)); - } else { - source = monitorOperatorStream - .keyBy(MergeOnReadInputSplit::getFileId) - .transform("split_reader", typeInfo, factory) - .uid(Pipelines.opUID("split_reader", conf)) - .setParallelism(conf.getInteger(FlinkOptions.READ_TASKS)); - } - return new DataStreamSource<>(source); + + DataStream sourceWithKey = addFileDistributionStrategy(monitorOperatorStream); + + SingleOutputStreamOperator streamReadSource = sourceWithKey + .transform("split_reader", typeInfo, factory) + .uid(Pipelines.opUID("split_reader", conf)) + .setParallelism(conf.getInteger(FlinkOptions.READ_TASKS)); + return new DataStreamSource<>(streamReadSource); } else { InputFormatSourceFunction func = new InputFormatSourceFunction<>(getInputFormat(), typeInfo); DataStreamSource source = execEnv.addSource(func, asSummaryString(), typeInfo); @@ -231,6 +229,20 @@ public DataStream produceDataStream(StreamExecutionEnvironment execEnv) }; } + /** + * Specify the file distribution strategy based on different upstream writing mechanisms, + * to prevent hot spot issues during stream reading. + */ + private DataStream addFileDistributionStrategy(SingleOutputStreamOperator source) { + if (OptionsResolver.isMorWithBucketIndexUpsert(conf)) { + return source.partitionCustom(new StreamReadBucketIndexPartitioner(conf.getInteger(FlinkOptions.READ_TASKS)), new StreamReadBucketIndexKeySelector()); + } else if (OptionsResolver.isAppendMode(conf)) { + return source.partitionCustom(new StreamReadAppendPartitioner(conf.getInteger(FlinkOptions.READ_TASKS)), new StreamReadAppendKeySelector()); + } else { + return source.keyBy(MergeOnReadInputSplit::getFileId); + } + } + @Override public ChangelogMode getChangelogMode() { // when read as streaming and changelog mode is enabled, emit as FULL mode; @@ -548,7 +560,7 @@ private MergeOnReadInputFormat mergeOnReadInputFormat( this.predicates, this.limit == NO_LIMIT_CONSTANT ? 
Long.MAX_VALUE : this.limit, // ParquetInputFormat always uses the limit value getParquetConf(this.conf, this.hadoopConf), - this.conf.getBoolean(FlinkOptions.UTC_TIMEZONE), + this.conf.getBoolean(FlinkOptions.READ_UTC_TIMEZONE), this.internalSchemaManager ); } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java index d25db7d82fac..f9088b4096c8 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java @@ -343,6 +343,10 @@ public void createTable(ObjectPath tablePath, CatalogBaseTable catalogTable, boo final String partitions = String.join(",", resolvedTable.getPartitionKeys()); conf.setString(FlinkOptions.PARTITION_PATH_FIELD, partitions); options.put(TableOptionProperties.PARTITION_COLUMNS, partitions); + + final String[] pks = conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(","); + boolean complexHoodieKey = pks.length > 1 || resolvedTable.getPartitionKeys().size() > 1; + StreamerUtil.checkKeygenGenerator(complexHoodieKey, conf); } else { conf.setString(FlinkOptions.KEYGEN_CLASS_NAME.key(), NonpartitionedAvroKeyGenerator.class.getName()); } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieHiveCatalog.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieHiveCatalog.java index 0d3a478f59d0..ce0230e69394 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieHiveCatalog.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieHiveCatalog.java @@ -502,6 +502,9 @@ private void initTableIfNotExists(ObjectPath tablePath, CatalogTable catalogTabl if (catalogTable.isPartitioned() && !flinkConf.contains(FlinkOptions.PARTITION_PATH_FIELD)) { final String partitions = String.join(",", catalogTable.getPartitionKeys()); flinkConf.setString(FlinkOptions.PARTITION_PATH_FIELD, partitions); + final String[] pks = flinkConf.getString(FlinkOptions.RECORD_KEY_FIELD).split(","); + boolean complexHoodieKey = pks.length > 1 || catalogTable.getPartitionKeys().size() > 1; + StreamerUtil.checkKeygenGenerator(complexHoodieKey, flinkConf); } if (!catalogTable.isPartitioned()) { @@ -549,6 +552,7 @@ private Table instantiateHiveTable(ObjectPath tablePath, CatalogBaseTable table, hiveTable.setCreateTime((int) (System.currentTimeMillis() / 1000)); Map properties = new HashMap<>(table.getOptions()); + hiveConf.getAllProperties().forEach((k, v) -> properties.put("hadoop." 
+ k, String.valueOf(v))); if (external) { hiveTable.setTableType(TableType.EXTERNAL_TABLE.toString()); diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java index f13098fc7c7c..29bb0a06d8ce 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java @@ -332,7 +332,7 @@ private ClosableIterator getBaseFileIterator(String path, int[] require return RecordIterators.getParquetRecordIterator( internalSchemaManager, - this.conf.getBoolean(FlinkOptions.UTC_TIMEZONE), + this.conf.getBoolean(FlinkOptions.READ_UTC_TIMEZONE), true, HadoopConfigurations.getParquetConf(this.conf, hadoopConf), fieldNames.toArray(new String[0]), @@ -735,7 +735,7 @@ public MergeIterator( this.emitDelete = emitDelete; this.operationPos = operationPos; this.avroProjection = avroProjection; - this.rowDataToAvroConverter = RowDataToAvroConverters.createConverter(tableRowType); + this.rowDataToAvroConverter = RowDataToAvroConverters.createConverter(tableRowType, flinkConf.getBoolean(FlinkOptions.WRITE_UTC_TIMEZONE)); this.avroToRowDataConverter = AvroToRowDataConverters.createRowConverter(requiredRowType); this.projection = projection; this.instantRange = split.getInstantRange().orElse(null); diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/RowDataToAvroConverters.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/RowDataToAvroConverters.java index ff2903c0a733..23dbe71721a5 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/RowDataToAvroConverters.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/RowDataToAvroConverters.java @@ -77,6 +77,10 @@ public interface RowDataToAvroConverter extends Serializable { * Flink Table & SQL internal data structures to corresponding Avro data structures. 
*/ public static RowDataToAvroConverter createConverter(LogicalType type) { + return createConverter(type, true); + } + + public static RowDataToAvroConverter createConverter(LogicalType type, boolean utcTimezone) { final RowDataToAvroConverter converter; switch (type.getTypeRoot()) { case NULL: @@ -156,8 +160,34 @@ public Object convert(Schema schema, Object object) { }; break; case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + int precision = DataTypeUtils.precision(type); + if (precision <= 3) { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ((TimestampData) object).toInstant().toEpochMilli(); + } + }; + } else if (precision <= 6) { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + Instant instant = ((TimestampData) object).toInstant(); + return Math.addExact(Math.multiplyExact(instant.getEpochSecond(), 1000_000), instant.getNano() / 1000); + } + }; + } else { + throw new UnsupportedOperationException("Unsupported timestamp precision: " + precision); + } + break; case TIMESTAMP_WITHOUT_TIME_ZONE: - final int precision = DataTypeUtils.precision(type); + precision = DataTypeUtils.precision(type); if (precision <= 3) { converter = new RowDataToAvroConverter() { @@ -165,7 +195,7 @@ public Object convert(Schema schema, Object object) { @Override public Object convert(Schema schema, Object object) { - return ((TimestampData) object).toInstant().toEpochMilli(); + return utcTimezone ? ((TimestampData) object).toInstant().toEpochMilli() : ((TimestampData) object).toTimestamp().getTime(); } }; } else if (precision <= 6) { @@ -175,7 +205,7 @@ public Object convert(Schema schema, Object object) { @Override public Object convert(Schema schema, Object object) { - Instant instant = ((TimestampData) object).toInstant(); + Instant instant = utcTimezone ? 
((TimestampData) object).toInstant() : ((TimestampData) object).toTimestamp().toInstant(); return Math.addExact(Math.multiplyExact(instant.getEpochSecond(), 1000_000), instant.getNano() / 1000); } }; @@ -196,14 +226,14 @@ public Object convert(Schema schema, Object object) { }; break; case ARRAY: - converter = createArrayConverter((ArrayType) type); + converter = createArrayConverter((ArrayType) type, utcTimezone); break; case ROW: - converter = createRowConverter((RowType) type); + converter = createRowConverter((RowType) type, utcTimezone); break; case MAP: case MULTISET: - converter = createMapConverter(type); + converter = createMapConverter(type, utcTimezone); break; case RAW: default: @@ -241,10 +271,10 @@ public Object convert(Schema schema, Object object) { }; } - private static RowDataToAvroConverter createRowConverter(RowType rowType) { + private static RowDataToAvroConverter createRowConverter(RowType rowType, boolean utcTimezone) { final RowDataToAvroConverter[] fieldConverters = rowType.getChildren().stream() - .map(RowDataToAvroConverters::createConverter) + .map(type -> createConverter(type, utcTimezone)) .toArray(RowDataToAvroConverter[]::new); final LogicalType[] fieldTypes = rowType.getFields().stream() @@ -276,10 +306,10 @@ public Object convert(Schema schema, Object object) { }; } - private static RowDataToAvroConverter createArrayConverter(ArrayType arrayType) { + private static RowDataToAvroConverter createArrayConverter(ArrayType arrayType, boolean utcTimezone) { LogicalType elementType = arrayType.getElementType(); final ArrayData.ElementGetter elementGetter = ArrayData.createElementGetter(elementType); - final RowDataToAvroConverter elementConverter = createConverter(arrayType.getElementType()); + final RowDataToAvroConverter elementConverter = createConverter(arrayType.getElementType(), utcTimezone); return new RowDataToAvroConverter() { private static final long serialVersionUID = 1L; @@ -299,10 +329,10 @@ public Object convert(Schema schema, Object object) { }; } - private static RowDataToAvroConverter createMapConverter(LogicalType type) { + private static RowDataToAvroConverter createMapConverter(LogicalType type, boolean utcTimezone) { LogicalType valueType = AvroSchemaConverter.extractValueTypeToAvroMap(type); final ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(valueType); - final RowDataToAvroConverter valueConverter = createConverter(valueType); + final RowDataToAvroConverter valueConverter = createConverter(valueType, utcTimezone); return new RowDataToAvroConverter() { private static final long serialVersionUID = 1L; diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java index 064e59cc7512..c4587cc2c0b8 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java @@ -48,6 +48,7 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieValidationException; import org.apache.hudi.hadoop.fs.HadoopFSUtils; +import org.apache.hudi.keygen.ComplexAvroKeyGenerator; import org.apache.hudi.keygen.SimpleAvroKeyGenerator; import org.apache.hudi.schema.FilebasedSchemaProvider; import org.apache.hudi.sink.transform.ChainedTransformer; @@ -69,7 +70,6 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.StringReader; -import 
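To make the new utcTimezone flag in RowDataToAvroConverters concrete: with the flag on (the previous and default behaviour) a TIMESTAMP wall-clock value is turned into epoch millis/micros by treating it as UTC (TimestampData#toInstant); with it off the wall-clock is interpreted in the JVM's default zone (TimestampData#toTimestamp). A rough java.time analogy of the two computations, not the Flink types themselves:

    import java.time.LocalDateTime;
    import java.time.ZoneId;
    import java.time.ZoneOffset;

    public final class TimestampEpochSketch {
      public static void main(String[] args) {
        // A TIMESTAMP(3) wall-clock value such as the literal '1970-01-01 00:00:01'.
        LocalDateTime ts = LocalDateTime.of(1970, 1, 1, 0, 0, 1);

        // utcTimezone = true: the wall-clock is taken to be UTC.
        long utcMillis = ts.toInstant(ZoneOffset.UTC).toEpochMilli();                    // 1000

        // utcTimezone = false: the wall-clock is taken to be in the local/default zone.
        long localMillis = ts.atZone(ZoneId.systemDefault()).toInstant().toEpochMilli(); // zone dependent

        System.out.println(utcMillis + " vs " + localMillis);
      }
    }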
java.text.ParseException; import java.util.ArrayList; import java.util.Collections; import java.util.Date; @@ -265,7 +265,7 @@ public static HoodieTableMetaClient initTableIfNotExists( .initTable(hadoopConf, basePath); LOG.info("Table initialized under base path {}", basePath); } else { - LOG.info("Table [{}/{}] already exists, no need to initialize the table", + LOG.info("Table [path={}, name={}] already exists, no need to initialize the table", basePath, conf.getString(FlinkOptions.TABLE_NAME)); } @@ -380,34 +380,30 @@ public static Option getTableConfig(String basePath, org.apac * Returns the median instant time between the given two instant time. */ public static Option medianInstantTime(String highVal, String lowVal) { - try { - long high = HoodieActiveTimeline.parseDateFromInstantTime(highVal).getTime(); - long low = HoodieActiveTimeline.parseDateFromInstantTime(lowVal).getTime(); - ValidationUtils.checkArgument(high > low, - "Instant [" + highVal + "] should have newer timestamp than instant [" + lowVal + "]"); - long median = low + (high - low) / 2; - final String instantTime = HoodieActiveTimeline.formatDate(new Date(median)); - if (HoodieTimeline.compareTimestamps(lowVal, HoodieTimeline.GREATER_THAN_OR_EQUALS, instantTime) - || HoodieTimeline.compareTimestamps(highVal, HoodieTimeline.LESSER_THAN_OR_EQUALS, instantTime)) { - return Option.empty(); - } - return Option.of(instantTime); - } catch (ParseException e) { - throw new HoodieException("Get median instant time with interval [" + lowVal + ", " + highVal + "] error", e); + long high = HoodieActiveTimeline.parseDateFromInstantTimeSafely(highVal) + .orElseThrow(() -> new HoodieException("Get instant time diff with interval [" + highVal + "] error")).getTime(); + long low = HoodieActiveTimeline.parseDateFromInstantTimeSafely(lowVal) + .orElseThrow(() -> new HoodieException("Get instant time diff with interval [" + lowVal + "] error")).getTime(); + ValidationUtils.checkArgument(high > low, + "Instant [" + highVal + "] should have newer timestamp than instant [" + lowVal + "]"); + long median = low + (high - low) / 2; + final String instantTime = HoodieActiveTimeline.formatDate(new Date(median)); + if (HoodieTimeline.compareTimestamps(lowVal, HoodieTimeline.GREATER_THAN_OR_EQUALS, instantTime) + || HoodieTimeline.compareTimestamps(highVal, HoodieTimeline.LESSER_THAN_OR_EQUALS, instantTime)) { + return Option.empty(); } + return Option.of(instantTime); } /** * Returns the time interval in seconds between the given instant time. 
*/ public static long instantTimeDiffSeconds(String newInstantTime, String oldInstantTime) { - try { - long newTimestamp = HoodieActiveTimeline.parseDateFromInstantTime(newInstantTime).getTime(); - long oldTimestamp = HoodieActiveTimeline.parseDateFromInstantTime(oldInstantTime).getTime(); - return (newTimestamp - oldTimestamp) / 1000; - } catch (ParseException e) { - throw new HoodieException("Get instant time diff with interval [" + oldInstantTime + ", " + newInstantTime + "] error", e); - } + long newTimestamp = HoodieActiveTimeline.parseDateFromInstantTimeSafely(newInstantTime) + .orElseThrow(() -> new HoodieException("Get instant time diff with interval [" + oldInstantTime + ", " + newInstantTime + "] error")).getTime(); + long oldTimestamp = HoodieActiveTimeline.parseDateFromInstantTimeSafely(oldInstantTime) + .orElseThrow(() -> new HoodieException("Get instant time diff with interval [" + oldInstantTime + ", " + newInstantTime + "] error")).getTime(); + return (newTimestamp - oldTimestamp) / 1000; } public static Option createTransformer(List classNames) throws IOException { @@ -518,7 +514,7 @@ public static boolean fileExists(FileSystem fs, Path path) { public static boolean isWriteCommit(HoodieTableType tableType, HoodieInstant instant, HoodieTimeline timeline) { return tableType == HoodieTableType.MERGE_ON_READ ? !instant.getAction().equals(HoodieTimeline.COMMIT_ACTION) // not a compaction - : !ClusteringUtils.isClusteringInstant(instant, timeline); // not a clustering + : !ClusteringUtils.isCompletedClusteringInstant(instant, timeline); // not a clustering } /** @@ -539,4 +535,15 @@ public static void checkPreCombineKey(Configuration conf, List fields) { } } } + + /** + * Validate keygen generator. + */ + public static void checkKeygenGenerator(boolean isComplexHoodieKey, Configuration conf) { + if (isComplexHoodieKey && FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.KEYGEN_CLASS_NAME)) { + conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, ComplexAvroKeyGenerator.class.getName()); + LOG.info("Table option [{}] is reset to {} because record key or partition path has two or more fields", + FlinkOptions.KEYGEN_CLASS_NAME.key(), ComplexAvroKeyGenerator.class.getName()); + } + } } diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/ITTestDataStreamWrite.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/ITTestDataStreamWrite.java index fea986885f8c..47c613ec7847 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/ITTestDataStreamWrite.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/ITTestDataStreamWrite.java @@ -24,7 +24,7 @@ import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.configuration.HadoopConfigurations; import org.apache.hudi.configuration.OptionsInference; -import org.apache.hudi.exception.SchemaCompatibilityException; +import org.apache.hudi.exception.MissingSchemaFieldException; import org.apache.hudi.sink.transform.ChainedTransformer; import org.apache.hudi.sink.transform.Transformer; import org.apache.hudi.sink.utils.Pipelines; @@ -557,13 +557,13 @@ public void testColumnDroppingIsNotAllowed() throws Exception { } catch (JobExecutionException e) { Throwable actualException = e; while (actualException != null) { - if (actualException.getClass() == SchemaCompatibilityException.class) { + if (actualException.getClass() == MissingSchemaFieldException.class) { // test is passed return; } actualException = actualException.getCause(); } } - 
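The new StreamerUtil.checkKeygenGenerator helper centralizes logic that HoodieTableFactory previously inlined and that both catalogs now reuse: when the record key or the partition path has two or more fields and the keygen class is still at its default, it switches the table to ComplexAvroKeyGenerator. A minimal sketch of the intended call pattern, mirroring the catalog call sites above (the option values are made up for illustration):

    import org.apache.flink.configuration.Configuration;
    import org.apache.hudi.configuration.FlinkOptions;
    import org.apache.hudi.util.StreamerUtil;

    public final class KeygenCheckSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.setString(FlinkOptions.RECORD_KEY_FIELD, "uuid,name");   // composite record key
        conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "par1");

        boolean complexHoodieKey =
            conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(",").length > 1
                || conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(",").length > 1;

        // Resets the keygen class to ComplexAvroKeyGenerator only if it is still at its default.
        StreamerUtil.checkKeygenGenerator(complexHoodieKey, conf);
        System.out.println(conf.getString(FlinkOptions.KEYGEN_CLASS_NAME));
      }
    }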
throw new AssertionError(String.format("Excepted exception %s is not found", SchemaCompatibilityException.class)); + throw new AssertionError(String.format("Excepted exception %s is not found", MissingSchemaFieldException.class)); } } diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java index 0f8651b4b05e..c3e77a379390 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java @@ -148,6 +148,8 @@ public void testSubtaskFails() throws Exception { .end(); } + // Only when Job level fails with INSERT operationType can we roll back the unfinished instant. + // Task level failed retry, we should reuse the unfinished Instant with INSERT operationType @Test public void testPartialFailover() throws Exception { conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, 1L); @@ -163,7 +165,7 @@ public void testPartialFailover() throws Exception { .assertNextEvent() // if the write task can not fetch any pending instant when starts up(the coordinator restarts), // it will send an event to the coordinator - .coordinatorFails() + .restartCoordinator() .subTaskFails(0, 2) // the subtask can not fetch the instant to write until a new instant is initialized .checkpointThrows(4, "Timeout(1000ms) while waiting for instant initialize") @@ -172,6 +174,13 @@ public void testPartialFailover() throws Exception { // the last checkpoint instant was rolled back by subTaskFails(0, 2) // with EAGER cleaning strategy .assertNoEvent() + .checkpoint(4) + .assertNextEvent() + .subTaskFails(0, 4) + // the last checkpoint instant can not be rolled back by subTaskFails(0, 4) with INSERT write operationType + // because last data has been snapshot by checkpoint complete but instant has not been committed + // so we need re-commit it + .assertEmptyEvent() .end(); } diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/BulkInsertFunctionWrapper.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/BulkInsertFunctionWrapper.java index 92f8f6decda0..ca6a317cfad6 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/BulkInsertFunctionWrapper.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/BulkInsertFunctionWrapper.java @@ -72,7 +72,7 @@ public class BulkInsertFunctionWrapper implements TestFunctionWrapper { private final MockStreamingRuntimeContext runtimeContext; private final MockOperatorEventGateway gateway; private final MockOperatorCoordinatorContext coordinatorContext; - private final StreamWriteOperatorCoordinator coordinator; + private StreamWriteOperatorCoordinator coordinator; private final boolean needSortInput; private BulkInsertWriteFunction writeFunction; @@ -160,6 +160,13 @@ public void coordinatorFails() throws Exception { this.coordinator.setExecutor(new MockCoordinatorExecutor(coordinatorContext)); } + public void restartCoordinator() throws Exception { + this.coordinator.close(); + this.coordinator = new StreamWriteOperatorCoordinator(conf, this.coordinatorContext); + this.coordinator.start(); + this.coordinator.setExecutor(new MockCoordinatorExecutor(coordinatorContext)); + } + public void checkpointFails(long checkpointId) { coordinator.notifyCheckpointAborted(checkpointId); } diff --git 
a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/InsertFunctionWrapper.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/InsertFunctionWrapper.java index cb144e92ba06..15634cc6e72b 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/InsertFunctionWrapper.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/InsertFunctionWrapper.java @@ -59,7 +59,7 @@ public class InsertFunctionWrapper implements TestFunctionWrapper { private final MockStreamingRuntimeContext runtimeContext; private final MockOperatorEventGateway gateway; private final MockOperatorCoordinatorContext coordinatorContext; - private final StreamWriteOperatorCoordinator coordinator; + private StreamWriteOperatorCoordinator coordinator; private final MockStateInitializationContext stateInitializationContext; private final boolean asyncClustering; @@ -152,6 +152,13 @@ public void coordinatorFails() throws Exception { this.coordinator.setExecutor(new MockCoordinatorExecutor(coordinatorContext)); } + public void restartCoordinator() throws Exception { + this.coordinator.close(); + this.coordinator = new StreamWriteOperatorCoordinator(conf, this.coordinatorContext); + this.coordinator.start(); + this.coordinator.setExecutor(new MockCoordinatorExecutor(coordinatorContext)); + } + public void checkpointFails(long checkpointId) { coordinator.notifyCheckpointAborted(checkpointId); } diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/StreamWriteFunctionWrapper.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/StreamWriteFunctionWrapper.java index cf801bb0d7d0..c65e42f1521a 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/StreamWriteFunctionWrapper.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/StreamWriteFunctionWrapper.java @@ -71,7 +71,7 @@ public class StreamWriteFunctionWrapper implements TestFunctionWrapper { private final MockStreamingRuntimeContext runtimeContext; private final MockOperatorEventGateway gateway; private final MockOperatorCoordinatorContext coordinatorContext; - private final StreamWriteOperatorCoordinator coordinator; + private StreamWriteOperatorCoordinator coordinator; private final MockStateInitializationContext stateInitializationContext; /** @@ -227,6 +227,13 @@ public void coordinatorFails() throws Exception { this.coordinator.setExecutor(new MockCoordinatorExecutor(coordinatorContext)); } + public void restartCoordinator() throws Exception { + this.coordinator.close(); + this.coordinator = new StreamWriteOperatorCoordinator(conf, this.coordinatorContext); + this.coordinator.start(); + this.coordinator.setExecutor(new MockCoordinatorExecutor(coordinatorContext)); + } + public void checkpointFails(long checkpointId) { coordinator.notifyCheckpointAborted(checkpointId); } diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestFunctionWrapper.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestFunctionWrapper.java index 25593d8d2fd2..faee168bf251 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestFunctionWrapper.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestFunctionWrapper.java @@ -82,6 +82,14 @@ default void coordinatorFails() throws Exception { throw new UnsupportedOperationException(); } + /** + * 
Triggers Job level fail, so the coordinator need re-create a new instance. + * @throws Exception + */ + default void restartCoordinator() throws Exception { + throw new UnsupportedOperationException(); + } + /** * Returns the operator coordinator. */ diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestWriteBase.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestWriteBase.java index f40bc9c365aa..22ff06c606b6 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestWriteBase.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestWriteBase.java @@ -517,8 +517,8 @@ public TestHarness checkLastPendingInstantCompleted() { * Used to simulate the use case that the coordinator has not finished a new instant initialization, * while the write task fails intermittently. */ - public TestHarness coordinatorFails() throws Exception { - this.pipeline.coordinatorFails(); + public TestHarness restartCoordinator() throws Exception { + this.pipeline.restartCoordinator(); return this; } diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestIncrementalInputSplits.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestIncrementalInputSplits.java index 64211608e058..c15e4c628b64 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestIncrementalInputSplits.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/source/TestIncrementalInputSplits.java @@ -67,6 +67,7 @@ import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN; import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeCommitMetadata; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertIterableEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -142,18 +143,20 @@ void testFilterInstantsWithRange() throws IOException { } @Test - void testFilterInstantsByCondition() throws IOException { - Configuration conf = TestConfigurations.getDefaultConf(basePath); + void testFilterInstantsByConditionForMOR() throws IOException { metaClient = HoodieTestUtils.init(basePath, HoodieTableType.MERGE_ON_READ); + HoodieActiveTimeline timelineMOR = metaClient.getActiveTimeline(); - HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); + // commit1: delta commit HoodieInstant commit1 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "1"); + timelineMOR.createCompleteInstant(commit1); + // commit2: delta commit HoodieInstant commit2 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "2"); + // commit3: clustering + timelineMOR.createCompleteInstant(commit2); HoodieInstant commit3 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, "3"); - timeline.createCompleteInstant(commit1); - timeline.createCompleteInstant(commit2); - timeline.createNewInstant(commit3); - commit3 = timeline.transitionReplaceRequestedToInflight(commit3, Option.empty()); + timelineMOR.createNewInstant(commit3); + commit3 = timelineMOR.transitionReplaceRequestedToInflight(commit3, Option.empty()); HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata( new ArrayList<>(), new HashMap<>(), @@ -161,15 +164,139 @@ void testFilterInstantsByCondition() 
throws IOException { WriteOperationType.CLUSTER, "", HoodieTimeline.REPLACE_COMMIT_ACTION); - timeline.transitionReplaceInflightToComplete(true, + timelineMOR.transitionReplaceInflightToComplete(true, HoodieTimeline.getReplaceCommitInflightInstant(commit3.getTimestamp()), serializeCommitMetadata(commitMetadata)); - timeline = timeline.reload(); + // commit4: insert overwrite + HoodieInstant commit4 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, "4"); + timelineMOR.createNewInstant(commit4); + commit4 = timelineMOR.transitionReplaceRequestedToInflight(commit4, Option.empty()); + commitMetadata = CommitUtils.buildMetadata( + new ArrayList<>(), + new HashMap<>(), + Option.empty(), + WriteOperationType.INSERT_OVERWRITE, + "", + HoodieTimeline.REPLACE_COMMIT_ACTION); + timelineMOR.transitionReplaceInflightToComplete(true, + HoodieTimeline.getReplaceCommitInflightInstant(commit4.getTimestamp()), + serializeCommitMetadata(commitMetadata)); + // commit5: insert overwrite table + HoodieInstant commit5 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, "5"); + timelineMOR.createNewInstant(commit5); + commit5 = timelineMOR.transitionReplaceRequestedToInflight(commit5, Option.empty()); + commitMetadata = CommitUtils.buildMetadata( + new ArrayList<>(), + new HashMap<>(), + Option.empty(), + WriteOperationType.INSERT_OVERWRITE_TABLE, + "", + HoodieTimeline.REPLACE_COMMIT_ACTION); + timelineMOR.transitionReplaceInflightToComplete(true, + HoodieTimeline.getReplaceCommitInflightInstant(commit5.getTimestamp()), + serializeCommitMetadata(commitMetadata)); + // commit6: compaction + HoodieInstant commit6 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "6"); + timelineMOR.createNewInstant(commit6); + commit6 = timelineMOR.transitionCompactionRequestedToInflight(commit6); + commit6 = timelineMOR.transitionCompactionInflightToComplete(false, commit6, Option.empty()); + timelineMOR.createCompleteInstant(commit6); + timelineMOR = timelineMOR.reload(); + + // will not filter commits by default + HoodieTimeline resTimeline = IncrementalQueryAnalyzer.filterInstantsAsPerUserConfigs(metaClient, timelineMOR, false, false, false); + assertEquals(6, resTimeline.getInstants().size()); + + // filter cluster commits + resTimeline = IncrementalQueryAnalyzer.filterInstantsAsPerUserConfigs(metaClient, timelineMOR, false, true, false); + assertEquals(5, resTimeline.getInstants().size()); + assertFalse(resTimeline.containsInstant(commit3)); + + // filter compaction commits for mor table + resTimeline = IncrementalQueryAnalyzer.filterInstantsAsPerUserConfigs(metaClient, timelineMOR, true, false, false); + assertFalse(resTimeline.containsInstant(commit6)); + + // filter insert overwriter commits + resTimeline = IncrementalQueryAnalyzer.filterInstantsAsPerUserConfigs(metaClient, timelineMOR, false, false, true); + assertEquals(4, resTimeline.getInstants().size()); + assertFalse(resTimeline.containsInstant(commit4)); + assertFalse(resTimeline.containsInstant(commit5)); + } + + @Test + void testFilterInstantsByConditionForCOW() throws IOException { + metaClient = HoodieTestUtils.init(basePath, HoodieTableType.COPY_ON_WRITE); + HoodieActiveTimeline timelineCOW = metaClient.getActiveTimeline(); + + // commit1: commit + HoodieInstant commit1 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "1"); + timelineCOW.createCompleteInstant(commit1); + // commit2: commit + HoodieInstant commit2 = new 
HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "2"); + // commit3: clustering + timelineCOW.createCompleteInstant(commit2); + HoodieInstant commit3 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, "3"); + timelineCOW.createNewInstant(commit3); + commit3 = timelineCOW.transitionReplaceRequestedToInflight(commit3, Option.empty()); + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata( + new ArrayList<>(), + new HashMap<>(), + Option.empty(), + WriteOperationType.CLUSTER, + "", + HoodieTimeline.REPLACE_COMMIT_ACTION); + timelineCOW.transitionReplaceInflightToComplete(true, + HoodieTimeline.getReplaceCommitInflightInstant(commit3.getTimestamp()), + serializeCommitMetadata(commitMetadata)); + // commit4: insert overwrite + HoodieInstant commit4 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, "4"); + timelineCOW.createNewInstant(commit4); + commit4 = timelineCOW.transitionReplaceRequestedToInflight(commit4, Option.empty()); + commitMetadata = CommitUtils.buildMetadata( + new ArrayList<>(), + new HashMap<>(), + Option.empty(), + WriteOperationType.INSERT_OVERWRITE, + "", + HoodieTimeline.REPLACE_COMMIT_ACTION); + timelineCOW.transitionReplaceInflightToComplete(true, + HoodieTimeline.getReplaceCommitInflightInstant(commit4.getTimestamp()), + serializeCommitMetadata(commitMetadata)); + // commit5: insert overwrite table + HoodieInstant commit5 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, "5"); + timelineCOW.createNewInstant(commit5); + commit5 = timelineCOW.transitionReplaceRequestedToInflight(commit5, Option.empty()); + commitMetadata = CommitUtils.buildMetadata( + new ArrayList<>(), + new HashMap<>(), + Option.empty(), + WriteOperationType.INSERT_OVERWRITE_TABLE, + "", + HoodieTimeline.REPLACE_COMMIT_ACTION); + timelineCOW.transitionReplaceInflightToComplete(true, + HoodieTimeline.getReplaceCommitInflightInstant(commit5.getTimestamp()), + serializeCommitMetadata(commitMetadata)); + + timelineCOW = timelineCOW.reload(); + + // will not filter commits by default + HoodieTimeline resTimeline = IncrementalQueryAnalyzer.filterInstantsAsPerUserConfigs(metaClient, timelineCOW, false, false, false); + assertEquals(5, resTimeline.getInstants().size()); + + // filter cluster commits + resTimeline = IncrementalQueryAnalyzer.filterInstantsAsPerUserConfigs(metaClient, timelineCOW, false, true, false); + assertEquals(4, resTimeline.getInstants().size()); + assertFalse(resTimeline.containsInstant(commit3)); + + // cow table skip-compact does not take effect (because if it take effect will affect normal commits) + resTimeline = IncrementalQueryAnalyzer.filterInstantsAsPerUserConfigs(metaClient, timelineCOW, true, false, false); + assertEquals(5, resTimeline.getInstants().size()); - conf.set(FlinkOptions.READ_END_COMMIT, "3"); - HoodieTimeline resTimeline = IncrementalQueryAnalyzer.filterInstantsAsPerUserConfigs(metaClient, timeline, false, false); - // will not filter cluster commit by default + // filter insert overwriter commits + resTimeline = IncrementalQueryAnalyzer.filterInstantsAsPerUserConfigs(metaClient, timelineCOW, false, false, true); assertEquals(3, resTimeline.getInstants().size()); + assertFalse(resTimeline.containsInstant(commit4)); + assertFalse(resTimeline.containsInstant(commit5)); } @Test diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestHoodieDataSource.java 
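The tests above exercise IncrementalQueryAnalyzer.filterInstantsAsPerUserConfigs with the three user-facing skip switches (compaction, clustering, and the new insert-overwrite one), and the ITTestHoodieDataSource changes that follow now set read.streaming.skip_compaction explicitly, which suggests its effective default for streaming reads has changed. A hedged end-to-end sketch of a streaming read that opts out of all three commit types; the key name 'read.streaming.skip_insertoverwrite' is an assumption based on the FlinkOptions.READ_STREAMING_SKIP_INSERT_OVERWRITE naming and should be checked against FlinkOptions:

    import org.apache.flink.table.api.EnvironmentSettings;
    import org.apache.flink.table.api.TableEnvironment;

    public final class StreamingSkipOptionsSketch {
      public static void main(String[] args) {
        TableEnvironment tableEnv = TableEnvironment.create(EnvironmentSettings.inStreamingMode());

        // Streaming read of a MOR table that skips compaction, clustering and
        // insert-overwrite instants on the incremental timeline.
        tableEnv.executeSql(
            "create table t1 (\n"
                + "  uuid varchar(20) primary key not enforced,\n"
                + "  name varchar(10),\n"
                + "  ts timestamp(3),\n"
                + "  `partition` varchar(10)\n"
                + ") partitioned by (`partition`) with (\n"
                + "  'connector' = 'hudi',\n"
                + "  'path' = '/tmp/hudi/t1',\n"
                + "  'table.type' = 'MERGE_ON_READ',\n"
                + "  'read.streaming.enabled' = 'true',\n"
                + "  'read.streaming.skip_compaction' = 'true',\n"
                + "  'read.streaming.skip_clustering' = 'true',\n"
                + "  'read.streaming.skip_insertoverwrite' = 'true'\n"
                + ")");
      }
    }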
b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestHoodieDataSource.java index 22a2511ba073..ac38d92a5770 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestHoodieDataSource.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestHoodieDataSource.java @@ -60,7 +60,10 @@ import org.junit.jupiter.params.provider.ValueSource; import java.io.File; +import java.time.Instant; +import java.time.LocalDateTime; import java.time.ZoneId; +import java.time.format.DateTimeFormatter; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; @@ -131,6 +134,7 @@ void testStreamWriteAndReadFromSpecifiedCommit(HoodieTableType tableType) throws hoodieTableDDL = sql("t1") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.READ_STREAMING_SKIP_COMPACT, false) .option(FlinkOptions.TABLE_TYPE, tableType) .option(FlinkOptions.READ_START_COMMIT, firstCommit) .end(); @@ -163,6 +167,7 @@ void testStreamReadFromSpecifiedCommitWithChangelog(HoodieCDCSupplementalLogging String hoodieTableDDL = sql("t1") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.READ_STREAMING_SKIP_COMPACT, false) .option(FlinkOptions.CDC_ENABLED, true) .option(FlinkOptions.SUPPLEMENTAL_LOGGING_MODE, mode.name()) .end(); @@ -196,6 +201,7 @@ void testStreamWriteAndRead(HoodieTableType tableType) throws Exception { String hoodieTableDDL = sql("t1") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.READ_STREAMING_SKIP_COMPACT, false) .option(FlinkOptions.TABLE_TYPE, tableType) .end(); streamTableEnv.executeSql(hoodieTableDDL); @@ -239,6 +245,7 @@ void testStreamReadAppendData(HoodieTableType tableType) throws Exception { String createHoodieTable2 = sql("t2") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.READ_STREAMING_SKIP_COMPACT, false) .option(FlinkOptions.TABLE_TYPE, tableType) .option(FlinkOptions.READ_START_COMMIT, specifiedCommit) .end(); @@ -332,7 +339,6 @@ void testStreamWriteReadSkippingCompaction() throws Exception { .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.TABLE_TYPE, FlinkOptions.TABLE_TYPE_MERGE_ON_READ) .option(FlinkOptions.READ_AS_STREAMING, true) - .option(FlinkOptions.READ_STREAMING_SKIP_COMPACT, true) .option(FlinkOptions.COMPACTION_DELTA_COMMITS, 1) .option(FlinkOptions.COMPACTION_TASKS, 1) .end(); @@ -359,7 +365,6 @@ void testAppendWriteReadSkippingClustering() throws Exception { .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.OPERATION, "insert") .option(FlinkOptions.READ_AS_STREAMING, true) - .option(FlinkOptions.READ_STREAMING_SKIP_CLUSTERING, true) .option(FlinkOptions.CLUSTERING_SCHEDULE_ENABLED,true) .option(FlinkOptions.CLUSTERING_ASYNC_ENABLED, true) .option(FlinkOptions.CLUSTERING_DELTA_COMMITS,1) @@ -490,6 +495,7 @@ void testStreamReadFilterByPartition(HoodieTableType tableType, boolean hiveStyl .option(FlinkOptions.TABLE_TYPE, tableType) .option(FlinkOptions.READ_AS_STREAMING, true) .option(FlinkOptions.READ_STREAMING_CHECK_INTERVAL, 2) + .option(FlinkOptions.READ_STREAMING_SKIP_COMPACT, false) .option(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning) .end(); streamTableEnv.executeSql(hoodieTableDDL); @@ -675,7 +681,8 @@ void 
testWriteAndReadParMiddle(ExecMode execMode) throws Exception { + "with (\n" + " 'connector' = 'hudi',\n" + " 'path' = '" + tempFile.getAbsolutePath() + "',\n" - + " 'read.streaming.enabled' = '" + streaming + "'\n" + + " 'read.streaming.enabled' = '" + streaming + "',\n" + + " 'read.streaming.skip_compaction' = 'false'\n" + ")"; streamTableEnv.executeSql(hoodieTableDDL); String insertInto = "insert into t1 values\n" @@ -721,6 +728,7 @@ void testWriteAndReadWithTimestampMicros(ExecMode execMode) throws Exception { .noPartition() .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.READ_AS_STREAMING, streaming) + .option(FlinkOptions.READ_STREAMING_SKIP_COMPACT, false) .end(); streamTableEnv.executeSql(hoodieTableDDL); String insertInto = "insert into t1 values\n" @@ -824,6 +832,7 @@ void testStreamWriteAndReadWithMiniBatches(HoodieTableType tableType) throws Exc String hoodieTableDDL = sql("t1") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.READ_AS_STREAMING, true) + .option(FlinkOptions.READ_STREAMING_SKIP_COMPACT, false) .option(FlinkOptions.TABLE_TYPE, tableType) .option(FlinkOptions.READ_START_COMMIT, "earliest") .option(FlinkOptions.WRITE_BATCH_SIZE, 0.00001) @@ -1076,6 +1085,7 @@ void testWriteAndReadDebeziumJson(ExecMode execMode) throws Exception { .pkField("id") .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) .option(FlinkOptions.READ_AS_STREAMING, execMode == ExecMode.STREAM) + .option(FlinkOptions.READ_STREAMING_SKIP_COMPACT, false) .option(FlinkOptions.PRE_COMBINE, true) .noPartition() .end(); @@ -1825,6 +1835,56 @@ void testWriteReadWithLocalTimestamp(HoodieTableType tableType) { assertRowsEquals(result, expected); } + @ParameterizedTest + @EnumSource(value = HoodieTableType.class) + void testWriteReadWithTimestampWithoutTZ(HoodieTableType tableType) { + TableEnvironment tableEnv = batchTableEnv; + tableEnv.getConfig().setLocalTimeZone(ZoneId.of("America/Los_Angeles")); + String createTable = sql("t1") + .field("f0 int") + .field("f1 varchar(10)") + .field("f2 TIMESTAMP(3)") + .field("f3 TIMESTAMP(6)") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.PRECOMBINE_FIELD, "f1") + .option(FlinkOptions.TABLE_TYPE, tableType) + .option(FlinkOptions.WRITE_UTC_TIMEZONE, false) + //FlinkOptions.READ_UTC_TIMEZONE doesn't affect in MergeOnReadInputFormat since the option isn't supported in AvroToRowDataConverters + //.option(FlinkOptions.READ_UTC_TIMEZONE, false) + .pkField("f0") + .noPartition() + .end(); + tableEnv.executeSql(createTable); + + long epochMillis = 0L; + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + String insertInto = "insert into t1 values\n" + + "(1" + + ", 'abc'" + + ", TIMESTAMP '" + formatter.format(LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMillis + 1000), ZoneId.systemDefault())) + "'" + + ", TIMESTAMP '" + formatter.format(LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMillis + 2000), ZoneId.systemDefault())) + "'),\n" + + "(2" + + ", 'def'" + + ", TIMESTAMP '" + formatter.format(LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMillis + 3000), ZoneId.systemDefault())) + "'" + + ", TIMESTAMP '" + formatter.format(LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMillis + 4000), ZoneId.systemDefault())) + "')"; + execInsertSql(tableEnv, insertInto); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + formatter = 
DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss"); + final String expected = "[" + + "+I[1" + + ", abc" + + ", " + formatter.format(LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMillis + 1000), ZoneId.of("UTC"))) + + ", " + formatter.format(LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMillis + 2000), ZoneId.of("UTC"))) + "], " + + "+I[2" + + ", def" + + ", " + formatter.format(LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMillis + 3000), ZoneId.of("UTC"))) + + ", " + formatter.format(LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMillis + 4000), ZoneId.of("UTC"))) + "]]"; + + assertRowsEquals(result, expected); + } + @ParameterizedTest @MethodSource("tableTypeQueryTypeNumInsertAndCompactionDeltaCommitsParams") void testReadMetaFields(HoodieTableType tableType, String queryType, int numInsertBatches, int compactionDeltaCommits) throws Exception { @@ -1969,6 +2029,7 @@ void testDynamicPartitionPrune(HoodieTableType tableType, boolean hiveStyleParti .option(FlinkOptions.TABLE_TYPE, tableType) .option(FlinkOptions.READ_AS_STREAMING, true) .option(FlinkOptions.READ_STREAMING_CHECK_INTERVAL, 2) + .option(FlinkOptions.READ_STREAMING_SKIP_COMPACT, false) .option(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning) .end(); streamTableEnv.executeSql(hoodieTableDDL); diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestSchemaEvolution.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestSchemaEvolution.java index 0417285815a9..46f51df741f1 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestSchemaEvolution.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestSchemaEvolution.java @@ -90,7 +90,8 @@ public void testCopyOnWriteInputFormat() throws Exception { public void testMergeOnReadInputFormatBaseFileOnlyIterator() throws Exception { TableOptions tableOptions = defaultTableOptions(tempFile.getAbsolutePath()) .withOption(FlinkOptions.READ_AS_STREAMING.key(), true) - .withOption(FlinkOptions.READ_START_COMMIT.key(), FlinkOptions.START_COMMIT_EARLIEST); + .withOption(FlinkOptions.READ_START_COMMIT.key(), FlinkOptions.START_COMMIT_EARLIEST) + .withOption(FlinkOptions.READ_STREAMING_SKIP_COMPACT.key(), false); testSchemaEvolution(tableOptions); } @@ -98,7 +99,8 @@ public void testMergeOnReadInputFormatBaseFileOnlyIterator() throws Exception { public void testMergeOnReadInputFormatBaseFileOnlyFilteringIterator() throws Exception { TableOptions tableOptions = defaultTableOptions(tempFile.getAbsolutePath()) .withOption(FlinkOptions.READ_AS_STREAMING.key(), true) - .withOption(FlinkOptions.READ_START_COMMIT.key(), 1); + .withOption(FlinkOptions.READ_START_COMMIT.key(), 1) + .withOption(FlinkOptions.READ_STREAMING_SKIP_COMPACT.key(), false); testSchemaEvolution(tableOptions); } diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieCatalog.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieCatalog.java index 0207022903b4..d883b72b075d 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieCatalog.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieCatalog.java @@ -28,6 +28,7 @@ import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.configuration.HadoopConfigurations; import org.apache.hudi.exception.HoodieValidationException; +import 
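A worked reading of testWriteReadWithTimestampWithoutTZ above, assuming the JVM runs in a zone behind UTC such as America/Los_Angeles: the inserted literals are the wall-clock renderings of epoch 1000/2000/3000/4000 ms in the default zone (e.g. '1969-12-31 16:00:01' for 1000 ms at UTC-8); with WRITE_UTC_TIMEZONE set to false the writer interprets those wall-clocks in the default zone, so exactly those epoch values are stored; the reader keeps UTC semantics on this path (the commented-out READ_UTC_TIMEZONE option does not apply here), so the values come back rendered at UTC, which is why the expected strings are formatted with ZoneId.of("UTC"). A small check of the first value (illustrative, zone hard-coded):

    import java.time.Instant;
    import java.time.LocalDateTime;
    import java.time.ZoneId;

    public final class TimestampRoundTripSketch {
      public static void main(String[] args) {
        ZoneId la = ZoneId.of("America/Los_Angeles");
        // What the test inserts for epoch 1000 ms when the default zone is LA ...
        System.out.println(LocalDateTime.ofInstant(Instant.ofEpochMilli(1000), la));                // 1969-12-31T16:00:01
        // ... and what it expects to read back, rendered at UTC.
        System.out.println(LocalDateTime.ofInstant(Instant.ofEpochMilli(1000), ZoneId.of("UTC")));  // 1970-01-01T00:00:01
      }
    }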
org.apache.hudi.keygen.ComplexAvroKeyGenerator; import org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator; import org.apache.hudi.keygen.SimpleAvroKeyGenerator; import org.apache.hudi.sink.partitioner.profile.WriteProfiles; @@ -35,6 +36,7 @@ import org.apache.hudi.utils.TestConfigurations; import org.apache.hudi.utils.TestData; +import org.apache.flink.calcite.shaded.com.google.common.collect.Lists; import org.apache.flink.configuration.Configuration; import org.apache.flink.core.fs.Path; import org.apache.flink.table.api.DataTypes; @@ -108,6 +110,13 @@ public class TestHoodieCatalog { Collections.emptyList(), CONSTRAINTS); + private static final UniqueConstraint MULTI_KEY_CONSTRAINTS = UniqueConstraint.primaryKey("uuid", Arrays.asList("uuid", "name")); + private static final ResolvedSchema CREATE_MULTI_KEY_TABLE_SCHEMA = + new ResolvedSchema( + CREATE_COLUMNS, + Collections.emptyList(), + MULTI_KEY_CONSTRAINTS); + private static final List EXPECTED_TABLE_COLUMNS = CREATE_COLUMNS.stream() .map( @@ -258,6 +267,40 @@ public void testCreateTable() throws Exception { String keyGeneratorClassName = metaClient.getTableConfig().getKeyGeneratorClassName(); assertEquals(keyGeneratorClassName, SimpleAvroKeyGenerator.class.getName()); + // validate single key and multiple partition for partitioned table + ObjectPath singleKeyMultiplePartitionPath = new ObjectPath(TEST_DEFAULT_DATABASE, "tb_skmp" + System.currentTimeMillis()); + final ResolvedCatalogTable singleKeyMultiplePartitionTable = new ResolvedCatalogTable( + CatalogTable.of( + Schema.newBuilder().fromResolvedSchema(CREATE_TABLE_SCHEMA).build(), + "test", + Lists.newArrayList("par1", "par2"), + EXPECTED_OPTIONS), + CREATE_TABLE_SCHEMA + ); + + catalog.createTable(singleKeyMultiplePartitionPath, singleKeyMultiplePartitionTable, false); + metaClient = + StreamerUtil.createMetaClient(catalog.inferTablePath(catalogPathStr, singleKeyMultiplePartitionPath), new org.apache.hadoop.conf.Configuration()); + keyGeneratorClassName = metaClient.getTableConfig().getKeyGeneratorClassName(); + assertThat(keyGeneratorClassName, is(ComplexAvroKeyGenerator.class.getName())); + + // validate multiple key and single partition for partitioned table + ObjectPath multipleKeySinglePartitionPath = new ObjectPath(TEST_DEFAULT_DATABASE, "tb_mksp" + System.currentTimeMillis()); + final ResolvedCatalogTable multipleKeySinglePartitionTable = new ResolvedCatalogTable( + CatalogTable.of( + Schema.newBuilder().fromResolvedSchema(CREATE_MULTI_KEY_TABLE_SCHEMA).build(), + "test", + Lists.newArrayList("par1"), + EXPECTED_OPTIONS), + CREATE_TABLE_SCHEMA + ); + + catalog.createTable(multipleKeySinglePartitionPath, multipleKeySinglePartitionTable, false); + metaClient = + StreamerUtil.createMetaClient(catalog.inferTablePath(catalogPathStr, singleKeyMultiplePartitionPath), new org.apache.hadoop.conf.Configuration()); + keyGeneratorClassName = metaClient.getTableConfig().getKeyGeneratorClassName(); + assertThat(keyGeneratorClassName, is(ComplexAvroKeyGenerator.class.getName())); + // validate key generator for non partitioned table ObjectPath nonPartitionPath = new ObjectPath(TEST_DEFAULT_DATABASE, "tb"); final ResolvedCatalogTable nonPartitionCatalogTable = new ResolvedCatalogTable( diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieHiveCatalog.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieHiveCatalog.java index 3ee85a46fc46..d88bb0326ef4 100644 --- 
a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieHiveCatalog.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieHiveCatalog.java @@ -29,11 +29,13 @@ import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.exception.HoodieCatalogException; import org.apache.hudi.hadoop.fs.HadoopFSUtils; +import org.apache.hudi.keygen.ComplexAvroKeyGenerator; import org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator; import org.apache.hudi.keygen.SimpleAvroKeyGenerator; import org.apache.hudi.sink.partitioner.profile.WriteProfiles; import org.apache.hudi.util.StreamerUtil; +import org.apache.flink.calcite.shaded.com.google.common.collect.Lists; import org.apache.flink.table.api.DataTypes; import org.apache.flink.table.api.Schema; import org.apache.flink.table.api.TableSchema; @@ -71,6 +73,7 @@ import static org.apache.flink.table.factories.FactoryUtil.CONNECTOR; import static org.apache.hudi.configuration.FlinkOptions.PRECOMBINE_FIELD; +import static org.apache.hudi.keygen.constant.KeyGeneratorOptions.RECORDKEY_FIELD_NAME; import static org.apache.hudi.table.catalog.HoodieCatalogTestUtils.createHiveConf; import static org.hamcrest.CoreMatchers.instanceOf; import static org.hamcrest.CoreMatchers.is; @@ -97,6 +100,26 @@ public class TestHoodieHiveCatalog { .primaryKey("uuid") .build(); List partitions = Collections.singletonList("par1"); + + TableSchema multiKeySinglePartitionTableSchema = + TableSchema.builder() + .field("uuid", DataTypes.INT().notNull()) + .field("name", DataTypes.STRING().notNull()) + .field("age", DataTypes.INT()) + .field("par1", DataTypes.STRING()) + .primaryKey("uuid", "name") + .build(); + + TableSchema singleKeyMultiPartitionTableSchema = + TableSchema.builder() + .field("uuid", DataTypes.INT().notNull()) + .field("name", DataTypes.STRING()) + .field("par1", DataTypes.STRING()) + .field("par2", DataTypes.STRING()) + .primaryKey("uuid") + .build(); + List multiPartitions = Lists.newArrayList("par1", "par2"); + private static HoodieHiveCatalog hoodieCatalog; private final ObjectPath tablePath = new ObjectPath("default", "test"); @@ -201,6 +224,28 @@ public void testCreateAndGetHoodieTable(HoodieTableType tableType) throws Except String keyGeneratorClassName = metaClient.getTableConfig().getKeyGeneratorClassName(); assertEquals(keyGeneratorClassName, SimpleAvroKeyGenerator.class.getName()); + // validate single key and multiple partition for partitioned table + ObjectPath singleKeyMultiPartitionPath = new ObjectPath("default", "tb_skmp_" + System.currentTimeMillis()); + CatalogTable singleKeyMultiPartitionTable = + new CatalogTableImpl(singleKeyMultiPartitionTableSchema, multiPartitions, options, "hudi table"); + hoodieCatalog.createTable(singleKeyMultiPartitionPath, singleKeyMultiPartitionTable, false); + + HoodieTableMetaClient singleKeyMultiPartitionTableMetaClient = + StreamerUtil.createMetaClient(hoodieCatalog.inferTablePath(singleKeyMultiPartitionPath, singleKeyMultiPartitionTable), createHiveConf()); + assertThat(singleKeyMultiPartitionTableMetaClient.getTableConfig().getKeyGeneratorClassName(), is(ComplexAvroKeyGenerator.class.getName())); + + // validate multiple key and single partition for partitioned table + ObjectPath multiKeySinglePartitionPath = new ObjectPath("default", "tb_mksp_" + System.currentTimeMillis()); + + options.remove(RECORDKEY_FIELD_NAME.key()); + CatalogTable multiKeySinglePartitionTable = + new CatalogTableImpl(multiKeySinglePartitionTableSchema, 
partitions, options, "hudi table"); + hoodieCatalog.createTable(multiKeySinglePartitionPath, multiKeySinglePartitionTable, false); + + HoodieTableMetaClient multiKeySinglePartitionTableMetaClient = + StreamerUtil.createMetaClient(hoodieCatalog.inferTablePath(multiKeySinglePartitionPath, multiKeySinglePartitionTable), createHiveConf()); + assertThat(multiKeySinglePartitionTableMetaClient.getTableConfig().getKeyGeneratorClassName(), is(ComplexAvroKeyGenerator.class.getName())); + // validate key generator for non partitioned table ObjectPath nonPartitionPath = new ObjectPath("default", "tb_" + tableType); CatalogTable nonPartitionTable = @@ -370,6 +415,19 @@ public void testDropPartition() throws Exception { assertThrows(NoSuchObjectException.class, () -> getHivePartition(partitionSpec)); } + @Test + public void testMappingHiveConfPropsToHiveTableParams() throws TableAlreadyExistException, DatabaseNotExistException, TableNotExistException { + HoodieHiveCatalog catalog = HoodieCatalogTestUtils.createHiveCatalog("myCatalog", true); + catalog.open(); + Map originOptions = new HashMap<>(); + originOptions.put(FactoryUtil.CONNECTOR.key(), "hudi"); + CatalogTable table = new CatalogTableImpl(schema, originOptions, "hudi table"); + catalog.createTable(tablePath, table, false); + + Table hiveTable = hoodieCatalog.getHiveTable(tablePath); + assertEquals("false", hiveTable.getParameters().get("hadoop.hive.metastore.schema.verification")); + } + private Partition getHivePartition(CatalogPartitionSpec partitionSpec) throws Exception { return hoodieCatalog.getClient().getPartition( tablePath.getDatabaseName(), diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestRowDataToAvroConverters.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestRowDataToAvroConverters.java new file mode 100644 index 000000000000..0ab0626d0345 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestRowDataToAvroConverters.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.utils; + +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.formats.common.TimestampFormat; +import org.apache.flink.formats.json.JsonToRowDataConverters; +import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.core.JsonProcessingException; +import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.RowDataToAvroConverters; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; + +import static org.apache.flink.table.api.DataTypes.ROW; +import static org.apache.flink.table.api.DataTypes.FIELD; +import static org.apache.flink.table.api.DataTypes.TIMESTAMP; + +class TestRowDataToAvroConverters { + + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + @Test + void testRowDataToAvroStringToRowDataWithLocalTimezone() throws JsonProcessingException { + String timestampFromLocal = "2021-03-30 07:44:29"; + + DataType rowDataType = ROW(FIELD("timestamp_from_local", TIMESTAMP())); + JsonToRowDataConverters.JsonToRowDataConverter jsonToRowDataConverter = + new JsonToRowDataConverters(true, true, TimestampFormat.SQL) + .createConverter(rowDataType.getLogicalType()); + Object rowData = jsonToRowDataConverter.convert(new ObjectMapper().readTree("{\"timestamp_from_local\":\"" + timestampFromLocal + "\"}")); + + RowType rowType = (RowType) DataTypes.ROW(DataTypes.FIELD("f_timestamp", DataTypes.TIMESTAMP(3))).getLogicalType(); + RowDataToAvroConverters.RowDataToAvroConverter converter = + RowDataToAvroConverters.createConverter(rowType, false); + GenericRecord avroRecord = + (GenericRecord) converter.convert(AvroSchemaConverter.convertToSchema(rowType), rowData); + Assertions.assertEquals(timestampFromLocal, formatter.format(LocalDateTime.ofInstant(Instant.ofEpochMilli((Long) avroRecord.get(0)), ZoneId.systemDefault()))); + } + + @Test + void testRowDataToAvroStringToRowDataWithUtcTimezone() throws JsonProcessingException { + String timestampFromUtc0 = "2021-03-30 07:44:29"; + + DataType rowDataType = ROW(FIELD("timestamp_from_utc_0", TIMESTAMP())); + JsonToRowDataConverters.JsonToRowDataConverter jsonToRowDataConverter = + new JsonToRowDataConverters(true, true, TimestampFormat.SQL) + .createConverter(rowDataType.getLogicalType()); + Object rowData = jsonToRowDataConverter.convert(new ObjectMapper().readTree("{\"timestamp_from_utc_0\":\"" + timestampFromUtc0 + "\"}")); + + RowType rowType = (RowType) DataTypes.ROW(DataTypes.FIELD("f_timestamp", DataTypes.TIMESTAMP(3))).getLogicalType(); + RowDataToAvroConverters.RowDataToAvroConverter converter = + RowDataToAvroConverters.createConverter(rowType); + GenericRecord avroRecord = + (GenericRecord) converter.convert(AvroSchemaConverter.convertToSchema(rowType), rowData); + Assertions.assertEquals(timestampFromUtc0, formatter.format(LocalDateTime.ofInstant(Instant.ofEpochMilli((Long) avroRecord.get(0)), ZoneId.of("UTC")))); + Assertions.assertEquals("2021-03-30 08:44:29", formatter.format(LocalDateTime.ofInstant(Instant.ofEpochMilli((Long) avroRecord.get(0)), ZoneId.of("UTC+1")))); + Assertions.assertEquals("2021-03-30 15:44:29", 
formatter.format(LocalDateTime.ofInstant(Instant.ofEpochMilli((Long) avroRecord.get(0)), ZoneId.of("Asia/Shanghai")))); + } +} \ No newline at end of file diff --git a/hudi-gcp/pom.xml b/hudi-gcp/pom.xml index 2c308fbf4244..5ffe9e86c731 100644 --- a/hudi-gcp/pom.xml +++ b/hudi-gcp/pom.xml @@ -36,7 +36,7 @@ See https://github.com/GoogleCloudPlatform/cloud-opensource-java/wiki/The-Google com.google.cloud libraries-bom - 26.15.0 + ${gcp-libraries-bom.version} pom import diff --git a/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/HoodieBigQuerySyncClient.java b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/HoodieBigQuerySyncClient.java index 5a23a4079ae2..32430b533291 100644 --- a/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/HoodieBigQuerySyncClient.java +++ b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/HoodieBigQuerySyncClient.java @@ -44,6 +44,7 @@ import com.google.cloud.bigquery.TableId; import com.google.cloud.bigquery.TableInfo; import com.google.cloud.bigquery.ViewDefinition; +import com.google.cloud.bigquery.StandardTableDefinition; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -198,16 +199,22 @@ public void updateTableSchema(String tableName, Schema schema, List part LOG.info("No table update is needed."); return; // No need to update schema. } - ExternalTableDefinition.Builder builder = definition.toBuilder(); - builder.setSchema(finalSchema); - builder.setAutodetect(false); - if (definition.getHivePartitioningOptions() != null) { - builder.setHivePartitioningOptions(definition.getHivePartitioningOptions().toBuilder().setRequirePartitionFilter(requirePartitionFilter).build()); + if (!StringUtils.isNullOrEmpty(bigLakeConnectionId)) { + Table updatedTable = + existingTable.toBuilder().setDefinition(StandardTableDefinition.of(finalSchema)).build(); + updatedTable.update(); + } else { + ExternalTableDefinition.Builder builder = definition.toBuilder(); + builder.setSchema(finalSchema); + builder.setAutodetect(false); + if (definition.getHivePartitioningOptions() != null) { + builder.setHivePartitioningOptions(definition.getHivePartitioningOptions().toBuilder().setRequirePartitionFilter(requirePartitionFilter).build()); + } + Table updatedTable = existingTable.toBuilder() + .setDefinition(builder.build()) + .build(); + bigquery.update(updatedTable); } - Table updatedTable = existingTable.toBuilder() - .setDefinition(builder.build()) - .build(); - bigquery.update(updatedTable); } public void createVersionsTable(String tableName, String sourceUri, String sourceUriPrefix, List partitionFields) { diff --git a/hudi-hadoop-common/src/main/java/org/apache/hudi/hadoop/fs/HadoopFSUtils.java b/hudi-hadoop-common/src/main/java/org/apache/hudi/hadoop/fs/HadoopFSUtils.java index be38dfe8d6d5..d59bffc92172 100644 --- a/hudi-hadoop-common/src/main/java/org/apache/hudi/hadoop/fs/HadoopFSUtils.java +++ b/hudi-hadoop-common/src/main/java/org/apache/hudi/hadoop/fs/HadoopFSUtils.java @@ -22,9 +22,12 @@ import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.storage.StorageConfiguration; +import org.apache.hudi.storage.StoragePath; +import org.apache.hudi.storage.StoragePathInfo; import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.slf4j.Logger; @@ -107,4 +110,48 @@ public static Path addSchemeIfLocalPath(String path) { 
LOG.info("Resolving file " + path + "to be a remote file."); return providedPath; } + + /** + * @param path {@link StoragePath} instance. + * @return the Hadoop {@link Path} instance after conversion. + */ + public static Path convertToHadoopPath(StoragePath path) { + return new Path(path.toUri()); + } + + /** + * @param path Hadoop {@link Path} instance. + * @return the {@link StoragePath} instance after conversion. + */ + public static StoragePath convertToStoragePath(Path path) { + return new StoragePath(path.toUri()); + } + + /** + * @param fileStatus Hadoop {@link FileStatus} instance. + * @return the {@link StoragePathInfo} instance after conversion. + */ + public static StoragePathInfo convertToStoragePathInfo(FileStatus fileStatus) { + return new StoragePathInfo( + convertToStoragePath(fileStatus.getPath()), + fileStatus.getLen(), + fileStatus.isDirectory(), + fileStatus.getReplication(), + fileStatus.getBlockSize(), + fileStatus.getModificationTime()); + } + + /** + * @param pathInfo {@link StoragePathInfo} instance. + * @return the {@link FileStatus} instance after conversion. + */ + public static FileStatus convertToHadoopFileStatus(StoragePathInfo pathInfo) { + return new FileStatus( + pathInfo.getLength(), + pathInfo.isDirectory(), + pathInfo.getBlockReplication(), + pathInfo.getBlockSize(), + pathInfo.getModificationTime(), + convertToHadoopPath(pathInfo.getPath())); + } } diff --git a/hudi-hadoop-common/src/main/java/org/apache/hudi/storage/hadoop/HoodieHadoopStorage.java b/hudi-hadoop-common/src/main/java/org/apache/hudi/storage/hadoop/HoodieHadoopStorage.java index 87d4d9667e63..54c1712be354 100644 --- a/hudi-hadoop-common/src/main/java/org/apache/hudi/storage/hadoop/HoodieHadoopStorage.java +++ b/hudi-hadoop-common/src/main/java/org/apache/hudi/storage/hadoop/HoodieHadoopStorage.java @@ -19,12 +19,12 @@ package org.apache.hudi.storage.hadoop; +import org.apache.hudi.hadoop.fs.HadoopFSUtils; import org.apache.hudi.storage.HoodieStorage; import org.apache.hudi.storage.StoragePath; import org.apache.hudi.storage.StoragePathFilter; import org.apache.hudi.storage.StoragePathInfo; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; @@ -39,6 +39,10 @@ import java.util.List; import java.util.stream.Collectors; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToHadoopPath; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToStoragePath; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToStoragePathInfo; + /** * Implementation of {@link HoodieStorage} using Hadoop's {@link FileSystem} */ @@ -92,7 +96,7 @@ public boolean createDirectory(StoragePath path) throws IOException { @Override public List listDirectEntries(StoragePath path) throws IOException { return Arrays.stream(fs.listStatus(convertToHadoopPath(path))) - .map(this::convertToStoragePathInfo) + .map(HadoopFSUtils::convertToStoragePathInfo) .collect(Collectors.toList()); } @@ -109,9 +113,9 @@ public List listFiles(StoragePath path) throws IOException { @Override public List listDirectEntries(List pathList) throws IOException { return Arrays.stream(fs.listStatus(pathList.stream() - .map(this::convertToHadoopPath) + .map(HadoopFSUtils::convertToHadoopPath) .toArray(Path[]::new))) - .map(this::convertToStoragePathInfo) + .map(HadoopFSUtils::convertToStoragePathInfo) .collect(Collectors.toList()); } @@ -122,7 +126,7 @@ public List listDirectEntries(StoragePath path, return 
Arrays.stream(fs.listStatus( convertToHadoopPath(path), e -> filter.accept(convertToStoragePath(e)))) - .map(this::convertToStoragePathInfo) + .map(HadoopFSUtils::convertToStoragePathInfo) .collect(Collectors.toList()); } @@ -130,7 +134,7 @@ public List listDirectEntries(StoragePath path, public List globEntries(StoragePath pathPattern) throws IOException { return Arrays.stream(fs.globStatus(convertToHadoopPath(pathPattern))) - .map(this::convertToStoragePathInfo) + .map(HadoopFSUtils::convertToStoragePathInfo) .collect(Collectors.toList()); } @@ -139,7 +143,7 @@ public List globEntries(StoragePath pathPattern, StoragePathFil throws IOException { return Arrays.stream(fs.globStatus(convertToHadoopPath(pathPattern), path -> filter.accept(convertToStoragePath(path)))) - .map(this::convertToStoragePathInfo) + .map(HadoopFSUtils::convertToStoragePathInfo) .collect(Collectors.toList()); } @@ -184,22 +188,6 @@ public boolean createNewFile(StoragePath path) throws IOException { return fs.createNewFile(convertToHadoopPath(path)); } - private Path convertToHadoopPath(StoragePath loc) { - return new Path(loc.toUri()); - } - - private StoragePath convertToStoragePath(Path path) { - return new StoragePath(path.toUri()); - } - - private StoragePathInfo convertToStoragePathInfo(FileStatus fileStatus) { - return new StoragePathInfo( - convertToStoragePath(fileStatus.getPath()), - fileStatus.getLen(), - fileStatus.isDirectory(), - fileStatus.getModificationTime()); - } - @Override public void close() throws IOException { fs.close(); diff --git a/hudi-hadoop-common/src/test/java/org/apache/hudi/hadoop/fs/TestHadoopFSUtils.java b/hudi-hadoop-common/src/test/java/org/apache/hudi/hadoop/fs/TestHadoopFSUtils.java new file mode 100644 index 000000000000..7768ff4feae7 --- /dev/null +++ b/hudi-hadoop-common/src/test/java/org/apache/hudi/hadoop/fs/TestHadoopFSUtils.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.hadoop.fs; + +import org.apache.hudi.storage.StoragePath; +import org.apache.hudi.storage.StoragePathInfo; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; +import org.junit.jupiter.params.provider.ValueSource; + +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToHadoopFileStatus; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToHadoopPath; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToStoragePath; +import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToStoragePathInfo; +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Tests {@link HadoopFSUtils} + */ +public class TestHadoopFSUtils { + @ParameterizedTest + @ValueSource(strings = { + "/a/b/c", + "s3://bucket/partition=1%2F2%2F3", + "hdfs://x/y/z.file#bar" + }) + public void testPathConversion(String pathString) { + // Hadoop Path -> StoragePath -> Hadoop Path + Path path = new Path(pathString); + StoragePath storagePath = convertToStoragePath(path); + Path convertedPath = convertToHadoopPath(storagePath); + assertEquals(path.toUri(), storagePath.toUri()); + assertEquals(path, convertedPath); + + // StoragePath -> Hadoop Path -> StoragePath + storagePath = new StoragePath(pathString); + path = convertToHadoopPath(storagePath); + StoragePath convertedStoragePath = convertToStoragePath(path); + assertEquals(storagePath.toUri(), path.toUri()); + assertEquals(storagePath, convertedStoragePath); + } + + @ParameterizedTest + @CsvSource({ + "/a/b/c,1000,false,1,1000000,1238493920", + "/x/y/z,0,true,2,0,2002403203" + }) + public void testFileStatusConversion(String path, + long length, + boolean isDirectory, + short blockReplication, + long blockSize, + long modificationTime) { + // FileStatus -> StoragePathInfo -> FileStatus + FileStatus fileStatus = new FileStatus( + length, isDirectory, blockReplication, blockSize, modificationTime, new Path(path)); + StoragePathInfo pathInfo = convertToStoragePathInfo(fileStatus); + assertStoragePathInfo( + pathInfo, path, length, isDirectory, blockReplication, blockSize, modificationTime); + FileStatus convertedFileStatus = convertToHadoopFileStatus(pathInfo); + assertFileStatus( + convertedFileStatus, path, length, isDirectory, blockReplication, blockSize, modificationTime); + + // StoragePathInfo -> FileStatus -> StoragePathInfo + pathInfo = new StoragePathInfo( + new StoragePath(path), length, isDirectory, blockReplication, blockSize, modificationTime); + fileStatus = convertToHadoopFileStatus(pathInfo); + assertFileStatus( + fileStatus, path, length, isDirectory, blockReplication, blockSize, modificationTime); + StoragePathInfo convertedPathInfo = convertToStoragePathInfo(fileStatus); + assertStoragePathInfo( + convertedPathInfo, path, length, isDirectory, blockReplication, blockSize, modificationTime); + } + + private void assertFileStatus(FileStatus fileStatus, + String path, + long length, + boolean isDirectory, + short blockReplication, + long blockSize, + long modificationTime) { + assertEquals(new Path(path), fileStatus.getPath()); + assertEquals(length, fileStatus.getLen()); + assertEquals(isDirectory, fileStatus.isDirectory()); + assertEquals(!isDirectory, fileStatus.isFile()); + assertEquals(blockReplication, fileStatus.getReplication()); + assertEquals(blockSize, fileStatus.getBlockSize()); + assertEquals(modificationTime, fileStatus.getModificationTime()); 
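For context on the conversion helpers introduced in HadoopFSUtils above, here is a minimal, self-contained usage sketch; it is not part of the patch, and the sample path and file attributes are made up for illustration only.

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.Path;
    import org.apache.hudi.hadoop.fs.HadoopFSUtils;
    import org.apache.hudi.storage.StoragePath;
    import org.apache.hudi.storage.StoragePathInfo;

    public class PathConversionExample {
      public static void main(String[] args) {
        // Hadoop Path <-> Hudi StoragePath: both wrap the same URI, so the round trip is lossless.
        Path hadoopPath = new Path("/tmp/hudi/table/par1/f1.parquet");
        StoragePath storagePath = HadoopFSUtils.convertToStoragePath(hadoopPath);
        Path roundTripped = HadoopFSUtils.convertToHadoopPath(storagePath);

        // FileStatus <-> StoragePathInfo: length, directory flag, block replication,
        // block size, and modification time are carried over by the converters.
        FileStatus status = new FileStatus(1024L, false, (short) 3, 128 * 1024 * 1024L, 0L, hadoopPath);
        StoragePathInfo pathInfo = HadoopFSUtils.convertToStoragePathInfo(status);
        FileStatus converted = HadoopFSUtils.convertToHadoopFileStatus(pathInfo);

        System.out.println(roundTripped + ", length=" + pathInfo.getLength()
            + ", replication=" + converted.getReplication());
      }
    }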
+ } + + private void assertStoragePathInfo(StoragePathInfo pathInfo, + String path, + long length, + boolean isDirectory, + short blockReplication, + long blockSize, + long modificationTime) { + assertEquals(new StoragePath(path), pathInfo.getPath()); + assertEquals(length, pathInfo.getLength()); + assertEquals(isDirectory, pathInfo.isDirectory()); + assertEquals(!isDirectory, pathInfo.isFile()); + assertEquals(blockReplication, pathInfo.getBlockReplication()); + assertEquals(blockSize, pathInfo.getBlockSize()); + assertEquals(modificationTime, pathInfo.getModificationTime()); + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieMergeOnReadTableInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieMergeOnReadTableInputFormat.java index 0cfe0d0a1940..76209422fe9c 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieMergeOnReadTableInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieMergeOnReadTableInputFormat.java @@ -53,6 +53,7 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.SplitLocationInfo; import org.apache.hadoop.mapreduce.Job; +import org.apache.hudi.metadata.HoodieTableMetadataUtil; import java.io.IOException; import java.util.ArrayList; @@ -192,7 +193,7 @@ protected List listStatusForIncrementalMode(JobConf job, // build fileGroup from fsView Path basePath = new Path(tableMetaClient.getBasePath()); // filter affectedPartition by inputPaths - List affectedPartition = HoodieInputFormatUtils.getWritePartitionPaths(metadataList).stream() + List affectedPartition = HoodieTableMetadataUtil.getWritePartitionPaths(metadataList).stream() .filter(k -> k.isEmpty() ? inputPaths.contains(basePath) : inputPaths.contains(new Path(basePath, k))).collect(Collectors.toList()); if (affectedPartition.isEmpty()) { return result; diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HiveAvroSerializer.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HiveAvroSerializer.java index a0d1b086e035..22116283d121 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HiveAvroSerializer.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HiveAvroSerializer.java @@ -99,7 +99,7 @@ public GenericRecord serialize(Object o, Schema schema) { List allStructFieldRefs = soi.getAllStructFieldRefs(); List structFieldsDataAsList = soi.getStructFieldsDataAsList(o); - for (int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { Schema.Field field = schema.getFields().get(i); if (i >= columnTypes.size()) { break; @@ -134,7 +134,7 @@ private void setUpRecordFieldFromWritable(TypeInfo typeInfo, Object structFieldD * Determine if an Avro schema is of type Union[T, NULL]. Avro supports nullable * types via a union of type T and null. This is a very common use case. * As such, we want to silently convert it to just T and allow the value to be null. - * + *

    * When a Hive union type is used with AVRO, the schema type becomes * Union[NULL, T1, T2, ...]. The NULL in the union should be silently removed * @@ -266,7 +266,7 @@ private Object serializeStruct(StructTypeInfo typeInfo, StructObjectInspector ss GenericData.Record record = new GenericData.Record(schema); ArrayList allStructFieldTypeInfos = typeInfo.getAllStructFieldTypeInfos(); - for (int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { Schema.Field field = schema.getFields().get(i); setUpRecordFieldFromWritable(allStructFieldTypeInfos.get(i), structFieldsDataAsList.get(i), allStructFieldRefs.get(i).getFieldObjectInspector(), record, field); @@ -278,26 +278,30 @@ private Object serializePrimitive(PrimitiveObjectInspector fieldOI, Object struc switch (fieldOI.getPrimitiveCategory()) { case BINARY: if (schema.getType() == Schema.Type.BYTES) { - return AvroSerdeUtils.getBufferFromBytes((byte[])fieldOI.getPrimitiveJavaObject(structFieldData)); + return AvroSerdeUtils.getBufferFromBytes((byte[]) fieldOI.getPrimitiveJavaObject(structFieldData)); } else if (schema.getType() == Schema.Type.FIXED) { - GenericData.Fixed fixed = new GenericData.Fixed(schema, (byte[])fieldOI.getPrimitiveJavaObject(structFieldData)); + GenericData.Fixed fixed = new GenericData.Fixed(schema, (byte[]) fieldOI.getPrimitiveJavaObject(structFieldData)); return fixed; } else { throw new HoodieException("Unexpected Avro schema for Binary TypeInfo: " + schema.getType()); } case DECIMAL: - HiveDecimal dec = (HiveDecimal)fieldOI.getPrimitiveJavaObject(structFieldData); - LogicalTypes.Decimal decimal = (LogicalTypes.Decimal)schema.getLogicalType(); + HiveDecimal dec = (HiveDecimal) fieldOI.getPrimitiveJavaObject(structFieldData); + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) schema.getLogicalType(); BigDecimal bd = new BigDecimal(dec.toString()).setScale(decimal.getScale()); - return HoodieAvroUtils.DECIMAL_CONVERSION.toFixed(bd, schema, decimal); + if (schema.getType() == Schema.Type.BYTES) { + return HoodieAvroUtils.DECIMAL_CONVERSION.toBytes(bd, schema, decimal); + } else { + return HoodieAvroUtils.DECIMAL_CONVERSION.toFixed(bd, schema, decimal); + } case CHAR: - HiveChar ch = (HiveChar)fieldOI.getPrimitiveJavaObject(structFieldData); + HiveChar ch = (HiveChar) fieldOI.getPrimitiveJavaObject(structFieldData); return new Utf8(ch.getStrippedValue()); case VARCHAR: - HiveVarchar vc = (HiveVarchar)fieldOI.getPrimitiveJavaObject(structFieldData); + HiveVarchar vc = (HiveVarchar) fieldOI.getPrimitiveJavaObject(structFieldData); return new Utf8(vc.getValue()); case STRING: - String string = (String)fieldOI.getPrimitiveJavaObject(structFieldData); + String string = (String) fieldOI.getPrimitiveJavaObject(structFieldData); return new Utf8(string); case DATE: return HoodieHiveUtils.getDays(structFieldData); @@ -364,7 +368,7 @@ private Object serializeMap(MapTypeInfo typeInfo, MapObjectInspector fieldOI, Ob ObjectInspector mapValueObjectInspector = fieldOI.getMapValueObjectInspector(); TypeInfo mapKeyTypeInfo = typeInfo.getMapKeyTypeInfo(); TypeInfo mapValueTypeInfo = typeInfo.getMapValueTypeInfo(); - Map map = fieldOI.getMap(structFieldData); + Map map = fieldOI.getMap(structFieldData); Schema valueType = schema.getValueType(); Map deserialized = new LinkedHashMap(fieldOI.getMapSize(structFieldData)); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java index 
8922b837871f..4ab72701a11a 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java @@ -514,19 +514,6 @@ public static FileStatus[] listAffectedFilesForCommits(Configuration hadoopConf, return fullPathToFileStatus.values().toArray(new FileStatus[0]); } - /** - * Returns all the incremental write partition paths as a set with the given commits metadata. - * - * @param metadataList The commits metadata - * @return the partition path set - */ - public static Set getWritePartitionPaths(List metadataList) { - return metadataList.stream() - .map(HoodieCommitMetadata::getWritePartitionPaths) - .flatMap(Collection::stream) - .collect(Collectors.toSet()); - } - public static HoodieRealtimeFileSplit createRealtimeFileSplit(HoodieRealtimePath path, long start, long length, String[] hosts) { try { return new HoodieRealtimeFileSplit(new FileSplit(path, start, length, hosts), path); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java index 275ab36b82a0..6d98f0c8f52e 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java @@ -27,7 +27,6 @@ import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; -import org.apache.avro.AvroRuntimeException; import org.apache.avro.JsonProperties; import org.apache.avro.LogicalType; import org.apache.avro.LogicalTypes; @@ -168,6 +167,9 @@ public static Writable avroToArrayWritable(Object value, Schema schema, boolean case STRING: return new Text(value.toString()); case BYTES: + if (schema.getLogicalType() != null && schema.getLogicalType().getName().equals("decimal")) { + return toHiveDecimalWritable(((ByteBuffer) value).array(), schema); + } return new BytesWritable(((ByteBuffer) value).array()); case INT: if (schema.getLogicalType() != null && schema.getLogicalType().getName().equals("date")) { @@ -198,12 +200,13 @@ public static Writable avroToArrayWritable(Object value, Schema schema, boolean Writable[] recordValues = new Writable[schema.getFields().size()]; int recordValueIndex = 0; for (Schema.Field field : schema.getFields()) { - // TODO Revisit Avro exception handling in future Object fieldValue = null; - try { + if (record.getSchema().getField(field.name()) != null) { fieldValue = record.get(field.name()); - } catch (AvroRuntimeException e) { - LOG.debug("Field:" + field.name() + "not found in Schema:" + schema); + } else { + if (LOG.isDebugEnabled()) { + LOG.debug("Field:" + field.name() + "not found in Schema:" + schema); + } } recordValues[recordValueIndex++] = avroToArrayWritable(fieldValue, field.schema(), supportTimestamp); } @@ -248,11 +251,7 @@ public static Writable avroToArrayWritable(Object value, Schema schema, boolean } case FIXED: if (schema.getLogicalType() != null && schema.getLogicalType().getName().equals("decimal")) { - LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) LogicalTypes.fromSchema(schema); - HiveDecimalWritable writable = new HiveDecimalWritable(((GenericFixed) value).bytes(), - decimal.getScale()); - return HiveDecimalUtils.enforcePrecisionScale(writable, - new DecimalTypeInfo(decimal.getPrecision(), decimal.getScale())); + return 
toHiveDecimalWritable(((GenericFixed) value).bytes(), schema); } return new BytesWritable(((GenericFixed) value).bytes()); default: @@ -319,4 +318,11 @@ private static Schema appendNullSchemaFields(Schema schema, List newFiel } return appendFieldsToSchema(schema, newFields); } + + private static HiveDecimalWritable toHiveDecimalWritable(byte[] bytes, Schema schema) { + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) LogicalTypes.fromSchema(schema); + HiveDecimalWritable writable = new HiveDecimalWritable(bytes, decimal.getScale()); + return HiveDecimalUtils.enforcePrecisionScale(writable, + new DecimalTypeInfo(decimal.getPrecision(), decimal.getScale())); + } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/streaming/StructuredStreamingSinkUtil.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/streaming/StructuredStreamingSinkUtil.java new file mode 100644 index 000000000000..f6fec62cb3b2 --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/streaming/StructuredStreamingSinkUtil.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.streaming; + +import org.apache.hudi.exception.HoodieException; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.Map; + +/** + * Spark-submit utility to test Spark structured streaming writes to Hudi. + * + * Sample command. + * ./bin/spark-submit --master local[2] --driver-memory 1g --executor-memory 1g \ + * --class org.apache.hudi.integ.testsuite.streaming.StructuredStreamingSinkUtil PATH TO hudi-integ-test-bundle-0.13.0-SNAPSHOT.jar \ + * --spark-master local[2] \ + * --source-path /tmp/parquet_ny/ \ + * --target-path /tmp/hudi_streaming_kafka10/MERGE_ON_READ3/ \ + * --checkpoint-path /tmp/hudi_streaming_kafka10/checkpoint_mor3/ \ + * --table-type COPY_ON_WRITE \ + * --partition-field date_col \ + * --record-key-field tpep_pickup_datetime \ + * --pre-combine-field tpep_dropoff_datetime \ + * --table-name test_tbl + * + * Ensure "source-path" has parquet data. 
+ */ +public class StructuredStreamingSinkUtil implements Serializable { + + private static final Logger LOG = LoggerFactory.getLogger(StructuredStreamingSinkUtil.class); + + private transient JavaSparkContext jsc; + private SparkSession sparkSession; + private Config cfg; + + public StructuredStreamingSinkUtil(JavaSparkContext jsc, Config cfg) { + this.jsc = jsc; + this.sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate(); + this.cfg = cfg; + } + + public static class Config implements Serializable { + @Parameter(names = {"--source-path", "-sp"}, description = "Source path to consume data from", required = true) + public String sourcePath = null; + + @Parameter(names = {"--target-path", "-tp"}, description = "Target path of the table of interest.", required = true) + public String targetPath = null; + + @Parameter(names = {"--table-type", "-ty"}, description = "Table type of the table of interest.", required = true) + public String tableType = "COPY_ON_WRITE"; + + @Parameter(names = {"--checkpoint-path", "-cp"}, description = "Checkpoint path of the table of interest", required = true) + public String checkpointPath = null; + + @Parameter(names = {"--partition-field", "-pp"}, description = "Partitioning field", required = true) + public String partitionField = null; + + @Parameter(names = {"--record-key-field", "-rk"}, description = "Record key field", required = true) + public String recordKeyField = null; + + @Parameter(names = {"--pre-combine-field", "-pc"}, description = "Precombine field", required = true) + public String preCombineField = null; + + @Parameter(names = {"--table-name", "-tn"}, description = "Table name", required = true) + public String tableName = null; + + @Parameter(names = {"--disable-metadata", "-dmdt"}, description = "Disable metadata while querying", required = false) + public Boolean disableMetadata = false; + + @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false) + public String sparkMaster = null; + + @Parameter(names = {"--spark-memory", "-sm"}, description = "Spark memory to use", required = false) + public String sparkMemory = "1g"; + + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + + } + + public static void main(String[] args) { + final Config cfg = new Config(); + JCommander cmd = new JCommander(cfg, null, args); + + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); + } + + SparkConf sparkConf = buildSparkConf("Spark-structured-streaming-test", cfg.sparkMaster); + sparkConf.set("spark.executor.memory", cfg.sparkMemory); + JavaSparkContext jsc = new JavaSparkContext(sparkConf); + + try { + StructuredStreamingSinkUtil streamingSinkUtil = new StructuredStreamingSinkUtil(jsc, cfg); + streamingSinkUtil.run(); + } catch (Throwable throwable) { + LOG.error("Failed to run the Spark structured streaming sink test for " + cfg, throwable); + } finally { + jsc.stop(); + } + } + + public void run() { + try { + LOG.info(cfg.toString()); + StructuredStreamingSinkTestWriter.triggerStreaming(sparkSession, cfg.tableType, cfg.sourcePath, cfg.targetPath, cfg.checkpointPath, + cfg.tableName, cfg.partitionField, cfg.recordKeyField, cfg.preCombineField); + StructuredStreamingSinkTestWriter.waitUntilCondition(1000 * 60 * 10, 1000 * 30); + } catch (Exception e) { + throw new HoodieException("Unable to test spark structured writes to hudi " + cfg.targetPath, e); + } finally { + LOG.warn("Completing Spark Structured Streaming test"); + } + } + + public static SparkConf 
buildSparkConf(String appName, String defaultMaster) { + return buildSparkConf(appName, defaultMaster, new HashMap<>()); + } + + private static SparkConf buildSparkConf(String appName, String defaultMaster, Map additionalConfigs) { + final SparkConf sparkConf = new SparkConf().setAppName(appName); + String master = sparkConf.get("spark.master", defaultMaster); + sparkConf.setMaster(master); + if (master.startsWith("yarn")) { + sparkConf.set("spark.eventLog.overwrite", "true"); + sparkConf.set("spark.eventLog.enabled", "true"); + } + sparkConf.set("spark.ui.port", "8090"); + sparkConf.setIfMissing("spark.driver.maxResultSize", "2g"); + sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + sparkConf.set("spark.kryo.registrator", "org.apache.spark.HoodieSparkKryoRegistrar"); + sparkConf.set("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension"); + sparkConf.set("spark.hadoop.mapred.output.compress", "true"); + sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); + sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); + sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK"); + + additionalConfigs.forEach(sparkConf::set); + return sparkConf; + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/streaming/StructuredStreamingSinkTestWriter.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/streaming/StructuredStreamingSinkTestWriter.scala new file mode 100644 index 000000000000..8eb3b469e938 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/streaming/StructuredStreamingSinkTestWriter.scala @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.streaming + +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.config.HoodieWriteConfig.FAIL_ON_TIMELINE_ARCHIVING_ENABLE +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent} +import org.apache.spark.sql.streaming.{OutputMode, StreamingQueryListener, Trigger} +import org.apache.log4j.LogManager + +object StructuredStreamingSinkTestWriter { + + private val log = LogManager.getLogger(getClass) + var validationComplete: Boolean = false; + + def waitUntilCondition(): Unit = { + waitUntilCondition(1000 * 60 * 5, 500) + } + + def waitUntilCondition(maxWaitTimeMs: Long, intervalTimeMs: Long): Unit = { + var waitSoFar: Long = 0; + while (waitSoFar < maxWaitTimeMs && !validationComplete) { + log.info("Waiting for " + intervalTimeMs + ". 
Total wait time " + waitSoFar) + Thread.sleep(intervalTimeMs) + waitSoFar += intervalTimeMs + } + } + + def triggerStreaming(spark: SparkSession, tableType: String, inputPath: String, hudiPath: String, hudiCheckpointPath: String, + tableName: String, partitionPathField: String, recordKeyField: String, + preCombineField: String): Unit = { + + def validate(): Unit = { + log.info("Validation starting") + val inputDf = spark.read.format("parquet").load(inputPath) + val hudiDf = spark.read.format("hudi").load(hudiPath) + inputDf.registerTempTable("inputTbl") + hudiDf.registerTempTable("hudiTbl") + assert(spark.sql("select count(distinct " + partitionPathField + ", " + recordKeyField + ") from inputTbl").count == + spark.sql("select count(distinct " + partitionPathField + ", " + recordKeyField + ") from hudiTbl").count) + validationComplete = true + log.info("Validation complete") + } + + def shutdownListener(spark: SparkSession) = new StreamingQueryListener() { + override def onQueryStarted(queryStarted: QueryStartedEvent): Unit = { + log.info("Query started: " + queryStarted.id) + } + + override def onQueryTerminated(queryTerminated: QueryTerminatedEvent): Unit = { + log.info("Query terminated! " + queryTerminated.id + ". Validating input and hudi") + validate() + log.info("Data Validation complete") + } + + override def onQueryProgress(queryProgressEvent: QueryProgressEvent): Unit = { + if (queryProgressEvent.progress.numInputRows == 0) { + log.info("Stopping spark structured streaming as we have reached the end") + spark.streams.active.foreach(_.stop()) + } + } + } + + spark.streams.addListener(shutdownListener(spark)) + log.info("Starting to consume from source and writing to hudi ") + + val inputDfSchema = spark.read.format("parquet").load(inputPath).schema + val parquetdf = spark.readStream.option("spark.sql.streaming.schemaInference", "true").option("maxFilesPerTrigger", "1") + .schema(inputDfSchema).parquet(inputPath) + + val writer = parquetdf.writeStream.format("org.apache.hudi"). + option(TABLE_TYPE.key, tableType). + option(PRECOMBINE_FIELD.key, preCombineField). + option(RECORDKEY_FIELD.key, recordKeyField). + option(PARTITIONPATH_FIELD.key, partitionPathField). + option(FAIL_ON_TIMELINE_ARCHIVING_ENABLE.key, false). + option(STREAMING_IGNORE_FAILED_BATCH.key, false). + option(STREAMING_RETRY_CNT.key, 0). + option("hoodie.table.name", tableName). + option("hoodie.compact.inline.max.delta.commits", "2"). + option("checkpointLocation", hudiCheckpointPath). 
+ outputMode(OutputMode.Append()); + + writer.trigger(Trigger.ProcessingTime(30000)).start(hudiPath); + } +} diff --git a/hudi-io/src/main/java/org/apache/hudi/storage/StoragePathInfo.java b/hudi-io/src/main/java/org/apache/hudi/storage/StoragePathInfo.java index b4ec8194b4de..e4711bf72dd0 100644 --- a/hudi-io/src/main/java/org/apache/hudi/storage/StoragePathInfo.java +++ b/hudi-io/src/main/java/org/apache/hudi/storage/StoragePathInfo.java @@ -35,15 +35,21 @@ public class StoragePathInfo implements Serializable { private final StoragePath path; private final long length; private final boolean isDirectory; + private final short blockReplication; + private final long blockSize; private final long modificationTime; public StoragePathInfo(StoragePath path, long length, boolean isDirectory, + short blockReplication, + long blockSize, long modificationTime) { this.path = path; this.length = length; this.isDirectory = isDirectory; + this.blockReplication = blockReplication; + this.blockSize = blockSize; this.modificationTime = modificationTime; } @@ -79,6 +85,22 @@ public boolean isDirectory() { return isDirectory; } + /** + * @return the block replication if applied. + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public short getBlockReplication() { + return blockReplication; + } + + /** + * @return the block size in bytes if applied. + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + public long getBlockSize() { + return blockSize; + } + /** * @return the modification of a file. */ @@ -114,6 +136,8 @@ public String toString() { + "path=" + path + ", length=" + length + ", isDirectory=" + isDirectory + + ", blockReplication=" + blockReplication + + ", blockSize=" + blockSize + ", modificationTime=" + modificationTime + '}'; } diff --git a/hudi-io/src/main/java/org/apache/hudi/storage/StorageSchemes.java b/hudi-io/src/main/java/org/apache/hudi/storage/StorageSchemes.java index 74d126d7f07a..00e8594a0c83 100644 --- a/hudi-io/src/main/java/org/apache/hudi/storage/StorageSchemes.java +++ b/hudi-io/src/main/java/org/apache/hudi/storage/StorageSchemes.java @@ -69,6 +69,8 @@ public enum StorageSchemes { OBS("obs", null, null), // Kingsoft Standard Storage ks3 KS3("ks3", null, null), + // Netease Object Storage nos + NOS("nos", null, null), // JuiceFileSystem JFS("jfs", null, null), // Baidu Object Storage diff --git a/hudi-io/src/test/java/org/apache/hudi/io/storage/TestHoodieStorageBase.java b/hudi-io/src/test/java/org/apache/hudi/io/storage/TestHoodieStorageBase.java index a6a0efee6dc0..460c831e1c08 100644 --- a/hudi-io/src/test/java/org/apache/hudi/io/storage/TestHoodieStorageBase.java +++ b/hudi-io/src/test/java/org/apache/hudi/io/storage/TestHoodieStorageBase.java @@ -164,37 +164,37 @@ public void testListing() throws IOException { validatePathInfoList( Arrays.stream(new StoragePathInfo[] { - new StoragePathInfo(new StoragePath(getTempDir(), "x/1.file"), 0, false, 0), - new StoragePathInfo(new StoragePath(getTempDir(), "x/2.file"), 0, false, 0), - new StoragePathInfo(new StoragePath(getTempDir(), "x/y"), 0, true, 0), - new StoragePathInfo(new StoragePath(getTempDir(), "x/z"), 0, true, 0), + getStoragePathInfo("x/1.file", false), + getStoragePathInfo("x/2.file", false), + getStoragePathInfo("x/y", true), + getStoragePathInfo("x/z", true) }).collect(Collectors.toList()), storage.listDirectEntries(new StoragePath(getTempDir(), "x"))); validatePathInfoList( Arrays.stream(new StoragePathInfo[] { - new StoragePathInfo(new StoragePath(getTempDir(), "x/1.file"), 0, false, 0), 
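As an aside on the widened StoragePathInfo constructor exercised in the tests below, here is a small illustrative sketch; it is not part of the patch and all values are hypothetical.

    import org.apache.hudi.storage.StoragePath;
    import org.apache.hudi.storage.StoragePathInfo;

    public class StoragePathInfoExample {
      public static void main(String[] args) {
        // The constructor now takes block replication and block size in addition
        // to path, length, directory flag, and modification time.
        StoragePathInfo pathInfo = new StoragePathInfo(
            new StoragePath("/tmp/hudi/table/par1/f1.parquet"),
            2048L,                 // length in bytes
            false,                 // not a directory
            (short) 2,             // block replication
            64 * 1024 * 1024L,     // block size in bytes
            System.currentTimeMillis());

        System.out.println(pathInfo.getBlockReplication() + ", " + pathInfo.getBlockSize());
      }
    }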
- new StoragePathInfo(new StoragePath(getTempDir(), "x/2.file"), 0, false, 0), - new StoragePathInfo(new StoragePath(getTempDir(), "x/y/1.file"), 0, false, 0), - new StoragePathInfo(new StoragePath(getTempDir(), "x/y/2.file"), 0, false, 0), - new StoragePathInfo(new StoragePath(getTempDir(), "x/z/1.file"), 0, false, 0), - new StoragePathInfo(new StoragePath(getTempDir(), "x/z/2.file"), 0, false, 0) + getStoragePathInfo("x/1.file", false), + getStoragePathInfo("x/2.file", false), + getStoragePathInfo("x/y/1.file", false), + getStoragePathInfo("x/y/2.file", false), + getStoragePathInfo("x/z/1.file", false), + getStoragePathInfo("x/z/2.file", false) }).collect(Collectors.toList()), storage.listFiles(new StoragePath(getTempDir(), "x"))); validatePathInfoList( Arrays.stream(new StoragePathInfo[] { - new StoragePathInfo(new StoragePath(getTempDir(), "x/2.file"), 0, false, 0) + getStoragePathInfo("x/2.file", false) }).collect(Collectors.toList()), storage.listDirectEntries( new StoragePath(getTempDir(), "x"), e -> e.getName().contains("2"))); validatePathInfoList( Arrays.stream(new StoragePathInfo[] { - new StoragePathInfo(new StoragePath(getTempDir(), "w/1.file"), 0, false, 0), - new StoragePathInfo(new StoragePath(getTempDir(), "w/2.file"), 0, false, 0), - new StoragePathInfo(new StoragePath(getTempDir(), "x/z/1.file"), 0, false, 0), - new StoragePathInfo(new StoragePath(getTempDir(), "x/z/2.file"), 0, false, 0) + getStoragePathInfo("w/1.file", false), + getStoragePathInfo("w/2.file", false), + getStoragePathInfo("x/z/1.file", false), + getStoragePathInfo("x/z/2.file", false) }).collect(Collectors.toList()), storage.listDirectEntries(Arrays.stream(new StoragePath[] { new StoragePath(getTempDir(), "w"), @@ -206,21 +206,21 @@ public void testListing() throws IOException { validatePathInfoList( Arrays.stream(new StoragePathInfo[] { - new StoragePathInfo(new StoragePath(getTempDir(), "x/y/1.file"), 0, false, 0), - new StoragePathInfo(new StoragePath(getTempDir(), "x/z/1.file"), 0, false, 0) + getStoragePathInfo("x/y/1.file", false), + getStoragePathInfo("x/z/1.file", false) }).collect(Collectors.toList()), storage.globEntries(new StoragePath(getTempDir(), "x/*/1.file"))); validatePathInfoList( Arrays.stream(new StoragePathInfo[] { - new StoragePathInfo(new StoragePath(getTempDir(), "x/1.file"), 0, false, 0), - new StoragePathInfo(new StoragePath(getTempDir(), "x/2.file"), 0, false, 0), + getStoragePathInfo("x/1.file", false), + getStoragePathInfo("x/2.file", false) }).collect(Collectors.toList()), storage.globEntries(new StoragePath(getTempDir(), "x/*.file"))); validatePathInfoList( Arrays.stream(new StoragePathInfo[] { - new StoragePathInfo(new StoragePath(getTempDir(), "x/y/1.file"), 0, false, 0), + getStoragePathInfo("x/y/1.file", false) }).collect(Collectors.toList()), storage.globEntries( new StoragePath(getTempDir(), "x/*/*.file"), @@ -319,6 +319,11 @@ private HoodieStorage getHoodieStorage() { return getHoodieStorage(getFileSystem(conf), conf); } + private StoragePathInfo getStoragePathInfo(String subPath, boolean isDirectory) { + return new StoragePathInfo(new StoragePath(getTempDir(), subPath), + 0, isDirectory, (short) 1, 1000000L, 10L); + } + private void validatePathInfo(HoodieStorage storage, StoragePath path, byte[] data, diff --git a/hudi-io/src/test/java/org/apache/hudi/io/storage/TestStoragePathInfo.java b/hudi-io/src/test/java/org/apache/hudi/io/storage/TestStoragePathInfo.java index 1d92fa075d0f..72640c5e3df5 100644 --- 
a/hudi-io/src/test/java/org/apache/hudi/io/storage/TestStoragePathInfo.java +++ b/hudi-io/src/test/java/org/apache/hudi/io/storage/TestStoragePathInfo.java @@ -41,6 +41,8 @@ public class TestStoragePathInfo { private static final Logger LOG = LoggerFactory.getLogger(TestStoragePathInfo.class); private static final long LENGTH = 100; + private static final short BLOCK_REPLICATION = 1; + private static final long BLOCK_SIZE = 1000000L; private static final long MODIFICATION_TIME = System.currentTimeMillis(); private static final String PATH1 = "/abc/xyz1"; private static final String PATH2 = "/abc/xyz2"; @@ -49,15 +51,15 @@ public class TestStoragePathInfo { @Test public void testConstructor() { - StoragePathInfo pathInfo = new StoragePathInfo(STORAGE_PATH1, LENGTH, false, MODIFICATION_TIME); + StoragePathInfo pathInfo = new StoragePathInfo(STORAGE_PATH1, LENGTH, false, BLOCK_REPLICATION, BLOCK_SIZE, MODIFICATION_TIME); validateAccessors(pathInfo, PATH1, LENGTH, false, MODIFICATION_TIME); - pathInfo = new StoragePathInfo(STORAGE_PATH2, -1, true, MODIFICATION_TIME + 2L); + pathInfo = new StoragePathInfo(STORAGE_PATH2, -1, true, BLOCK_REPLICATION, BLOCK_SIZE, MODIFICATION_TIME + 2L); validateAccessors(pathInfo, PATH2, -1, true, MODIFICATION_TIME + 2L); } @Test public void testSerializability() throws IOException, ClassNotFoundException { - StoragePathInfo pathInfo = new StoragePathInfo(STORAGE_PATH1, LENGTH, false, MODIFICATION_TIME); + StoragePathInfo pathInfo = new StoragePathInfo(STORAGE_PATH1, LENGTH, false, BLOCK_REPLICATION, BLOCK_SIZE, MODIFICATION_TIME); try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); ObjectOutputStream oos = new ObjectOutputStream(baos)) { oos.writeObject(pathInfo); @@ -72,18 +74,18 @@ public void testSerializability() throws IOException, ClassNotFoundException { @Test public void testEquals() { StoragePathInfo pathInfo1 = new StoragePathInfo( - new StoragePath(PATH1), LENGTH, false, MODIFICATION_TIME); + new StoragePath(PATH1), LENGTH, false, BLOCK_REPLICATION, BLOCK_SIZE, MODIFICATION_TIME); StoragePathInfo pathInfo2 = new StoragePathInfo( - new StoragePath(PATH1), LENGTH + 2, false, MODIFICATION_TIME + 2L); + new StoragePath(PATH1), LENGTH + 2, false, BLOCK_REPLICATION, BLOCK_SIZE, MODIFICATION_TIME + 2L); assertEquals(pathInfo1, pathInfo2); } @Test public void testNotEquals() { StoragePathInfo pathInfo1 = new StoragePathInfo( - STORAGE_PATH1, LENGTH, false, MODIFICATION_TIME); + STORAGE_PATH1, LENGTH, false, BLOCK_REPLICATION, BLOCK_SIZE, MODIFICATION_TIME); StoragePathInfo pathInfo2 = new StoragePathInfo( - STORAGE_PATH2, LENGTH, false, MODIFICATION_TIME + 2L); + STORAGE_PATH2, LENGTH, false, BLOCK_REPLICATION, BLOCK_SIZE, MODIFICATION_TIME + 2L); assertFalse(pathInfo1.equals(pathInfo2)); assertFalse(pathInfo2.equals(pathInfo1)); } diff --git a/hudi-kafka-connect/pom.xml b/hudi-kafka-connect/pom.xml index 1abe6ee961e5..d9fce1c9c57b 100644 --- a/hudi-kafka-connect/pom.xml +++ b/hudi-kafka-connect/pom.xml @@ -73,9 +73,6 @@ src/main/resources - - src/test/resources - diff --git a/hudi-platform-service/hudi-metaserver/hudi-metaserver-server/pom.xml b/hudi-platform-service/hudi-metaserver/hudi-metaserver-server/pom.xml index 4ffbea73b44a..d7d9ca77cd72 100644 --- a/hudi-platform-service/hudi-metaserver/hudi-metaserver-server/pom.xml +++ b/hudi-platform-service/hudi-metaserver/hudi-metaserver-server/pom.xml @@ -30,9 +30,9 @@ ${project.parent.basedir} - 3.4.6 + 3.5.6 4.0.3 - 8.0.22 + 8.0.28 diff --git 
a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/HoodieSparkFunctionalIndexClient.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/HoodieSparkFunctionalIndexClient.java index 542b76e8dd16..e66ad5ac417b 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/HoodieSparkFunctionalIndexClient.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/HoodieSparkFunctionalIndexClient.java @@ -27,7 +27,6 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.config.HoodieLockConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieFunctionalIndexException; @@ -49,6 +48,9 @@ import static org.apache.hudi.HoodieConversionUtils.mapAsScalaImmutableMap; import static org.apache.hudi.HoodieConversionUtils.toScalaOption; +import static org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE_METADATA_INDEX_BLOOM_FILTER; +import static org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS; +import static org.apache.hudi.common.config.HoodieMetadataConfig.RECORD_INDEX_ENABLE_PROP; import static org.apache.hudi.common.util.ValidationUtils.checkArgument; public class HoodieSparkFunctionalIndexClient extends BaseHoodieFunctionalIndexClient { @@ -82,7 +84,9 @@ public void create(HoodieTableMetaClient metaClient, String indexName, String in throw new HoodieFunctionalIndexException("Index already exists: " + indexName); } - if (!metaClient.getTableConfig().getIndexDefinitionPath().isPresent() || !metaClient.getFunctionalIndexMetadata().isPresent()) { + if (!metaClient.getTableConfig().getIndexDefinitionPath().isPresent() + || !metaClient.getFunctionalIndexMetadata().isPresent() + || !metaClient.getFunctionalIndexMetadata().get().getIndexDefinitions().containsKey(indexName)) { LOG.info("Index definition is not present. Registering the index first"); register(metaClient, indexName, indexType, columns, options); } @@ -94,7 +98,7 @@ public void create(HoodieTableMetaClient metaClient, String indexName, String in try (SparkRDDWriteClient writeClient = HoodieCLIUtils.createHoodieWriteClient( sparkSession, metaClient.getBasePathV2().toString(), mapAsScalaImmutableMap(buildWriteConfig(metaClient, functionalIndexDefinition)), toScalaOption(Option.empty()))) { // generate index plan - Option indexInstantTime = doSchedule(writeClient, metaClient); + Option indexInstantTime = doSchedule(writeClient, metaClient, indexName); if (indexInstantTime.isPresent()) { // build index writeClient.index(indexInstantTime.get()); @@ -104,13 +108,13 @@ public void create(HoodieTableMetaClient metaClient, String indexName, String in } } - private static Option doSchedule(SparkRDDWriteClient client, HoodieTableMetaClient metaClient) { + private static Option doSchedule(SparkRDDWriteClient client, HoodieTableMetaClient metaClient, String indexName) { List partitionTypes = Collections.singletonList(MetadataPartitionType.FUNCTIONAL_INDEX); checkArgument(partitionTypes.size() == 1, "Currently, only one index type can be scheduled at a time."); if (metaClient.getTableConfig().getMetadataPartitions().isEmpty()) { throw new HoodieException("Metadata table is not yet initialized. 
Initialize FILES partition before any other partition " + Arrays.toString(partitionTypes.toArray())); } - return client.scheduleIndexing(partitionTypes); + return client.scheduleIndexing(partitionTypes, Collections.singletonList(indexName)); } private static boolean indexExists(HoodieTableMetaClient metaClient, String indexName) { @@ -120,11 +124,25 @@ private static boolean indexExists(HoodieTableMetaClient metaClient, String inde private static Map buildWriteConfig(HoodieTableMetaClient metaClient, HoodieFunctionalIndexDefinition indexDefinition) { Map writeConfig = new HashMap<>(); if (metaClient.getTableConfig().isMetadataTableAvailable()) { - if (!writeConfig.containsKey(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key())) { - writeConfig.put(HoodieWriteConfig.WRITE_CONCURRENCY_MODE.key(), WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL.name()); - writeConfig.putAll(JavaConverters.mapAsJavaMapConverter(HoodieCLIUtils.getLockOptions(metaClient.getBasePathV2().toString())).asJava()); - } + writeConfig.put(HoodieWriteConfig.WRITE_CONCURRENCY_MODE.key(), WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL.name()); + writeConfig.putAll(JavaConverters.mapAsJavaMapConverter(HoodieCLIUtils.getLockOptions(metaClient.getBasePathV2().toString())).asJava()); + + // [HUDI-7472] Ensure write-config contains the existing MDT partition to prevent those from getting deleted + metaClient.getTableConfig().getMetadataPartitions().forEach(partitionPath -> { + if (partitionPath.equals(MetadataPartitionType.RECORD_INDEX.getPartitionPath())) { + writeConfig.put(RECORD_INDEX_ENABLE_PROP.key(), "true"); + } + + if (partitionPath.equals(MetadataPartitionType.BLOOM_FILTERS.getPartitionPath())) { + writeConfig.put(ENABLE_METADATA_INDEX_BLOOM_FILTER.key(), "true"); + } + + if (partitionPath.equals(MetadataPartitionType.COLUMN_STATS.getPartitionPath())) { + writeConfig.put(ENABLE_METADATA_INDEX_COLUMN_STATS.key(), "true"); + } + }); } + HoodieFunctionalIndexConfig.fromIndexDefinition(indexDefinition).getProps().forEach((key, value) -> writeConfig.put(key.toString(), value.toString())); return writeConfig; } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala index 47a7c61a60fa..f93209c98367 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -565,8 +565,6 @@ object DataSourceWriteOptions { val SET_NULL_FOR_MISSING_COLUMNS: ConfigProperty[String] = HoodieCommonConfig.SET_NULL_FOR_MISSING_COLUMNS - val MAKE_NEW_COLUMNS_NULLABLE: ConfigProperty[java.lang.Boolean] = HoodieCommonConfig.MAKE_NEW_COLUMNS_NULLABLE - val SPARK_SQL_INSERT_INTO_OPERATION: ConfigProperty[String] = ConfigProperty .key("hoodie.spark.sql.insert.into.operation") .defaultValue(WriteOperationType.INSERT.value()) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala index c346f7665df5..be3d2f4ed4bf 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala @@ -74,7 +74,12 @@ class DefaultSource extends RelationProvider override def createRelation(sqlContext: 
SQLContext, parameters: Map[String, String]): BaseRelation = { try { - createRelation(sqlContext, parameters, null) + val relation = createRelation(sqlContext, parameters, null) + if (relation.schema.isEmpty) { + new EmptyRelation(sqlContext, new StructType()) + } else { + relation + } } catch { case _: HoodieSchemaNotFoundException => new EmptyRelation(sqlContext, new StructType()) case e => throw e diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala index 3444feaecff6..affed871cad9 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala @@ -63,7 +63,7 @@ import scala.util.{Failure, Success, Try} * who's directory level is 3).We can still read it as a partitioned table. We will mapping the * partition path (e.g. 2021/03/10) to the only partition column (e.g. "dt"). * - * 3、Else the the partition columns size is not equal to the partition directory level and the + * 3、Else the partition columns size is not equal to the partition directory level and the * size is great than "1" (e.g. partition column is "dt,hh", the partition path is "2021/03/10/12") * , we read it as a Non-Partitioned table because we cannot know how to mapping the partition * path with the partition columns in this case. diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSchemaUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSchemaUtils.scala index ed073ce4b174..9aeff64f2370 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSchemaUtils.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSchemaUtils.scala @@ -21,10 +21,10 @@ package org.apache.hudi import org.apache.avro.Schema import org.apache.hudi.HoodieSparkSqlWriter.{CANONICALIZE_SCHEMA, SQL_MERGE_INTO_WRITES} -import org.apache.hudi.avro.AvroSchemaUtils.{isCompatibleProjectionOf, isSchemaCompatible, isValidEvolutionOf} +import org.apache.hudi.avro.AvroSchemaUtils.{checkSchemaCompatible, checkValidEvolution, isCompatibleProjectionOf, isSchemaCompatible} import org.apache.hudi.avro.HoodieAvroUtils import org.apache.hudi.avro.HoodieAvroUtils.removeMetadataFields -import org.apache.hudi.common.config.HoodieConfig +import org.apache.hudi.common.config.{HoodieConfig, TypedProperties} import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.hudi.config.HoodieWriteConfig @@ -76,125 +76,131 @@ object HoodieSchemaUtils { latestTableSchemaOpt: Option[Schema], internalSchemaOpt: Option[InternalSchema], opts: Map[String, String]): Schema = { - val setNullForMissingColumns = opts.getOrDefault(DataSourceWriteOptions.SET_NULL_FOR_MISSING_COLUMNS.key(), - DataSourceWriteOptions.SET_NULL_FOR_MISSING_COLUMNS.defaultValue).toBoolean - val shouldReconcileSchema = opts(DataSourceWriteOptions.RECONCILE_SCHEMA.key()).toBoolean - val shouldValidateSchemasCompatibility = opts.getOrDefault(HoodieWriteConfig.AVRO_SCHEMA_VALIDATE_ENABLE.key, - HoodieWriteConfig.AVRO_SCHEMA_VALIDATE_ENABLE.defaultValue).toBoolean - latestTableSchemaOpt match { - // In case table schema is empty we're just going to use the source schema as a - // writer's schema. 
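Reviewer note on the DefaultSource.createRelation change above (annotation, not part of the patch): when the resolved relation carries an empty schema, the read path now falls back to an EmptyRelation instead of handing Spark a relation it cannot plan against. A minimal sketch of the caller-visible effect, assuming a hypothetical basePath whose table schema resolves to empty:

// Sketch only; basePath is an assumption for illustration.
val df = spark.read.format("hudi").load(basePath)
// With the fallback above this is expected to behave as an empty DataFrame
// (empty schema, zero rows) rather than a relation that fails during planning.
println(s"columns=${df.schema.size}, rows=${df.count()}")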
+ // If table schema is empty, then we use the source schema as a writer's schema. case None => AvroInternalSchemaConverter.fixNullOrdering(sourceSchema) // Otherwise, we need to make sure we reconcile incoming and latest table schemas case Some(latestTableSchemaWithMetaFields) => - // NOTE: Meta-fields will be unconditionally injected by Hudi writing handles, for the sake of - // deducing proper writer schema we're stripping them to make sure we can perform proper - // analysis - //add call to fix null ordering to ensure backwards compatibility + // NOTE: Meta-fields will be unconditionally injected by Hudi writing handles, for the sake of deducing proper writer schema + // we're stripping them to make sure we can perform proper analysis + // add call to fix null ordering to ensure backwards compatibility val latestTableSchema = AvroInternalSchemaConverter.fixNullOrdering(removeMetadataFields(latestTableSchemaWithMetaFields)) + // Before validating whether schemas are compatible, we need to "canonicalize" source's schema // relative to the table's one, by doing a (minor) reconciliation of the nullability constraints: // for ex, if in incoming schema column A is designated as non-null, but it's designated as nullable // in the table's one we want to proceed aligning nullability constraints w/ the table's schema // Also, we promote types to the latest table schema if possible. - val shouldCanonicalizeSchema = opts.getOrDefault(CANONICALIZE_SCHEMA.key, - CANONICALIZE_SCHEMA.defaultValue.toString).toBoolean - val mergeIntoWrites = opts.getOrDefault(SQL_MERGE_INTO_WRITES.key(), - SQL_MERGE_INTO_WRITES.defaultValue.toString).toBoolean - + val shouldCanonicalizeSchema = opts.getOrDefault(CANONICALIZE_SCHEMA.key, CANONICALIZE_SCHEMA.defaultValue.toString).toBoolean val canonicalizedSourceSchema = if (shouldCanonicalizeSchema) { canonicalizeSchema(sourceSchema, latestTableSchema, opts) } else { AvroInternalSchemaConverter.fixNullOrdering(sourceSchema) } - val allowAutoEvolutionColumnDrop = opts.getOrDefault(HoodieWriteConfig.SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP.key, - HoodieWriteConfig.SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP.defaultValue).toBoolean - + val shouldReconcileSchema = opts.getOrDefault(DataSourceWriteOptions.RECONCILE_SCHEMA.key(), + DataSourceWriteOptions.RECONCILE_SCHEMA.defaultValue().toString).toBoolean if (shouldReconcileSchema) { - internalSchemaOpt match { - case Some(internalSchema) => - // Apply schema evolution, by auto-merging write schema and read schema - val mergedInternalSchema = AvroSchemaEvolutionUtils.reconcileSchema(canonicalizedSourceSchema, internalSchema) - val evolvedSchema = AvroInternalSchemaConverter.convert(mergedInternalSchema, latestTableSchema.getFullName) - val shouldRemoveMetaDataFromInternalSchema = sourceSchema.getFields().filter(f => f.name().equalsIgnoreCase(HoodieRecord.RECORD_KEY_METADATA_FIELD)).isEmpty - if (shouldRemoveMetaDataFromInternalSchema) HoodieAvroUtils.removeMetadataFields(evolvedSchema) else evolvedSchema - case None => - // In case schema reconciliation is enabled we will employ (legacy) reconciliation - // strategy to produce target writer's schema (see definition below) - val (reconciledSchema, isCompatible) = - reconcileSchemasLegacy(latestTableSchema, canonicalizedSourceSchema) - - // NOTE: In some cases we need to relax constraint of incoming dataset's schema to be compatible - // w/ the table's one and allow schemas to diverge. 
This is required in cases where - // partial updates will be performed (for ex, `MERGE INTO` Spark SQL statement) and as such - // only incoming dataset's projection has to match the table's schema, and not the whole one - if (!shouldValidateSchemasCompatibility || isCompatible) { - reconciledSchema - } else { - log.error( - s"""Failed to reconcile incoming batch schema with the table's one. - |Incoming schema ${sourceSchema.toString(true)} - |Incoming schema (canonicalized) ${canonicalizedSourceSchema.toString(true)} - |Table's schema ${latestTableSchema.toString(true)} - |""".stripMargin) - throw new SchemaCompatibilityException("Failed to reconcile incoming schema with the table's one") - } - } + deduceWriterSchemaWithReconcile(sourceSchema, canonicalizedSourceSchema, latestTableSchema, internalSchemaOpt, opts) + } else { + deduceWriterSchemaWithoutReconcile(sourceSchema, canonicalizedSourceSchema, latestTableSchema, opts) + } + } + } + + /** + * Deducing with disabled reconciliation. + * We have to validate that the source's schema is compatible w/ the table's latest schema, + * such that we're able to read existing table's records using [[sourceSchema]]. + */ + private def deduceWriterSchemaWithoutReconcile(sourceSchema: Schema, + canonicalizedSourceSchema: Schema, + latestTableSchema: Schema, + opts: Map[String, String]): Schema = { + // NOTE: In some cases we need to relax constraint of incoming dataset's schema to be compatible + // w/ the table's one and allow schemas to diverge. This is required in cases where + // partial updates will be performed (for ex, `MERGE INTO` Spark SQL statement) and as such + // only incoming dataset's projection has to match the table's schema, and not the whole one + val mergeIntoWrites = opts.getOrDefault(SQL_MERGE_INTO_WRITES.key(), SQL_MERGE_INTO_WRITES.defaultValue.toString).toBoolean + val shouldValidateSchemasCompatibility = opts.getOrDefault(HoodieWriteConfig.AVRO_SCHEMA_VALIDATE_ENABLE.key, + HoodieWriteConfig.AVRO_SCHEMA_VALIDATE_ENABLE.defaultValue).toBoolean + val allowAutoEvolutionColumnDrop = opts.getOrDefault(HoodieWriteConfig.SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP.key, + HoodieWriteConfig.SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP.defaultValue).toBoolean + val setNullForMissingColumns = opts.getOrDefault(DataSourceWriteOptions.SET_NULL_FOR_MISSING_COLUMNS.key(), + DataSourceWriteOptions.SET_NULL_FOR_MISSING_COLUMNS.defaultValue).toBoolean + + if (!mergeIntoWrites && !shouldValidateSchemasCompatibility && !allowAutoEvolutionColumnDrop) { + // Default behaviour + val reconciledSchema = if (setNullForMissingColumns) { + AvroSchemaEvolutionUtils.reconcileSchema(canonicalizedSourceSchema, latestTableSchema) + } else { + canonicalizedSourceSchema + } + checkValidEvolution(reconciledSchema, latestTableSchema) + reconciledSchema + } else { + // If it's merge into writes, we don't check for projection nor schema compatibility. Writers down the line will take care of it. + // Or it's not merge into writes, and we don't validate schema, but we allow to drop columns automatically. + // Or it's not merge into writes, we validate schema, and schema is compatible. + if (shouldValidateSchemasCompatibility) { + checkSchemaCompatible(latestTableSchema, canonicalizedSourceSchema, true, + allowAutoEvolutionColumnDrop, java.util.Collections.emptySet()) + } + canonicalizedSourceSchema + } + } + + /** + * Deducing with enabled reconciliation. + * Marked as Deprecated. 
+ */ + private def deduceWriterSchemaWithReconcile(sourceSchema: Schema, + canonicalizedSourceSchema: Schema, + latestTableSchema: Schema, + internalSchemaOpt: Option[InternalSchema], + opts: Map[String, String]): Schema = { + internalSchemaOpt match { + case Some(internalSchema) => + // Apply schema evolution, by auto-merging write schema and read schema + val mergedInternalSchema = AvroSchemaEvolutionUtils.reconcileSchema(canonicalizedSourceSchema, internalSchema) + val evolvedSchema = AvroInternalSchemaConverter.convert(mergedInternalSchema, latestTableSchema.getFullName) + val shouldRemoveMetaDataFromInternalSchema = sourceSchema.getFields().filter(f => f.name().equalsIgnoreCase(HoodieRecord.RECORD_KEY_METADATA_FIELD)).isEmpty + if (shouldRemoveMetaDataFromInternalSchema) HoodieAvroUtils.removeMetadataFields(evolvedSchema) else evolvedSchema + case None => + // In case schema reconciliation is enabled we will employ (legacy) reconciliation + // strategy to produce target writer's schema (see definition below) + val (reconciledSchema, isCompatible) = + reconcileSchemasLegacy(latestTableSchema, canonicalizedSourceSchema) + + // NOTE: In some cases we need to relax constraint of incoming dataset's schema to be compatible + // w/ the table's one and allow schemas to diverge. This is required in cases where + // partial updates will be performed (for ex, `MERGE INTO` Spark SQL statement) and as such + // only incoming dataset's projection has to match the table's schema, and not the whole one + val shouldValidateSchemasCompatibility = opts.getOrDefault(HoodieWriteConfig.AVRO_SCHEMA_VALIDATE_ENABLE.key, HoodieWriteConfig.AVRO_SCHEMA_VALIDATE_ENABLE.defaultValue).toBoolean + if (!shouldValidateSchemasCompatibility || isCompatible) { + reconciledSchema } else { - // In case reconciliation is disabled, we have to validate that the source's schema - // is compatible w/ the table's latest schema, such that we're able to read existing table's - // records using [[sourceSchema]]. - // - // NOTE: In some cases we need to relax constraint of incoming dataset's schema to be compatible - // w/ the table's one and allow schemas to diverge. This is required in cases where - // partial updates will be performed (for ex, `MERGE INTO` Spark SQL statement) and as such - // only incoming dataset's projection has to match the table's schema, and not the whole one - - if (mergeIntoWrites) { - // if its merge into writes, do not check for projection nor schema compatibility. Writers down the line will - // take care of it. - canonicalizedSourceSchema - } else { - if (!shouldValidateSchemasCompatibility) { - // if no validation is enabled, check for col drop - if (allowAutoEvolutionColumnDrop) { - canonicalizedSourceSchema - } else { - val reconciledSchema = if (setNullForMissingColumns) { - AvroSchemaEvolutionUtils.reconcileSchema(canonicalizedSourceSchema, latestTableSchema) - } else { - canonicalizedSourceSchema - } - if (isValidEvolutionOf(reconciledSchema, latestTableSchema)) { - reconciledSchema - } else { - log.error( - s"""Incoming batch schema is not compatible with the table's one. 
- |Incoming schema ${sourceSchema.toString(true)} - |Incoming schema (canonicalized) ${reconciledSchema.toString(true)} - |Table's schema ${latestTableSchema.toString(true)} - |""".stripMargin) - throw new SchemaCompatibilityException("Incoming batch schema is not compatible with the table's one") - } - } - } else if (isSchemaCompatible(latestTableSchema, canonicalizedSourceSchema, allowAutoEvolutionColumnDrop)) { - canonicalizedSourceSchema - } else { - log.error( - s"""Incoming batch schema is not compatible with the table's one. - |Incoming schema ${sourceSchema.toString(true)} - |Incoming schema (canonicalized) ${canonicalizedSourceSchema.toString(true)} - |Table's schema ${latestTableSchema.toString(true)} - |""".stripMargin) - throw new SchemaCompatibilityException("Incoming batch schema is not compatible with the table's one") - } - } + log.error( + s"""Failed to reconcile incoming batch schema with the table's one. + |Incoming schema ${sourceSchema.toString(true)} + |Incoming schema (canonicalized) ${canonicalizedSourceSchema.toString(true)} + |Table's schema ${latestTableSchema.toString(true)} + |""".stripMargin) + throw new SchemaCompatibilityException("Failed to reconcile incoming schema with the table's one") } } } + def deduceWriterSchema(sourceSchema: Schema, + latestTableSchemaOpt: org.apache.hudi.common.util.Option[Schema], + internalSchemaOpt: org.apache.hudi.common.util.Option[InternalSchema], + props: TypedProperties): Schema = { + deduceWriterSchema(sourceSchema, + HoodieConversionUtils.toScalaOption(latestTableSchemaOpt), + HoodieConversionUtils.toScalaOption(internalSchemaOpt), + HoodieConversionUtils.fromProperties(props)) + } + /** * Canonicalizes [[sourceSchema]] by reconciling it w/ [[latestTableSchema]] in following * @@ -206,7 +212,7 @@ object HoodieSchemaUtils { * TODO support casing reconciliation */ private def canonicalizeSchema(sourceSchema: Schema, latestTableSchema: Schema, opts : Map[String, String]): Schema = { - reconcileSchemaRequirements(sourceSchema, latestTableSchema, opts) + reconcileSchemaRequirements(sourceSchema, latestTableSchema) } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index 00ec59c5b8fd..98c7c5d29f56 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -51,7 +51,7 @@ import org.apache.hudi.common.util.{CommitUtils, StringUtils, Option => HOption} import org.apache.hudi.config.HoodieBootstrapConfig.{BASE_PATH, INDEX_CLASS_NAME} import org.apache.hudi.config.HoodieWriteConfig.SPARK_SQL_MERGE_INTO_PREPPED_KEY import org.apache.hudi.config.{HoodieCompactionConfig, HoodieInternalConfig, HoodieWriteConfig} -import org.apache.hudi.exception.{HoodieException, HoodieWriteConflictException} +import org.apache.hudi.exception.{HoodieException, HoodieRecordCreationException, HoodieWriteConflictException} import org.apache.hudi.hive.{HiveSyncConfigHolder, HiveSyncTool} import org.apache.hudi.internal.schema.InternalSchema import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter @@ -78,6 +78,7 @@ import java.util.function.BiConsumer import scala.collection.JavaConversions._ import scala.collection.JavaConverters.setAsJavaSetConverter import scala.collection.mutable +import scala.util.{Failure, Success, Try} 
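Reviewer note on the HoodieSchemaUtils changes above (sketch, not part of the patch): writer-schema deduction is now split into a reconcile path and a non-reconcile path, and a new TypedProperties overload of deduceWriterSchema is exposed; the TestHoodieSchemaUtils suite added later in this diff drives that overload. A rough Scala sketch of a caller, with invented field names; the expected outcomes mirror what the new tests assert:

import org.apache.avro.SchemaBuilder
import org.apache.hudi.HoodieSchemaUtils
import org.apache.hudi.common.config.{HoodieCommonConfig, TypedProperties}
import org.apache.hudi.common.util.Option

// The table already has (id, name); the incoming batch is missing "name".
val tableSchema = SchemaBuilder.record("rec").fields()
  .requiredInt("id").requiredString("name").endRecord()
val incomingSchema = SchemaBuilder.record("rec").fields()
  .requiredInt("id").endRecord()

val props = new TypedProperties()
// true  -> the missing column is reconciled back in from the table schema;
// false -> the evolution check should fail (MissingSchemaFieldException in the new tests).
props.setProperty(HoodieCommonConfig.SET_NULL_FOR_MISSING_COLUMNS.key(), "true")

val writerSchema = HoodieSchemaUtils.deduceWriterSchema(
  incomingSchema, Option.ofNullable(tableSchema), Option.empty(), props)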
object HoodieSparkSqlWriter { @@ -132,21 +133,6 @@ object HoodieSparkSqlWriter { new HoodieSparkSqlWriterInternal().bootstrap(sqlContext, mode, optParams, df, hoodieTableConfigOpt, streamingWritesParamsOpt, hoodieWriteClient) } - /** - * Deduces writer's schema based on - *

<ul> - *   <li>Source's schema</li> - *   <li>Target table's schema (including Hudi's [[InternalSchema]] representation)</li> - * </ul>
    - */ - def deduceWriterSchema(sourceSchema: Schema, - latestTableSchemaOpt: Option[Schema], - internalSchemaOpt: Option[InternalSchema], - props: TypedProperties): Schema = { - HoodieSchemaUtils.deduceWriterSchema(sourceSchema, latestTableSchemaOpt, - internalSchemaOpt, HoodieConversionUtils.fromProperties(props)) - } - def cleanup(): Unit = { Metrics.shutdownAllMetrics() } @@ -493,10 +479,13 @@ class HoodieSparkSqlWriterInternal { } instantTime = client.createNewInstantTime() // Convert to RDD[HoodieRecord] - val hoodieRecords = - HoodieCreateRecordUtils.createHoodieRecordRdd(HoodieCreateRecordUtils.createHoodieRecordRddArgs(df, - writeConfig, parameters, avroRecordName, avroRecordNamespace, writerSchema, - processedDataSchema, operation, instantTime, preppedSparkSqlWrites, preppedSparkSqlMergeInto, preppedWriteOperation)) + val hoodieRecords = Try(HoodieCreateRecordUtils.createHoodieRecordRdd( + HoodieCreateRecordUtils.createHoodieRecordRddArgs(df, writeConfig, parameters, avroRecordName, + avroRecordNamespace, writerSchema, processedDataSchema, operation, instantTime, preppedSparkSqlWrites, + preppedSparkSqlMergeInto, preppedWriteOperation))) match { + case Success(recs) => recs + case Failure(e) => throw new HoodieRecordCreationException("Failed to create Hoodie Spark Record", e) + } val dedupedHoodieRecords = if (hoodieConfig.getBoolean(INSERT_DROP_DUPS) && operation != WriteOperationType.INSERT_OVERWRITE_TABLE && operation != WriteOperationType.INSERT_OVERWRITE) { diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala index 133f641d280b..63495b0eede6 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala @@ -83,7 +83,6 @@ object HoodieWriterUtils { hoodieConfig.setDefaultValue(ASYNC_CLUSTERING_ENABLE) hoodieConfig.setDefaultValue(ENABLE_ROW_WRITER) hoodieConfig.setDefaultValue(RECONCILE_SCHEMA) - hoodieConfig.setDefaultValue(MAKE_NEW_COLUMNS_NULLABLE) hoodieConfig.setDefaultValue(DROP_PARTITION_COLUMNS) hoodieConfig.setDefaultValue(KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED) Map() ++ hoodieConfig.getProps.asScala ++ globalProps ++ DataSourceOptionsHelper.translateConfigurations(parameters) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala index 7b25fd9a8c73..243cf3db550e 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala @@ -26,9 +26,10 @@ import org.apache.hudi.common.table.timeline.TimelineUtils.HollowCommitHandling. 
import org.apache.hudi.common.table.timeline.TimelineUtils.{HollowCommitHandling, concatTimeline, getCommitMetadata, handleHollowCommitIfNeeded} import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} import org.apache.hudi.common.table.view.HoodieTableFileSystemView +import org.apache.hudi.metadata.HoodieTableMetadataUtil.getWritePartitionPaths import org.apache.hudi.common.util.StringUtils import org.apache.hudi.exception.HoodieException -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.{getWritePartitionPaths, listAffectedFilesForCommits} +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.listAffectedFilesForCommits import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/RecordLevelIndexSupport.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/RecordLevelIndexSupport.scala index 764ce69795d9..894405aec45d 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/RecordLevelIndexSupport.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/RecordLevelIndexSupport.scala @@ -162,7 +162,10 @@ class RecordLevelIndexSupport(spark: SparkSession, case inQuery: In => var validINQuery = true inQuery.value match { - case _: AttributeReference => + case attribute: AttributeReference => + if (!attributeMatchesRecordKey(attribute.name)) { + validINQuery = false + } case _ => validINQuery = false } var literals: List[String] = List.empty diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala index c1414fe77fed..bfd5613feba9 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala @@ -159,11 +159,6 @@ class HoodieCatalogTable(val spark: SparkSession, var table: CatalogTable) exten StructType(tableSchema.filterNot(f => partitionFields.contains(f.name))) } - /** - * The schema of data fields not including hoodie meta fields - */ - lazy val dataSchemaWithoutMetaFields: StructType = removeMetaFields(dataSchema) - /** * The schema of partition fields */ @@ -173,7 +168,7 @@ class HoodieCatalogTable(val spark: SparkSession, var table: CatalogTable) exten * All the partition paths, excludes lazily deleted partitions. 
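Reviewer note on the RecordLevelIndexSupport change above (illustration only, not part of the patch; table and column names are hypothetical): an IN predicate is now considered for record-level index pruning only when the referenced attribute is the record-key column, so IN filters on ordinary columns no longer slip through as index candidates.

// Assuming "uuid" is the record key of hudi_tbl and "city" is a regular column.
spark.sql("select * from hudi_tbl where uuid in ('k1', 'k2')")   // still eligible for RLI-based pruning
spark.sql("select * from hudi_tbl where city in ('sf', 'nyc')")  // no longer treated as an RLI candidate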
*/ def getPartitionPaths: Seq[String] = { - val droppedPartitions = TimelineUtils.getDroppedPartitions(metaClient.getActiveTimeline) + val droppedPartitions = TimelineUtils.getDroppedPartitions(metaClient, org.apache.hudi.common.util.Option.empty(), org.apache.hudi.common.util.Option.empty()) getAllPartitionPaths(spark, table) .filter(!droppedPartitions.contains(_)) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala index cc2d8903a162..46a004808088 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieOptionConfig.scala @@ -155,7 +155,8 @@ object HoodieOptionConfig { def mapSqlOptionsToTableConfigs(options: Map[String, String]): Map[String, String] = { options.map { case (k, v) => if (sqlOptionKeyToTableConfigKey.contains(k)) { - sqlOptionKeyToTableConfigKey(k) -> sqlOptionValueToHoodieConfigValue.getOrElse(v, v) + // support table type incase-sensitive + sqlOptionKeyToTableConfigKey(k) -> sqlOptionValueToHoodieConfigValue.getOrElse(v.toLowerCase, v) } else { k -> v } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala index 250067d4b847..a4003bbd4807 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala @@ -22,7 +22,7 @@ import org.apache.hudi.{DataSourceWriteOptions, HoodieFileIndex} import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.HoodieConversionUtils.toProperties import org.apache.hudi.common.config.{DFSPropertiesConfiguration, TypedProperties} -import org.apache.hudi.common.model.{OverwriteWithLatestAvroPayload, WriteOperationType} +import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, WriteOperationType} import org.apache.hudi.common.table.HoodieTableConfig import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME import org.apache.hudi.config.{HoodieIndexConfig, HoodieInternalConfig, HoodieWriteConfig} @@ -44,8 +44,8 @@ import org.apache.spark.sql.hudi.command.{SqlKeyGenerator, ValidateDuplicateKeyP import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.PARTITION_OVERWRITE_MODE import org.apache.spark.sql.types.StructType - import java.util.Locale + import scala.collection.JavaConverters._ trait ProvidesHoodieConfig extends Logging { @@ -102,7 +102,7 @@ trait ProvidesHoodieConfig extends Logging { // Validate duplicate key for inserts to COW table when using strict insert mode. 
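// Reviewer note (not part of the patch): the two hunks just below swap the payload used for
// SQL writes outside the strict/fail-on-duplicate path from OverwriteWithLatestAvroPayload to
// DefaultHoodieRecordPayload, so precombine/ordering semantics are honoured when merging
// records instead of always taking the latest write. A hypothetical way to observe the
// resulting option on the deduced write parameters:
//   writeParams(DataSourceWriteOptions.PAYLOAD_CLASS_NAME.key) ==
//     classOf[DefaultHoodieRecordPayload].getCanonicalName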
classOf[ValidateDuplicateKeyPayload].getCanonicalName } else { - classOf[OverwriteWithLatestAvroPayload].getCanonicalName + classOf[DefaultHoodieRecordPayload].getCanonicalName } } @@ -276,7 +276,7 @@ trait ProvidesHoodieConfig extends Logging { if (insertDupPolicy == FAIL_INSERT_DUP_POLICY) { classOf[ValidateDuplicateKeyPayload].getCanonicalName } else { - classOf[OverwriteWithLatestAvroPayload].getCanonicalName + classOf[DefaultHoodieRecordPayload].getCanonicalName } } @@ -480,6 +480,8 @@ trait ProvidesHoodieConfig extends Logging { hiveSyncConfig.setValue(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS, props.getString(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key)) } hiveSyncConfig.setDefaultValue(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS, classOf[MultiPartKeysValueExtractor].getName) + // This is hardcoded to true to ensure consistency as Spark syncs TIMESTAMP types as TIMESTAMP by default + // via Spark's externalCatalog API, which is used by AlterHoodieTableCommand. hiveSyncConfig.setDefaultValue(HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE, "true") if (hiveSyncConfig.useBucketSync()) hiveSyncConfig.setValue(HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC_SPEC, diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableCommand.scala index 3db9742aaf0c..a857c3a5ded0 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableCommand.scala @@ -145,12 +145,17 @@ object CreateHoodieTableCommand { val partitionColumnNames = hoodieCatalogTable.partitionSchema.map(_.name) // Remove some properties should not be used;append pk, preCombineKey, type to the properties of table - val newTblProperties = + var newTblProperties = hoodieCatalogTable.catalogProperties.--(needFilterProps) ++ HoodieOptionConfig.extractSqlOptions(properties) + + // Add provider -> hudi as a table property + newTblProperties = newTblProperties + ("provider" -> "hudi") + val newTable = table.copy( identifier = newTableIdentifier, storage = newStorage, schema = hoodieCatalogTable.tableSchema, + provider = Some("hudi"), partitionColumnNames = partitionColumnNames, createVersion = SPARK_VERSION, properties = newTblProperties diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala index 04f1fbd5ba04..740ac6758685 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala @@ -49,7 +49,7 @@ class SqlKeyGenerator(props: TypedProperties) extends BuiltinKeyGenerator(props) } } - private lazy val autoRecordKeyGen = KeyGenUtils.enableAutoGenerateRecordKeys(props) + private lazy val autoRecordKeyGen = KeyGenUtils.isAutoGeneratedRecordKeysEnabled(props) private lazy val complexKeyGen = if (autoRecordKeyGen) { new AutoRecordGenWrapperKeyGenerator(props, new ComplexKeyGenerator(props)) } else { diff --git a/hudi-spark-datasource/hudi-spark-common/src/test/java/org/apache/hudi/TestHoodieSchemaUtils.java 
b/hudi-spark-datasource/hudi-spark-common/src/test/java/org/apache/hudi/TestHoodieSchemaUtils.java new file mode 100644 index 000000000000..b10d0cfa9929 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/test/java/org/apache/hudi/TestHoodieSchemaUtils.java @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi; + +import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieNullSchemaTypeException; +import org.apache.hudi.exception.MissingSchemaFieldException; +import org.apache.hudi.exception.SchemaBackwardsCompatibilityException; + +import org.apache.avro.Schema; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieSchemaUtils { + + @Test + void testSchemaWithNullField() { + Schema withNullfield = createRecord("nullRecord", createPrimitiveField("nullField", Schema.Type.NULL)); + assertThrows(HoodieNullSchemaTypeException.class, + () -> deduceWriterSchema(withNullfield, null)); + } + + @Test + void testSimplePromotionWithComplexFields() { + Schema start = createRecord("simple", createPrimitiveField("f", Schema.Type.INT)); + Schema end = createRecord("simple", createPrimitiveField("f", Schema.Type.LONG)); + assertEquals(end, deduceWriterSchema(end, start)); + + start = createRecord("nested", createNestedField("f", Schema.Type.INT)); + end = createRecord("nested", createNestedField("f", Schema.Type.LONG)); + assertEquals(end, deduceWriterSchema(end, start)); + + start = createRecord("arrayRec", createArrayField("f", Schema.Type.INT)); + end = createRecord("arrayRec", createArrayField("f", Schema.Type.LONG)); + assertEquals(end, deduceWriterSchema(end, start)); + + start = createRecord("mapRec", createMapField("f", Schema.Type.INT)); + end = createRecord("mapRec", createMapField("f", Schema.Type.LONG)); + assertEquals(end, deduceWriterSchema(end, start)); + } + + @Test + void testAllowedTypePromotions() { + Schema.Type[] promotionTypes = new Schema.Type[]{Schema.Type.INT, Schema.Type.LONG, Schema.Type.FLOAT, Schema.Type.DOUBLE, Schema.Type.STRING, Schema.Type.BYTES}; + Map> allowedPromotions = new HashMap<>(); + //allowedPromotions.key can be promoted to any type in 
the range allowedPromotions.value + allowedPromotions.put(Schema.Type.INT, Pair.of(0, 4)); + allowedPromotions.put(Schema.Type.LONG, Pair.of(1, 4)); + allowedPromotions.put(Schema.Type.FLOAT, Pair.of(2, 4)); + allowedPromotions.put(Schema.Type.DOUBLE, Pair.of(3, 4)); + allowedPromotions.put(Schema.Type.STRING, Pair.of(4, 4)); + allowedPromotions.put(Schema.Type.BYTES, Pair.of(5, 5)); + + Map schemaMap = new HashMap<>(); + for (Schema.Type type : promotionTypes) { + schemaMap.put(type, createRecord("rec", + createPrimitiveField("simpleField", type), + createArrayField("arrayField", type), + createMapField("mapField", type), + createNestedField("nestedField", type))); + } + + for (int i = 0; i < promotionTypes.length; i++) { + Schema startSchema = schemaMap.get(promotionTypes[i]); + Pair minMax = allowedPromotions.get(promotionTypes[i]); + for (int j = minMax.getLeft(); j <= minMax.getRight(); j++) { + Schema endSchema = schemaMap.get(promotionTypes[j]); + assertEquals(endSchema, deduceWriterSchema(endSchema, startSchema)); + } + } + } + + @Test + void testReversePromotions() { + Schema.Type[] promotionTypes = new Schema.Type[]{Schema.Type.INT, Schema.Type.LONG, Schema.Type.FLOAT, Schema.Type.DOUBLE, Schema.Type.STRING, Schema.Type.BYTES}; + Map> reversePromotions = new HashMap<>(); + //Incoming data types in the range reversePromotions.value will be promoted to reversePromotions.key + //if reversePromotions.key is the current table schema + reversePromotions.put(Schema.Type.INT, Pair.of(0, 0)); + reversePromotions.put(Schema.Type.LONG, Pair.of(0, 1)); + reversePromotions.put(Schema.Type.FLOAT, Pair.of(0, 2)); + reversePromotions.put(Schema.Type.DOUBLE, Pair.of(0, 3)); + reversePromotions.put(Schema.Type.STRING, Pair.of(0, 5)); + reversePromotions.put(Schema.Type.BYTES, Pair.of(4, 5)); + + Map schemaMap = new HashMap<>(); + for (Schema.Type type : promotionTypes) { + schemaMap.put(type, createRecord("rec", + createPrimitiveField("simpleField", type), + createArrayField("arrayField", type), + createMapField("mapField", type), + createNestedField("nestedField", type))); + } + + for (int i = 0; i < promotionTypes.length; i++) { + Schema startSchema = schemaMap.get(promotionTypes[i]); + Pair minMax = reversePromotions.get(promotionTypes[i]); + for (int j = minMax.getLeft(); j <= minMax.getRight(); j++) { + Schema endSchema = schemaMap.get(promotionTypes[j]); + assertEquals(startSchema, deduceWriterSchema(endSchema, startSchema)); + } + } + } + + @Test + void testIllegalPromotionsBetweenPrimitives() { + Schema.Type[] promotionTypes = new Schema.Type[]{Schema.Type.INT, Schema.Type.LONG, Schema.Type.FLOAT, Schema.Type.DOUBLE, Schema.Type.BYTES}; + Map schemaMap = new HashMap<>(); + for (Schema.Type type : promotionTypes) { + schemaMap.put(type, createRecord("rec", + createPrimitiveField("simpleField", type), + createArrayField("arrayField", type), + createMapField("mapField", type), + createNestedField("nestedField", type))); + } + + String[] fieldNames = new String[]{"rec.simpleField", "rec.arrayField.element", "rec.mapField.value", "rec.nestedField.nested"}; + //int, long, float, double can't be promoted to bytes + for (int i = 0; i < 4; i++) { + Schema startSchema = schemaMap.get(promotionTypes[i]); + Schema endSchema = schemaMap.get(Schema.Type.BYTES); + Throwable t = assertThrows(SchemaBackwardsCompatibilityException.class, + () -> deduceWriterSchema(endSchema, startSchema)); + String baseString = String.format("TYPE_MISMATCH: reader type 'BYTES' not compatible with writer type '%s' for field 
'%%s'", + promotionTypes[i].getName().toUpperCase()); + for (String fieldName : fieldNames) { + assertTrue(t.getMessage().contains(String.format(baseString, fieldName))); + } + } + } + + @Test + void testIllegalPromotionsBetweenComplexFields() { + String[] typeNames = new String[]{"INT", "ARRAY", "MAP", "RECORD"}; + Schema[] fieldTypes = new Schema[]{createRecord("rec", createPrimitiveField("testField", Schema.Type.INT)), + createRecord("rec", createArrayField("testField", Schema.Type.INT)), + createRecord("rec", createMapField("testField", Schema.Type.INT)), + createRecord("rec", createNestedField("testField", Schema.Type.INT))}; + + for (int i = 0; i < fieldTypes.length; i++) { + for (int j = 0; j < fieldTypes.length; j++) { + if (i != j) { + Schema startSchema = fieldTypes[i]; + Schema endSchema = fieldTypes[j]; + Throwable t = assertThrows(SchemaBackwardsCompatibilityException.class, + () -> deduceWriterSchema(startSchema, endSchema)); + String errorMessage = String.format("Schema validation backwards compatibility check failed with the following issues: " + + "{TYPE_MISMATCH: reader type '%s' not compatible with writer type '%s' for field 'rec.testField'}", typeNames[i], typeNames[j]); + assertTrue(t.getMessage().startsWith(errorMessage)); + } + } + } + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testMissingColumn(boolean allowDroppedColumns) { + //simple case + Schema start = createRecord("missingSimpleField", + createPrimitiveField("field1", Schema.Type.INT), + createPrimitiveField("field2", Schema.Type.INT), + createPrimitiveField("field3", Schema.Type.INT)); + Schema end = createRecord("missingSimpleField", + createPrimitiveField("field1", Schema.Type.INT), + createPrimitiveField("field3", Schema.Type.INT)); + try { + assertEquals(start, deduceWriterSchema(end, start, allowDroppedColumns)); + assertTrue(allowDroppedColumns); + } catch (MissingSchemaFieldException e) { + assertFalse(allowDroppedColumns); + assertTrue(e.getMessage().contains("missingSimpleField.field2")); + } + + //complex case + start = createRecord("missingComplexField", + createPrimitiveField("field1", Schema.Type.INT), + createPrimitiveField("field2", Schema.Type.INT), + createArrayField("field3", createRecord("nestedRecord", + createPrimitiveField("nestedField1", Schema.Type.INT), + createPrimitiveField("nestedField2", Schema.Type.INT), + createPrimitiveField("nestedField3", Schema.Type.INT))), + createPrimitiveField("field4", Schema.Type.INT)); + end = createRecord("missingComplexField", + createPrimitiveField("field1", Schema.Type.INT), + createPrimitiveField("field2", Schema.Type.INT), + createPrimitiveField("field4", Schema.Type.INT)); + try { + assertEquals(start, deduceWriterSchema(end, start, allowDroppedColumns)); + assertTrue(allowDroppedColumns); + } catch (MissingSchemaFieldException e) { + assertFalse(allowDroppedColumns); + assertTrue(e.getMessage().contains("missingComplexField.field3")); + } + + //partial missing field + end = createRecord("missingComplexField", + createPrimitiveField("field1", Schema.Type.INT), + createArrayField("field3", createRecord("nestedRecord", + createPrimitiveField("nestedField2", Schema.Type.INT), + createPrimitiveField("nestedField3", Schema.Type.INT))), + createPrimitiveField("field4", Schema.Type.INT)); + try { + assertEquals(start, deduceWriterSchema(end, start, allowDroppedColumns)); + assertTrue(allowDroppedColumns); + } catch (MissingSchemaFieldException e) { + assertFalse(allowDroppedColumns); + 
assertTrue(e.getMessage().contains("missingComplexField.field3.element.nestedRecord.nestedField1")); + assertTrue(e.getMessage().contains("missingComplexField.field2")); + } + } + + private static Schema deduceWriterSchema(Schema incomingSchema, Schema latestTableSchema) { + return deduceWriterSchema(incomingSchema, latestTableSchema, false); + } + + private static final TypedProperties TYPED_PROPERTIES = new TypedProperties(); + + private static Schema deduceWriterSchema(Schema incomingSchema, Schema latestTableSchema, Boolean addNull) { + TYPED_PROPERTIES.setProperty(HoodieCommonConfig.SET_NULL_FOR_MISSING_COLUMNS.key(), addNull.toString()); + return HoodieSchemaUtils.deduceWriterSchema(incomingSchema, Option.ofNullable(latestTableSchema), + Option.empty(), TYPED_PROPERTIES); + } + + private static Schema.Field createNestedField(String name, Schema.Type type) { + return createNestedField(name, Schema.create(type)); + } + + private static Schema.Field createNestedField(String name, Schema schema) { + return new Schema.Field(name, createRecord(name, new Schema.Field("nested", schema, null, null)), null, null); + } + + private static Schema.Field createArrayField(String name, Schema.Type type) { + return createArrayField(name, Schema.create(type)); + } + + private static Schema.Field createArrayField(String name, Schema schema) { + return new Schema.Field(name, Schema.createArray(schema), null, null); + } + + private static Schema.Field createMapField(String name, Schema.Type type) { + return createMapField(name, Schema.create(type)); + } + + private static Schema.Field createMapField(String name, Schema schema) { + return new Schema.Field(name, Schema.createMap(schema), null, null); + } + + private static Schema.Field createPrimitiveField(String name, Schema.Type type) { + return new Schema.Field(name, Schema.create(type), null, null); + } + + private static Schema createRecord(String name, Schema.Field... 
fields) { + return Schema.createRecord(name, null, null, false, Arrays.asList(fields)); + } + +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/test/scala/org/apache/spark/execution/datasources/TestHoodieInMemoryFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/test/scala/org/apache/spark/execution/datasources/TestHoodieInMemoryFileIndex.scala index 8e7f6bf14b7e..c9052a952e68 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/test/scala/org/apache/spark/execution/datasources/TestHoodieInMemoryFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/test/scala/org/apache/spark/execution/datasources/TestHoodieInMemoryFileIndex.scala @@ -18,6 +18,7 @@ package org.apache.spark.execution.datasources import org.apache.hadoop.fs.Path +import org.apache.hudi.testutils.HoodieClientTestUtils.getSparkConfForTest import org.apache.spark.sql.SparkSession import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.Test @@ -31,9 +32,7 @@ class TestHoodieInMemoryFileIndex { @Test def testCreateInMemoryIndex(@TempDir tempDir: File): Unit = { val spark = SparkSession.builder - .appName("Hoodie Datasource test") - .master("local[2]") - .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .config(getSparkConfForTest("Hoodie Datasource test")) .getOrCreate val folders: Seq[Path] = Seq( diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala index 9bb6fb6db8db..e12aad789d78 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala @@ -91,6 +91,7 @@ object HoodieProcedures { ,(ShowTablePropertiesProcedure.NAME, ShowTablePropertiesProcedure.builder) ,(HelpProcedure.NAME, HelpProcedure.builder) ,(ArchiveCommitsProcedure.NAME, ArchiveCommitsProcedure.builder) + ,(RunTTLProcedure.NAME, RunTTLProcedure.builder) ) } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunTTLProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunTTLProcedure.scala new file mode 100644 index 000000000000..2d3e704ad129 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunTTLProcedure.scala @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
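Reviewer note (annotation, not part of the patch): the run_ttl procedure registered above and implemented just below is reachable through Spark SQL's CALL syntax. A hedged usage sketch with a hypothetical table name and retention period; per the output type declared below, the result carries a single deleted_partitions column:

// Trigger partition TTL on table "h0" with a 30-day retention;
// one row per replaced/deleted partition is expected in the result.
spark.sql("call run_ttl(table => 'h0', retain_days => 30)").show(false)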
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.HoodieCLIUtils +import org.apache.hudi.client.SparkRDDWriteClient +import org.apache.hudi.config.HoodieTTLConfig +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier +import scala.collection.JavaConverters._ + +class RunTTLProcedure extends BaseProcedure with ProcedureBuilder with Logging { + + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType), + ProcedureParameter.optional(1, "ttl_policy", DataTypes.StringType), + ProcedureParameter.optional(2, "retain_days", DataTypes.IntegerType), + ProcedureParameter.optional(3, "options", DataTypes.StringType) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("deleted_partitions", DataTypes.StringType, nullable = true, Metadata.empty) + )) + + override def build: Procedure = new RunTTLProcedure + + /** + * Returns the input parameters of this procedure. + */ + override def parameters: Array[ProcedureParameter] = PARAMETERS + + /** + * Returns the type of rows produced by this procedure. + */ + override def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val tableName = getArgValueOrDefault(args, PARAMETERS(0)) + var confs: Map[String, String] = Map.empty + if (getArgValueOrDefault(args, PARAMETERS(1)).isDefined) { + confs += HoodieTTLConfig.PARTITION_TTL_STRATEGY_TYPE.key() -> getArgValueOrDefault(args, PARAMETERS(1)).get.toString + } + if (getArgValueOrDefault(args, PARAMETERS(2)).isDefined) { + confs += HoodieTTLConfig.DAYS_RETAIN.key() -> getArgValueOrDefault(args, PARAMETERS(2)).get.toString + } + if (getArgValueOrDefault(args, PARAMETERS(3)).isDefined) { + confs ++= HoodieCLIUtils.extractOptions(getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[String]) + } + + val basePath = getBasePath(tableName, Option.empty) + + var client: SparkRDDWriteClient[_] = null + try { + client = HoodieCLIUtils.createHoodieWriteClient(sparkSession, basePath, confs, + tableName.asInstanceOf[Option[String]]) + val ttlInstantTime = client.createNewInstantTime() + val hoodieTTLMeta = client.managePartitionTTL(ttlInstantTime) + if (hoodieTTLMeta == null) { + Seq.empty + } else { + hoodieTTLMeta.getPartitionToReplaceFileIds.keySet().asScala.map { p => + Row(p) + }.toSeq + } + } finally { + if (client != null) { + client.close() + } + } + } +} + +object RunTTLProcedure { + val NAME = "run_ttl" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new RunTTLProcedure + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHiveTableSchemaEvolution.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHiveTableSchemaEvolution.java index f79d54ed0d2f..806f77544231 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHiveTableSchemaEvolution.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHiveTableSchemaEvolution.java @@ -98,7 +98,8 @@ public void testHiveReadSchemaEvolutionTable(String tableType) throws Exception spark.sql("set hoodie.schema.on.read.enable=true"); spark.sql(String.format("create table %s (col0 int, col1 float, col2 string) using hudi " - + 
"tblproperties (type='%s', primaryKey='col0', preCombineField='col1') location '%s'", + + "tblproperties (type='%s', primaryKey='col0', preCombineField='col1', " + + "hoodie.compaction.payload.class='org.apache.hudi.common.model.OverwriteWithLatestAvroPayload') location '%s'", tableName, tableType, path)); spark.sql(String.format("insert into %s values(1, 1.1, 'text')", tableName)); spark.sql(String.format("update %s set col2 = 'text2' where col0 = 1", tableName)); diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java index fb4559263125..0e4dc22b8ce7 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/row/TestHoodieInternalRowParquetWriter.java @@ -131,6 +131,7 @@ private HoodieRowParquetWriteSupport getWriteSupport(HoodieWriteConfig.Builder w writeConfig.getBloomFilterFPP(), writeConfig.getDynamicBloomFilterMaxNumEntries(), writeConfig.getBloomFilterType()); - return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, Option.of(filter), writeConfig.getStorageConfig()); + return HoodieRowParquetWriteSupport.getHoodieRowParquetWriteSupport(hadoopConf, + SparkDatasetTestUtils.STRUCT_TYPE, Option.of(filter), writeConfig); } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java index 296cf3d6e0db..2fa09861d25c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java @@ -78,7 +78,7 @@ public void testNullPartitionPathFields() { @Test public void testNullRecordKeyFields() { GenericRecord record = getRecord(); - Assertions.assertThrows(StringIndexOutOfBoundsException.class, () -> { + Assertions.assertThrows(HoodieKeyException.class, () -> { ComplexKeyGenerator keyGenerator = new ComplexKeyGenerator(getPropertiesWithoutRecordKeyProp()); keyGenerator.getRecordKey(record); }); diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteRecordGenerator.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteRecordGenerator.java index df69279cc89f..4c9fc1c9ddaa 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteRecordGenerator.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteRecordGenerator.java @@ -62,7 +62,7 @@ private TypedProperties getProps() { @Test public void testNullRecordKeyFields() { GenericRecord record = getRecord(); - Assertions.assertThrows(StringIndexOutOfBoundsException.class, () -> { + Assertions.assertThrows(HoodieKeyException.class, () -> { BaseKeyGenerator keyGenerator = new GlobalDeleteKeyGenerator(getPropertiesWithoutRecordKeyProp()); keyGenerator.getRecordKey(record); }); diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestNonpartitionedKeyGenerator.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestNonpartitionedKeyGenerator.java index 
fb740d00e2a5..187f96197b1d 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestNonpartitionedKeyGenerator.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestNonpartitionedKeyGenerator.java @@ -69,7 +69,7 @@ private TypedProperties getWrongRecordKeyFieldProps() { @Test public void testNullRecordKeyFields() { GenericRecord record = getRecord(); - Assertions.assertThrows(StringIndexOutOfBoundsException.class, () -> { + Assertions.assertThrows(HoodieKeyException.class, () -> { BaseKeyGenerator keyGenerator = new NonpartitionedKeyGenerator(getPropertiesWithoutRecordKeyProp()); keyGenerator.getRecordKey(record); }); diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/HoodieSparkWriterTestBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/HoodieSparkWriterTestBase.scala new file mode 100644 index 000000000000..c0c1c2c12bd4 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/HoodieSparkWriterTestBase.scala @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.commons.io.FileUtils +import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType} +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.testutils.HoodieClientTestUtils +import org.apache.spark.SparkContext +import org.apache.spark.sql.hudi.HoodieSparkSessionExtension +import org.apache.spark.sql.{Dataset, Row, SQLContext, SparkSession} +import org.junit.jupiter.api.{AfterEach, BeforeEach} + +import scala.collection.JavaConverters + +class HoodieSparkWriterTestBase { + var spark: SparkSession = _ + var sqlContext: SQLContext = _ + var sc: SparkContext = _ + var tempPath: java.nio.file.Path = _ + var tempBootStrapPath: java.nio.file.Path = _ + var hoodieFooTableName = "hoodie_foo_tbl" + var tempBasePath: String = _ + var commonTableModifier: Map[String, String] = Map() + + case class StringLongTest(uuid: String, ts: Long) + + /** + * Setup method running before each test. + */ + @BeforeEach + def setUp(): Unit = { + initSparkContext() + tempPath = java.nio.file.Files.createTempDirectory("hoodie_test_path") + tempBootStrapPath = java.nio.file.Files.createTempDirectory("hoodie_test_bootstrap") + tempBasePath = tempPath.toAbsolutePath.toString + commonTableModifier = getCommonParams(tempPath, hoodieFooTableName, HoodieTableType.COPY_ON_WRITE.name()) + } + + /** + * Tear down method running after each test. + */ + @AfterEach + def tearDown(): Unit = { + cleanupSparkContexts() + FileUtils.deleteDirectory(tempPath.toFile) + FileUtils.deleteDirectory(tempBootStrapPath.toFile) + } + + /** + * Utility method for initializing the spark context. 
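Reviewer note on the new HoodieSparkWriterTestBase above (hypothetical sketch, not part of the patch): suites are meant to extend the base class and reuse the SparkSession, temp paths and common write params it wires up. The class name, column values and assertion below are invented for illustration:

import org.apache.hudi.{DataSourceWriteOptions, HoodieSparkWriterTestBase}
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.lit
import org.junit.jupiter.api.Test

class TestExampleSparkWriter extends HoodieSparkWriterTestBase {
  @Test
  def testSimpleWrite(): Unit = {
    // spark, tempBasePath and commonTableModifier are provided by the base class
    val df = spark.createDataFrame(Seq(("row1", 1L), ("row2", 2L)))
      .toDF("_row_key", "ts")
      .withColumn("partition", lit("p1"))
    df.write.format("hudi")
      .options(commonTableModifier)
      .option(DataSourceWriteOptions.PRECOMBINE_FIELD.key, "ts")
      .mode(SaveMode.Overwrite)
      .save(tempBasePath)
    assert(spark.read.format("hudi").load(tempBasePath).count() == 2)
  }
}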
+ */ + def initSparkContext(): Unit = { + val sparkConf = HoodieClientTestUtils.getSparkConfForTest(getClass.getSimpleName) + + spark = SparkSession.builder() + .withExtensions(new HoodieSparkSessionExtension) + .config(sparkConf) + .getOrCreate() + + sc = spark.sparkContext + sc.setLogLevel("ERROR") + sqlContext = spark.sqlContext + } + + /** + * Utility method for cleaning up spark resources. + */ + def cleanupSparkContexts(): Unit = { + if (sqlContext != null) { + sqlContext.clearCache(); + sqlContext = null; + } + if (sc != null) { + sc.stop() + sc = null + } + if (spark != null) { + spark.close() + } + } + + /** + * Utility method for creating common params for writer. + * + * @param path Path for hoodie table + * @param hoodieFooTableName Name of hoodie table + * @param tableType Type of table + * @return Map of common params + */ + def getCommonParams(path: java.nio.file.Path, hoodieFooTableName: String, tableType: String): Map[String, String] = { + Map("path" -> path.toAbsolutePath.toString, + HoodieWriteConfig.TBL_NAME.key -> hoodieFooTableName, + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + DataSourceWriteOptions.TABLE_TYPE.key -> tableType, + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", + DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.SimpleKeyGenerator") + } + + /** + * Utility method for dropping all hoodie meta related columns. + */ + def dropMetaFields(df: Dataset[Row]): Dataset[Row] = { + df.drop(HoodieRecord.HOODIE_META_COLUMNS.get(0)).drop(HoodieRecord.HOODIE_META_COLUMNS.get(1)) + .drop(HoodieRecord.HOODIE_META_COLUMNS.get(2)).drop(HoodieRecord.HOODIE_META_COLUMNS.get(3)) + .drop(HoodieRecord.HOODIE_META_COLUMNS.get(4)) + } + + /** + * Utility method for converting list of Row to list of Seq. 
+ * + * @param inputList list of Row + * @return list of Seq + */ + def convertRowListToSeq(inputList: java.util.List[Row]): Seq[Row] = + JavaConverters.asScalaIteratorConverter(inputList.iterator).asScala.toSeq + +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceDefaults.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceDefaults.scala index a2598c766b19..784ddd6c883b 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceDefaults.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceDefaults.scala @@ -262,7 +262,7 @@ class TestDataSourceDefaults extends ScalaAssertionSupport { } // Record's key field not specified - assertThrows(classOf[StringIndexOutOfBoundsException]) { + assertThrows(classOf[HoodieKeyException]) { val props = new TypedProperties() props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "partitionField") val keyGen = new ComplexKeyGenerator(props) @@ -494,7 +494,7 @@ class TestDataSourceDefaults extends ScalaAssertionSupport { val props = new TypedProperties() props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "partitionField") - assertThrows(classOf[StringIndexOutOfBoundsException]) { + assertThrows(classOf[HoodieKeyException]) { new GlobalDeleteKeyGenerator(props).getRecordKey(baseRecord) } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala index 1c6766063d24..0767d0559159 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala @@ -19,11 +19,8 @@ package org.apache.hudi import org.apache.avro.Schema import org.apache.commons.io.FileUtils -import org.apache.hudi.DataSourceWriteOptions._ -import org.apache.hudi.HoodieSparkUtils.gteqSpark3_0 import org.apache.hudi.client.SparkRDDWriteClient -import org.apache.hudi.common.model._ -import org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator +import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord, HoodieRecordPayload, HoodieTableType, WriteOperationType} import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} import org.apache.hudi.common.testutils.HoodieTestDataGenerator import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieIndexConfig, HoodieWriteConfig} @@ -31,142 +28,35 @@ import org.apache.hudi.exception.{HoodieException, SchemaCompatibilityException} import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode import org.apache.hudi.functional.TestBootstrap import org.apache.hudi.keygen.{ComplexKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator} -import org.apache.hudi.testutils.DataSourceTestUtils -import org.apache.hudi.testutils.HoodieClientTestUtils.getSparkConfForTest -import org.apache.spark.SparkContext +import org.apache.hudi.testutils.{DataSourceTestUtils, HoodieClientTestUtils} import org.apache.spark.api.java.JavaSparkContext -import org.apache.spark.sql._ +import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession} import org.apache.spark.sql.functions.{expr, lit} -import org.apache.spark.sql.hudi.HoodieSparkSessionExtension import org.apache.spark.sql.hudi.command.SqlKeyGenerator import org.junit.jupiter.api.Assertions.{assertEquals, 
assertFalse, assertNotNull, assertNull, assertTrue, fail} -import org.junit.jupiter.api.{AfterEach, BeforeEach, Disabled, Test} +import org.junit.jupiter.api.Test import org.junit.jupiter.params.ParameterizedTest -import org.junit.jupiter.params.provider.Arguments.arguments -import org.junit.jupiter.params.provider._ +import org.junit.jupiter.params.provider.{Arguments, CsvSource, EnumSource, MethodSource, ValueSource} import org.mockito.ArgumentMatchers.any import org.mockito.Mockito.{spy, times, verify} import org.scalatest.Assertions.assertThrows import org.scalatest.Matchers.{be, convertToAnyShouldWrapper, intercept} import java.io.IOException -import java.time.format.DateTimeFormatterBuilder -import java.time.temporal.ChronoField -import java.time.{Instant, ZoneId} -import java.util.{Collections, Date, TimeZone, UUID} +import java.time.Instant +import java.util.{Collections, Date, UUID} import scala.collection.JavaConversions._ -import scala.collection.JavaConverters /** * Test suite for SparkSqlWriter class. + * All tests that use {@link HoodieTimelineTimeZone.UTC} must be placed in the separate test class {@link TestHoodieSparkSqlWriterUtc}. + * Otherwise the UTC tests can loop indefinitely whenever an earlier test ran with a time zone ahead of UTC+0, + * because the static field {@link org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator.lastInstantTime} retains the previously generated instant time. */ -class TestHoodieSparkSqlWriter { - var spark: SparkSession = _ - var sqlContext: SQLContext = _ - var sc: SparkContext = _ - var tempPath: java.nio.file.Path = _ - var tempBootStrapPath: java.nio.file.Path = _ - var hoodieFooTableName = "hoodie_foo_tbl" - var tempBasePath: String = _ - var commonTableModifier: Map[String, String] = Map() - case class StringLongTest(uuid: String, ts: Long) +class TestHoodieSparkSqlWriter extends HoodieSparkWriterTestBase { /** - * Setup method running before each test. - */ - @BeforeEach - def setUp(): Unit = { - initSparkContext() - tempPath = java.nio.file.Files.createTempDirectory("hoodie_test_path") - tempBootStrapPath = java.nio.file.Files.createTempDirectory("hoodie_test_bootstrap") - tempBasePath = tempPath.toAbsolutePath.toString - commonTableModifier = getCommonParams(tempPath, hoodieFooTableName, HoodieTableType.COPY_ON_WRITE.name()) - } - - /** - * Tear down method running after each test. - */ - @AfterEach - def tearDown(): Unit = { - cleanupSparkContexts() - FileUtils.deleteDirectory(tempPath.toFile) - FileUtils.deleteDirectory(tempBootStrapPath.toFile) - } - - /** - * Utility method for initializing the spark context. - * - * TODO rebase this onto existing base class to avoid duplication - */ - def initSparkContext(): Unit = { - val sparkConf = getSparkConfForTest(getClass.getSimpleName) - - spark = SparkSession.builder() - .withExtensions(new HoodieSparkSessionExtension) - .config(sparkConf) - .getOrCreate() - - sc = spark.sparkContext - sc.setLogLevel("ERROR") - sqlContext = spark.sqlContext - } - - /** - * Utility method for cleaning up spark resources. - */ - def cleanupSparkContexts(): Unit = { - if (sqlContext != null) { - sqlContext.clearCache(); - sqlContext = null; - } - if (sc != null) { - sc.stop() - sc = null - } - if (spark != null) { - spark.close() - } - } - - /** - * Utility method for dropping all hoodie meta related columns.
- */ - def dropMetaFields(df: Dataset[Row]): Dataset[Row] = { - df.drop(HoodieRecord.HOODIE_META_COLUMNS.get(0)).drop(HoodieRecord.HOODIE_META_COLUMNS.get(1)) - .drop(HoodieRecord.HOODIE_META_COLUMNS.get(2)).drop(HoodieRecord.HOODIE_META_COLUMNS.get(3)) - .drop(HoodieRecord.HOODIE_META_COLUMNS.get(4)) - } - - /** - * Utility method for creating common params for writer. - * - * @param path Path for hoodie table - * @param hoodieFooTableName Name of hoodie table - * @param tableType Type of table - * @return Map of common params - */ - def getCommonParams(path: java.nio.file.Path, hoodieFooTableName: String, tableType: String): Map[String, String] = { - Map("path" -> path.toAbsolutePath.toString, - HoodieWriteConfig.TBL_NAME.key -> hoodieFooTableName, - "hoodie.insert.shuffle.parallelism" -> "1", - "hoodie.upsert.shuffle.parallelism" -> "1", - DataSourceWriteOptions.TABLE_TYPE.key -> tableType, - DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", - DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", - DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.SimpleKeyGenerator") - } - - /** - * Utility method for converting list of Row to list of Seq. - * - * @param inputList list of Row - * @return list of Seq - */ - def convertRowListToSeq(inputList: java.util.List[Row]): Seq[Row] = - JavaConverters.asScalaIteratorConverter(inputList.iterator).asScala.toSeq - - /** - * Utility method for performing bulk insert tests. + * Local utility method for performing bulk insert tests. * * @param sortMode Bulk insert sort mode * @param populateMetaFields Flag for populating meta fields @@ -226,12 +116,13 @@ class TestHoodieSparkSqlWriter { val originals = HoodieWriterUtils.parametersWithWriteDefaults(Map.empty) val rhsKey = "hoodie.right.hand.side.key" val rhsVal = "hoodie.right.hand.side.val" - val modifier = Map(OPERATION.key -> INSERT_OPERATION_OPT_VAL, TABLE_TYPE.key -> MOR_TABLE_TYPE_OPT_VAL, rhsKey -> rhsVal) + val modifier = Map(DataSourceWriteOptions.OPERATION.key -> DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, + DataSourceWriteOptions.TABLE_TYPE.key -> DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL, rhsKey -> rhsVal) val modified = HoodieWriterUtils.parametersWithWriteDefaults(modifier) val matcher = (k: String, v: String) => modified(k) should be(v) originals foreach { - case ("hoodie.datasource.write.operation", _) => matcher("hoodie.datasource.write.operation", INSERT_OPERATION_OPT_VAL) - case ("hoodie.datasource.write.table.type", _) => matcher("hoodie.datasource.write.table.type", MOR_TABLE_TYPE_OPT_VAL) + case ("hoodie.datasource.write.operation", _) => matcher("hoodie.datasource.write.operation", DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + case ("hoodie.datasource.write.table.type", _) => matcher("hoodie.datasource.write.table.type", DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) case (`rhsKey`, _) => matcher(rhsKey, rhsVal) case (k, v) => matcher(k, v) } @@ -243,10 +134,15 @@ class TestHoodieSparkSqlWriter { @Test def testThrowExceptionInvalidSerializer(): Unit = { spark.stop() - val session = SparkSession.builder().appName("hoodie_test").master("local").getOrCreate() + val session = SparkSession.builder() + // Here we intentionally remove the "spark.serializer" config to test failure + .config(HoodieClientTestUtils.getSparkConfForTest("hoodie_test").remove("spark.serializer")) + .getOrCreate() try { val sqlContext = session.sqlContext - val options = Map("path" -> "hoodie/test/path", HoodieWriteConfig.TBL_NAME.key -> 
"hoodie_test_tbl") + val options = Map( + "path" -> (tempPath.toUri.toString + "/testThrowExceptionInvalidSerializer/basePath"), + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test_tbl") val e = intercept[HoodieException](HoodieSparkSqlWriter.write(sqlContext, SaveMode.ErrorIfExists, options, session.emptyDataFrame)) assert(e.getMessage.contains("spark.serializer")) @@ -285,7 +181,7 @@ class TestHoodieSparkSqlWriter { assert(tableAlreadyExistException.getMessage.contains(s"${HoodieWriteConfig.TBL_NAME.key}:\thoodie_bar_tbl\thoodie_foo_tbl")) //on same path try append with delete operation and different("hoodie_bar_tbl") table name which should throw an exception - val deleteTableModifier = barTableModifier ++ Map(OPERATION.key -> "delete") + val deleteTableModifier = barTableModifier ++ Map(DataSourceWriteOptions.OPERATION.key -> "delete") val deleteCmdException = intercept[HoodieException](HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, deleteTableModifier, dataFrame2)) assert(tableAlreadyExistException.getMessage.contains("Config conflict")) assert(tableAlreadyExistException.getMessage.contains(s"${HoodieWriteConfig.TBL_NAME.key}:\thoodie_bar_tbl\thoodie_foo_tbl")) @@ -449,7 +345,7 @@ def testBulkInsertForDropPartitionColumn(): Unit = { val fooTableModifier = commonTableModifier.updated("hoodie.bulkinsert.shuffle.parallelism", "4") .updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL) .updated(DataSourceWriteOptions.ENABLE_ROW_WRITER.key, "true") - .updated(INSERT_DROP_DUPS.key, "true") + .updated(DataSourceWriteOptions.INSERT_DROP_DUPS.key, "true") // generate the inserts val schema = DataSourceTestUtils.getStructTypeExampleSchema @@ -682,10 +578,11 @@ def testBulkInsertForDropPartitionColumn(): Unit = { .setBaseFileFormat(fooTableParams.getOrElse(HoodieWriteConfig.BASE_FILE_FORMAT.key, HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().name)) .setArchiveLogFolder(HoodieTableConfig.ARCHIVELOG_FOLDER.defaultValue()) - .setPayloadClassName(PAYLOAD_CLASS_NAME.key) - .setPreCombineField(fooTableParams.getOrElse(PRECOMBINE_FIELD.key, PRECOMBINE_FIELD.defaultValue())) + .setPayloadClassName(DataSourceWriteOptions.PAYLOAD_CLASS_NAME.key) + .setPreCombineField(fooTableParams.getOrElse(DataSourceWriteOptions.PRECOMBINE_FIELD.key, DataSourceWriteOptions.PRECOMBINE_FIELD.defaultValue())) .setPartitionFields(fooTableParams(DataSourceWriteOptions.PARTITIONPATH_FIELD.key)) - .setKeyGeneratorClassProp(fooTableParams.getOrElse(KEYGENERATOR_CLASS_NAME.key, KEYGENERATOR_CLASS_NAME.defaultValue())) + .setKeyGeneratorClassProp(fooTableParams.getOrElse(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key, + DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.defaultValue())) if(addBootstrapPath) { tableMetaClientBuilder .setBootstrapBasePath(fooTableParams(HoodieBootstrapConfig.BASE_PATH.key)) @@ -1336,53 +1233,6 @@ def testBulkInsertForDropPartitionColumn(): Unit = { assert(exc.getMessage.contains("Consistent hashing bucket index does not work with COW table. 
Use simple bucket index or an MOR table.")) } - /* - * Test case for instant is generated with commit timezone when TIMELINE_TIMEZONE set to UTC - * related to HUDI-5978 - * Issue [HUDI-7275] is tracking this test being disabled - */ - @Disabled - def testInsertDatasetWithTimelineTimezoneUTC(): Unit = { - val defaultTimezone = TimeZone.getDefault - try { - val fooTableModifier = commonTableModifier.updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) - .updated(DataSourceWriteOptions.INSERT_DROP_DUPS.key, "false") - .updated(HoodieTableConfig.TIMELINE_TIMEZONE.key, "UTC") // utc timezone - - // generate the inserts - val schema = DataSourceTestUtils.getStructTypeExampleSchema - val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) - val records = DataSourceTestUtils.generateRandomRows(100) - val recordsSeq = convertRowListToSeq(records) - val df = spark.createDataFrame(sc.parallelize(recordsSeq), structType) - - // get UTC instant before write - val beforeWriteInstant = Instant.now() - - // set local timezone to America/Los_Angeles(UTC-7) - TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) - - // write to Hudi - val (success, writeInstantTimeOpt, _, _, _, hoodieTableConfig) = HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df) - assertTrue(success) - val hoodieTableTimelineTimezone = HoodieTimelineTimeZone.valueOf(hoodieTableConfig.getString(HoodieTableConfig.TIMELINE_TIMEZONE)) - assertEquals(hoodieTableTimelineTimezone, HoodieTimelineTimeZone.UTC) - - val utcFormatter = new DateTimeFormatterBuilder() - .appendPattern(HoodieInstantTimeGenerator.SECS_INSTANT_TIMESTAMP_FORMAT) - .appendValue(ChronoField.MILLI_OF_SECOND, 3) - .toFormatter - .withZone(ZoneId.of("UTC")) - // instant parsed by UTC timezone - val writeInstant = Instant.from(utcFormatter.parse(writeInstantTimeOpt.get())) - - assertTrue(beforeWriteInstant.toEpochMilli < writeInstant.toEpochMilli, - s"writeInstant(${writeInstant.toEpochMilli}) must always be greater than beforeWriteInstant(${beforeWriteInstant.toEpochMilli}) if writeInstant was generated with UTC timezone") - } finally { - TimeZone.setDefault(defaultTimezone) - } - } - private def fetchActualSchema(): Schema = { val tableMetaClient = HoodieTableMetaClient.builder() .setConf(spark.sparkContext.hadoopConfiguration) @@ -1406,19 +1256,19 @@ object TestHoodieSparkSqlWriter { // NOTE: Hudi doesn't support Orc in Spark < 3.0 // Please check HUDI-4496 for more details - val targetScenarios = if (gteqSpark3_0) { + val targetScenarios = if (HoodieSparkUtils.gteqSpark3_0) { parquetScenarios ++ orcScenarios } else { parquetScenarios } - java.util.Arrays.stream(targetScenarios.map(as => arguments(as.map(_.asInstanceOf[AnyRef]):_*))) + java.util.Arrays.stream(targetScenarios.map(as => Arguments.arguments(as.map(_.asInstanceOf[AnyRef]):_*))) } def deletePartitionsWildcardTestParams(): java.util.stream.Stream[Arguments] = { java.util.stream.Stream.of( - arguments("*5/03/1*", Seq("2016/03/15")), - arguments("2016/03/*", Seq("2015/03/16", "2015/03/17"))) + Arguments.arguments("*5/03/1*", Seq("2016/03/15")), + Arguments.arguments("2016/03/*", Seq("2015/03/16", "2015/03/17"))) } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriterUtc.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriterUtc.scala new file mode 100644 index 000000000000..ca4d23f719d7 --- /dev/null +++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriterUtc.scala @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.hudi.common.model.HoodieTimelineTimeZone +import org.apache.hudi.common.table.HoodieTableConfig +import org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator +import org.apache.hudi.testutils.DataSourceTestUtils +import org.apache.spark.sql.SaveMode +import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} +import org.junit.jupiter.api.Test + +import java.time.{Instant, ZoneId} +import java.time.format.DateTimeFormatterBuilder +import java.time.temporal.ChronoField +import java.util.TimeZone + +/** + * Test suite for SparkSqlWriter covering all cases that use {@link HoodieTimelineTimeZone.UTC}. + * Using {@link HoodieTimelineTimeZone.LOCAL} here could lead to infinite loops, because it could leave a + * value in the static field {@link HoodieInstantTimeGenerator.lastInstantTime} + * that is greater than the instant times generated with {@link HoodieTimelineTimeZone.UTC}.
+ */ +class TestHoodieSparkSqlWriterUtc extends HoodieSparkWriterTestBase { + /* + * Verifies that the commit instant is generated in the commit timezone when TIMELINE_TIMEZONE is set to UTC + * (related to HUDI-5978). + */ + @Test + def testInsertDatasetWithTimelineTimezoneUTC(): Unit = { + val defaultTimezone = TimeZone.getDefault + try { + val fooTableModifier = commonTableModifier.updated(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + .updated(DataSourceWriteOptions.INSERT_DROP_DUPS.key, "false") + .updated(HoodieTableConfig.TIMELINE_TIMEZONE.key, "UTC") // utc timezone + + // generate the inserts + val schema = DataSourceTestUtils.getStructTypeExampleSchema + val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) + val records = DataSourceTestUtils.generateRandomRows(100) + val recordsSeq = convertRowListToSeq(records) + val df = spark.createDataFrame(sc.parallelize(recordsSeq), structType) + + // get UTC instant before write + val beforeWriteInstant = Instant.now() + + // set the JVM default timezone to Asia/Novosibirsk (UTC+7), i.e. ahead of UTC + TimeZone.setDefault(TimeZone.getTimeZone("Asia/Novosibirsk")) + + // write to Hudi + val (success, writeInstantTimeOpt, _, _, _, hoodieTableConfig) = HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df) + assertTrue(success) + val hoodieTableTimelineTimezone = HoodieTimelineTimeZone.valueOf(hoodieTableConfig.getString(HoodieTableConfig.TIMELINE_TIMEZONE)) + assertEquals(hoodieTableTimelineTimezone, HoodieTimelineTimeZone.UTC) + + val utcFormatter = new DateTimeFormatterBuilder() + .appendPattern(HoodieInstantTimeGenerator.SECS_INSTANT_TIMESTAMP_FORMAT) + .appendValue(ChronoField.MILLI_OF_SECOND, 3) + .toFormatter + .withZone(ZoneId.of("UTC")) + // instant parsed by UTC timezone + val writeInstant = Instant.from(utcFormatter.parse(writeInstantTimeOpt.get())) + + assertTrue(beforeWriteInstant.toEpochMilli < writeInstant.toEpochMilli, + s"writeInstant(${writeInstant.toEpochMilli}) must always be greater than beforeWriteInstant(${beforeWriteInstant.toEpochMilli}) if writeInstant was generated with UTC timezone") + } finally { + TimeZone.setDefault(defaultTimezone) + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala index 15b6b2b35da7..85c3c619111b 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala @@ -20,6 +20,7 @@ package org.apache.hudi import org.apache.avro.generic.GenericRecord import org.apache.hudi.testutils.DataSourceTestUtils +import org.apache.hudi.testutils.HoodieClientTestUtils.getSparkConfForTest import org.apache.spark.sql.types.{ArrayType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.junit.jupiter.api.Assertions._ @@ -88,11 +89,7 @@ class TestHoodieSparkUtils { @Test def testCreateRddSchemaEvol(): Unit = { val spark = SparkSession.builder - .appName("Hoodie Datasource test") - .master("local[2]") - .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - .config("spark.kryo.registrator", "org.apache.spark.HoodieSparkKryoRegistrar") - .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") + .config(getSparkConfForTest("Hoodie Datasource test")) + .getOrCreate val schema = 
DataSourceTestUtils.getStructTypeExampleSchema @@ -126,11 +123,7 @@ class TestHoodieSparkUtils { @Test def testCreateRddWithNestedSchemas(): Unit = { val spark = SparkSession.builder - .appName("Hoodie Datasource test") - .master("local[2]") - .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - .config("spark.kryo.registrator", "org.apache.spark.HoodieSparkKryoRegistrar") - .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") + .config(getSparkConfForTest("Hoodie Datasource test")) .getOrCreate val innerStruct1 = new StructType().add("innerKey","string",false).add("innerValue", "long", true) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestTableSchemaResolverWithSparkSQL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestTableSchemaResolverWithSparkSQL.scala index d9d5b59c8d76..70886d964445 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestTableSchemaResolverWithSparkSQL.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestTableSchemaResolverWithSparkSQL.scala @@ -18,120 +18,24 @@ package org.apache.hudi import org.apache.avro.Schema -import org.apache.commons.io.FileUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hudi.avro.HoodieAvroUtils import org.apache.hudi.avro.model.HoodieMetadataRecord -import org.apache.hudi.common.model._ import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.testutils.DataSourceTestUtils -import org.apache.hudi.testutils.HoodieClientTestUtils.getSparkConfForTest -import org.apache.spark.SparkContext -import org.apache.spark.sql._ -import org.apache.spark.sql.hudi.HoodieSparkSessionExtension +import org.apache.spark.sql.SaveMode import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} -import org.junit.jupiter.api.{AfterEach, BeforeEach, Tag, Test} +import org.junit.jupiter.api.{Tag, Test} import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.CsvSource -import scala.collection.JavaConverters - /** * Test suite for TableSchemaResolver with SparkSqlWriter. */ @Tag("functional") -class TestTableSchemaResolverWithSparkSQL { - var spark: SparkSession = _ - var sqlContext: SQLContext = _ - var sc: SparkContext = _ - var tempPath: java.nio.file.Path = _ - var tempBootStrapPath: java.nio.file.Path = _ - var hoodieFooTableName = "hoodie_foo_tbl" - var tempBasePath: String = _ - var commonTableModifier: Map[String, String] = Map() - - case class StringLongTest(uuid: String, ts: Long) - - /** - * Setup method running before each test. - */ - @BeforeEach - def setUp(): Unit = { - initSparkContext() - tempPath = java.nio.file.Files.createTempDirectory("hoodie_test_path") - tempBootStrapPath = java.nio.file.Files.createTempDirectory("hoodie_test_bootstrap") - tempBasePath = tempPath.toAbsolutePath.toString - commonTableModifier = getCommonParams(tempPath, hoodieFooTableName, HoodieTableType.COPY_ON_WRITE.name()) - } - - /** - * Tear down method running after each test. - */ - @AfterEach - def tearDown(): Unit = { - cleanupSparkContexts() - FileUtils.deleteDirectory(tempPath.toFile) - FileUtils.deleteDirectory(tempBootStrapPath.toFile) - } - - /** - * Utility method for initializing the spark context. 
- */ - def initSparkContext(): Unit = { - spark = SparkSession.builder() - .config(getSparkConfForTest(hoodieFooTableName)) - .getOrCreate() - sc = spark.sparkContext - sc.setLogLevel("ERROR") - sqlContext = spark.sqlContext - } - - /** - * Utility method for cleaning up spark resources. - */ - def cleanupSparkContexts(): Unit = { - if (sqlContext != null) { - sqlContext.clearCache(); - sqlContext = null; - } - if (sc != null) { - sc.stop() - sc = null - } - if (spark != null) { - spark.close() - } - } - - /** - * Utility method for creating common params for writer. - * - * @param path Path for hoodie table - * @param hoodieFooTableName Name of hoodie table - * @param tableType Type of table - * @return Map of common params - */ - def getCommonParams(path: java.nio.file.Path, hoodieFooTableName: String, tableType: String): Map[String, String] = { - Map("path" -> path.toAbsolutePath.toString, - HoodieWriteConfig.TBL_NAME.key -> hoodieFooTableName, - "hoodie.insert.shuffle.parallelism" -> "1", - "hoodie.upsert.shuffle.parallelism" -> "1", - DataSourceWriteOptions.TABLE_TYPE.key -> tableType, - DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", - DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", - DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.SimpleKeyGenerator") - } - - /** - * Utility method for converting list of Row to list of Seq. - * - * @param inputList list of Row - * @return list of Seq - */ - def convertRowListToSeq(inputList: java.util.List[Row]): Seq[Row] = - JavaConverters.asScalaIteratorConverter(inputList.iterator).asScala.toSeq +class TestTableSchemaResolverWithSparkSQL extends HoodieSparkWriterTestBase { @Test def testTableSchemaResolverInMetadataTable(): Unit = { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderOnSpark.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderOnSpark.scala index 4da9f87e6f35..377e2dd9d7cf 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderOnSpark.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/common/table/read/TestHoodieFileGroupReaderOnSpark.scala @@ -23,9 +23,9 @@ import org.apache.avro.Schema import org.apache.hadoop.conf.Configuration import org.apache.hudi.common.config.HoodieReaderConfig.FILE_GROUP_READER_ENABLED import org.apache.hudi.common.engine.HoodieReaderContext +import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.model.{HoodieRecord, WriteOperationType} import org.apache.hudi.{AvroConversionUtils, SparkFileFormatInternalRowReaderContext} -import org.apache.hudi.common.fs.FSUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionedFile import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat @@ -50,7 +50,7 @@ class TestHoodieFileGroupReaderOnSpark extends TestHoodieFileGroupReaderBase[Int def setup() { val sparkConf = new SparkConf sparkConf.set("spark.app.name", getClass.getName) - sparkConf.set("spark.master", "local[*]") + sparkConf.set("spark.master", "local[8]") sparkConf.set("spark.default.parallelism", "4") sparkConf.set("spark.sql.shuffle.partitions", "4") sparkConf.set("spark.driver.maxResultSize", "2g") diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestBasicSchemaEvolution.scala 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestBasicSchemaEvolution.scala index dfb69da29c00..ed7437fd5101 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestBasicSchemaEvolution.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestBasicSchemaEvolution.scala @@ -19,7 +19,7 @@ package org.apache.hudi.functional import org.apache.hadoop.fs.FileSystem import org.apache.hudi.HoodieConversionUtils.toJavaOption -import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType, WriteOperationType} +import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType, OverwriteWithLatestAvroPayload} import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} import org.apache.hudi.common.util import org.apache.hudi.config.HoodieWriteConfig @@ -31,7 +31,7 @@ import org.apache.hudi.{AvroConversionUtils, DataSourceWriteOptions, ScalaAssert import org.apache.spark.sql.hudi.HoodieSparkSessionExtension import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType} import org.apache.spark.sql.{HoodieUnsafeUtils, Row, SaveMode, SparkSession, SparkSessionExtensions, functions} -import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} +import org.junit.jupiter.api.Assertions.{assertEquals} import org.junit.jupiter.api.{AfterEach, BeforeEach} import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.CsvSource @@ -49,6 +49,8 @@ class TestBasicSchemaEvolution extends HoodieSparkClientTestBase with ScalaAsser "hoodie.bulkinsert.shuffle.parallelism" -> "2", "hoodie.delete.shuffle.parallelism" -> "1", HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.key() -> "true", + HoodieWriteConfig.WRITE_PAYLOAD_CLASS_NAME.key() -> classOf[OverwriteWithLatestAvroPayload].getName, + HoodieWriteConfig.WRITE_PAYLOAD_TYPE.key() -> "OVERWRITE_LATEST_AVRO", DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp", diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala index cb0209de979c..2014db073d91 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala @@ -22,30 +22,34 @@ import org.apache.hadoop.fs.{FileSystem, Path, PathFilter} import org.apache.hudi.DataSourceWriteOptions.{INLINE_CLUSTERING_ENABLE, KEYGENERATOR_CLASS_NAME} import org.apache.hudi.HoodieConversionUtils.toJavaOption import org.apache.hudi.QuickstartUtils.{convertToStringList, getQuickstartWriteConfigs} +import org.apache.hudi.avro.AvroSchemaCompatibility.SchemaIncompatibilityType +import org.apache.hudi.client.SparkRDDWriteClient import org.apache.hudi.client.common.HoodieSparkEngineContext import org.apache.hudi.common.config.TimestampKeyGeneratorConfig.{TIMESTAMP_INPUT_DATE_FORMAT, TIMESTAMP_OUTPUT_DATE_FORMAT, TIMESTAMP_TIMEZONE_FORMAT, TIMESTAMP_TYPE_FIELD} import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieMetadataConfig} +import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType import org.apache.hudi.common.model.{HoodieRecord, 
WriteOperationType} import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline, TimelineUtils} import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.hudi.common.testutils.HoodieTestDataGenerator import org.apache.hudi.common.testutils.RawTripTestPayload.{deleteRecordsToStrings, recordsToStrings} -import org.apache.hudi.common.util +import org.apache.hudi.common.util.{ClusteringUtils, Option} +import org.apache.hudi.common.{HoodiePendingRollbackInfo, util} import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.config.metrics.HoodieMetricsConfig import org.apache.hudi.exception.ExceptionUtil.getRootCause -import org.apache.hudi.exception.HoodieException +import org.apache.hudi.exception.{HoodieException, SchemaBackwardsCompatibilityException} import org.apache.hudi.functional.CommonOptionUtils._ import org.apache.hudi.functional.TestCOWDataSource.convertColumnsToNullable import org.apache.hudi.hive.HiveSyncConfigHolder import org.apache.hudi.keygen._ import org.apache.hudi.keygen.constant.KeyGeneratorOptions import org.apache.hudi.metrics.{Metrics, MetricsReporterType} +import org.apache.hudi.table.HoodieSparkTable import org.apache.hudi.testutils.HoodieSparkClientTestBase import org.apache.hudi.util.JFunction import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers, QuickstartUtils, ScalaAssertionSupport} -import org.apache.hudi.common.fs.FSUtils import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.hudi.HoodieSparkSessionExtension @@ -97,10 +101,9 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup System.gc() } - @ParameterizedTest - @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) - def testShortNameStorage(recordType: HoodieRecordType) { - val (writeOpts, readOpts) = getWriterReaderOpts(recordType) + @Test + def testShortNameStorage(): Unit = { + val (writeOpts, readOpts) = getWriterReaderOpts() // Insert Operation val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList @@ -489,6 +492,27 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup assertEquals(snapshotDF2.count(), (validRecordsFromBatch1 + validRecordsFromBatch2)) } + @Test + def bulkInsertCompositeKeys(): Unit = { + val (writeOpts, readOpts) = getWriterReaderOpts(HoodieRecordType.AVRO) + + // Insert Operation + val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList + val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) + + val inputDf1 = inputDF.withColumn("new_col",lit("value1")) + val inputDf2 = inputDF.withColumn("new_col", lit(null).cast("String") ) + + inputDf1.union(inputDf2).write.format("hudi") + .options(writeOpts) + .option(DataSourceWriteOptions.RECORDKEY_FIELD.key, "_row_key,new_col") + .option(DataSourceWriteOptions.OPERATION.key(),"bulk_insert") + .mode(SaveMode.Overwrite) + .save(basePath) + + assertEquals(200, spark.read.format("org.apache.hudi").options(readOpts).load(basePath).count()) + } + /** * This tests the case that query by with a specified partition condition on hudi table which is * different between the value of the partition field and the actual partition path, @@ -565,10 +589,9 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup * archival should kick in and 2 commits should be archived. If schema is valid, no exception will be thrown. 
If not, * NPE will be thrown. */ - @ParameterizedTest - @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) - def testArchivalWithBulkInsert(recordType: HoodieRecordType): Unit = { - val (writeOpts, readOpts) = getWriterReaderOpts(recordType) + @Test + def testArchivalWithBulkInsert(): Unit = { + val (writeOpts, readOpts) = getWriterReaderOpts() var structType: StructType = null for (i <- 1 to 7) { @@ -697,10 +720,9 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup } } - @ParameterizedTest - @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) - def testOverWriteModeUseReplaceAction(recordType: HoodieRecordType): Unit = { - val (writeOpts, readOpts) = getWriterReaderOpts(recordType) + @Test + def testOverWriteModeUseReplaceAction(): Unit = { + val (writeOpts, readOpts) = getWriterReaderOpts() val records1 = recordsToStrings(dataGen.generateInserts("001", 5)).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") @@ -775,10 +797,9 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup assertEquals(expectedCount, hudiReadPathDF.count()) } - @ParameterizedTest - @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) - def testOverWriteTableModeUseReplaceAction(recordType: HoodieRecordType): Unit = { - val (writeOpts, readOpts) = getWriterReaderOpts(recordType) + @Test + def testOverWriteTableModeUseReplaceAction(): Unit = { + val (writeOpts, readOpts) = getWriterReaderOpts() val records1 = recordsToStrings(dataGen.generateInserts("001", 5)).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) @@ -805,10 +826,9 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup assertEquals("replacecommit", commits(1)) } - @ParameterizedTest - @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) - def testOverWriteModeUseReplaceActionOnDisJointPartitions(recordType: HoodieRecordType): Unit = { - val (writeOpts, readOpts) = getWriterReaderOpts(recordType) + @Test + def testOverWriteModeUseReplaceActionOnDisJointPartitions(): Unit = { + val (writeOpts, readOpts) = getWriterReaderOpts() // step1: Write 5 records to hoodie table for partition1 DEFAULT_FIRST_PARTITION_PATH val records1 = recordsToStrings(dataGen.generateInsertsForPartition("001", 5, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)).toList @@ -865,10 +885,9 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup assertEquals("replacecommit", commits(2)) } - @ParameterizedTest - @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) - def testOverWriteTableModeUseReplaceActionOnDisJointPartitions(recordType: HoodieRecordType): Unit = { - val (writeOpts, readOpts) = getWriterReaderOpts(recordType) + @Test + def testOverWriteTableModeUseReplaceActionOnDisJointPartitions(): Unit = { + val (writeOpts, readOpts) = getWriterReaderOpts() // step1: Write 5 records to hoodie table for partition1 DEFAULT_FIRST_PARTITION_PATH val records1 = recordsToStrings(dataGen.generateInsertsForPartition("001", 5, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)).toList @@ -1004,10 +1023,9 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup }) } - @ParameterizedTest - @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) - def testWithAutoCommitOn(recordType: 
HoodieRecordType): Unit = { - val (writeOpts, readOpts) = getWriterReaderOpts(recordType) + @Test + def testWithAutoCommitOn(): Unit = { + val (writeOpts, readOpts) = getWriterReaderOpts() val records1 = recordsToStrings(dataGen.generateInserts("000", 100)).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) @@ -1319,8 +1337,7 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup @ParameterizedTest @CsvSource(Array( - "true,false,AVRO", "true,true,AVRO", "false,true,AVRO", "false,false,AVRO", - "true,false,SPARK", "true,true,SPARK", "false,true,SPARK", "false,false,SPARK" + "true,false,AVRO", "true,true,AVRO", "false,true,AVRO", "false,false,AVRO" )) def testQueryCOWWithBasePathAndFileIndex(partitionEncode: Boolean, isMetadataEnabled: Boolean, recordType: HoodieRecordType): Unit = { testPartitionPruning( @@ -1517,11 +1534,10 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup } } - @ParameterizedTest - @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) - def testSaveAsTableInDifferentModes(recordType: HoodieRecordType): Unit = { + @Test + def testSaveAsTableInDifferentModes(): Unit = { val options = scala.collection.mutable.Map.empty ++ commonOpts ++ Map("path" -> basePath) - val (writeOpts, readOpts) = getWriterReaderOpts(recordType, options.toMap) + val (writeOpts, readOpts) = getWriterReaderOpts(HoodieRecordType.AVRO, options.toMap) // first use the Overwrite mode val records1 = recordsToStrings(dataGen.generateInserts("001", 5)).toList @@ -1584,10 +1600,9 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup assertEquals(spark.read.format("hudi").options(readOpts).load(basePath).count(), 9) } - @ParameterizedTest - @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) - def testMetricsReporterViaDataSource(recordType: HoodieRecordType): Unit = { - val (writeOpts, _) = getWriterReaderOpts(recordType, getQuickstartWriteConfigs.asScala.toMap) + @Test + def testMetricsReporterViaDataSource(): Unit = { + val (writeOpts, _) = getWriterReaderOpts(HoodieRecordType.AVRO, getQuickstartWriteConfigs.asScala.toMap) val dataGenerator = new QuickstartUtils.DataGenerator() val records = convertToStringList(dataGenerator.generateInserts(10)) @@ -1681,7 +1696,7 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup }) } - def getWriterReaderOpts(recordType: HoodieRecordType, + def getWriterReaderOpts(recordType: HoodieRecordType = HoodieRecordType.AVRO, opt: Map[String, String] = commonOpts, enableFileIndex: Boolean = DataSourceReadOptions.ENABLE_HOODIE_FILE_INDEX.defaultValue()): (Map[String, String], Map[String, String]) = { @@ -1773,8 +1788,8 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup (df2.write.format("hudi").options(hudiOptions).mode("append").save(basePath)) fail("Option succeeded, but was expected to fail.") } catch { - case ex: org.apache.hudi.exception.HoodieInsertException => { - assertTrue(ex.getMessage.equals("Failed insert schema compatibility check")) + case ex: SchemaBackwardsCompatibilityException => { + assertTrue(ex.getMessage.contains(SchemaIncompatibilityType.READER_FIELD_MISSING_DEFAULT_VALUE.name())) } case ex: Exception => { fail(ex) @@ -1783,7 +1798,7 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup // Try adding the string column again. 
This operation is expected to succeed since 'MAKE_NEW_COLUMNS_NULLABLE' // parameter has been set to 'true'. - hudiOptions = hudiOptions + (HoodieCommonConfig.MAKE_NEW_COLUMNS_NULLABLE.key() -> "true") + hudiOptions = hudiOptions + (HoodieCommonConfig.SET_NULL_FOR_MISSING_COLUMNS.key() -> "true") try { (df2.write.format("hudi").options(hudiOptions).mode("append").save(basePath)) } catch { @@ -1807,9 +1822,9 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup } @ParameterizedTest - @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) - def testInsertOverwriteCluster(recordType: HoodieRecordType): Unit = { - val (writeOpts, _) = getWriterReaderOpts(recordType) + @EnumSource(value = classOf[HoodieInstant.State], names = Array("REQUESTED", "INFLIGHT", "COMPLETED")) + def testInsertOverwriteCluster(firstClusteringState: HoodieInstant.State): Unit = { + val (writeOpts, _) = getWriterReaderOpts() // Insert Operation val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList @@ -1819,6 +1834,7 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup INLINE_CLUSTERING_ENABLE.key() -> "true", "hoodie.clustering.inline.max.commits" -> "2", "hoodie.clustering.plan.strategy.sort.columns" -> "_row_key", + "hoodie.clustering.plan.strategy.max.num.groups" -> "1", "hoodie.insert.shuffle.parallelism" -> "4", "hoodie.upsert.shuffle.parallelism" -> "4", DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", @@ -1831,7 +1847,15 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup .mode(SaveMode.Overwrite) .save(basePath) - for (i <- 1 until 6) { + val metaClient = HoodieTableMetaClient.builder() + .setBasePath(basePath) + .setConf(hadoopConf) + .build() + + assertTrue(metaClient.getActiveTimeline.getLastClusteringInstant.isEmpty) + + var lastClustering: HoodieInstant = null + for (i <- 1 until 4) { val records = recordsToStrings(dataGen.generateInsertsForPartition("00" + i, 10, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)).toList val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) inputDF.write.format("hudi") @@ -1839,21 +1863,72 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OVERWRITE_OPERATION_OPT_VAL) .mode(SaveMode.Append) .save(basePath) + val lastInstant = metaClient.reloadActiveTimeline.getCommitsTimeline.lastInstant.get + if (i == 1 || i == 3) { + // Last instant is clustering + assertTrue(TimelineUtils.getCommitMetadata(lastInstant, metaClient.getActiveTimeline) + .getOperationType.equals(WriteOperationType.CLUSTER)) + assertTrue(ClusteringUtils.isClusteringInstant(metaClient.getActiveTimeline, lastInstant)) + lastClustering = lastInstant + assertEquals( + lastClustering, + metaClient.getActiveTimeline.getLastClusteringInstant.get) + } else { + assertTrue(TimelineUtils.getCommitMetadata(lastInstant, metaClient.getActiveTimeline) + .getOperationType.equals(WriteOperationType.INSERT_OVERWRITE)) + assertFalse(ClusteringUtils.isClusteringInstant(metaClient.getActiveTimeline, lastInstant)) + assertEquals( + lastClustering, + metaClient.getActiveTimeline.getLastClusteringInstant.get) + } + if (i == 1) { + val writeConfig = HoodieWriteConfig.newBuilder() + .forTable("hoodie_test") + .withPath(basePath) + .withProps(optsWithCluster) + .build() + if (firstClusteringState == HoodieInstant.State.INFLIGHT + || firstClusteringState == 
HoodieInstant.State.REQUESTED) { + // Move the clustering to inflight for testing + fs.delete(new Path(metaClient.getMetaPath, lastInstant.getFileName), false) + val inflightClustering = metaClient.reloadActiveTimeline.lastInstant.get + assertTrue(inflightClustering.isInflight) + assertEquals( + inflightClustering, + metaClient.getActiveTimeline.getLastClusteringInstant.get) + } + if (firstClusteringState == HoodieInstant.State.REQUESTED) { + val table = HoodieSparkTable.create(writeConfig, context) + table.rollbackInflightClustering( + metaClient.getActiveTimeline.getLastClusteringInstant.get, + new java.util.function.Function[String, Option[HoodiePendingRollbackInfo]] { + override def apply(commitToRollback: String): Option[HoodiePendingRollbackInfo] = { + new SparkRDDWriteClient(context, writeConfig).getTableServiceClient + .getPendingRollbackInfo(table.getMetaClient, commitToRollback, false) + } + }) + val requestedClustering = metaClient.reloadActiveTimeline.getCommitsTimeline.lastInstant.get + assertTrue(requestedClustering.isRequested) + assertEquals( + requestedClustering, + metaClient.getActiveTimeline.getLastClusteringInstant.get) + } + // This should not schedule any new clustering + new SparkRDDWriteClient(context, writeConfig) + .scheduleClustering(org.apache.hudi.common.util.Option.of(Map[String, String]())) + assertEquals(lastInstant.getTimestamp, + metaClient.reloadActiveTimeline.getCommitsTimeline.lastInstant.get.getTimestamp) + } } - - val metaClient = HoodieTableMetaClient.builder() - .setBasePath(basePath) - .setConf(hadoopConf) - .build() - val timeline = metaClient.getActiveTimeline - val instants = timeline.getAllCommitsTimeline.filterCompletedInstants.getInstants - assertEquals(9, instants.size) + val timeline = metaClient.reloadActiveTimeline + val instants = timeline.getCommitsTimeline.getInstants + assertEquals(6, instants.size) val replaceInstants = instants.filter(i => i.getAction.equals(HoodieTimeline.REPLACE_COMMIT_ACTION)).toList - assertEquals(8, replaceInstants.size) + assertEquals(5, replaceInstants.size) val clusterInstants = replaceInstants.filter(i => { TimelineUtils.getCommitMetadata(i, metaClient.getActiveTimeline).getOperationType.equals(WriteOperationType.CLUSTER) }) - assertEquals(3, clusterInstants.size) + assertEquals(2, clusterInstants.size) } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestGetPartitionValuesFromPath.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestGetPartitionValuesFromPath.scala index aadd9397f47d..9b6feacca0f1 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestGetPartitionValuesFromPath.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestGetPartitionValuesFromPath.scala @@ -18,7 +18,7 @@ package org.apache.hudi.functional -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestGetPartitionValuesFromPath extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala index 1cfd0e7fbbaf..896219ed6a83 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala @@ 
-27,7 +27,7 @@ import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType import org.apache.hudi.common.model._ import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.testutils.HoodieTestDataGenerator -import org.apache.hudi.common.testutils.RawTripTestPayload.{recordToString, recordsToStrings} +import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings import org.apache.hudi.common.util import org.apache.hudi.config.{HoodieCompactionConfig, HoodieIndexConfig, HoodieWriteConfig} import org.apache.hudi.functional.TestCOWDataSource.convertColumnsToNullable @@ -43,7 +43,7 @@ import org.apache.spark.sql.types.BooleanType import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} import org.junit.jupiter.params.ParameterizedTest -import org.junit.jupiter.params.provider.{CsvSource, EnumSource} +import org.junit.jupiter.params.provider.{CsvSource, EnumSource, ValueSource} import org.slf4j.LoggerFactory import java.util.function.Consumer @@ -950,10 +950,9 @@ class TestMORDataSource extends HoodieSparkClientTestBase with SparkDatasetMixin assertEquals(20, spark.read.format("hudi").options(readOpts).load(basePath).count()) } - @ParameterizedTest - @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) - def testTempFilesCleanForClustering(recordType: HoodieRecordType): Unit = { - val (writeOpts, readOpts) = getWriterReaderOpts(recordType) + @Test + def testTempFilesCleanForClustering(): Unit = { + val (writeOpts, readOpts) = getWriterReaderOpts() val records1 = recordsToStrings(dataGen.generateInserts("001", 1000)).asScala val inputDF1: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records1, 2)) @@ -1232,9 +1231,8 @@ class TestMORDataSource extends HoodieSparkClientTestBase with SparkDatasetMixin * The read-optimized query should read `fg1_dc1.parquet` only in this case. 
*/ @ParameterizedTest - @CsvSource(Array("true,AVRO", "true,SPARK", "false,AVRO", "false,SPARK")) - def testReadOptimizedQueryAfterInflightCompactionAndCompletedDeltaCommit(enableFileIndex: Boolean, - recordType: HoodieRecordType): Unit = { + @ValueSource(booleans = Array(true, false)) + def testReadOptimizedQueryAfterInflightCompactionAndCompletedDeltaCommit(enableFileIndex: Boolean): Unit = { val (tableName, tablePath) = ("hoodie_mor_ro_read_test_table", s"${basePath}_mor_test_table") val precombineField = "col3" val recordKeyField = "key" @@ -1252,7 +1250,7 @@ class TestMORDataSource extends HoodieSparkClientTestBase with SparkDatasetMixin "hoodie.upsert.shuffle.parallelism" -> "1") val pathForROQuery = getPathForROQuery(tablePath, !enableFileIndex, 0) - val (writeOpts, readOpts) = getWriterReaderOpts(recordType, options, enableFileIndex) + val (writeOpts, readOpts) = getWriterReaderOpts(HoodieRecordType.AVRO, options, enableFileIndex) // First batch with all inserts // Deltacommit1 (DC1, completed), writing file group 1 (fg1) @@ -1385,7 +1383,7 @@ class TestMORDataSource extends HoodieSparkClientTestBase with SparkDatasetMixin assertEquals(inputRows, readRows) } - def getWriterReaderOpts(recordType: HoodieRecordType, + def getWriterReaderOpts(recordType: HoodieRecordType = HoodieRecordType.AVRO, opt: Map[String, String] = commonOpts, enableFileIndex: Boolean = DataSourceReadOptions.ENABLE_HOODIE_FILE_INDEX.defaultValue()): (Map[String, String], Map[String, String]) = { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestRecordLevelIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestRecordLevelIndex.scala index 56866e7bf40a..b5304cd2e23c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestRecordLevelIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestRecordLevelIndex.scala @@ -367,7 +367,7 @@ class TestRecordLevelIndex extends RecordLevelIndexTestBase { saveMode = SaveMode.Append) hudiOpts += (HoodieMetadataConfig.RECORD_INDEX_ENABLE_PROP.key -> "false") - metaClient.getTableConfig.setMetadataPartitionState(metaClient, MetadataPartitionType.RECORD_INDEX, false) + metaClient.getTableConfig.setMetadataPartitionState(metaClient, MetadataPartitionType.RECORD_INDEX.getPartitionPath, false) doWriteAndValidateDataAndRecordIndex(hudiOpts, operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestRecordLevelIndexWithSQL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestRecordLevelIndexWithSQL.scala index 8e235960fba3..97fdc1e10b21 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestRecordLevelIndexWithSQL.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestRecordLevelIndexWithSQL.scala @@ -26,7 +26,8 @@ import org.apache.spark.sql.SaveMode import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Expression, GreaterThan, GreaterThanOrEqual, In, Literal, Or} import org.apache.spark.sql.types.StringType import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} -import org.junit.jupiter.api.Tag +import org.junit.jupiter.api.io.TempDir +import org.junit.jupiter.api.{Tag, Test} import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.ValueSource @@ -155,4 +156,36 @@ 
class TestRecordLevelIndexWithSQL extends RecordLevelIndexTestBase { val readDf = spark.read.format("hudi").options(hudiOpts).load(basePath) readDf.registerTempTable(sqlTempTable) } + + @Test + def testInFilterOnNonRecordKey(): Unit = { + var hudiOpts = commonOpts + hudiOpts = hudiOpts + ( + DataSourceWriteOptions.TABLE_TYPE.key -> HoodieTableType.COPY_ON_WRITE.name(), + DataSourceReadOptions.ENABLE_DATA_SKIPPING.key -> "true") + + val dummyTablePath = tempDir.resolve("dummy_table").toAbsolutePath.toString + spark.sql( + s""" + |create table dummy_table ( + | record_key_col string, + | not_record_key_col string, + | partition_key_col string + |) using hudi + | options ( + | primaryKey ='record_key_col', + | hoodie.metadata.enable = 'true', + | hoodie.metadata.record.index.enable = 'true', + | hoodie.datasource.write.recordkey.field = 'record_key_col', + | hoodie.enable.data.skipping = 'true' + | ) + | partitioned by(partition_key_col) + | location '$dummyTablePath' + """.stripMargin) + spark.sql(s"insert into dummy_table values('row1', 'row2', 'p1')") + spark.sql(s"insert into dummy_table values('row2', 'row1', 'p2')") + spark.sql(s"insert into dummy_table values('row3', 'row1', 'p2')") + + assertEquals(2, spark.read.format("hudi").options(hudiOpts).load(dummyTablePath).filter("not_record_key_col in ('row1', 'abc')").count()) + } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSparkSqlCoreFlow.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSparkSqlCoreFlow.scala index b554aa735ec8..80d151d5b5ed 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSparkSqlCoreFlow.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSparkSqlCoreFlow.scala @@ -28,19 +28,16 @@ import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.table.timeline.TimelineUtils import org.apache.hudi.common.testutils.HoodieTestDataGenerator import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings +import org.apache.hudi.hadoop.fs.HadoopFSUtils import org.apache.hudi.keygen.NonpartitionedKeyGenerator import org.apache.hudi.{DataSourceReadOptions, HoodieSparkUtils} -import org.apache.hudi.common.fs.FSUtils -import org.apache.hudi.hadoop.fs.HadoopFSUtils - import org.apache.spark.sql -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase import org.apache.spark.sql.{Dataset, Row} import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} import org.scalatest.Inspectors.forAll import java.io.File - import scala.collection.JavaConversions._ @SparkSQLCoreFlow diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSqlStatement.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSqlStatement.scala index e120cc00fc57..607b99e87b85 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSqlStatement.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSqlStatement.scala @@ -18,7 +18,7 @@ package org.apache.hudi.functional import org.apache.hudi.common.util.FileIOUtils -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestSqlStatement extends HoodieSparkSqlTestBase { val STATE_INIT = 0 diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/execution/benchmark/SpaceCurveOptimizeBenchmark.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/execution/benchmark/SpaceCurveOptimizeBenchmark.scala index 273303fdae63..b185a44dc6f1 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/execution/benchmark/SpaceCurveOptimizeBenchmark.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/execution/benchmark/SpaceCurveOptimizeBenchmark.scala @@ -23,7 +23,7 @@ import org.apache.hudi.ColumnStatsIndexHelper.buildColumnStatsTableFor import org.apache.hudi.config.HoodieClusteringConfig.LayoutOptimizationStrategy import org.apache.hudi.sort.SpaceCurveSortingHelper import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase import org.apache.spark.sql.types.{IntegerType, StructField} import org.junit.jupiter.api.{Disabled, Tag, Test} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestFunctionalIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestFunctionalIndex.scala index a555378713a6..727daef00583 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestFunctionalIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestFunctionalIndex.scala @@ -27,12 +27,13 @@ import org.apache.hudi.common.util.Option import org.apache.hudi.hive.HiveSyncConfigHolder._ import org.apache.hudi.hive.{HiveSyncTool, HoodieHiveSyncClient} import org.apache.hudi.hive.testutils.HiveTestUtil +import org.apache.hudi.metadata.MetadataPartitionType import org.apache.hudi.sync.common.HoodieSyncConfig.{META_SYNC_BASE_PATH, META_SYNC_DATABASE_NAME, META_SYNC_NO_PARTITION_METADATA, META_SYNC_TABLE_NAME} import org.apache.spark.sql.catalyst.analysis.Analyzer import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.parser.ParserInterface -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase import org.apache.spark.sql.hudi.command.{CreateIndexCommand, ShowIndexesCommand} +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} class TestFunctionalIndex extends HoodieSparkSqlTestBase { @@ -186,7 +187,9 @@ class TestFunctionalIndex extends HoodieSparkSqlTestBase { | options ( | primaryKey ='id', | type = '$tableType', - | preCombineField = 'ts' + | preCombineField = 'ts', + | hoodie.metadata.record.index.enable = 'true', + | hoodie.datasource.write.recordkey.field = 'id' | ) | partitioned by(ts) | location '$basePath' @@ -195,6 +198,13 @@ class TestFunctionalIndex extends HoodieSparkSqlTestBase { spark.sql(s"insert into $tableName values(2, 'a2', 10, 1001)") spark.sql(s"insert into $tableName values(3, 'a3', 10, 1002)") + var metaClient = HoodieTableMetaClient.builder() + .setBasePath(basePath) + .setConf(spark.sessionState.newHadoopConf()) + .build() + + assert(metaClient.getTableConfig.isMetadataPartitionAvailable(MetadataPartitionType.RECORD_INDEX)) + val sqlParser: ParserInterface = spark.sessionState.sqlParser val analyzer: Analyzer = spark.sessionState.analyzer @@ -202,8 +212,9 @@ class TestFunctionalIndex extends HoodieSparkSqlTestBase { var resolvedLogicalPlan = analyzer.execute(logicalPlan) 
assertTableIdentifier(resolvedLogicalPlan.asInstanceOf[ShowIndexesCommand].table, databaseName, tableName) - val createIndexSql = s"create index idx_datestr on $tableName using column_stats(ts) options(func='from_unixtime', format='yyyy-MM-dd')" + var createIndexSql = s"create index idx_datestr on $tableName using column_stats(ts) options(func='from_unixtime', format='yyyy-MM-dd')" logicalPlan = sqlParser.parsePlan(createIndexSql) + resolvedLogicalPlan = analyzer.execute(logicalPlan) assertTableIdentifier(resolvedLogicalPlan.asInstanceOf[CreateIndexCommand].table, databaseName, tableName) assertResult("idx_datestr")(resolvedLogicalPlan.asInstanceOf[CreateIndexCommand].indexName) @@ -211,14 +222,32 @@ class TestFunctionalIndex extends HoodieSparkSqlTestBase { assertResult(false)(resolvedLogicalPlan.asInstanceOf[CreateIndexCommand].ignoreIfExists) spark.sql(createIndexSql) - val metaClient = HoodieTableMetaClient.builder() + metaClient = HoodieTableMetaClient.builder() .setBasePath(basePath) .setConf(spark.sessionState.newHadoopConf()) .build() assertTrue(metaClient.getFunctionalIndexMetadata.isPresent) - val functionalIndexMetadata = metaClient.getFunctionalIndexMetadata.get() + var functionalIndexMetadata = metaClient.getFunctionalIndexMetadata.get() assertEquals(1, functionalIndexMetadata.getIndexDefinitions.size()) assertEquals("func_index_idx_datestr", functionalIndexMetadata.getIndexDefinitions.get("func_index_idx_datestr").getIndexName) + + // Verify one can create more than one functional index + createIndexSql = s"create index name_lower on $tableName using column_stats(ts) options(func='identity')" + spark.sql(createIndexSql) + metaClient = HoodieTableMetaClient.builder() + .setBasePath(basePath) + .setConf(spark.sessionState.newHadoopConf()) + .build() + functionalIndexMetadata = metaClient.getFunctionalIndexMetadata.get() + assertEquals(2, functionalIndexMetadata.getIndexDefinitions.size()) + assertEquals("func_index_name_lower", functionalIndexMetadata.getIndexDefinitions.get("func_index_name_lower").getIndexName) + + // Ensure that both the indexes are tracked correctly in metadata partition config + val mdtPartitions = metaClient.getTableConfig.getMetadataPartitions + assert(mdtPartitions.contains("func_index_name_lower") && mdtPartitions.contains("func_index_idx_datestr")) + + // [HUDI-7472] After creating functional index, the existing MDT partitions should still be available + assert(metaClient.getTableConfig.isMetadataPartitionAvailable(MetadataPartitionType.RECORD_INDEX)) } } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestIndexSyntax.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestIndexSyntax.scala index 43b4063260fa..158d8ca4f018 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestIndexSyntax.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestIndexSyntax.scala @@ -23,8 +23,8 @@ import org.apache.hudi.HoodieSparkUtils import org.apache.spark.sql.catalyst.analysis.Analyzer import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.parser.ParserInterface -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase import org.apache.spark.sql.hudi.command.{CreateIndexCommand, DropIndexCommand, ShowIndexesCommand} +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestIndexSyntax extends HoodieSparkSqlTestBase { 
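The reworked TestFunctionalIndex assertions above reduce to the pattern sketched here (a minimal sketch, assuming the test fixture supplies `spark` and `basePath` and the table was created with the record index enabled, as in the test): functional index definitions are recorded in the table's functional index metadata, and each created index is additionally tracked as a metadata table partition without dropping partitions that already existed (HUDI-7472).

    import org.apache.hudi.common.table.HoodieTableMetaClient
    import org.apache.hudi.metadata.MetadataPartitionType

    // Re-read the table config after the CREATE INDEX statements have run.
    val metaClient = HoodieTableMetaClient.builder()
      .setBasePath(basePath)
      .setConf(spark.sessionState.newHadoopConf())
      .build()

    // Index definitions live in the functional index metadata ...
    assert(metaClient.getFunctionalIndexMetadata.isPresent)
    assert(metaClient.getFunctionalIndexMetadata.get()
      .getIndexDefinitions.containsKey("func_index_idx_datestr"))

    // ... and each index is also tracked as a metadata table partition,
    // alongside partitions that were already available (e.g. the record index).
    val mdtPartitions = metaClient.getTableConfig.getMetadataPartitions
    assert(mdtPartitions.contains("func_index_idx_datestr"))
    assert(metaClient.getTableConfig.isMetadataPartitionAvailable(MetadataPartitionType.RECORD_INDEX))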
diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestSecondaryIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestSecondaryIndex.scala index 816fecc38f51..26f1a901d89c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestSecondaryIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestSecondaryIndex.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.hudi.command.index import org.apache.hudi.HoodieSparkUtils -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestSecondaryIndex extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/HoodieSparkSqlTestBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/HoodieSparkSqlTestBase.scala similarity index 98% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/HoodieSparkSqlTestBase.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/HoodieSparkSqlTestBase.scala index b9628d05af14..b101e838c841 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/HoodieSparkSqlTestBase.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/HoodieSparkSqlTestBase.scala @@ -15,12 +15,11 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.common import org.apache.hadoop.fs.Path import org.apache.hudi.HoodieSparkRecordMerger import org.apache.hudi.common.config.HoodieStorageConfig -import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.model.HoodieAvroRecordMerger import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType import org.apache.hudi.common.table.HoodieTableMetaClient @@ -30,10 +29,9 @@ import org.apache.hudi.exception.ExceptionUtil.getRootCause import org.apache.hudi.hadoop.fs.HadoopFSUtils import org.apache.hudi.index.inmemory.HoodieInMemoryHashIndex import org.apache.hudi.testutils.HoodieClientTestUtils.getSparkConfForTest - import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase.checkMessageContains +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.checkMessageContains import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.util.Utils import org.joda.time.DateTimeZone diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieInternalRowUtils.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestHoodieInternalRowUtils.scala similarity index 99% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieInternalRowUtils.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestHoodieInternalRowUtils.scala index 35afff918b9f..2ce4393c6a8c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieInternalRowUtils.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestHoodieInternalRowUtils.scala @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.common import org.apache.avro.generic.GenericData import org.apache.avro.{LogicalTypes, Schema} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieOptionConfig.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestHoodieOptionConfig.scala similarity index 95% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieOptionConfig.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestHoodieOptionConfig.scala index 985300c44c25..42db138671e9 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieOptionConfig.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestHoodieOptionConfig.scala @@ -15,12 +15,13 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.common import org.apache.hudi.DataSourceWriteOptions import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodieRecordMerger, OverwriteWithLatestAvroPayload} import org.apache.hudi.common.table.HoodieTableConfig import org.apache.hudi.testutils.SparkClientFunctionalTestHarness +import org.apache.spark.sql.hudi.HoodieOptionConfig import org.apache.spark.sql.types._ import org.junit.jupiter.api.Assertions.assertTrue import org.junit.jupiter.api.Test @@ -35,14 +36,14 @@ class TestHoodieOptionConfig extends SparkClientFunctionalTestHarness { assertTrue(with1.size == 5) assertTrue(with1("primaryKey") == "id") assertTrue(with1("type") == "cow") - assertTrue(with1("payloadClass") == classOf[OverwriteWithLatestAvroPayload].getName) + assertTrue(with1("payloadClass") == classOf[DefaultHoodieRecordPayload].getName) assertTrue(with1("recordMergerStrategy") == HoodieRecordMerger.DEFAULT_MERGER_STRATEGY_UUID) assertTrue(with1("payloadType") == DataSourceWriteOptions.PAYLOAD_TYPE.defaultValue) val ops2 = Map("primaryKey" -> "id", "preCombineField" -> "timestamp", "type" -> "mor", - "payloadClass" -> classOf[DefaultHoodieRecordPayload].getName, + "payloadClass" -> classOf[OverwriteWithLatestAvroPayload].getName, "recordMergerStrategy" -> HoodieRecordMerger.DEFAULT_MERGER_STRATEGY_UUID, "payloadType" -> DataSourceWriteOptions.PAYLOAD_TYPE.defaultValue ) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestLazyPartitionPathFetching.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestLazyPartitionPathFetching.scala similarity index 99% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestLazyPartitionPathFetching.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestLazyPartitionPathFetching.scala index e2635c0cba87..aa6cd64fcb3e 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestLazyPartitionPathFetching.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestLazyPartitionPathFetching.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.common class TestLazyPartitionPathFetching extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestNestedSchemaPruningOptimization.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestNestedSchemaPruningOptimization.scala similarity index 99% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestNestedSchemaPruningOptimization.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestNestedSchemaPruningOptimization.scala index cd4f90da4e79..62b2352d9c7f 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestNestedSchemaPruningOptimization.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestNestedSchemaPruningOptimization.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.common import org.apache.hudi.common.config.HoodieCommonConfig import org.apache.hudi.config.HoodieWriteConfig diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestPartitionPushDownWhenListingPaths.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestPartitionPushDownWhenListingPaths.scala similarity index 99% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestPartitionPushDownWhenListingPaths.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestPartitionPushDownWhenListingPaths.scala index 1b5e590913f3..7740da5e664c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestPartitionPushDownWhenListingPaths.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestPartitionPushDownWhenListingPaths.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.common import org.apache.hudi.common.config.HoodieMetadataConfig diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSqlConf.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestSqlConf.scala similarity index 99% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSqlConf.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestSqlConf.scala index dbf6d173865e..26b21e95437b 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSqlConf.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/common/TestSqlConf.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.common import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -23,12 +23,11 @@ import org.apache.hudi.DataSourceReadOptions._ import org.apache.hudi.common.config.DFSPropertiesConfiguration import org.apache.hudi.common.model.HoodieTableType import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} +import org.scalatest.BeforeAndAfter import java.io.File import java.nio.file.{Files, Paths} -import org.scalatest.BeforeAndAfter - class TestSqlConf extends HoodieSparkSqlTestBase with BeforeAndAfter { def setEnv(key: String, value: String): String = { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestAlterTable.scala similarity index 99% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestAlterTable.scala index b3cd9e497f55..268f5a87bc16 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestAlterTable.scala @@ -15,12 +15,14 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.ddl import org.apache.hudi.HoodieSparkUtils import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase import org.junit.jupiter.api.Assertions.assertFalse import scala.collection.JavaConverters._ diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableAddPartition.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestAlterTableAddPartition.scala similarity index 97% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableAddPartition.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestAlterTableAddPartition.scala index 68c6c115448e..3ad6f113a827 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableAddPartition.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestAlterTableAddPartition.scala @@ -15,12 +15,14 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.ddl + +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestAlterTableAddPartition extends HoodieSparkSqlTestBase { test("Add partition for non-partitioned table") { - withTable(generateTableName){ tableName => + withTable(generateTableName) { tableName => // create table spark.sql( s""" diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestAlterTableDropPartition.scala similarity index 98% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestAlterTableDropPartition.scala index a032f0c3081c..fc9c2c55b7b8 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestAlterTableDropPartition.scala @@ -15,19 +15,20 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.ddl import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.avro.model.{HoodieCleanMetadata, HoodieCleanPartitionMetadata} import org.apache.hudi.common.model.{HoodieCleaningPolicy, HoodieCommitMetadata} import org.apache.hudi.common.table.HoodieTableMetaClient -import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieInstant} +import org.apache.hudi.common.table.timeline.HoodieInstant import org.apache.hudi.common.util.{PartitionPathEncodeUtils, StringUtils, Option => HOption} import org.apache.hudi.config.{HoodieCleanConfig, HoodieWriteConfig} import org.apache.hudi.keygen.{ComplexKeyGenerator, SimpleKeyGenerator} import org.apache.hudi.{HoodieCLIUtils, HoodieSparkUtils} import org.apache.spark.sql.SaveMode -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase.getLastCleanMetadata +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.getLastCleanMetadata import org.junit.jupiter.api.Assertions import org.junit.jupiter.api.Assertions.assertTrue @@ -621,7 +622,7 @@ class TestAlterTableDropPartition extends HoodieSparkSqlTestBase { } test("Test drop partition with wildcards") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => val tableName = generateTableName spark.sql( @@ -653,6 +654,6 @@ class TestAlterTableDropPartition extends HoodieSparkSqlTestBase { Seq("2023-09-01") ) } - }) + } } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestCreateTable.scala similarity index 99% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestCreateTable.scala index 36e585eef98f..5e6beb2ef00b 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestCreateTable.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.ddl import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.HoodieSparkUtils @@ -28,7 +28,9 @@ import org.apache.hudi.keygen.constant.KeyGeneratorType import org.apache.spark.sql.SaveMode import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogTableType, HoodieCatalogTable} -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase.getLastCommitMetadata +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.getLastCommitMetadata import org.apache.spark.sql.types._ import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue} @@ -535,9 +537,10 @@ class TestCreateTable extends HoodieSparkSqlTestBase { )(table.schema.fields) // Should not include non.hoodie.property - assertResult(2)(table.properties.size) + assertResult(3)(table.properties.size) assertResult("cow")(table.properties("type")) assertResult("id,name")(table.properties("primaryKey")) + assertResult("hudi")(table.properties("provider")) } } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestSpark3DDL.scala similarity index 92% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestSpark3DDL.scala index 6a64c69021c8..5e43d714a5ec 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/ddl/TestSpark3DDL.scala @@ -15,21 +15,22 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.ddl import org.apache.hadoop.fs.Path -import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD_OPT_KEY, PRECOMBINE_FIELD_OPT_KEY, RECORDKEY_FIELD_OPT_KEY, SPARK_SQL_INSERT_INTO_OPERATION, TABLE_NAME} -import org.apache.hudi.QuickstartUtils.{DataGenerator, convertToStringList, getQuickstartWriteConfigs} import org.apache.hudi.common.config.HoodieStorageConfig import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.hudi.common.testutils.{HoodieTestDataGenerator, RawTripTestPayload} import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.index.inmemory.HoodieInMemoryHashIndex import org.apache.hudi.testutils.DataSourceTestUtils -import org.apache.hudi.{DataSourceWriteOptions, HoodieSparkRecordMerger, HoodieSparkUtils} +import org.apache.hudi.{DataSourceWriteOptions, HoodieSparkRecordMerger, HoodieSparkUtils, QuickstartUtils} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.functions.{arrays_zip, col, expr, lit} +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{Row, SaveMode, SparkSession} @@ -75,7 +76,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { val tableName = generateTableName val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}" if (HoodieSparkUtils.gteqSpark3_1) { - spark.sql("set " + SPARK_SQL_INSERT_INTO_OPERATION.key + "=upsert") + spark.sql("set " + DataSourceWriteOptions.SPARK_SQL_INSERT_INTO_OPERATION.key + "=upsert") spark.sql("set hoodie.schema.on.read.enable=true") // NOTE: This is required since as this tests use type coercions which were only permitted in Spark 2.x // and are disallowed now by default in Spark 3.x @@ -136,14 +137,14 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { ) spark.sessionState.catalog.dropTable(TableIdentifier(tableName), true, true) spark.sessionState.catalog.refreshTable(TableIdentifier(tableName)) - spark.sessionState.conf.unsetConf(SPARK_SQL_INSERT_INTO_OPERATION.key) + spark.sessionState.conf.unsetConf(DataSourceWriteOptions.SPARK_SQL_INSERT_INTO_OPERATION.key) } } }) } test("Test alter column types 2") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => val tableName = generateTableName val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}" @@ -176,7 +177,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { ) } } - }) + } } test("Test Enable and Disable Schema on read") { @@ -232,7 +233,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { } test("Test alter table properties and add rename drop column") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => val tableName = generateTableName val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}" @@ -242,7 +243,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { if (HoodieSparkUtils.gteqSpark3_1) { spark.sql("set hoodie.schema.on.read.enable=true") - spark.sql("set " + SPARK_SQL_INSERT_INTO_OPERATION.key + "=upsert") + spark.sql("set " + DataSourceWriteOptions.SPARK_SQL_INSERT_INTO_OPERATION.key + "=upsert") // NOTE: This is required since as this tests use type 
coercions which were only permitted in Spark 2.x // and are disallowed now by default in Spark 3.x spark.sql("set spark.sql.storeAssignmentPolicy=legacy") @@ -335,8 +336,8 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { spark.sql(s"select id, col1_new, col2 from $tableName where id = 1 or id = 6 or id = 2 or id = 11 order by id").show(false) } } - spark.sessionState.conf.unsetConf(SPARK_SQL_INSERT_INTO_OPERATION.key) - }) + spark.sessionState.conf.unsetConf(DataSourceWriteOptions.SPARK_SQL_INSERT_INTO_OPERATION.key) + } } test("Test Chinese table ") { @@ -346,7 +347,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}" if (HoodieSparkUtils.gteqSpark3_1) { spark.sql("set hoodie.schema.on.read.enable=true") - spark.sql("set " + SPARK_SQL_INSERT_INTO_OPERATION.key + "=upsert") + spark.sql("set " + DataSourceWriteOptions.SPARK_SQL_INSERT_INTO_OPERATION.key + "=upsert") spark.sql( s""" |create table $tableName ( @@ -387,13 +388,13 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { ) } } - spark.sessionState.conf.unsetConf(SPARK_SQL_INSERT_INTO_OPERATION.key) + spark.sessionState.conf.unsetConf(DataSourceWriteOptions.SPARK_SQL_INSERT_INTO_OPERATION.key) }) } test("Test alter column by add rename and drop") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => val tableName = generateTableName val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}" @@ -453,7 +454,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { validateInternalSchema(tablePath, isDropColumn = false, currentMaxColumnId = maxColumnId) } } - }) + } } private def validateInternalSchema(basePath: String, isDropColumn: Boolean, currentMaxColumnId: Int): Unit = { @@ -543,8 +544,8 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { } test("Test alter column with complex schema") { - withRecordType()(withTempDir { tmp => - withSQLConf(s"$SPARK_SQL_INSERT_INTO_OPERATION" -> "upsert", + withTempDir { tmp => + withSQLConf(s"${DataSourceWriteOptions.SPARK_SQL_INSERT_INTO_OPERATION}" -> "upsert", "hoodie.schema.on.read.enable" -> "true", "spark.sql.parquet.enableNestedColumnVectorizedReader" -> "false") { val tableName = generateTableName @@ -628,7 +629,7 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { ) } } - }) + } } test("Test schema auto evolution complex") { @@ -711,36 +712,38 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { val tableName = generateTableName val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}" if (HoodieSparkUtils.gteqSpark3_1) { - val dataGen = new DataGenerator - val inserts = convertToStringList(dataGen.generateInserts(10)) + val dataGen = new QuickstartUtils.DataGenerator + val inserts = QuickstartUtils.convertToStringList(dataGen.generateInserts(10)) val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2)) + .withColumn("ts", lit("20240404000000")) // to make test determinate for HOODIE_AVRO_DEFAULT payload df.write.format("hudi"). - options(getQuickstartWriteConfigs). + options(QuickstartUtils.getQuickstartWriteConfigs). option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, tableType). - option(PRECOMBINE_FIELD_OPT_KEY, "ts"). - option(RECORDKEY_FIELD_OPT_KEY, "uuid"). - option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). + option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts"). + option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "uuid"). 
+ option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). option("hoodie.schema.on.read.enable","true"). - option(TABLE_NAME.key(), tableName). + option(DataSourceWriteOptions.TABLE_NAME.key(), tableName). option("hoodie.table.name", tableName). mode("overwrite"). save(tablePath) - val updates = convertToStringList(dataGen.generateUpdates(10)) + val updates = QuickstartUtils.convertToStringList(dataGen.generateUpdates(10)) // type change: fare (double -> String) // add new column and drop a column val dfUpdate = spark.read.json(spark.sparkContext.parallelize(updates, 2)) .withColumn("fare", expr("cast(fare as string)")) .withColumn("addColumn", lit("new")) + .withColumn("ts", lit("20240404000005")) // to make test determinate for HOODIE_AVRO_DEFAULT payload dfUpdate.drop("begin_lat").write.format("hudi"). - options(getQuickstartWriteConfigs). + options(QuickstartUtils.getQuickstartWriteConfigs). option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, tableType). - option(PRECOMBINE_FIELD_OPT_KEY, "ts"). - option(RECORDKEY_FIELD_OPT_KEY, "uuid"). - option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). + option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts"). + option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "uuid"). + option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). option("hoodie.schema.on.read.enable","true"). option("hoodie.datasource.write.reconcile.schema","true"). - option(TABLE_NAME.key(), tableName). + option(DataSourceWriteOptions.TABLE_NAME.key(), tableName). option("hoodie.table.name", tableName). mode("append"). save(tablePath) @@ -758,35 +761,38 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { spark.sql(s"select * from hudi_trips_snapshot").show(false) // test insert_over_write + update again - val overwrite = convertToStringList(dataGen.generateInserts(10)) + val overwrite = QuickstartUtils.convertToStringList(dataGen.generateInserts(10)) val dfOverWrite = spark. read.json(spark.sparkContext.parallelize(overwrite, 2)). filter("partitionpath = 'americas/united_states/san_francisco'") + .withColumn("ts", lit("20240404000010")) // to make test determinate for HOODIE_AVRO_DEFAULT payload .withColumn("fare", expr("cast(fare as string)")) // fare now in table is string type, we forbid convert string to double. dfOverWrite.write.format("hudi"). - options(getQuickstartWriteConfigs). + options(QuickstartUtils.getQuickstartWriteConfigs). option("hoodie.datasource.write.operation","insert_overwrite"). - option(PRECOMBINE_FIELD_OPT_KEY, "ts"). - option(RECORDKEY_FIELD_OPT_KEY, "uuid"). - option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). + option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts"). + option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "uuid"). + option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). option("hoodie.schema.on.read.enable","true"). option("hoodie.datasource.write.reconcile.schema","true"). - option(TABLE_NAME.key(), tableName). + option(DataSourceWriteOptions.TABLE_NAME.key(), tableName). option("hoodie.table.name", tableName). mode("append"). 
save(tablePath) spark.read.format("hudi").load(tablePath).show(false) - val updatesAgain = convertToStringList(dataGen.generateUpdates(10)) - val dfAgain = spark.read.json(spark.sparkContext.parallelize(updatesAgain, 2)).withColumn("fare", expr("cast(fare as string)")) + val updatesAgain = QuickstartUtils.convertToStringList(dataGen.generateUpdates(10)) + val dfAgain = spark.read.json(spark.sparkContext.parallelize(updatesAgain, 2)). + withColumn("fare", expr("cast(fare as string)")). + withColumn("ts", lit("20240404000015")) // to make test determinate for HOODIE_AVRO_DEFAULT payload dfAgain.write.format("hudi"). - options(getQuickstartWriteConfigs). - option(PRECOMBINE_FIELD_OPT_KEY, "ts"). - option(RECORDKEY_FIELD_OPT_KEY, "uuid"). - option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). + options(QuickstartUtils.getQuickstartWriteConfigs). + option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts"). + option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "uuid"). + option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). option("hoodie.schema.on.read.enable","true"). option("hoodie.datasource.write.reconcile.schema","true"). - option(TABLE_NAME.key(), tableName). + option(DataSourceWriteOptions.TABLE_NAME.key(), tableName). option("hoodie.table.name", tableName). mode("append"). save(tablePath) @@ -880,6 +886,9 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { // Not checking answer as this is an unsafe casting operation, just need to make sure that error is not thrown spark.sql(s"select id, name, cast(price as string), ts from $tableName") + + // clear after using INMEMORY index + HoodieInMemoryHashIndex.clear() } } } @@ -945,6 +954,9 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { Seq(11, "a11", "-10.04", 1000), Seq(12, "a12", "-10.04", 1000) ) + + // clear after using INMEMORY index + HoodieInMemoryHashIndex.clear() } } } @@ -1010,6 +1022,9 @@ class TestSpark3DDL extends HoodieSparkSqlTestBase { Seq(11, "a11", "-10.04", 1000), Seq(12, "a12", "-10.04", 1000) ) + + // clear after using INMEMORY index + HoodieInMemoryHashIndex.clear() } } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCDCForSparkSQL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestCDCForSparkSQL.scala similarity index 99% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCDCForSparkSQL.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestCDCForSparkSQL.scala index a799ce8f787d..59f9eed83b0a 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCDCForSparkSQL.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestCDCForSparkSQL.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.DataSourceReadOptions._ import org.apache.hudi.DataSourceWriteOptions.SPARK_SQL_INSERT_INTO_OPERATION @@ -23,6 +23,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.table.cdc.HoodieCDCSupplementalLoggingMode.{DATA_BEFORE, DATA_BEFORE_AFTER, OP_KEY_ONLY} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase import org.junit.jupiter.api.Assertions.assertEquals class TestCDCForSparkSQL extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCompactionTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestCompactionTable.scala similarity index 97% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCompactionTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestCompactionTable.scala index 568e3569725c..31948c3298da 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCompactionTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestCompactionTable.scala @@ -15,12 +15,14 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml + +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestCompactionTable extends HoodieSparkSqlTestBase { test("Test compaction table") { - withRecordType()(withTempDir {tmp => + withRecordType()(withTempDir { tmp => val tableName = generateTableName spark.sql( s""" @@ -75,7 +77,7 @@ class TestCompactionTable extends HoodieSparkSqlTestBase { } test("Test compaction path") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => val tableName = generateTableName spark.sql( s""" @@ -132,6 +134,6 @@ class TestCompactionTable extends HoodieSparkSqlTestBase { checkException(s"run compaction on '${tmp.getCanonicalPath}' at 12345")( s"specific 12345 instants is not exist" ) - }) + } } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDataSkippingQuery.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestDataSkippingQuery.scala similarity index 98% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDataSkippingQuery.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestDataSkippingQuery.scala index 1ac7185f642d..23255b763ff3 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDataSkippingQuery.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestDataSkippingQuery.scala @@ -17,7 +17,9 @@ * under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml + +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestDataSkippingQuery extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteFromTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestDeleteFromTable.scala similarity index 96% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteFromTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestDeleteFromTable.scala index e3ea01730222..b289ce74646c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteFromTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestDeleteFromTable.scala @@ -15,7 +15,9 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml + +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestDeleteFromTable extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestDeleteTable.scala similarity index 99% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestDeleteTable.scala index bc87405b9f91..b9cafb6ec079 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestDeleteTable.scala @@ -15,12 +15,13 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.HoodieSparkUtils.isSpark2 import org.apache.hudi.config.HoodieWriteConfig import org.apache.spark.sql.SaveMode +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestDeleteTable extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDropTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestDropTable.scala similarity index 98% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDropTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestDropTable.scala index 0781fc6af06f..743abc5b2fd0 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDropTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestDropTable.scala @@ -15,15 +15,14 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml -import org.apache.hudi.common.fs.FSUtils +import org.apache.hadoop.fs.Path import org.apache.hudi.hadoop.fs.HadoopFSUtils - -import org.apache.hadoop.fs.{LocalFileSystem, Path} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.SessionCatalog +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestDropTable extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieTableValuedFunction.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestHoodieTableValuedFunction.scala similarity index 99% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieTableValuedFunction.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestHoodieTableValuedFunction.scala index bdf512d3451a..44e5f26aa580 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieTableValuedFunction.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestHoodieTableValuedFunction.scala @@ -15,13 +15,12 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.DataSourceWriteOptions.SPARK_SQL_INSERT_INTO_OPERATION import org.apache.hudi.HoodieSparkUtils import org.apache.spark.sql.functions.{col, from_json} - -import scala.collection.Seq +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestHoodieTableValuedFunction extends HoodieSparkSqlTestBase { @@ -450,6 +449,7 @@ class TestHoodieTableValuedFunction extends HoodieSparkSqlTestBase { |""".stripMargin ) + spark.sql("set hoodie.merge.allow.duplicate.on.inserts = false") spark.sql( s""" | insert into $tableName diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestInsertTable.scala similarity index 92% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestInsertTable.scala index 21369ea34e0c..47a718cff735 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestInsertTable.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.client.common.HoodieSparkEngineContext @@ -31,8 +31,10 @@ import org.apache.hudi.index.HoodieIndex.IndexType import org.apache.hudi.{DataSourceWriteOptions, HoodieCLIUtils, HoodieSparkUtils} import org.apache.spark.scheduler.{SparkListener, SparkListenerStageSubmitted} import org.apache.spark.sql.SaveMode -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase.getLastCommitMetadata +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils import org.apache.spark.sql.hudi.command.HoodieSparkValidateDuplicateKeyRecordMerger +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase.getLastCommitMetadata import org.junit.jupiter.api.Assertions.assertEquals import java.io.File @@ -40,6 +42,116 @@ import java.util.concurrent.CountDownLatch class TestInsertTable extends HoodieSparkSqlTestBase { + test("Test table type name incase-sensitive test") { + withTempDir { tmp => + val targetTable = generateTableName + val tablePath = s"${tmp.getCanonicalPath}/$targetTable" + + spark.sql( + s""" + |create table ${targetTable} ( + | `id` string, + | `name` string, + | `dt` bigint, + | `day` STRING, + | `hour` INT + |) using hudi + |tblproperties ( + | 'primaryKey' = 'id', + | 'type' = 'MOR', + | 'preCombineField'='dt', + | 'hoodie.index.type' = 'BUCKET', + | 'hoodie.bucket.index.hash.field' = 'id', + | 'hoodie.bucket.index.num.buckets'=512 + | ) + partitioned by (`day`,`hour`) + location '${tablePath}' + """.stripMargin) + + spark.sql( + s""" + |insert into ${targetTable} + |select '1' as id, 'aa' as name, 123 as dt, '2024-02-19' as `day`, 10 as `hour` + |""".stripMargin) + + spark.sql( + s""" + |merge into ${targetTable} as target + |using ( + |select '2' as id, 'bb' as name, 456 as dt, '2024-02-19' as `day`, 10 as `hour` + |) as source + |on target.id = source.id + |when matched then update set * + |when not matched then insert * + |""".stripMargin + ) + + // check result after insert and merge data into target table + checkAnswer(s"select id, name, dt, day, hour from $targetTable limit 10")( + Seq("1", "aa", 123, "2024-02-19", 10), + Seq("2", "bb", 456, "2024-02-19", 10) + ) + } + } + + test("Test FirstValueAvroPayload test") { + withTempDir { tmp => + val targetTable = generateTableName + val tablePath = s"${tmp.getCanonicalPath}/$targetTable" + + spark.sql( + s""" + |create table ${targetTable} ( + | `id` string, + | `name` string, + | `dt` bigint, + | `day` STRING, + | `hour` INT + |) using hudi + |tblproperties ( + | 'primaryKey' = 'id', + | 'type' = 'mor', + | 'preCombineField'='dt', + | 'hoodie.index.type' = 'BUCKET', + | 'hoodie.bucket.index.hash.field' = 'id', + | 'hoodie.bucket.index.num.buckets'=12, + | 'hoodie.datasource.write.payload.class'='org.apache.hudi.common.model.FirstValueAvroPayload' + | ) + partitioned by (`day`,`hour`) + location '${tablePath}' + """.stripMargin) + + spark.sql("set hoodie.file.group.reader.enabled=false") + + spark.sql( + s""" + |insert into ${targetTable} + |select '1' as id, 'aa' as name, 123 as dt, '2024-02-19' as `day`, 10 as `hour` + |""".stripMargin) + + spark.sql( + s""" + |insert into ${targetTable} + |select '1' as id, 'bb' as name, 123 as dt, '2024-02-19' as `day`, 10 as `hour` + |""".stripMargin) + + checkAnswer(s"select id, name, dt, day, hour from $targetTable limit 10")( + Seq("1", "aa", 123, "2024-02-19", 10) + ) + + spark.sql( + s""" + 
|insert into ${targetTable} + |select '1' as id, 'cc' as name, 124 as dt, '2024-02-19' as `day`, 10 as `hour` + |""".stripMargin) + + checkAnswer(s"select id, name, dt, day, hour from $targetTable limit 10")( + Seq("1", "cc", 124, "2024-02-19", 10) + ) + + } + } + test("Test Insert Into with values") { withRecordType()(withTempDir { tmp => val tableName = generateTableName @@ -76,7 +188,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { } test("Test Insert Into with static partition") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => val tableName = generateTableName // Create a partitioned table spark.sql( @@ -125,11 +237,11 @@ class TestInsertTable extends HoodieSparkSqlTestBase { Seq(2, "a2", 20.0, 2000, "2021-01-06"), Seq(3, "a3", 30.0, 3000, "2021-01-07") ) - }) + } } test("Test Insert Into with dynamic partition") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => val tableName = generateTableName // Create a partitioned table spark.sql( @@ -179,7 +291,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { Seq(2, "a2", 20.0, 2000, "2021-01-06"), Seq(3, "a3", 30.0, 3000, "2021-01-07") ) - }) + } } test("Test Insert Into with multi partition") { @@ -340,6 +452,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { Seq(2, "a2", 12.0) ) + spark.sql("set hoodie.merge.allow.duplicate.on.inserts = false") assertThrows[HoodieDuplicateKeyException] { try { spark.sql(s"insert into $tableName select 1, 'a1', 10") @@ -359,7 +472,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { } test("Test Insert Overwrite") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => withTable(generateTableName) { tableName => // Create a partitioned table @@ -504,7 +617,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { ) } } - }) + } } test("Test insert overwrite for multi partitioned table") { @@ -606,19 +719,19 @@ class TestInsertTable extends HoodieSparkSqlTestBase { } test("Test Different Type of Partition Column") { - withRecordType()(withTempDir { tmp => - val typeAndValue = Seq( - ("string", "'1000'"), - ("int", 1000), - ("bigint", 10000), - ("timestamp", "TIMESTAMP'2021-05-20 00:00:00'"), - ("date", "DATE'2021-05-20'") - ) - typeAndValue.foreach { case (partitionType, partitionValue) => - val tableName = generateTableName - validateDifferentTypesOfPartitionColumn(tmp, partitionType, partitionValue, tableName) - } - }) + withTempDir { tmp => + val typeAndValue = Seq( + ("string", "'1000'"), + ("int", 1000), + ("bigint", 10000), + ("timestamp", "TIMESTAMP'2021-05-20 00:00:00'"), + ("date", "DATE'2021-05-20'") + ) + typeAndValue.foreach { case (partitionType, partitionValue) => + val tableName = generateTableName + validateDifferentTypesOfPartitionColumn(tmp, partitionType, partitionValue, tableName) + } + } } test("Test TimestampType Partition Column With Consistent Logical Timestamp Enabled") { @@ -636,7 +749,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { } test("Test insert for uppercase table name") { - withRecordType()(withTempDir{ tmp => + withTempDir { tmp => val tableName = s"H_$generateTableName" if (HoodieSparkUtils.gteqSpark3_5) { // [SPARK-44284] Spark 3.5+ requires conf below to be case sensitive @@ -663,84 +776,82 @@ class TestInsertTable extends HoodieSparkSqlTestBase { .setConf(spark.sessionState.newHadoopConf()) .build() assertResult(tableName)(metaClient.getTableConfig.getTableName) - }) + } } test("Test Insert Exception") { - withRecordType() { - val tableName = 
generateTableName + val tableName = generateTableName + spark.sql( + s""" + |create table $tableName ( + | id int, + | name string, + | price double, + | dt string + |) using hudi + | tblproperties (primaryKey = 'id') + | partitioned by (dt) + """.stripMargin) + val tooManyDataColumnsErrorMsg = if (HoodieSparkUtils.gteqSpark3_5) { + s""" + |[INSERT_COLUMN_ARITY_MISMATCH.TOO_MANY_DATA_COLUMNS] Cannot write to `spark_catalog`.`default`.`$tableName`, the reason is too many data columns: + |Table columns: `id`, `name`, `price`. + |Data columns: `1`, `a1`, `10`, `2021-06-20`. + |""".stripMargin + } else if (HoodieSparkUtils.gteqSpark3_4) { + """ + |too many data columns: + |Table columns: 'id', 'name', 'price'. + |Data columns: '1', 'a1', '10', '2021-06-20'. + |""".stripMargin + } else { + """ + |too many data columns: + |Table columns: 'id', 'name', 'price' + |Data columns: '1', 'a1', '10', '2021-06-20' + |""".stripMargin + } + checkExceptionContain(s"insert into $tableName partition(dt = '2021-06-20') select 1, 'a1', 10, '2021-06-20'")( + tooManyDataColumnsErrorMsg) + + val notEnoughDataColumnsErrorMsg = if (HoodieSparkUtils.gteqSpark3_5) { + s""" + |[INSERT_COLUMN_ARITY_MISMATCH.NOT_ENOUGH_DATA_COLUMNS] Cannot write to `spark_catalog`.`default`.`$tableName`, the reason is not enough data columns: + |Table columns: `id`, `name`, `price`, `dt`. + |Data columns: `1`, `a1`, `10`. + |""".stripMargin + } else if (HoodieSparkUtils.gteqSpark3_4) { + """ + |not enough data columns: + |Table columns: 'id', 'name', 'price', 'dt'. + |Data columns: '1', 'a1', '10'. + |""".stripMargin + } else { + """ + |not enough data columns: + |Table columns: 'id', 'name', 'price', 'dt' + |Data columns: '1', 'a1', '10' + |""".stripMargin + } + checkExceptionContain(s"insert into $tableName select 1, 'a1', 10")(notEnoughDataColumnsErrorMsg) + withSQLConf("hoodie.sql.bulk.insert.enable" -> "true", "hoodie.sql.insert.mode" -> "strict") { + val tableName2 = generateTableName spark.sql( s""" - |create table $tableName ( + |create table $tableName2 ( | id int, | name string, | price double, - | dt string + | ts long |) using hudi - | tblproperties (primaryKey = 'id') - | partitioned by (dt) - """.stripMargin) - val tooManyDataColumnsErrorMsg = if (HoodieSparkUtils.gteqSpark3_5) { - s""" - |[INSERT_COLUMN_ARITY_MISMATCH.TOO_MANY_DATA_COLUMNS] Cannot write to `spark_catalog`.`default`.`$tableName`, the reason is too many data columns: - |Table columns: `id`, `name`, `price`. - |Data columns: `1`, `a1`, `10`, `2021-06-20`. - |""".stripMargin - } else if (HoodieSparkUtils.gteqSpark3_4) { - """ - |too many data columns: - |Table columns: 'id', 'name', 'price'. - |Data columns: '1', 'a1', '10', '2021-06-20'. - |""".stripMargin - } else { - """ - |too many data columns: - |Table columns: 'id', 'name', 'price' - |Data columns: '1', 'a1', '10', '2021-06-20' - |""".stripMargin - } - checkExceptionContain(s"insert into $tableName partition(dt = '2021-06-20') select 1, 'a1', 10, '2021-06-20'")( - tooManyDataColumnsErrorMsg) - - val notEnoughDataColumnsErrorMsg = if (HoodieSparkUtils.gteqSpark3_5) { - s""" - |[INSERT_COLUMN_ARITY_MISMATCH.NOT_ENOUGH_DATA_COLUMNS] Cannot write to `spark_catalog`.`default`.`$tableName`, the reason is not enough data columns: - |Table columns: `id`, `name`, `price`, `dt`. - |Data columns: `1`, `a1`, `10`. - |""".stripMargin - } else if (HoodieSparkUtils.gteqSpark3_4) { - """ - |not enough data columns: - |Table columns: 'id', 'name', 'price', 'dt'. - |Data columns: '1', 'a1', '10'. 
- |""".stripMargin - } else { - """ - |not enough data columns: - |Table columns: 'id', 'name', 'price', 'dt' - |Data columns: '1', 'a1', '10' - |""".stripMargin - } - checkExceptionContain(s"insert into $tableName select 1, 'a1', 10")(notEnoughDataColumnsErrorMsg) - withSQLConf("hoodie.sql.bulk.insert.enable" -> "true", "hoodie.sql.insert.mode" -> "strict") { - val tableName2 = generateTableName - spark.sql( - s""" - |create table $tableName2 ( - | id int, - | name string, - | price double, - | ts long - |) using hudi - | tblproperties ( - | primaryKey = 'id', - | preCombineField = 'ts' - | ) - """.stripMargin) - checkException(s"insert into $tableName2 values(1, 'a1', 10, 1000)")( - "Table with primaryKey can not use bulk insert in strict mode." - ) - } + | tblproperties ( + | primaryKey = 'id', + | preCombineField = 'ts' + | ) + """.stripMargin) + checkException(s"insert into $tableName2 values(1, 'a1', 10, 1000)")( + "Table with primaryKey can not use bulk insert in strict mode." + ) } } @@ -773,8 +884,8 @@ class TestInsertTable extends HoodieSparkSqlTestBase { test("Test bulk insert with insert into for single partitioned table") { withSQLConf("hoodie.sql.insert.mode" -> "non-strict") { - withRecordType()(withTempDir { tmp => - Seq("cow", "mor").foreach {tableType => + withTempDir { tmp => + Seq("cow", "mor").foreach { tableType => withTable(generateTableName) { tableName => spark.sql( s""" @@ -817,7 +928,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { ) } } - }) + } } } @@ -906,7 +1017,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { test("Test bulk insert with CTAS") { withSQLConf("hoodie.sql.insert.mode" -> "non-strict", "hoodie.sql.bulk.insert.enable" -> "true") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => withTable(generateTableName) { inputTable => spark.sql( @@ -948,13 +1059,13 @@ class TestInsertTable extends HoodieSparkSqlTestBase { } } } - }) + } } } test("Test bulk insert with empty dataset") { withSQLConf(SPARK_SQL_INSERT_INTO_OPERATION.key -> WriteOperationType.BULK_INSERT.value()) { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => withTable(generateTableName) { inputTable => spark.sql( @@ -992,7 +1103,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { } } } - }) + } } } @@ -1004,7 +1115,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { Array() } withSQLConf(bulkInsertConf: _*) { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => withTable(generateTableName) { inputTable => spark.sql( @@ -1048,14 +1159,14 @@ class TestInsertTable extends HoodieSparkSqlTestBase { } } } - }) + } } } } test("Test bulk insert with insert overwrite table") { withSQLConf(SPARK_SQL_INSERT_INTO_OPERATION.key -> WriteOperationType.BULK_INSERT.value()) { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => withTable(generateTableName) { nonPartitionedTable => spark.sql( @@ -1082,13 +1193,13 @@ class TestInsertTable extends HoodieSparkSqlTestBase { } } } - }) + } } } test("Test bulk insert with insert overwrite partition") { withSQLConf(SPARK_SQL_INSERT_INTO_OPERATION.key -> WriteOperationType.BULK_INSERT.value()) { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => withTable(generateTableName) { partitionedTable => spark.sql( @@ -1129,12 +1240,12 @@ class TestInsertTable extends 
HoodieSparkSqlTestBase { } } } - }) + } } } test("Test combine before insert") { - withSQLConf("hoodie.sql.bulk.insert.enable" -> "false") { + withSQLConf("hoodie.sql.bulk.insert.enable" -> "false", "hoodie.merge.allow.duplicate.on.inserts" -> "false") { withRecordType()(withTempDir{tmp => val tableName = generateTableName spark.sql( @@ -1305,7 +1416,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { test("Test Insert Into With Catalog Identifier for spark >= 3.2.0") { Seq("hudi", "parquet").foreach { format => - withRecordType()(withTempDir { tmp => + withTempDir { tmp => val tableName = s"spark_catalog.default.$generateTableName" // Create a partitioned table if (HoodieSparkUtils.gteqSpark3_2) { @@ -1342,7 +1453,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { Seq(2, "a2", 10.0, 1000, "2021-01-05") ) } - }) + } } } @@ -1448,6 +1559,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { Seq(3, "a3", 30.0, 3000, "2021-01-07") ) + spark.sql("set hoodie.merge.allow.duplicate.on.inserts = false") spark.sql( s""" | insert into $tableName values @@ -1612,7 +1724,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { test("Test Insert Overwrite Into Bucket Index Table") { withSQLConf("hoodie.sql.bulk.insert.enable" -> "false") { Seq("mor", "cow").foreach { tableType => - withRecordType()(withTempDir { tmp => + withTempDir { tmp => val tableName = generateTableName // Create a partitioned table spark.sql( @@ -1657,14 +1769,14 @@ class TestInsertTable extends HoodieSparkSqlTestBase { checkAnswer(s"select id, name, price, ts, dt from $tableName order by dt")( Seq(13, "a2", 12.0, 1000, "2021-01-05") ) - }) + } } } } test("Test Insert Overwrite Into Consistent Bucket Index Table") { withSQLConf("hoodie.sql.bulk.insert.enable" -> "false") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => val tableName = generateTableName // Create a partitioned table spark.sql( @@ -1717,13 +1829,13 @@ class TestInsertTable extends HoodieSparkSqlTestBase { checkAnswer(s"select id, name, price, ts, dt from $tableName order by dt")( Seq(13, "a3", 12.0, 1000, "2021-01-05") ) - }) + } } } test("Test Hudi should not record empty preCombineKey in hoodie.properties") { withSQLConf("hoodie.datasource.write.operation" -> "insert") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => val tableName = generateTableName spark.sql( s""" @@ -1751,7 +1863,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { Seq(2, "name2", 12.0), Seq(3, "name3", 13.0) ) - }) + } } } @@ -1953,17 +2065,17 @@ class TestInsertTable extends HoodieSparkSqlTestBase { spark.sessionState.conf.unsetConf("hoodie.sql.insert.mode") spark.sessionState.conf.unsetConf("hoodie.datasource.insert.dup.policy") spark.sessionState.conf.unsetConf("hoodie.datasource.write.operation") - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => withTable(generateTableName) { tableName => ingestAndValidateData(tableType, tableName, tmp, WriteOperationType.UPSERT) } } - }) + } } test("Test sql write operation with INSERT_INTO override both strict mode and sql write operation") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => Seq(WriteOperationType.INSERT, WriteOperationType.BULK_INSERT, WriteOperationType.UPSERT).foreach { operation => withTable(generateTableName) { tableName => @@ -1972,11 +2084,11 @@ class TestInsertTable extends HoodieSparkSqlTestBase { } } } - }) + } } test("Test sql write operation with 
INSERT_INTO override only sql write operation") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => Seq(WriteOperationType.INSERT, WriteOperationType.BULK_INSERT, WriteOperationType.UPSERT).foreach { operation => withTable(generateTableName) { tableName => @@ -1985,7 +2097,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { } } } - }) + } } test("Test sql write operation with INSERT_INTO override only strict mode") { @@ -1994,14 +2106,14 @@ class TestInsertTable extends HoodieSparkSqlTestBase { spark.sessionState.conf.unsetConf(DataSourceWriteOptions.INSERT_DUP_POLICY.key()) spark.sessionState.conf.unsetConf("hoodie.datasource.write.operation") spark.sessionState.conf.unsetConf("hoodie.sql.bulk.insert.enable") - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => withTable(generateTableName) { tableName => ingestAndValidateData(tableType, tableName, tmp, WriteOperationType.UPSERT, List("set hoodie.sql.insert.mode = upsert")) } } - }) + } } def ingestAndValidateData(tableType: String, tableName: String, tmp: File, @@ -2075,13 +2187,13 @@ class TestInsertTable extends HoodieSparkSqlTestBase { spark.sessionState.conf.unsetConf("hoodie.sql.insert.mode") spark.sessionState.conf.unsetConf("hoodie.datasource.insert.dup.policy") spark.sessionState.conf.unsetConf("hoodie.datasource.write.operation") - withRecordType()(withTempDir { tmp => - Seq("cow","mor").foreach { tableType => + withTempDir { tmp => + Seq("cow", "mor").foreach { tableType => withTable(generateTableName) { tableName => ingestAndValidateDataNoPrecombine(tableType, tableName, tmp, WriteOperationType.INSERT) } } - }) + } } var listenerCallCount: Int = 0 @@ -2101,7 +2213,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { test("Test multiple partition fields pruning") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => val targetTable = generateTableName countDownLatch = new CountDownLatch(1) listenerCallCount = 0 @@ -2148,12 +2260,12 @@ class TestInsertTable extends HoodieSparkSqlTestBase { assertResult(1)(rddHead.partitions.size) countDownLatch.await assert(listenerCallCount >= 1) - }) + } } test("Test single partiton field pruning") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => countDownLatch = new CountDownLatch(1) listenerCallCount = 0 val targetTable = generateTableName @@ -2200,11 +2312,11 @@ class TestInsertTable extends HoodieSparkSqlTestBase { assertResult(1)(rddHead.partitions.size) countDownLatch.await assert(listenerCallCount >= 1) - }) + } } test("Test inaccurate index type") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => val targetTable = generateTableName assertThrows[IllegalArgumentException] { @@ -2232,7 +2344,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { |""".stripMargin) } } - }) + } } test("Test vectorized read nested columns for LegacyHoodieParquetFileFormat") { @@ -2338,7 +2450,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { } test("Test insert dup policy with INSERT_INTO explicit new configs INSERT operation ") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow", "mor").foreach { tableType => val operation = WriteOperationType.INSERT Seq(NONE_INSERT_DUP_POLICY, DROP_INSERT_DUP_POLICY).foreach { dupPolicy => @@ -2350,11 +2462,11 @@ class TestInsertTable extends HoodieSparkSqlTestBase { } } } - }) + } } test("Test insert dup policy with INSERT_INTO explicit new configs BULK_INSERT operation ") { - 
withRecordType()(withTempDir { tmp => + withTempDir { tmp => Seq("cow").foreach { tableType => val operation = WriteOperationType.BULK_INSERT val dupPolicy = NONE_INSERT_DUP_POLICY @@ -2365,7 +2477,7 @@ class TestInsertTable extends HoodieSparkSqlTestBase { dupPolicy) } } - }) + } } test("Test DROP insert dup policy with INSERT_INTO explicit new configs BULK INSERT operation") { @@ -2432,7 +2544,6 @@ class TestInsertTable extends HoodieSparkSqlTestBase { }) } - def ingestAndValidateDataDupPolicy(tableType: String, tableName: String, tmp: File, expectedOperationtype: WriteOperationType = WriteOperationType.INSERT, setOptions: List[String] = List.empty, diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoLogOnlyTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoLogOnlyTable.scala similarity index 97% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoLogOnlyTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoLogOnlyTable.scala index 48ee872d4d95..d25b9752e35b 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoLogOnlyTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoLogOnlyTable.scala @@ -15,9 +15,10 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.testutils.DataSourceTestUtils +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestMergeIntoLogOnlyTable extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoTable.scala similarity index 99% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoTable.scala index a2111265bd64..d712e2df2597 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoTable.scala @@ -15,14 +15,13 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.DataSourceWriteOptions.SPARK_SQL_OPTIMIZED_WRITES import org.apache.hudi.config.HoodieWriteConfig.MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT -import org.apache.hudi.{DataSourceReadOptions, HoodieDataSourceHelpers, HoodieSparkUtils, ScalaAssertionSupport} -import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.hadoop.fs.HadoopFSUtils - +import org.apache.hudi.{DataSourceReadOptions, HoodieDataSourceHelpers, HoodieSparkUtils, ScalaAssertionSupport} +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase import org.apache.spark.sql.internal.SQLConf class TestMergeIntoTable extends HoodieSparkSqlTestBase with ScalaAssertionSupport { @@ -131,7 +130,7 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase with ScalaAssertionSuppo test("Test MergeInto with more than once update actions for spark >= 3.1.x") { if (HoodieSparkUtils.gteqSpark3_1) { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => val targetTable = generateTableName spark.sql( s""" @@ -180,7 +179,7 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase with ScalaAssertionSuppo checkAnswer(s"select id, name, data, country, ts from $targetTable")( Seq(1, "lb", 5, "shu", 1646643196L) ) - }) + } } } @@ -265,7 +264,7 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase with ScalaAssertionSuppo test("Test MergeInto for MOR table ") { spark.sql(s"set ${MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT.key} = 0") - withRecordType()(withTempDir { tmp => + withTempDir { tmp => spark.sql("set hoodie.payload.combined.schema.validate = true") val tableName = generateTableName // Create a mor partitioned table. @@ -393,12 +392,12 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase with ScalaAssertionSuppo checkAnswer(s"select id,name,price,dt from $tableName order by id")( Seq(1, "a1", 12, "2021-03-21") ) - }) + } } test("Test MergeInto with insert only") { spark.sql(s"set ${MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT.key} = 0") - withRecordType()(withTempDir { tmp => + withTempDir { tmp => spark.sql("set hoodie.payload.combined.schema.validate = true") // Create a partitioned mor table val tableName = generateTableName @@ -449,7 +448,7 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase with ScalaAssertionSuppo Seq(1, "a1", 10, "2021-03-21"), Seq(2, "a2", 10, "2021-03-20") ) - }) + } } test("Test MergeInto For PreCombineField") { @@ -529,7 +528,7 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase with ScalaAssertionSuppo test("Test MergeInto with preCombine field expression") { spark.sql(s"set ${MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT.key} = 0") - withRecordType()(withTempDir { tmp => + withTempDir { tmp => spark.sql("set hoodie.payload.combined.schema.validate = true") Seq("cow", "mor").foreach { tableType => val tableName1 = generateTableName @@ -607,11 +606,11 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase with ScalaAssertionSuppo Seq(1, "a1", 24, "2021-03-21", 1002) ) } - }) + } } test("Test MergeInto with primaryKey expression") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => spark.sql("set hoodie.payload.combined.schema.validate = true") val tableName1 = generateTableName spark.sql( @@ -707,7 +706,7 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase with ScalaAssertionSuppo checkAnswer(s"select id,name,price,v,dt from $tableName1 order by id")( Seq(1, "a1", 10, 1000, "2021-03-21") ) - }) + } } test("Test MergeInto with combination of delete update insert") { 
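For context on the test shape used throughout these files: the suites call withTempDir { tmp => ... } directly and validate results with checkAnswer. A minimal sketch of that shape, assuming the HoodieSparkSqlTestBase helpers already used in these files (generateTableName, checkAnswer, spark); the table schema and values below are illustrative only, not part of the patch:

    test("Illustrative sketch only") {
      withTempDir { tmp =>
        val tableName = generateTableName
        // Create a simple Hudi table under the temp directory (hypothetical schema)
        spark.sql(
          s"""
             |create table $tableName (id int, name string, price double, ts long)
             |using hudi
             |tblproperties (primaryKey = 'id', preCombineField = 'ts')
             |location '${tmp.getCanonicalPath}'
           """.stripMargin)
        // Write one row and verify it reads back as expected
        spark.sql(s"insert into $tableName values (1, 'a1', 10.0, 1000)")
        checkAnswer(s"select id, name, price, ts from $tableName")(
          Seq(1, "a1", 10.0, 1000)
        )
      }
    }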
@@ -882,7 +881,7 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase with ScalaAssertionSuppo } test("Test Different Type of PreCombineField") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => spark.sql("set hoodie.payload.combined.schema.validate = true") val typeAndValue = Seq( ("string", "'1000'"), @@ -938,7 +937,7 @@ class TestMergeIntoTable extends HoodieSparkSqlTestBase with ScalaAssertionSuppo Seq(1, "a1", 20.0) ) } - }) + } } test("Test MergeInto For MOR With Compaction On") { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable2.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoTable2.scala similarity index 98% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable2.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoTable2.scala index b8f315575ddf..e38447692de3 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable2.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoTable2.scala @@ -15,12 +15,13 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.HoodieSparkUtils import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.config.HoodieWriteConfig.MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT import org.apache.spark.sql.Row +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestMergeIntoTable2 extends HoodieSparkSqlTestBase { @@ -144,7 +145,7 @@ class TestMergeIntoTable2 extends HoodieSparkSqlTestBase { } test("Test Merge Into CTAS Table") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => spark.sql("set hoodie.payload.combined.schema.validate = true") val tableName = generateTableName spark.sql( @@ -176,7 +177,7 @@ class TestMergeIntoTable2 extends HoodieSparkSqlTestBase { checkAnswer(s"select id, name from $tableName")( Seq(1, "a1_1") ) - }) + } } test("Test Merge With Complex Data Type") { @@ -245,7 +246,7 @@ class TestMergeIntoTable2 extends HoodieSparkSqlTestBase { } test("Test column name matching for insert * and update set *") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => spark.sql("set hoodie.payload.combined.schema.validate = true") val tableName = generateTableName // Create table @@ -329,11 +330,11 @@ class TestMergeIntoTable2 extends HoodieSparkSqlTestBase { Seq(3, "a3", 102.0, 1000, "2021-05-05"), Seq(4, "a4", 100.0, 1000, "2021-05-06") ) - }) + } } test("Test MergeInto For Source Table With Column Aliases") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => spark.sql("set hoodie.payload.combined.schema.validate = true") val tableName = generateTableName // Create table @@ -373,7 +374,7 @@ class TestMergeIntoTable2 extends HoodieSparkSqlTestBase { Seq(1, "a1", 10.0, 1000) ) } - }) + } } /* TODO [HUDI-6472] @@ -559,7 +560,7 @@ class TestMergeIntoTable2 extends HoodieSparkSqlTestBase { */ test("Test only insert when source table contains history") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => spark.sql("set hoodie.payload.combined.schema.validate = true") val tableName = generateTableName // Create table @@ -601,7 +602,7 @@ class TestMergeIntoTable2 extends HoodieSparkSqlTestBase { Seq(1, "a1", 1.0, 10, "2022-08-18"), Seq(2, "a2", 10.0, 100, "2022-08-18") ) - }) 
+ } } test("Test only insert when source table contains history and target table has multiple keys") { @@ -653,7 +654,7 @@ class TestMergeIntoTable2 extends HoodieSparkSqlTestBase { test("Test Merge Into For Source Table With Different Column Order") { spark.sql(s"set ${MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT.key} = 0") - withRecordType()(withTempDir { tmp => + withTempDir { tmp => val tableName = generateTableName // Create a mor partitioned table. spark.sql( @@ -687,7 +688,7 @@ class TestMergeIntoTable2 extends HoodieSparkSqlTestBase { checkAnswer(s"select id,name,price,dt from $tableName")( Seq(1, "a1", 10, "2021-03-21") ) - }) + } } test("Test Merge into with String cast to Double") { @@ -923,6 +924,7 @@ class TestMergeIntoTable2 extends HoodieSparkSqlTestBase { | partitioned by(dt) | location '${tmp.getCanonicalPath}' """.stripMargin) + spark.sql("set hoodie.merge.allow.duplicate.on.inserts = false") spark.sql( s""" @@ -971,6 +973,7 @@ class TestMergeIntoTable2 extends HoodieSparkSqlTestBase { | partitioned by(dt) | location '${path1}' """.stripMargin) + spark.sql("set hoodie.merge.allow.duplicate.on.inserts = false") spark.sql(s"insert into $sourceTable values(1, 'a1', cast(3.01 as double), 11, '2022-09-26'),(2, 'a2', cast(3.02 as double), 12, '2022-09-27'),(3, 'a3', cast(3.03 as double), 13, '2022-09-28'),(4, 'a4', cast(3.04 as double), 14, '2022-09-29')") diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTableWithNonRecordKeyField.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoTableWithNonRecordKeyField.scala similarity index 98% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTableWithNonRecordKeyField.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoTableWithNonRecordKeyField.scala index 419bb43de43f..8e06995475b8 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTableWithNonRecordKeyField.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestMergeIntoTableWithNonRecordKeyField.scala @@ -15,10 +15,11 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.DataSourceWriteOptions.SPARK_SQL_OPTIMIZED_WRITES import org.apache.hudi.{HoodieSparkUtils, ScalaAssertionSupport} +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestMergeIntoTableWithNonRecordKeyField extends HoodieSparkSqlTestBase with ScalaAssertionSupport { @@ -247,7 +248,7 @@ class TestMergeIntoTableWithNonRecordKeyField extends HoodieSparkSqlTestBase wit test("Test pkless multiple source match") { for (withPrecombine <- Seq(true, false)) { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => spark.sql("set hoodie.payload.combined.schema.validate = true") val tableName = generateTableName @@ -292,13 +293,13 @@ class TestMergeIntoTableWithNonRecordKeyField extends HoodieSparkSqlTestBase wit Seq(1, "a1", 30.0, 100) ) } - }) + } } } test("Test MergeInto Basic pkless") { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => spark.sql("set hoodie.payload.combined.schema.validate = true") spark.sql(s"set ${SPARK_SQL_OPTIMIZED_WRITES.key()}=true") val tableName = generateTableName @@ -385,6 +386,6 @@ class TestMergeIntoTableWithNonRecordKeyField extends HoodieSparkSqlTestBase wit """.stripMargin) val cnt = spark.sql(s"select * from $tableName where id = 1").count() assertResult(0)(cnt) - }) + } } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestPartialUpdateForMergeInto.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestPartialUpdateForMergeInto.scala similarity index 87% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestPartialUpdateForMergeInto.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestPartialUpdateForMergeInto.scala index 0c1ca31479d8..e4151bd7e950 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestPartialUpdateForMergeInto.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestPartialUpdateForMergeInto.scala @@ -15,14 +15,14 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.avro.Schema -import org.apache.hudi.DataSourceWriteOptions.ENABLE_MERGE_INTO_PARTIAL_UPDATES +import org.apache.hudi.DataSourceWriteOptions import org.apache.hudi.HoodieSparkUtils import org.apache.hudi.avro.HoodieAvroUtils -import org.apache.hudi.common.config.HoodieReaderConfig.FILE_GROUP_READER_ENABLED -import org.apache.hudi.common.config.HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT +import org.apache.hudi.common.config.HoodieReaderConfig +import org.apache.hudi.common.config.HoodieStorageConfig import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieMetadataConfig} import org.apache.hudi.common.engine.HoodieLocalEngineContext import org.apache.hudi.common.function.SerializableFunctionUnchecked @@ -31,10 +31,10 @@ import org.apache.hudi.common.table.log.HoodieLogFileReader import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType import org.apache.hudi.common.table.view.{FileSystemViewManager, FileSystemViewStorageConfig, SyncableFileSystemView} import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} -import org.apache.hudi.common.testutils.HoodieTestUtils.{getDefaultHadoopConf, getLogFileListFromFileSlice} -import org.apache.hudi.config.HoodieIndexConfig.INDEX_TYPE -import org.apache.hudi.config.HoodieWriteConfig.MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT +import org.apache.hudi.common.testutils.HoodieTestUtils +import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig} import org.apache.hudi.metadata.HoodieTableMetadata +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase import org.junit.jupiter.api.Assertions.{assertEquals, assertFalse, assertTrue} import java.util.function.Predicate @@ -71,9 +71,9 @@ class TestPartialUpdateForMergeInto extends HoodieSparkSqlTestBase { withTempDir { tmp => val tableName = generateTableName val basePath = tmp.getCanonicalPath + "/" + tableName - spark.sql(s"set ${MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT.key} = 0") - spark.sql(s"set ${ENABLE_MERGE_INTO_PARTIAL_UPDATES.key} = true") - spark.sql(s"set ${FILE_GROUP_READER_ENABLED.key} = true") + spark.sql(s"set ${HoodieWriteConfig.MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT.key} = 0") + spark.sql(s"set ${DataSourceWriteOptions.ENABLE_MERGE_INTO_PARTIAL_UPDATES.key} = true") + spark.sql(s"set ${HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key} = true") // Create a table with five data fields spark.sql( @@ -119,11 +119,11 @@ class TestPartialUpdateForMergeInto extends HoodieSparkSqlTestBase { withTempDir { tmp => val tableName = generateTableName val basePath = tmp.getCanonicalPath + "/" + tableName - spark.sql(s"set ${MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT.key} = 0") - spark.sql(s"set ${ENABLE_MERGE_INTO_PARTIAL_UPDATES.key} = true") - spark.sql(s"set ${FILE_GROUP_READER_ENABLED.key} = true") + spark.sql(s"set ${HoodieWriteConfig.MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT.key} = 0") + spark.sql(s"set ${DataSourceWriteOptions.ENABLE_MERGE_INTO_PARTIAL_UPDATES.key} = true") + spark.sql(s"set ${HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key} = true") // Write inserts to log block - spark.sql(s"set ${INDEX_TYPE.key} = INMEMORY") + spark.sql(s"set ${HoodieIndexConfig.INDEX_TYPE.key} = INMEMORY") // Create a table with five data fields spark.sql( @@ -176,10 +176,10 @@ class TestPartialUpdateForMergeInto extends HoodieSparkSqlTestBase { withTempDir { tmp => val tableName = generateTableName val basePath = tmp.getCanonicalPath + "/" + tableName - 
spark.sql(s"set ${MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT.key} = 0") - spark.sql(s"set ${ENABLE_MERGE_INTO_PARTIAL_UPDATES.key} = true") - spark.sql(s"set ${LOGFILE_DATA_BLOCK_FORMAT.key} = $logDataBlockFormat") - spark.sql(s"set ${FILE_GROUP_READER_ENABLED.key} = true") + spark.sql(s"set ${HoodieWriteConfig.MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT.key} = 0") + spark.sql(s"set ${DataSourceWriteOptions.ENABLE_MERGE_INTO_PARTIAL_UPDATES.key} = true") + spark.sql(s"set ${HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT.key} = $logDataBlockFormat") + spark.sql(s"set ${HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key} = true") // Create a table with five data fields spark.sql( @@ -279,10 +279,10 @@ class TestPartialUpdateForMergeInto extends HoodieSparkSqlTestBase { withTempDir { tmp => val tableName = generateTableName val basePath = tmp.getCanonicalPath + "/" + tableName - spark.sql(s"set ${MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT.key} = 0") - spark.sql(s"set ${ENABLE_MERGE_INTO_PARTIAL_UPDATES.key} = true") - spark.sql(s"set ${LOGFILE_DATA_BLOCK_FORMAT.key} = $logDataBlockFormat") - spark.sql(s"set ${FILE_GROUP_READER_ENABLED.key} = true") + spark.sql(s"set ${HoodieWriteConfig.MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT.key} = 0") + spark.sql(s"set ${DataSourceWriteOptions.ENABLE_MERGE_INTO_PARTIAL_UPDATES.key} = true") + spark.sql(s"set ${HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT.key} = $logDataBlockFormat") + spark.sql(s"set ${HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key} = true") // Create a table with five data fields spark.sql( @@ -379,13 +379,13 @@ class TestPartialUpdateForMergeInto extends HoodieSparkSqlTestBase { expectedNumLogFile: Int, changedFields: Seq[Seq[String]], isPartial: Boolean): Unit = { - val hadoopConf = getDefaultHadoopConf + val hadoopConf = HoodieTestUtils.getDefaultHadoopConf val metaClient: HoodieTableMetaClient = HoodieTableMetaClient.builder.setConf(hadoopConf).setBasePath(basePath).build val metadataConfig = HoodieMetadataConfig.newBuilder.build val engineContext = new HoodieLocalEngineContext(hadoopConf) val viewManager: FileSystemViewManager = FileSystemViewManager.createViewManager( - engineContext, metadataConfig, FileSystemViewStorageConfig.newBuilder.build, + engineContext, FileSystemViewStorageConfig.newBuilder.build, HoodieCommonConfig.newBuilder.build, new SerializableFunctionUnchecked[HoodieTableMetaClient, HoodieTableMetadata] { override def apply(v1: HoodieTableMetaClient): HoodieTableMetadata = { @@ -398,12 +398,12 @@ class TestPartialUpdateForMergeInto extends HoodieSparkSqlTestBase { val fileSlice: Optional[FileSlice] = fsView.getAllFileSlices("") .filter(new Predicate[FileSlice] { override def test(fileSlice: FileSlice): Boolean = { - getLogFileListFromFileSlice(fileSlice).size() == expectedNumLogFile + HoodieTestUtils.getLogFileListFromFileSlice(fileSlice).size() == expectedNumLogFile } }) .findFirst() assertTrue(fileSlice.isPresent) - val logFilePathList: List[String] = getLogFileListFromFileSlice(fileSlice.get) + val logFilePathList: List[String] = HoodieTestUtils.getLogFileListFromFileSlice(fileSlice.get) Collections.sort(logFilePathList) val avroSchema = new TableSchemaResolver(metaClient).getTableAvroSchema diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestQueryMergeOnReadOptimizedTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestQueryMergeOnReadOptimizedTable.scala similarity index 97% rename from 
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestQueryMergeOnReadOptimizedTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestQueryMergeOnReadOptimizedTable.scala index 16d56373442c..2f4d12bda44a 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestQueryMergeOnReadOptimizedTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestQueryMergeOnReadOptimizedTable.scala @@ -15,7 +15,9 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml + +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestQueryMergeOnReadOptimizedTable extends HoodieSparkSqlTestBase { test("Test Query Merge_On_Read Read_Optimized table") { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestRepairTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestRepairTable.scala similarity index 98% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestRepairTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestRepairTable.scala index 8078ed29bd7e..fccc7b61f1f5 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestRepairTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestRepairTable.scala @@ -16,14 +16,14 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD} import org.apache.hudi.HoodieSparkUtils import org.apache.hudi.common.table.HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME - import org.apache.spark.sql.SaveMode +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestRepairTable extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestShowPartitions.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestShowPartitions.scala similarity index 99% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestShowPartitions.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestShowPartitions.scala index 0704e895309d..1afb0f0974fc 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestShowPartitions.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestShowPartitions.scala @@ -15,10 +15,11 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.HoodieSparkUtils.isSpark2 import org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestShowPartitions extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestTimeTravelTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestTimeTravelTable.scala similarity index 98% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestTimeTravelTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestTimeTravelTable.scala index 73bad3be282d..9924b7003536 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestTimeTravelTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestTimeTravelTable.scala @@ -15,10 +15,11 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.HoodieSparkUtils import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestTimeTravelTable extends HoodieSparkSqlTestBase { test("Test Insert and Update Record with time travel") { @@ -69,7 +70,7 @@ class TestTimeTravelTable extends HoodieSparkSqlTestBase { test("Test Insert Into Records with time travel To new Table") { if (HoodieSparkUtils.gteqSpark3_2) { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => // Create Non-Partitioned table val tableName1 = generateTableName spark.sql( @@ -138,7 +139,7 @@ class TestTimeTravelTable extends HoodieSparkSqlTestBase { Seq(1, "a1", 10.0, 1000, "2022-02-14"), Seq(2, "a2", 10.0, 1000, "2022-02-15") ) - }) + } } } @@ -238,18 +239,18 @@ class TestTimeTravelTable extends HoodieSparkSqlTestBase { test("Test Unsupported syntax can be parsed") { if (HoodieSparkUtils.gteqSpark3_2) { checkAnswer("select 1 distribute by 1")(Seq(1)) - withRecordType()(withTempDir { dir => + withTempDir { dir => val path = dir.toURI.getPath spark.sql(s"insert overwrite local directory '$path' using parquet select 1") // Requires enable hive support, so didn't test it // spark.sql(s"insert overwrite local directory '$path' stored as orc select 1") - }) + } } } test("Test Select Record with time travel and Repartition") { if (HoodieSparkUtils.gteqSpark3_2) { - withRecordType()(withTempDir { tmp => + withTempDir { tmp => val tableName = generateTableName spark.sql( s""" @@ -289,7 +290,7 @@ class TestTimeTravelTable extends HoodieSparkSqlTestBase { s"select id, name, price, ts from $tableName TIMESTAMP AS OF '$instant1' distribute by cast(rand() * 2 as int)")( Seq(1, "a1", 10.0, 1000) ) - }) + } } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestTruncateTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestTruncateTable.scala similarity index 98% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestTruncateTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestTruncateTable.scala index 808bfebb802c..411562c35583 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestTruncateTable.scala +++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestTruncateTable.scala @@ -16,11 +16,12 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.config.HoodieWriteConfig import org.apache.spark.sql.SaveMode +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestTruncateTable extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestUpdateTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestUpdateTable.scala similarity index 98% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestUpdateTable.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestUpdateTable.scala index 0c2c34ae6d9e..5d023b8d856c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestUpdateTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestUpdateTable.scala @@ -15,12 +15,13 @@ * limitations under the License. */ -package org.apache.spark.sql.hudi +package org.apache.spark.sql.hudi.dml import org.apache.hudi.DataSourceWriteOptions.SPARK_SQL_OPTIMIZED_WRITES import org.apache.hudi.HoodieSparkUtils.isSpark2 import org.apache.hudi.common.model.HoodieTableType import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase import org.junit.jupiter.api.Assertions.assertEquals class TestUpdateTable extends HoodieSparkSqlTestBase { @@ -233,8 +234,8 @@ class TestUpdateTable extends HoodieSparkSqlTestBase { } test("Test ignoring case for Update Table") { - withRecordType()(withTempDir { tmp => - Seq("cow", "mor").foreach {tableType => + withTempDir { tmp => + Seq("cow", "mor").foreach { tableType => val tableName = generateTableName // create table spark.sql( @@ -270,7 +271,7 @@ class TestUpdateTable extends HoodieSparkSqlTestBase { Seq(1, "a1", 40.0, 1000) ) } - }) + } } test("Test decimal type") { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/HoodieSparkProcedureTestBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/HoodieSparkProcedureTestBase.scala index cff411051178..ff4f7aa6ab06 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/HoodieSparkProcedureTestBase.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/HoodieSparkProcedureTestBase.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.hudi.procedure import org.apache.spark.sql.Dataset import org.apache.spark.sql.execution.columnar.InMemoryRelation -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class HoodieSparkProcedureTestBase extends HoodieSparkSqlTestBase { override def generateTableName: String = { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestCallCommandParser.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestCallCommandParser.scala index b5b13f468060..3d07286ca190 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestCallCommandParser.scala +++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestCallCommandParser.scala @@ -21,7 +21,7 @@ import org.apache.hudi.HoodieSparkUtils import org.apache.hudi.common.util.CollectionUtils.createImmutableList import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.catalyst.plans.logical.{CallCommand, NamedArgument, PositionalArgument} -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase import org.apache.spark.sql.types.{DataType, DataTypes} import java.math.BigDecimal diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestCopyToTempViewProcedure.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestCopyToTempViewProcedure.scala index 5cb5b68fa045..6f54dfb5094c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestCopyToTempViewProcedure.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestCopyToTempViewProcedure.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.hudi.procedure import org.apache.hudi.HoodieSparkUtils -import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase +import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase class TestCopyToTempViewProcedure extends HoodieSparkSqlTestBase { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestTTLProcedure.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestTTLProcedure.scala new file mode 100644 index 000000000000..002375ac462e --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestTTLProcedure.scala @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.hudi.procedure + +import org.apache.hudi.SparkDatasetMixin +import org.apache.hudi.client.SparkRDDWriteClient +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType} +import org.apache.hudi.common.table.HoodieTableConfig +import org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.hudi.common.testutils.HoodieTestDataGenerator.{TRIP_EXAMPLE_SCHEMA, getCommitTimeAtUTC} +import org.apache.hudi.common.testutils.{HoodieTestDataGenerator, HoodieTestUtils} +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.spark.api.java.JavaSparkContext + +import java.util.Properties +import scala.collection.JavaConverters._ + +class TestTTLProcedure extends HoodieSparkProcedureTestBase with SparkDatasetMixin { + + test("Test Call run_ttl Procedure by Table") { + withSQLConf("hoodie.partition.ttl.automatic" -> "false") { + withTempDir { tmp => { + val tableName = generateTableName + val basePath = s"${tmp.getCanonicalPath}/$tableName" + initTable(basePath) + + val writeConfig = getConfigBuilder(basePath, tableName, true).build() + val client = getHoodieWriteClient(writeConfig) + val dataGen = new HoodieTestDataGenerator(0xDEED) + val partitionPaths = dataGen.getPartitionPaths() + val partitionPath0 = partitionPaths(0) + val instant0 = getCommitTimeAtUTC(0) + + writeRecordsForPartition(client, dataGen, partitionPath0, instant0) + + val instant1 = getCommitTimeAtUTC(1000) + val partitionPath1 = partitionPaths(1) + writeRecordsForPartition(client, dataGen, partitionPath1, instant1) + + val currentInstant = client.createNewInstantTime() + val partitionPath2 = partitionPaths(2) + writeRecordsForPartition(client, dataGen, partitionPath2, currentInstant) + spark.sql( + s""" + | create table $tableName using hudi + | location '$basePath' + | tblproperties ( + | primaryKey = '_row_key', + | preCombineField = '_row_key', + | type = 'cow' + | ) + |""".stripMargin) + + checkAnswer(s"call run_ttl(table => '$tableName', retain_days => 1)")( + Seq(partitionPath0), + Seq(partitionPath1) + ) + } + } + } + } + + private def writeRecordsForPartition(client: SparkRDDWriteClient[Nothing], + dataGen: HoodieTestDataGenerator, + partition: String, instantTime: String): Unit = { + val records: java.util.List[HoodieRecord[Nothing]] = + dataGen.generateInsertsForPartition(instantTime, 10, partition) + .asInstanceOf[java.util.List[HoodieRecord[Nothing]]] + // Use this JavaRDD to call the insert method + client.startCommitWithTime(instantTime, HoodieTimeline.COMMIT_ACTION) + client.insert(spark.sparkContext.parallelize(records.asScala).toJavaRDD(), instantTime) + } + + private def getHoodieWriteClient(cfg: HoodieWriteConfig): SparkRDDWriteClient[Nothing] = { + val writeClient = new SparkRDDWriteClient[Nothing]( + new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext)), cfg + ) + writeClient + } + + private def initTable(basePath: String): Unit = { + val props = new Properties() + props.put("hoodie.datasource.write.partitionpath.field", "partition_path") + props.put("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.SimpleKeyGenerator") + props.put(HoodieTableConfig.PARTITION_FIELDS.key(), "partition_path") + props.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), "_row_key") + HoodieTestUtils.init(basePath, HoodieTableType.COPY_ON_WRITE, props); + } + + protected def getConfigBuilder(basePath: String, tableName: String, autoCommit: Boolean): 
HoodieWriteConfig.Builder = + HoodieWriteConfig + .newBuilder + .withPath(basePath) + .withSchema(TRIP_EXAMPLE_SCHEMA) + .withAutoCommit(autoCommit) + .withPreCombineField("_row_key") + .forTable(tableName) + +} diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java index 74cb90de0209..8f31cae29bc9 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java @@ -90,7 +90,8 @@ public class HiveSyncConfigHolder { .defaultValue("false") .markAdvanced() .withDocumentation("‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. " - + "Disabled by default for backward compatibility."); + + "Disabled by default for backward compatibility. \n" + + "NOTE: On Spark entrypoints, this is defaulted to TRUE"); public static final ConfigProperty HIVE_TABLE_PROPERTIES = ConfigProperty .key("hoodie.datasource.hive_sync.table_properties") .noDefaultValue() diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java index 4f0ff2b070b9..f99bd2d0e020 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java @@ -39,7 +39,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.Properties; @@ -383,18 +382,7 @@ private List getTablePartitions(String tableName, List writte return syncClient.getAllPartitions(tableName); } - List partitionKeys = config.getSplitStrings(META_SYNC_PARTITION_FIELDS).stream() - .map(String::toLowerCase) - .collect(Collectors.toList()); - - List partitionFields = syncClient.getMetastoreFieldSchemas(tableName) - .stream() - .filter(f -> partitionKeys.contains(f.getName())) - .sorted(Comparator.comparing(f -> partitionKeys.indexOf(f.getName()))) - .collect(Collectors.toList()); - - return syncClient.getPartitionsByFilter(tableName, - syncClient.generatePushDownFilter(writtenPartitions, partitionFields)); + return syncClient.getPartitionsFromList(tableName, writtenPartitions); } /** diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveSyncClient.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveSyncClient.java index 4865a304a301..5a88f03e1630 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveSyncClient.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveSyncClient.java @@ -66,6 +66,7 @@ import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT; import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH; import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS; import static org.apache.hudi.sync.common.util.TableUtils.tableId; /** @@ -217,8 +218,19 @@ public List getAllPartitions(String tableName) { } @Override - public List getPartitionsByFilter(String tableName, String filter) { + public List getPartitionsFromList(String tableName, List partitions) { + String filter = null; try { + List partitionKeys = 
config.getSplitStrings(META_SYNC_PARTITION_FIELDS).stream() + .map(String::toLowerCase) + .collect(Collectors.toList()); + + List partitionFields = this.getMetastoreFieldSchemas(tableName) + .stream() + .filter(f -> partitionKeys.contains(f.getName())) + .collect(Collectors.toList()); + filter = this.generatePushDownFilter(partitions, partitionFields); + return client.listPartitionsByFilter(databaseName, tableName, filter, (short)-1) .stream() .map(p -> new Partition(p.getValues(), p.getSd().getLocation())) diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestCluster.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestCluster.java index 362ad1453c43..e92c7aedb508 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestCluster.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestCluster.java @@ -27,6 +27,7 @@ import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.testutils.InProcessTimeGenerator; @@ -203,7 +204,7 @@ private List createTestData(Path partPath, boolean isParquetSch // Create 5 files String fileId = UUID.randomUUID().toString(); Path filePath = new Path(partPath.toString() + "/" + FSUtils - .makeBaseFileName(commitTime, "1-0-1", fileId)); + .makeBaseFileName(commitTime, "1-0-1", fileId, HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension())); generateParquetData(filePath, isParquetSchemaSimple); HoodieWriteStat writeStat = new HoodieWriteStat(); writeStat.setFileId(fileId); diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java index 328e6efda082..c7ce2dd22e1f 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java @@ -36,6 +36,7 @@ import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; @@ -391,7 +392,8 @@ public static void createCOWTableWithSchema(String instantTime, String schemaFil FileCreateUtils.createPartitionMetaFile(basePath, partitionPath); List writeStats = new ArrayList<>(); String fileId = UUID.randomUUID().toString(); - Path filePath = new Path(partPath.toString() + "/" + FSUtils.makeBaseFileName(instantTime, "1-0-1", fileId)); + Path filePath = new Path(partPath.toString() + "/" + + FSUtils.makeBaseFileName(instantTime, "1-0-1", fileId, HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension())); Schema schema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, schemaFileName); generateParquetDataWithSchema(filePath, schema); HoodieWriteStat writeStat = new HoodieWriteStat(); @@ -530,7 +532,8 @@ private static List createTestData(Path partPath, boolean isPar for (int i = 0; i 
< 5; i++) { // Create 5 files String fileId = UUID.randomUUID().toString(); - Path filePath = new Path(partPath.toString() + "/" + FSUtils.makeBaseFileName(instantTime, "1-0-1", fileId)); + Path filePath = new Path(partPath.toString() + "/" + + FSUtils.makeBaseFileName(instantTime, "1-0-1", fileId, HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension())); generateParquetData(filePath, isParquetSchemaSimple); HoodieWriteStat writeStat = new HoodieWriteStat(); writeStat.setFileId(fileId); diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieMetaSyncOperations.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieMetaSyncOperations.java index b1acaf143961..ca0bec3604bd 100644 --- a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieMetaSyncOperations.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieMetaSyncOperations.java @@ -99,11 +99,9 @@ default List getAllPartitions(String tableName) { } /** - * Get the metadata of partitions that belong to the specified table - * @param tableName - * @return + * Get partitions given input list of partitions. */ - default List getPartitionsByFilter(String tableName, String filter) { + default List getPartitionsFromList(String tableName, List partitionList) { return Collections.emptyList(); } diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java index aec0e484e6cd..fc3e31164acc 100644 --- a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java @@ -92,10 +92,7 @@ public HoodieTableMetaClient getMetaClient() { * Going through archive timeline is a costly operation, and it should be avoided unless some start time is given. */ public Set getDroppedPartitionsSince(Option lastCommitTimeSynced, Option lastCommitCompletionTimeSynced) { - HoodieTimeline timeline = lastCommitTimeSynced.isPresent() - ? 
TimelineUtils.getCommitsTimelineAfter(metaClient, lastCommitTimeSynced.get(), lastCommitCompletionTimeSynced) - : metaClient.getActiveTimeline(); - return new HashSet<>(TimelineUtils.getDroppedPartitions(timeline)); + return new HashSet<>(TimelineUtils.getDroppedPartitions(metaClient, lastCommitTimeSynced, lastCommitCompletionTimeSynced)); } @Override diff --git a/hudi-timeline-service/pom.xml b/hudi-timeline-service/pom.xml index 9e6a504626a2..a2b3c67aa830 100644 --- a/hudi-timeline-service/pom.xml +++ b/hudi-timeline-service/pom.xml @@ -62,9 +62,6 @@ src/main/resources - - src/test/resources - diff --git a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/RequestHandler.java b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/RequestHandler.java index c72491341fe4..f17c5624084e 100644 --- a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/RequestHandler.java +++ b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/RequestHandler.java @@ -37,6 +37,7 @@ import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.timeline.service.handlers.BaseFileHandler; import org.apache.hudi.timeline.service.handlers.FileSliceHandler; import org.apache.hudi.timeline.service.handlers.InstantStateHandler; @@ -44,6 +45,7 @@ import org.apache.hudi.timeline.service.handlers.TimelineHandler; import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.module.afterburner.AfterburnerModule; import io.javalin.Javalin; @@ -72,6 +74,7 @@ public class RequestHandler { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper().registerModule(new AfterburnerModule()); private static final Logger LOG = LoggerFactory.getLogger(RequestHandler.class); + private static final TypeReference> LIST_TYPE_REFERENCE = new TypeReference>() {}; private final TimelineService.Config timelineServiceConfig; private final FileSystemViewManager viewManager; @@ -444,6 +447,19 @@ private void registerFileSlicesAPI() { writeValueAsString(ctx, success); }, false)); + app.post(RemoteHoodieTableFileSystemView.LOAD_PARTITIONS_URL, new ViewHandler(ctx -> { + metricsRegistry.add("LOAD_PARTITIONS", 1); + String basePath = ctx.queryParamAsClass(RemoteHoodieTableFileSystemView.BASEPATH_PARAM, String.class).getOrThrow(e -> new HoodieException("Basepath is invalid")); + try { + List partitionPaths = OBJECT_MAPPER.readValue(ctx.queryParamAsClass(RemoteHoodieTableFileSystemView.PARTITIONS_PARAM, String.class) + .getOrThrow(e -> new HoodieException("Partitions param is invalid")), LIST_TYPE_REFERENCE); + boolean success = sliceHandler.loadPartitions(basePath, partitionPaths); + writeValueAsString(ctx, success); + } catch (IOException e) { + throw new HoodieIOException("Failed to parse request parameter", e); + } + }, false)); + app.post(RemoteHoodieTableFileSystemView.LOAD_ALL_PARTITIONS_URL, new ViewHandler(ctx -> { metricsRegistry.add("LOAD_ALL_PARTITIONS", 1); boolean success = sliceHandler diff --git a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/TimelineService.java b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/TimelineService.java index f596ee79f9c5..c2e5dc265677 100644 --- 
a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/TimelineService.java +++ b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/TimelineService.java @@ -401,20 +401,20 @@ public static FileSystemViewManager buildFileSystemViewManager(Config config, Se case MEMORY: FileSystemViewStorageConfig.Builder inMemConfBuilder = FileSystemViewStorageConfig.newBuilder(); inMemConfBuilder.withStorageType(FileSystemViewStorageType.MEMORY); - return FileSystemViewManager.createViewManager(localEngineContext, metadataConfig, inMemConfBuilder.build(), commonConfig); + return FileSystemViewManager.createViewManager(localEngineContext, inMemConfBuilder.build(), commonConfig); case SPILLABLE_DISK: { FileSystemViewStorageConfig.Builder spillableConfBuilder = FileSystemViewStorageConfig.newBuilder(); spillableConfBuilder.withStorageType(FileSystemViewStorageType.SPILLABLE_DISK) .withBaseStoreDir(config.baseStorePathForFileGroups) .withMaxMemoryForView(config.maxViewMemPerTableInMB * 1024 * 1024L) .withMemFractionForPendingCompaction(config.memFractionForCompactionPerTable); - return FileSystemViewManager.createViewManager(localEngineContext, metadataConfig, spillableConfBuilder.build(), commonConfig); + return FileSystemViewManager.createViewManager(localEngineContext, spillableConfBuilder.build(), commonConfig); } case EMBEDDED_KV_STORE: { FileSystemViewStorageConfig.Builder rocksDBConfBuilder = FileSystemViewStorageConfig.newBuilder(); rocksDBConfBuilder.withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE) .withRocksDBPath(config.rocksDBPath); - return FileSystemViewManager.createViewManager(localEngineContext, metadataConfig, rocksDBConfBuilder.build(), commonConfig); + return FileSystemViewManager.createViewManager(localEngineContext, rocksDBConfBuilder.build(), commonConfig); } default: throw new IllegalArgumentException("Invalid view manager storage type :" + config.viewStorageType); diff --git a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/FileSliceHandler.java b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/FileSliceHandler.java index 4a4226724f8b..391145c5cf8b 100644 --- a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/FileSliceHandler.java +++ b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/FileSliceHandler.java @@ -163,4 +163,9 @@ public boolean loadAllPartitions(String basePath) { viewManager.getFileSystemView(basePath).loadAllPartitions(); return true; } + + public boolean loadPartitions(String basePath, List partitionPaths) { + viewManager.getFileSystemView(basePath).loadPartitions(partitionPaths); + return true; + } } diff --git a/hudi-timeline-service/src/test/java/org/apache/hudi/timeline/service/functional/TestRemoteHoodieTableFileSystemView.java b/hudi-timeline-service/src/test/java/org/apache/hudi/timeline/service/functional/TestRemoteHoodieTableFileSystemView.java index c9a103e5264f..834697852822 100644 --- a/hudi-timeline-service/src/test/java/org/apache/hudi/timeline/service/functional/TestRemoteHoodieTableFileSystemView.java +++ b/hudi-timeline-service/src/test/java/org/apache/hudi/timeline/service/functional/TestRemoteHoodieTableFileSystemView.java @@ -19,7 +19,6 @@ package org.apache.hudi.timeline.service.functional; import org.apache.hudi.common.config.HoodieCommonConfig; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.engine.HoodieLocalEngineContext; import 
org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -67,14 +66,13 @@ public class TestRemoteHoodieTableFileSystemView extends TestHoodieTableFileSyst protected SyncableFileSystemView getFileSystemView(HoodieTimeline timeline) { FileSystemViewStorageConfig sConf = FileSystemViewStorageConfig.newBuilder().withStorageType(FileSystemViewStorageType.SPILLABLE_DISK).build(); - HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder().build(); HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().build(); HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf()); try { server = new TimelineService(localEngineContext, new Configuration(), TimelineService.Config.builder().serverPort(0).build(), FileSystem.get(new Configuration()), - FileSystemViewManager.createViewManager(localEngineContext, metadataConfig, sConf, commonConfig)); + FileSystemViewManager.createViewManager(localEngineContext, sConf, commonConfig)); server.startService(); } catch (Exception ex) { throw new RuntimeException(ex); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieIndexer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieIndexer.java index 5c626a53ae7e..03b6d934b5f8 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieIndexer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieIndexer.java @@ -43,6 +43,7 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Locale; import java.util.Set; @@ -240,7 +241,8 @@ private Option doSchedule(SparkRDDWriteClient clien if (indexExists(partitionTypes)) { return Option.empty(); } - Option indexingInstant = client.scheduleIndexing(partitionTypes); + + Option indexingInstant = client.scheduleIndexing(partitionTypes, Collections.emptyList()); if (!indexingInstant.isPresent()) { LOG.error("Scheduling of index action did not return any instant."); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java index 4cebbf0b3cc1..06dbde8b108c 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java @@ -103,6 +103,7 @@ import static org.apache.hudi.common.model.HoodieRecord.PARTITION_PATH_METADATA_FIELD; import static org.apache.hudi.common.model.HoodieRecord.RECORD_KEY_METADATA_FIELD; import static org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType.INSTANT_TIME; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN; import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS; import static org.apache.hudi.common.util.StringUtils.getUTF8Bytes; import static org.apache.hudi.hadoop.fs.CachingPath.getPathWithoutSchemeAndAuthority; @@ -180,6 +181,8 @@ public class HoodieMetadataTableValidator implements Serializable { private final String taskLabels; + private List throwables = new ArrayList<>(); + public HoodieMetadataTableValidator(JavaSparkContext jsc, Config cfg) { this.jsc = jsc; this.cfg = cfg; @@ -197,8 +200,30 @@ public HoodieMetadataTableValidator(JavaSparkContext jsc, Config cfg) { this.taskLabels = generateValidationTaskLabels(); } + 
/** + * Returns list of Throwable which were encountered during validation. This method is useful + * when ignoreFailed parameter is set to true. + */ + public List getThrowables() { + return throwables; + } + + /** + * Returns true if there is a validation failure encountered during validation. + * This method is useful when ignoreFailed parameter is set to true. + */ + public boolean hasValidationFailure() { + for (Throwable throwable : throwables) { + if (throwable instanceof HoodieValidationException) { + return true; + } + } + return false; + } + private String generateValidationTaskLabels() { List labelList = new ArrayList<>(); + labelList.add(cfg.basePath); if (cfg.validateLatestBaseFiles) { labelList.add("validate-latest-base-files"); } @@ -411,10 +436,10 @@ public boolean run() { try { LOG.info(cfg.toString()); if (cfg.continuous) { - LOG.info(" ****** do hoodie metadata table validation in CONTINUOUS mode ******"); + LOG.info(" ****** do hoodie metadata table validation in CONTINUOUS mode - {} ******", taskLabels); doHoodieMetadataTableValidationContinuous(); } else { - LOG.info(" ****** do hoodie metadata table validation once ******"); + LOG.info(" ****** do hoodie metadata table validation once - {} ******", taskLabels); result = doHoodieMetadataTableValidationOnce(); } } catch (Exception e) { @@ -432,10 +457,11 @@ private boolean doHoodieMetadataTableValidationOnce() { try { return doMetadataTableValidation(); } catch (Throwable e) { - LOG.error("Metadata table validation failed to HoodieValidationException", e); + LOG.error("Metadata table validation failed to HoodieValidationException {} {}", taskLabels, e); if (!cfg.ignoreFailed) { throw e; } + throwables.add(e); return false; } } @@ -491,7 +517,7 @@ public boolean doMetadataTableValidation() { List allPartitions = validatePartitions(engineContext, basePath); if (allPartitions.isEmpty()) { - LOG.warn("The result of getting all partitions is null or empty, skip current validation."); + LOG.warn("The result of getting all partitions is null or empty, skip current validation. 
{}", taskLabels); return true; } @@ -500,12 +526,12 @@ public boolean doMetadataTableValidation() { HoodieMetadataValidationContext fsBasedContext = new HoodieMetadataValidationContext(engineContext, props, metaClient, false)) { Set finalBaseFilesForCleaning = baseFilesForCleaning; - List> result = new ArrayList<>( + List> result = new ArrayList<>( engineContext.parallelize(allPartitions, allPartitions.size()).map(partitionPath -> { try { validateFilesInPartition(metadataTableBasedContext, fsBasedContext, partitionPath, finalBaseFilesForCleaning); LOG.info(String.format("Metadata table validation succeeded for partition %s (partition %s)", partitionPath, taskLabels)); - return Pair.of(true, ""); + return Pair.of(true, null); } catch (HoodieValidationException e) { LOG.error( String.format("Metadata table validation failed for partition %s due to HoodieValidationException (partition %s)", @@ -513,26 +539,29 @@ public boolean doMetadataTableValidation() { if (!cfg.ignoreFailed) { throw e; } - return Pair.of(false, e.getMessage() + " for partition: " + partitionPath); + return Pair.of(false, new HoodieValidationException(e.getMessage() + " for partition: " + partitionPath, e)); } }).collectAsList()); try { validateRecordIndex(engineContext, metaClient, metadataTableBasedContext.getTableMetadata()); - result.add(Pair.of(true, "")); + result.add(Pair.of(true, null)); } catch (HoodieValidationException e) { LOG.error( - "Metadata table validation failed due to HoodieValidationException in record index validation", e); + "Metadata table validation failed due to HoodieValidationException in record index validation for table: {} ", cfg.basePath, e); if (!cfg.ignoreFailed) { throw e; } - result.add(Pair.of(false, e.getMessage())); + result.add(Pair.of(false, e)); } - for (Pair res : result) { + for (Pair res : result) { finalResult &= res.getKey(); if (res.getKey().equals(false)) { LOG.error("Metadata Validation failed for table: " + cfg.basePath + " with error: " + res.getValue()); + if (res.getRight() != null) { + throwables.add(res.getRight()); + } } } @@ -563,19 +592,19 @@ private boolean checkMetadataTableIsAvailable() { int finishedInstants = mdtMetaClient.getCommitsTimeline().filterCompletedInstants().countInstants(); if (finishedInstants == 0) { if (metaClient.getCommitsTimeline().filterCompletedInstants().countInstants() == 0) { - LOG.info("There is no completed commit in both metadata table and corresponding data table."); + LOG.info("There is no completed commit in both metadata table and corresponding data table: {}", taskLabels); return false; } else { - throw new HoodieValidationException("There is no completed instant for metadata table."); + throw new HoodieValidationException("There is no completed instant for metadata table: " + cfg.basePath); } } return true; } catch (TableNotFoundException tbe) { // Suppress the TableNotFound exception if Metadata table is not available to read for now - LOG.warn("Metadata table is not found. Skip current validation."); + LOG.warn("Metadata table is not found for table: {}. 
Skip current validation.", cfg.basePath); return false; } catch (Exception ex) { - LOG.warn("Metadata table is not available to read for now, ", ex); + LOG.warn("Metadata table is not available to read for now for table: {}, ", cfg.basePath, ex); return false; } } @@ -622,9 +651,43 @@ private List validatePartitions(HoodieSparkEngineContext engineContext, if (allPartitionPathsFromFS.size() != allPartitionPathsMeta.size() || !allPartitionPathsFromFS.equals(allPartitionPathsMeta)) { - String message = "Compare Partitions Failed! " + "AllPartitionPathsFromFS : " + allPartitionPathsFromFS + " and allPartitionPathsMeta : " + allPartitionPathsMeta; - LOG.error(message); - throw new HoodieValidationException(message); + List additionalFromFS = new ArrayList<>(allPartitionPathsFromFS); + additionalFromFS.remove(allPartitionPathsMeta); + List additionalFromMDT = new ArrayList<>(allPartitionPathsMeta); + additionalFromMDT.remove(allPartitionPathsFromFS); + boolean misMatch = true; + List actualAdditionalPartitionsInMDT = new ArrayList<>(additionalFromMDT); + if (additionalFromFS.isEmpty() && !additionalFromMDT.isEmpty()) { + // there is a chance that when we polled MDT there could have been a new completed commit which was not complete when we polled FS based + // listing. let's rule that out. + additionalFromMDT.forEach(partitionFromDMT -> { + + HoodiePartitionMetadata hoodiePartitionMetadata = + new HoodiePartitionMetadata(metaClient.getFs(), FSUtils.getPartitionPath(basePath, partitionFromDMT)); + Option partitionCreationTimeOpt = hoodiePartitionMetadata.readPartitionCreatedCommitTime(); + // if creation time is greater than last completed instant in active timeline, we can ignore the additional partition from MDT. + if (partitionCreationTimeOpt.isPresent() && !completedTimeline.containsInstant(partitionCreationTimeOpt.get())) { + Option lastInstant = completedTimeline.lastInstant(); + if (lastInstant.isPresent() + && HoodieTimeline.compareTimestamps(partitionCreationTimeOpt.get(), GREATER_THAN, lastInstant.get().getTimestamp())) { + LOG.warn("Ignoring additional partition " + partitionFromDMT + ", as it was deduced to be part of a " + + "latest completed commit which was inflighht when FS based listing was polled."); + actualAdditionalPartitionsInMDT.remove(partitionFromDMT); + } + } + }); + // if there is no additional partitions from FS listing and only additional partitions from MDT based listing is due to a new commit, we are good + if (actualAdditionalPartitionsInMDT.isEmpty()) { + misMatch = false; + } + } + if (misMatch) { + String message = "Compare Partitions Failed! 
" + " Additional partitions from FS, but missing from MDT : \"" + additionalFromFS + + "\" and additional partitions from MDT, but missing from FS listing : \"" + actualAdditionalPartitionsInMDT + + "\".\n All partitions from FS listing " + allPartitionPathsFromFS; + LOG.error(message); + throw new HoodieValidationException(message); + } } return allPartitionPathsMeta; @@ -835,13 +898,13 @@ private void validateRecordIndexCount(HoodieSparkEngineContext sparkEngineContex if (countKeyFromTable != countKeyFromRecordIndex) { String message = String.format("Validation of record index count failed: " - + "%s entries from record index metadata, %s keys from the data table.", + + "%s entries from record index metadata, %s keys from the data table: " + cfg.basePath, countKeyFromRecordIndex, countKeyFromTable); LOG.error(message); throw new HoodieValidationException(message); } else { LOG.info(String.format( - "Validation of record index count succeeded: %s entries.", countKeyFromRecordIndex)); + "Validation of record index count succeeded: %s entries. Table: %s", countKeyFromRecordIndex, cfg.basePath)); } } @@ -950,13 +1013,13 @@ private void validateRecordIndexContent(HoodieSparkEngineContext sparkEngineCont if (diffCount > 0) { String message = String.format("Validation of record index content failed: " + "%s keys (total %s) from the data table have wrong location in record index " - + "metadata. Sample mismatches: %s", - diffCount, countKey, String.join(";", result.getRight())); + + "metadata. Table: %s Sample mismatches: %s", + diffCount, countKey, cfg.basePath, String.join(";", result.getRight())); LOG.error(message); throw new HoodieValidationException(message); } else { LOG.info(String.format( - "Validation of record index content succeeded: %s entries.", countKey)); + "Validation of record index content succeeded: %s entries. Table: %s", countKey, cfg.basePath)); } } @@ -995,13 +1058,13 @@ private void validate( List infoListFromMetadataTable, List infoListFromFS, String partitionPath, String label) { if (infoListFromMetadataTable.size() != infoListFromFS.size() || !infoListFromMetadataTable.equals(infoListFromFS)) { - String message = String.format("Validation of %s for partition %s failed." + String message = String.format("Validation of %s for partition %s failed for table: %s " + "\n%s from metadata: %s\n%s from file system and base files: %s", - label, partitionPath, label, infoListFromMetadataTable, label, infoListFromFS); + label, partitionPath, cfg.basePath, label, infoListFromMetadataTable, label, infoListFromFS); LOG.error(message); throw new HoodieValidationException(message); } else { - LOG.info(String.format("Validation of %s succeeded for partition %s", label, partitionPath)); + LOG.info(String.format("Validation of %s succeeded for partition %s for table: %s", label, partitionPath, cfg.basePath)); } } @@ -1035,13 +1098,13 @@ private void validateFileSlices( } if (mismatch) { - String message = String.format("Validation of %s for partition %s failed." 
+ String message = String.format("Validation of %s for partition %s failed for table: %s " + "\n%s from metadata: %s\n%s from file system and base files: %s", - label, partitionPath, label, fileSliceListFromMetadataTable, label, fileSliceListFromFS); + label, partitionPath, cfg.basePath, label, fileSliceListFromMetadataTable, label, fileSliceListFromFS); LOG.error(message); throw new HoodieValidationException(message); } else { - LOG.info(String.format("Validation of %s succeeded for partition %s", label, partitionPath)); + LOG.info(String.format("Validation of %s succeeded for partition %s for table: %s ", label, partitionPath, cfg.basePath)); } } @@ -1217,6 +1280,7 @@ protected Pair startService() { if (!cfg.ignoreFailed) { throw e; } + throwables.add(e); } catch (InterruptedException e) { // ignore InterruptedException here. } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieTTLJob.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieTTLJob.java new file mode 100644 index 000000000000..3f11621d9d10 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieTTLJob.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities; + +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaSparkContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +/** + * Utility class to run TTL management. 
+ */ +public class HoodieTTLJob { + + private static final Logger LOG = LoggerFactory.getLogger(HoodieTTLJob.class); + private final Config cfg; + private final TypedProperties props; + private final JavaSparkContext jsc; + private HoodieTableMetaClient metaClient; + + public HoodieTTLJob(JavaSparkContext jsc, Config cfg) { + this(jsc, cfg, UtilHelpers.buildProperties(jsc.hadoopConfiguration(), cfg.propsFilePath, cfg.configs), + UtilHelpers.createMetaClient(jsc, cfg.basePath, true)); + } + + public HoodieTTLJob(JavaSparkContext jsc, Config cfg, TypedProperties props, HoodieTableMetaClient metaClient) { + this.cfg = cfg; + this.jsc = jsc; + this.props = props; + this.metaClient = metaClient; + LOG.info("Creating TTL job with configs : " + props.toString()); + // Disable async cleaning, will trigger synchronous cleaning manually. + this.props.put(HoodieCleanConfig.ASYNC_CLEAN.key(), false); + if (this.metaClient.getTableConfig().isMetadataTableAvailable()) { + // add default lock config options if MDT is enabled. + UtilHelpers.addLockOptions(cfg.basePath, this.props); + } + } + + public void run() { + // need to do commit in SparkDeletePartitionCommitActionExecutor#execute + this.props.put(HoodieWriteConfig.AUTO_COMMIT_ENABLE.key(), "true"); + try (SparkRDDWriteClient client = + UtilHelpers.createHoodieClient(jsc, cfg.basePath, "", cfg.parallelism, Option.empty(), props)) { + client.managePartitionTTL(client.createNewInstantTime()); + } + } + + private HoodieWriteConfig getHoodieClientConfig() { + return HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.basePath).withAutoCommit(true) + .withProps(props).build(); + } + + public static class Config implements Serializable { + @Parameter(names = {"--base-path", "-sp"}, description = "Base path for the table", required = true) + public String basePath = null; + @Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for hoodie insert/upsert/delete", required = false) + public int parallelism = 1500; + @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master") + public String sparkMaster = null; + @Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = false) + public String sparkMemory = null; + + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + + + @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for " + + "hoodie client for clustering") + public String propsFilePath = null; + + @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + + "(using the CLI parameter \"--props\") can also be passed command line using this parameter. 
This can be repeated", + splitter = IdentitySplitter.class) + public List configs = new ArrayList<>(); + } + + public static void main(String[] args) { + final HoodieTTLJob.Config cfg = new HoodieTTLJob.Config(); + JCommander cmd = new JCommander(cfg, null, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + throw new HoodieException("Failed to run ttl for " + cfg.basePath); + } + + String dirName = new Path(cfg.basePath).getName(); + JavaSparkContext jssc = UtilHelpers.buildSparkContext("hoodie-ttl-job-" + dirName, cfg.sparkMaster); + + try { + new HoodieTTLJob(jssc, cfg).run(); + } catch (Throwable throwable) { + throw new HoodieException("Failed to run ttl for " + cfg.basePath, throwable); + } finally { + jssc.stop(); + } + + LOG.info("Hoodie TTL job ran successfully"); + } + +} diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieWithTimelineServer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieWithTimelineServer.java index e2c23b151532..fdcb806b434d 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieWithTimelineServer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieWithTimelineServer.java @@ -102,7 +102,7 @@ public String sendRequest(String driverHost, int port) { try (CloseableHttpClient client = HttpClientBuilder.create().build()) { System.out.println("Sleeping for " + cfg.delaySecs + " secs "); - Thread.sleep(cfg.delaySecs * 1000); + Thread.sleep(cfg.delaySecs * 1000L); System.out.println("Woke up after sleeping for " + cfg.delaySecs + " secs "); HttpGet request = new HttpGet(url); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java index 18e92a8463ce..9dcecfa0bcb3 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java @@ -66,6 +66,7 @@ import org.apache.hudi.utilities.sources.Source; import org.apache.hudi.utilities.sources.processor.ChainedJsonKafkaSourcePostProcessor; import org.apache.hudi.utilities.sources.processor.JsonKafkaSourcePostProcessor; +import org.apache.hudi.utilities.streamer.StreamContext; import org.apache.hudi.utilities.transform.ChainedTransformer; import org.apache.hudi.utilities.transform.ErrorTableAwareChainedTransformer; import org.apache.hudi.utilities.transform.Transformer; @@ -156,6 +157,24 @@ public static Source createSource(String sourceClass, TypedProperties cfg, JavaS } } + public static Source createSource(String sourceClass, TypedProperties cfg, JavaSparkContext jssc, + SparkSession sparkSession, HoodieIngestionMetrics metrics, StreamContext streamContext) + throws IOException { + try { + try { + return (Source) ReflectionUtils.loadClass(sourceClass, + new Class[] {TypedProperties.class, JavaSparkContext.class, + SparkSession.class, + HoodieIngestionMetrics.class, StreamContext.class}, + cfg, jssc, sparkSession, metrics, streamContext); + } catch (HoodieException e) { + return createSource(sourceClass, cfg, jssc, sparkSession, streamContext.getSchemaProvider(), metrics); + } + } catch (Throwable e) { + throw new IOException("Could not load source class " + sourceClass, e); + } + } + public static JsonKafkaSourcePostProcessor createJsonKafkaSourcePostProcessor(String postProcessorClassNames, TypedProperties props) throws IOException { if (StringUtils.isNullOrEmpty(postProcessorClassNames)) { return null; @@ -608,6 +627,7 @@ public static 
int retry(int maxRetryCount, CheckedSupplier<Integer> supplier, St
       } while (ret != 0 && maxRetryCount-- > 0);
     } catch (Throwable t) {
       LOG.error(errorMessage, t);
+      throw new RuntimeException("Failed in retry", t);
     }
     return ret;
   }
diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/config/CloudSourceConfig.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/config/CloudSourceConfig.java
index 54be9cabef92..e3bdca1a3957 100644
--- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/config/CloudSourceConfig.java
+++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/config/CloudSourceConfig.java
@@ -85,14 +85,14 @@ public class CloudSourceConfig extends HoodieConfig {
       .noDefaultValue()
       .withAlternatives(DELTA_STREAMER_CONFIG_PREFIX + "source.cloud.data.select.relpath.prefix")
       .markAdvanced()
-      .withDocumentation("Only selects objects in the bucket whose relative path matches this prefix");
+      .withDocumentation("Only selects objects in the bucket whose relative path starts with this prefix");
   public static final ConfigProperty<String> IGNORE_RELATIVE_PATH_PREFIX = ConfigProperty
       .key(STREAMER_CONFIG_PREFIX + "source.cloud.data.ignore.relpath.prefix")
       .noDefaultValue()
       .withAlternatives(DELTA_STREAMER_CONFIG_PREFIX + "source.cloud.data.ignore.relpath.prefix")
       .markAdvanced()
-      .withDocumentation("Ignore objects in the bucket whose relative path matches this prefix");
+      .withDocumentation("Ignore objects in the bucket whose relative path starts with this prefix");
   public static final ConfigProperty<String> IGNORE_RELATIVE_PATH_SUBSTR = ConfigProperty
       .key(STREAMER_CONFIG_PREFIX + "source.cloud.data.ignore.relpath.substring")
diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/config/HoodieStreamerConfig.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/config/HoodieStreamerConfig.java
index b3b64cff905b..e50e7fa06124 100644
--- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/config/HoodieStreamerConfig.java
+++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/config/HoodieStreamerConfig.java
@@ -132,4 +132,11 @@ public class HoodieStreamerConfig extends HoodieConfig {
       .sinceVersion("0.14.0")
       .withDocumentation("Number of records to sample from the first write. To improve the estimation's accuracy, "
           + "for smaller or more compressable record size, set the sample size bigger. 
For bigger or less compressable record size, set smaller."); + + public static final ConfigProperty ROW_THROW_EXPLICIT_EXCEPTIONS = ConfigProperty + .key(STREAMER_CONFIG_PREFIX + "row.throw.explicit.exceptions") + .defaultValue(false) + .markAdvanced() + .sinceVersion("0.15.0") + .withDocumentation("When enabled, the dataframe generated from reading source data is wrapped with an exception handler to explicitly surface exceptions."); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/config/S3EventsHoodieIncrSourceConfig.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/config/S3EventsHoodieIncrSourceConfig.java index 3db572b1f84f..23ecb96d7956 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/config/S3EventsHoodieIncrSourceConfig.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/config/S3EventsHoodieIncrSourceConfig.java @@ -47,6 +47,8 @@ public class S3EventsHoodieIncrSourceConfig extends HoodieConfig { .markAdvanced() .withDocumentation("Control whether we do existence check for files before consuming them"); + @Deprecated + // Use {@link CloudSourceConfig.SELECT_RELATIVE_PATH_PREFIX} public static final ConfigProperty S3_KEY_PREFIX = ConfigProperty .key(STREAMER_CONFIG_PREFIX + "source.s3incr.key.prefix") .noDefaultValue() @@ -61,6 +63,8 @@ public class S3EventsHoodieIncrSourceConfig extends HoodieConfig { .markAdvanced() .withDocumentation("The file system prefix."); + @Deprecated + // Use {@link CloudSourceConfig.IGNORE_RELATIVE_PATH_PREFIX} public static final ConfigProperty S3_IGNORE_KEY_PREFIX = ConfigProperty .key(STREAMER_CONFIG_PREFIX + "source.s3incr.ignore.key.prefix") .noDefaultValue() @@ -68,6 +72,8 @@ public class S3EventsHoodieIncrSourceConfig extends HoodieConfig { .markAdvanced() .withDocumentation("Control whether to ignore the s3 objects starting with this prefix"); + @Deprecated + // Use {@link CloudSourceConfig.IGNORE_RELATIVE_PATH_SUBSTR} public static final ConfigProperty S3_IGNORE_KEY_SUBSTRING = ConfigProperty .key(STREAMER_CONFIG_PREFIX + "source.s3incr.ignore.key.substring") .noDefaultValue() diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java index c794db32510e..4002d1579bb7 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java @@ -22,7 +22,9 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.Option; import org.apache.hudi.utilities.schema.SchemaProvider; +import org.apache.hudi.utilities.streamer.DefaultStreamContext; import org.apache.hudi.utilities.streamer.HoodieStreamer; import org.apache.hudi.utilities.streamer.StreamSync; @@ -49,6 +51,6 @@ public DeltaSync(HoodieStreamer.Config cfg, SparkSession sparkSession, SchemaPro public DeltaSync(HoodieDeltaStreamer.Config cfg, SparkSession sparkSession, SchemaProvider schemaProvider, TypedProperties props, HoodieSparkEngineContext hoodieSparkContext, FileSystem fs, Configuration conf, Function onInitializingHoodieWriteClient) throws IOException { - super(cfg, sparkSession, schemaProvider, props, hoodieSparkContext, fs, conf, onInitializingHoodieWriteClient); + super(cfg, sparkSession, props, hoodieSparkContext, fs, conf, 
onInitializingHoodieWriteClient, new DefaultStreamContext(schemaProvider, Option.empty())); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/KafkaOffsetPostProcessor.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/KafkaOffsetPostProcessor.java index 500bb0c7f99f..294838a435fa 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/KafkaOffsetPostProcessor.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/KafkaOffsetPostProcessor.java @@ -18,18 +18,18 @@ package org.apache.hudi.utilities.schema; -import org.apache.avro.JsonProperties; import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.internal.schema.HoodieSchemaException; import org.apache.hudi.utilities.config.HoodieStreamerConfig; +import org.apache.avro.JsonProperties; import org.apache.avro.Schema; import org.apache.spark.api.java.JavaSparkContext; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import java.util.Arrays; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import static org.apache.hudi.avro.AvroSchemaUtils.createNullableSchema; @@ -51,8 +51,6 @@ public static boolean shouldAddOffsets(TypedProperties props) { } } - private static final Logger LOG = LoggerFactory.getLogger(KafkaOffsetPostProcessor.class); - public static final String KAFKA_SOURCE_OFFSET_COLUMN = "_hoodie_kafka_source_offset"; public static final String KAFKA_SOURCE_PARTITION_COLUMN = "_hoodie_kafka_source_partition"; public static final String KAFKA_SOURCE_TIMESTAMP_COLUMN = "_hoodie_kafka_source_timestamp"; @@ -65,16 +63,29 @@ public KafkaOffsetPostProcessor(TypedProperties props, JavaSparkContext jssc) { @Override public Schema processSchema(Schema schema) { // this method adds kafka offset fields namely source offset, partition, timestamp and kafka message key to the schema of the batch. + List fieldList = schema.getFields(); + Set fieldNames = fieldList.stream().map(Schema.Field::name).collect(Collectors.toSet()); + // if the source schema already contains the kafka offset fields, then return the schema as is. 
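+    // This check keeps the post processor idempotent: when a schema provider already exposes
+    // KAFKA_SOURCE_OFFSET_COLUMN, KAFKA_SOURCE_PARTITION_COLUMN, KAFKA_SOURCE_TIMESTAMP_COLUMN and
+    // KAFKA_SOURCE_KEY_COLUMN (for example, a schema derived from a table written by a previous run),
+    // re-applying the processor leaves the schema unchanged instead of attempting to append the fields again.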
+ if (fieldNames.containsAll(Arrays.asList(KAFKA_SOURCE_OFFSET_COLUMN, KAFKA_SOURCE_PARTITION_COLUMN, KAFKA_SOURCE_TIMESTAMP_COLUMN, KAFKA_SOURCE_KEY_COLUMN))) { + return schema; + } try { - List fieldList = schema.getFields(); List newFieldList = fieldList.stream() .map(f -> new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal())).collect(Collectors.toList()); - newFieldList.add(new Schema.Field(KAFKA_SOURCE_OFFSET_COLUMN, Schema.create(Schema.Type.LONG), "offset column", 0)); - newFieldList.add(new Schema.Field(KAFKA_SOURCE_PARTITION_COLUMN, Schema.create(Schema.Type.INT), "partition column", 0)); - newFieldList.add(new Schema.Field(KAFKA_SOURCE_TIMESTAMP_COLUMN, Schema.create(Schema.Type.LONG), "timestamp column", 0)); - newFieldList.add(new Schema.Field(KAFKA_SOURCE_KEY_COLUMN, createNullableSchema(Schema.Type.STRING), "kafka key column", JsonProperties.NULL_VALUE)); - Schema newSchema = Schema.createRecord(schema.getName() + "_processed", schema.getDoc(), schema.getNamespace(), false, newFieldList); - return newSchema; + // handle case where source schema provider may have already set 1 or more of these fields + if (!fieldNames.contains(KAFKA_SOURCE_OFFSET_COLUMN)) { + newFieldList.add(new Schema.Field(KAFKA_SOURCE_OFFSET_COLUMN, Schema.create(Schema.Type.LONG), "offset column", 0)); + } + if (!fieldNames.contains(KAFKA_SOURCE_PARTITION_COLUMN)) { + newFieldList.add(new Schema.Field(KAFKA_SOURCE_PARTITION_COLUMN, Schema.create(Schema.Type.INT), "partition column", 0)); + } + if (!fieldNames.contains(KAFKA_SOURCE_TIMESTAMP_COLUMN)) { + newFieldList.add(new Schema.Field(KAFKA_SOURCE_TIMESTAMP_COLUMN, Schema.create(Schema.Type.LONG), "timestamp column", 0)); + } + if (!fieldNames.contains(KAFKA_SOURCE_KEY_COLUMN)) { + newFieldList.add(new Schema.Field(KAFKA_SOURCE_KEY_COLUMN, createNullableSchema(Schema.Type.STRING), "kafka key column", JsonProperties.NULL_VALUE)); + } + return Schema.createRecord(schema.getName() + "_processed", schema.getDoc(), schema.getNamespace(), false, newFieldList); } catch (Exception e) { throw new HoodieSchemaException("Kafka offset post processor failed with schema: " + schema, e); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroKafkaSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroKafkaSource.java index 2bf92280faf5..36c83d630300 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroKafkaSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroKafkaSource.java @@ -27,6 +27,8 @@ import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.sources.helpers.AvroConvertor; import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen; +import org.apache.hudi.utilities.streamer.DefaultStreamContext; +import org.apache.hudi.utilities.streamer.StreamContext; import org.apache.avro.generic.GenericRecord; import org.apache.kafka.clients.consumer.ConsumerRecord; @@ -69,10 +71,13 @@ public class AvroKafkaSource extends KafkaSource { public AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider, HoodieIngestionMetrics metrics) { - super(props, sparkContext, sparkSession, - UtilHelpers.getSchemaProviderForKafkaSource(schemaProvider, props, sparkContext), - SourceType.AVRO, metrics); - this.originalSchemaProvider = schemaProvider; + this(props, sparkContext, sparkSession, metrics, new DefaultStreamContext(schemaProvider, Option.empty())); + } + + 
public AvroKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SparkSession sparkSession, HoodieIngestionMetrics metrics, StreamContext streamContext) { + super(properties, sparkContext, sparkSession, SourceType.AVRO, metrics, + new DefaultStreamContext(UtilHelpers.getSchemaProviderForKafkaSource(streamContext.getSchemaProvider(), properties, sparkContext), streamContext.getSourceProfileSupplier())); + this.originalSchemaProvider = streamContext.getSchemaProvider(); props.put(NATIVE_KAFKA_KEY_DESERIALIZER_PROP, StringDeserializer.class.getName()); deserializerClassName = getStringWithAltKeys(props, KAFKA_AVRO_VALUE_DESERIALIZER_CLASS, true); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/GcsEventsHoodieIncrSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/GcsEventsHoodieIncrSource.java index a06130d39728..079507429093 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/GcsEventsHoodieIncrSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/GcsEventsHoodieIncrSource.java @@ -27,6 +27,7 @@ import org.apache.hudi.utilities.sources.helpers.CloudDataFetcher; import org.apache.hudi.utilities.sources.helpers.CloudObjectIncrCheckpoint; import org.apache.hudi.utilities.sources.helpers.CloudObjectMetadata; +import org.apache.hudi.utilities.sources.helpers.CloudObjectsSelectorCommon; import org.apache.hudi.utilities.sources.helpers.IncrSourceHelper; import org.apache.hudi.utilities.sources.helpers.IncrSourceHelper.MissingCheckpointStrategy; import org.apache.hudi.utilities.sources.helpers.QueryInfo; @@ -48,11 +49,9 @@ import static org.apache.hudi.common.util.ConfigUtils.getIntWithAltKeys; import static org.apache.hudi.common.util.ConfigUtils.getStringWithAltKeys; import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; -import static org.apache.hudi.utilities.config.CloudSourceConfig.DATAFILE_FORMAT; import static org.apache.hudi.utilities.config.CloudSourceConfig.ENABLE_EXISTS_CHECK; import static org.apache.hudi.utilities.config.HoodieIncrSourceConfig.HOODIE_SRC_BASE_PATH; import static org.apache.hudi.utilities.config.HoodieIncrSourceConfig.NUM_INSTANTS_PER_FETCH; -import static org.apache.hudi.utilities.config.HoodieIncrSourceConfig.SOURCE_FILE_FORMAT; import static org.apache.hudi.utilities.sources.helpers.IncrSourceHelper.generateQueryInfo; import static org.apache.hudi.utilities.sources.helpers.IncrSourceHelper.getHollowCommitHandleMode; import static org.apache.hudi.utilities.sources.helpers.IncrSourceHelper.getMissingCheckpointStrategy; @@ -116,18 +115,14 @@ public class GcsEventsHoodieIncrSource extends HoodieIncrSource { private final Option schemaProvider; private final Option snapshotLoadQuerySplitter; - - public static final String GCS_OBJECT_KEY = "name"; - public static final String GCS_OBJECT_SIZE = "size"; - private static final Logger LOG = LoggerFactory.getLogger(GcsEventsHoodieIncrSource.class); public GcsEventsHoodieIncrSource(TypedProperties props, JavaSparkContext jsc, SparkSession spark, SchemaProvider schemaProvider) { this(props, jsc, spark, schemaProvider, - new GcsObjectMetadataFetcher(props, getSourceFileFormat(props)), - new CloudDataFetcher(props, getStringWithAltKeys(props, DATAFILE_FORMAT, true)), + new GcsObjectMetadataFetcher(props), + new CloudDataFetcher(props), new QueryRunner(spark, props) ); } @@ -163,7 +158,8 @@ public Pair>, String> fetchNextBatch(Option lastChec sparkContext, srcPath, numInstantsPerFetch, 
Option.of(cloudObjectIncrCheckpoint.getCommit()), missingCheckpointStrategy, handlingMode, HoodieRecord.COMMIT_TIME_METADATA_FIELD, - GCS_OBJECT_KEY, GCS_OBJECT_SIZE, true, + CloudObjectsSelectorCommon.GCS_OBJECT_KEY, + CloudObjectsSelectorCommon.GCS_OBJECT_SIZE, true, Option.ofNullable(cloudObjectIncrCheckpoint.getKey())); LOG.info("Querying GCS with:" + cloudObjectIncrCheckpoint + " and queryInfo:" + queryInfo); @@ -196,9 +192,4 @@ private Pair>, String> extractData(QueryInfo queryInfo, Data Option> fileDataRows = gcsObjectDataFetcher.getCloudObjectDataDF(sparkSession, cloudObjectMetadata, props, schemaProvider); return Pair.of(fileDataRows, queryInfo.getEndInstant()); } - - private static String getSourceFileFormat(TypedProperties props) { - return getStringWithAltKeys(props, SOURCE_FILE_FORMAT, true); - } - } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java index eb67abfee3a6..c8c3b3421c6f 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java @@ -19,6 +19,7 @@ package org.apache.hudi.utilities.sources; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.config.JsonKafkaPostProcessorConfig; @@ -27,6 +28,8 @@ import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen; import org.apache.hudi.utilities.sources.processor.JsonKafkaSourcePostProcessor; +import org.apache.hudi.utilities.streamer.DefaultStreamContext; +import org.apache.hudi.utilities.streamer.StreamContext; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ObjectNode; @@ -44,10 +47,10 @@ import java.util.List; import static org.apache.hudi.common.util.ConfigUtils.getStringWithAltKeys; +import static org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_KEY_COLUMN; import static org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_OFFSET_COLUMN; import static org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_PARTITION_COLUMN; import static org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_TIMESTAMP_COLUMN; -import static org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_KEY_COLUMN; /** * Read json kafka data. 
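The next hunk gives JsonKafkaSource the same StreamContext-based constructor that the other Kafka sources in this patch receive. As a rough, hypothetical sketch of how that overload can be wired up (the helper class below and every argument passed to it are placeholders, not part of this patch):

    import org.apache.hudi.common.config.TypedProperties;
    import org.apache.hudi.common.util.Option;
    import org.apache.hudi.utilities.ingestion.HoodieIngestionMetrics;
    import org.apache.hudi.utilities.schema.SchemaProvider;
    import org.apache.hudi.utilities.sources.JsonKafkaSource;
    import org.apache.hudi.utilities.streamer.DefaultStreamContext;
    import org.apache.hudi.utilities.streamer.StreamContext;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.sql.SparkSession;

    // Hypothetical wiring helper; callers such as StreamSync are expected to build these arguments.
    class JsonKafkaSourceWiringSketch {
      static JsonKafkaSource newSource(TypedProperties props, JavaSparkContext jssc, SparkSession spark,
                                       SchemaProvider schemaProvider, HoodieIngestionMetrics metrics) {
        // No SourceProfileSupplier is supplied here, so the source keeps the existing sourceLimit-driven
        // offset generation.
        StreamContext streamContext = new DefaultStreamContext(schemaProvider, Option.empty());
        return new JsonKafkaSource(props, jssc, spark, metrics, streamContext);
      }
    }

Passing Option.empty() for the source profile keeps KafkaSource on the sourceLimit-based offset ranges shown further down in this diff.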
@@ -56,9 +59,12 @@ public class JsonKafkaSource extends KafkaSource { public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider, HoodieIngestionMetrics metrics) { - super(properties, sparkContext, sparkSession, - UtilHelpers.getSchemaProviderForKafkaSource(schemaProvider, properties, sparkContext), - SourceType.JSON, metrics); + this(properties, sparkContext, sparkSession, metrics, new DefaultStreamContext(schemaProvider, Option.empty())); + } + + public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SparkSession sparkSession, HoodieIngestionMetrics metrics, StreamContext streamContext) { + super(properties, sparkContext, sparkSession, SourceType.JSON, metrics, + new DefaultStreamContext(UtilHelpers.getSchemaProviderForKafkaSource(streamContext.getSchemaProvider(), properties, sparkContext), streamContext.getSourceProfileSupplier())); properties.put("key.deserializer", StringDeserializer.class.getName()); properties.put("value.deserializer", StringDeserializer.class.getName()); this.offsetGen = new KafkaOffsetGen(props); @@ -87,7 +93,9 @@ protected JavaRDD maybeAppendKafkaOffsets(JavaRDD extends Source> { protected final boolean shouldAddOffsets; protected KafkaSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, - SchemaProvider schemaProvider, SourceType sourceType, HoodieIngestionMetrics metrics) { - super(props, sparkContext, sparkSession, schemaProvider, sourceType); - this.schemaProvider = schemaProvider; + SourceType sourceType, HoodieIngestionMetrics metrics, StreamContext streamContext) { + super(props, sparkContext, sparkSession, sourceType, streamContext); + this.schemaProvider = streamContext.getSchemaProvider(); this.metrics = metrics; this.shouldAddOffsets = KafkaOffsetPostProcessor.Config.shouldAddOffsets(props); } @@ -60,21 +62,34 @@ protected KafkaSource(TypedProperties props, JavaSparkContext sparkContext, Spar @Override protected InputBatch> fetchNewData(Option lastCheckpointStr, long sourceLimit) { try { - OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit, metrics); - long totalNewMsgs = KafkaOffsetGen.CheckpointUtils.totalNewMessages(offsetRanges); - LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); - if (totalNewMsgs <= 0) { - metrics.updateStreamerSourceNewMessageCount(METRIC_NAME_KAFKA_MESSAGE_IN_COUNT, 0); - return new InputBatch<>(Option.empty(), KafkaOffsetGen.CheckpointUtils.offsetsToStr(offsetRanges)); + OffsetRange[] offsetRanges; + if (sourceProfileSupplier.isPresent() && sourceProfileSupplier.get().getSourceProfile() != null) { + SourceProfile kafkaSourceProfile = sourceProfileSupplier.get().getSourceProfile(); + offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, kafkaSourceProfile.getSourceSpecificContext(), kafkaSourceProfile.getSourcePartitions(), metrics); + LOG.info("About to read numEvents {} of size {} bytes in {} partitions from Kafka for topic {} with offsetRanges {}", + kafkaSourceProfile.getSourceSpecificContext(), kafkaSourceProfile.getMaxSourceBytes(), + kafkaSourceProfile.getSourcePartitions(), offsetGen.getTopicName(), offsetRanges); + } else { + offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit, metrics); } - metrics.updateStreamerSourceNewMessageCount(METRIC_NAME_KAFKA_MESSAGE_IN_COUNT, totalNewMsgs); - JavaRDD newDataRDD = toRDD(offsetRanges); - return new 
InputBatch<>(Option.of(newDataRDD), KafkaOffsetGen.CheckpointUtils.offsetsToStr(offsetRanges)); + return toInputBatch(offsetRanges); } catch (org.apache.kafka.common.errors.TimeoutException e) { throw new HoodieSourceTimeoutException("Kafka Source timed out " + e.getMessage()); } } + private InputBatch> toInputBatch(OffsetRange[] offsetRanges) { + long totalNewMsgs = KafkaOffsetGen.CheckpointUtils.totalNewMessages(offsetRanges); + LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); + if (totalNewMsgs <= 0) { + metrics.updateStreamerSourceNewMessageCount(METRIC_NAME_KAFKA_MESSAGE_IN_COUNT, 0); + return new InputBatch<>(Option.empty(), KafkaOffsetGen.CheckpointUtils.offsetsToStr(offsetRanges)); + } + metrics.updateStreamerSourceNewMessageCount(METRIC_NAME_KAFKA_MESSAGE_IN_COUNT, totalNewMsgs); + JavaRDD newDataRDD = toRDD(offsetRanges); + return new InputBatch<>(Option.of(newDataRDD), KafkaOffsetGen.CheckpointUtils.offsetsToStr(offsetRanges)); + } + abstract JavaRDD toRDD(OffsetRange[] offsetRanges); @Override diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/ProtoKafkaSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/ProtoKafkaSource.java index 67927480454b..d7a15b3932cf 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/ProtoKafkaSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/ProtoKafkaSource.java @@ -19,12 +19,16 @@ package org.apache.hudi.utilities.sources; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.config.ProtoClassBasedSchemaProviderConfig; import org.apache.hudi.utilities.exception.HoodieReadFromSourceException; import org.apache.hudi.utilities.ingestion.HoodieIngestionMetrics; import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen; +import org.apache.hudi.utilities.streamer.DefaultStreamContext; +import org.apache.hudi.utilities.streamer.StreamContext; import com.google.protobuf.Message; import org.apache.kafka.common.serialization.ByteArrayDeserializer; @@ -51,9 +55,14 @@ public class ProtoKafkaSource extends KafkaSource { private final String className; - public ProtoKafkaSource(TypedProperties props, JavaSparkContext sparkContext, - SparkSession sparkSession, SchemaProvider schemaProvider, HoodieIngestionMetrics metrics) { - super(props, sparkContext, sparkSession, schemaProvider, SourceType.PROTO, metrics); + public ProtoKafkaSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider, HoodieIngestionMetrics metrics) { + this(props, sparkContext, sparkSession, metrics, new DefaultStreamContext(schemaProvider, Option.empty())); + } + + public ProtoKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SparkSession sparkSession, HoodieIngestionMetrics metrics, StreamContext streamContext) { + super(properties, sparkContext, sparkSession, SourceType.PROTO, metrics, + new DefaultStreamContext(UtilHelpers.getSchemaProviderForKafkaSource(streamContext.getSchemaProvider(), properties, sparkContext), streamContext.getSourceProfileSupplier())); checkRequiredConfigProperties(props, Collections.singletonList( ProtoClassBasedSchemaProviderConfig.PROTO_SCHEMA_CLASS_NAME)); 
props.put(NATIVE_KAFKA_KEY_DESERIALIZER_PROP, StringDeserializer.class); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/RowSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/RowSource.java index f2cc48f280c0..1c7e9d990988 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/RowSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/RowSource.java @@ -18,10 +18,13 @@ package org.apache.hudi.utilities.sources; +import org.apache.hudi.HoodieSparkUtils; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.ConfigUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.utilities.UtilHelpers; +import org.apache.hudi.utilities.exception.HoodieReadFromSourceException; import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.sources.helpers.SanitizationUtils; @@ -30,6 +33,8 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; +import static org.apache.hudi.utilities.config.HoodieStreamerConfig.ROW_THROW_EXPLICIT_EXCEPTIONS; + public abstract class RowSource extends Source> { public RowSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, @@ -46,7 +51,9 @@ protected final InputBatch> fetchNewData(Option lastCkptStr Dataset sanitizedRows = SanitizationUtils.sanitizeColumnNamesForAvro(dsr, props); SchemaProvider rowSchemaProvider = UtilHelpers.createRowBasedSchemaProvider(sanitizedRows.schema(), props, sparkContext); - return new InputBatch<>(Option.of(sanitizedRows), res.getValue(), rowSchemaProvider); + Dataset wrappedDf = HoodieSparkUtils.maybeWrapDataFrameWithException(sanitizedRows, HoodieReadFromSourceException.class.getName(), + "Failed to read from row source", ConfigUtils.getBooleanWithAltKeys(props, ROW_THROW_EXPLICIT_EXCEPTIONS)); + return new InputBatch<>(Option.of(wrappedDf), res.getValue(), rowSchemaProvider); }).orElseGet(() -> new InputBatch<>(res.getKey(), res.getValue())); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/S3EventsHoodieIncrSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/S3EventsHoodieIncrSource.java index 4cbec4d22121..84b267709ad7 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/S3EventsHoodieIncrSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/S3EventsHoodieIncrSource.java @@ -23,14 +23,13 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.timeline.TimelineUtils.HollowCommitHandling; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.utilities.config.CloudSourceConfig; import org.apache.hudi.utilities.config.S3EventsHoodieIncrSourceConfig; import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.sources.helpers.CloudDataFetcher; import org.apache.hudi.utilities.sources.helpers.CloudObjectIncrCheckpoint; import org.apache.hudi.utilities.sources.helpers.CloudObjectMetadata; +import org.apache.hudi.utilities.sources.helpers.CloudObjectsSelectorCommon; import org.apache.hudi.utilities.sources.helpers.IncrSourceHelper; import org.apache.hudi.utilities.sources.helpers.QueryInfo; import org.apache.hudi.utilities.sources.helpers.QueryRunner; @@ -51,17 +50,11 @@ import static 
org.apache.hudi.common.util.ConfigUtils.getIntWithAltKeys; import static org.apache.hudi.common.util.ConfigUtils.getStringWithAltKeys; import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; -import static org.apache.hudi.utilities.config.CloudSourceConfig.CLOUD_DATAFILE_EXTENSION; -import static org.apache.hudi.utilities.config.CloudSourceConfig.DATAFILE_FORMAT; import static org.apache.hudi.utilities.config.CloudSourceConfig.ENABLE_EXISTS_CHECK; import static org.apache.hudi.utilities.config.HoodieIncrSourceConfig.HOODIE_SRC_BASE_PATH; import static org.apache.hudi.utilities.config.HoodieIncrSourceConfig.NUM_INSTANTS_PER_FETCH; -import static org.apache.hudi.utilities.config.HoodieIncrSourceConfig.SOURCE_FILE_FORMAT; import static org.apache.hudi.utilities.config.S3EventsHoodieIncrSourceConfig.S3_FS_PREFIX; -import static org.apache.hudi.utilities.config.S3EventsHoodieIncrSourceConfig.S3_IGNORE_KEY_PREFIX; -import static org.apache.hudi.utilities.config.S3EventsHoodieIncrSourceConfig.S3_IGNORE_KEY_SUBSTRING; import static org.apache.hudi.utilities.config.S3EventsHoodieIncrSourceConfig.S3_INCR_ENABLE_EXISTS_CHECK; -import static org.apache.hudi.utilities.config.S3EventsHoodieIncrSourceConfig.S3_KEY_PREFIX; import static org.apache.hudi.utilities.sources.helpers.CloudObjectsSelectorCommon.getCloudObjectMetadataPerPartition; import static org.apache.hudi.utilities.sources.helpers.IncrSourceHelper.getHollowCommitHandleMode; import static org.apache.hudi.utilities.sources.helpers.IncrSourceHelper.getMissingCheckpointStrategy; @@ -72,11 +65,9 @@ public class S3EventsHoodieIncrSource extends HoodieIncrSource { private static final Logger LOG = LoggerFactory.getLogger(S3EventsHoodieIncrSource.class); - private static final String EMPTY_STRING = ""; private final String srcPath; private final int numInstantsPerFetch; private final boolean checkIfFileExists; - private final String fileFormat; private final IncrSourceHelper.MissingCheckpointStrategy missingCheckpointStrategy; private final QueryRunner queryRunner; private final CloudDataFetcher cloudDataFetcher; @@ -92,18 +83,9 @@ public static class Config { @Deprecated static final Boolean DEFAULT_ENABLE_EXISTS_CHECK = S3_INCR_ENABLE_EXISTS_CHECK.defaultValue(); - // control whether to filter the s3 objects starting with this prefix - @Deprecated - static final String S3_KEY_PREFIX = S3EventsHoodieIncrSourceConfig.S3_KEY_PREFIX.key(); @Deprecated static final String S3_FS_PREFIX = S3EventsHoodieIncrSourceConfig.S3_FS_PREFIX.key(); - // control whether to ignore the s3 objects starting with this prefix - @Deprecated - static final String S3_IGNORE_KEY_PREFIX = S3EventsHoodieIncrSourceConfig.S3_IGNORE_KEY_PREFIX.key(); - // control whether to ignore the s3 objects with this substring - @Deprecated - static final String S3_IGNORE_KEY_SUBSTRING = S3EventsHoodieIncrSourceConfig.S3_IGNORE_KEY_SUBSTRING.key(); /** * {@link #SPARK_DATASOURCE_OPTIONS} is json string, passed to the reader while loading dataset. 
* Example Hudi Streamer conf @@ -113,17 +95,13 @@ public static class Config { public static final String SPARK_DATASOURCE_OPTIONS = S3EventsHoodieIncrSourceConfig.SPARK_DATASOURCE_OPTIONS.key(); } - public static final String S3_OBJECT_KEY = "s3.object.key"; - public static final String S3_OBJECT_SIZE = "s3.object.size"; - public static final String S3_BUCKET_NAME = "s3.bucket.name"; - public S3EventsHoodieIncrSource( TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider) { this(props, sparkContext, sparkSession, schemaProvider, new QueryRunner(sparkSession, props), - new CloudDataFetcher(props, getStringWithAltKeys(props, CloudSourceConfig.DATAFILE_FORMAT, true))); + new CloudDataFetcher(props)); } public S3EventsHoodieIncrSource( @@ -138,13 +116,6 @@ public S3EventsHoodieIncrSource( this.srcPath = getStringWithAltKeys(props, HOODIE_SRC_BASE_PATH); this.numInstantsPerFetch = getIntWithAltKeys(props, NUM_INSTANTS_PER_FETCH); this.checkIfFileExists = getBooleanWithAltKeys(props, ENABLE_EXISTS_CHECK); - - // This is to ensure backward compatibility where we were using the - // config SOURCE_FILE_FORMAT for file format in previous versions. - this.fileFormat = StringUtils.isNullOrEmpty(getStringWithAltKeys(props, DATAFILE_FORMAT, EMPTY_STRING)) - ? getStringWithAltKeys(props, SOURCE_FILE_FORMAT, true) - : getStringWithAltKeys(props, DATAFILE_FORMAT, EMPTY_STRING); - this.missingCheckpointStrategy = getMissingCheckpointStrategy(props); this.queryRunner = queryRunner; this.cloudDataFetcher = cloudDataFetcher; @@ -162,7 +133,8 @@ public Pair>, String> fetchNextBatch(Option lastChec Option.of(cloudObjectIncrCheckpoint.getCommit()), missingCheckpointStrategy, handlingMode, HoodieRecord.COMMIT_TIME_METADATA_FIELD, - S3_OBJECT_KEY, S3_OBJECT_SIZE, true, + CloudObjectsSelectorCommon.S3_OBJECT_KEY, + CloudObjectsSelectorCommon.S3_OBJECT_SIZE, true, Option.ofNullable(cloudObjectIncrCheckpoint.getKey())); LOG.info("Querying S3 with:" + cloudObjectIncrCheckpoint + ", queryInfo:" + queryInfo); @@ -172,7 +144,8 @@ public Pair>, String> fetchNextBatch(Option lastChec } Pair> queryInfoDatasetPair = queryRunner.run(queryInfo, snapshotLoadQuerySplitter); queryInfo = queryInfoDatasetPair.getLeft(); - Dataset filteredSourceData = applyFilter(queryInfoDatasetPair.getRight(), fileFormat); + Dataset filteredSourceData = queryInfoDatasetPair.getRight().filter( + CloudObjectsSelectorCommon.generateFilter(CloudObjectsSelectorCommon.Type.S3, props)); LOG.info("Adjusting end checkpoint:" + queryInfo.getEndInstant() + " based on sourceLimit :" + sourceLimit); Pair>> checkPointAndDataset = @@ -190,7 +163,9 @@ public Pair>, String> fetchNextBatch(Option lastChec // Create S3 paths SerializableConfiguration serializableHadoopConf = new SerializableConfiguration(sparkContext.hadoopConfiguration()); List cloudObjectMetadata = checkPointAndDataset.getRight().get() - .select(S3_BUCKET_NAME, S3_OBJECT_KEY, S3_OBJECT_SIZE) + .select(CloudObjectsSelectorCommon.S3_BUCKET_NAME, + CloudObjectsSelectorCommon.S3_OBJECT_KEY, + CloudObjectsSelectorCommon.S3_OBJECT_SIZE) .distinct() .mapPartitions(getCloudObjectMetadataPerPartition(s3Prefix, serializableHadoopConf, checkIfFileExists), Encoders.kryo(CloudObjectMetadata.class)) .collectAsList(); @@ -199,25 +174,4 @@ public Pair>, String> fetchNextBatch(Option lastChec Option> datasetOption = cloudDataFetcher.getCloudObjectDataDF(sparkSession, cloudObjectMetadata, props, schemaProvider); return Pair.of(datasetOption, 
checkPointAndDataset.getLeft().toString()); } - - Dataset applyFilter(Dataset source, String fileFormat) { - String filter = S3_OBJECT_SIZE + " > 0"; - if (!StringUtils.isNullOrEmpty(getStringWithAltKeys(props, S3_KEY_PREFIX, true))) { - filter = filter + " and " + S3_OBJECT_KEY + " like '" + getStringWithAltKeys(props, S3_KEY_PREFIX) + "%'"; - } - if (!StringUtils.isNullOrEmpty(getStringWithAltKeys(props, S3_IGNORE_KEY_PREFIX, true))) { - filter = filter + " and " + S3_OBJECT_KEY + " not like '" + getStringWithAltKeys(props, S3_IGNORE_KEY_PREFIX) + "%'"; - } - if (!StringUtils.isNullOrEmpty(getStringWithAltKeys(props, S3_IGNORE_KEY_SUBSTRING, true))) { - filter = filter + " and " + S3_OBJECT_KEY + " not like '%" + getStringWithAltKeys(props, S3_IGNORE_KEY_SUBSTRING) + "%'"; - } - // Match files with a given extension, or use the fileFormat as the fallback incase the config is not set. - if (!StringUtils.isNullOrEmpty(getStringWithAltKeys(props, CLOUD_DATAFILE_EXTENSION, true))) { - filter = filter + " and " + S3_OBJECT_KEY + " like '%" + getStringWithAltKeys(props, CLOUD_DATAFILE_EXTENSION) + "'"; - } else { - filter = filter + " and " + S3_OBJECT_KEY + " like '%" + fileFormat + "%'"; - } - - return source.filter(filter); - } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/Source.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/Source.java index cbc0722056bf..dfb07c718a06 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/Source.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/Source.java @@ -25,6 +25,9 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.utilities.callback.SourceCommitCallback; import org.apache.hudi.utilities.schema.SchemaProvider; +import org.apache.hudi.utilities.streamer.DefaultStreamContext; +import org.apache.hudi.utilities.streamer.SourceProfileSupplier; +import org.apache.hudi.utilities.streamer.StreamContext; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; @@ -44,6 +47,7 @@ public enum SourceType { protected transient TypedProperties props; protected transient JavaSparkContext sparkContext; protected transient SparkSession sparkSession; + protected transient Option sourceProfileSupplier; private transient SchemaProvider overriddenSchemaProvider; private final SourceType sourceType; @@ -55,11 +59,16 @@ protected Source(TypedProperties props, JavaSparkContext sparkContext, SparkSess protected Source(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider, SourceType sourceType) { + this(props, sparkContext, sparkSession, sourceType, new DefaultStreamContext(schemaProvider, Option.empty())); + } + + protected Source(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SourceType sourceType, StreamContext streamContext) { this.props = props; this.sparkContext = sparkContext; this.sparkSession = sparkSession; - this.overriddenSchemaProvider = schemaProvider; + this.overriddenSchemaProvider = streamContext.getSchemaProvider(); this.sourceType = sourceType; + this.sourceProfileSupplier = streamContext.getSourceProfileSupplier(); } @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/CloudDataFetcher.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/CloudDataFetcher.java index 9595ec1a9e6f..ed1a49e33e76 100644 --- 
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/CloudDataFetcher.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/CloudDataFetcher.java @@ -20,17 +20,21 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.utilities.schema.SchemaProvider; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.Serializable; import java.util.List; +import static org.apache.hudi.common.util.ConfigUtils.getStringWithAltKeys; +import static org.apache.hudi.utilities.config.CloudSourceConfig.DATAFILE_FORMAT; +import static org.apache.hudi.utilities.config.HoodieIncrSourceConfig.SOURCE_FILE_FORMAT; import static org.apache.hudi.utilities.sources.helpers.CloudObjectsSelectorCommon.loadAsDataset; /** @@ -39,21 +43,28 @@ */ public class CloudDataFetcher implements Serializable { - private final String fileFormat; - private TypedProperties props; + private static final String EMPTY_STRING = ""; + + private final TypedProperties props; private static final Logger LOG = LoggerFactory.getLogger(CloudDataFetcher.class); private static final long serialVersionUID = 1L; - public CloudDataFetcher(TypedProperties props, String fileFormat) { - this.fileFormat = fileFormat; + public CloudDataFetcher(TypedProperties props) { this.props = props; } + public static String getFileFormat(TypedProperties props) { + // This is to ensure backward compatibility where we were using the + // config SOURCE_FILE_FORMAT for file format in previous versions. + return StringUtils.isNullOrEmpty(getStringWithAltKeys(props, DATAFILE_FORMAT, EMPTY_STRING)) + ? 
getStringWithAltKeys(props, SOURCE_FILE_FORMAT, true) + : getStringWithAltKeys(props, DATAFILE_FORMAT, EMPTY_STRING); + } + public Option> getCloudObjectDataDF(SparkSession spark, List cloudObjectMetadata, TypedProperties props, Option schemaProviderOption) { - return loadAsDataset(spark, cloudObjectMetadata, props, fileFormat, schemaProviderOption); + return loadAsDataset(spark, cloudObjectMetadata, props, getFileFormat(props), schemaProviderOption); } - } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/CloudObjectsSelectorCommon.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/CloudObjectsSelectorCommon.java index 750d619258e0..8676bf41cb50 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/CloudObjectsSelectorCommon.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/CloudObjectsSelectorCommon.java @@ -18,8 +18,8 @@ package org.apache.hudi.utilities.sources.helpers; -import org.apache.avro.Schema; import org.apache.hudi.AvroConversionUtils; +import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.Option; @@ -33,6 +33,7 @@ import org.apache.hudi.utilities.sources.InputBatch; import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.avro.Schema; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -56,9 +57,16 @@ import static org.apache.hudi.common.util.CollectionUtils.isNullOrEmpty; import static org.apache.hudi.common.util.ConfigUtils.containsConfigProperty; import static org.apache.hudi.common.util.ConfigUtils.getStringWithAltKeys; +import static org.apache.hudi.utilities.config.CloudSourceConfig.CLOUD_DATAFILE_EXTENSION; +import static org.apache.hudi.utilities.config.CloudSourceConfig.IGNORE_RELATIVE_PATH_PREFIX; +import static org.apache.hudi.utilities.config.CloudSourceConfig.IGNORE_RELATIVE_PATH_SUBSTR; import static org.apache.hudi.utilities.config.CloudSourceConfig.PATH_BASED_PARTITION_FIELDS; +import static org.apache.hudi.utilities.config.CloudSourceConfig.SELECT_RELATIVE_PATH_PREFIX; import static org.apache.hudi.utilities.config.CloudSourceConfig.SOURCE_MAX_BYTES_PER_PARTITION; import static org.apache.hudi.utilities.config.CloudSourceConfig.SPARK_DATASOURCE_READER_COMMA_SEPARATED_PATH_FORMAT; +import static org.apache.hudi.utilities.config.S3EventsHoodieIncrSourceConfig.S3_IGNORE_KEY_PREFIX; +import static org.apache.hudi.utilities.config.S3EventsHoodieIncrSourceConfig.S3_IGNORE_KEY_SUBSTRING; +import static org.apache.hudi.utilities.config.S3EventsHoodieIncrSourceConfig.S3_KEY_PREFIX; import static org.apache.spark.sql.functions.input_file_name; import static org.apache.spark.sql.functions.split; @@ -71,6 +79,13 @@ public class CloudObjectsSelectorCommon { private static final Logger LOG = LoggerFactory.getLogger(CloudObjectsSelectorCommon.class); + public static final String S3_OBJECT_KEY = "s3.object.key"; + public static final String S3_OBJECT_SIZE = "s3.object.size"; + public static final String S3_BUCKET_NAME = "s3.bucket.name"; + public static final String GCS_OBJECT_KEY = "name"; + public static final String GCS_OBJECT_SIZE = "size"; + private static final String SPACE_DELIMTER = " "; + /** * Return a function that extracts filepaths from a list of Rows. 
* Here Row is assumed to have the schema [bucket_name, filepath_relative_to_bucket, object_size] @@ -151,6 +166,45 @@ private static boolean checkIfFileExists(String storageUrlSchemePrefix, String b } } + public static String generateFilter(Type type, + TypedProperties props) { + String fileFormat = CloudDataFetcher.getFileFormat(props); + Option selectRelativePathPrefix = getPropVal(props, SELECT_RELATIVE_PATH_PREFIX); + Option ignoreRelativePathPrefix = getPropVal(props, IGNORE_RELATIVE_PATH_PREFIX); + Option ignoreRelativePathSubStr = getPropVal(props, IGNORE_RELATIVE_PATH_SUBSTR); + + String objectKey; + String objectSizeKey; + // This is for backwards compatibility of configs for s3. + if (type.equals(Type.S3)) { + objectKey = S3_OBJECT_KEY; + objectSizeKey = S3_OBJECT_SIZE; + selectRelativePathPrefix = selectRelativePathPrefix.or(() -> getPropVal(props, S3_KEY_PREFIX)); + ignoreRelativePathPrefix = ignoreRelativePathPrefix.or(() -> getPropVal(props, S3_IGNORE_KEY_PREFIX)); + ignoreRelativePathSubStr = ignoreRelativePathSubStr.or(() -> getPropVal(props, S3_IGNORE_KEY_SUBSTRING)); + } else { + objectKey = GCS_OBJECT_KEY; + objectSizeKey = GCS_OBJECT_SIZE; + } + + StringBuilder filter = new StringBuilder(String.format("%s > 0", objectSizeKey)); + if (selectRelativePathPrefix.isPresent()) { + filter.append(SPACE_DELIMTER).append(String.format("and %s like '%s%%'", objectKey, selectRelativePathPrefix.get())); + } + if (ignoreRelativePathPrefix.isPresent()) { + filter.append(SPACE_DELIMTER).append(String.format("and %s not like '%s%%'", objectKey, ignoreRelativePathPrefix.get())); + } + if (ignoreRelativePathSubStr.isPresent()) { + filter.append(SPACE_DELIMTER).append(String.format("and %s not like '%%%s%%'", objectKey, ignoreRelativePathSubStr.get())); + } + + // Match files with a given extension, or use the fileFormat as the default. 
+ getPropVal(props, CLOUD_DATAFILE_EXTENSION).or(() -> Option.of(fileFormat)) + .map(val -> filter.append(SPACE_DELIMTER).append(String.format("and %s like '%%%s'", objectKey, val))); + + return filter.toString(); + } + public static Option> loadAsDataset(SparkSession spark, List cloudObjectMetadata, TypedProperties props, String fileFormat, Option schemaProviderOption) { if (LOG.isDebugEnabled()) { @@ -204,7 +258,6 @@ public static Option> loadAsDataset(SparkSession spark, List> loadAsDataset(SparkSession spark, List coalesceOrRepartition(Dataset dataset, int numPartitions) { + int existingNumPartitions = dataset.rdd().getNumPartitions(); + LOG.info(String.format("existing number of partitions=%d, required number of partitions=%d", existingNumPartitions, numPartitions)); + if (existingNumPartitions < numPartitions) { + dataset = dataset.repartition(numPartitions); + } else { + dataset = dataset.coalesce(numPartitions); + } + return dataset; + } + public static Option> loadAsDataset(SparkSession spark, List cloudObjectMetadata, TypedProperties props, String fileFormat) { return loadAsDataset(spark, cloudObjectMetadata, props, fileFormat, Option.empty()); } + + private static Option getPropVal(TypedProperties props, ConfigProperty configProperty) { + String value = getStringWithAltKeys(props, configProperty, true); + if (!StringUtils.isNullOrEmpty(value)) { + return Option.of(value); + } + + return Option.empty(); + } + + public enum Type { + S3, + GCS + } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/KafkaOffsetGen.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/KafkaOffsetGen.java index d5faec3595e1..442046cd948a 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/KafkaOffsetGen.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/KafkaOffsetGen.java @@ -41,10 +41,10 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -59,6 +59,7 @@ import static org.apache.hudi.common.util.ConfigUtils.getBooleanWithAltKeys; import static org.apache.hudi.common.util.ConfigUtils.getLongWithAltKeys; import static org.apache.hudi.common.util.ConfigUtils.getStringWithAltKeys; +import static org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils.checkTopicCheckpoint; /** * Source to read data from Kafka, incrementally. @@ -68,16 +69,14 @@ public class KafkaOffsetGen { private static final Logger LOG = LoggerFactory.getLogger(KafkaOffsetGen.class); private static final String METRIC_NAME_KAFKA_DELAY_COUNT = "kafkaDelayCount"; private static final Comparator SORT_BY_PARTITION = Comparator.comparing(OffsetRange::partition); - public static final String KAFKA_CHECKPOINT_TYPE_TIMESTAMP = "timestamp"; - /** - * kafka checkpoint Pattern. - * Format: topic_name,partition_num:offset,partition_num:offset,.... - */ - private final Pattern pattern = Pattern.compile(".*,.*:.*"); - public static class CheckpointUtils { + /** + * kafka checkpoint Pattern. + * Format: topic_name,partition_num:offset,partition_num:offset,.... + */ + private static final Pattern PATTERN = Pattern.compile(".*,.*:.*"); /** * Reconstruct checkpoint from timeline. 
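The per-source filter builders removed from S3EventsHoodieIncrSource.applyFilter and GcsObjectMetadataFetcher.createFilter are consolidated into CloudObjectsSelectorCommon.generateFilter above, which also keeps the legacy S3_KEY_PREFIX / S3_IGNORE_KEY_* configs working as fallbacks for S3. A minimal sketch of the consolidated behavior, using the ConfigProperty constants referenced in this diff with purely illustrative values:

```java
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.utilities.sources.helpers.CloudObjectsSelectorCommon;

import static org.apache.hudi.utilities.config.CloudSourceConfig.CLOUD_DATAFILE_EXTENSION;
import static org.apache.hudi.utilities.config.CloudSourceConfig.SELECT_RELATIVE_PATH_PREFIX;

public class GenerateFilterSketch {
  public static void main(String[] args) {
    TypedProperties props = new TypedProperties();
    // Illustrative values; the keys come from the ConfigProperty constants used by generateFilter.
    props.put(SELECT_RELATIVE_PATH_PREFIX.key(), "input/daily");
    props.put(CLOUD_DATAFILE_EXTENSION.key(), ".parquet");

    // For Type.S3 the column names are s3.object.key / s3.object.size; the legacy
    // S3_KEY_PREFIX / S3_IGNORE_KEY_* configs are still honored as fallbacks.
    String filter = CloudObjectsSelectorCommon.generateFilter(CloudObjectsSelectorCommon.Type.S3, props);
    System.out.println(filter);
    // Expected shape (spacing may differ):
    // s3.object.size > 0 and s3.object.key like 'input/daily%' and s3.object.key like '%.parquet'
  }
}
```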
@@ -115,6 +114,7 @@ public static String offsetsToStr(OffsetRange[] ranges) { * @param fromOffsetMap offsets where we left off last time * @param toOffsetMap offsets of where each partitions is currently at * @param numEvents maximum number of events to read. + * @param minPartitions minimum partitions used for */ public static OffsetRange[] computeOffsetRanges(Map fromOffsetMap, Map toOffsetMap, @@ -130,62 +130,58 @@ public static OffsetRange[] computeOffsetRanges(Map fromOf .toArray(new OffsetRange[toOffsetMap.size()]); LOG.debug("numEvents {}, minPartitions {}, ranges {}", numEvents, minPartitions, ranges); - boolean needSplitToMinPartitions = minPartitions > toOffsetMap.size(); - long totalEvents = totalNewMessages(ranges); - long allocedEvents = 0; - Set exhaustedPartitions = new HashSet<>(); - List finalRanges = new ArrayList<>(); // choose the actualNumEvents with min(totalEvents, numEvents) - long actualNumEvents = Math.min(totalEvents, numEvents); - - // keep going until we have events to allocate and partitions still not exhausted. - while (allocedEvents < numEvents && exhaustedPartitions.size() < toOffsetMap.size()) { - // Allocate the remaining events to non-exhausted partitions, in round robin fashion - Set allocatedPartitionsThisLoop = new HashSet<>(exhaustedPartitions); - for (int i = 0; i < ranges.length; i++) { - long remainingEvents = actualNumEvents - allocedEvents; - long remainingPartitions = toOffsetMap.size() - allocatedPartitionsThisLoop.size(); - // if need tp split into minPartitions, recalculate the remainingPartitions - if (needSplitToMinPartitions) { - remainingPartitions = minPartitions - finalRanges.size(); + long actualNumEvents = Math.min(totalNewMessages(ranges), numEvents); + minPartitions = Math.max(minPartitions, toOffsetMap.size()); + // Each OffsetRange computed will have maximum of eventsPerPartition, + // this ensures all ranges are evenly distributed and there's no skew in one particular range. + long eventsPerPartition = Math.max(1L, actualNumEvents / minPartitions); + long allocatedEvents = 0; + Map> finalRanges = new HashMap<>(); + Map partitionToAllocatedOffset = new HashMap<>(); + // keep going until we have events to allocate. + while (allocatedEvents < actualNumEvents) { + // Allocate the remaining events in round-robin fashion. + for (OffsetRange range : ranges) { + // if we have already allocated required no of events, exit + if (allocatedEvents == actualNumEvents) { + break; } - long eventsPerPartition = (long) Math.ceil((1.0 * remainingEvents) / remainingPartitions); - - OffsetRange range = ranges[i]; - if (exhaustedPartitions.contains(range.partition())) { - continue; + // Compute startOffset. + long startOffset = range.fromOffset(); + if (partitionToAllocatedOffset.containsKey(range.topicPartition())) { + startOffset = partitionToAllocatedOffset.get(range.topicPartition()); } - + // for last bucket, we may not have full eventsPerPartition msgs. + long eventsForThisPartition = Math.min(eventsPerPartition, (actualNumEvents - allocatedEvents)); + // Compute toOffset. 
long toOffset = -1L; - if (range.fromOffset() + eventsPerPartition > range.fromOffset()) { - toOffset = Math.min(range.untilOffset(), range.fromOffset() + eventsPerPartition); + if (startOffset + eventsForThisPartition > startOffset) { + toOffset = Math.min(range.untilOffset(), startOffset + eventsForThisPartition); } else { // handling Long overflow toOffset = range.untilOffset(); } - if (toOffset == range.untilOffset()) { - exhaustedPartitions.add(range.partition()); + allocatedEvents += toOffset - startOffset; + OffsetRange thisRange = OffsetRange.create(range.topicPartition(), startOffset, toOffset); + // Add the offsetRange(startOffset,toOffset) to finalRanges. + if (!finalRanges.containsKey(range.topicPartition())) { + finalRanges.put(range.topicPartition(), new ArrayList<>(Collections.singleton(thisRange))); + partitionToAllocatedOffset.put(range.topicPartition(), thisRange.untilOffset()); + } else if (toOffset > startOffset) { + finalRanges.get(range.topicPartition()).add(thisRange); + partitionToAllocatedOffset.put(range.topicPartition(), thisRange.untilOffset()); } - allocedEvents += toOffset - range.fromOffset(); - // We need recompute toOffset if allocedEvents larger than actualNumEvents. - if (allocedEvents > actualNumEvents) { - long offsetsToAdd = Math.min(eventsPerPartition, (actualNumEvents - allocedEvents)); - toOffset = Math.min(range.untilOffset(), toOffset + offsetsToAdd); - } - OffsetRange thisRange = OffsetRange.create(range.topicPartition(), range.fromOffset(), toOffset); - finalRanges.add(thisRange); - ranges[i] = OffsetRange.create(range.topicPartition(), range.fromOffset() + thisRange.count(), range.untilOffset()); - allocatedPartitionsThisLoop.add(range.partition()); } } - - if (!needSplitToMinPartitions) { - LOG.debug("final ranges merged by topic partition {}", Arrays.toString(mergeRangesByTopicPartition(finalRanges.toArray(new OffsetRange[0])))); - return mergeRangesByTopicPartition(finalRanges.toArray(new OffsetRange[0])); + OffsetRange[] sortedRangeArray = finalRanges.values().stream().flatMap(Collection::stream) + .sorted(SORT_BY_PARTITION).toArray(OffsetRange[]::new); + if (actualNumEvents == 0) { + // We return the same ranges back in case of 0 events for checkpoint computation. 
+ sortedRangeArray = ranges; } - finalRanges.sort(SORT_BY_PARTITION); - LOG.debug("final ranges {}", Arrays.toString(finalRanges.toArray(new OffsetRange[0]))); - return finalRanges.toArray(new OffsetRange[0]); + LOG.info("final ranges {}", Arrays.toString(sortedRangeArray)); + return sortedRangeArray; } /** @@ -209,6 +205,11 @@ public static OffsetRange[] mergeRangesByTopicPartition(OffsetRange[] oldRanges) public static long totalNewMessages(OffsetRange[] ranges) { return Arrays.stream(ranges).mapToLong(OffsetRange::count).sum(); } + + public static boolean checkTopicCheckpoint(Option lastCheckpointStr) { + Matcher matcher = PATTERN.matcher(lastCheckpointStr.get()); + return matcher.matches(); + } } private final Map kafkaParams; @@ -241,7 +242,24 @@ public KafkaOffsetGen(TypedProperties props) { } public OffsetRange[] getNextOffsetRanges(Option lastCheckpointStr, long sourceLimit, HoodieIngestionMetrics metrics) { + // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events) + long maxEventsToReadFromKafka = getLongWithAltKeys(props, KafkaSourceConfig.MAX_EVENTS_FROM_KAFKA_SOURCE); + + long numEvents; + if (sourceLimit == Long.MAX_VALUE) { + numEvents = maxEventsToReadFromKafka; + LOG.info("SourceLimit not configured, set numEvents to default value : " + maxEventsToReadFromKafka); + } else { + numEvents = sourceLimit; + } + + long minPartitions = getLongWithAltKeys(props, KafkaSourceConfig.KAFKA_SOURCE_MIN_PARTITIONS); + LOG.info("getNextOffsetRanges set config " + KafkaSourceConfig.KAFKA_SOURCE_MIN_PARTITIONS.key() + " to " + minPartitions); + + return getNextOffsetRanges(lastCheckpointStr, numEvents, minPartitions, metrics); + } + public OffsetRange[] getNextOffsetRanges(Option lastCheckpointStr, long numEvents, long minPartitions, HoodieIngestionMetrics metrics) { // Obtain current metadata for the topic Map fromOffsets; Map toOffsets; @@ -279,29 +297,9 @@ public OffsetRange[] getNextOffsetRanges(Option lastCheckpointStr, long // Obtain the latest offsets. toOffsets = consumer.endOffsets(topicPartitions); } - - // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events) - long maxEventsToReadFromKafka = getLongWithAltKeys(props, KafkaSourceConfig.MAX_EVENTS_FROM_KAFKA_SOURCE); - - long numEvents; - if (sourceLimit == Long.MAX_VALUE) { - numEvents = maxEventsToReadFromKafka; - LOG.info("SourceLimit not configured, set numEvents to default value : " + maxEventsToReadFromKafka); - } else { - numEvents = sourceLimit; - } - - // TODO(HUDI-4625) remove - if (numEvents < toOffsets.size()) { - throw new HoodieException("sourceLimit should not be less than the number of kafka partitions"); - } - - long minPartitions = getLongWithAltKeys(props, KafkaSourceConfig.KAFKA_SOURCE_MIN_PARTITIONS); - LOG.info("getNextOffsetRanges set config " + KafkaSourceConfig.KAFKA_SOURCE_MIN_PARTITIONS.key() + " to " + minPartitions); - return CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents, minPartitions); } - + /** * Fetch partition infos for given topic. 
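The rewritten CheckpointUtils.computeOffsetRanges caps each range at roughly actualNumEvents / max(minPartitions, #topicPartitions) events and hands ranges out round-robin, so one Kafka partition can be split across several Spark partitions instead of producing a skewed range. A small sketch of the resulting allocation, assuming the Map<TopicPartition, Long> signature shown in the hunk and a hypothetical two-partition topic:

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
import org.apache.kafka.common.TopicPartition;
import org.apache.spark.streaming.kafka010.OffsetRange;

public class OffsetRangeSketch {
  public static void main(String[] args) {
    // Hypothetical topic "events" with two partitions holding 500 and 300 new messages.
    Map<TopicPartition, Long> from = new HashMap<>();
    Map<TopicPartition, Long> to = new HashMap<>();
    from.put(new TopicPartition("events", 0), 0L);
    to.put(new TopicPartition("events", 0), 500L);
    from.put(new TopicPartition("events", 1), 0L);
    to.put(new TopicPartition("events", 1), 300L);

    // Request at most 400 events spread over at least 4 Spark partitions:
    // eventsPerPartition = 400 / 4 = 100, so each topic partition contributes
    // 100-event ranges, allocated round-robin, until 400 events are covered
    // (here: two ranges per topic partition).
    OffsetRange[] ranges = CheckpointUtils.computeOffsetRanges(from, to, 400, 4);
    for (OffsetRange range : ranges) {
      System.out.println(range);
    }
  }
}
```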
* @@ -427,11 +425,6 @@ public boolean checkTopicExists(KafkaConsumer consumer) { return result.containsKey(topicName); } - private boolean checkTopicCheckpoint(Option lastCheckpointStr) { - Matcher matcher = pattern.matcher(lastCheckpointStr.get()); - return matcher.matches(); - } - public String getTopicName() { return topicName; } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/gcs/GcsObjectMetadataFetcher.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/gcs/GcsObjectMetadataFetcher.java index c92901d14cff..29a50e81fb06 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/gcs/GcsObjectMetadataFetcher.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/gcs/GcsObjectMetadataFetcher.java @@ -18,11 +18,10 @@ package org.apache.hudi.utilities.sources.helpers.gcs; -import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.util.Option; import org.apache.hudi.utilities.sources.helpers.CloudObjectMetadata; +import org.apache.hudi.utilities.sources.helpers.CloudObjectsSelectorCommon; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; @@ -34,12 +33,6 @@ import java.io.Serializable; import java.util.List; -import static org.apache.hudi.common.util.ConfigUtils.getStringWithAltKeys; -import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; -import static org.apache.hudi.utilities.config.CloudSourceConfig.CLOUD_DATAFILE_EXTENSION; -import static org.apache.hudi.utilities.config.CloudSourceConfig.IGNORE_RELATIVE_PATH_PREFIX; -import static org.apache.hudi.utilities.config.CloudSourceConfig.IGNORE_RELATIVE_PATH_SUBSTR; -import static org.apache.hudi.utilities.config.CloudSourceConfig.SELECT_RELATIVE_PATH_PREFIX; import static org.apache.hudi.utilities.sources.helpers.CloudObjectsSelectorCommon.getCloudObjectMetadataPerPartition; /** @@ -51,10 +44,6 @@ */ public class GcsObjectMetadataFetcher implements Serializable { - /** - * The default file format to assume if {@link GcsIngestionConfig#GCS_INCR_DATAFILE_EXTENSION} is not given. - */ - private final String fileFormat; private final TypedProperties props; private static final String GCS_PREFIX = "gs://"; @@ -62,13 +51,8 @@ public class GcsObjectMetadataFetcher implements Serializable { private static final Logger LOG = LoggerFactory.getLogger(GcsObjectMetadataFetcher.class); - /** - * @param fileFormat The default file format to assume if {@link GcsIngestionConfig#GCS_INCR_DATAFILE_EXTENSION} - * is not given. - */ - public GcsObjectMetadataFetcher(TypedProperties props, String fileFormat) { + public GcsObjectMetadataFetcher(TypedProperties props) { this.props = props; - this.fileFormat = fileFormat; } /** @@ -92,35 +76,9 @@ public List getGcsObjectMetadata(JavaSparkContext jsc, Data * @return Dataset after apply the filtering. */ public Dataset applyFilter(Dataset cloudObjectMetadataDF) { - String filter = createFilter(); + String filter = CloudObjectsSelectorCommon.generateFilter(CloudObjectsSelectorCommon.Type.GCS, props); LOG.info("Adding filter string to Dataset: " + filter); return cloudObjectMetadataDF.filter(filter); } - - /** - * Add optional filters that narrow down the list of GCS objects to fetch. 
- */ - private String createFilter() { - StringBuilder filter = new StringBuilder("size > 0"); - - getPropVal(SELECT_RELATIVE_PATH_PREFIX).ifPresent(val -> filter.append(" and name like '" + val + "%'")); - getPropVal(IGNORE_RELATIVE_PATH_PREFIX).ifPresent(val -> filter.append(" and name not like '" + val + "%'")); - getPropVal(IGNORE_RELATIVE_PATH_SUBSTR).ifPresent(val -> filter.append(" and name not like '%" + val + "%'")); - - // Match files with a given extension, or use the fileFormat as the default. - getPropVal(CLOUD_DATAFILE_EXTENSION).or(() -> Option.of(fileFormat)) - .map(val -> filter.append(" and name like '%" + val + "'")); - - return filter.toString(); - } - - private Option getPropVal(ConfigProperty configProperty) { - String value = getStringWithAltKeys(props, configProperty, true); - if (!isNullOrEmpty(value)) { - return Option.of(value); - } - - return Option.empty(); - } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/DefaultStreamContext.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/DefaultStreamContext.java new file mode 100644 index 000000000000..f8dabeb89c96 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/DefaultStreamContext.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.streamer; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.utilities.schema.SchemaProvider; + +/** + * The default implementation for the StreamContext interface, + * composes SchemaProvider and SourceProfileSupplier currently, + * can be extended for other arguments in the future. + */ +public class DefaultStreamContext implements StreamContext { + + private final SchemaProvider schemaProvider; + private final Option sourceProfileSupplier; + + public DefaultStreamContext(SchemaProvider schemaProvider, Option sourceProfileSupplier) { + this.schemaProvider = schemaProvider; + this.sourceProfileSupplier = sourceProfileSupplier; + } + + @Override + public SchemaProvider getSchemaProvider() { + return schemaProvider; + } + + @Override + public Option getSourceProfileSupplier() { + return sourceProfileSupplier; + } +} diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/ErrorEvent.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/ErrorEvent.java index f268464d6f1a..a2f1cb277ec6 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/ErrorEvent.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/ErrorEvent.java @@ -19,6 +19,8 @@ package org.apache.hudi.utilities.streamer; +import java.util.Objects; + /** * Error event is an event triggered during write or processing failure of a record. 
*/ @@ -40,6 +42,23 @@ public ErrorReason getReason() { return reason; } + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + ErrorEvent that = (ErrorEvent) o; + return reason == that.reason && Objects.equals(payload, that.payload); + } + + @Override + public int hashCode() { + return Objects.hash(reason, payload); + } + /** * The reason behind write or processing failure of a record */ diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieMultiTableStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieMultiTableStreamer.java index d7e3bca49897..a637f7fbbff7 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieMultiTableStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieMultiTableStreamer.java @@ -66,7 +66,7 @@ /** * Wrapper over HoodieStreamer.java class. * Helps with ingesting incremental data into hoodie datasets for multiple tables. - * Currently supports only COPY_ON_WRITE storage type. + * Supports COPY_ON_WRITE and MERGE_ON_READ storage types. */ public class HoodieMultiTableStreamer { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamer.java index 8ecc937c5e7c..ef31cc34ab5f 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamer.java @@ -143,8 +143,12 @@ public HoodieStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, Configur this(cfg, jssc, fs, conf, Option.empty()); } + public HoodieStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, Configuration conf, Option propsOverride) throws IOException { + this(cfg, jssc, fs, conf, propsOverride, Option.empty()); + } + public HoodieStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, Configuration conf, - Option propsOverride) throws IOException { + Option propsOverride, Option sourceProfileSupplier) throws IOException { this.properties = combineProperties(cfg, propsOverride, jssc.hadoopConfiguration()); if (cfg.initialCheckpointProvider != null && cfg.checkpoint == null) { InitialCheckPointProvider checkPointProvider = @@ -158,7 +162,7 @@ public HoodieStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, Configur cfg.runBootstrap ? new BootstrapExecutor(cfg, jssc, fs, conf, this.properties) : null); HoodieSparkEngineContext sparkEngineContext = new HoodieSparkEngineContext(jssc); this.ingestionService = Option.ofNullable( - cfg.runBootstrap ? null : new StreamSyncService(cfg, sparkEngineContext, fs, conf, Option.ofNullable(this.properties))); + cfg.runBootstrap ? 
null : new StreamSyncService(cfg, sparkEngineContext, fs, conf, Option.ofNullable(this.properties), sourceProfileSupplier)); } private static TypedProperties combineProperties(Config cfg, Option propsOverride, Configuration hadoopConf) { @@ -656,7 +660,7 @@ public static class StreamSyncService extends HoodieIngestionService { private final Option configurationHotUpdateStrategyOpt; public StreamSyncService(Config cfg, HoodieSparkEngineContext hoodieSparkContext, FileSystem fs, Configuration conf, - Option properties) throws IOException { + Option properties, Option sourceProfileSupplier) throws IOException { super(HoodieIngestionConfig.newBuilder() .isContinuous(cfg.continuousMode) .withMinSyncInternalSeconds(cfg.minSyncIntervalSeconds).build()); @@ -708,13 +712,18 @@ public StreamSyncService(Config cfg, HoodieSparkEngineContext hoodieSparkContext UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, hoodieSparkContext.jsc()), props, hoodieSparkContext.jsc(), cfg.transformerClassNames); - streamSync = new StreamSync(cfg, sparkSession, schemaProvider, props, hoodieSparkContext, fs, conf, this::onInitializingWriteClient); + streamSync = new StreamSync(cfg, sparkSession, props, hoodieSparkContext, fs, conf, this::onInitializingWriteClient, new DefaultStreamContext(schemaProvider, sourceProfileSupplier)); } public StreamSyncService(HoodieStreamer.Config cfg, HoodieSparkEngineContext hoodieSparkContext, FileSystem fs, Configuration conf) throws IOException { - this(cfg, hoodieSparkContext, fs, conf, Option.empty()); + this(cfg, hoodieSparkContext, fs, conf, Option.empty(), Option.empty()); + } + + public StreamSyncService(HoodieStreamer.Config cfg, HoodieSparkEngineContext hoodieSparkContext, FileSystem fs, Configuration conf, Option properties) + throws IOException { + this(cfg, hoodieSparkContext, fs, conf, properties, Option.empty()); } private void initializeTableTypeAndBaseFileFormat() { @@ -728,7 +737,7 @@ private void reInitDeltaSync() throws IOException { if (streamSync != null) { streamSync.close(); } - streamSync = new StreamSync(cfg, sparkSession, schemaProvider, props, hoodieSparkContext, fs, hiveConf, this::onInitializingWriteClient); + streamSync = new StreamSync(cfg, sparkSession, props, hoodieSparkContext, fs, hiveConf, this::onInitializingWriteClient, new DefaultStreamContext(schemaProvider, Option.empty())); } @Override diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamerUtils.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamerUtils.java index 44c367ba3843..2ecf0b02fb6a 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamerUtils.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamerUtils.java @@ -35,7 +35,10 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.ClosableIterator; import org.apache.hudi.common.util.collection.CloseableMappingIterator; -import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieKeyException; +import org.apache.hudi.exception.HoodieKeyGeneratorException; +import org.apache.hudi.exception.HoodieRecordCreationException; import org.apache.hudi.keygen.BuiltinKeyGenerator; import org.apache.hudi.keygen.KeyGenUtils; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; @@ -52,7 +55,6 @@ import org.apache.spark.sql.catalyst.InternalRow; import 
org.apache.spark.sql.types.StructType; -import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; @@ -105,10 +107,7 @@ public static Option> createHoodieRecords(HoodieStreamer.C : DataSourceUtils.createPayload(cfg.payloadClassName, gr); avroRecords.add(Either.left(new HoodieAvroRecord<>(hoodieKey, payload))); } catch (Exception e) { - if (!shouldErrorTable) { - throw e; - } - avroRecords.add(Either.right(HoodieAvroUtils.avroToJsonString(genRec, false))); + avroRecords.add(generateErrorRecordOrThrowException(genRec, e, shouldErrorTable)); } } return avroRecords.iterator(); @@ -136,14 +135,7 @@ public static Option> createHoodieRecords(HoodieStreamer.C return Either.left(new HoodieSparkRecord(new HoodieKey(recordKey, partitionPath), HoodieInternalRowUtils.getCachedUnsafeProjection(baseStructType, targetStructType).apply(row), targetStructType, false)); } catch (Exception e) { - if (!shouldErrorTable) { - throw e; - } - try { - return Either.right(HoodieAvroUtils.avroToJsonString(rec, false)); - } catch (IOException ex) { - throw new HoodieIOException("Failed to convert illegal record to json", ex); - } + return generateErrorRecordOrThrowException(rec, e, shouldErrorTable); } }); @@ -159,6 +151,28 @@ public static Option> createHoodieRecords(HoodieStreamer.C }); } + /** + * @param genRec Avro {@link GenericRecord} instance. + * @return the representation of error record (empty {@link HoodieRecord} and the error record + * String) for writing to error table. + */ + private static Either generateErrorRecordOrThrowException(GenericRecord genRec, Exception e, boolean shouldErrorTable) { + if (!shouldErrorTable) { + if (e instanceof HoodieKeyException) { + throw (HoodieKeyException) e; + } else if (e instanceof HoodieKeyGeneratorException) { + throw (HoodieKeyGeneratorException) e; + } else { + throw new HoodieRecordCreationException("Failed to create Hoodie Record", e); + } + } + try { + return Either.right(HoodieAvroUtils.safeAvroToJsonString(genRec)); + } catch (Exception ex) { + throw new HoodieException("Failed to convert illegal record to json", ex); + } + } + /** * Set based on hoodie.datasource.write.drop.partition.columns config. * When set to true, will not write the partition columns into the table. 
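The new HoodieStreamer and StreamSyncService constructors above accept an Option<SourceProfileSupplier> and forward it, via DefaultStreamContext, to the Source created inside StreamSync. A wiring sketch under illustrative settings; KafkaIngestionProfileSupplier is a hypothetical implementation (see the sketch at the end of this diff), and a real job would still need the usual source, schema, and table properties:

```java
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.utilities.sources.JsonKafkaSource;
import org.apache.hudi.utilities.streamer.HoodieStreamer;
import org.apache.hudi.utilities.streamer.SourceProfileSupplier;
import org.apache.spark.api.java.JavaSparkContext;

public class StreamerWiringSketch {
  public static void main(String[] args) throws Exception {
    JavaSparkContext jssc = new JavaSparkContext("local[2]", "profile-sketch");
    FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());

    HoodieStreamer.Config cfg = new HoodieStreamer.Config();
    cfg.targetBasePath = "file:///tmp/hudi/target_table";   // illustrative
    cfg.targetTableName = "target_table";
    cfg.tableType = "MERGE_ON_READ";
    cfg.sourceClassName = JsonKafkaSource.class.getName();

    // Hypothetical supplier controlling how much the next sync round reads.
    Option<SourceProfileSupplier> profileSupplier = Option.of(new KafkaIngestionProfileSupplier());

    // New overload: (cfg, jssc, fs, conf, propsOverride, sourceProfileSupplier).
    HoodieStreamer streamer =
        new HoodieStreamer(cfg, jssc, fs, jssc.hadoopConfiguration(), Option.empty(), profileSupplier);
    streamer.sync();
  }
}
```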
diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/SourceFormatAdapter.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/SourceFormatAdapter.java index f29404701db9..c379472b26eb 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/SourceFormatAdapter.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/SourceFormatAdapter.java @@ -23,8 +23,10 @@ import org.apache.hudi.HoodieSparkUtils; import org.apache.hudi.avro.MercifulJsonConverter; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.ConfigUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.SchemaCompatibilityException; import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; import org.apache.hudi.utilities.schema.SchemaProvider; @@ -53,6 +55,7 @@ import scala.util.Either; +import static org.apache.hudi.utilities.config.HoodieStreamerConfig.ROW_THROW_EXPLICIT_EXCEPTIONS; import static org.apache.hudi.utilities.config.HoodieStreamerConfig.SANITIZE_SCHEMA_FIELD_NAMES; import static org.apache.hudi.utilities.config.HoodieStreamerConfig.SCHEMA_FIELD_NAME_INVALID_CHAR_MASK; import static org.apache.hudi.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE; @@ -62,10 +65,12 @@ /** * Adapts data-format provided by the source to the data-format required by the client (DeltaStreamer). */ -public final class SourceFormatAdapter implements Closeable { +public class SourceFormatAdapter implements Closeable { private final Source source; private boolean shouldSanitize = SANITIZE_SCHEMA_FIELD_NAMES.defaultValue(); + + private boolean wrapWithException = ROW_THROW_EXPLICIT_EXCEPTIONS.defaultValue(); private String invalidCharMask = SCHEMA_FIELD_NAME_INVALID_CHAR_MASK.defaultValue(); private Option errorTableWriter = Option.empty(); @@ -80,6 +85,7 @@ public SourceFormatAdapter(Source source, Option errorTabl if (props.isPresent()) { this.shouldSanitize = SanitizationUtils.shouldSanitize(props.get()); this.invalidCharMask = SanitizationUtils.getInvalidCharMask(props.get()); + this.wrapWithException = ConfigUtils.getBooleanWithAltKeys(props.get(), ROW_THROW_EXPLICIT_EXCEPTIONS); } if (this.shouldSanitize && source.getSourceType() == Source.SourceType.PROTO) { throw new IllegalArgumentException("PROTO cannot be sanitized"); @@ -244,7 +250,8 @@ public InputBatch> fetchNewDataInRowFormat(Option lastCkptS StructType dataType = AvroConversionUtils.convertAvroSchemaToStructType(sourceSchema); return new InputBatch<>( Option.ofNullable( - r.getBatch().map(rdd -> source.getSparkSession().read().schema(dataType).json(rdd)).orElse(null)), + r.getBatch().map(rdd -> HoodieSparkUtils.maybeWrapDataFrameWithException(source.getSparkSession().read().schema(dataType).json(rdd), + SchemaCompatibilityException.class.getName(), "Schema does not match json data", wrapWithException)).orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/SourceProfile.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/SourceProfile.java new file mode 100644 index 000000000000..d830cf5dee3c --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/SourceProfile.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.utilities.streamer; + +import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIClass; +import org.apache.hudi.PublicAPIMethod; + +/** + * A profile containing details about how the next input batch in StreamSync should be consumed and written. + * For eg: KafkaSourceProfile contains number of events to consume in this sync round. + * S3SourceProfile contains the list of files to consume in this sync round. + * HudiIncrementalSourceProfile contains the beginInstant and endInstant commit times to consume in this sync round etc. + * + * @param The type for source context, varies based on sourceType as described above. + */ +@PublicAPIClass(maturity = ApiMaturityLevel.EVOLVING) +public interface SourceProfile { + + /** + * @return The maxBytes that will be consumed from the source in this sync round. + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + long getMaxSourceBytes(); + + /** + * @return The number of output partitions required in source RDD. + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + int getSourcePartitions(); + + /** + * @return The source specific context based on sourceType as described above. + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + T getSourceSpecificContext(); +} diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/SourceProfileSupplier.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/SourceProfileSupplier.java new file mode 100644 index 000000000000..34bfb8dff945 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/SourceProfileSupplier.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.utilities.streamer; + +import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIClass; +import org.apache.hudi.PublicAPIMethod; + +/** + * Supplier for SourceProfile + */ +@PublicAPIClass(maturity = ApiMaturityLevel.EVOLVING) +public interface SourceProfileSupplier { + @SuppressWarnings("rawtypes") + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + SourceProfile getSourceProfile(); +} diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamContext.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamContext.java new file mode 100644 index 000000000000..bfe337ee3f25 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamContext.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.streamer; + +import org.apache.hudi.ApiMaturityLevel; +import org.apache.hudi.PublicAPIClass; +import org.apache.hudi.PublicAPIMethod; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.utilities.schema.SchemaProvider; + +/** + * The context required to sync one batch of data to hoodie table using StreamSync. + */ +@PublicAPIClass(maturity = ApiMaturityLevel.EVOLVING) +public interface StreamContext { + + /** + * The schema provider used for reading data from source and also writing to hoodie table. + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + SchemaProvider getSchemaProvider(); + + /** + * An optional stream profile supplying details regarding how the next input batch in StreamSync should be consumed and written. 
+ */ + @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) + Option getSourceProfileSupplier(); +} diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamSync.java index 63eccbf5dc64..ded5348ed8f9 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamSync.java @@ -55,6 +55,7 @@ import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.VisibleForTesting; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieCompactionConfig; @@ -255,25 +256,50 @@ public class StreamSync implements Serializable, Closeable { private final boolean useRowWriter; + @VisibleForTesting + StreamSync(HoodieStreamer.Config cfg, SparkSession sparkSession, + TypedProperties props, HoodieSparkEngineContext hoodieSparkContext, FileSystem fs, Configuration conf, + Function onInitializingHoodieWriteClient, SchemaProvider userProvidedSchemaProvider, + Option errorTableWriter, SourceFormatAdapter formatAdapter, Option transformer, + boolean useRowWriter, boolean autoGenerateRecordKeys) { + this.cfg = cfg; + this.hoodieSparkContext = hoodieSparkContext; + this.sparkSession = sparkSession; + this.fs = fs; + this.onInitializingHoodieWriteClient = onInitializingHoodieWriteClient; + this.props = props; + this.userProvidedSchemaProvider = userProvidedSchemaProvider; + this.processedSchema = new SchemaSet(); + this.autoGenerateRecordKeys = autoGenerateRecordKeys; + this.keyGenClassName = getKeyGeneratorClassName(new TypedProperties(props)); + this.conf = conf; + + this.errorTableWriter = errorTableWriter; + this.formatAdapter = formatAdapter; + this.transformer = transformer; + this.useRowWriter = useRowWriter; + + } + @Deprecated public StreamSync(HoodieStreamer.Config cfg, SparkSession sparkSession, SchemaProvider schemaProvider, TypedProperties props, JavaSparkContext jssc, FileSystem fs, Configuration conf, Function onInitializingHoodieWriteClient) throws IOException { - this(cfg, sparkSession, schemaProvider, props, new HoodieSparkEngineContext(jssc), fs, conf, onInitializingHoodieWriteClient); + this(cfg, sparkSession, props, new HoodieSparkEngineContext(jssc), fs, conf, onInitializingHoodieWriteClient, new DefaultStreamContext(schemaProvider, Option.empty())); } - public StreamSync(HoodieStreamer.Config cfg, SparkSession sparkSession, SchemaProvider schemaProvider, + public StreamSync(HoodieStreamer.Config cfg, SparkSession sparkSession, TypedProperties props, HoodieSparkEngineContext hoodieSparkContext, FileSystem fs, Configuration conf, - Function onInitializingHoodieWriteClient) throws IOException { + Function onInitializingHoodieWriteClient, StreamContext streamContext) throws IOException { this.cfg = cfg; this.hoodieSparkContext = hoodieSparkContext; this.sparkSession = sparkSession; this.fs = fs; this.onInitializingHoodieWriteClient = onInitializingHoodieWriteClient; this.props = props; - this.userProvidedSchemaProvider = schemaProvider; + this.userProvidedSchemaProvider = streamContext.getSchemaProvider(); this.processedSchema = new SchemaSet(); - this.autoGenerateRecordKeys = KeyGenUtils.enableAutoGenerateRecordKeys(props); + this.autoGenerateRecordKeys = 
KeyGenUtils.isAutoGeneratedRecordKeysEnabled(props); this.keyGenClassName = getKeyGeneratorClassName(new TypedProperties(props)); this.conf = conf; @@ -285,7 +311,7 @@ public StreamSync(HoodieStreamer.Config cfg, SparkSession sparkSession, SchemaPr this.errorWriteFailureStrategy = ErrorTableUtils.getErrorWriteFailureStrategy(props); } refreshTimeline(); - Source source = UtilHelpers.createSource(cfg.sourceClassName, props, hoodieSparkContext.jsc(), sparkSession, schemaProvider, metrics); + Source source = UtilHelpers.createSource(cfg.sourceClassName, props, hoodieSparkContext.jsc(), sparkSession, metrics, streamContext); this.formatAdapter = new SourceFormatAdapter(source, this.errorTableWriter, Option.of(props)); Supplier> schemaSupplier = schemaProvider == null ? Option::empty : () -> Option.ofNullable(schemaProvider.getSourceSchema()); @@ -414,7 +440,7 @@ public Pair, JavaRDD> syncOnce() throws IOException || (newTargetSchema != null && !processedSchema.isSchemaPresent(newTargetSchema))) { String sourceStr = newSourceSchema == null ? NULL_PLACEHOLDER : newSourceSchema.toString(true); String targetStr = newTargetSchema == null ? NULL_PLACEHOLDER : newTargetSchema.toString(true); - LOG.info("Seeing new schema. Source: {0}, Target: {1}", sourceStr, targetStr); + LOG.info("Seeing new schema. Source: {}, Target: {}", sourceStr, targetStr); // We need to recreate write client with new schema and register them. reInitWriteClient(newSourceSchema, newTargetSchema, inputBatch.getBatch()); if (newSourceSchema != null) { @@ -513,6 +539,7 @@ public InputBatch readFromSource(String instantTime, HoodieTableMetaClient metaC private InputBatch fetchFromSourceAndPrepareRecords(Option resumeCheckpointStr, String instantTime, HoodieTableMetaClient metaClient) { + hoodieSparkContext.setJobStatus(this.getClass().getSimpleName(), "Fetching next batch: " + cfg.targetTableName); HoodieRecordType recordType = createRecordMerger(props).getRecordType(); if (recordType == HoodieRecordType.SPARK && HoodieTableType.valueOf(cfg.tableType) == HoodieTableType.MERGE_ON_READ && !cfg.operation.equals(WriteOperationType.BULK_INSERT) @@ -535,7 +562,7 @@ private InputBatch fetchFromSourceAndPrepareRecords(Option resumeCheckpo } // handle empty batch with change in checkpoint - hoodieSparkContext.setJobStatus(this.getClass().getSimpleName(), "Checking if input is empty"); + hoodieSparkContext.setJobStatus(this.getClass().getSimpleName(), "Checking if input is empty: " + cfg.targetTableName); if (useRowWriter) { // no additional processing required for row writer. @@ -552,7 +579,8 @@ private InputBatch fetchFromSourceAndPrepareRecords(Option resumeCheckpo * @param resumeCheckpointStr checkpoint to resume from source. * @return {@link InputBatch} containing the new batch of data from source along with new checkpoint and schema provider instance to use. 
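The refactored constructor above accepts a StreamContext instead of a bare SchemaProvider, and the deprecated constructor adapts the old signature through DefaultStreamContext with an empty profile supplier. A caller that wants to pass a source profile as well only needs to pair the two accessors; a minimal, hypothetical implementation (the class name and wiring are illustrative, not taken from this patch):

// Illustrative StreamContext implementation; the patch itself routes the legacy
// arguments through DefaultStreamContext.
import org.apache.hudi.common.util.Option;
import org.apache.hudi.utilities.schema.SchemaProvider;

public class ExampleStreamContext implements StreamContext {

  private final SchemaProvider schemaProvider;
  private final Option<SourceProfileSupplier> sourceProfileSupplier;

  public ExampleStreamContext(SchemaProvider schemaProvider,
                              Option<SourceProfileSupplier> sourceProfileSupplier) {
    this.schemaProvider = schemaProvider;
    this.sourceProfileSupplier = sourceProfileSupplier;
  }

  @Override
  public SchemaProvider getSchemaProvider() {
    return schemaProvider;
  }

  @Override
  public Option<SourceProfileSupplier> getSourceProfileSupplier() {
    return sourceProfileSupplier;
  }
}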
*/ - private InputBatch fetchNextBatchFromSource(Option resumeCheckpointStr, HoodieTableMetaClient metaClient) { + @VisibleForTesting + InputBatch fetchNextBatchFromSource(Option resumeCheckpointStr, HoodieTableMetaClient metaClient) { Option> avroRDDOptional = null; String checkpointStr = null; SchemaProvider schemaProvider = null; @@ -573,12 +601,12 @@ private InputBatch fetchNextBatchFromSource(Option resumeCheckpointStr, checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch(); if (this.userProvidedSchemaProvider != null && this.userProvidedSchemaProvider.getTargetSchema() != null && this.userProvidedSchemaProvider.getTargetSchema() != InputBatch.NULL_SCHEMA) { + // Let's deduce the schema provider for writer side first! + schemaProvider = getDeducedSchemaProvider(this.userProvidedSchemaProvider.getTargetSchema(), this.userProvidedSchemaProvider, metaClient); if (useRowWriter) { - inputBatchForWriter = new InputBatch(transformed, checkpointStr, this.userProvidedSchemaProvider); + inputBatchForWriter = new InputBatch(transformed, checkpointStr, schemaProvider); } else { // non row writer path - // Let's deduce the schema provider for writer side first! - schemaProvider = getDeducedSchemaProvider(this.userProvidedSchemaProvider.getTargetSchema(), this.userProvidedSchemaProvider, metaClient); SchemaProvider finalSchemaProvider = schemaProvider; // If the target schema is specified through Avro schema, // pass in the schema for the Row-to-Avro conversion @@ -606,11 +634,10 @@ private InputBatch fetchNextBatchFromSource(Option resumeCheckpointStr, } else { // Deduce proper target (writer's) schema for the input dataset, reconciling its // schema w/ the table's one - Option incomingSchemaOpt = transformed.map(df -> - AvroConversionUtils.convertStructTypeToAvroSchema(df.schema(), getAvroRecordQualifiedName(cfg.targetTableName))); - - schemaProvider = incomingSchemaOpt.map(incomingSchema -> getDeducedSchemaProvider(incomingSchema, dataAndCheckpoint.getSchemaProvider(), metaClient)) - .orElseGet(dataAndCheckpoint::getSchemaProvider); + Schema incomingSchema = transformed.map(df -> + AvroConversionUtils.convertStructTypeToAvroSchema(df.schema(), getAvroRecordQualifiedName(cfg.targetTableName))) + .orElseGet(dataAndCheckpoint.getSchemaProvider()::getTargetSchema); + schemaProvider = getDeducedSchemaProvider(incomingSchema, dataAndCheckpoint.getSchemaProvider(), metaClient); if (useRowWriter) { inputBatchForWriter = new InputBatch(transformed, checkpointStr, schemaProvider); @@ -622,7 +649,9 @@ private InputBatch fetchNextBatchFromSource(Option resumeCheckpointStr, } } else { if (useRowWriter) { - inputBatchForWriter = formatAdapter.fetchNewDataInRowFormat(resumeCheckpointStr, cfg.sourceLimit); + InputBatch inputBatchNeedsDeduceSchema = formatAdapter.fetchNewDataInRowFormat(resumeCheckpointStr, cfg.sourceLimit); + inputBatchForWriter = new InputBatch<>(inputBatchNeedsDeduceSchema.getBatch(), inputBatchNeedsDeduceSchema.getCheckpointForNextBatch(), + getDeducedSchemaProvider(inputBatchNeedsDeduceSchema.getSchemaProvider().getTargetSchema(), inputBatchNeedsDeduceSchema.getSchemaProvider(), metaClient)); } else { // Pull the data from the source & prepare the write InputBatch> dataAndCheckpoint = formatAdapter.fetchNewDataInAvroFormat(resumeCheckpointStr, cfg.sourceLimit); @@ -661,17 +690,17 @@ private InputBatch fetchNextBatchFromSource(Option resumeCheckpointStr, * @param sourceSchemaProvider Source schema provider. * @return the SchemaProvider that can be used as writer schema. 
*/ - private SchemaProvider getDeducedSchemaProvider(Schema incomingSchema, SchemaProvider sourceSchemaProvider, HoodieTableMetaClient metaClient) { + @VisibleForTesting + SchemaProvider getDeducedSchemaProvider(Schema incomingSchema, SchemaProvider sourceSchemaProvider, HoodieTableMetaClient metaClient) { Option latestTableSchemaOpt = UtilHelpers.getLatestTableSchema(hoodieSparkContext.jsc(), fs, cfg.targetBasePath, metaClient); Option internalSchemaOpt = HoodieConversionUtils.toJavaOption( HoodieSchemaUtils.getLatestTableInternalSchema( new HoodieConfig(HoodieStreamer.Config.getProps(fs, cfg)), metaClient)); // Deduce proper target (writer's) schema for the input dataset, reconciling its // schema w/ the table's one - Schema targetSchema = HoodieSparkSqlWriter.deduceWriterSchema( - HoodieAvroUtils.removeMetadataFields(incomingSchema), - HoodieConversionUtils.toScalaOption(latestTableSchemaOpt), - HoodieConversionUtils.toScalaOption(internalSchemaOpt), props); + Schema targetSchema = HoodieSchemaUtils.deduceWriterSchema( + HoodieAvroUtils.removeMetadataFields(incomingSchema), + latestTableSchemaOpt, internalSchemaOpt, props); // Override schema provider with the reconciled target schema return new DelegatingSchemaProvider(props, hoodieSparkContext.jsc(), sourceSchemaProvider, @@ -865,10 +894,10 @@ private Pair, JavaRDD> writeToSinkAndDoMetaSync(Stri writeClient.rollback(instantTime); throw new HoodieStreamerWriteException("Commit " + instantTime + " failed and rolled-back !"); } - long overallTimeMs = overallTimerContext != null ? overallTimerContext.stop() : 0; + long overallTimeNanos = overallTimerContext != null ? overallTimerContext.stop() : 0; // Send DeltaStreamer Metrics - metrics.updateStreamerMetrics(overallTimeMs); + metrics.updateStreamerMetrics(overallTimeNanos); return Pair.of(scheduledCompactionInstant, writeStatusRDD); } @@ -988,13 +1017,14 @@ public void runMetaSync() { SyncUtilHelpers.runHoodieMetaSync(impl.trim(), metaProps, conf, fs, cfg.targetBasePath, cfg.baseFileFormat); success = true; } catch (HoodieMetaSyncException e) { - LOG.error("SyncTool class {0} failed with exception {1}", impl.trim(), e); + LOG.error("SyncTool class {} failed with exception {}", impl.trim(), e); failedMetaSyncs.put(impl, e); } - long metaSyncTimeMs = syncContext != null ? syncContext.stop() : 0; - metrics.updateStreamerMetaSyncMetrics(getSyncClassShortName(impl), metaSyncTimeMs); + long metaSyncTimeNanos = syncContext != null ? 
syncContext.stop() : 0; + metrics.updateStreamerMetaSyncMetrics(getSyncClassShortName(impl), metaSyncTimeNanos); if (success) { - LOG.info("[MetaSync] SyncTool class {0} completed successfully and took {1} ", impl.trim(), metaSyncTimeMs); + long timeMs = metaSyncTimeNanos / 1000000L; + LOG.info("[MetaSync] SyncTool class {} completed successfully and took {} s {} ms ", impl.trim(), timeMs / 1000L, timeMs % 1000L); } } if (!failedMetaSyncs.isEmpty()) { diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java index e3724aee48a7..f637413b63d8 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java @@ -38,6 +38,8 @@ import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.metadata.HoodieBackedTableMetadata; import org.apache.hudi.metadata.HoodieTableMetadataUtil; import org.apache.hudi.metadata.MetadataPartitionType; @@ -76,6 +78,7 @@ import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE_AND_EXECUTE; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; public class TestHoodieIndexer extends SparkClientFunctionalTestHarness implements SparkProvider { @@ -289,7 +292,10 @@ public void testIndexerWithWriterFinishingLast() throws IOException { // start the indexer and validate files index is completely built out HoodieIndexer indexer = new HoodieIndexer(jsc(), config); // The catchup won't finish due to inflight delta commit, and this is expected - assertEquals(-1, indexer.start(0)); + Throwable cause = assertThrows(RuntimeException.class, () -> indexer.start(0)) + .getCause(); + assertTrue(cause instanceof HoodieMetadataException); + assertTrue(cause.getMessage().contains("Failed to index partition")); // Now, make sure that the inflight delta commit happened before the async indexer // is intact @@ -365,7 +371,10 @@ public void testIndexerForExceptionWithNonFilesPartition() { config.propsFilePath = propsPath; // start the indexer and validate index building fails HoodieIndexer indexer = new HoodieIndexer(jsc(), config); - assertEquals(-1, indexer.start(0)); + Throwable cause = assertThrows(RuntimeException.class, () -> indexer.start(0)) + .getCause(); + assertTrue(cause instanceof HoodieException); + assertTrue(cause.getMessage().contains("Metadata table is not yet initialized")); // validate table config metaClient = reload(metaClient); @@ -373,7 +382,7 @@ public void testIndexerForExceptionWithNonFilesPartition() { assertFalse(metaClient.getTableConfig().getMetadataPartitions().contains(COLUMN_STATS.getPartitionPath())); assertFalse(metaClient.getTableConfig().getMetadataPartitions().contains(BLOOM_FILTERS.getPartitionPath())); // validate metadata partitions actually exist - assertFalse(metadataPartitionExists(basePath(), context(), FILES)); + assertFalse(metadataPartitionExists(basePath(), context(), FILES.getPartitionPath())); // trigger FILES partition and indexing should succeed. 
indexMetadataPartitionsAndAssert(FILES, Collections.emptyList(), Arrays.asList(new MetadataPartitionType[] {COLUMN_STATS, BLOOM_FILTERS}), tableName, "streamer-config/indexer.properties"); @@ -432,8 +441,8 @@ private void indexMetadataPartitionsAndAssert(MetadataPartitionType partitionTyp nonExistentPartitions.forEach(entry -> assertFalse(completedPartitions.contains(entry.getPartitionPath()))); // validate metadata partitions actually exist - assertTrue(metadataPartitionExists(basePath(), context(), partitionTypeToIndex)); - alreadyCompletedPartitions.forEach(entry -> assertTrue(metadataPartitionExists(basePath(), context(), entry))); + assertTrue(metadataPartitionExists(basePath(), context(), partitionTypeToIndex.getPartitionPath())); + alreadyCompletedPartitions.forEach(entry -> assertTrue(metadataPartitionExists(basePath(), context(), entry.getPartitionPath()))); } @Test @@ -455,9 +464,9 @@ public void testIndexerDropPartitionDeletesInstantFromTimeline() { // validate partitions built successfully assertTrue(reload(metaClient).getTableConfig().getMetadataPartitions().contains(FILES.getPartitionPath())); - assertTrue(metadataPartitionExists(basePath(), context(), FILES)); + assertTrue(metadataPartitionExists(basePath(), context(), FILES.getPartitionPath())); assertTrue(reload(metaClient).getTableConfig().getMetadataPartitions().contains(BLOOM_FILTERS.getPartitionPath())); - assertTrue(metadataPartitionExists(basePath(), context(), BLOOM_FILTERS)); + assertTrue(metadataPartitionExists(basePath(), context(), BLOOM_FILTERS.getPartitionPath())); // build indexer config which has only column_stats enabled (files is enabled by default) HoodieIndexer.Config config = new HoodieIndexer.Config(); @@ -481,13 +490,13 @@ public void testIndexerDropPartitionDeletesInstantFromTimeline() { assertEquals(0, indexer.start(0)); indexInstantInTimeline = metaClient.reloadActiveTimeline().filterPendingIndexTimeline().lastInstant(); assertFalse(indexInstantInTimeline.isPresent()); - assertFalse(metadataPartitionExists(basePath(), context(), COLUMN_STATS)); + assertFalse(metadataPartitionExists(basePath(), context(), COLUMN_STATS.getPartitionPath())); // check other partitions are intact assertTrue(reload(metaClient).getTableConfig().getMetadataPartitions().contains(FILES.getPartitionPath())); - assertTrue(metadataPartitionExists(basePath(), context(), FILES)); + assertTrue(metadataPartitionExists(basePath(), context(), FILES.getPartitionPath())); assertTrue(reload(metaClient).getTableConfig().getMetadataPartitions().contains(BLOOM_FILTERS.getPartitionPath())); - assertTrue(metadataPartitionExists(basePath(), context(), BLOOM_FILTERS)); + assertTrue(metadataPartitionExists(basePath(), context(), BLOOM_FILTERS.getPartitionPath())); } @Test @@ -509,7 +518,7 @@ public void testTwoIndexersOneCreateOneDropPartition() { // validate files partition built successfully assertTrue(reload(metaClient).getTableConfig().getMetadataPartitions().contains(FILES.getPartitionPath())); - assertTrue(metadataPartitionExists(basePath(), context(), FILES)); + assertTrue(metadataPartitionExists(basePath(), context(), FILES.getPartitionPath())); // build indexer config which has only bloom_filters enabled HoodieIndexer.Config config = getHoodieIndexConfig(BLOOM_FILTERS.name(), SCHEDULE_AND_EXECUTE, "streamer-config/indexer-only-bloom.properties", tableName); @@ -517,7 +526,7 @@ public void testTwoIndexersOneCreateOneDropPartition() { HoodieIndexer indexer = new HoodieIndexer(jsc(), config); assertEquals(0, indexer.start(0)); 
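The indexer tests above now assert on the cause wrapped inside the RuntimeException thrown by HoodieIndexer.start rather than on a -1 return code. Since the same unwrap-and-check appears in more than one test, it could be captured in a small JUnit 5 helper along these lines (a sketch under the assumption that assertThrows and assertTrue are statically imported as in this test class; this helper is not something the patch adds):

// Hypothetical helper for the repeated cause assertions; not part of this patch.
private static void assertStartFailsWithCause(org.junit.jupiter.api.function.Executable call,
                                              Class<? extends Throwable> expectedCauseType,
                                              String expectedMessageFragment) {
  Throwable cause = assertThrows(RuntimeException.class, call).getCause();
  assertTrue(expectedCauseType.isInstance(cause), "unexpected cause type: " + cause);
  assertTrue(cause.getMessage().contains(expectedMessageFragment));
}

// Usage mirroring the assertions above:
// assertStartFailsWithCause(() -> indexer.start(0), HoodieMetadataException.class, "Failed to index partition");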
assertTrue(reload(metaClient).getTableConfig().getMetadataPartitions().contains(BLOOM_FILTERS.getPartitionPath())); - assertTrue(metadataPartitionExists(basePath(), context(), BLOOM_FILTERS)); + assertTrue(metadataPartitionExists(basePath(), context(), BLOOM_FILTERS.getPartitionPath())); // completed index timeline for later validation Option bloomIndexInstant = metaClient.reloadActiveTimeline().filterCompletedIndexTimeline().lastInstant(); @@ -540,9 +549,9 @@ public void testTwoIndexersOneCreateOneDropPartition() { // check other partitions are intact assertTrue(reload(metaClient).getTableConfig().getMetadataPartitions().contains(FILES.getPartitionPath())); - assertTrue(metadataPartitionExists(basePath(), context(), FILES)); + assertTrue(metadataPartitionExists(basePath(), context(), FILES.getPartitionPath())); assertTrue(reload(metaClient).getTableConfig().getMetadataPartitions().contains(BLOOM_FILTERS.getPartitionPath())); - assertTrue(metadataPartitionExists(basePath(), context(), BLOOM_FILTERS)); + assertTrue(metadataPartitionExists(basePath(), context(), BLOOM_FILTERS.getPartitionPath())); // drop bloom filter partition. timeline files should not be deleted since the index building is complete. dropIndexAndAssert(BLOOM_FILTERS, "streamer-config/indexer-only-bloom.properties", bloomIndexInstant, tableName); @@ -554,7 +563,7 @@ private void dropIndexAndAssert(MetadataPartitionType indexType, String resource assertEquals(0, indexer.start(0)); Option pendingFlights = metaClient.reloadActiveTimeline().filterPendingIndexTimeline().lastInstant(); assertFalse(pendingFlights.isPresent()); - assertFalse(metadataPartitionExists(basePath(), context(), indexType)); + assertFalse(metadataPartitionExists(basePath(), context(), indexType.getPartitionPath())); if (completedIndexInstant.isPresent()) { assertEquals(completedIndexInstant, metaClient.reloadActiveTimeline().filterCompletedIndexTimeline().lastInstant()); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java new file mode 100644 index 000000000000..e87f6257c54b --- /dev/null +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.utilities; + +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.testutils.HoodieSparkClientTestBase; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SaveMode; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.testutils.RawTripTestPayload.recordToString; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieMetadataTableValidator extends HoodieSparkClientTestBase { + + @Test + public void testMetadataTableValidation() { + + Map writeOptions = new HashMap<>(); + writeOptions.put(DataSourceWriteOptions.TABLE_NAME().key(), "test_table"); + writeOptions.put("hoodie.table.name", "test_table"); + writeOptions.put(DataSourceWriteOptions.TABLE_TYPE().key(), "MERGE_ON_READ"); + writeOptions.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key"); + writeOptions.put(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp"); + writeOptions.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition_path"); + + Dataset inserts = makeInsertDf("000", 5).cache(); + inserts.write().format("hudi").options(writeOptions) + .option(DataSourceWriteOptions.OPERATION().key(), WriteOperationType.BULK_INSERT.value()) + .mode(SaveMode.Overwrite) + .save(basePath); + Dataset updates = makeUpdateDf("001", 5).cache(); + updates.write().format("hudi").options(writeOptions) + .option(DataSourceWriteOptions.OPERATION().key(), WriteOperationType.UPSERT.value()) + .mode(SaveMode.Append) + .save(basePath); + + // validate MDT + HoodieMetadataTableValidator.Config config = new HoodieMetadataTableValidator.Config(); + config.basePath = basePath; + config.validateLatestFileSlices = true; + config.validateAllFileGroups = true; + HoodieMetadataTableValidator validator = new HoodieMetadataTableValidator(jsc, config); + assertTrue(validator.run()); + assertFalse(validator.hasValidationFailure()); + assertTrue(validator.getThrowables().isEmpty()); + } + + protected Dataset makeInsertDf(String instantTime, Integer n) { + List records = dataGen.generateInserts(instantTime, n).stream() + .map(r -> recordToString(r).get()).collect(Collectors.toList()); + JavaRDD rdd = jsc.parallelize(records); + return sparkSession.read().json(rdd); + } + + protected Dataset makeUpdateDf(String instantTime, Integer n) { + try { + List records = dataGen.generateUpdates(instantTime, n).stream() + .map(r -> recordToString(r).get()).collect(Collectors.toList()); + JavaRDD rdd = jsc.parallelize(records); + return sparkSession.read().json(rdd); + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/checkpointing/TestKafkaConnectHdfsProvider.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/checkpointing/TestKafkaConnectHdfsProvider.java index fb6f5d649cba..e90cfdb6856c 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/checkpointing/TestKafkaConnectHdfsProvider.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/checkpointing/TestKafkaConnectHdfsProvider.java @@ -62,7 +62,7 @@ public void testValidKafkaConnectPath() throws Exception { new File(topicPath + 
"/year=2016/month=05/day=02/" + "random_snappy_2" + BASE_FILE_EXTENSION).createNewFile(); final TypedProperties props = new TypedProperties(); - props.put("hoodie.deltastreamer.checkpoint.provider.path", topicPath.toString()); + props.put("hoodie.streamer.checkpoint.provider.path", topicPath.toString()); final InitialCheckPointProvider provider = new KafkaConnectHdfsProvider(props); provider.init(HoodieTestUtils.getDefaultHadoopConf()); assertEquals("topic1,0:300,1:200", provider.getCheckpoint()); @@ -83,7 +83,7 @@ public void testMissingPartition() throws Exception { new File(topicPath + "/year=2016/month=05/day=02/" + "topic1+0+201+300" + BASE_FILE_EXTENSION).createNewFile(); final TypedProperties props = new TypedProperties(); - props.put("hoodie.deltastreamer.checkpoint.provider.path", topicPath.toString()); + props.put("hoodie.streamer.checkpoint.provider.path", topicPath.toString()); final InitialCheckPointProvider provider = new KafkaConnectHdfsProvider(props); provider.init(HoodieTestUtils.getDefaultHadoopConf()); assertThrows(HoodieException.class, provider::getCheckpoint); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/config/SourceTestConfig.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/config/SourceTestConfig.java index 450d6e8dc3ae..760e7ed7ff41 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/config/SourceTestConfig.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/config/SourceTestConfig.java @@ -21,29 +21,36 @@ import org.apache.hudi.common.config.ConfigProperty; +import static org.apache.hudi.common.util.ConfigUtils.DELTA_STREAMER_CONFIG_PREFIX; +import static org.apache.hudi.common.util.ConfigUtils.STREAMER_CONFIG_PREFIX; + /** * Configurations for Test Data Sources. */ public class SourceTestConfig { public static final ConfigProperty NUM_SOURCE_PARTITIONS_PROP = ConfigProperty - .key("hoodie.deltastreamer.source.test.num_partitions") + .key(STREAMER_CONFIG_PREFIX + "source.test.num_partitions") .defaultValue(10) + .withAlternatives(DELTA_STREAMER_CONFIG_PREFIX + "source.test.num_partitions") .withDocumentation("Used by DistributedTestDataSource only. 
Number of partitions where each partitions generates test-data"); public static final ConfigProperty MAX_UNIQUE_RECORDS_PROP = ConfigProperty - .key("hoodie.deltastreamer.source.test.max_unique_records") + .key(STREAMER_CONFIG_PREFIX + "source.test.max_unique_records") .defaultValue(Integer.MAX_VALUE) + .withAlternatives(DELTA_STREAMER_CONFIG_PREFIX + "source.test.max_unique_records") .withDocumentation("Maximum number of unique records generated for the run"); public static final ConfigProperty USE_ROCKSDB_FOR_TEST_DATAGEN_KEYS = ConfigProperty - .key("hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys") + .key(STREAMER_CONFIG_PREFIX + "source.test.datagen.use_rocksdb_for_storing_existing_keys") .defaultValue(false) + .withAlternatives(DELTA_STREAMER_CONFIG_PREFIX + "source.test.datagen.use_rocksdb_for_storing_existing_keys") .withDocumentation("If true, uses Rocks DB for storing datagen keys"); public static final ConfigProperty ROCKSDB_BASE_DIR_FOR_TEST_DATAGEN_KEYS = ConfigProperty - .key("hoodie.deltastreamer.source.test.datagen.rocksdb_base_dir") + .key(STREAMER_CONFIG_PREFIX + "source.test.datagen.rocksdb_base_dir") .noDefaultValue() + .withAlternatives(DELTA_STREAMER_CONFIG_PREFIX + "source.test.datagen.rocksdb_base_dir") .withDocumentation("Base Dir for storing datagen keys"); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerTestBase.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerTestBase.java index 864c3502825b..2b2013d04cd0 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerTestBase.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerTestBase.java @@ -19,6 +19,7 @@ package org.apache.hudi.utilities.deltastreamer; +import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieRecord; @@ -26,11 +27,15 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieCleanConfig; import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.hive.HiveSyncConfigHolder; import org.apache.hudi.hive.MultiPartKeysValueExtractor; +import org.apache.hudi.hive.testutils.HiveTestService; +import org.apache.hudi.sync.common.HoodieSyncConfig; import org.apache.hudi.utilities.config.HoodieStreamerConfig; import org.apache.hudi.utilities.config.KafkaSourceConfig; import org.apache.hudi.utilities.config.SourceTestConfig; @@ -38,6 +43,7 @@ import org.apache.hudi.utilities.sources.HoodieIncrSource; import org.apache.hudi.utilities.sources.TestDataSource; import org.apache.hudi.utilities.sources.TestParquetDFSSourceEmptyBatch; +import org.apache.hudi.utilities.streamer.HoodieStreamer; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; import org.apache.avro.Schema; @@ -68,17 +74,6 @@ import java.util.concurrent.TimeUnit; import java.util.function.Function; -import static org.apache.hudi.common.config.HoodieCommonConfig.SET_NULL_FOR_MISSING_COLUMNS; -import static 
org.apache.hudi.common.table.timeline.TimelineMetadataUtils.serializeCommitMetadata; -import static org.apache.hudi.common.util.StringUtils.nonEmpty; -import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_URL; -import static org.apache.hudi.hive.testutils.HiveTestService.HS2_JDBC_URL; -import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME; -import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS; -import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS; -import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME; -import static org.apache.hudi.utilities.config.KafkaSourceConfig.KAFKA_AVRO_VALUE_DESERIALIZER_CLASS; -import static org.apache.hudi.utilities.streamer.HoodieStreamer.CHECKPOINT_KEY; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -137,9 +132,7 @@ public class HoodieDeltaStreamerTestBase extends UtilitiesTestBase { @BeforeEach protected void prepareTestSetup() throws IOException { - PARQUET_SOURCE_ROOT = basePath + "/parquetFiles"; - ORC_SOURCE_ROOT = basePath + "/orcFiles"; - JSON_KAFKA_SOURCE_ROOT = basePath + "/jsonKafkaFiles"; + setupTest(); testUtils = new KafkaTestUtils(); testUtils.setup(); topicName = "topic" + testNum; @@ -148,6 +141,36 @@ protected void prepareTestSetup() throws IOException { prepareORCDFSFiles(ORC_NUM_RECORDS, ORC_SOURCE_ROOT); } + @AfterEach + public void cleanupKafkaTestUtils() { + if (testUtils != null) { + testUtils.teardown(); + testUtils = null; + } + if (hudiOpts != null) { + hudiOpts = null; + } + } + + @BeforeAll + public static void initClass() throws Exception { + UtilitiesTestBase.initTestServices(false, true, false); + // basePath is defined in UtilitiesTestBase.initTestServices + PARQUET_SOURCE_ROOT = basePath + "/parquetFiles"; + ORC_SOURCE_ROOT = basePath + "/orcFiles"; + JSON_KAFKA_SOURCE_ROOT = basePath + "/jsonKafkaFiles"; + } + + @AfterAll + public static void tearDown() { + UtilitiesTestBase.cleanUpUtilitiesTestServices(); + } + + public void setupTest() { + TestDataSource.returnEmptyBatch = false; + hudiOpts = new HashMap<>(); + } + protected static void prepareInitialConfigs(FileSystem dfs, String dfsBasePath, String brokerAddress) throws IOException { // prepare the configs. 
UtilitiesTestBase.Helpers.copyToDFS("streamer-config/base.properties", dfs, dfsBasePath + "/base.properties"); @@ -181,8 +204,8 @@ protected static void prepareInitialConfigs(FileSystem dfs, String dfsBasePath, downstreamProps.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path"); // Source schema is the target schema of upstream table - downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/target.avsc"); - downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); + downstreamProps.setProperty("hoodie.streamer.schemaprovider.source.schema.file", dfsBasePath + "/target.avsc"); + downstreamProps.setProperty("hoodie.streamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); UtilitiesTestBase.Helpers.savePropsToDFS(downstreamProps, dfs, dfsBasePath + "/test-downstream-source.properties"); // Properties used for testing invalid key generator @@ -191,8 +214,8 @@ protected static void prepareInitialConfigs(FileSystem dfs, String dfsBasePath, invalidProps.setProperty("hoodie.datasource.write.keygenerator.class", "invalid"); invalidProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); invalidProps.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path"); - invalidProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); - invalidProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); + invalidProps.setProperty("hoodie.streamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); + invalidProps.setProperty("hoodie.streamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); UtilitiesTestBase.Helpers.savePropsToDFS(invalidProps, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_INVALID); // Properties used for testing inferring key generator for complex key generator @@ -200,8 +223,8 @@ protected static void prepareInitialConfigs(FileSystem dfs, String dfsBasePath, inferKeygenProps.setProperty("include", "base.properties"); inferKeygenProps.setProperty("hoodie.datasource.write.recordkey.field", "timestamp,_row_key"); inferKeygenProps.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path"); - inferKeygenProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); - inferKeygenProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); + inferKeygenProps.setProperty("hoodie.streamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); + inferKeygenProps.setProperty("hoodie.streamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); UtilitiesTestBase.Helpers.savePropsToDFS(inferKeygenProps, dfs, dfsBasePath + "/" + PROPS_FILENAME_INFER_COMPLEX_KEYGEN); // Properties used for testing inferring key generator for non-partitioned key generator @@ -217,8 +240,8 @@ protected static void prepareInitialConfigs(FileSystem dfs, String dfsBasePath, UtilitiesTestBase.Helpers.savePropsToDFS(properties, dfs, dfsBasePath + "/" + PROPS_INVALID_TABLE_CONFIG_FILE); TypedProperties invalidHiveSyncProps = new TypedProperties(); - invalidHiveSyncProps.setProperty("hoodie.deltastreamer.ingestion.tablesToBeIngested", "uber_db.dummy_table_uber"); - invalidHiveSyncProps.setProperty("hoodie.deltastreamer.ingestion.uber_db.dummy_table_uber.configFile", dfsBasePath + 
"/config/invalid_hive_sync_uber_config.properties"); + invalidHiveSyncProps.setProperty("hoodie.streamer.ingestion.tablesToBeIngested", "uber_db.dummy_table_uber"); + invalidHiveSyncProps.setProperty("hoodie.streamer.ingestion.uber_db.dummy_table_uber.configFile", dfsBasePath + "/config/invalid_hive_sync_uber_config.properties"); UtilitiesTestBase.Helpers.savePropsToDFS(invalidHiveSyncProps, dfs, dfsBasePath + "/" + PROPS_INVALID_HIVE_SYNC_TEST_SOURCE1); } @@ -228,47 +251,24 @@ protected static void writeCommonPropsToFile(FileSystem dfs, String dfsBasePath) props.setProperty("hoodie.datasource.write.keygenerator.class", TestHoodieDeltaStreamer.TestGenerator.class.getName()); props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); props.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path"); - props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); - props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); + props.setProperty("hoodie.streamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); + props.setProperty("hoodie.streamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); // Hive Configs - props.setProperty(HIVE_URL.key(), HS2_JDBC_URL); - props.setProperty(META_SYNC_DATABASE_NAME.key(), "testdb1"); - props.setProperty(META_SYNC_TABLE_NAME.key(), "hive_trips"); - props.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr"); - props.setProperty(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), + props.setProperty(HiveSyncConfigHolder.HIVE_URL.key(), HiveTestService.HS2_JDBC_URL); + props.setProperty(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "testdb1"); + props.setProperty(HoodieSyncConfig.META_SYNC_TABLE_NAME.key(), "hive_trips"); + props.setProperty(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"); + props.setProperty(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getName()); UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_SOURCE); } - @BeforeAll - public static void initClass() throws Exception { - UtilitiesTestBase.initTestServices(false, true, false); - } - - @AfterAll - public static void tearDown() throws IOException { - UtilitiesTestBase.cleanUpUtilitiesTestServices(); - } - - @AfterEach - public void cleanupKafkaTestUtils() { - if (testUtils != null) { - testUtils.teardown(); - } - } - - @BeforeEach - public void setupTest() { - TestDataSource.returnEmptyBatch = false; - hudiOpts = new HashMap<>(); - } - protected static void populateInvalidTableConfigFilePathProps(TypedProperties props, String dfsBasePath) { props.setProperty("hoodie.datasource.write.keygenerator.class", TestHoodieDeltaStreamer.TestGenerator.class.getName()); - props.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyyMMdd"); - props.setProperty("hoodie.deltastreamer.ingestion.tablesToBeIngested", "uber_db.dummy_table_uber"); - props.setProperty("hoodie.deltastreamer.ingestion.uber_db.dummy_table_uber.configFile", dfsBasePath + "/config/invalid_uber_config.properties"); + props.setProperty("hoodie.keygen.timebased.output.dateformat", "yyyyMMdd"); + props.setProperty("hoodie.streamer.ingestion.tablesToBeIngested", "uber_db.dummy_table_uber"); + props.setProperty("hoodie.streamer.ingestion.uber_db.dummy_table_uber.configFile", dfsBasePath + "/config/invalid_uber_config.properties"); } protected static void 
populateAllCommonProps(TypedProperties props, String dfsBasePath, String brokerAddress) { @@ -279,10 +279,10 @@ protected static void populateAllCommonProps(TypedProperties props, String dfsBa protected static void populateCommonProps(TypedProperties props, String dfsBasePath) { props.setProperty("hoodie.datasource.write.keygenerator.class", TestHoodieDeltaStreamer.TestGenerator.class.getName()); - props.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyyMMdd"); - props.setProperty("hoodie.deltastreamer.ingestion.tablesToBeIngested", "short_trip_db.dummy_table_short_trip,uber_db.dummy_table_uber"); - props.setProperty("hoodie.deltastreamer.ingestion.uber_db.dummy_table_uber.configFile", dfsBasePath + "/config/uber_config.properties"); - props.setProperty("hoodie.deltastreamer.ingestion.short_trip_db.dummy_table_short_trip.configFile", dfsBasePath + "/config/short_trip_uber_config.properties"); + props.setProperty("hoodie.keygen.timebased.output.dateformat", "yyyyMMdd"); + props.setProperty("hoodie.streamer.ingestion.tablesToBeIngested", "short_trip_db.dummy_table_short_trip,uber_db.dummy_table_uber"); + props.setProperty("hoodie.streamer.ingestion.uber_db.dummy_table_uber.configFile", dfsBasePath + "/config/uber_config.properties"); + props.setProperty("hoodie.streamer.ingestion.short_trip_db.dummy_table_short_trip.configFile", dfsBasePath + "/config/short_trip_uber_config.properties"); } protected static void populateCommonKafkaProps(TypedProperties props, String brokerAddress) { @@ -291,15 +291,15 @@ protected static void populateCommonKafkaProps(TypedProperties props, String bro props.setProperty("auto.offset.reset", "earliest"); props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"); props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); - props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents", String.valueOf(5000)); + props.setProperty("hoodie.streamer.kafka.source.maxEvents", String.valueOf(5000)); } protected static void populateCommonHiveProps(TypedProperties props) { // Hive Configs - props.setProperty(HIVE_URL.key(), HS2_JDBC_URL); - props.setProperty(META_SYNC_DATABASE_NAME.key(), "testdb2"); - props.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr"); - props.setProperty(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), + props.setProperty(HiveSyncConfigHolder.HIVE_URL.key(), HiveTestService.HS2_JDBC_URL); + props.setProperty(HoodieSyncConfig.META_SYNC_DATABASE_NAME.key(), "testdb2"); + props.setProperty(HoodieSyncConfig.META_SYNC_PARTITION_FIELDS.key(), "datestr"); + props.setProperty(HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), MultiPartKeysValueExtractor.class.getName()); } @@ -312,9 +312,14 @@ protected static void prepareParquetDFSFiles(int numRecords, String baseParquetP } protected static HoodieTestDataGenerator prepareParquetDFSFiles(int numRecords, String baseParquetPath, String fileName, boolean useCustomSchema, - String schemaStr, Schema schema) throws IOException { + String schemaStr, Schema schema) throws IOException { + return prepareParquetDFSFiles(numRecords, baseParquetPath, fileName, useCustomSchema, schemaStr, schema, false); + } + + protected static HoodieTestDataGenerator prepareParquetDFSFiles(int numRecords, String baseParquetPath, String fileName, boolean useCustomSchema, + String schemaStr, Schema schema, boolean makeDatesAmbiguous) throws IOException { String path = baseParquetPath + "/" + fileName; - HoodieTestDataGenerator 
dataGenerator = new HoodieTestDataGenerator(); + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(makeDatesAmbiguous); if (useCustomSchema) { Helpers.saveParquetToDFS(Helpers.toGenericRecords( dataGenerator.generateInsertsAsPerSchema("000", numRecords, schemaStr), @@ -379,12 +384,12 @@ protected void prepareParquetDFSSource(boolean useSchemaProvider, boolean hasTra parquetProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); parquetProps.setProperty("hoodie.datasource.write.partitionpath.field", partitionPath); if (useSchemaProvider) { - parquetProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", basePath + "/" + sourceSchemaFile); + parquetProps.setProperty("hoodie.streamer.schemaprovider.source.schema.file", basePath + "/" + sourceSchemaFile); if (hasTransformer) { - parquetProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", basePath + "/" + targetSchemaFile); + parquetProps.setProperty("hoodie.streamer.schemaprovider.target.schema.file", basePath + "/" + targetSchemaFile); } } - parquetProps.setProperty("hoodie.deltastreamer.source.dfs.root", parquetSourceRoot); + parquetProps.setProperty("hoodie.streamer.source.dfs.root", parquetSourceRoot); if (!StringUtils.isNullOrEmpty(emptyBatchParam)) { parquetProps.setProperty(TestParquetDFSSourceEmptyBatch.RETURN_EMPTY_BATCH, emptyBatchParam); } @@ -400,11 +405,11 @@ protected void prepareAvroKafkaDFSSource(String propsFileName, Long maxEventsTo props.setProperty("hoodie.embed.timeline.server", "false"); props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); props.setProperty("hoodie.datasource.write.partitionpath.field", partitionPath); - props.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName); - props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents", String.valueOf(5000)); + props.setProperty("hoodie.streamer.source.kafka.topic", topicName); + props.setProperty("hoodie.streamer.kafka.source.maxEvents", String.valueOf(5000)); props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); - props.setProperty(KAFKA_AVRO_VALUE_DESERIALIZER_CLASS.key(), ByteArrayDeserializer.class.getName()); - props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents", + props.setProperty(KafkaSourceConfig.KAFKA_AVRO_VALUE_DESERIALIZER_CLASS.key(), ByteArrayDeserializer.class.getName()); + props.setProperty("hoodie.streamer.kafka.source.maxEvents", maxEventsToReadFromKafkaSource != null ? 
String.valueOf(maxEventsToReadFromKafkaSource) : String.valueOf(KafkaSourceConfig.MAX_EVENTS_FROM_KAFKA_SOURCE.defaultValue())); props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, UUID.randomUUID().toString()); @@ -437,19 +442,19 @@ static List getTableServicesConfigs(int totalRecords, String autoClean, String inlineClusterMaxCommit, String asyncCluster, String asyncClusterMaxCommit) { List configs = new ArrayList<>(); configs.add(String.format("%s=%d", SourceTestConfig.MAX_UNIQUE_RECORDS_PROP.key(), totalRecords)); - if (nonEmpty(autoClean)) { + if (StringUtils.nonEmpty(autoClean)) { configs.add(String.format("%s=%s", HoodieCleanConfig.AUTO_CLEAN.key(), autoClean)); } - if (nonEmpty(inlineCluster)) { + if (StringUtils.nonEmpty(inlineCluster)) { configs.add(String.format("%s=%s", HoodieClusteringConfig.INLINE_CLUSTERING.key(), inlineCluster)); } - if (nonEmpty(inlineClusterMaxCommit)) { + if (StringUtils.nonEmpty(inlineClusterMaxCommit)) { configs.add(String.format("%s=%s", HoodieClusteringConfig.INLINE_CLUSTERING_MAX_COMMITS.key(), inlineClusterMaxCommit)); } - if (nonEmpty(asyncCluster)) { + if (StringUtils.nonEmpty(asyncCluster)) { configs.add(String.format("%s=%s", HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE.key(), asyncCluster)); } - if (nonEmpty(asyncClusterMaxCommit)) { + if (StringUtils.nonEmpty(asyncClusterMaxCommit)) { configs.add(String.format("%s=%s", HoodieClusteringConfig.ASYNC_CLUSTERING_MAX_COMMITS.key(), asyncClusterMaxCommit)); } return configs; @@ -477,7 +482,7 @@ static void addCommitToTimeline(HoodieTableMetaClient metaClient, WriteOperation metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(HoodieInstant.State.INFLIGHT, commitActiontype, commitTime)); metaClient.getActiveTimeline().saveAsComplete( new HoodieInstant(HoodieInstant.State.INFLIGHT, commitActiontype, commitTime), - serializeCommitMetadata(commitMetadata)); + TimelineMetadataUtils.serializeCommitMetadata(commitMetadata)); } void assertRecordCount(long expected, String tablePath, SQLContext sqlContext) { @@ -611,11 +616,11 @@ static HoodieDeltaStreamer.Config makeConfigForHudiIncrSrc(String srcBasePath, S cfg.schemaProviderClassName = schemaProviderClassName; } List cfgs = new ArrayList<>(); - cfgs.add(SET_NULL_FOR_MISSING_COLUMNS.key() + "=true"); - cfgs.add("hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt=" + addReadLatestOnMissingCkpt); - cfgs.add("hoodie.deltastreamer.source.hoodieincr.path=" + srcBasePath); + cfgs.add(HoodieCommonConfig.SET_NULL_FOR_MISSING_COLUMNS.key() + "=true"); + cfgs.add("hoodie.streamer.source.hoodieincr.read_latest_on_missing_ckpt=" + addReadLatestOnMissingCkpt); + cfgs.add("hoodie.streamer.source.hoodieincr.path=" + srcBasePath); // No partition - cfgs.add("hoodie.deltastreamer.source.hoodieincr.partition.fields=datestr"); + cfgs.add("hoodie.streamer.source.hoodieincr.partition.fields=datestr"); cfg.configs = cfgs; return cfg; } @@ -660,7 +665,7 @@ static String assertCommitMetadata(String expected, String tablePath, FileSystem HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(lastInstant).get(), HoodieCommitMetadata.class); assertEquals(totalCommits, timeline.countInstants()); - assertEquals(expected, commitMetadata.getMetadata(CHECKPOINT_KEY)); + assertEquals(expected, commitMetadata.getMetadata(HoodieStreamer.CHECKPOINT_KEY)); return lastInstant.getTimestamp(); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java 
b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java index 5294ae1b4c4a..34486a07ab8b 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java @@ -167,6 +167,10 @@ import static org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_OFFSET_COLUMN; import static org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_PARTITION_COLUMN; import static org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_TIMESTAMP_COLUMN; +import static org.apache.hudi.utilities.testutils.JdbcTestUtils.JDBC_DRIVER; +import static org.apache.hudi.utilities.testutils.JdbcTestUtils.JDBC_PASS; +import static org.apache.hudi.utilities.testutils.JdbcTestUtils.JDBC_URL; +import static org.apache.hudi.utilities.testutils.JdbcTestUtils.JDBC_USER; import static org.apache.hudi.utilities.testutils.UtilitiesTestBase.Helpers.jsonifyRecordsByPartitions; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -372,7 +376,7 @@ public void testKafkaConnectCheckpointProvider() throws IOException { HoodieDeltaStreamer.Config cfg = TestHelpers.makeDropAllConfig(tableBasePath, WriteOperationType.UPSERT); TypedProperties props = new DFSPropertiesConfiguration(fs.getConf(), new Path(basePath + "/" + PROPS_FILENAME_TEST_SOURCE)).getProps(); - props.put("hoodie.deltastreamer.checkpoint.provider.path", bootstrapPath); + props.put("hoodie.streamer.checkpoint.provider.path", bootstrapPath); cfg.initialCheckpointProvider = checkpointProviderClass; // create regular kafka connect hdfs dirs fs.mkdirs(new Path(bootstrapPath)); @@ -564,8 +568,8 @@ public void testSchemaEvolution(String tableType, boolean useUserProvidedSchema, HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, Collections.singletonList(TestIdentityTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false, true, false, null, tableType); addRecordMerger(recordType, cfg.configs); - cfg.configs.add("hoodie.deltastreamer.schemaprovider.source.schema.file=" + basePath + "/source.avsc"); - cfg.configs.add("hoodie.deltastreamer.schemaprovider.target.schema.file=" + basePath + "/source.avsc"); + cfg.configs.add("hoodie.streamer.schemaprovider.source.schema.file=" + basePath + "/source.avsc"); + cfg.configs.add("hoodie.streamer.schemaprovider.target.schema.file=" + basePath + "/source.avsc"); cfg.configs.add(DataSourceWriteOptions.RECONCILE_SCHEMA().key() + "=true"); if (!useSchemaPostProcessor) { cfg.configs.add(HoodieSchemaProviderConfig.SPARK_AVRO_POST_PROCESSOR_ENABLE.key() + "=false"); @@ -578,8 +582,8 @@ public void testSchemaEvolution(String tableType, boolean useUserProvidedSchema, cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, Collections.singletonList(TripsWithEvolvedOptionalFieldTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false, true, false, null, tableType); addRecordMerger(recordType, cfg.configs); - cfg.configs.add("hoodie.deltastreamer.schemaprovider.source.schema.file=" + basePath + "/source.avsc"); - cfg.configs.add("hoodie.deltastreamer.schemaprovider.target.schema.file=" + basePath + "/source_evolved.avsc"); + cfg.configs.add("hoodie.streamer.schemaprovider.source.schema.file=" + basePath + "/source.avsc"); + 
cfg.configs.add("hoodie.streamer.schemaprovider.target.schema.file=" + basePath + "/source_evolved.avsc"); cfg.configs.add(DataSourceWriteOptions.RECONCILE_SCHEMA().key() + "=true"); if (!useSchemaPostProcessor) { cfg.configs.add(HoodieSchemaProviderConfig.SPARK_AVRO_POST_PROCESSOR_ENABLE.key() + "=false"); @@ -603,9 +607,9 @@ public void testSchemaEvolution(String tableType, boolean useUserProvidedSchema, cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, Collections.singletonList(TestIdentityTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false, true, false, null, tableType); addRecordMerger(recordType, cfg.configs); - cfg.configs.add("hoodie.deltastreamer.schemaprovider.source.schema.file=" + basePath + "/source.avsc"); + cfg.configs.add("hoodie.streamer.schemaprovider.source.schema.file=" + basePath + "/source.avsc"); if (useUserProvidedSchema) { - cfg.configs.add("hoodie.deltastreamer.schemaprovider.target.schema.file=" + basePath + "/source_evolved.avsc"); + cfg.configs.add("hoodie.streamer.schemaprovider.target.schema.file=" + basePath + "/source_evolved.avsc"); } if (!useSchemaPostProcessor) { cfg.configs.add(HoodieSchemaProviderConfig.SPARK_AVRO_POST_PROCESSOR_ENABLE.key() + "=false"); @@ -644,12 +648,10 @@ public void testUpsertsCOWContinuousMode(HoodieRecordType recordType) throws Exc testUpsertsContinuousMode(HoodieTableType.COPY_ON_WRITE, "continuous_cow", recordType); } - @ParameterizedTest - @EnumSource(value = HoodieRecordType.class, names = {"AVRO", "SPARK"}) - public void testUpsertsCOW_ContinuousModeDisabled(HoodieRecordType recordType) throws Exception { + @Test + public void testUpsertsCOW_ContinuousModeDisabled() throws Exception { String tableBasePath = basePath + "/non_continuous_cow"; HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT); - addRecordMerger(recordType, cfg.configs); cfg.tableType = HoodieTableType.COPY_ON_WRITE.name(); cfg.configs.add(String.format("%s=%s", TURN_METRICS_ON.key(), "true")); cfg.configs.add(String.format("%s=%s", METRICS_REPORTER_TYPE_VALUE.key(), MetricsReporterType.INMEMORY.name())); @@ -675,12 +677,10 @@ public void testUpsertsMORContinuousMode(HoodieRecordType recordType) throws Exc testUpsertsContinuousMode(HoodieTableType.MERGE_ON_READ, "continuous_mor", recordType); } - @ParameterizedTest - @EnumSource(value = HoodieRecordType.class, names = {"AVRO", "SPARK"}) - public void testUpsertsMOR_ContinuousModeDisabled(HoodieRecordType recordType) throws Exception { + @Test + public void testUpsertsMOR_ContinuousModeDisabled() throws Exception { String tableBasePath = basePath + "/non_continuous_mor"; HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT); - addRecordMerger(recordType, cfg.configs); cfg.tableType = HoodieTableType.MERGE_ON_READ.name(); cfg.configs.add(String.format("%s=%s", TURN_METRICS_ON.key(), "true")); cfg.configs.add(String.format("%s=%s", METRICS_REPORTER_TYPE_VALUE.key(), MetricsReporterType.INMEMORY.name())); @@ -878,8 +878,8 @@ public void testDeltaSyncWithPendingCompaction() throws Exception { } @ParameterizedTest - @CsvSource(value = {"true, AVRO", "true, SPARK", "false, AVRO", "false, SPARK"}) - public void testCleanerDeleteReplacedDataWithArchive(Boolean asyncClean, HoodieRecordType recordType) throws Exception { + @ValueSource(booleans = {true, false}) + public void testCleanerDeleteReplacedDataWithArchive(Boolean asyncClean) throws Exception { String tableBasePath = basePath + 
"/cleanerDeleteReplacedDataWithArchive" + asyncClean; int totalRecords = 3000; @@ -887,7 +887,7 @@ public void testCleanerDeleteReplacedDataWithArchive(Boolean asyncClean, HoodieR // Step 1 : Prepare and insert data without archival and cleaner. // Make sure that there are 6 commits including 2 replacecommits completed. HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT); - addRecordMerger(recordType, cfg.configs); + addRecordMerger(HoodieRecordType.AVRO, cfg.configs); cfg.continuousMode = true; cfg.tableType = HoodieTableType.COPY_ON_WRITE.name(); cfg.configs.addAll(getTableServicesConfigs(totalRecords, "false", "true", "2", "", "")); @@ -956,7 +956,7 @@ public void testCleanerDeleteReplacedDataWithArchive(Boolean asyncClean, HoodieR configs.add(String.format("%s=%s", HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key(), InProcessLockProvider.class.getName())); } - addRecordMerger(recordType, configs); + addRecordMerger(HoodieRecordType.AVRO, configs); cfg.configs = configs; cfg.continuousMode = false; // timeline as of now. no cleaner and archival kicked in. @@ -1137,6 +1137,7 @@ private void testAsyncClusteringService(HoodieRecordType recordType) throws Exce cfg.tableType = HoodieTableType.COPY_ON_WRITE.name(); cfg.configs.addAll(getTableServicesConfigs(totalRecords, "false", "", "", "true", "3")); cfg.configs.add(String.format("%s=%s", "hoodie.datasource.write.row.writer.enable", "false")); + cfg.configs.add(String.format("%s=%s", "hoodie.merge.allow.duplicate.on.inserts", "false")); HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc); deltaStreamerTestRunner(ds, cfg, (r) -> { TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, fs); @@ -1187,19 +1188,19 @@ private void testAsyncClusteringServiceWithConflicts(HoodieRecordType recordType } @Timeout(600) - @ParameterizedTest - @EnumSource(value = HoodieRecordType.class, names = {"AVRO", "SPARK"}) - public void testAsyncClusteringServiceWithCompaction(HoodieRecordType recordType) throws Exception { + @Test + public void testAsyncClusteringServiceWithCompaction() throws Exception { String tableBasePath = basePath + "/asyncClusteringCompaction"; // Keep it higher than batch-size to test continuous mode int totalRecords = 2000; // Initial bulk insert HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT); - addRecordMerger(recordType, cfg.configs); + addRecordMerger(HoodieRecordType.AVRO, cfg.configs); cfg.continuousMode = true; cfg.tableType = HoodieTableType.MERGE_ON_READ.name(); cfg.configs.addAll(getTableServicesConfigs(totalRecords, "false", "", "", "true", "3")); + cfg.configs.add(String.format("%s=%s", "hoodie.merge.allow.duplicate.on.inserts", "false")); HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc); deltaStreamerTestRunner(ds, cfg, (r) -> { TestHelpers.assertAtleastNCompactionCommits(2, tableBasePath, fs); @@ -1214,14 +1215,14 @@ public void testAsyncClusteringServiceWithCompaction(HoodieRecordType recordType } @ParameterizedTest - @CsvSource(value = {"true, AVRO", "true, SPARK", "false, AVRO", "false, SPARK"}) - public void testAsyncClusteringJobWithRetry(boolean retryLastFailedClusteringJob, HoodieRecordType recordType) throws Exception { + @ValueSource(booleans = {true, false}) + public void testAsyncClusteringJobWithRetry(boolean retryLastFailedClusteringJob) throws Exception { String tableBasePath = basePath + "/asyncClustering3"; // ingest data int totalRecords = 3000; HoodieDeltaStreamer.Config cfg = 
TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT); - addRecordMerger(recordType, cfg.configs); + addRecordMerger(HoodieRecordType.AVRO, cfg.configs); cfg.continuousMode = false; cfg.tableType = HoodieTableType.COPY_ON_WRITE.name(); cfg.configs.addAll(getTableServicesConfigs(totalRecords, "false", "false", "0", "false", "0")); @@ -1249,7 +1250,7 @@ public void testAsyncClusteringJobWithRetry(boolean retryLastFailedClusteringJob // trigger a scheduleAndExecute clustering job // when retryFailedClustering true => will rollback and re-execute failed clustering plan with same instant timestamp. // when retryFailedClustering false => will make and execute a new clustering plan with new instant timestamp. - HoodieClusteringJob scheduleAndExecute = initialHoodieClusteringJob(tableBasePath, null, false, "scheduleAndExecute", retryLastFailedClusteringJob, recordType); + HoodieClusteringJob scheduleAndExecute = initialHoodieClusteringJob(tableBasePath, null, false, "scheduleAndExecute", retryLastFailedClusteringJob, HoodieRecordType.AVRO); scheduleAndExecute.cluster(0); String completeClusteringTimeStamp = meta.getActiveTimeline().reload().getCompletedReplaceTimeline().lastInstant().get().getTimestamp(); @@ -1263,11 +1264,11 @@ public void testAsyncClusteringJobWithRetry(boolean retryLastFailedClusteringJob } @ParameterizedTest - @CsvSource(value = {"execute, AVRO", "schedule, AVRO", "scheduleAndExecute, AVRO", "execute, SPARK", "schedule, SPARK", "scheduleAndExecute, SPARK"}) - public void testHoodieAsyncClusteringJobWithScheduleAndExecute(String runningMode, HoodieRecordType recordType) throws Exception { + @ValueSource(strings = {"execute", "schedule", "scheduleAndExecute"}) + public void testHoodieAsyncClusteringJobWithScheduleAndExecute(String runningMode) throws Exception { String tableBasePath = basePath + "/asyncClustering2"; - HoodieDeltaStreamer ds = initialHoodieDeltaStreamer(tableBasePath, 3000, "false", recordType, WriteOperationType.BULK_INSERT); - HoodieClusteringJob scheduleClusteringJob = initialHoodieClusteringJob(tableBasePath, null, true, runningMode, recordType); + HoodieDeltaStreamer ds = initialHoodieDeltaStreamer(tableBasePath, 3000, "false", HoodieRecordType.AVRO, WriteOperationType.BULK_INSERT); + HoodieClusteringJob scheduleClusteringJob = initialHoodieClusteringJob(tableBasePath, null, true, runningMode, HoodieRecordType.AVRO); deltaStreamerTestRunner(ds, (r) -> { Exception exception = null; @@ -1406,20 +1407,34 @@ private void testBulkInsertRowWriterMultiBatches(Boolean useSchemaProvider, List @Test public void testBulkInsertRowWriterContinuousModeWithAsyncClustering() throws Exception { testBulkInsertRowWriterContinuousMode(false, null, false, - getTableServicesConfigs(2000, "false", "", "", "true", "3")); + getTableServicesConfigs(2000, "false", "", "", "true", "3"), false); } @Test public void testBulkInsertRowWriterContinuousModeWithInlineClustering() throws Exception { testBulkInsertRowWriterContinuousMode(false, null, false, - getTableServicesConfigs(2000, "false", "true", "3", "false", "")); + getTableServicesConfigs(2000, "false", "true", "3", "false", ""), false); } - private void testBulkInsertRowWriterContinuousMode(Boolean useSchemaProvider, List transformerClassNames, boolean testEmptyBatch, List customConfigs) throws Exception { + @Test + public void testBulkInsertRowWriterContinuousModeWithInlineClusteringAmbiguousDates() throws Exception { + sparkSession.sqlContext().setConf("spark.sql.parquet.datetimeRebaseModeInWrite", "LEGACY"); + 
sparkSession.sqlContext().setConf("spark.sql.avro.datetimeRebaseModeInWrite", "LEGACY"); + sparkSession.sqlContext().setConf("spark.sql.parquet.int96RebaseModeInWrite", "LEGACY"); + sparkSession.sqlContext().setConf("spark.sql.parquet.datetimeRebaseModeInRead", "LEGACY"); + sparkSession.sqlContext().setConf("spark.sql.avro.datetimeRebaseModeInRead", "LEGACY"); + sparkSession.sqlContext().setConf("spark.sql.parquet.int96RebaseModeInRead", "LEGACY"); + testBulkInsertRowWriterContinuousMode(false, null, false, + getTableServicesConfigs(2000, "false", "true", "3", + "false", ""), true); + } + + private void testBulkInsertRowWriterContinuousMode(Boolean useSchemaProvider, List transformerClassNames, + boolean testEmptyBatch, List customConfigs, boolean makeDatesAmbiguous) throws Exception { PARQUET_SOURCE_ROOT = basePath + "/parquetFilesDfs" + testNum; int parquetRecordsCount = 100; boolean hasTransformer = transformerClassNames != null && !transformerClassNames.isEmpty(); - prepareParquetDFSFiles(parquetRecordsCount, PARQUET_SOURCE_ROOT, FIRST_PARQUET_FILE_NAME, false, null, null); + prepareParquetDFSFiles(parquetRecordsCount, PARQUET_SOURCE_ROOT, FIRST_PARQUET_FILE_NAME, false, null, null, makeDatesAmbiguous); prepareParquetDFSSource(useSchemaProvider, hasTransformer, "source.avsc", "target.avsc", PROPS_FILENAME_TEST_PARQUET, PARQUET_SOURCE_ROOT, false, "partition_path", testEmptyBatch ? "1" : ""); @@ -1429,7 +1444,7 @@ private void testBulkInsertRowWriterContinuousMode(Boolean useSchemaProvider, Li int counter = 2; while (counter < 100) { // lets keep going. if the test times out, we will cancel the future within finally. So, safe to generate 100 batches. LOG.info("Generating data for batch " + counter); - prepareParquetDFSFiles(100, PARQUET_SOURCE_ROOT, Integer.toString(counter) + ".parquet", false, null, null); + prepareParquetDFSFiles(100, PARQUET_SOURCE_ROOT, Integer.toString(counter) + ".parquet", false, null, null, makeDatesAmbiguous); counter++; Thread.sleep(2000); } @@ -1473,9 +1488,9 @@ private void testBulkInsertRowWriterContinuousMode(Boolean useSchemaProvider, Li * step involves using a SQL template to transform a source TEST-DATA-SOURCE ============================> HUDI TABLE * 1 ===============> HUDI TABLE 2 (incr-pull with transform) (incr-pull) Hudi Table 1 is synced with Hive. 
*/ - @ParameterizedTest - @EnumSource(value = HoodieRecordType.class, names = {"AVRO", "SPARK"}) - public void testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline(HoodieRecordType recordType) throws Exception { + @Test + public void testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline() throws Exception { + HoodieRecordType recordType = HoodieRecordType.AVRO; String tableBasePath = basePath + "/" + recordType.toString() + "/test_table2"; String downstreamTableBasePath = basePath + "/" + recordType.toString() + "/test_downstream_table2"; @@ -1635,14 +1650,12 @@ public void testPayloadClassUpdateWithCOWTable() throws Exception { assertFalse(props.containsKey(HoodieTableConfig.PAYLOAD_CLASS_NAME.key())); } - @ParameterizedTest - @EnumSource(value = HoodieRecordType.class, names = {"AVRO", "SPARK"}) - public void testFilterDupes(HoodieRecordType recordType) throws Exception { + @Test + public void testFilterDupes() throws Exception { String tableBasePath = basePath + "/test_dupes_table"; // Initial bulk insert HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT); - addRecordMerger(recordType, cfg.configs); new HoodieDeltaStreamer(cfg, jsc).sync(); assertRecordCount(1000, tableBasePath, sqlContext); TestHelpers.assertCommitMetadata("00000", tableBasePath, fs, 1); @@ -1663,7 +1676,7 @@ public void testFilterDupes(HoodieRecordType recordType) throws Exception { HoodieTableMetaClient mClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(tableBasePath).setLoadActiveTimelineOnLoad(true).build(); HoodieInstant lastFinished = mClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get(); HoodieDeltaStreamer.Config cfg2 = TestHelpers.makeDropAllConfig(tableBasePath, WriteOperationType.UPSERT); - addRecordMerger(recordType, cfg2.configs); + addRecordMerger(HoodieRecordType.AVRO, cfg2.configs); cfg2.filterDupes = false; cfg2.sourceLimit = 2000; cfg2.operation = WriteOperationType.UPSERT; @@ -1809,12 +1822,12 @@ private void testORCDFSSource(boolean useSchemaProvider, List transforme orcProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); orcProps.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path"); if (useSchemaProvider) { - orcProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", basePath + "/" + "source.avsc"); + orcProps.setProperty("hoodie.streamer.schemaprovider.source.schema.file", basePath + "/" + "source.avsc"); if (transformerClassNames != null) { - orcProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", basePath + "/" + "target.avsc"); + orcProps.setProperty("hoodie.streamer.schemaprovider.target.schema.file", basePath + "/" + "target.avsc"); } } - orcProps.setProperty("hoodie.deltastreamer.source.dfs.root", ORC_SOURCE_ROOT); + orcProps.setProperty("hoodie.streamer.source.dfs.root", ORC_SOURCE_ROOT); UtilitiesTestBase.Helpers.savePropsToDFS(orcProps, fs, basePath + "/" + PROPS_FILENAME_TEST_ORC); String tableBasePath = basePath + "/test_orc_source_table" + testNum; @@ -1839,11 +1852,11 @@ private void prepareJsonKafkaDFSSource(String propsFileName, String autoResetVal props.setProperty("hoodie.embed.timeline.server", "false"); props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); props.setProperty("hoodie.datasource.write.partitionpath.field", "driver"); - props.setProperty("hoodie.deltastreamer.source.dfs.root", JSON_KAFKA_SOURCE_ROOT); - 
props.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName); - props.setProperty("hoodie.deltastreamer.source.kafka.checkpoint.type", kafkaCheckpointType); - props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", basePath + "/source_uber.avsc"); - props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", basePath + "/target_uber.avsc"); + props.setProperty("hoodie.streamer.source.dfs.root", JSON_KAFKA_SOURCE_ROOT); + props.setProperty("hoodie.streamer.source.kafka.topic", topicName); + props.setProperty("hoodie.streamer.source.kafka.checkpoint.type", kafkaCheckpointType); + props.setProperty("hoodie.streamer.schemaprovider.source.schema.file", basePath + "/source_uber.avsc"); + props.setProperty("hoodie.streamer.schemaprovider.target.schema.file", basePath + "/target_uber.avsc"); props.setProperty("auto.offset.reset", autoResetValue); if (extraProps != null && !extraProps.isEmpty()) { extraProps.forEach(props::setProperty); @@ -2121,8 +2134,8 @@ public void testEmptyBatchWithNullSchemaFirstBatch() throws Exception { String tableBasePath = basePath + "/test_parquet_table" + testNum; HoodieDeltaStreamer.Config config = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, ParquetDFSSource.class.getName(), - null, PROPS_FILENAME_TEST_PARQUET, false, - false, 100000, false, null, null, "timestamp", null); + Collections.singletonList(TestIdentityTransformer.class.getName()), PROPS_FILENAME_TEST_PARQUET, false, + false, 100000, false, null, "MERGE_ON_READ", "timestamp", null); config.schemaProviderClassName = NullValueSchemaProvider.class.getName(); config.sourceClassName = TestParquetDFSSourceEmptyBatch.class.getName(); @@ -2242,22 +2255,22 @@ private void prepareCsvDFSSource( csvProps.setProperty("hoodie.datasource.write.recordkey.field", recordKeyField); csvProps.setProperty("hoodie.datasource.write.partitionpath.field", partitionPath); if (useSchemaProvider) { - csvProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", basePath + "/source-flattened.avsc"); + csvProps.setProperty("hoodie.streamer.schemaprovider.source.schema.file", basePath + "/source-flattened.avsc"); if (hasTransformer) { - csvProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", basePath + "/target-flattened.avsc"); + csvProps.setProperty("hoodie.streamer.schemaprovider.target.schema.file", basePath + "/target-flattened.avsc"); } } - csvProps.setProperty("hoodie.deltastreamer.source.dfs.root", sourceRoot); + csvProps.setProperty("hoodie.streamer.source.dfs.root", sourceRoot); if (sep != ',') { if (sep == '\t') { - csvProps.setProperty("hoodie.deltastreamer.csv.sep", "\\t"); + csvProps.setProperty("hoodie.streamer.csv.sep", "\\t"); } else { - csvProps.setProperty("hoodie.deltastreamer.csv.sep", Character.toString(sep)); + csvProps.setProperty("hoodie.streamer.csv.sep", Character.toString(sep)); } } if (hasHeader) { - csvProps.setProperty("hoodie.deltastreamer.csv.header", Boolean.toString(hasHeader)); + csvProps.setProperty("hoodie.streamer.csv.header", Boolean.toString(hasHeader)); } UtilitiesTestBase.Helpers.savePropsToDFS(csvProps, fs, basePath + "/" + PROPS_FILENAME_TEST_CSV); @@ -2378,7 +2391,7 @@ private void prepareSqlSource() throws IOException { sqlSourceProps.setProperty("hoodie.embed.timeline.server", "false"); sqlSourceProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); sqlSourceProps.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path"); - 
sqlSourceProps.setProperty("hoodie.deltastreamer.source.sql.sql.query", "select * from test_sql_table"); + sqlSourceProps.setProperty("hoodie.streamer.source.sql.sql.query", "select * from test_sql_table"); UtilitiesTestBase.Helpers.savePropsToDFS(sqlSourceProps, fs, basePath + "/" + PROPS_FILENAME_TEST_SQL_SOURCE); @@ -2406,21 +2419,19 @@ public void testSqlSourceSource() throws Exception { assertRecordCount(SQL_SOURCE_NUM_RECORDS, tableBasePath, sqlContext); } - @Disabled @Test public void testJdbcSourceIncrementalFetchInContinuousMode() { - try (Connection connection = DriverManager.getConnection("jdbc:h2:mem:test_mem", "test", "jdbc")) { + try (Connection connection = DriverManager.getConnection(JDBC_URL, JDBC_USER, JDBC_PASS)) { TypedProperties props = new TypedProperties(); - props.setProperty("hoodie.deltastreamer.jdbc.url", "jdbc:h2:mem:test_mem"); - props.setProperty("hoodie.deltastreamer.jdbc.driver.class", "org.h2.Driver"); - props.setProperty("hoodie.deltastreamer.jdbc.user", "test"); - props.setProperty("hoodie.deltastreamer.jdbc.password", "jdbc"); - props.setProperty("hoodie.deltastreamer.jdbc.table.name", "triprec"); - props.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true"); - props.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "id"); + props.setProperty("hoodie.streamer.jdbc.url", JDBC_URL); + props.setProperty("hoodie.streamer.jdbc.driver.class", JDBC_DRIVER); + props.setProperty("hoodie.streamer.jdbc.user", JDBC_USER); + props.setProperty("hoodie.streamer.jdbc.password", JDBC_PASS); + props.setProperty("hoodie.streamer.jdbc.table.name", "triprec"); + props.setProperty("hoodie.streamer.jdbc.incr.pull", "true"); + props.setProperty("hoodie.streamer.jdbc.table.incr.column.name", "id"); props.setProperty("hoodie.datasource.write.recordkey.field", "ID"); - props.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path"); UtilitiesTestBase.Helpers.savePropsToDFS(props, fs, basePath + "/test-jdbc-source.properties"); @@ -2454,7 +2465,7 @@ public void testHoodieIncrFallback() throws Exception { HoodieDeltaStreamer.Config downstreamCfg = TestHelpers.makeConfigForHudiIncrSrc(tableBasePath, downstreamTableBasePath, WriteOperationType.BULK_INSERT, true, null); - downstreamCfg.configs.add("hoodie.deltastreamer.source.hoodieincr.num_instants=1"); + downstreamCfg.configs.add("hoodie.streamer.source.hoodieincr.num_instants=1"); new HoodieDeltaStreamer(downstreamCfg, jsc).sync(); insertInTable(tableBasePath, 9, WriteOperationType.UPSERT); @@ -2470,7 +2481,7 @@ public void testHoodieIncrFallback() throws Exception { downstreamCfg.configs.remove(downstreamCfg.configs.size() - 1); downstreamCfg.configs.add(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN().key() + "=true"); //Adding this conf to make testing easier :) - downstreamCfg.configs.add("hoodie.deltastreamer.source.hoodieincr.num_instants=10"); + downstreamCfg.configs.add("hoodie.streamer.source.hoodieincr.num_instants=10"); downstreamCfg.operation = WriteOperationType.UPSERT; new HoodieDeltaStreamer(downstreamCfg, jsc).sync(); new HoodieDeltaStreamer(downstreamCfg, jsc).sync(); @@ -2825,9 +2836,9 @@ public void testAutoGenerateRecordKeys() throws Exception { } @ParameterizedTest - @CsvSource(value = {"COPY_ON_WRITE, AVRO", "MERGE_ON_READ, AVRO", - "COPY_ON_WRITE, SPARK", "MERGE_ON_READ, SPARK"}) - public void testConfigurationHotUpdate(HoodieTableType tableType, HoodieRecordType recordType) throws Exception { + @EnumSource(HoodieTableType.class) + public void 
testConfigurationHotUpdate(HoodieTableType tableType) throws Exception { + HoodieRecordType recordType = HoodieRecordType.AVRO; String tableBasePath = basePath + String.format("/configurationHotUpdate_%s_%s", tableType.name(), recordType.name()); HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamerSchemaEvolutionQuick.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamerSchemaEvolutionQuick.java index 096ddf14cc76..5bbeb006029f 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamerSchemaEvolutionQuick.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamerSchemaEvolutionQuick.java @@ -24,7 +24,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.SchemaCompatibilityException; +import org.apache.hudi.exception.MissingSchemaFieldException; import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.streamer.HoodieStreamer; @@ -126,6 +126,7 @@ protected static Stream testParamsWithSchemaTransformer() { b.add(Arguments.of("COPY_ON_WRITE", true, true, true, true, true)); b.add(Arguments.of("COPY_ON_WRITE", true, false, false, false, true)); b.add(Arguments.of("MERGE_ON_READ", true, true, true, false, false)); + b.add(Arguments.of("MERGE_ON_READ", true, true, false, false, false)); b.add(Arguments.of("MERGE_ON_READ", true, false, true, true, false)); } return b.build(); @@ -221,8 +222,7 @@ public void testBase(String tableType, addData(df, false); deltaStreamer.sync(); assertTrue(allowNullForDeletedCols); - } catch (SchemaCompatibilityException e) { - assertTrue(e.getMessage().contains("Incoming batch schema is not compatible with the table's one")); + } catch (MissingSchemaFieldException e) { assertFalse(allowNullForDeletedCols); return; } @@ -405,10 +405,8 @@ public void testDroppedColumn(String tableType, assertTrue(latestTableSchemaOpt.get().getField("rider").schema().getTypes() .stream().anyMatch(t -> t.getType().equals(Schema.Type.STRING))); assertTrue(metaClient.reloadActiveTimeline().lastInstant().get().compareTo(lastInstant) > 0); - } catch (SchemaCompatibilityException e) { + } catch (MissingSchemaFieldException e) { assertFalse(allowNullForDeletedCols || targetSchemaSameAsTableSchema); - assertTrue(e.getMessage().contains("Incoming batch schema is not compatible with the table's one")); - assertFalse(allowNullForDeletedCols); } } @@ -488,7 +486,7 @@ public void testNonNullableColumnDrop(String tableType, assertTrue(metaClient.reloadActiveTimeline().lastInstant().get().compareTo(lastInstant) > 0); } catch (Exception e) { assertTrue(containsErrorMessage(e, "java.lang.NullPointerException", - "Incoming batch schema is not compatible with the table's one")); + "Schema validation failed due to missing field.")); } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamerWithMultiWriter.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamerWithMultiWriter.java index 2745edef5846..635b57c9fa67 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamerWithMultiWriter.java +++ 
b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamerWithMultiWriter.java @@ -320,8 +320,8 @@ private static TypedProperties prepareMultiWriterProps(FileSystem fs, String bas props.setProperty("hoodie.datasource.write.keygenerator.class", TestHoodieDeltaStreamer.TestGenerator.class.getName()); props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); props.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path"); - props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", basePath + "/source.avsc"); - props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", basePath + "/target.avsc"); + props.setProperty("hoodie.streamer.schemaprovider.source.schema.file", basePath + "/source.avsc"); + props.setProperty("hoodie.streamer.schemaprovider.target.schema.file", basePath + "/target.avsc"); props.setProperty("include", "base.properties"); props.setProperty("hoodie.write.concurrency.mode", "optimistic_concurrency_control"); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieMultiTableDeltaStreamer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieMultiTableDeltaStreamer.java index 26ea61e31fe6..0c5de8634368 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieMultiTableDeltaStreamer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieMultiTableDeltaStreamer.java @@ -178,16 +178,16 @@ public void testMultiTableExecutionWithKafkaSource() throws IOException { HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc); List executionContexts = streamer.getTableExecutionContexts(); TypedProperties properties = executionContexts.get(1).getProperties(); - properties.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", basePath + "/source_uber.avsc"); - properties.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", basePath + "/target_uber.avsc"); + properties.setProperty("hoodie.streamer.schemaprovider.source.schema.file", basePath + "/source_uber.avsc"); + properties.setProperty("hoodie.streamer.schemaprovider.target.schema.file", basePath + "/target_uber.avsc"); properties.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp"); - properties.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName2); + properties.setProperty("hoodie.streamer.source.kafka.topic", topicName2); executionContexts.get(1).setProperties(properties); TypedProperties properties1 = executionContexts.get(0).getProperties(); - properties1.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", basePath + "/source_short_trip_uber.avsc"); - properties1.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", basePath + "/target_short_trip_uber.avsc"); + properties1.setProperty("hoodie.streamer.schemaprovider.source.schema.file", basePath + "/source_short_trip_uber.avsc"); + properties1.setProperty("hoodie.streamer.schemaprovider.target.schema.file", basePath + "/target_short_trip_uber.avsc"); properties1.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp"); - properties1.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName1); + properties1.setProperty("hoodie.streamer.source.kafka.topic", topicName1); executionContexts.get(0).setProperties(properties1); String targetBasePath1 = executionContexts.get(0).getConfig().targetBasePath; 
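// Editorial sketch (not part of the patch): the hunks above and below apply the same
// mechanical rename used throughout this diff, swapping the deprecated
// "hoodie.deltastreamer.*" property prefix for "hoodie.streamer.*". The class below is a
// minimal, self-contained illustration of the new-style keys exactly as they appear in
// these hunks; the import path for TypedProperties and the class/method names are
// assumptions made for the example only.
import org.apache.hudi.common.config.TypedProperties;

final class StreamerPropertyRenameSketch {

  // Builds schema-provider and Kafka source properties using the renamed keys.
  static TypedProperties newStyleProps(String basePath, String topicName) {
    TypedProperties props = new TypedProperties();
    // previously: hoodie.deltastreamer.schemaprovider.source.schema.file
    props.setProperty("hoodie.streamer.schemaprovider.source.schema.file", basePath + "/source_uber.avsc");
    // previously: hoodie.deltastreamer.schemaprovider.target.schema.file
    props.setProperty("hoodie.streamer.schemaprovider.target.schema.file", basePath + "/target_uber.avsc");
    // previously: hoodie.deltastreamer.source.kafka.topic
    props.setProperty("hoodie.streamer.source.kafka.topic", topicName);
    return props;
  }
}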
String targetBasePath2 = executionContexts.get(1).getConfig().targetBasePath; @@ -288,7 +288,7 @@ private TypedProperties getParquetProps(String parquetSourceRoot) { props.setProperty("include", "base.properties"); props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); props.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path"); - props.setProperty("hoodie.deltastreamer.source.dfs.root", parquetSourceRoot); + props.setProperty("hoodie.streamer.source.dfs.root", parquetSourceRoot); return props; } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestSourceFormatAdapter.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestSourceFormatAdapter.java index 1d6f2f110b2b..788105c20284 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestSourceFormatAdapter.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestSourceFormatAdapter.java @@ -49,6 +49,7 @@ import java.util.stream.Stream; +import static org.apache.hudi.testutils.HoodieClientTestUtils.getSparkConfForTest; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -64,9 +65,7 @@ public class TestSourceFormatAdapter { public static void start() { spark = SparkSession .builder() - .master("local[*]") - .appName(TestSourceFormatAdapter.class.getName()) - .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .config(getSparkConfForTest(TestSourceFormatAdapter.class.getName())) .getOrCreate(); jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHiveSchemaProvider.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHiveSchemaProvider.java index e2ae67aae23c..75e812acf374 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHiveSchemaProvider.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHiveSchemaProvider.java @@ -55,8 +55,8 @@ public class TestHiveSchemaProvider extends SparkClientFunctionalTestHarnessWith @BeforeAll public static void init() { Pair dbAndTableName = paresDBAndTableName(SOURCE_SCHEMA_TABLE_NAME); - PROPS.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.hive.database", dbAndTableName.getLeft()); - PROPS.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.hive.table", dbAndTableName.getRight()); + PROPS.setProperty("hoodie.streamer.schemaprovider.source.schema.hive.database", dbAndTableName.getLeft()); + PROPS.setProperty("hoodie.streamer.schemaprovider.source.schema.hive.table", dbAndTableName.getRight()); } @Disabled @@ -84,8 +84,8 @@ public void testSourceSchema() throws Exception { public void testTargetSchema() throws Exception { try { Pair dbAndTableName = paresDBAndTableName(TARGET_SCHEMA_TABLE_NAME); - PROPS.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.hive.database", dbAndTableName.getLeft()); - PROPS.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.hive.table", dbAndTableName.getRight()); + PROPS.setProperty("hoodie.streamer.schemaprovider.target.schema.hive.database", dbAndTableName.getLeft()); + PROPS.setProperty("hoodie.streamer.schemaprovider.target.schema.hive.table", dbAndTableName.getRight()); createSchemaTable(SOURCE_SCHEMA_TABLE_NAME); createSchemaTable(TARGET_SCHEMA_TABLE_NAME); Schema targetSchema = 
UtilHelpers.createSchemaProvider(HiveSchemaProvider.class.getName(), PROPS, jsc()).getTargetSchema(); @@ -105,7 +105,7 @@ public void testTargetSchema() throws Exception { @Test public void testNotExistTable() { String wrongName = "wrong_schema_tab"; - PROPS.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.hive.table", wrongName); + PROPS.setProperty("hoodie.streamer.schemaprovider.source.schema.hive.table", wrongName); Assertions.assertThrows(NoSuchTableException.class, () -> { try { UtilHelpers.createSchemaProvider(HiveSchemaProvider.class.getName(), PROPS, jsc()).getSourceSchema(); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotCopier.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotCopier.java index eb1ac22d69ad..9facedb81413 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotCopier.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotCopier.java @@ -20,6 +20,7 @@ import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.hadoop.fs.HadoopFSUtils; @@ -45,6 +46,7 @@ @Tag("functional") public class TestHoodieSnapshotCopier extends FunctionalTestHarness { + private static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); private static final String TEST_WRITE_TOKEN = "1-0-1"; private String basePath; @@ -100,27 +102,27 @@ public void testSnapshotCopy() throws Exception { HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[] {"2016/05/01", "2016/05/02", "2016/05/06"}, basePath); // Make commit1 - File file11 = new File(basePath + "/2016/05/01/" + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, "id11")); + File file11 = new File(basePath + "/2016/05/01/" + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, "id11", BASE_FILE_EXTENSION)); file11.createNewFile(); - File file12 = new File(basePath + "/2016/05/02/" + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, "id12")); + File file12 = new File(basePath + "/2016/05/02/" + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, "id12", BASE_FILE_EXTENSION)); file12.createNewFile(); - File file13 = new File(basePath + "/2016/05/06/" + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, "id13")); + File file13 = new File(basePath + "/2016/05/06/" + FSUtils.makeBaseFileName(commitTime1, TEST_WRITE_TOKEN, "id13", BASE_FILE_EXTENSION)); file13.createNewFile(); // Make commit2 - File file21 = new File(basePath + "/2016/05/01/" + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, "id21")); + File file21 = new File(basePath + "/2016/05/01/" + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, "id21", BASE_FILE_EXTENSION)); file21.createNewFile(); - File file22 = new File(basePath + "/2016/05/02/" + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, "id22")); + File file22 = new File(basePath + "/2016/05/02/" + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, "id22", BASE_FILE_EXTENSION)); file22.createNewFile(); - File file23 = new File(basePath + "/2016/05/06/" + FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, "id23")); + File file23 = new File(basePath + "/2016/05/06/" + 
FSUtils.makeBaseFileName(commitTime2, TEST_WRITE_TOKEN, "id23", BASE_FILE_EXTENSION)); file23.createNewFile(); // Make commit3 - File file31 = new File(basePath + "/2016/05/01/" + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, "id31")); + File file31 = new File(basePath + "/2016/05/01/" + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, "id31", BASE_FILE_EXTENSION)); file31.createNewFile(); - File file32 = new File(basePath + "/2016/05/02/" + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, "id32")); + File file32 = new File(basePath + "/2016/05/02/" + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, "id32", BASE_FILE_EXTENSION)); file32.createNewFile(); - File file33 = new File(basePath + "/2016/05/06/" + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, "id33")); + File file33 = new File(basePath + "/2016/05/06/" + FSUtils.makeBaseFileName(commitTime3, TEST_WRITE_TOKEN, "id33", BASE_FILE_EXTENSION)); file33.createNewFile(); // Do a snapshot copy diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestJdbcbasedSchemaProvider.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestJdbcbasedSchemaProvider.java index 46400dda48da..82588429db5c 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestJdbcbasedSchemaProvider.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestJdbcbasedSchemaProvider.java @@ -37,6 +37,10 @@ import java.sql.PreparedStatement; import java.sql.SQLException; +import static org.apache.hudi.utilities.testutils.JdbcTestUtils.JDBC_DRIVER; +import static org.apache.hudi.utilities.testutils.JdbcTestUtils.JDBC_PASS; +import static org.apache.hudi.utilities.testutils.JdbcTestUtils.JDBC_URL; +import static org.apache.hudi.utilities.testutils.JdbcTestUtils.JDBC_USER; import static org.junit.jupiter.api.Assertions.assertEquals; @Tag("functional") @@ -47,13 +51,13 @@ public class TestJdbcbasedSchemaProvider extends SparkClientFunctionalTestHarnes @BeforeAll public static void init() { - PROPS.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.jdbc.connection.url", "jdbc:h2:mem:test_mem"); - PROPS.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.jdbc.driver.type", "org.h2.Driver"); - PROPS.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.jdbc.username", "sa"); - PROPS.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.jdbc.password", ""); - PROPS.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.jdbc.dbtable", "triprec"); - PROPS.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.jdbc.timeout", "0"); - PROPS.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.jdbc.nullable", "false"); + PROPS.setProperty("hoodie.streamer.schemaprovider.source.schema.jdbc.connection.url", JDBC_URL); + PROPS.setProperty("hoodie.streamer.schemaprovider.source.schema.jdbc.driver.type", JDBC_DRIVER); + PROPS.setProperty("hoodie.streamer.schemaprovider.source.schema.jdbc.username", JDBC_USER); + PROPS.setProperty("hoodie.streamer.schemaprovider.source.schema.jdbc.password", JDBC_PASS); + PROPS.setProperty("hoodie.streamer.schemaprovider.source.schema.jdbc.dbtable", "triprec"); + PROPS.setProperty("hoodie.streamer.schemaprovider.source.schema.jdbc.timeout", "0"); + PROPS.setProperty("hoodie.streamer.schemaprovider.source.schema.jdbc.nullable", "false"); } @Test @@ -73,7 +77,7 @@ public void testJdbcbasedSchemaProvider() throws Exception { * @throws SQLException */ private void 
initH2Database() throws SQLException { - try (Connection conn = DriverManager.getConnection("jdbc:h2:mem:test_mem", "sa", "")) { + try (Connection conn = DriverManager.getConnection(JDBC_URL, JDBC_USER, JDBC_PASS)) { PreparedStatement ps = conn.prepareStatement(UtilitiesTestBase.Helpers.readFile("streamer-config/triprec.sql")); ps.executeUpdate(); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/schema/TestKafkaOffsetPostProcessor.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/schema/TestKafkaOffsetPostProcessor.java new file mode 100644 index 000000000000..aac441609ca3 --- /dev/null +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/schema/TestKafkaOffsetPostProcessor.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.schema; + +import org.apache.avro.Schema; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class TestKafkaOffsetPostProcessor { + private static final List + EXPECTED_FIELD_NAMES = Arrays.asList("existing_field", "_hoodie_kafka_source_offset", "_hoodie_kafka_source_partition", "_hoodie_kafka_source_timestamp", "_hoodie_kafka_source_key"); + + @ParameterizedTest + @MethodSource("cases") + void testProcessSchema(Schema inputSchema) { + KafkaOffsetPostProcessor kafkaOffsetPostProcessor = new KafkaOffsetPostProcessor(null, null); + Schema actual = kafkaOffsetPostProcessor.processSchema(inputSchema); + List actualFieldNames = actual.getFields().stream().map(Schema.Field::name).collect(Collectors.toList()); + assertEquals(EXPECTED_FIELD_NAMES, actualFieldNames); + } + + private static Stream cases() { + String offsetField = "{\"name\": \"_hoodie_kafka_source_offset\", \"type\": \"long\", \"doc\": \"offset column\", \"default\": 0}"; + String partitionField = "{\"name\": \"_hoodie_kafka_source_partition\", \"type\": \"int\", \"doc\": \"partition column\", \"default\": 0}"; + String timestampField = "{\"name\": \"_hoodie_kafka_source_timestamp\", \"type\": \"long\", \"doc\": \"timestamp column\", \"default\": 0}"; + String keyField = "{\"name\": \"_hoodie_kafka_source_key\", \"type\": [\"null\", \"string\"], \"doc\": \"kafka key column\", \"default\": null}"; + return Stream.of( + Arguments.of(new Schema.Parser().parse("{\"type\": \"record\", \"name\": \"example\", \"fields\": [{\"name\": \"existing_field\", \"type\": \"string\"}]}")), + Arguments.of(new Schema.Parser().parse("{\"type\": \"record\", \"name\": \"example\", \"fields\": 
[{\"name\": \"existing_field\", \"type\": \"string\"}, " + + offsetField + "]}")), + Arguments.of(new Schema.Parser().parse("{\"type\": \"record\", \"name\": \"example\", \"fields\": [{\"name\": \"existing_field\", \"type\": \"string\"}, " + + offsetField + ", " + partitionField + "]}")), + Arguments.of( + new Schema.Parser().parse("{\"type\": \"record\", \"name\": \"example\", \"fields\": [{\"name\": \"existing_field\", \"type\": \"string\"}, " + + offsetField + ", " + partitionField + ", " + timestampField + "]}")), + Arguments.of( + new Schema.Parser().parse("{\"type\": \"record\", \"name\": \"example\", \"fields\": [{\"name\": \"existing_field\", \"type\": \"string\"}, " + + offsetField + ", " + partitionField + ", " + timestampField + ", " + keyField + "]}")) + ); + } +} diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/schema/TestSchemaRegistryProvider.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/schema/TestSchemaRegistryProvider.java index 397e72a0ec4a..88f67723c858 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/schema/TestSchemaRegistryProvider.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/schema/TestSchemaRegistryProvider.java @@ -64,10 +64,10 @@ private static Schema getExpectedConvertedSchema() { private static TypedProperties getProps() { return new TypedProperties() { { - put("hoodie.deltastreamer.schemaprovider.registry.baseUrl", "http://" + BASIC_AUTH + "@localhost"); - put("hoodie.deltastreamer.schemaprovider.registry.urlSuffix", "-value"); - put("hoodie.deltastreamer.schemaprovider.registry.url", "http://foo:bar@localhost"); - put("hoodie.deltastreamer.source.kafka.topic", "foo"); + put("hoodie.streamer.schemaprovider.registry.baseUrl", "http://" + BASIC_AUTH + "@localhost"); + put("hoodie.streamer.schemaprovider.registry.urlSuffix", "-value"); + put("hoodie.streamer.schemaprovider.registry.url", "http://foo:bar@localhost"); + put("hoodie.streamer.source.kafka.topic", "foo"); } }; } @@ -102,8 +102,8 @@ public void testGetTargetSchemaShouldRequestSchemaWithCreds() throws IOException @Test public void testGetSourceSchemaShouldRequestSchemaWithoutCreds() throws IOException { TypedProperties props = getProps(); - props.put("hoodie.deltastreamer.schemaprovider.registry.url", "http://localhost"); - props.put("hoodie.deltastreamer.schemaprovider.registry.schemaconverter", DummySchemaConverter.class.getName()); + props.put("hoodie.streamer.schemaprovider.registry.url", "http://localhost"); + props.put("hoodie.streamer.schemaprovider.registry.schemaconverter", DummySchemaConverter.class.getName()); SchemaRegistryProvider spyUnderTest = getUnderTest(props); Schema actual = spyUnderTest.getSourceSchema(); assertNotNull(actual); @@ -114,8 +114,8 @@ public void testGetSourceSchemaShouldRequestSchemaWithoutCreds() throws IOExcept @Test public void testGetTargetSchemaShouldRequestSchemaWithoutCreds() throws IOException { TypedProperties props = getProps(); - props.put("hoodie.deltastreamer.schemaprovider.registry.url", "http://localhost"); - props.put("hoodie.deltastreamer.schemaprovider.registry.schemaconverter", DummySchemaConverter.class.getName()); + props.put("hoodie.streamer.schemaprovider.registry.url", "http://localhost"); + props.put("hoodie.streamer.schemaprovider.registry.schemaconverter", DummySchemaConverter.class.getName()); SchemaRegistryProvider spyUnderTest = getUnderTest(props); Schema actual = spyUnderTest.getTargetSchema(); assertNotNull(actual); diff --git 
a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/BaseTestKafkaSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/BaseTestKafkaSource.java index b5cbf2738f65..c5fc7bfaafae 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/BaseTestKafkaSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/BaseTestKafkaSource.java @@ -28,6 +28,8 @@ import org.apache.hudi.utilities.ingestion.HoodieIngestionMetrics; import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.streamer.SourceFormatAdapter; +import org.apache.hudi.utilities.streamer.SourceProfile; +import org.apache.hudi.utilities.streamer.SourceProfileSupplier; import org.apache.avro.generic.GenericRecord; import org.apache.kafka.clients.consumer.ConsumerConfig; @@ -52,6 +54,7 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; /** * Generic tests for all {@link KafkaSource} to ensure all implementations properly handle offsets, fetch limits, failure modes, etc. @@ -60,6 +63,7 @@ abstract class BaseTestKafkaSource extends SparkClientFunctionalTestHarness { protected static final String TEST_TOPIC_PREFIX = "hoodie_test_"; protected final HoodieIngestionMetrics metrics = mock(HoodieIngestionMetrics.class); + protected final Option sourceProfile = Option.of(mock(SourceProfileSupplier.class)); protected SchemaProvider schemaProvider; protected KafkaTestUtils testUtils; @@ -165,7 +169,7 @@ public void testProtoKafkaSourceInsertRecordsLessSourceLimit() { testUtils.createTopic(topic, 2); TypedProperties props = createPropsForKafkaSource(topic, Long.MAX_VALUE, "earliest"); SourceFormatAdapter kafkaSource = createSource(props); - props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents", "500"); + props.setProperty("hoodie.streamer.kafka.source.maxEvents", "500"); /* 1. maxEventsFromKafkaSourceProp set to more than generated insert records @@ -277,4 +281,51 @@ public void testFailOnDataLoss() throws Exception { + " either the data was aged out by Kafka or the topic may have been deleted before all the data in the topic was processed.", t.getMessage()); } + + @Test + public void testKafkaSourceWithOffsetsFromSourceProfile() { + // topic setup. + final String topic = TEST_TOPIC_PREFIX + "testKafkaSourceWithOffsetRanges"; + testUtils.createTopic(topic, 2); + TypedProperties props = createPropsForKafkaSource(topic, null, "earliest"); + + when(sourceProfile.get().getSourceProfile()).thenReturn(new TestSourceProfile(Long.MAX_VALUE, 4, 500)); + SourceFormatAdapter kafkaSource = createSource(props); + + // Test for empty data. + assertEquals(Option.empty(), kafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE).getBatch()); + + // Publish messages and assert source has picked up all messages in offsetRanges supplied by input batch profile. 
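// Editorial note (not part of the patch): the TestSourceProfile mocked above caps the
// batch at 500 events across 4 source partitions with an effectively unlimited byte
// budget (Long.MAX_VALUE). That is why the fetch below is expected to return exactly
// 500 records even though 1000 messages are published and a 900-record source limit is
// passed in, which is consistent with the profile's source-specific context, rather
// than the caller-supplied limit, driving the offset ranges read from Kafka.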
+ sendMessagesToKafka(topic, 1000, 2); + InputBatch> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 900); + assertEquals(500, fetch1.getBatch().get().count()); + } + + static class TestSourceProfile implements SourceProfile { + + private final long maxSourceBytes; + private final int sourcePartitions; + private final long numEvents; + + public TestSourceProfile(long maxSourceBytes, int sourcePartitions, long numEvents) { + this.maxSourceBytes = maxSourceBytes; + this.sourcePartitions = sourcePartitions; + this.numEvents = numEvents; + } + + @Override + public long getMaxSourceBytes() { + return maxSourceBytes; + } + + @Override + public int getSourcePartitions() { + return sourcePartitions; + } + + @Override + public Long getSourceSpecificContext() { + return numEvents; + } + } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestAvroDFSSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestAvroDFSSource.java index 1cda910b707b..808a4ca57cea 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestAvroDFSSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestAvroDFSSource.java @@ -39,9 +39,8 @@ public void setup() throws Exception { } @Override - protected Source prepareDFSSource() { - TypedProperties props = new TypedProperties(); - props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsRoot); + protected Source prepareDFSSource(TypedProperties props) { + props.setProperty("hoodie.streamer.source.dfs.root", dfsRoot); try { return new AvroDFSSource(props, jsc, sparkSession, schemaProvider); } catch (IOException e) { diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestAvroKafkaSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestAvroKafkaSource.java index 558181f42586..497757ab3787 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestAvroKafkaSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestAvroKafkaSource.java @@ -97,11 +97,11 @@ public void tearDown() { protected TypedProperties createPropsForKafkaSource(String topic, Long maxEventsToReadFromKafkaSource, String resetStrategy) { TypedProperties props = new TypedProperties(); - props.setProperty("hoodie.deltastreamer.source.kafka.topic", topic); + props.setProperty("hoodie.streamer.source.kafka.topic", topic); props.setProperty("bootstrap.servers", testUtils.brokerAddress()); props.setProperty("auto.offset.reset", resetStrategy); props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); - props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents", + props.setProperty("hoodie.streamer.kafka.source.maxEvents", maxEventsToReadFromKafkaSource != null ? 
String.valueOf(maxEventsToReadFromKafkaSource) : String.valueOf(KafkaSourceConfig.MAX_EVENTS_FROM_KAFKA_SOURCE.defaultValue())); props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, UUID.randomUUID().toString()); @@ -160,8 +160,8 @@ public void testAppendKafkaOffsets() throws IOException { "test", dataGen.generateGenericRecord()); JavaRDD> rdd = jsc().parallelize(Arrays.asList(recordConsumerRecord)); TypedProperties props = new TypedProperties(); - props.put("hoodie.deltastreamer.source.kafka.topic", "test"); - props.put("hoodie.deltastreamer.schemaprovider.source.schema.file", SCHEMA_PATH); + props.put("hoodie.streamer.source.kafka.topic", "test"); + props.put("hoodie.streamer.schemaprovider.source.schema.file", SCHEMA_PATH); SchemaProvider schemaProvider = UtilHelpers.wrapSchemaProviderWithPostProcessor( UtilHelpers.createSchemaProvider(FilebasedSchemaProvider.class.getName(), props, jsc()), props, jsc(), new ArrayList<>()); @@ -191,11 +191,11 @@ public void testAppendKafkaOffsetsSourceFormatAdapter() throws IOException { final String topic = TEST_TOPIC_PREFIX + "testKafkaOffsetAppend"; TypedProperties props = createPropsForKafkaSource(topic, null, "earliest"); - props.put("hoodie.deltastreamer.schemaprovider.source.schema.file", SCHEMA_PATH); + props.put("hoodie.streamer.schemaprovider.source.schema.file", SCHEMA_PATH); SchemaProvider schemaProvider = UtilHelpers.wrapSchemaProviderWithPostProcessor( UtilHelpers.createSchemaProvider(FilebasedSchemaProvider.class.getName(), props, jsc()), props, jsc(), new ArrayList<>()); - props.put("hoodie.deltastreamer.source.kafka.value.deserializer.class", ByteArrayDeserializer.class.getName()); + props.put("hoodie.streamer.source.kafka.value.deserializer.class", ByteArrayDeserializer.class.getName()); int numPartitions = 2; int numMessages = 30; testUtils.createTopic(topic,numPartitions); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestCsvDFSSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestCsvDFSSource.java index 8eaa1d95b239..c4bb59ff812f 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestCsvDFSSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestCsvDFSSource.java @@ -46,11 +46,10 @@ public void setup() throws Exception { } @Override - public Source prepareDFSSource() { - TypedProperties props = new TypedProperties(); - props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsRoot); - props.setProperty("hoodie.deltastreamer.csv.header", Boolean.toString(true)); - props.setProperty("hoodie.deltastreamer.csv.sep", "\t"); + public Source prepareDFSSource(TypedProperties props) { + props.setProperty("hoodie.streamer.source.dfs.root", dfsRoot); + props.setProperty("hoodie.streamer.csv.header", Boolean.toString(true)); + props.setProperty("hoodie.streamer.csv.sep", "\t"); return new CsvDFSSource(props, jsc, sparkSession, schemaProvider); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestGcsEventsHoodieIncrSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestGcsEventsHoodieIncrSource.java index bc2906d251fc..3b018473dc4b 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestGcsEventsHoodieIncrSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestGcsEventsHoodieIncrSource.java @@ -36,6 +36,7 @@ import org.apache.hudi.config.HoodieCleanConfig; import org.apache.hudi.config.HoodieWriteConfig; import 
org.apache.hudi.testutils.SparkClientFunctionalTestHarness; +import org.apache.hudi.utilities.config.CloudSourceConfig; import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.sources.helpers.CloudDataFetcher; @@ -59,6 +60,7 @@ import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.CsvSource; +import org.junit.jupiter.params.provider.ValueSource; import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.MockitoAnnotations; @@ -85,6 +87,7 @@ public class TestGcsEventsHoodieIncrSource extends SparkClientFunctionalTestHarn private static final Schema GCS_METADATA_SCHEMA = SchemaTestUtil.getSchemaFromResource( TestGcsEventsHoodieIncrSource.class, "/streamer-config/gcs-metadata.avsc", true); + private static final String IGNORE_FILE_EXTENSION = ".ignore"; private ObjectMapper mapper = new ObjectMapper(); @@ -111,8 +114,8 @@ public void setUp() throws IOException { jsc = JavaSparkContext.fromSparkContext(spark().sparkContext()); String schemaFilePath = TestGcsEventsHoodieIncrSource.class.getClassLoader().getResource("schema/sample_gcs_data.avsc").getPath(); TypedProperties props = new TypedProperties(); - props.put("hoodie.deltastreamer.schemaprovider.source.schema.file", schemaFilePath); - props.put("hoodie.deltastreamer.schema.provider.class.name", FilebasedSchemaProvider.class.getName()); + props.put("hoodie.streamer.schemaprovider.source.schema.file", schemaFilePath); + props.put("hoodie.streamer.schema.provider.class.name", FilebasedSchemaProvider.class.getName()); this.schemaProvider = Option.of(new FilebasedSchemaProvider(props, jsc)); MockitoAnnotations.initMocks(this); } @@ -195,28 +198,44 @@ public void largeBootstrapWithFilters() throws IOException { readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of("1#path/to/file10006.json"), 250L, "1#path/to/file10007.json"); } - @Test - public void testTwoFilesAndContinueAcrossCommits() throws IOException { + @ParameterizedTest + @ValueSource(strings = { + ".json", + ".gz" + }) + public void testTwoFilesAndContinueAcrossCommits(String extension) throws IOException { String commitTimeForWrites = "2"; String commitTimeForReads = "1"; Pair> inserts = writeGcsMetadataRecords(commitTimeForWrites); + + TypedProperties typedProperties = setProps(READ_UPTO_LATEST_COMMIT); + // In the case the extension is explicitly set to something other than the file format. + if (!extension.endsWith("json")) { + typedProperties.setProperty(CloudSourceConfig.CLOUD_DATAFILE_EXTENSION.key(), extension); + } + List> filePathSizeAndCommitTime = new ArrayList<>(); - // Add file paths and sizes to the list - filePathSizeAndCommitTime.add(Triple.of("path/to/file1.json", 100L, "1")); - filePathSizeAndCommitTime.add(Triple.of("path/to/file3.json", 200L, "1")); - filePathSizeAndCommitTime.add(Triple.of("path/to/file2.json", 150L, "1")); - filePathSizeAndCommitTime.add(Triple.of("path/to/file4.json", 50L, "2")); - filePathSizeAndCommitTime.add(Triple.of("path/to/file5.json", 150L, "2")); + // Add file paths and sizes to the list. + // Check with a couple of invalid file extensions to ensure they are filtered out. 
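// Editorial note (not part of the patch): entries ending in IGNORE_FILE_EXTENSION
// (".ignore") are expected to be dropped by the object-metadata filtering, while the
// parameterized extension (".json" or ".gz") is what the checkpoints asserted below are
// built from; for non-json extensions the test explicitly sets
// CloudSourceConfig.CLOUD_DATAFILE_EXTENSION above so the fetcher still selects the
// intended files.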
+ filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file1%s", extension), 100L, "1")); + filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file2%s", IGNORE_FILE_EXTENSION), 800L, "1")); + filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file3%s", extension), 200L, "1")); + filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file2%s", extension), 150L, "1")); + filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file4%s", extension), 50L, "2")); + filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file4%s", IGNORE_FILE_EXTENSION), 200L, "2")); + filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file5%s", extension), 150L, "2")); Dataset inputDs = generateDataset(filePathSizeAndCommitTime); setMockQueryRunner(inputDs); - readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of(commitTimeForReads), 100L, "1#path/to/file1.json"); - readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of("1#path/to/file1.json"), 100L, "1#path/to/file2.json"); - readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of("1#path/to/file2.json"), 1000L, "2#path/to/file5.json"); - readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of(commitTimeForReads), 100L, "1#path/to/file1.json"); + readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of("1"), 100L, + "1#path/to/file1" + extension, typedProperties); + readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of("1#path/to/file1" + extension), 100L, + "1#path/to/file2" + extension, typedProperties); + readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of("1#path/to/file2" + extension), 1000L, + "2#path/to/file5" + extension, typedProperties); } @ParameterizedTest @@ -244,14 +263,14 @@ public void testSplitSnapshotLoad(String snapshotCheckPoint, String exptected1, setMockQueryRunner(inputDs, Option.of(snapshotCheckPoint)); TypedProperties typedProperties = setProps(READ_UPTO_LATEST_COMMIT); - typedProperties.setProperty("hoodie.deltastreamer.source.cloud.data.ignore.relpath.prefix", "path/to/skip"); + typedProperties.setProperty("hoodie.streamer.source.cloud.data.ignore.relpath.prefix", "path/to/skip"); //1. snapshot query, read all records readAndAssert(READ_UPTO_LATEST_COMMIT, Option.empty(), 50000L, exptected1, typedProperties); //2. incremental query, as commit is present in timeline readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of(exptected1), 10L, exptected2, typedProperties); //3. snapshot query with source limit less than first commit size readAndAssert(READ_UPTO_LATEST_COMMIT, Option.empty(), 50L, exptected3, typedProperties); - typedProperties.setProperty("hoodie.deltastreamer.source.cloud.data.ignore.relpath.prefix", "path/to"); + typedProperties.setProperty("hoodie.streamer.source.cloud.data.ignore.relpath.prefix", "path/to"); //4. As snapshotQuery will return 1 -> same would be return as nextCheckpoint (dataset is empty due to ignore prefix). 
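// Illustrative aside, not part of the patch: a minimal sketch of the extension-based
// filtering these parameterized tests exercise. Paths whose suffix does not match the
// configured data-file extension (".json" or ".gz" here) are expected to be dropped, which
// is why the ".ignore" entries above should never surface in a checkpoint. The helper and
// class names are hypothetical; this is not the Hudi cloud-source implementation.
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class ExtensionFilterSketch {

  // Keep only object paths that end with the configured data-file extension.
  static List<String> filterByExtension(List<String> paths, String configuredExtension) {
    return paths.stream()
        .filter(path -> path.endsWith(configuredExtension))
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    List<String> paths = Arrays.asList(
        "path/to/file1.json", "path/to/file2.ignore", "path/to/file3.json");
    // Prints [path/to/file1.json, path/to/file3.json]; the .ignore entry is filtered out.
    System.out.println(filterByExtension(paths, ".json"));
  }
}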
readAndAssert(READ_UPTO_LATEST_COMMIT, Option.empty(), 50L, exptected4, typedProperties); } @@ -283,7 +302,7 @@ private void readAndAssert(IncrSourceHelper.MissingCheckpointStrategy missingChe TypedProperties typedProperties) { GcsEventsHoodieIncrSource incrSource = new GcsEventsHoodieIncrSource(typedProperties, jsc(), - spark(), schemaProvider.orElse(null), new GcsObjectMetadataFetcher(typedProperties, "json"), gcsObjectDataFetcher, queryRunner); + spark(), schemaProvider.orElse(null), new GcsObjectMetadataFetcher(typedProperties), gcsObjectDataFetcher, queryRunner); Pair>, String> dataAndCheckpoint = incrSource.fetchNextBatch(checkpointToPull, sourceLimit); @@ -297,7 +316,7 @@ private void readAndAssert(IncrSourceHelper.MissingCheckpointStrategy missingChe private void readAndAssert(IncrSourceHelper.MissingCheckpointStrategy missingCheckpointStrategy, Option checkpointToPull, long sourceLimit, String expectedCheckpoint) { TypedProperties typedProperties = setProps(missingCheckpointStrategy); - typedProperties.put("hoodie.deltastreamer.source.hoodieincr.file.format", "json"); + typedProperties.put("hoodie.streamer.source.hoodieincr.file.format", "json"); readAndAssert(missingCheckpointStrategy, checkpointToPull, sourceLimit, expectedCheckpoint, typedProperties); } @@ -369,12 +388,12 @@ private Pair> writeGcsMetadataRecords(String commitTi private TypedProperties setProps(IncrSourceHelper.MissingCheckpointStrategy missingCheckpointStrategy) { Properties properties = new Properties(); //String schemaFilePath = TestGcsEventsHoodieIncrSource.class.getClassLoader().getResource("schema/sample_gcs_data.avsc").getPath(); - //properties.put("hoodie.deltastreamer.schemaprovider.source.schema.file", schemaFilePath); - properties.put("hoodie.deltastreamer.schema.provider.class.name", FilebasedSchemaProvider.class.getName()); - properties.setProperty("hoodie.deltastreamer.source.hoodieincr.path", basePath()); - properties.setProperty("hoodie.deltastreamer.source.hoodieincr.missing.checkpoint.strategy", + //properties.put("hoodie.streamer.schemaprovider.source.schema.file", schemaFilePath); + properties.put("hoodie.streamer.schema.provider.class.name", FilebasedSchemaProvider.class.getName()); + properties.setProperty("hoodie.streamer.source.hoodieincr.path", basePath()); + properties.setProperty("hoodie.streamer.source.hoodieincr.missing.checkpoint.strategy", missingCheckpointStrategy.name()); - properties.setProperty("hoodie.deltastreamer.source.gcsincr.datafile.format", "json"); + properties.setProperty(CloudSourceConfig.DATAFILE_FORMAT.key(), "json"); return new TypedProperties(properties); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestHoodieIncrSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestHoodieIncrSource.java index b9e20fb3a192..3d9f3362a15e 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestHoodieIncrSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestHoodieIncrSource.java @@ -337,8 +337,8 @@ private void readAndAssert(IncrSourceHelper.MissingCheckpointStrategy missingChe String expectedCheckpoint, Option snapshotCheckPointImplClassOpt) { Properties properties = new Properties(); - properties.setProperty("hoodie.deltastreamer.source.hoodieincr.path", basePath()); - properties.setProperty("hoodie.deltastreamer.source.hoodieincr.missing.checkpoint.strategy", missingCheckpointStrategy.name()); + properties.setProperty("hoodie.streamer.source.hoodieincr.path", 
basePath()); + properties.setProperty("hoodie.streamer.source.hoodieincr.missing.checkpoint.strategy", missingCheckpointStrategy.name()); // TODO: [HUDI-7081] get rid of this properties.setProperty(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(), "false"); snapshotCheckPointImplClassOpt.map(className -> diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJdbcSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJdbcSource.java index 4c8b264fe168..ade781e6c8bd 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJdbcSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJdbcSource.java @@ -46,6 +46,10 @@ import java.sql.SQLException; import java.util.stream.Collectors; +import static org.apache.hudi.utilities.testutils.JdbcTestUtils.JDBC_DRIVER; +import static org.apache.hudi.utilities.testutils.JdbcTestUtils.JDBC_PASS; +import static org.apache.hudi.utilities.testutils.JdbcTestUtils.JDBC_URL; +import static org.apache.hudi.utilities.testutils.JdbcTestUtils.JDBC_USER; import static org.apache.hudi.utilities.testutils.JdbcTestUtils.clearAndInsert; import static org.apache.hudi.utilities.testutils.JdbcTestUtils.close; import static org.apache.hudi.utilities.testutils.JdbcTestUtils.count; @@ -73,12 +77,12 @@ public static void beforeAll() throws Exception { @BeforeEach public void setup() throws Exception { super.setup(); - PROPS.setProperty("hoodie.deltastreamer.jdbc.url", "jdbc:h2:mem:test_mem"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.driver.class", "org.h2.Driver"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.user", "test"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.password", "jdbc"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.table.name", "triprec"); - connection = DriverManager.getConnection("jdbc:h2:mem:test_mem", "test", "jdbc"); + PROPS.setProperty("hoodie.streamer.jdbc.url", JDBC_URL); + PROPS.setProperty("hoodie.streamer.jdbc.driver.class", JDBC_DRIVER); + PROPS.setProperty("hoodie.streamer.jdbc.user", JDBC_USER); + PROPS.setProperty("hoodie.streamer.jdbc.password", JDBC_PASS); + PROPS.setProperty("hoodie.streamer.jdbc.table.name", "triprec"); + connection = DriverManager.getConnection(JDBC_URL, JDBC_USER, JDBC_PASS); } @AfterEach @@ -89,8 +93,8 @@ public void teardown() throws Exception { @Test public void testSingleCommit() { - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "true"); + PROPS.setProperty("hoodie.streamer.jdbc.table.incr.column.name", "last_insert"); try { int numRecords = 100; @@ -112,8 +116,8 @@ public void testSingleCommit() { @Test public void testInsertAndUpdate() { - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "true"); + PROPS.setProperty("hoodie.streamer.jdbc.table.incr.column.name", "last_insert"); try { final String commitTime = "000"; @@ -146,8 +150,8 @@ public void testInsertAndUpdate() { @Test public void testTwoCommits() { - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "true"); + PROPS.setProperty("hoodie.streamer.jdbc.table.incr.column.name", 
"last_insert"); try { // Add 10 records with commit time "000" @@ -174,8 +178,8 @@ public void testTwoCommits() { @Test public void testIncrementalFetchWithCommitTime() { - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "true"); + PROPS.setProperty("hoodie.streamer.jdbc.table.incr.column.name", "last_insert"); try { // Add 10 records with commit time "000" @@ -200,8 +204,8 @@ public void testIncrementalFetchWithCommitTime() { @Test public void testIncrementalFetchWithNoMatchingRows() { - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "true"); + PROPS.setProperty("hoodie.streamer.jdbc.table.incr.column.name", "last_insert"); try { // Add 10 records with commit time "000" @@ -222,8 +226,8 @@ public void testIncrementalFetchWithNoMatchingRows() { @Test public void testIncrementalFetchWhenTableRecordsMoreThanSourceLimit() { - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "id"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "true"); + PROPS.setProperty("hoodie.streamer.jdbc.table.incr.column.name", "id"); try { // Add 100 records with commit time "000" @@ -253,8 +257,8 @@ public void testIncrementalFetchWhenTableRecordsMoreThanSourceLimit() { @Test public void testIncrementalFetchWhenLastCheckpointMoreThanTableRecords() { - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "id"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "true"); + PROPS.setProperty("hoodie.streamer.jdbc.table.incr.column.name", "id"); try { // Add 100 records with commit time "000" @@ -280,8 +284,8 @@ public void testIncrementalFetchWhenLastCheckpointMoreThanTableRecords() { @Test public void testIncrementalFetchFallbackToFullFetchWhenError() { - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "true"); + PROPS.setProperty("hoodie.streamer.jdbc.table.incr.column.name", "last_insert"); try { // Add 10 records with commit time "000" @@ -295,14 +299,14 @@ public void testIncrementalFetchFallbackToFullFetchWhenError() { // Add 10 records with commit time "001" insert("001", 10, connection, DATA_GENERATOR, PROPS); - PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "dummy_col"); + PROPS.setProperty("hoodie.streamer.jdbc.table.incr.column.name", "dummy_col"); assertThrows(HoodieException.class, () -> { // Start incremental scan with a dummy column that does not exist. // This will throw an exception as the default behavior is to not fallback to full fetch. runSource(Option.of(batch.getCheckpointForNextBatch()), -1); }); - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.fallback.to.full.fetch", "true"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.fallback.to.full.fetch", "true"); // Start incremental scan with a dummy column that does not exist. // This will fallback to full fetch mode but still throw an exception checkpointing will fail. 
@@ -317,7 +321,7 @@ public void testIncrementalFetchFallbackToFullFetchWhenError() { @Test public void testFullFetchWithCommitTime() { - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "false"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "false"); try { // Add 10 records with commit time "000" @@ -341,8 +345,8 @@ public void testFullFetchWithCommitTime() { @Test public void testFullFetchWithCheckpoint() { - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "false"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "false"); + PROPS.setProperty("hoodie.streamer.jdbc.table.incr.column.name", "last_insert"); try { // Add 10 records with commit time "000" @@ -356,7 +360,7 @@ public void testFullFetchWithCheckpoint() { // Get max of incremental column Column incrementalColumn = rowDataset - .col(PROPS.getString("hoodie.deltastreamer.jdbc.table.incr.column.name")); + .col(PROPS.getString("hoodie.streamer.jdbc.table.incr.column.name")); final String max = rowDataset.agg(functions.max(incrementalColumn).cast(DataTypes.StringType)).first() .getString(0); @@ -378,10 +382,10 @@ public void testSourceWithPasswordOnFs() { // Write secret string to fs in a file writeSecretToFs(); // Remove secret string from props - PROPS.remove("hoodie.deltastreamer.jdbc.password"); + PROPS.remove("hoodie.streamer.jdbc.password"); // Set property to read secret from fs file - PROPS.setProperty("hoodie.deltastreamer.jdbc.password.file", "file:///tmp/hudi/config/secret"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "false"); + PROPS.setProperty("hoodie.streamer.jdbc.password.file", "file:///tmp/hudi/config/secret"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "false"); // Add 10 records with commit time 000 clearAndInsert("000", 10, connection, DATA_GENERATOR, PROPS); Dataset rowDataset = runSource(Option.empty(), 10).getBatch().get(); @@ -397,8 +401,8 @@ public void testSourceWithNoPasswordThrowsException() { // Write secret string to fs in a file writeSecretToFs(); // Remove secret string from props - PROPS.remove("hoodie.deltastreamer.jdbc.password"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "false"); + PROPS.remove("hoodie.streamer.jdbc.password"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "false"); // Add 10 records with commit time 000 clearAndInsert("000", 10, connection, DATA_GENERATOR, PROPS); runSource(Option.empty(), 10); @@ -407,9 +411,9 @@ public void testSourceWithNoPasswordThrowsException() { @Test public void testSourceWithExtraOptions() { - PROPS.setProperty("hoodie.deltastreamer.jdbc.extra.options.fetchsize", "10"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "false"); - PROPS.remove("hoodie.deltastreamer.jdbc.table.incr.column.name"); + PROPS.setProperty("hoodie.streamer.jdbc.extra.options.fetchsize", "10"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "false"); + PROPS.remove("hoodie.streamer.jdbc.table.incr.column.name"); try { // Add 20 records with commit time 000 clearAndInsert("000", 20, connection, DATA_GENERATOR, PROPS); @@ -422,8 +426,8 @@ public void testSourceWithExtraOptions() { @Test public void testSourceWithStorageLevel() { - PROPS.setProperty("hoodie.deltastreamer.jdbc.storage.level", "NONE"); - PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "false"); + PROPS.setProperty("hoodie.streamer.jdbc.storage.level", "NONE"); + PROPS.setProperty("hoodie.streamer.jdbc.incr.pull", "false"); 
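// A small sketch of the secret-file mechanism testSourceWithPasswordOnFs covers above:
// hoodie.streamer.jdbc.password is removed and hoodie.streamer.jdbc.password.file points at
// a filesystem path whose contents are used as the password. The reader below is a
// hypothetical stand-in, not Hudi's own property resolution.
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class PasswordFileSketch {

  // Read the secret stored at the configured password file location.
  static String readPassword(String passwordFileUri) throws IOException {
    Path path = new Path(passwordFileUri);
    FileSystem fs = path.getFileSystem(new Configuration());
    try (FSDataInputStream in = fs.open(path);
         ByteArrayOutputStream out = new ByteArrayOutputStream()) {
      IOUtils.copyBytes(in, out, 4096, false);
      return new String(out.toByteArray(), StandardCharsets.UTF_8).trim();
    }
  }

  public static void main(String[] args) throws IOException {
    // Same location the test writes its secret to via writeSecretToFs().
    System.out.println(readPassword("file:///tmp/hudi/config/secret"));
  }
}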
try { // Add 10 records with commit time 000 clearAndInsert("000", 10, connection, DATA_GENERATOR, PROPS); @@ -438,7 +442,7 @@ public void testSourceWithStorageLevel() { private void writeSecretToFs() throws IOException { FileSystem fs = FileSystem.get(new Configuration()); FSDataOutputStream outputStream = fs.create(new Path("file:///tmp/hudi/config/secret")); - outputStream.writeBytes("jdbc"); + outputStream.writeBytes(JDBC_PASS); outputStream.close(); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonDFSSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonDFSSource.java index fde10b2d9a59..ae134e862bea 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonDFSSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonDFSSource.java @@ -20,15 +20,29 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.SchemaCompatibilityException; +import org.apache.hudi.utilities.config.HoodieStreamerConfig; +import org.apache.hudi.utilities.streamer.SourceFormatAdapter; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; import org.apache.hudi.utilities.testutils.sources.AbstractDFSSourceTestBase; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.IOException; +import java.io.PrintStream; import java.util.List; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + /** * Basic tests for {@link JsonDFSSource}. 
*/ @@ -42,9 +56,8 @@ public void setup() throws Exception { } @Override - public Source prepareDFSSource() { - TypedProperties props = new TypedProperties(); - props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsRoot); + public Source prepareDFSSource(TypedProperties props) { + props.setProperty("hoodie.streamer.source.dfs.root", dfsRoot); return new JsonDFSSource(props, jsc, sparkSession, schemaProvider); } @@ -53,4 +66,36 @@ public void writeNewDataToFile(List records, Path path) throws IOE UtilitiesTestBase.Helpers.saveStringsToDFS( Helpers.jsonifyRecords(records), fs, path.toString()); } + + @Test + public void testCorruptedSourceFile() throws IOException { + fs.mkdirs(new Path(dfsRoot)); + TypedProperties props = new TypedProperties(); + props.setProperty(HoodieStreamerConfig.ROW_THROW_EXPLICIT_EXCEPTIONS.key(), "true"); + SourceFormatAdapter sourceFormatAdapter = new SourceFormatAdapter(prepareDFSSource(props), Option.empty(), Option.of(props)); + generateOneFile("1", "000", 10); + generateOneFile("2", "000", 10); + RemoteIterator files = fs.listFiles(generateOneFile("3", "000", 10), true); + + FileStatus file1Status = files.next(); + InputBatch> batch = sourceFormatAdapter.fetchNewDataInRowFormat(Option.empty(), Long.MAX_VALUE); + corruptFile(file1Status.getPath()); + assertTrue(batch.getBatch().isPresent()); + Throwable t = assertThrows(Exception.class, + () -> batch.getBatch().get().show(30)); + while (t != null) { + if (t instanceof SchemaCompatibilityException) { + return; + } + t = t.getCause(); + } + throw new AssertionError("Exception does not have SchemaCompatibility in its trace", t); + } + + protected void corruptFile(Path path) throws IOException { + PrintStream os = new PrintStream(fs.appendFile(path).build()); + os.println("🤷‍"); + os.flush(); + os.close(); + } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonKafkaSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonKafkaSource.java index 14ffd31582a1..4b615c50ee19 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonKafkaSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonKafkaSource.java @@ -23,14 +23,16 @@ import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.testutils.InProcessTimeGenerator; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.InProcessTimeGenerator; import org.apache.hudi.common.util.Option; +import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.config.HoodieStreamerConfig; import org.apache.hudi.utilities.config.KafkaSourceConfig; import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer; import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; import org.apache.hudi.utilities.streamer.BaseErrorTableWriter; +import org.apache.hudi.utilities.streamer.DefaultStreamContext; import org.apache.hudi.utilities.streamer.ErrorEvent; import org.apache.hudi.utilities.streamer.SourceFormatAdapter; @@ -60,10 +62,10 @@ import static org.apache.hudi.config.HoodieErrorTableConfig.ERROR_TABLE_BASE_PATH; import static org.apache.hudi.config.HoodieErrorTableConfig.ERROR_TARGET_TABLE; import static org.apache.hudi.utilities.config.KafkaSourceConfig.ENABLE_KAFKA_COMMIT_OFFSET; +import static 
org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_KEY_COLUMN; import static org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_OFFSET_COLUMN; import static org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_PARTITION_COLUMN; import static org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_TIMESTAMP_COLUMN; -import static org.apache.hudi.utilities.schema.KafkaOffsetPostProcessor.KAFKA_SOURCE_KEY_COLUMN; import static org.apache.hudi.utilities.testutils.UtilitiesTestBase.Helpers.jsonifyRecords; import static org.apache.hudi.utilities.testutils.UtilitiesTestBase.Helpers.jsonifyRecordsByPartitions; import static org.apache.hudi.utilities.testutils.UtilitiesTestBase.Helpers.jsonifyRecordsByPartitionsWithNullKafkaKey; @@ -80,7 +82,7 @@ public class TestJsonKafkaSource extends BaseTestKafkaSource { public void init() throws Exception { String schemaFilePath = Objects.requireNonNull(SCHEMA_FILE_URL).toURI().getPath(); TypedProperties props = new TypedProperties(); - props.put("hoodie.deltastreamer.schemaprovider.source.schema.file", schemaFilePath); + props.put("hoodie.streamer.schemaprovider.source.schema.file", schemaFilePath); schemaProvider = new FilebasedSchemaProvider(props, jsc()); } @@ -91,11 +93,11 @@ TypedProperties createPropsForKafkaSource(String topic, Long maxEventsToReadFrom static TypedProperties createPropsForJsonKafkaSource(String brokerAddress, String topic, Long maxEventsToReadFromKafkaSource, String resetStrategy) { TypedProperties props = new TypedProperties(); - props.setProperty("hoodie.deltastreamer.source.kafka.topic", topic); + props.setProperty("hoodie.streamer.source.kafka.topic", topic); props.setProperty("bootstrap.servers", brokerAddress); props.setProperty("auto.offset.reset", resetStrategy); props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); - props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents", + props.setProperty("hoodie.streamer.kafka.source.maxEvents", maxEventsToReadFromKafkaSource != null ? String.valueOf(maxEventsToReadFromKafkaSource) : String.valueOf(KafkaSourceConfig.MAX_EVENTS_FROM_KAFKA_SOURCE.defaultValue())); props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, UUID.randomUUID().toString()); @@ -104,7 +106,7 @@ static TypedProperties createPropsForJsonKafkaSource(String brokerAddress, Strin @Override SourceFormatAdapter createSource(TypedProperties props) { - return new SourceFormatAdapter(new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics)); + return new SourceFormatAdapter(new JsonKafkaSource(props, jsc(), spark(), metrics, new DefaultStreamContext(schemaProvider, sourceProfile))); } // test whether empty messages can be filtered @@ -350,10 +352,22 @@ public void testAppendKafkaOffset() { jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics); kafkaSource = new SourceFormatAdapter(jsonSource); Dataset dfWithOffsetInfoAndNullKafkaKey = kafkaSource.fetchNewDataInRowFormat(Option.empty(), Long.MAX_VALUE).getBatch().get().cache(); + // total of 2 * numMessages are in the topic at this point, half with a key and half with a null key. All should have the source offset. 
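// The counts asserted just below come from filtering on the Kafka metadata columns that the
// KafkaOffsetPostProcessor appends. A minimal sketch of that kind of inspection on any batch
// fetched in row format; the column names match the filters used in this test, while the
// helper class itself is hypothetical.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class KafkaOffsetColumnsSketch {

  // Count rows with and without a Kafka key, and confirm every row carries a source offset.
  static void summarize(Dataset<Row> batch) {
    long nullKeys = batch.filter("_hoodie_kafka_source_key is null").count();
    long withKeys = batch.filter("_hoodie_kafka_source_key is not null").count();
    long withOffsets = batch.filter("_hoodie_kafka_source_offset is not null").count();
    System.out.printf("null keys=%d, keys=%d, offsets=%d%n", nullKeys, withKeys, withOffsets);
  }
}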
assertEquals(numMessages, dfWithOffsetInfoAndNullKafkaKey.toDF().filter("_hoodie_kafka_source_key is null").count()); + assertEquals(numMessages, dfWithOffsetInfoAndNullKafkaKey.toDF().filter("_hoodie_kafka_source_key is not null").count()); + assertEquals(numMessages * 2, dfWithOffsetInfoAndNullKafkaKey.toDF().filter("_hoodie_kafka_source_offset is not null").count()); dfNoOffsetInfo.unpersist(); dfWithOffsetInfo.unpersist(); dfWithOffsetInfoAndNullKafkaKey.unpersist(); } + + @Test + public void testCreateSource() throws IOException { + final String topic = TEST_TOPIC_PREFIX + "testJsonKafkaSourceCreation"; + testUtils.createTopic(topic, 2); + TypedProperties props = createPropsForKafkaSource(topic, null, "earliest"); + Source jsonKafkaSource = UtilHelpers.createSource(JsonKafkaSource.class.getName(), props, jsc(), spark(), metrics, new DefaultStreamContext(schemaProvider, sourceProfile)); + assertEquals(Source.SourceType.JSON, jsonKafkaSource.getSourceType()); + } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonKafkaSourcePostProcessor.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonKafkaSourcePostProcessor.java index b6bc3480e3d2..1f1a4e2b5c1f 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonKafkaSourcePostProcessor.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestJsonKafkaSourcePostProcessor.java @@ -80,7 +80,7 @@ public static void cleanupClass() { public void init() throws Exception { String schemaFilePath = Objects.requireNonNull(TestJsonKafkaSource.SCHEMA_FILE_URL).toURI().getPath(); TypedProperties props = new TypedProperties(); - props.put("hoodie.deltastreamer.schemaprovider.source.schema.file", schemaFilePath); + props.put("hoodie.streamer.schemaprovider.source.schema.file", schemaFilePath); schemaProvider = new FilebasedSchemaProvider(props, jsc()); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestParquetDFSSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestParquetDFSSource.java index 44489037e823..a9c448748c91 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestParquetDFSSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestParquetDFSSource.java @@ -41,9 +41,8 @@ public void setup() throws Exception { } @Override - public Source prepareDFSSource() { - TypedProperties props = new TypedProperties(); - props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsRoot); + public Source prepareDFSSource(TypedProperties props) { + props.setProperty("hoodie.streamer.source.dfs.root", dfsRoot); return new ParquetDFSSource(props, jsc, sparkSession, schemaProvider); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestProtoKafkaSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestProtoKafkaSource.java index 52376f897419..f96792111445 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestProtoKafkaSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestProtoKafkaSource.java @@ -25,6 +25,7 @@ import org.apache.hudi.utilities.config.ProtoClassBasedSchemaProviderConfig; import org.apache.hudi.utilities.schema.ProtoClassBasedSchemaProvider; import org.apache.hudi.utilities.schema.SchemaProvider; +import org.apache.hudi.utilities.streamer.DefaultStreamContext; import org.apache.hudi.utilities.streamer.SourceFormatAdapter; import 
org.apache.hudi.utilities.test.proto.Nested; import org.apache.hudi.utilities.test.proto.Sample; @@ -74,11 +75,11 @@ public class TestProtoKafkaSource extends BaseTestKafkaSource { protected TypedProperties createPropsForKafkaSource(String topic, Long maxEventsToReadFromKafkaSource, String resetStrategy) { TypedProperties props = new TypedProperties(); - props.setProperty("hoodie.deltastreamer.source.kafka.topic", topic); + props.setProperty("hoodie.streamer.source.kafka.topic", topic); props.setProperty("bootstrap.servers", testUtils.brokerAddress()); props.setProperty("auto.offset.reset", resetStrategy); props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); - props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents", + props.setProperty("hoodie.streamer.kafka.source.maxEvents", maxEventsToReadFromKafkaSource != null ? String.valueOf(maxEventsToReadFromKafkaSource) : String.valueOf(KafkaSourceConfig.MAX_EVENTS_FROM_KAFKA_SOURCE.defaultValue())); props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, UUID.randomUUID().toString()); @@ -89,7 +90,7 @@ protected TypedProperties createPropsForKafkaSource(String topic, Long maxEvents @Override SourceFormatAdapter createSource(TypedProperties props) { this.schemaProvider = new ProtoClassBasedSchemaProvider(props, jsc()); - Source protoKafkaSource = new ProtoKafkaSource(props, jsc(), spark(), schemaProvider, metrics); + Source protoKafkaSource = new ProtoKafkaSource(props, jsc(), spark(), metrics, new DefaultStreamContext(schemaProvider, sourceProfile)); return new SourceFormatAdapter(protoKafkaSource); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestS3EventsHoodieIncrSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestS3EventsHoodieIncrSource.java index 33faac5361f7..a9dd11c55440 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestS3EventsHoodieIncrSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestS3EventsHoodieIncrSource.java @@ -87,6 +87,7 @@ public class TestS3EventsHoodieIncrSource extends SparkClientFunctionalTestHarne private ObjectMapper mapper = new ObjectMapper(); private static final String MY_BUCKET = "some-bucket"; + private static final String IGNORE_FILE_EXTENSION = ".ignore"; private Option schemaProvider; @Mock @@ -104,8 +105,8 @@ public void setUp() throws IOException { metaClient = getHoodieMetaClient(hadoopConf(), basePath()); String schemaFilePath = TestCloudObjectsSelectorCommon.class.getClassLoader().getResource("schema/sample_gcs_data.avsc").getPath(); TypedProperties props = new TypedProperties(); - props.put("hoodie.deltastreamer.schemaprovider.source.schema.file", schemaFilePath); - props.put("hoodie.deltastreamer.schema.provider.class.name", FilebasedSchemaProvider.class.getName()); + props.put("hoodie.streamer.schemaprovider.source.schema.file", schemaFilePath); + props.put("hoodie.streamer.schema.provider.class.name", FilebasedSchemaProvider.class.getName()); this.schemaProvider = Option.of(new FilebasedSchemaProvider(props, jsc)); } @@ -185,10 +186,10 @@ private HoodieRecord generateS3EventMetadata(String commitTime, String bucketNam private TypedProperties setProps(IncrSourceHelper.MissingCheckpointStrategy missingCheckpointStrategy) { Properties properties = new Properties(); - properties.setProperty("hoodie.deltastreamer.source.hoodieincr.path", basePath()); - properties.setProperty("hoodie.deltastreamer.source.hoodieincr.missing.checkpoint.strategy", + 
properties.setProperty("hoodie.streamer.source.hoodieincr.path", basePath()); + properties.setProperty("hoodie.streamer.source.hoodieincr.missing.checkpoint.strategy", missingCheckpointStrategy.name()); - properties.setProperty("hoodie.deltastreamer.source.hoodieincr.file.format", "json"); + properties.setProperty("hoodie.streamer.source.hoodieincr.file.format", "json"); return new TypedProperties(properties); } @@ -308,11 +309,14 @@ public void testTwoFilesAndContinueAcrossCommits(String extension) throws IOExce } List> filePathSizeAndCommitTime = new ArrayList<>(); - // Add file paths and sizes to the list + // Add file paths and sizes to the list. + // Check with a couple of invalid file extensions to ensure they are filtered out. filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file1%s", extension), 100L, "1")); + filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file2%s", IGNORE_FILE_EXTENSION), 800L, "1")); filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file3%s", extension), 200L, "1")); filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file2%s", extension), 150L, "1")); filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file4%s", extension), 50L, "2")); + filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file4%s", IGNORE_FILE_EXTENSION), 200L, "2")); filePathSizeAndCommitTime.add(Triple.of(String.format("path/to/file5%s", extension), 150L, "2")); Dataset inputDs = generateDataset(filePathSizeAndCommitTime); @@ -350,7 +354,7 @@ public void testEmptyDataAfterFilter() throws IOException { setMockQueryRunner(inputDs); TypedProperties typedProperties = setProps(READ_UPTO_LATEST_COMMIT); - typedProperties.setProperty("hoodie.deltastreamer.source.s3incr.ignore.key.prefix", "path/to/skip"); + typedProperties.setProperty("hoodie.streamer.source.s3incr.ignore.key.prefix", "path/to/skip"); readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of("1"), 1000L, "2", typedProperties); readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of("1#path/to/file3.json"), 1000L, "2", typedProperties); @@ -384,7 +388,7 @@ public void testFilterAnEntireCommit() throws IOException { when(mockCloudDataFetcher.getCloudObjectDataDF(Mockito.any(), Mockito.any(), Mockito.any(), eq(schemaProvider))) .thenReturn(Option.empty()); TypedProperties typedProperties = setProps(READ_UPTO_LATEST_COMMIT); - typedProperties.setProperty("hoodie.deltastreamer.source.s3incr.ignore.key.prefix", "path/to/skip"); + typedProperties.setProperty("hoodie.streamer.source.s3incr.ignore.key.prefix", "path/to/skip"); readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of("1"), 50L, "2#path/to/file4.json", typedProperties); } @@ -416,7 +420,7 @@ public void testFilterAnEntireMiddleCommit() throws IOException { when(mockCloudDataFetcher.getCloudObjectDataDF(Mockito.any(), Mockito.any(), Mockito.any(), eq(schemaProvider))) .thenReturn(Option.empty()); TypedProperties typedProperties = setProps(READ_UPTO_LATEST_COMMIT); - typedProperties.setProperty("hoodie.deltastreamer.source.s3incr.ignore.key.prefix", "path/to/skip"); + typedProperties.setProperty("hoodie.streamer.source.s3incr.ignore.key.prefix", "path/to/skip"); readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of("1#path/to/file3.json"), 50L, "3#path/to/file4.json", typedProperties); @@ -453,14 +457,14 @@ public void testSplitSnapshotLoad(String snapshotCheckPoint, String exptected1, when(mockCloudDataFetcher.getCloudObjectDataDF(Mockito.any(), Mockito.any(), Mockito.any(), eq(schemaProvider))) .thenReturn(Option.empty()); 
TypedProperties typedProperties = setProps(READ_UPTO_LATEST_COMMIT); - typedProperties.setProperty("hoodie.deltastreamer.source.s3incr.ignore.key.prefix", "path/to/skip"); + typedProperties.setProperty("hoodie.streamer.source.s3incr.ignore.key.prefix", "path/to/skip"); //1. snapshot query, read all records readAndAssert(READ_UPTO_LATEST_COMMIT, Option.empty(), 50000L, exptected1, typedProperties); //2. incremental query, as commit is present in timeline readAndAssert(READ_UPTO_LATEST_COMMIT, Option.of(exptected1), 10L, exptected2, typedProperties); //3. snapshot query with source limit less than first commit size readAndAssert(READ_UPTO_LATEST_COMMIT, Option.empty(), 50L, exptected3, typedProperties); - typedProperties.setProperty("hoodie.deltastreamer.source.s3incr.ignore.key.prefix", "path/to"); + typedProperties.setProperty("hoodie.streamer.source.s3incr.ignore.key.prefix", "path/to"); //4. As snapshotQuery will return 1 -> same would be return as nextCheckpoint (dataset is empty due to ignore prefix). readAndAssert(READ_UPTO_LATEST_COMMIT, Option.empty(), 50L, exptected4, typedProperties); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestSqlFileBasedSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestSqlFileBasedSource.java index 89769954d386..ee488e38c6ac 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestSqlFileBasedSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestSqlFileBasedSource.java @@ -51,8 +51,8 @@ public class TestSqlFileBasedSource extends UtilitiesTestBase { private final boolean useFlattenedSchema = false; - private final String sqlFileSourceConfig = "hoodie.deltastreamer.source.sql.file"; - private final String sqlFileSourceConfigEmitChkPointConf = "hoodie.deltastreamer.source.sql.checkpoint.emit"; + private final String sqlFileSourceConfig = "hoodie.streamer.source.sql.file"; + private final String sqlFileSourceConfigEmitChkPointConf = "hoodie.streamer.source.sql.checkpoint.emit"; protected FilebasedSchemaProvider schemaProvider; protected HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); private String dfsRoot; diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestSqlSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestSqlSource.java index 64578f3bae36..a738003a3fcd 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestSqlSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestSqlSource.java @@ -50,7 +50,7 @@ public class TestSqlSource extends UtilitiesTestBase { private final boolean useFlattenedSchema = false; - private final String sqlSourceConfig = "hoodie.deltastreamer.source.sql.sql.query"; + private final String sqlSourceConfig = "hoodie.streamer.source.sql.sql.query"; protected FilebasedSchemaProvider schemaProvider; protected HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); private String dfsRoot; diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestAbstractDebeziumSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestAbstractDebeziumSource.java index c9f46144e96a..a57383c43b24 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestAbstractDebeziumSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestAbstractDebeziumSource.java @@ -86,12 +86,12 
@@ public static void cleanupClass() throws IOException { private TypedProperties createPropsForJsonSource() { TypedProperties props = new TypedProperties(); - props.setProperty("hoodie.deltastreamer.source.kafka.topic", testTopicName); + props.setProperty("hoodie.streamer.source.kafka.topic", testTopicName); props.setProperty("bootstrap.servers", testUtils.brokerAddress()); props.setProperty("auto.offset.reset", "earliest"); props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); - props.setProperty("hoodie.deltastreamer.schemaprovider.registry.url", "localhost"); - props.setProperty("hoodie.deltastreamer.source.kafka.value.deserializer.class", StringDeserializer.class.getName()); + props.setProperty("hoodie.streamer.schemaprovider.registry.url", "localhost"); + props.setProperty("hoodie.streamer.source.kafka.value.deserializer.class", StringDeserializer.class.getName()); props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, UUID.randomUUID().toString()); return props; diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestCheckpointUtils.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestCheckpointUtils.java index 49e27d0191bd..7e8b263de331 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestCheckpointUtils.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestCheckpointUtils.java @@ -24,11 +24,13 @@ import org.apache.spark.streaming.kafka010.OffsetRange; import org.junit.jupiter.api.Test; +import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; /** @@ -103,26 +105,35 @@ public void testComputeOffsetRangesWithoutMinPartitions() { ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {200000, 250000}), makeOffsetMap(new int[] {0, 1, 2}, new long[] {200010, 350000, 10000}), 100000, 0); assertEquals(100000, CheckpointUtils.totalNewMessages(ranges)); + assertEquals(5, ranges.length); + assertEquals(0, ranges[0].partition()); assertEquals(10, ranges[0].count()); - assertEquals(89990, ranges[1].count()); - assertEquals(10000, ranges[2].count()); + assertEquals(1, ranges[1].partition()); + assertEquals(33333, ranges[1].count()); + assertEquals(33333, ranges[2].count()); + assertEquals(23324, ranges[3].count()); + assertEquals(2, ranges[4].partition()); + assertEquals(10000, ranges[4].count()); ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {200000, 250000}), makeOffsetMap(new int[] {0, 1, 2}, new long[] {200010, 350000, 10000}), 1000000, 0); assertEquals(110010, CheckpointUtils.totalNewMessages(ranges)); assertEquals(10, ranges[0].count()); - assertEquals(100000, ranges[1].count()); - assertEquals(10000, ranges[2].count()); + assertEquals(36670, ranges[1].count()); + assertEquals(36670, ranges[2].count()); + assertEquals(26660, ranges[3].count()); + assertEquals(10000, ranges[4].count()); // not all partitions consume same entries. 
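// A hedged sketch of the allocation behavior the updated expectations in this file encode,
// including testNumAllocatedEventsGreaterThanNumActualEvents further down: with minPartitions
// set, each emitted range holds at most roughly numEvents / minPartitions messages, so one
// Kafka partition with a large backlog is split into several consecutive ranges. The splitter
// below is a simplified stand-in for CheckpointUtils.computeOffsetRanges, not its implementation.
import java.util.ArrayList;
import java.util.List;

public class OffsetRangeSplitSketch {

  // Split [from, until) for a single topic partition into chunks of at most maxPerRange offsets.
  static List<long[]> split(long from, long until, long maxPerRange) {
    List<long[]> ranges = new ArrayList<>();
    long start = from;
    while (start < until) {
      long end = Math.min(start + maxPerRange, until);
      ranges.add(new long[] {start, end});
      start = end;
    }
    return ranges;
  }

  public static void main(String[] args) {
    long eventsPerPartition = 400000 / 20; // numEvents / minPartitions in the new test below
    // Partition 0 there is allotted 100000 events (76888767 to 76988767), which splits into
    // the five 20000-message ranges listed in expectedRanges.
    for (long[] r : split(76888767L, 76988767L, eventsPerPartition)) {
      System.out.println(r[0] + " -> " + r[1]);
    }
  }
}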
ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1, 2, 3, 4}, new long[] {0, 0, 0, 0, 0}), makeOffsetMap(new int[] {0, 1, 2, 3, 4}, new long[] {100, 1000, 1000, 1000, 1000}), 1001, 0); assertEquals(1001, CheckpointUtils.totalNewMessages(ranges)); assertEquals(100, ranges[0].count()); - assertEquals(226, ranges[1].count()); - assertEquals(225, ranges[2].count()); - assertEquals(225, ranges[3].count()); - assertEquals(225, ranges[4].count()); + assertEquals(200, ranges[1].count()); + assertEquals(101, ranges[2].count()); + assertEquals(200, ranges[3].count()); + assertEquals(200, ranges[4].count()); + assertEquals(200, ranges[5].count()); } @Test @@ -166,38 +177,44 @@ public void testComputeOffsetRangesWithMinPartitions() { // N skewed TopicPartitions to M offset ranges ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {0, 0}), makeOffsetMap(new int[] {0, 1}, new long[] {100, 500}), 600, 3); - assertEquals(3, ranges.length); + assertEquals(4, ranges.length); assertEquals(0, ranges[0].fromOffset()); assertEquals(100, ranges[0].untilOffset()); assertEquals(0, ranges[1].fromOffset()); - assertEquals(250, ranges[1].untilOffset()); - assertEquals(250, ranges[2].fromOffset()); - assertEquals(500, ranges[2].untilOffset()); + assertEquals(200, ranges[1].untilOffset()); + assertEquals(200, ranges[2].fromOffset()); + assertEquals(400, ranges[2].untilOffset()); + assertEquals(400, ranges[3].fromOffset()); + assertEquals(500, ranges[3].untilOffset()); // range inexact multiple of minPartitions ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0}, new long[] {0}), makeOffsetMap(new int[] {0}, new long[] {100}), 600, 3); - assertEquals(3, ranges.length); + assertEquals(4, ranges.length); assertEquals(0, ranges[0].fromOffset()); - assertEquals(34, ranges[0].untilOffset()); - assertEquals(34, ranges[1].fromOffset()); - assertEquals(67, ranges[1].untilOffset()); - assertEquals(67, ranges[2].fromOffset()); - assertEquals(100, ranges[2].untilOffset()); + assertEquals(33, ranges[0].untilOffset()); + assertEquals(33, ranges[1].fromOffset()); + assertEquals(66, ranges[1].untilOffset()); + assertEquals(66, ranges[2].fromOffset()); + assertEquals(99, ranges[2].untilOffset()); + assertEquals(99, ranges[3].fromOffset()); + assertEquals(100, ranges[3].untilOffset()); // do not ignore empty ranges ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {100, 0}), makeOffsetMap(new int[] {0, 1}, new long[] {100, 600}), 600, 3); - assertEquals(3, ranges.length); + assertEquals(4, ranges.length); assertEquals(0, ranges[0].partition()); assertEquals(100, ranges[0].fromOffset()); assertEquals(100, ranges[0].untilOffset()); assertEquals(1, ranges[1].partition()); assertEquals(0, ranges[1].fromOffset()); - assertEquals(300, ranges[1].untilOffset()); + assertEquals(200, ranges[1].untilOffset()); assertEquals(1, ranges[2].partition()); - assertEquals(300, ranges[2].fromOffset()); - assertEquals(600, ranges[2].untilOffset()); + assertEquals(200, ranges[2].fromOffset()); + assertEquals(400, ranges[2].untilOffset()); + assertEquals(400, ranges[3].fromOffset()); + assertEquals(600, ranges[3].untilOffset()); // all empty ranges, do not ignore empty ranges ranges = CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {100, 0}), @@ -226,7 +243,7 @@ public void testSplitAndMergeRanges() { OffsetRange range = OffsetRange.apply(TEST_TOPIC_NAME, 0, 0, 100); OffsetRange[] ranges = 
CheckpointUtils.computeOffsetRanges(makeOffsetMap(new int[] {0, 1}, new long[] {0, 0}), makeOffsetMap(new int[] {0, 1}, new long[] {100, 500}), 600, 4); - assertEquals(4, ranges.length); + assertEquals(5, ranges.length); OffsetRange[] mergedRanges = CheckpointUtils.mergeRangesByTopicPartition(ranges); assertEquals(2, mergedRanges.length); assertEquals(0, mergedRanges[0].partition()); @@ -245,6 +262,134 @@ public void testSplitAndMergeRanges() { assertEquals(300, mergedRanges[0].untilOffset()); } + @Test + public void testNumAllocatedEventsGreaterThanNumActualEvents() { + int[] partitions = new int[] {0, 1, 2, 3, 4}; + long[] committedOffsets = + new long[] {76888767, 76725043, 76899767, 76833267, 76952055}; + long[] latestOffsets = + new long[] {77005407, 76768151, 76985456, 76917973, 77080447}; + long numEvents = 400000; + long minPartitions = 20; + OffsetRange[] ranges = + KafkaOffsetGen.CheckpointUtils.computeOffsetRanges( + makeOffsetMap(partitions, committedOffsets), + makeOffsetMap(partitions, latestOffsets), + numEvents, + minPartitions); + + long totalNewMsgs = KafkaOffsetGen.CheckpointUtils.totalNewMessages(ranges); + assertEquals(400000, totalNewMsgs); + for (OffsetRange range : ranges) { + if (range.fromOffset() > range.untilOffset()) { + throw new IllegalArgumentException("Invalid offset range " + range); + } + } + long eventPerPartition = numEvents / minPartitions; + long rangesWhereDiffIsLessThanEventsPerPartition = Arrays.stream(ranges).filter(offsetRange -> offsetRange.untilOffset() - offsetRange.fromOffset() <= eventPerPartition).count(); + assertEquals(ranges.length, rangesWhereDiffIsLessThanEventsPerPartition); + OffsetRange[] expectedRanges = new OffsetRange[] { + OffsetRange.apply(TEST_TOPIC_NAME, 0, 76888767, 76908767), + OffsetRange.apply(TEST_TOPIC_NAME, 0, 76908767, 76928767), + OffsetRange.apply(TEST_TOPIC_NAME, 0, 76928767, 76948767), + OffsetRange.apply(TEST_TOPIC_NAME, 0, 76948767, 76968767), + OffsetRange.apply(TEST_TOPIC_NAME, 0, 76968767, 76988767), + OffsetRange.apply(TEST_TOPIC_NAME, 1, 76725043, 76745043), + OffsetRange.apply(TEST_TOPIC_NAME, 1, 76745043, 76765043), + OffsetRange.apply(TEST_TOPIC_NAME, 1, 76765043, 76768151), + OffsetRange.apply(TEST_TOPIC_NAME, 2, 76899767, 76919767), + OffsetRange.apply(TEST_TOPIC_NAME, 2, 76919767, 76939767), + OffsetRange.apply(TEST_TOPIC_NAME, 2, 76939767, 76959767), + OffsetRange.apply(TEST_TOPIC_NAME, 2, 76959767, 76979767), + OffsetRange.apply(TEST_TOPIC_NAME, 2, 76979767, 76985456), + OffsetRange.apply(TEST_TOPIC_NAME, 3, 76833267, 76853267), + OffsetRange.apply(TEST_TOPIC_NAME, 3, 76853267, 76873267), + OffsetRange.apply(TEST_TOPIC_NAME, 3, 76873267, 76893267), + OffsetRange.apply(TEST_TOPIC_NAME, 3, 76893267, 76913267), + OffsetRange.apply(TEST_TOPIC_NAME, 3, 76913267, 76917973), + OffsetRange.apply(TEST_TOPIC_NAME, 4, 76952055, 76972055), + OffsetRange.apply(TEST_TOPIC_NAME, 4, 76972055, 76992055), + OffsetRange.apply(TEST_TOPIC_NAME, 4, 76992055, 77012055), + OffsetRange.apply(TEST_TOPIC_NAME, 4, 77012055, 77032055), + OffsetRange.apply(TEST_TOPIC_NAME, 4, 77032055, 77038552), + }; + assertArrayEquals(expectedRanges, ranges); + } + + @Test + public void testNumAllocatedEventsLesserThanNumActualEvents() { + int[] partitions = new int[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}; + long[] committedOffsets = + new long[] {788543084, 787746335, 788016034, 788171708, 788327954, 788055939, 788179691, 788004145, 788105897, 788496138, 788317057, 788325907, 788287519, 787958075, 788403560, 
788118894, + 788383733, 787273821}; + long[] latestOffsets = + new long[] {788946534, 788442557, 788712188, 788867819, 789023943, 788752030, 788875648, 788700234, 788802091, 789192155, 789013192, 789021874, 788983544, 788654092, 789099516, 788814985, + 789079650, 787273821}; + long numEvents = 10000000; + long minPartitions = 36; + + OffsetRange[] ranges = + KafkaOffsetGen.CheckpointUtils.computeOffsetRanges( + makeOffsetMap(partitions, committedOffsets), + makeOffsetMap(partitions, latestOffsets), + numEvents, + minPartitions); + for (OffsetRange range : ranges) { + if (range.fromOffset() > range.untilOffset()) { + throw new IllegalArgumentException("Invalid offset range " + range); + } + } + assertEquals(10000000, KafkaOffsetGen.CheckpointUtils.totalNewMessages(ranges)); + assertEquals(41, ranges.length); + long eventPerPartition = numEvents / minPartitions; + long rangesWhereDiffIsLessThanEventsPerPartition = Arrays.stream(ranges).filter(offsetRange -> offsetRange.untilOffset() - offsetRange.fromOffset() <= eventPerPartition).count(); + assertEquals(ranges.length, rangesWhereDiffIsLessThanEventsPerPartition); + OffsetRange[] expectedRanges = new OffsetRange[] { + OffsetRange.apply(TEST_TOPIC_NAME, 0, 788543084, 788820861), + OffsetRange.apply(TEST_TOPIC_NAME, 0, 788820861, 788946534), + OffsetRange.apply(TEST_TOPIC_NAME, 1, 787746335, 788024112), + OffsetRange.apply(TEST_TOPIC_NAME, 1, 788024112, 788301889), + OffsetRange.apply(TEST_TOPIC_NAME, 1, 788301889, 788442557), + OffsetRange.apply(TEST_TOPIC_NAME, 2, 788016034, 788293811), + OffsetRange.apply(TEST_TOPIC_NAME, 2, 788293811, 788571588), + OffsetRange.apply(TEST_TOPIC_NAME, 2, 788571588, 788712188), + OffsetRange.apply(TEST_TOPIC_NAME, 3, 788171708, 788449485), + OffsetRange.apply(TEST_TOPIC_NAME, 3, 788449485, 788727262), + OffsetRange.apply(TEST_TOPIC_NAME, 3, 788727262, 788867819), + OffsetRange.apply(TEST_TOPIC_NAME, 4, 788327954, 788605731), + OffsetRange.apply(TEST_TOPIC_NAME, 4, 788605731, 788883508), + OffsetRange.apply(TEST_TOPIC_NAME, 4, 788883508, 789023943), + OffsetRange.apply(TEST_TOPIC_NAME, 5, 788055939, 788333716), + OffsetRange.apply(TEST_TOPIC_NAME, 5, 788333716, 788611493), + OffsetRange.apply(TEST_TOPIC_NAME, 5, 788611493, 788752030), + OffsetRange.apply(TEST_TOPIC_NAME, 6, 788179691, 788457468), + OffsetRange.apply(TEST_TOPIC_NAME, 6, 788457468, 788735245), + OffsetRange.apply(TEST_TOPIC_NAME, 6, 788735245, 788740134), + OffsetRange.apply(TEST_TOPIC_NAME, 7, 788004145, 788281922), + OffsetRange.apply(TEST_TOPIC_NAME, 7, 788281922, 788559699), + OffsetRange.apply(TEST_TOPIC_NAME, 8, 788105897, 788383674), + OffsetRange.apply(TEST_TOPIC_NAME, 8, 788383674, 788661451), + OffsetRange.apply(TEST_TOPIC_NAME, 9, 788496138, 788773915), + OffsetRange.apply(TEST_TOPIC_NAME, 9, 788773915, 789051692), + OffsetRange.apply(TEST_TOPIC_NAME, 10, 788317057, 788594834), + OffsetRange.apply(TEST_TOPIC_NAME, 10, 788594834, 788872611), + OffsetRange.apply(TEST_TOPIC_NAME, 11, 788325907, 788603684), + OffsetRange.apply(TEST_TOPIC_NAME, 11, 788603684, 788881461), + OffsetRange.apply(TEST_TOPIC_NAME, 12, 788287519, 788565296), + OffsetRange.apply(TEST_TOPIC_NAME, 12, 788565296, 788843073), + OffsetRange.apply(TEST_TOPIC_NAME, 13, 787958075, 788235852), + OffsetRange.apply(TEST_TOPIC_NAME, 13, 788235852, 788513629), + OffsetRange.apply(TEST_TOPIC_NAME, 14, 788403560, 788681337), + OffsetRange.apply(TEST_TOPIC_NAME, 14, 788681337, 788959114), + OffsetRange.apply(TEST_TOPIC_NAME, 15, 788118894, 788396671), + 
OffsetRange.apply(TEST_TOPIC_NAME, 15, 788396671, 788674448), + OffsetRange.apply(TEST_TOPIC_NAME, 16, 788383733, 788661510), + OffsetRange.apply(TEST_TOPIC_NAME, 16, 788661510, 788939287), + OffsetRange.apply(TEST_TOPIC_NAME, 17, 787273821, 787273821), + }; + assertArrayEquals(expectedRanges, ranges); + } + private static Map makeOffsetMap(int[] partitions, long[] offsets) { Map map = new HashMap<>(); for (int i = 0; i < partitions.length; i++) { diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestCloudObjectsSelectorCommon.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestCloudObjectsSelectorCommon.java index b4b6507e074c..79f15975cb51 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestCloudObjectsSelectorCommon.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestCloudObjectsSelectorCommon.java @@ -21,18 +21,19 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.Option; import org.apache.hudi.testutils.HoodieSparkClientTestHarness; - import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; + import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; - import org.apache.spark.sql.RowFactory; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import java.util.Arrays; import java.util.Collections; +import java.util.HashSet; import java.util.List; public class TestCloudObjectsSelectorCommon extends HoodieSparkClientTestHarness { @@ -68,7 +69,7 @@ public void partitionValueAddedToRow() { List input = Collections.singletonList(new CloudObjectMetadata("src/test/resources/data/partitioned/country=US/state=CA/data.json", 1)); TypedProperties properties = new TypedProperties(); - properties.put("hoodie.deltastreamer.source.cloud.data.partition.fields.from.path", "country,state"); + properties.put("hoodie.streamer.source.cloud.data.partition.fields.from.path", "country,state"); Option> result = CloudObjectsSelectorCommon.loadAsDataset(sparkSession, input, properties, "json"); Assertions.assertTrue(result.isPresent()); Assertions.assertEquals(1, result.get().count()); @@ -81,9 +82,9 @@ public void loadDatasetWithSchema() { TypedProperties props = new TypedProperties(); TestCloudObjectsSelectorCommon.class.getClassLoader().getResource("schema/sample_data_schema.avsc"); String schemaFilePath = TestCloudObjectsSelectorCommon.class.getClassLoader().getResource("schema/sample_data_schema.avsc").getPath(); - props.put("hoodie.deltastreamer.schemaprovider.source.schema.file", schemaFilePath); - props.put("hoodie.deltastreamer.schema.provider.class.name", FilebasedSchemaProvider.class.getName()); - props.put("hoodie.deltastreamer.source.cloud.data.partition.fields.from.path", "country,state"); + props.put("hoodie.streamer.schemaprovider.source.schema.file", schemaFilePath); + props.put("hoodie.streamer.schema.provider.class.name", FilebasedSchemaProvider.class.getName()); + props.put("hoodie.streamer.source.cloud.data.partition.fields.from.path", "country,state"); List input = Collections.singletonList(new CloudObjectMetadata("src/test/resources/data/partitioned/country=US/state=CA/data.json", 1)); Option> result = CloudObjectsSelectorCommon.loadAsDataset(sparkSession, input, props, "json", Option.of(new FilebasedSchemaProvider(props, jsc))); Assertions.assertTrue(result.isPresent()); @@ -96,12 +97,34 @@ 
public void loadDatasetWithSchema() { public void partitionKeyNotPresentInPath() { List input = Collections.singletonList(new CloudObjectMetadata("src/test/resources/data/partitioned/country=US/state=CA/data.json", 1)); TypedProperties properties = new TypedProperties(); - properties.put("hoodie.deltastreamer.source.cloud.data.reader.comma.separated.path.format", "false"); - properties.put("hoodie.deltastreamer.source.cloud.data.partition.fields.from.path", "unknown"); + properties.put("hoodie.streamer.source.cloud.data.reader.comma.separated.path.format", "false"); + properties.put("hoodie.streamer.source.cloud.data.partition.fields.from.path", "unknown"); Option> result = CloudObjectsSelectorCommon.loadAsDataset(sparkSession, input, properties, "json"); Assertions.assertTrue(result.isPresent()); Assertions.assertEquals(1, result.get().count()); Row expected = RowFactory.create("some data", null); Assertions.assertEquals(Collections.singletonList(expected), result.get().collectAsList()); } + + @Test + public void loadDatasetWithSchemaAndRepartition() { + TypedProperties props = new TypedProperties(); + TestCloudObjectsSelectorCommon.class.getClassLoader().getResource("schema/sample_data_schema.avsc"); + String schemaFilePath = TestCloudObjectsSelectorCommon.class.getClassLoader().getResource("schema/sample_data_schema.avsc").getPath(); + props.put("hoodie.streamer.schemaprovider.source.schema.file", schemaFilePath); + props.put("hoodie.streamer.schema.provider.class.name", FilebasedSchemaProvider.class.getName()); + props.put("hoodie.streamer.source.cloud.data.partition.fields.from.path", "country,state"); + // Setting this config so that dataset repartition happens inside `loadAsDataset` + props.put("hoodie.streamer.source.cloud.data.partition.max.size", "1"); + List input = Arrays.asList( + new CloudObjectMetadata("src/test/resources/data/partitioned/country=US/state=CA/data.json", 1000), + new CloudObjectMetadata("src/test/resources/data/partitioned/country=US/state=TX/data.json", 1000), + new CloudObjectMetadata("src/test/resources/data/partitioned/country=IND/state=TS/data.json", 1000) + ); + Option> result = CloudObjectsSelectorCommon.loadAsDataset(sparkSession, input, props, "json", Option.of(new FilebasedSchemaProvider(props, jsc))); + Assertions.assertTrue(result.isPresent()); + List expected = Arrays.asList(RowFactory.create("some data", "US", "CA"), RowFactory.create("some data", "US", "TX"), RowFactory.create("some data", "IND", "TS")); + List actual = result.get().collectAsList(); + Assertions.assertEquals(new HashSet<>(expected), new HashSet<>(actual)); + } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestKafkaOffsetGen.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestKafkaOffsetGen.java index 6ad6a4c09dbf..fc3ab90a0364 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestKafkaOffsetGen.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestKafkaOffsetGen.java @@ -65,9 +65,9 @@ public void teardown() throws Exception { private TypedProperties getConsumerConfigs(String autoOffsetReset, String kafkaCheckpointType) { TypedProperties props = new TypedProperties(); - props.put("hoodie.deltastreamer.source.kafka.checkpoint.type", kafkaCheckpointType); + props.put("hoodie.streamer.source.kafka.checkpoint.type", kafkaCheckpointType); props.put("auto.offset.reset", autoOffsetReset); - props.put("hoodie.deltastreamer.source.kafka.topic", 
testTopicName); + props.put("hoodie.streamer.source.kafka.topic", testTopicName); props.setProperty("bootstrap.servers", testUtils.brokerAddress()); props.setProperty("key.deserializer", StringDeserializer.class.getName()); props.setProperty("value.deserializer", StringDeserializer.class.getName()); @@ -140,11 +140,13 @@ public void testGetNextOffsetRangesFromMultiplePartitions() { testUtils.sendMessages(testTopicName, Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 1000))); KafkaOffsetGen kafkaOffsetGen = new KafkaOffsetGen(getConsumerConfigs("earliest", "string")); OffsetRange[] nextOffsetRanges = kafkaOffsetGen.getNextOffsetRanges(Option.empty(), 499, metrics); - assertEquals(2, nextOffsetRanges.length); + assertEquals(3, nextOffsetRanges.length); assertEquals(0, nextOffsetRanges[0].fromOffset()); - assertEquals(250, nextOffsetRanges[0].untilOffset()); - assertEquals(0, nextOffsetRanges[1].fromOffset()); - assertEquals(249, nextOffsetRanges[1].untilOffset()); + assertEquals(249, nextOffsetRanges[0].untilOffset()); + assertEquals(249, nextOffsetRanges[1].fromOffset()); + assertEquals(250, nextOffsetRanges[1].untilOffset()); + assertEquals(0, nextOffsetRanges[2].fromOffset()); + assertEquals(249, nextOffsetRanges[2].untilOffset()); } @Test @@ -248,7 +250,7 @@ public void testCheckTopicExists() { testUtils.createTopic(testTopicName, 1); boolean topicExists = kafkaOffsetGen.checkTopicExists(new KafkaConsumer(props)); assertTrue(topicExists); - props.put("hoodie.deltastreamer.source.kafka.topic", "random"); + props.put("hoodie.streamer.source.kafka.topic", "random"); kafkaOffsetGen = new KafkaOffsetGen(props); topicExists = kafkaOffsetGen.checkTopicExists(new KafkaConsumer(props)); assertFalse(topicExists); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestSanitizationUtils.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestSanitizationUtils.java index 1a660ac71353..39dfa430268e 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestSanitizationUtils.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestSanitizationUtils.java @@ -22,7 +22,6 @@ import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.hadoop.fs.HadoopFSUtils; -import org.apache.hudi.utilities.deltastreamer.TestSourceFormatAdapter; import org.apache.hudi.utilities.testutils.SanitizationTestUtils; import org.apache.avro.Schema; @@ -45,6 +44,7 @@ import java.io.InputStream; import java.util.stream.Stream; +import static org.apache.hudi.testutils.HoodieClientTestUtils.getSparkConfForTest; import static org.apache.hudi.utilities.testutils.SanitizationTestUtils.generateProperFormattedSchema; import static org.apache.hudi.utilities.testutils.SanitizationTestUtils.generateRenamedSchemaWithConfiguredReplacement; import static org.apache.hudi.utilities.testutils.SanitizationTestUtils.generateRenamedSchemaWithDefaultReplacement; @@ -61,9 +61,7 @@ public class TestSanitizationUtils { public static void start() { spark = SparkSession .builder() - .master("local[*]") - .appName(TestSourceFormatAdapter.class.getName()) - .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .config(getSparkConfForTest(TestSanitizationUtils.class.getName())) .getOrCreate(); jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git 
a/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieStreamerUtils.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieStreamerUtils.java new file mode 100644 index 000000000000..e6c388b3e3b1 --- /dev/null +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieStreamerUtils.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.utilities.streamer; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.utilities.schema.SchemaProvider; +import org.apache.hudi.utilities.schema.SimpleSchemaProvider; +import org.apache.hudi.utilities.testutils.UtilitiesTestBase; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.mockito.ArgumentCaptor; +import org.mockito.Mockito; + +import java.util.Collections; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.doNothing; + +/** + * Tests {@link HoodieStreamerUtils}. 
+ */ +public class TestHoodieStreamerUtils extends UtilitiesTestBase { + private static final String SCHEMA_STRING = "{\"type\": \"record\"," + "\"name\": \"rec\"," + "\"fields\": [ " + + "{\"name\": \"timestamp\",\"type\": \"long\"}," + "{\"name\": \"_row_key\", \"type\": \"string\"}," + + "{\"name\": \"partition_path\", \"type\": [\"null\", \"string\"], \"default\": null }," + + "{\"name\": \"rider\", \"type\": \"string\"}," + "{\"name\": \"driver\", \"type\": \"string\"}]}"; + + @BeforeAll + public static void setupOnce() throws Exception { + initTestServices(); + } + + @ParameterizedTest + @EnumSource(HoodieRecordType.class) + public void testCreateHoodieRecordsWithError(HoodieRecordType recordType) { + Schema schema = new Schema.Parser().parse(SCHEMA_STRING); + JavaRDD recordRdd = jsc.parallelize(Collections.singletonList(1)).map(i -> { + GenericRecord record = new GenericData.Record(schema); + record.put(0, i * 1000L); + record.put(1, "key" + i); + record.put(2, "path" + i); + // The field is non-null in schema but the value is null, so this fails the Hudi record creation + record.put(3, null); + record.put(4, "driver"); + return record; + }); + HoodieStreamer.Config cfg = new HoodieStreamer.Config(); + TypedProperties props = new TypedProperties(); + SchemaProvider schemaProvider = new SimpleSchemaProvider(jsc, schema, props); + BaseErrorTableWriter errorTableWriter = Mockito.mock(BaseErrorTableWriter.class); + ArgumentCaptor> errorEventCaptor = ArgumentCaptor.forClass(JavaRDD.class); + doNothing().when(errorTableWriter).addErrorEvents(errorEventCaptor.capture()); + HoodieStreamerUtils.createHoodieRecords(cfg, props, Option.of(recordRdd), + schemaProvider, recordType, false, "000", Option.of(errorTableWriter)); + List> actualErrorEvents = (List>) errorEventCaptor.getValue().collect(); + ErrorEvent expectedErrorEvent = new ErrorEvent<>("{\"timestamp\": 1000, \"_row_key\": \"key1\", \"partition_path\": \"path1\", \"rider\": null, \"driver\": \"driver\"}", + ErrorEvent.ErrorReason.RECORD_CREATION); + assertEquals(Collections.singletonList(expectedErrorEvent), actualErrorEvents); + } +} diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestStreamSyncUnitTests.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestStreamSyncUnitTests.java new file mode 100644 index 000000000000..99148eb4b072 --- /dev/null +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestStreamSyncUnitTests.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.utilities.streamer; + +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieErrorTableConfig; +import org.apache.hudi.utilities.schema.SchemaProvider; +import org.apache.hudi.utilities.sources.InputBatch; +import org.apache.hudi.utilities.transform.Transformer; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.stream.Stream; + +import static org.apache.hudi.config.HoodieErrorTableConfig.ERROR_ENABLE_VALIDATE_TARGET_SCHEMA; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class TestStreamSyncUnitTests { + + @ParameterizedTest + @MethodSource("testCasesFetchNextBatchFromSource") + void testFetchNextBatchFromSource(Boolean useRowWriter, Boolean hasTransformer, Boolean hasSchemaProvider, + Boolean isNullTargetSchema, Boolean hasErrorTable, Boolean shouldTryWriteToErrorTable) { + //basic deltastreamer inputs + HoodieSparkEngineContext hoodieSparkEngineContext = mock(HoodieSparkEngineContext.class); + FileSystem fs = mock(FileSystem.class); + SparkSession sparkSession = mock(SparkSession.class); + Configuration configuration = mock(Configuration.class); + HoodieStreamer.Config cfg = new HoodieStreamer.Config(); + cfg.targetTableName = "testTableName"; + cfg.targetBasePath = "/fake/table/name"; + cfg.tableType = "MERGE_ON_READ"; + + //Source format adapter + SourceFormatAdapter sourceFormatAdapter = mock(SourceFormatAdapter.class); + SchemaProvider inputBatchSchemaProvider = getSchemaProvider("InputBatch", false); + Option> fakeDataFrame = Option.of(mock(Dataset.class)); + InputBatch> fakeRowInputBatch = new InputBatch<>(fakeDataFrame, "chkpt", inputBatchSchemaProvider); + when(sourceFormatAdapter.fetchNewDataInRowFormat(any(), 
anyLong())).thenReturn(fakeRowInputBatch); + //batch is empty because we don't want getBatch().map() to do anything because it calls static method we can't mock + InputBatch> fakeAvroInputBatch = new InputBatch<>(Option.empty(), "chkpt", inputBatchSchemaProvider); + when(sourceFormatAdapter.fetchNewDataInAvroFormat(any(),anyLong())).thenReturn(fakeAvroInputBatch); + + //transformer + //return empty because we don't want .map() to do anything because it calls static method we can't mock + when(sourceFormatAdapter.processErrorEvents(any(), any())).thenReturn(Option.empty()); + Option transformerOption = Option.empty(); + if (hasTransformer) { + transformerOption = Option.of(mock(Transformer.class)); + } + + //user provided schema provider + SchemaProvider schemaProvider = null; + if (hasSchemaProvider) { + schemaProvider = getSchemaProvider("UserProvided", isNullTargetSchema); + } + + //error table + TypedProperties props = new TypedProperties(); + props.put(DataSourceWriteOptions.RECONCILE_SCHEMA().key(), false); + Option errorTableWriterOption = Option.empty(); + if (hasErrorTable) { + errorTableWriterOption = Option.of(mock(BaseErrorTableWriter.class)); + props.put(ERROR_ENABLE_VALIDATE_TARGET_SCHEMA.key(), true); + } + TypedProperties propsSpy = spy(props); + + + //Actually create the deltastreamer + StreamSync streamSync = new StreamSync(cfg, sparkSession, propsSpy, hoodieSparkEngineContext, + fs, configuration, client -> true, schemaProvider, errorTableWriterOption, sourceFormatAdapter, transformerOption, useRowWriter, false); + StreamSync spy = spy(streamSync); + SchemaProvider deducedSchemaProvider; + deducedSchemaProvider = getSchemaProvider("deduced", false); + doReturn(deducedSchemaProvider).when(spy).getDeducedSchemaProvider(any(), any(), any()); + + //run the method we are unit testing: + InputBatch batch = spy.fetchNextBatchFromSource(Option.empty(), mock(HoodieTableMetaClient.class)); + + //make sure getDeducedSchemaProvider is always called once + verify(spy, times(1)).getDeducedSchemaProvider(any(), any(), any()); + + //make sure the deduced schema is actually used + assertEquals(deducedSchemaProvider.getTargetSchema(), batch.getSchemaProvider().getTargetSchema()); + + //make sure we use error table when we should + verify(propsSpy, shouldTryWriteToErrorTable ? times(1) : never()) + .getBoolean(HoodieErrorTableConfig.ERROR_ENABLE_VALIDATE_TARGET_SCHEMA.key(), + HoodieErrorTableConfig.ERROR_ENABLE_VALIDATE_TARGET_SCHEMA.defaultValue()); + } + + private SchemaProvider getSchemaProvider(String name, boolean isNullTargetSchema) { + SchemaProvider schemaProvider = mock(SchemaProvider.class); + Schema sourceSchema = mock(Schema.class); + Schema targetSchema = isNullTargetSchema ? 
InputBatch.NULL_SCHEMA : mock(Schema.class); + when(schemaProvider.getSourceSchema()).thenReturn(sourceSchema); + when(schemaProvider.getTargetSchema()).thenReturn(targetSchema); + when(sourceSchema.toString()).thenReturn(name + "SourceSchema"); + if (!isNullTargetSchema) { + when(targetSchema.toString()).thenReturn(name + "TargetSchema"); + } + return schemaProvider; + } + + static Stream testCasesFetchNextBatchFromSource() { + Stream.Builder b = Stream.builder(); + + //no transformer + for (Boolean useRowWriter : new Boolean[]{false, true}) { + for (Boolean hasErrorTable : new Boolean[]{false, true}) { + boolean errorTableEnabled = hasErrorTable && !useRowWriter; + b.add(Arguments.of(useRowWriter, false, false, false, + hasErrorTable, errorTableEnabled)); + } + } + + //with transformer + for (Boolean useRowWriter : new Boolean[]{false, true}) { + for (Boolean hasSchemaProvider : new Boolean[]{false, true}) { + for (Boolean isNullTargetSchema : new Boolean[]{false, true}) { + for (Boolean hasErrorTable : new Boolean[]{false, true}) { + boolean errorTableEnabled = hasErrorTable && !useRowWriter; + boolean schemaProviderNullOrMissing = isNullTargetSchema || !hasSchemaProvider; + boolean shouldTryWriteToErrorTable = errorTableEnabled && !schemaProviderNullOrMissing; + b.add(Arguments.of(useRowWriter, true, hasSchemaProvider, isNullTargetSchema, + hasErrorTable, shouldTryWriteToErrorTable)); + } + } + } + } + return b.build(); + } +} diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/JdbcTestUtils.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/JdbcTestUtils.java index 79047794f979..227013b05481 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/JdbcTestUtils.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/JdbcTestUtils.java @@ -44,6 +44,11 @@ public class JdbcTestUtils { private static final Logger LOG = LoggerFactory.getLogger(JdbcTestUtils.class); + public static final String JDBC_URL = "jdbc:h2:mem:test_mem"; + public static final String JDBC_DRIVER = "org.h2.Driver"; + public static final String JDBC_USER = "test"; + public static final String JDBC_PASS = "jdbc"; + public static List clearAndInsert(String commitTime, int numRecords, Connection connection, HoodieTestDataGenerator dataGenerator, TypedProperties props) throws SQLException { execute(connection, "DROP TABLE triprec", "Table does not exists"); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java index 0dcba5e5fa27..a200f3a51513 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java @@ -158,7 +158,7 @@ public static void initTestServices(boolean needsHdfs, boolean needsHive, boolea zookeeperTestService.start(); } - jsc = UtilHelpers.buildSparkContext(UtilitiesTestBase.class.getName() + "-hoodie", "local[4]"); + jsc = UtilHelpers.buildSparkContext(UtilitiesTestBase.class.getName() + "-hoodie", "local[8]"); context = new HoodieSparkEngineContext(jsc); sqlContext = new SQLContext(jsc); sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate(); @@ -450,14 +450,14 @@ public static TypedProperties setupSchemaOnDFS() throws IOException { public static TypedProperties setupSchemaOnDFS(String scope, String filename) throws IOException { 
UtilitiesTestBase.Helpers.copyToDFS(scope + "/" + filename, fs, basePath + "/" + filename); TypedProperties props = new TypedProperties(); - props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", basePath + "/" + filename); + props.setProperty("hoodie.streamer.schemaprovider.source.schema.file", basePath + "/" + filename); return props; } public static TypedProperties setupSchemaOnDFSWithAbsoluteScope(String scope, String filename) throws IOException { UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(scope + "/" + filename, fs, basePath + "/" + filename); TypedProperties props = new TypedProperties(); - props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", basePath + "/" + filename); + props.setProperty("hoodie.streamer.schemaprovider.source.schema.file", basePath + "/" + filename); return props; } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractBaseTestSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractBaseTestSource.java index 56d435ddf0f1..08e73d36bc04 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractBaseTestSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractBaseTestSource.java @@ -23,6 +23,7 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.common.util.ConfigUtils; import org.apache.hudi.common.util.collection.RocksDBBasedMap; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.utilities.config.SourceTestConfig; @@ -63,11 +64,10 @@ public static void initDataGen() { public static void initDataGen(TypedProperties props, int partition) { try { - boolean useRocksForTestDataGenKeys = props.getBoolean(SourceTestConfig.USE_ROCKSDB_FOR_TEST_DATAGEN_KEYS.key(), - SourceTestConfig.USE_ROCKSDB_FOR_TEST_DATAGEN_KEYS.defaultValue()); - String baseStoreDir = props.getString(SourceTestConfig.ROCKSDB_BASE_DIR_FOR_TEST_DATAGEN_KEYS.key(), + boolean useRocksForTestDataGenKeys = ConfigUtils.getBooleanWithAltKeys(props, SourceTestConfig.USE_ROCKSDB_FOR_TEST_DATAGEN_KEYS); + String baseStoreDir = ConfigUtils.getStringWithAltKeys(props, SourceTestConfig.ROCKSDB_BASE_DIR_FOR_TEST_DATAGEN_KEYS, File.createTempFile("test_data_gen", ".keys").getParent()) + "/" + partition; - LOG.info("useRocksForTestDataGenKeys=" + useRocksForTestDataGenKeys + ", BaseStoreDir=" + baseStoreDir); + LOG.info("useRocksForTestDataGenKeys={}, BaseStoreDir={}", useRocksForTestDataGenKeys, baseStoreDir); dataGeneratorMap.put(partition, new HoodieTestDataGenerator(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, useRocksForTestDataGenKeys ? new RocksDBBasedMap<>(baseStoreDir) : new HashMap<>())); } catch (IOException e) { @@ -106,18 +106,17 @@ protected AbstractBaseTestSource(TypedProperties props, JavaSparkContext sparkCo protected static Stream fetchNextBatch(TypedProperties props, int sourceLimit, String instantTime, int partition) { - int maxUniqueKeys = - props.getInteger(SourceTestConfig.MAX_UNIQUE_RECORDS_PROP.key(), SourceTestConfig.MAX_UNIQUE_RECORDS_PROP.defaultValue()); + int maxUniqueKeys = ConfigUtils.getIntWithAltKeys(props, SourceTestConfig.MAX_UNIQUE_RECORDS_PROP); HoodieTestDataGenerator dataGenerator = dataGeneratorMap.get(partition); // generate `sourceLimit` number of upserts each time. 
int numExistingKeys = dataGenerator.getNumExistingKeys(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA); - LOG.info("NumExistingKeys=" + numExistingKeys); + LOG.info("NumExistingKeys={}", numExistingKeys); int numUpdates = Math.min(numExistingKeys, sourceLimit / 2); int numInserts = sourceLimit - numUpdates; - LOG.info("Before adjustments => numInserts=" + numInserts + ", numUpdates=" + numUpdates); + LOG.info("Before adjustments => numInserts={}, numUpdates={}", numInserts, numUpdates); boolean reachedMax = false; if (numInserts + numExistingKeys > maxUniqueKeys) { @@ -134,17 +133,16 @@ protected static Stream fetchNextBatch(TypedProperties props, int Stream deleteStream = Stream.empty(); Stream updateStream; long memoryUsage1 = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); - LOG.info("Before DataGen. Memory Usage=" + memoryUsage1 + ", Total Memory=" + Runtime.getRuntime().totalMemory() - + ", Free Memory=" + Runtime.getRuntime().freeMemory()); + LOG.info("Before DataGen. Memory Usage={}, Total Memory={}, Free Memory={}", memoryUsage1, Runtime.getRuntime().totalMemory(), + Runtime.getRuntime().freeMemory()); if (!reachedMax && numUpdates >= 50) { - LOG.info("After adjustments => NumInserts=" + numInserts + ", NumUpdates=" + (numUpdates - 50) + ", NumDeletes=50, maxUniqueRecords=" - + maxUniqueKeys); + LOG.info("After adjustments => NumInserts={}, NumUpdates={}, NumDeletes=50, maxUniqueRecords={}", numInserts, (numUpdates - 50), maxUniqueKeys); // if we generate update followed by deletes -> some keys in update batch might be picked up for deletes. Hence generating delete batch followed by updates deleteStream = dataGenerator.generateUniqueDeleteRecordStream(instantTime, 50).map(AbstractBaseTestSource::toGenericRecord); updateStream = dataGenerator.generateUniqueUpdatesStream(instantTime, numUpdates - 50, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) .map(AbstractBaseTestSource::toGenericRecord); } else { - LOG.info("After adjustments => NumInserts=" + numInserts + ", NumUpdates=" + numUpdates + ", maxUniqueRecords=" + maxUniqueKeys); + LOG.info("After adjustments => NumInserts={}, NumUpdates={}, maxUniqueRecords={}", numInserts, numUpdates, maxUniqueKeys); updateStream = dataGenerator.generateUniqueUpdatesStream(instantTime, numUpdates, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) .map(AbstractBaseTestSource::toGenericRecord); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractDFSSourceTestBase.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractDFSSourceTestBase.java index 0de087ece73e..76a1a6453670 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractDFSSourceTestBase.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractDFSSourceTestBase.java @@ -19,6 +19,7 @@ package org.apache.hudi.utilities.testutils.sources; import org.apache.hudi.AvroConversionUtils; +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.util.Option; @@ -74,7 +75,11 @@ public void setup() throws Exception { * * @return A {@link Source} using DFS as the file system. 
*/ - protected abstract Source prepareDFSSource(); + protected final Source prepareDFSSource() { + return prepareDFSSource(new TypedProperties()); + } + + protected abstract Source prepareDFSSource(TypedProperties props); /** * Writes test data, i.e., a {@link List} of {@link HoodieRecord}, to a file on DFS. diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/DistributedTestDataSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/DistributedTestDataSource.java index 4bcbdbbe874b..808a8efb8a4e 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/DistributedTestDataSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/DistributedTestDataSource.java @@ -19,6 +19,7 @@ package org.apache.hudi.utilities.testutils.sources; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.ConfigUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.utilities.config.SourceTestConfig; import org.apache.hudi.utilities.schema.SchemaProvider; @@ -46,15 +47,14 @@ public class DistributedTestDataSource extends AbstractBaseTestSource { public DistributedTestDataSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider) { super(props, sparkContext, sparkSession, schemaProvider); - this.numTestSourcePartitions = - props.getInteger(SourceTestConfig.NUM_SOURCE_PARTITIONS_PROP.key(), SourceTestConfig.NUM_SOURCE_PARTITIONS_PROP.defaultValue()); + this.numTestSourcePartitions = ConfigUtils.getIntWithAltKeys(props, SourceTestConfig.NUM_SOURCE_PARTITIONS_PROP); } @Override protected InputBatch> fetchNewData(Option lastCkptStr, long sourceLimit) { int nextCommitNum = lastCkptStr.map(s -> Integer.parseInt(s) + 1).orElse(0); String instantTime = String.format("%05d", nextCommitNum); - LOG.info("Source Limit is set to " + sourceLimit); + LOG.info("Source Limit is set to {}", sourceLimit); // No new data. 
if (sourceLimit <= 0) { @@ -65,15 +65,14 @@ protected InputBatch> fetchNewData(Option lastCkp newProps.putAll(props); // Set the maxUniqueRecords per partition for TestDataSource - int maxUniqueRecords = - props.getInteger(SourceTestConfig.MAX_UNIQUE_RECORDS_PROP.key(), SourceTestConfig.MAX_UNIQUE_RECORDS_PROP.defaultValue()); + int maxUniqueRecords = ConfigUtils.getIntWithAltKeys(props, SourceTestConfig.MAX_UNIQUE_RECORDS_PROP); String maxUniqueRecordsPerPartition = String.valueOf(Math.max(1, maxUniqueRecords / numTestSourcePartitions)); newProps.setProperty(SourceTestConfig.MAX_UNIQUE_RECORDS_PROP.key(), maxUniqueRecordsPerPartition); int perPartitionSourceLimit = Math.max(1, (int) (sourceLimit / numTestSourcePartitions)); JavaRDD avroRDD = sparkContext.parallelize(IntStream.range(0, numTestSourcePartitions).boxed().collect(Collectors.toList()), numTestSourcePartitions).mapPartitionsWithIndex((p, idx) -> { - LOG.info("Initializing source with newProps=" + newProps); + LOG.info("Initializing source with newProps={}", newProps); if (!dataGeneratorMap.containsKey(p)) { initDataGen(newProps, p); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/transform/TestSqlFileBasedTransformer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/transform/TestSqlFileBasedTransformer.java index 1b0cc7f52a6d..ea2ce8ed86f9 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/transform/TestSqlFileBasedTransformer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/transform/TestSqlFileBasedTransformer.java @@ -87,7 +87,7 @@ public void testSqlFileBasedTransformerIllegalArguments() { public void testSqlFileBasedTransformerIncorrectConfig() { // Test if the class throws hoodie IO exception correctly when given a incorrect config. props.setProperty( - "hoodie.deltastreamer.transformer.sql.file", + "hoodie.streamer.transformer.sql.file", UtilitiesTestBase.basePath + "/non-exist-sql-file.sql"); assertThrows( HoodieTransformException.class, @@ -103,7 +103,7 @@ public void testSqlFileBasedTransformerInvalidSQL() throws IOException { // Test if the SQL file based transformer works as expected for the invalid SQL statements. props.setProperty( - "hoodie.deltastreamer.transformer.sql.file", + "hoodie.streamer.transformer.sql.file", UtilitiesTestBase.basePath + "/sql-file-transformer-invalid.sql"); assertThrows( ParseException.class, @@ -119,7 +119,7 @@ public void testSqlFileBasedTransformerEmptyDataset() throws IOException { // Test if the SQL file based transformer works as expected for the empty SQL statements. props.setProperty( - "hoodie.deltastreamer.transformer.sql.file", + "hoodie.streamer.transformer.sql.file", UtilitiesTestBase.basePath + "/sql-file-transformer-empty.sql"); Dataset emptyRow = sqlFileTransformer.apply(jsc, sparkSession, inputDatasetRows, props); String[] actualRows = emptyRow.as(Encoders.STRING()).collectAsList().toArray(new String[0]); @@ -136,7 +136,7 @@ public void testSqlFileBasedTransformer() throws IOException { // Test if the SQL file based transformer works as expected for the correct input. 
props.setProperty( - "hoodie.deltastreamer.transformer.sql.file", + "hoodie.streamer.transformer.sql.file", UtilitiesTestBase.basePath + "/sql-file-transformer.sql"); Dataset transformedRow = sqlFileTransformer.apply(jsc, sparkSession, inputDatasetRows, props); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/transform/TestSqlQueryBasedTransformer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/transform/TestSqlQueryBasedTransformer.java index b6fdc2582422..e9f6f9e4fd39 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/transform/TestSqlQueryBasedTransformer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/transform/TestSqlQueryBasedTransformer.java @@ -29,6 +29,7 @@ import java.util.Collections; +import static org.apache.hudi.testutils.HoodieClientTestUtils.getSparkConfForTest; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -39,8 +40,7 @@ public void testSqlQuery() { SparkSession spark = SparkSession .builder() - .master("local[2]") - .appName(TestSqlQueryBasedTransformer.class.getName()) + .config(getSparkConfForTest(TestSqlQueryBasedTransformer.class.getName())) .getOrCreate(); JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); @@ -78,7 +78,7 @@ public void testSqlQuery() { + "from\n" + "\t"; TypedProperties props = new TypedProperties(); - props.put("hoodie.deltastreamer.transformer.sql", transSql); + props.put("hoodie.streamer.transformer.sql", transSql); // transform SqlQueryBasedTransformer transformer = new SqlQueryBasedTransformer(); diff --git a/hudi-utilities/src/test/resources/data/partitioned/country=IND/state=TS/data.json b/hudi-utilities/src/test/resources/data/partitioned/country=IND/state=TS/data.json new file mode 100644 index 000000000000..9fb29b4dcf47 --- /dev/null +++ b/hudi-utilities/src/test/resources/data/partitioned/country=IND/state=TS/data.json @@ -0,0 +1 @@ +{"data": "some data"} \ No newline at end of file diff --git a/hudi-utilities/src/test/resources/data/partitioned/country=US/state=TX/data.json b/hudi-utilities/src/test/resources/data/partitioned/country=US/state=TX/data.json new file mode 100644 index 000000000000..9fb29b4dcf47 --- /dev/null +++ b/hudi-utilities/src/test/resources/data/partitioned/country=US/state=TX/data.json @@ -0,0 +1 @@ +{"data": "some data"} \ No newline at end of file diff --git a/hudi-utilities/src/test/resources/streamer-config/dfs-source.properties b/hudi-utilities/src/test/resources/streamer-config/dfs-source.properties index 3a5edb2b6f23..35beefab7b22 100644 --- a/hudi-utilities/src/test/resources/streamer-config/dfs-source.properties +++ b/hudi-utilities/src/test/resources/streamer-config/dfs-source.properties @@ -20,8 +20,8 @@ include=base.properties hoodie.datasource.write.recordkey.field=_row_key hoodie.datasource.write.partitionpath.field=driver # Schema provider props (change to absolute path based on your installation) -hoodie.deltastreamer.filebased.schemaprovider.source.schema.file=file:///path/to/hoodie/hoodie-utilities/src/main/resources/streamer-props/source.avsc -hoodie.deltastreamer.filebased.schemaprovider.target.schema.file=file:///path/to/hoodie/hoodie-utilities/src/main/resources/streamer-props/target.avsc +hoodie.streamer.filebased.schemaprovider.source.schema.file=file:///path/to/hoodie/hoodie-utilities/src/main/resources/streamer-props/source.avsc 
+hoodie.streamer.filebased.schemaprovider.target.schema.file=file:///path/to/hoodie/hoodie-utilities/src/main/resources/streamer-props/target.avsc # DFS Source -hoodie.deltastreamer.source.dfs.root=file:///tmp/hoodie-dfs-input +hoodie.streamer.source.dfs.root=file:///tmp/hoodie-dfs-input diff --git a/hudi-utilities/src/test/resources/streamer-config/invalid_hive_sync_uber_config.properties b/hudi-utilities/src/test/resources/streamer-config/invalid_hive_sync_uber_config.properties index 5c569c5d0a0d..248de399272e 100644 --- a/hudi-utilities/src/test/resources/streamer-config/invalid_hive_sync_uber_config.properties +++ b/hudi-utilities/src/test/resources/streamer-config/invalid_hive_sync_uber_config.properties @@ -18,6 +18,6 @@ include=base.properties hoodie.datasource.write.recordkey.field=_row_key hoodie.datasource.write.partitionpath.field=created_at -hoodie.deltastreamer.source.kafka.topic=test_topic -hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP -hoodie.deltastreamer.keygen.timebased.input.dateformat=yyyy-MM-dd \ No newline at end of file +hoodie.streamer.source.kafka.topic=test_topic +hoodie.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.keygen.timebased.input.dateformat=yyyy-MM-dd \ No newline at end of file diff --git a/hudi-utilities/src/test/resources/streamer-config/kafka-source.properties b/hudi-utilities/src/test/resources/streamer-config/kafka-source.properties index e256b8c77fbb..87edb1a1df7d 100644 --- a/hudi-utilities/src/test/resources/streamer-config/kafka-source.properties +++ b/hudi-utilities/src/test/resources/streamer-config/kafka-source.properties @@ -20,10 +20,10 @@ include=base.properties hoodie.datasource.write.recordkey.field=impressionid hoodie.datasource.write.partitionpath.field=userid # schema provider configs -hoodie.deltastreamer.schemaprovider.registry.url=http://localhost:8081/subjects/impressions-value/versions/latest +hoodie.streamer.schemaprovider.registry.url=http://localhost:8081/subjects/impressions-value/versions/latest # Kafka Source -#hoodie.deltastreamer.source.kafka.topic=uber_trips -hoodie.deltastreamer.source.kafka.topic=impressions +#hoodie.streamer.source.kafka.topic=uber_trips +hoodie.streamer.source.kafka.topic=impressions #Kafka props bootstrap.servers=localhost:9092 auto.offset.reset=earliest diff --git a/hudi-utilities/src/test/resources/streamer-config/short_trip_uber_config.properties b/hudi-utilities/src/test/resources/streamer-config/short_trip_uber_config.properties index d415e19eb20a..b74f5a080f3d 100644 --- a/hudi-utilities/src/test/resources/streamer-config/short_trip_uber_config.properties +++ b/hudi-utilities/src/test/resources/streamer-config/short_trip_uber_config.properties @@ -18,11 +18,11 @@ include=base.properties hoodie.datasource.write.recordkey.field=_row_key hoodie.datasource.write.partitionpath.field=created_at -hoodie.deltastreamer.source.kafka.topic=topic2 -hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP -hoodie.deltastreamer.keygen.timebased.input.dateformat=yyyy-MM-dd HH:mm:ss.S +hoodie.streamer.source.kafka.topic=topic2 +hoodie.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.keygen.timebased.input.dateformat=yyyy-MM-dd HH:mm:ss.S hoodie.datasource.hive_sync.table=short_trip_uber_hive_dummy_table hoodie.datasource.write.keygenerator.class=org.apache.hudi.utilities.deltastreamer.TestHoodieDeltaStreamer$TestGenerator -hoodie.deltastreamer.schemaprovider.registry.baseUrl=http://localhost:8081/subjects/ 
-hoodie.deltastreamer.schemaprovider.registry.urlSuffix=-value/versions/latest -hoodie.deltastreamer.transformer.class=org.apache.hudi.utilities.deltastreamer.TestHoodieDeltaStreamer$TestIdentityTransformer +hoodie.streamer.schemaprovider.registry.baseUrl=http://localhost:8081/subjects/ +hoodie.streamer.schemaprovider.registry.urlSuffix=-value/versions/latest +hoodie.streamer.transformer.class=org.apache.hudi.utilities.deltastreamer.TestHoodieDeltaStreamer$TestIdentityTransformer diff --git a/hudi-utilities/src/test/resources/streamer-config/sql-transformer.properties b/hudi-utilities/src/test/resources/streamer-config/sql-transformer.properties index 9172337d0389..9bfbd889de98 100644 --- a/hudi-utilities/src/test/resources/streamer-config/sql-transformer.properties +++ b/hudi-utilities/src/test/resources/streamer-config/sql-transformer.properties @@ -16,4 +16,4 @@ # limitations under the License. ### include=base.properties -hoodie.deltastreamer.transformer.sql=SELECT a.timestamp, a._row_key, a.partition_path, a.trip_type, a.rider, a.driver, a.begin_lat, a.begin_lon, a.end_lat, a.end_lon, a.distance_in_meters, a.seconds_since_epoch, a.weight, a.nation, a.current_date, a.current_ts, a.height, a.city_to_state, a.fare, a.tip_history, a.`_hoodie_is_deleted`, CAST(1.0 AS DOUBLE) AS haversine_distance FROM a +hoodie.streamer.transformer.sql=SELECT a.timestamp, a._row_key, a.partition_path, a.trip_type, a.rider, a.driver, a.begin_lat, a.begin_lon, a.end_lat, a.end_lon, a.distance_in_meters, a.seconds_since_epoch, a.weight, a.nation, a.current_date, a.current_ts, a.height, a.city_to_state, a.fare, a.tip_history, a.`_hoodie_is_deleted`, CAST(1.0 AS DOUBLE) AS haversine_distance FROM a diff --git a/hudi-utilities/src/test/resources/streamer-config/uber_config.properties b/hudi-utilities/src/test/resources/streamer-config/uber_config.properties index f5b079265d43..a8e278249e86 100644 --- a/hudi-utilities/src/test/resources/streamer-config/uber_config.properties +++ b/hudi-utilities/src/test/resources/streamer-config/uber_config.properties @@ -18,10 +18,10 @@ include=base.properties hoodie.datasource.write.recordkey.field=_row_key hoodie.datasource.write.partitionpath.field=created_at -hoodie.deltastreamer.source.kafka.topic=topic1 -hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP -hoodie.deltastreamer.keygen.timebased.input.dateformat=yyyy-MM-dd HH:mm:ss.S +hoodie.streamer.source.kafka.topic=topic1 +hoodie.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.keygen.timebased.input.dateformat=yyyy-MM-dd HH:mm:ss.S hoodie.datasource.hive_sync.database=uber_hive_db hoodie.datasource.hive_sync.table=uber_hive_dummy_table -hoodie.deltastreamer.schemaprovider.registry.url=http://localhost:8081/subjects/random-value/versions/latest -hoodie.deltastreamer.schemaprovider.registry.targetUrl=http://localhost:8081/subjects/random-value/versions/latest \ No newline at end of file +hoodie.streamer.schemaprovider.registry.url=http://localhost:8081/subjects/random-value/versions/latest +hoodie.streamer.schemaprovider.registry.targetUrl=http://localhost:8081/subjects/random-value/versions/latest \ No newline at end of file diff --git a/packaging/hudi-aws-bundle/pom.xml b/packaging/hudi-aws-bundle/pom.xml index 1ab4d9c82ede..f55a97d67e1a 100644 --- a/packaging/hudi-aws-bundle/pom.xml +++ b/packaging/hudi-aws-bundle/pom.xml @@ -139,9 +139,6 @@ src/main/resources - - src/test/resources - diff --git a/packaging/hudi-datahub-sync-bundle/pom.xml b/packaging/hudi-datahub-sync-bundle/pom.xml index 
460c48d7258b..f9f4c813eecd 100644 --- a/packaging/hudi-datahub-sync-bundle/pom.xml +++ b/packaging/hudi-datahub-sync-bundle/pom.xml @@ -115,9 +115,6 @@ src/main/resources - - src/test/resources - diff --git a/packaging/hudi-flink-bundle/pom.xml b/packaging/hudi-flink-bundle/pom.xml index 23a21371d6d3..064ce639e818 100644 --- a/packaging/hudi-flink-bundle/pom.xml +++ b/packaging/hudi-flink-bundle/pom.xml @@ -127,6 +127,8 @@ io.prometheus:simpleclient_dropwizard io.prometheus:simpleclient_pushgateway io.prometheus:simpleclient_common + com.uber.m3:tally-m3 + com.uber.m3:tally-core org.eclipse.jetty:* @@ -194,10 +196,6 @@ com.beust.jcommander. ${flink.bundle.shade.prefix}com.beust.jcommander. - - com.codahale.metrics. - ${flink.bundle.shade.prefix}com.codahale.metrics. - org.apache.commons.codec. ${flink.bundle.shade.prefix}org.apache.commons.codec. @@ -214,6 +212,10 @@ org.openjdk.jol. org.apache.hudi.org.openjdk.jol. + + com.uber.m3. + org.apache.hudi.com.uber.m3. + @@ -241,9 +243,6 @@ src/main/resources - - src/test/resources - diff --git a/packaging/hudi-gcp-bundle/pom.xml b/packaging/hudi-gcp-bundle/pom.xml index eeb1ce7ff842..01e722e85926 100644 --- a/packaging/hudi-gcp-bundle/pom.xml +++ b/packaging/hudi-gcp-bundle/pom.xml @@ -39,7 +39,7 @@ com.google.cloud libraries-bom - 25.1.0 + ${gcp-libraries-bom.version} pom import @@ -139,9 +139,6 @@ src/main/resources - - src/test/resources - diff --git a/packaging/hudi-hadoop-mr-bundle/pom.xml b/packaging/hudi-hadoop-mr-bundle/pom.xml index c95a0c0742ac..1994b9951885 100644 --- a/packaging/hudi-hadoop-mr-bundle/pom.xml +++ b/packaging/hudi-hadoop-mr-bundle/pom.xml @@ -160,9 +160,6 @@ src/main/resources - - src/test/resources - diff --git a/packaging/hudi-hive-sync-bundle/pom.xml b/packaging/hudi-hive-sync-bundle/pom.xml index a042c9439934..580b4e96eaa0 100644 --- a/packaging/hudi-hive-sync-bundle/pom.xml +++ b/packaging/hudi-hive-sync-bundle/pom.xml @@ -136,9 +136,6 @@ src/main/resources - - src/test/resources - diff --git a/packaging/hudi-integ-test-bundle/pom.xml b/packaging/hudi-integ-test-bundle/pom.xml index 5b3974ad08de..a21b0ac110a2 100644 --- a/packaging/hudi-integ-test-bundle/pom.xml +++ b/packaging/hudi-integ-test-bundle/pom.xml @@ -164,6 +164,8 @@ io.prometheus:simpleclient_dropwizard io.prometheus:simpleclient_pushgateway io.prometheus:simpleclient_common + com.uber.m3:tally-m3 + com.uber.m3:tally-core org.openjdk.jol:jol-core @@ -272,6 +274,10 @@ org.eclipse.jetty. org.apache.hudi.org.eclipse.jetty. + + com.uber.m3. + org.apache.hudi.com.uber.m3. + diff --git a/packaging/hudi-kafka-connect-bundle/pom.xml b/packaging/hudi-kafka-connect-bundle/pom.xml index bfbafc7672fc..3cc55e6dbeaf 100644 --- a/packaging/hudi-kafka-connect-bundle/pom.xml +++ b/packaging/hudi-kafka-connect-bundle/pom.xml @@ -125,6 +125,8 @@ io.prometheus:simpleclient_dropwizard io.prometheus:simpleclient_pushgateway io.prometheus:simpleclient_common + com.uber.m3:tally-m3 + com.uber.m3:tally-core com.google.protobuf:protobuf-java org.scala-lang:* @@ -182,6 +184,10 @@ com.fasterxml.jackson. org.apache.hudi.com.fasterxml.jackson. + + com.uber.m3. + org.apache.hudi.com.uber.m3. 
+ @@ -206,9 +212,6 @@ src/main/resources - - src/test/resources - diff --git a/packaging/hudi-presto-bundle/pom.xml b/packaging/hudi-presto-bundle/pom.xml index e498a459782b..275d7ae8e4e5 100644 --- a/packaging/hudi-presto-bundle/pom.xml +++ b/packaging/hudi-presto-bundle/pom.xml @@ -173,9 +173,6 @@ src/main/resources - - src/test/resources - diff --git a/packaging/hudi-spark-bundle/pom.xml b/packaging/hudi-spark-bundle/pom.xml index 58e2d802e64a..9f60db19e029 100644 --- a/packaging/hudi-spark-bundle/pom.xml +++ b/packaging/hudi-spark-bundle/pom.xml @@ -112,6 +112,9 @@ io.prometheus:simpleclient_dropwizard io.prometheus:simpleclient_pushgateway io.prometheus:simpleclient_common + com.uber.m3:tally-m3 + com.uber.m3:tally-core + com.yammer.metrics:metrics-core org.apache.hive:hive-common @@ -201,6 +204,10 @@ org.roaringbitmap. org.apache.hudi.org.roaringbitmap. + + com.uber.m3. + org.apache.hudi.com.uber.m3. + @@ -228,9 +235,6 @@ src/main/resources - - src/test/resources - diff --git a/packaging/hudi-trino-bundle/pom.xml b/packaging/hudi-trino-bundle/pom.xml index b56f86cde7d4..f1d7e685dbb0 100644 --- a/packaging/hudi-trino-bundle/pom.xml +++ b/packaging/hudi-trino-bundle/pom.xml @@ -166,9 +166,6 @@ src/main/resources - - src/test/resources - diff --git a/packaging/hudi-utilities-bundle/pom.xml b/packaging/hudi-utilities-bundle/pom.xml index 7bd698580cad..b992e5bbeb8c 100644 --- a/packaging/hudi-utilities-bundle/pom.xml +++ b/packaging/hudi-utilities-bundle/pom.xml @@ -141,6 +141,8 @@ io.prometheus:simpleclient_dropwizard io.prometheus:simpleclient_pushgateway io.prometheus:simpleclient_common + com.uber.m3:tally-m3 + com.uber.m3:tally-core org.apache.spark:spark-streaming-kafka-0-10_${scala.binary.version} org.apache.spark:spark-token-provider-kafka-0-10_${scala.binary.version} org.apache.kafka:kafka_${scala.binary.version} @@ -237,6 +239,10 @@ org.roaringbitmap. org.apache.hudi.org.roaringbitmap. + + com.uber.m3. + org.apache.hudi.com.uber.m3. + @@ -261,9 +267,6 @@ src/main/resources - - src/test/resources - diff --git a/packaging/hudi-utilities-slim-bundle/pom.xml b/packaging/hudi-utilities-slim-bundle/pom.xml index ed10d2ac8773..3919b103465c 100644 --- a/packaging/hudi-utilities-slim-bundle/pom.xml +++ b/packaging/hudi-utilities-slim-bundle/pom.xml @@ -127,6 +127,8 @@ io.prometheus:simpleclient_dropwizard io.prometheus:simpleclient_pushgateway io.prometheus:simpleclient_common + com.uber.m3:tally-m3 + com.uber.m3:tally-core org.apache.spark:spark-streaming-kafka-0-10_${scala.binary.version} org.apache.spark:spark-token-provider-kafka-0-10_${scala.binary.version} org.apache.kafka:kafka_${scala.binary.version} @@ -196,6 +198,10 @@ com.google.protobuf. org.apache.hudi.com.google.protobuf. + + com.uber.m3. + org.apache.hudi.com.uber.m3. 
+ @@ -220,9 +226,6 @@ src/main/resources - - src/test/resources - diff --git a/pom.xml b/pom.xml index 3eeed340178b..34c741f3466e 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ 5.7.2 5.7.2 1.7.2 - 3.3.3 + 3.12.4 2.17.2 1.7.36 2.9.9 @@ -130,6 +130,7 @@ 1.5.6 0.9.47 0.25 + 0.13.0 0.8.0 4.5.13 4.4.13 @@ -166,7 +167,7 @@ 3.2.3 3.3.1 3.4.1 - 3.5.0 + 3.5.1 hudi-spark3.2.x com.fasterxml.jackson.module:jackson-module-afterburner + com.fasterxml.jackson.module:jackson-module-scala_${scala.binary.version} com.google.protobuf:protobuf-java @@ -1109,7 +1112,6 @@ metrics-jmx ${metrics.version} - io.prometheus simpleclient @@ -1130,6 +1132,16 @@ simpleclient_pushgateway ${prometheus.version} + + com.uber.m3 + tally-m3 + ${tally.version} + + + com.uber.m3 + tally-core + ${tally.version} + com.beust diff --git a/rfc/rfc-60/read_flow.png b/rfc/rfc-60/read_flow.png new file mode 100644 index 000000000000..4ef464f41e74 Binary files /dev/null and b/rfc/rfc-60/read_flow.png differ diff --git a/rfc/rfc-60/rfc-60.md b/rfc/rfc-60/rfc-60.md index d509aec1f208..bdfaa58b8990 100644 --- a/rfc/rfc-60/rfc-60.md +++ b/rfc/rfc-60/rfc-60.md @@ -15,7 +15,7 @@ limitations under the License. --> -# RFC-60: Federated Storage Layer +# RFC-60: Federated Storage Layout ## Proposers - @umehrot2 @@ -52,7 +52,10 @@ but there can be a 30 - 60 minute wait time before new partitions are created. T same table path prefix could result in these request limits being hit for the table prefix, specially as workloads scale, and there are several thousands of files being written/updated concurrently. This hurts performance due to re-trying of failed requests affecting throughput, and result in occasional failures if the retries are not able to -succeed either and continue to be throttled. +succeed either and continue to be throttled. Note an exception would be non-partitioned tables +reside directly under S3 buckets (using S3 buckets as their table paths), and those tables would be free +from the throttling problem. However, this exception cannot invalidate the necessity of addressing the throttling +problem for partitioned tables. The traditional storage layout also tightly couples the partitions as folders under the table path. However, some users want flexibility to be able to distribute files/partitions under multiple different paths across cloud stores, @@ -97,22 +100,21 @@ public interface HoodieStorageStrategy extends Serializable { } ``` -### Generating file paths for object store optimized layout +### Generating File Paths for Object Store Optimized Layout We want to distribute files evenly across multiple random prefixes, instead of following the traditional Hive storage layout of keeping them under a common table path/prefix. In addition to the `Table Path`, for this new layout user will configure another `Table Storage Path` under which the actual data files will be distributed. The original `Table Path` will be used to maintain the table/partitions Hudi metadata. -For the purpose of this documentation lets assume: +For the purpose of this documentation let's assume: ``` Table Path => s3://// Table Storage Path => s3:/// ``` -Note: `Table Storage Path` can be a path in the same Amazon S3 bucket or a different bucket. For best results, -`Table Storage Path` should be a top-level bucket instead of a prefix under the bucket to avoid multiple -tables sharing the prefix. +`Table Storage Path` should be a top-level bucket instead of a prefix under the bucket for the best results. 
+So that we can avoid multiple tables sharing the prefix causing throttling. We will use a Hashing function on the `Partition Path/File ID` to map them to a prefix generated under `Table Storage Path`: ``` @@ -148,7 +150,7 @@ s3:///0bfb3d6e//.075f3295-def8-4a42-a927- ... ``` -Note: Storage strategy would only return a storage location instead of a full path. In the above example, +Storage strategy would only return a storage location instead of a full path. In the above example, the storage location is `s3:///0bfb3d6e/`, and the lower-level folder structure would be appended later automatically to get the actual file path. In another word, users would only be able to customize upper-level folder structure (storage location). @@ -176,7 +178,7 @@ The hashing function should be made user configurable for use cases like bucketi sub-partitioning/re-hash to reduce the number of hash prefixes. Having too many unique hash prefixes would make files too dispersed, and affect performance on other operations such as listing. -### Maintain mapping to files +### Maintaining Mapping to Files with Metadata Table In [RFC-15](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=147427331), we introduced an internal Metadata Table with a `files` partition that maintains mapping from partitions to list of files in the partition stored @@ -196,13 +198,75 @@ for metadata table to be populated. 4. If there is an error reading from Metadata table, we will not fall back listing from file system. -5. In case of metadata table getting corrupted or lost, we need to have a solution here to reconstruct metadata table -from the files which distributed using federated storage. We will likely have to implement a file system listing -logic, that can get all the partition to files mapping by listing all the prefixes under the `Table Storage Path`. -Following the folder structure of adding table name/partitions under the prefix will help in getting the listing and -identifying the table/partition they belong to. +### Integration +This section mainly describes how storage strategy is integrated with other components and how read/write +would look like from Hudi side with object storage layout. + +We propose integrating the storage strategy at the filesystem level, specifically within `HoodieWrapperFileSystem`. +This way, only file read/write operations undergo path conversion and we can limit the usage of +storage strategy to only filesystem level so other upper-level components don't need to be aware of physical paths. + +This also mandates that `HoodieWrapperFileSystem` is the filesystem of choice for all upper-level Hudi components. +Getting filesystem from `Path` or such won't be allowed anymore as using raw filesystem may not reach +to physical locations without storage strategy. Hudi components can simply call `HoodieMetaClient#getFs` +to get `HoodieWrapperFileSystem`, and this needs to be the only allowed way for any filesystem-related operation. +The only exception is when we need to interact with metadata that's still stored under the original table path, +and we should call `HoodieMetaClient#getRawFs` in this case so `HoodieMetaClient` can still be the single entry +for getting filesystem. + +![](wrapper_fs.png) + +When conducting a read operation, Hudi would: +1. Access filesystem view, `HoodieMetadataFileSystemView` specifically +2. Scan metadata table via filesystem view to compose `HoodieMetadataPayload` +3. 
+
+#### Considerations
+- Path conversion happens on the fly when reading/writing files. This saves Hudi from storing physical locations
+and adds the cost of hashing, but the performance burden should be negligible.
+- Since the table path and data path will most likely have different top-level folders/authorities,
+`HoodieWrapperFileSystem` should maintain at least two `FileSystem` objects: one to access the table path and another
+to access the storage path. `HoodieWrapperFileSystem` should intelligently tell whether it needs
+to convert a path by checking the path on the fly.
+- When using a Hudi file reader/writer implementation, we will need to pass `HoodieWrapperFileSystem` down
+to the parent reader. For instance, when using `HoodieAvroHFileReader`, we will need to pass `HoodieWrapperFileSystem`
+to `HFile.Reader` so it can have access to the storage strategy. If the reader/writer doesn't take a filesystem
+directly (e.g. `ParquetFileReader` only takes `Configuration` and `Path` for reading), then we will
+need to register `HoodieWrapperFileSystem` in the `Configuration` so it can be initialized/used later.
+
+### Repair Tool
+In case the metadata table gets corrupted or lost, we need a way to reconstruct it
+from the files that are distributed using federated storage. We will need a repair tool
+that obtains the full partition-to-files mapping by listing all the prefixes under the `Table Storage Path`
+and then reconstructs the metadata table.
+
+In Hudi we already have `HoodieBackedTableMetadataWriter`, which lists existing data files to initialize/construct the
+metadata table. We can extract its file-listing and partition-info logic into a dedicated method,
+then extend `HoodieBackedTableMetadataWriter` and override that method so
+the repair tool lists data files stored under the storage path instead of the table path.
 
-### Query Side Integration
+```java
+  public class StorageRepairMetadataWriter extends SparkHoodieBackedTableMetadataWriter {
+    StorageRepairMetadataWriter(Configuration hadoopConf,
+                                HoodieWriteConfig writeConfig,
+                                HoodieEngineContext engineContext,
+                                Option<String> inflightInstantTimestamp) {
+      super(hadoopConf, writeConfig, HoodieFailedWritesCleaningPolicy.EAGER, engineContext, inflightInstantTimestamp);
+    }
+
+    @Override
+    protected Map<String, Map<String, Long>> getPartitionToFilesMap() {
+      // Hypothetical helper: lists data files under the hashed prefixes of the Table Storage Path.
+      return listFilesUnderStoragePath();
+    }
+  }
+```
+
+### Query Engine Side Integration
 
 Spark, Hive, [Presto](https://github.com/prestodb/presto/commit/ef1fd25c582631513ccdd097e0a654cda44ec3dc), and
 [Trino](https://github.com/trinodb/trino/pull/10228) are already integrated to use metadata based listing.
@@ -224,4 +288,7 @@ should not be user's responsibility to enable metadata listing from query engine
 
 - We need a tool to bootstrap existing Hudi table to switch to another storage strategy.
 - Partition-level storage strategy: Each partition can have its own storage strategy for users to have finer grasp
 on how data is stored. It would also make new storage strategies more accessible for
-existing Hudi tables as they would only need to re-construct the metadata table.
\ No newline at end of file
+existing Hudi tables as they would only need to re-construct the metadata table.
+- For the first cut, we would only have 2 `FileSystem` objects in `HoodieWrapperFileSystem`, and this +prevents users from distributing their data across multiple different buckets. We'll need to support +this in the future. \ No newline at end of file diff --git a/rfc/rfc-60/wrapper_fs.png b/rfc/rfc-60/wrapper_fs.png new file mode 100644 index 000000000000..179d41b9c296 Binary files /dev/null and b/rfc/rfc-60/wrapper_fs.png differ diff --git a/rfc/rfc-76/rfc-76.md b/rfc/rfc-76/rfc-76.md index 1ddc107b5ce7..e9f176f1d5f7 100644 --- a/rfc/rfc-76/rfc-76.md +++ b/rfc/rfc-76/rfc-76.md @@ -61,7 +61,7 @@ Let's consider following scenario: while persisting the dataset, writing one of To provide for aforementioned requirement of the records obtaining globally unique synthetic keys either of the 2 following properties have to hold true: Key generation has to be deterministic and reproducible (so that upon Spark retries we could be certain same records will be obtaining the identity value they did during previous pass) Records have to be getting globally unique identity value every time (such that key collisions are simply impossible) -Note that, deterministic and reproducible identity value association is is only feasible for the incoming datasets represented as "determinate" RDDs. However, It's worth pointing out that other RDD classes (such as "unordered", "indeterminate") are very rare occurrences involving some inherent non-determinism (varying content, order, etc), and pose challenges in terms of their respective handling by Hudi even w/o auto-generation (for ex, for such RDDs Hudi can't provide for uniqueness guarantee even for "insert" operation in the presence of failures). +Note that, deterministic and reproducible identity value association is only feasible for the incoming datasets represented as "determinate" RDDs. However, It's worth pointing out that other RDD classes (such as "unordered", "indeterminate") are very rare occurrences involving some inherent non-determinism (varying content, order, etc), and pose challenges in terms of their respective handling by Hudi even w/o auto-generation (for ex, for such RDDs Hudi can't provide for uniqueness guarantee even for "insert" operation in the presence of failures). For achieving our goal of providing globally unique keys we're planning on relying on the following synthetic key format comprised of 2 components (Reserved) Commit timestamp: Use reserved commit timestamp as prefix (to provide for global uniqueness of rows) Row id: unique identifier of the row (record) w/in the provided batch diff --git a/scripts/ci/move_surefire_reports.sh b/scripts/ci/move_surefire_reports.sh new file mode 100755 index 000000000000..a4b9b2869bda --- /dev/null +++ b/scripts/ci/move_surefire_reports.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Check if two arguments were provided +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the first and second argument to SOURCE and DEST variables +SOURCE="$1" +DEST="$2" + +# Ensure the source directory exists +if [ ! -d "$SOURCE" ]; then + echo "Source directory does not exist: $SOURCE" + exit 1 +fi + +# Create the destination directory if it doesn't exist +if [ ! -d "$DEST" ]; then + mkdir -p "$DEST" +fi + +find "$SOURCE" -type f -name "TEST-*.xml" | while IFS= read -r file; do + # Extract the relative directory path + relative_path="${file#$SOURCE}" + destination_path="$DEST$relative_path" + destination_dir=$(dirname "$destination_path") + + if [[ "$relative_path" == *"scripts/ci"* ]]; then + continue # Skip this file + fi + + # Create the destination directory if it doesn't exist + mkdir -p "$destination_dir" + + # Move the file to the new location, preserving the directory structure + mv "$file" "$destination_path" +done diff --git a/scripts/pr_compliance.py b/scripts/pr_compliance.py index af7d9454f70f..dcd3c4c0caf4 100644 --- a/scripts/pr_compliance.py +++ b/scripts/pr_compliance.py @@ -108,7 +108,7 @@ def test_title(): # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # -#Enums for the the outcome of parsing a single line +#Enums for the outcome of parsing a single line class Outcomes: #error was found so we should stop parsing and exit with error ERROR = 0 @@ -389,21 +389,29 @@ def validate(self): #Generate the validator for the current template. #needs to be manually updated def make_default_validator(body, debug=False): - changelogs = ParseSectionData("CHANGELOGS", + changelogs = ParseSectionData("CHANGE_LOGS", "### Change Logs", {"_Describe context and summary for this change. Highlight if any code was copied._"}) impact = ParseSectionData("IMPACT", "### Impact", {"_Describe any public API or user-facing feature change or any performance impact._"}) - risklevel = RiskLevelData("RISKLEVEL", + risklevel = RiskLevelData("RISK_LEVEL", "### Risk level", {"_If medium or high, explain what verification was done to mitigate the risks._"}) + docsUpdate = ParseSectionData("DOCUMENTATION_UPDATE", + "### Documentation Update", + {"_Describe any necessary documentation update if there is any new feature, config, or user-facing change_", + "", + "- _The config description must be updated if new configs are added or the default value of the configs are changed. If not, put \"none\"._", + "- _Any new feature or user-facing change requires updating the Hudi website. 
Please create a Jira ticket, attach the", + " ticket number here and follow the [instruction](https://hudi.apache.org/contribute/developer-setup#website) to make", + " changes to the website._"}) checklist = ParseSectionData("CHECKLIST", "### Contributor's checklist", {}) - parseSections = ParseSections([changelogs, impact, risklevel, checklist]) + parseSections = ParseSections([changelogs, impact, risklevel, docsUpdate, checklist]) - return ValidateBody(body, "CHANGELOGS", parseSections, debug) + return ValidateBody(body, "CHANGE_LOGS", parseSections, debug) #takes a list of strings and returns a string of those lines separated by \n @@ -466,6 +474,21 @@ def test_body(): good_risklevel = template_risklevel.copy() good_risklevel[1] = "none" + template_docs_update = [ + "### Documentation Update", + "", + "_Describe any necessary documentation update if there is any new feature, config, or user-facing change_", + "", + "- _The config description must be updated if new configs are added or the default value of the configs are changed. If not, put \"none\"._", + "- _Any new feature or user-facing change requires updating the Hudi website. Please create a Jira ticket, attach the", + " ticket number here and follow the [instruction](https://hudi.apache.org/contribute/developer-setup#website) to make", + " changes to the website._", + "" + ] + + good_docs_update = template_docs_update.copy() + good_docs_update[1] = "update docs" + template_checklist = [ "### Contributor's checklist", "", @@ -476,10 +499,10 @@ def test_body(): ] #list of sections that when combined form a valid body - good_sections = [good_changelogs, good_impact, good_risklevel, template_checklist] + good_sections = [good_changelogs, good_impact, good_risklevel, good_docs_update, template_checklist] #list of sections that when combined form the template - template_sections = [template_changelogs, template_impact, template_risklevel, template_checklist] + template_sections = [template_changelogs, template_impact, template_risklevel, template_docs_update, template_checklist] tests_passed = True #Test section not filled out @@ -532,9 +555,6 @@ def test_body(): return tests_passed - - - if __name__ == '__main__': if len(sys.argv) > 1: title_tests = test_title()