Merge branch 'master' into python-new-timer
jingz-db authored Nov 14, 2024
2 parents a0a53cf + 0aee601 commit d998e48
Showing 202 changed files with 7,689 additions and 2,437 deletions.
45 changes: 31 additions & 14 deletions .github/workflows/build_and_test.yml
@@ -60,6 +60,8 @@ jobs:
image_url: ${{ steps.infra-image-outputs.outputs.image_url }}
image_docs_url: ${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}
image_docs_url_link: ${{ steps.infra-image-link.outputs.image_docs_url_link }}
image_lint_url: ${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}
image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -144,15 +146,25 @@ jobs:
IMG_NAME="apache-spark-ci-image-docs:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_docs_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Generate infra image URL (Linter)
id: infra-image-lint-outputs
run: |
# Convert to lowercase to meet Docker repo name requirement
REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
IMG_NAME="apache-spark-ci-image-lint:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_lint_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Link the docker images
id: infra-image-link
run: |
# Set the image URL for job "docs"
# Should delete the link and directly use image_docs_url after SPARK 3.x EOL
if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
else
echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT
echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT
fi
# Build: build Spark and run the tests for specified modules.
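A note on the two new steps above: the lint job now gets its own GHCR image, whose URL is composed from the lowercased repository owner, the branch, and the run ID, while the linking step falls back to the combined infra image on branch-3.5. A minimal Java sketch of that naming and fallback logic follows; the owner, branch, and run-ID values are placeholders, not taken from this commit.

    import java.util.Locale;

    public class LintImageUrlSketch {
        // Mirrors "Generate infra image URL (Linter)": Docker repository names must be
        // lowercase, so the repository owner is lowercased before composing the URL.
        static String lintImageUrl(String repoOwner, String branch, String runId) {
            String owner = repoOwner.toLowerCase(Locale.ROOT);
            return "ghcr.io/" + owner + "/apache-spark-ci-image-lint:" + branch + "-" + runId;
        }

        // Mirrors "Link the docker images": branch-3.5 keeps the combined infra image,
        // newer branches switch to the dedicated lint image.
        static String lintImageUrlLink(String branch, String combinedUrl, String lintUrl) {
            return branch.equals("branch-3.5") ? combinedUrl : lintUrl;
        }

        public static void main(String[] args) {
            String lintUrl = lintImageUrl("Apache", "master", "1234567890"); // placeholder inputs
            System.out.println(lintUrl); // ghcr.io/apache/apache-spark-ci-image-lint:master-1234567890
            System.out.println(lintImageUrlLink("master", "ghcr.io/apache/apache-spark-ci-image:master-1234567890", lintUrl));
        }
    }
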
@@ -382,6 +394,17 @@ jobs:
${{ needs.precondition.outputs.image_docs_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ inputs.branch }}
- name: Build and push (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
id: docker_build_lint
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/lint/
push: true
tags: |
${{ needs.precondition.outputs.image_lint_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ inputs.branch }}


pyspark:
@@ -648,8 +671,12 @@ jobs:
run: |
python3.11 -m pip install 'black==23.9.1' 'protobuf==5.28.3' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
python3.11 -m pip list
- name: Python CodeGen check
- name: Python CodeGen check for branch-3.5
if: inputs.branch == 'branch-3.5'
run: ./dev/connect-check-protos.py
- name: Python CodeGen check
if: inputs.branch != 'branch-3.5'
run: ./dev/check-protos.py

# Static analysis
lint:
@@ -667,7 +694,7 @@
PYSPARK_PYTHON: python3.9
GITHUB_PREV_SHA: ${{ github.event.before }}
container:
image: ${{ needs.precondition.outputs.image_url }}
image: ${{ needs.precondition.outputs.image_lint_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -741,18 +768,8 @@ jobs:
# Should delete this section after SPARK 3.5 EOL.
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
- name: Install Python dependencies for python linter and documentation generation
if: inputs.branch != 'branch-3.5'
run: |
# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
# See 'ipython_genutils' in SPARK-38517
# See 'docutils<0.18.0' in SPARK-39421
python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
ipython ipython_genutils sphinx_plotly_directive numpy pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
'pandas-stubs==1.2.0.53' 'grpcio==1.67.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
python3.9 -m pip list
- name: List Python packages
run: python3.9 -m pip list
- name: Python linter
run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
# Should delete this section after SPARK 3.5 EOL.
14 changes: 14 additions & 0 deletions .github/workflows/build_infra_images_cache.yml
@@ -28,6 +28,7 @@ on:
paths:
- 'dev/infra/Dockerfile'
- 'dev/spark-test-image/docs/Dockerfile'
- 'dev/spark-test-image/lint/Dockerfile'
- '.github/workflows/build_infra_images_cache.yml'
# Create infra image when cutting down branches/tags
create:
@@ -74,3 +75,16 @@ jobs:
- name: Image digest (Documentation)
if: hashFiles('dev/spark-test-image/docs/Dockerfile') != ''
run: echo ${{ steps.docker_build_docs.outputs.digest }}
- name: Build and push (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
id: docker_build_lint
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/lint/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }},mode=max
- name: Image digest (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
run: echo ${{ steps.docker_build_lint.outputs.digest }}
2 changes: 1 addition & 1 deletion build/mvn
@@ -56,7 +56,7 @@ install_app() {
local binary="${_DIR}/$6"
local remote_tarball="${mirror_host}/${url_path}${url_query}"
local local_checksum="${local_tarball}.${checksum_suffix}"
local remote_checksum="https://archive.apache.org/dist/${url_path}.${checksum_suffix}"
local remote_checksum="${mirror_host}/${url_path}.${checksum_suffix}${url_query}"

local curl_opts="--retry 3 --silent --show-error -L"
local wget_opts="--no-verbose"
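The one-line change above makes build/mvn fetch the checksum from the same mirror host and query string as the tarball itself, instead of always going to archive.apache.org. A hedged Java sketch of the before/after URL composition, with placeholder values for the mirror host, path, query, and checksum suffix:

    public class MvnChecksumUrlSketch {
        public static void main(String[] args) {
            // Placeholder values; build/mvn derives these from the Maven version and a mirror lookup.
            String mirrorHost = "https://downloads.apache.org";
            String urlPath = "maven/maven-3/3.9.9/binaries/apache-maven-3.9.9-bin.tar.gz";
            String urlQuery = "";
            String checksumSuffix = "sha512";

            String before = "https://archive.apache.org/dist/" + urlPath + "." + checksumSuffix;
            String after = mirrorHost + "/" + urlPath + "." + checksumSuffix + urlQuery;

            System.out.println("before: " + before);
            System.out.println("after:  " + after);
        }
    }
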
@@ -1023,12 +1023,14 @@ protected Collation buildCollation() {

@Override
protected CollationMeta buildCollationMeta() {
String language = ICULocaleMap.get(locale).getDisplayLanguage();
String country = ICULocaleMap.get(locale).getDisplayCountry();
return new CollationMeta(
CATALOG,
SCHEMA,
normalizedCollationName(),
ICULocaleMap.get(locale).getDisplayLanguage(),
ICULocaleMap.get(locale).getDisplayCountry(),
language.isEmpty() ? null : language,
country.isEmpty() ? null : country,
VersionInfo.ICU_VERSION.toString(),
COLLATION_PAD_ATTRIBUTE,
accentSensitivity == AccentSensitivity.AS,
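The buildCollationMeta() change above stops reporting empty display names: ICU returns an empty string when a locale has no language or country component, and the collation metadata now exposes that as null rather than "". A short illustration using java.util.Locale — the production code goes through ICU's ICULocaleMap, so treat this as an analogy, not the exact code path:

    import java.util.Locale;

    public class DisplayNameSketch {
        // Mirrors the new empty-to-null handling in buildCollationMeta().
        static String emptyToNull(String s) {
            return s.isEmpty() ? null : s;
        }

        public static void main(String[] args) {
            Locale english = Locale.forLanguageTag("en"); // language only, no country
            System.out.println(emptyToNull(english.getDisplayLanguage())); // "English" (with an English default locale)
            System.out.println(emptyToNull(english.getDisplayCountry()));  // null instead of ""
        }
    }
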
110 changes: 73 additions & 37 deletions common/utils/src/main/resources/error/error-conditions.json
@@ -1045,16 +1045,6 @@
"The input of <functionName> can't be <dataType> type data."
]
},
"UNSUPPORTED_UDF_INPUT_TYPE" : {
"message" : [
"UDFs do not support '<dataType>' as an input data type."
]
},
"UNSUPPORTED_UDF_OUTPUT_TYPE" : {
"message" : [
"UDFs do not support '<dataType>' as an output data type."
]
},
"VALUE_OUT_OF_RANGE" : {
"message" : [
"The <exprName> must be between <valueRange> (current value = <currentValue>)."
@@ -1111,6 +1101,12 @@
],
"sqlState" : "42K03"
},
"DATETIME_FIELD_OUT_OF_BOUNDS" : {
"message" : [
"<rangeMessage>. If necessary set <ansiConfig> to \"false\" to bypass this error."
],
"sqlState" : "22023"
},
"DATETIME_OVERFLOW" : {
"message" : [
"Datetime operation overflow: <operation>."
@@ -2022,8 +2018,20 @@
},
"INTERVAL_ARITHMETIC_OVERFLOW" : {
"message" : [
"<message>.<alternative>"
"Integer overflow while operating with intervals."
],
"subClass" : {
"WITHOUT_SUGGESTION" : {
"message" : [
"Try devising appropriate values for the interval parameters."
]
},
"WITH_SUGGESTION" : {
"message" : [
"Use <functionName> to tolerate overflow and return NULL instead."
]
}
},
"sqlState" : "22015"
},
"INTERVAL_DIVIDED_BY_ZERO" : {
@@ -2607,6 +2615,12 @@
},
"sqlState" : "22006"
},
"INVALID_INTERVAL_WITH_MICROSECONDS_ADDITION" : {
"message" : [
"Cannot add an interval to a date because its microseconds part is not 0. If necessary set <ansiConfig> to \"false\" to bypass this error."
],
"sqlState" : "22006"
},
"INVALID_INVERSE_DISTRIBUTION_FUNCTION" : {
"message" : [
"Invalid inverse distribution function <funcName>."
@@ -2655,6 +2669,12 @@
],
"sqlState" : "2203G"
},
"INVALID_JSON_RECORD_TYPE" : {
"message" : [
"Detected an invalid type of a JSON record while inferring a common schema in the mode <failFastMode>. Expected a STRUCT type, but found <invalidType>."
],
"sqlState" : "22023"
},
"INVALID_JSON_ROOT_FIELD" : {
"message" : [
"Cannot convert JSON root field to target Spark type."
@@ -2960,6 +2980,12 @@
},
"sqlState" : "42601"
},
"INVALID_PARTITION_VALUE" : {
"message" : [
"Failed to cast value <value> to data type <dataType> for partition column <columnName>. Ensure the value matches the expected data type for this partition column."
],
"sqlState" : "42846"
},
"INVALID_PROPERTY_KEY" : {
"message" : [
"<key> is an invalid property key, please use quotes, e.g. SET <key>=<value>."
@@ -3319,6 +3345,12 @@
],
"sqlState" : "22023"
},
"INVALID_VARIANT_SHREDDING_SCHEMA" : {
"message" : [
"The schema `<schema>` is not a valid variant shredding schema."
],
"sqlState" : "22023"
},
"INVALID_WHERE_CONDITION" : {
"message" : [
"The WHERE condition <condition> contains invalid expressions: <expressionList>.",
@@ -3379,6 +3411,12 @@
],
"sqlState" : "42K0L"
},
"LABEL_ALREADY_EXISTS" : {
"message" : [
"The label <label> already exists. Choose another name or rename the existing label."
],
"sqlState" : "42K0L"
},
"LOAD_DATA_PATH_NOT_EXISTS" : {
"message" : [
"LOAD DATA input path does not exist: <path>."
@@ -3642,6 +3680,19 @@
},
"sqlState" : "42601"
},
"NOT_ALLOWED_IN_PIPE_OPERATOR_WHERE" : {
"message" : [
"Not allowed in the pipe WHERE clause:"
],
"subClass" : {
"WINDOW_CLAUSE" : {
"message" : [
"WINDOW clause."
]
}
},
"sqlState" : "42601"
},
"NOT_A_CONSTANT_STRING" : {
"message" : [
"The expression <expr> used for the routine or clause <name> must be a constant STRING which is NOT NULL."
@@ -4233,12 +4284,6 @@
],
"sqlState" : "42802"
},
"STATEFUL_PROCESSOR_CANNOT_REINITIALIZE_STATE_ON_KEY" : {
"message" : [
"Cannot re-initialize state on the same grouping key during initial state handling for stateful processor. Invalid grouping key=<groupingKey>."
],
"sqlState" : "42802"
},
"STATEFUL_PROCESSOR_DUPLICATE_STATE_VARIABLE_DEFINED" : {
"message" : [
"State variable with name <stateVarName> has already been defined in the StatefulProcessor."
@@ -4697,6 +4742,12 @@
],
"sqlState" : "42KD9"
},
"UNANALYZABLE_EXPRESSION" : {
"message" : [
"The plan contains an unanalyzable expression <expr> that holds the analysis."
],
"sqlState" : "03000"
},
"UNBOUND_SQL_PARAMETER" : {
"message" : [
"Found the unbound parameter: <name>. Please, fix `args` and provide a mapping of the parameter to either a SQL literal or collection constructor functions such as `map()`, `array()`, `struct()`."
@@ -5373,6 +5424,11 @@
"message" : [
"Update column nullability for MySQL and MS SQL Server."
]
},
"WRITE_FOR_BINARY_SOURCE" : {
"message" : [
"Write for the binary file data source."
]
}
},
"sqlState" : "0A000"
@@ -6867,11 +6923,6 @@
"Sinks cannot request distribution and ordering in continuous execution mode."
]
},
"_LEGACY_ERROR_TEMP_2000" : {
"message" : [
"<message>. If necessary set <ansiConfig> to false to bypass this error."
]
},
"_LEGACY_ERROR_TEMP_2003" : {
"message" : [
"Unsuccessful try to zip maps with <size> unique keys due to exceeding the array size limit <maxRoundedArrayLength>."
@@ -7012,11 +7063,6 @@
"Unable to clear partition directory <path> prior to writing to it."
]
},
"_LEGACY_ERROR_TEMP_2058" : {
"message" : [
"Failed to cast value `<value>` to `<dataType>` for partition column `<columnName>`."
]
},
"_LEGACY_ERROR_TEMP_2059" : {
"message" : [
"End of stream."
@@ -7074,11 +7120,6 @@
"user-specified schema."
]
},
"_LEGACY_ERROR_TEMP_2075" : {
"message" : [
"Write is not supported for binary file data source."
]
},
"_LEGACY_ERROR_TEMP_2076" : {
"message" : [
"The length of <path> is <len>, which exceeds the max length allowed: <maxLength>."
@@ -7344,11 +7385,6 @@
"Malformed JSON."
]
},
"_LEGACY_ERROR_TEMP_2167" : {
"message" : [
"Malformed records are detected in schema inference. Parse Mode: <failFastMode>. Reasons: Failed to infer a common schema. Struct types are expected, but `<dataType>` was found."
]
},
"_LEGACY_ERROR_TEMP_2168" : {
"message" : [
"Decorrelate inner query through <plan> is not supported."
@@ -692,6 +692,7 @@ private[spark] object LogKeys {
case object RPC_ENDPOINT_REF extends LogKey
case object RPC_MESSAGE_CAPACITY extends LogKey
case object RPC_SSL_ENABLED extends LogKey
case object RULE_EXECUTOR_NAME extends LogKey
case object RULE_NAME extends LogKey
case object RUN_ID extends LogKey
case object SCALA_VERSION extends LogKey
@@ -193,6 +193,18 @@ public ObjectField getFieldAtIndex(int index) {
});
}

// Get the dictionary ID for the object field at the `index` slot. Throws malformedVariant if
// `index` is out of the bounds of `[0, objectSize())`.
// It is only legal to call it when `getType()` is `Type.OBJECT`.
public int getDictionaryIdAtIndex(int index) {
return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
if (index < 0 || index >= size) {
throw malformedVariant();
}
return readUnsigned(value, idStart + idSize * index, idSize);
});
}

// Get the number of array elements in the variant.
// It is only legal to call it when `getType()` is `Type.ARRAY`.
public int arraySize() {
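The new getDictionaryIdAtIndex accessor above bounds-checks the index against the object size and then reads an unsigned integer of idSize bytes at offset idStart + idSize * index. A self-contained Java sketch of that read; the names follow the diff, and the little-endian byte order is an assumption about the variant binary layout rather than something stated in this hunk:

    public class DictionaryIdSketch {
        // Reads an unsigned little-endian integer of `size` bytes starting at `offset`,
        // the same shape of read that readUnsigned(value, idStart + idSize * index, idSize) performs.
        static int readUnsignedLittleEndian(byte[] value, int offset, int size) {
            int result = 0;
            for (int i = 0; i < size; i++) {
                result |= (value[offset + i] & 0xFF) << (8 * i);
            }
            return result;
        }

        // Mirrors getDictionaryIdAtIndex: reject out-of-range indices, then read the id slot.
        static int dictionaryIdAtIndex(byte[] value, int idStart, int idSize, int objectSize, int index) {
            if (index < 0 || index >= objectSize) {
                throw new IllegalArgumentException("malformed variant: field index out of range");
            }
            return readUnsignedLittleEndian(value, idStart + idSize * index, idSize);
        }

        public static void main(String[] args) {
            // Toy layout: two 2-byte field ids stored back to back starting at offset 0.
            byte[] ids = {0x05, 0x00, 0x2A, 0x00};
            System.out.println(dictionaryIdAtIndex(ids, 0, 2, 2, 1)); // 42
        }
    }
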