Merge branch 'master' into python-new-timer
jingz-db authored Nov 14, 2024
2 parents a0a53cf + 0aee601 commit d998e48
Showing 202 changed files with 7,689 additions and 2,437 deletions.
45 changes: 31 additions & 14 deletions .github/workflows/build_and_test.yml
@@ -60,6 +60,8 @@ jobs:
image_url: ${{ steps.infra-image-outputs.outputs.image_url }}
image_docs_url: ${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}
image_docs_url_link: ${{ steps.infra-image-link.outputs.image_docs_url_link }}
image_lint_url: ${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}
image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -144,15 +146,25 @@ jobs:
IMG_NAME="apache-spark-ci-image-docs:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_docs_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Generate infra image URL (Linter)
id: infra-image-lint-outputs
run: |
# Convert to lowercase to meet Docker repo name requirement
REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
IMG_NAME="apache-spark-ci-image-lint:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_lint_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Link the docker images
id: infra-image-link
run: |
# Set the image URL for job "docs"
# Should delete the link and directly use image_docs_url after SPARK 3.x EOL
if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
else
echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT
echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT
fi
# Build: build Spark and run the tests for specified modules.
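A note on the two new steps above: the lint job now gets its own GHCR image, whose URL is composed from the lowercased repository owner, the branch, and the run ID, while the linking step falls back to the combined infra image on branch-3.5. A minimal Java sketch of that naming and fallback logic follows; the owner, branch, and run-ID values are placeholders, not taken from this commit.

    import java.util.Locale;

    public class LintImageUrlSketch {
        // Mirrors "Generate infra image URL (Linter)": Docker repository names must be
        // lowercase, so the repository owner is lowercased before composing the URL.
        static String lintImageUrl(String repoOwner, String branch, String runId) {
            String owner = repoOwner.toLowerCase(Locale.ROOT);
            return "ghcr.io/" + owner + "/apache-spark-ci-image-lint:" + branch + "-" + runId;
        }

        // Mirrors "Link the docker images": branch-3.5 keeps the combined infra image,
        // newer branches switch to the dedicated lint image.
        static String lintImageUrlLink(String branch, String combinedUrl, String lintUrl) {
            return branch.equals("branch-3.5") ? combinedUrl : lintUrl;
        }

        public static void main(String[] args) {
            String lintUrl = lintImageUrl("Apache", "master", "1234567890"); // placeholder inputs
            System.out.println(lintUrl); // ghcr.io/apache/apache-spark-ci-image-lint:master-1234567890
            System.out.println(lintImageUrlLink("master", "ghcr.io/apache/apache-spark-ci-image:master-1234567890", lintUrl));
        }
    }
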
@@ -382,6 +394,17 @@ jobs:
${{ needs.precondition.outputs.image_docs_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ inputs.branch }}
- name: Build and push (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
id: docker_build_lint
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/lint/
push: true
tags: |
${{ needs.precondition.outputs.image_lint_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ inputs.branch }}


pyspark:
@@ -648,8 +671,12 @@ jobs:
run: |
python3.11 -m pip install 'black==23.9.1' 'protobuf==5.28.3' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
python3.11 -m pip list
- name: Python CodeGen check
- name: Python CodeGen check for branch-3.5
if: inputs.branch == 'branch-3.5'
run: ./dev/connect-check-protos.py
- name: Python CodeGen check
if: inputs.branch != 'branch-3.5'
run: ./dev/check-protos.py

# Static analysis
lint:
@@ -667,7 +694,7 @@
PYSPARK_PYTHON: python3.9
GITHUB_PREV_SHA: ${{ github.event.before }}
container:
image: ${{ needs.precondition.outputs.image_url }}
image: ${{ needs.precondition.outputs.image_lint_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
@@ -741,18 +768,8 @@ jobs:
# Should delete this section after SPARK 3.5 EOL.
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
- name: Install Python dependencies for python linter and documentation generation
if: inputs.branch != 'branch-3.5'
run: |
# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
# See 'ipython_genutils' in SPARK-38517
# See 'docutils<0.18.0' in SPARK-39421
python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
ipython ipython_genutils sphinx_plotly_directive numpy pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
'pandas-stubs==1.2.0.53' 'grpcio==1.67.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
python3.9 -m pip list
- name: List Python packages
run: python3.9 -m pip list
- name: Python linter
run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
# Should delete this section after SPARK 3.5 EOL.
14 changes: 14 additions & 0 deletions .github/workflows/build_infra_images_cache.yml
@@ -28,6 +28,7 @@ on:
paths:
- 'dev/infra/Dockerfile'
- 'dev/spark-test-image/docs/Dockerfile'
- 'dev/spark-test-image/lint/Dockerfile'
- '.github/workflows/build_infra_images_cache.yml'
# Create infra image when cutting down branches/tags
create:
@@ -74,3 +75,16 @@ jobs:
- name: Image digest (Documentation)
if: hashFiles('dev/spark-test-image/docs/Dockerfile') != ''
run: echo ${{ steps.docker_build_docs.outputs.digest }}
- name: Build and push (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
id: docker_build_lint
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/lint/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }},mode=max
- name: Image digest (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
run: echo ${{ steps.docker_build_lint.outputs.digest }}
2 changes: 1 addition & 1 deletion build/mvn
@@ -56,7 +56,7 @@ install_app() {
local binary="${_DIR}/$6"
local remote_tarball="${mirror_host}/${url_path}${url_query}"
local local_checksum="${local_tarball}.${checksum_suffix}"
local remote_checksum="https://archive.apache.org/dist/${url_path}.${checksum_suffix}"
local remote_checksum="${mirror_host}/${url_path}.${checksum_suffix}${url_query}"

local curl_opts="--retry 3 --silent --show-error -L"
local wget_opts="--no-verbose"
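The one-line change above makes build/mvn fetch the checksum from the same mirror host and query string as the tarball itself, instead of always going to archive.apache.org. A hedged Java sketch of the before/after URL composition, with placeholder values for the mirror host, path, query, and checksum suffix:

    public class MvnChecksumUrlSketch {
        public static void main(String[] args) {
            // Placeholder values; build/mvn derives these from the Maven version and a mirror lookup.
            String mirrorHost = "https://downloads.apache.org";
            String urlPath = "maven/maven-3/3.9.9/binaries/apache-maven-3.9.9-bin.tar.gz";
            String urlQuery = "";
            String checksumSuffix = "sha512";

            String before = "https://archive.apache.org/dist/" + urlPath + "." + checksumSuffix;
            String after = mirrorHost + "/" + urlPath + "." + checksumSuffix + urlQuery;

            System.out.println("before: " + before);
            System.out.println("after:  " + after);
        }
    }
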
@@ -1023,12 +1023,14 @@ protected Collation buildCollation() {

@Override
protected CollationMeta buildCollationMeta() {
String language = ICULocaleMap.get(locale).getDisplayLanguage();
String country = ICULocaleMap.get(locale).getDisplayCountry();
return new CollationMeta(
CATALOG,
SCHEMA,
normalizedCollationName(),
ICULocaleMap.get(locale).getDisplayLanguage(),
ICULocaleMap.get(locale).getDisplayCountry(),
language.isEmpty() ? null : language,
country.isEmpty() ? null : country,
VersionInfo.ICU_VERSION.toString(),
COLLATION_PAD_ATTRIBUTE,
accentSensitivity == AccentSensitivity.AS,
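The buildCollationMeta() change above stops reporting empty display names: ICU returns an empty string when a locale has no language or country component, and the collation metadata now exposes that as null rather than "". A short illustration using java.util.Locale — the production code goes through ICU's ICULocaleMap, so treat this as an analogy, not the exact code path:

    import java.util.Locale;

    public class DisplayNameSketch {
        // Mirrors the new empty-to-null handling in buildCollationMeta().
        static String emptyToNull(String s) {
            return s.isEmpty() ? null : s;
        }

        public static void main(String[] args) {
            Locale english = Locale.forLanguageTag("en"); // language only, no country
            System.out.println(emptyToNull(english.getDisplayLanguage())); // "English" (with an English default locale)
            System.out.println(emptyToNull(english.getDisplayCountry()));  // null instead of ""
        }
    }
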
110 changes: 73 additions & 37 deletions common/utils/src/main/resources/error/error-conditions.json
@@ -1045,16 +1045,6 @@
"The input of <functionName> can't be <dataType> type data."
]
},
"UNSUPPORTED_UDF_INPUT_TYPE" : {
"message" : [
"UDFs do not support '<dataType>' as an input data type."
]
},
"UNSUPPORTED_UDF_OUTPUT_TYPE" : {
"message" : [
"UDFs do not support '<dataType>' as an output data type."
]
},
"VALUE_OUT_OF_RANGE" : {
"message" : [
"The <exprName> must be between <valueRange> (current value = <currentValue>)."
@@ -1111,6 +1101,12 @@
],
"sqlState" : "42K03"
},
"DATETIME_FIELD_OUT_OF_BOUNDS" : {
"message" : [
"<rangeMessage>. If necessary set <ansiConfig> to \"false\" to bypass this error."
],
"sqlState" : "22023"
},
"DATETIME_OVERFLOW" : {
"message" : [
"Datetime operation overflow: <operation>."
@@ -2022,8 +2018,20 @@
},
"INTERVAL_ARITHMETIC_OVERFLOW" : {
"message" : [
"<message>.<alternative>"
"Integer overflow while operating with intervals."
],
"subClass" : {
"WITHOUT_SUGGESTION" : {
"message" : [
"Try devising appropriate values for the interval parameters."
]
},
"WITH_SUGGESTION" : {
"message" : [
"Use <functionName> to tolerate overflow and return NULL instead."
]
}
},
"sqlState" : "22015"
},
"INTERVAL_DIVIDED_BY_ZERO" : {
@@ -2607,6 +2615,12 @@
},
"sqlState" : "22006"
},
"INVALID_INTERVAL_WITH_MICROSECONDS_ADDITION" : {
"message" : [
"Cannot add an interval to a date because its microseconds part is not 0. If necessary set <ansiConfig> to \"false\" to bypass this error."
],
"sqlState" : "22006"
},
"INVALID_INVERSE_DISTRIBUTION_FUNCTION" : {
"message" : [
"Invalid inverse distribution function <funcName>."
@@ -2655,6 +2669,12 @@
],
"sqlState" : "2203G"
},
"INVALID_JSON_RECORD_TYPE" : {
"message" : [
"Detected an invalid type of a JSON record while inferring a common schema in the mode <failFastMode>. Expected a STRUCT type, but found <invalidType>."
],
"sqlState" : "22023"
},
"INVALID_JSON_ROOT_FIELD" : {
"message" : [
"Cannot convert JSON root field to target Spark type."
@@ -2960,6 +2980,12 @@
},
"sqlState" : "42601"
},
"INVALID_PARTITION_VALUE" : {
"message" : [
"Failed to cast value <value> to data type <dataType> for partition column <columnName>. Ensure the value matches the expected data type for this partition column."
],
"sqlState" : "42846"
},
"INVALID_PROPERTY_KEY" : {
"message" : [
"<key> is an invalid property key, please use quotes, e.g. SET <key>=<value>."
@@ -3319,6 +3345,12 @@
],
"sqlState" : "22023"
},
"INVALID_VARIANT_SHREDDING_SCHEMA" : {
"message" : [
"The schema `<schema>` is not a valid variant shredding schema."
],
"sqlState" : "22023"
},
"INVALID_WHERE_CONDITION" : {
"message" : [
"The WHERE condition <condition> contains invalid expressions: <expressionList>.",
@@ -3379,6 +3411,12 @@
],
"sqlState" : "42K0L"
},
"LABEL_ALREADY_EXISTS" : {
"message" : [
"The label <label> already exists. Choose another name or rename the existing label."
],
"sqlState" : "42K0L"
},
"LOAD_DATA_PATH_NOT_EXISTS" : {
"message" : [
"LOAD DATA input path does not exist: <path>."
@@ -3642,6 +3680,19 @@
},
"sqlState" : "42601"
},
"NOT_ALLOWED_IN_PIPE_OPERATOR_WHERE" : {
"message" : [
"Not allowed in the pipe WHERE clause:"
],
"subClass" : {
"WINDOW_CLAUSE" : {
"message" : [
"WINDOW clause."
]
}
},
"sqlState" : "42601"
},
"NOT_A_CONSTANT_STRING" : {
"message" : [
"The expression <expr> used for the routine or clause <name> must be a constant STRING which is NOT NULL."
@@ -4233,12 +4284,6 @@
],
"sqlState" : "42802"
},
"STATEFUL_PROCESSOR_CANNOT_REINITIALIZE_STATE_ON_KEY" : {
"message" : [
"Cannot re-initialize state on the same grouping key during initial state handling for stateful processor. Invalid grouping key=<groupingKey>."
],
"sqlState" : "42802"
},
"STATEFUL_PROCESSOR_DUPLICATE_STATE_VARIABLE_DEFINED" : {
"message" : [
"State variable with name <stateVarName> has already been defined in the StatefulProcessor."
@@ -4697,6 +4742,12 @@
],
"sqlState" : "42KD9"
},
"UNANALYZABLE_EXPRESSION" : {
"message" : [
"The plan contains an unanalyzable expression <expr> that holds the analysis."
],
"sqlState" : "03000"
},
"UNBOUND_SQL_PARAMETER" : {
"message" : [
"Found the unbound parameter: <name>. Please, fix `args` and provide a mapping of the parameter to either a SQL literal or collection constructor functions such as `map()`, `array()`, `struct()`."
@@ -5373,6 +5424,11 @@
"message" : [
"Update column nullability for MySQL and MS SQL Server."
]
},
"WRITE_FOR_BINARY_SOURCE" : {
"message" : [
"Write for the binary file data source."
]
}
},
"sqlState" : "0A000"
@@ -6867,11 +6923,6 @@
"Sinks cannot request distribution and ordering in continuous execution mode."
]
},
"_LEGACY_ERROR_TEMP_2000" : {
"message" : [
"<message>. If necessary set <ansiConfig> to false to bypass this error."
]
},
"_LEGACY_ERROR_TEMP_2003" : {
"message" : [
"Unsuccessful try to zip maps with <size> unique keys due to exceeding the array size limit <maxRoundedArrayLength>."
@@ -7012,11 +7063,6 @@
"Unable to clear partition directory <path> prior to writing to it."
]
},
"_LEGACY_ERROR_TEMP_2058" : {
"message" : [
"Failed to cast value `<value>` to `<dataType>` for partition column `<columnName>`."
]
},
"_LEGACY_ERROR_TEMP_2059" : {
"message" : [
"End of stream."
@@ -7074,11 +7120,6 @@
"user-specified schema."
]
},
"_LEGACY_ERROR_TEMP_2075" : {
"message" : [
"Write is not supported for binary file data source."
]
},
"_LEGACY_ERROR_TEMP_2076" : {
"message" : [
"The length of <path> is <len>, which exceeds the max length allowed: <maxLength>."
@@ -7344,11 +7385,6 @@
"Malformed JSON."
]
},
"_LEGACY_ERROR_TEMP_2167" : {
"message" : [
"Malformed records are detected in schema inference. Parse Mode: <failFastMode>. Reasons: Failed to infer a common schema. Struct types are expected, but `<dataType>` was found."
]
},
"_LEGACY_ERROR_TEMP_2168" : {
"message" : [
"Decorrelate inner query through <plan> is not supported."
@@ -692,6 +692,7 @@ private[spark] object LogKeys {
case object RPC_ENDPOINT_REF extends LogKey
case object RPC_MESSAGE_CAPACITY extends LogKey
case object RPC_SSL_ENABLED extends LogKey
case object RULE_EXECUTOR_NAME extends LogKey
case object RULE_NAME extends LogKey
case object RUN_ID extends LogKey
case object SCALA_VERSION extends LogKey
@@ -193,6 +193,18 @@ public ObjectField getFieldAtIndex(int index) {
});
}

// Get the dictionary ID for the object field at the `index` slot. Throws malformedVariant if
// `index` is out of the bounds of `[0, objectSize())`.
// It is only legal to call it when `getType()` is `Type.OBJECT`.
public int getDictionaryIdAtIndex(int index) {
return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
if (index < 0 || index >= size) {
throw malformedVariant();
}
return readUnsigned(value, idStart + idSize * index, idSize);
});
}

// Get the number of array elements in the variant.
// It is only legal to call it when `getType()` is `Type.ARRAY`.
public int arraySize() {
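The new getDictionaryIdAtIndex accessor above bounds-checks the index against the object size and then reads an unsigned integer of idSize bytes at offset idStart + idSize * index. A self-contained Java sketch of that read; the names follow the diff, and the little-endian byte order is an assumption about the variant binary layout rather than something stated in this hunk:

    public class DictionaryIdSketch {
        // Reads an unsigned little-endian integer of `size` bytes starting at `offset`,
        // the same shape of read that readUnsigned(value, idStart + idSize * index, idSize) performs.
        static int readUnsignedLittleEndian(byte[] value, int offset, int size) {
            int result = 0;
            for (int i = 0; i < size; i++) {
                result |= (value[offset + i] & 0xFF) << (8 * i);
            }
            return result;
        }

        // Mirrors getDictionaryIdAtIndex: reject out-of-range indices, then read the id slot.
        static int dictionaryIdAtIndex(byte[] value, int idStart, int idSize, int objectSize, int index) {
            if (index < 0 || index >= objectSize) {
                throw new IllegalArgumentException("malformed variant: field index out of range");
            }
            return readUnsignedLittleEndian(value, idStart + idSize * index, idSize);
        }

        public static void main(String[] args) {
            // Toy layout: two 2-byte field ids stored back to back starting at offset 0.
            byte[] ids = {0x05, 0x00, 0x2A, 0x00};
            System.out.println(dictionaryIdAtIndex(ids, 0, 2, 2, 1)); // 42
        }
    }
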