Improve JSON scan and from_json #11702

Draft · wants to merge 7 commits into base: branch-24.12
6 changes: 2 additions & 4 deletions integration_tests/src/main/python/json_matrix_test.py
@@ -123,20 +123,18 @@ def test_json_tuple_allow_comments_off(std_input_path):
 @allow_non_gpu('FileSourceScanExec')
 @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
 def test_scan_json_allow_single_quotes_off(std_input_path, read_func, spark_tmp_table_factory):
-    assert_gpu_fallback_collect(
+    assert_gpu_and_cpu_are_equal_collect(
         read_func(std_input_path + '/' + WITH_SQ_FILE,
                   WITH_SQ_SCHEMA,
                   spark_tmp_table_factory,
                   {"allowSingleQuotes": "false"}),
-        'FileSourceScanExec',
         conf=_enable_all_types_json_scan_conf)

 @allow_non_gpu('ProjectExec', TEXT_INPUT_EXEC)
 def test_from_json_allow_single_quotes_off(std_input_path):
     schema = WITH_SQ_SCHEMA
-    assert_gpu_fallback_collect(
+    assert_gpu_and_cpu_are_equal_collect(
         lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_SQ_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowSingleQuotes': "false"})),
-        'JsonToStructs',
         conf =_enable_json_to_structs_conf)

 # On is the default so it really needs to work
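For reference, a minimal PySpark sketch (not part of this PR) of the behavior the two updated tests now cover: parsing with allowSingleQuotes disabled, where the CPU and GPU results are expected to match rather than falling back to the CPU. The SparkSession setup, app name, column name, and sample rows are illustrative assumptions.

# Illustrative only: from_json with allowSingleQuotes turned off.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.appName("single-quotes-off-sketch").getOrCreate()

df = spark.createDataFrame(
    [("{'name': 'Reynold Xin'}",),   # single-quoted: invalid when the option is off
     ('{"name": "Cazen Lee"}',)],    # double-quoted: always valid
    ["json"])

result = df.select(
    f.col("json"),
    f.from_json(f.col("json"), "name STRING",
                {"allowSingleQuotes": "false"}).alias("parsed"))

# The single-quoted row yields a null field while the double-quoted row parses normally;
# the updated tests assert that the GPU produces the same result as the CPU.
result.show(truncate=False)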
46 changes: 46 additions & 0 deletions integration_tests/src/main/python/json_test.py
@@ -679,6 +679,52 @@ def test_from_json_map():
             .select(f.from_json(f.col('a'), 'MAP<STRING,STRING>')),
         conf=_enable_all_types_conf)

+@allow_non_gpu(*non_utc_allow)
+def test_from_json_map_with_invalid():
+    # The test here is working around some inconsistencies in how the keys are parsed for maps:
+    # on the GPU the keys are dense, but on the CPU they are sparse
+    json_string_gen = StringGen(r'{"a": "[0-9]{0,5}"(, "b": "[A-Z]{0,5}")?}') \
+        .with_special_pattern('', weight=50) \
+        .with_special_pattern(' ', weight=50) \
+        .with_special_pattern('null', weight=50) \
+        .with_special_pattern('invalid', weight=50) \
+        .with_special_pattern(r'{"a": "[0-9]{0,5}"', weight=50) \
+        .with_special_pattern(r'{"a": "[0-9]{0,5}', weight=50) \
+        .with_special_pattern(r'{"a": "[0-9]{0,5}"}abc', weight=50) \
+        .with_special_pattern(r'{"a": "[0-9]{0,5}"}{"b": "B"}', weight=50)
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark : unary_op_df(spark, json_string_gen) \
+            .select(f.from_json(f.col('a'), 'MAP<STRING,STRING>')),
+        conf=_enable_all_types_conf)
+
+@allow_non_gpu(*non_utc_allow)
+@pytest.mark.parametrize('allow_single_quotes', ['true', 'false'])
+@pytest.mark.parametrize('allow_non_numeric_numbers', ['true', 'false'])
+@pytest.mark.parametrize('allow_unquoted_chars', ['true', 'false'])
+def test_from_json_map_with_options(allow_single_quotes,
+                                    allow_non_numeric_numbers, allow_unquoted_chars):
+    # Test the input with:
+    # - Double quotes
+    # - Single quotes
+    # - Numbers with leading zeros
+    # - Non-numeric numbers
+    # - Unquoted control characters in quoted strings
+    json_string_gen = StringGen(r'{"a": "[0-9]{0,5}"}') \
+        .with_special_pattern(r"""{'a': "[0-9]{0,5}"}""", weight=50) \
+        .with_special_pattern(r'{"a": 0[0-9]{0,5}}', weight=50) \
+        .with_special_pattern(r'{"a": [+-]?(INF|Infinity|NaN)}', weight=50) \
+        .with_special_pattern(r'{"(a|a\r\n\tb)": "(xyz|01\r\n\t23)"}', weight=50)
+    options = {"allowSingleQuotes": allow_single_quotes,
+               # Cannot test `allowNumericLeadingZeros==true` because the GPU output always has
+               # leading zeros while the CPU output does not, thus test will always fail.
+               "allowNumericLeadingZeros": "false",
+               "allowNonNumericNumbers": allow_non_numeric_numbers,
+               "allowUnquotedControlChars": allow_unquoted_chars}
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark : unary_op_df(spark, json_string_gen, length=20) \
+            .select(f.from_json(f.col('a'), 'MAP<STRING,STRING>', options)),
+        conf=_enable_all_types_conf)
+
 @allow_non_gpu('ProjectExec', 'JsonToStructs')
 def test_from_json_map_fallback():
     # The test here is working around some inconsistencies in how the keys are parsed for maps
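For reference, a minimal PySpark sketch (not part of this PR) of the kind of input and option combinations the new map tests exercise with from_json into MAP<STRING,STRING>. The SparkSession setup, sample rows, and chosen option values are illustrative assumptions.

# Illustrative only: from_json to a map type with explicit JSON parser options.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.appName("from-json-map-options-sketch").getOrCreate()

rows = [
    ('{"a": "12345"}',),        # double-quoted value
    ("{'a': \"12345\"}",),      # single-quoted key
    ('{"a": 012}',),            # number with a leading zero
    ('{"a": NaN}',),            # non-numeric number
    ('invalid',),               # not JSON at all
]
df = spark.createDataFrame(rows, ["json"])

options = {"allowSingleQuotes": "false",
           "allowNumericLeadingZeros": "false",
           "allowNonNumericNumbers": "true",
           "allowUnquotedControlChars": "true"}

df.select(f.col("json"),
          f.from_json(f.col("json"), "MAP<STRING,STRING>", options)).show(truncate=False)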
@@ -90,11 +90,6 @@ object GpuJsonScan {
       meta.willNotWorkOnGpu(s"$op does not support allowUnquotedFieldNames")
     }

-    // {'name': 'Reynold Xin'} turning single quotes off is not supported by CUDF
-    if (!options.allowSingleQuotes) {
-      meta.willNotWorkOnGpu(s"$op does not support disabling allowSingleQuotes")
-    }
-
     // {"name": "Cazen Lee", "price": "\$10"} is not supported by CUDF
     if (options.allowBackslashEscapingAnyCharacter) {
       meta.willNotWorkOnGpu(s"$op does not support allowBackslashEscapingAnyCharacter")
@@ -84,7 +84,8 @@ case class GpuJsonToStructs(
   override protected def doColumnar(input: GpuColumnVector): cudf.ColumnVector = {
     withResource(new NvtxRange("GpuJsonToStructs", NvtxColor.YELLOW)) { _ =>
       schema match {
-        case _: MapType => JSONUtils.extractRawMapFromJsonString(input.getBase)
+        case _: MapType =>
+          JSONUtils.extractRawMapFromJsonString(input.getBase, jsonOptionBuilder.build())
         case struct: StructType =>
           // if we ever need to support duplicate keys we need to keep track of the duplicates
           // and make the first one null, but I don't think this will ever happen in practice