From a4f1118f23cc7cfdb7e3d03abf7726740ff52af7 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Thu, 29 Feb 2024 14:21:11 -0800 Subject: [PATCH] Resolve path parsing issues in `get_json_object` (#15082) This PR addresses a parsing issue related to JSONPath by implementing distinct parsing rules for values inside and outside brackets. For instance, in `{ "A.B": 2, "'A": { "B'": 3 } }`, `$.'A.B'` differs from `$['A.B']`. (See [Assertible JSON Path Documentation](https://assertible.com/docs/guide/json-path)) The fix ensures accurate parsing of JSONPath values containing quotes. For example in `{ "A.B": 2, "'A": { "B'": 3 } }` | JSONPath | Before Fix | Spark | After Fix | |---------------|-------------------------------------------------------|----------------------|---------------------| | $.'A.B' | 2 | 3 | 3 | | $.'A | CUDF_FAIL("Encountered invalid JSONPath input string")| {"B'": 3} | {"B'": 3} | Resolves [12483](https://github.com/rapidsai/cudf/issues/12483). Authors: - Suraj Aralihalli (https://github.com/SurajAralihalli) - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15082 --- cpp/src/json/json_path.cu | 24 +++++++++--- cpp/tests/json/json_tests.cpp | 38 +++++++++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 10 ++++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 ++++++++ 4 files changed, 81 insertions(+), 7 deletions(-) diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 25f136e2336..ff42d9c8620 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -521,6 +521,14 @@ struct path_operator { int index{-1}; // index for subscript operator }; +/** + * @brief Enum to specify whether parsing values enclosed within brackets, like `['book']`. + */ +enum class bracket_state : bool { + INSIDE, ///< Parsing inside brackets + OUTSIDE ///< Parsing outside brackets +}; + /** * @brief Parsing class that holds the current state of the JSONPath string to be parsed * and provides functions for navigating through it. This is only called on the host @@ -541,7 +549,7 @@ class path_state : private parser { case '.': { path_operator op; string_view term{".[", 2}; - if (parse_path_name(op.name, term)) { + if (parse_path_name(op.name, term, bracket_state::OUTSIDE)) { // this is another potential use case for __SPARK_BEHAVIORS / configurability // Spark currently only handles the wildcard operator inside [*], it does // not handle .* @@ -564,7 +572,7 @@ class path_state : private parser { path_operator op; string_view term{"]", 1}; bool const is_string = *pos == '\''; - if (parse_path_name(op.name, term)) { + if (parse_path_name(op.name, term, bracket_state::INSIDE)) { pos++; if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') { op.type = path_operator_type::CHILD_WILDCARD; @@ -600,7 +608,8 @@ class path_state : private parser { private: cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'}; - bool parse_path_name(string_view& name, string_view const& terminators) + // b_state is set to INSIDE while parsing values enclosed within [ ], otherwise OUTSIDE + bool parse_path_name(string_view& name, string_view const& terminators, bracket_state b_state) { switch (*pos) { case '*': @@ -609,8 +618,11 @@ class path_state : private parser { break; case '\'': - if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; } - break; + if (b_state == bracket_state::INSIDE) { + if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; } + break; + } + // if not inside the [ ] -> go to default default: { size_t const chars_left = input_len - (pos - input); @@ -656,7 +668,7 @@ std::pair>, int> build_comma do { op = p_state.get_next_operator(); if (op.type == path_operator_type::ERROR) { - CUDF_FAIL("Encountered invalid JSONPath input string"); + CUDF_FAIL("Encountered invalid JSONPath input string", std::invalid_argument); } if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; } // convert pointer to device pointer diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp index 0894472dcc3..6c9050becc1 100644 --- a/cpp/tests/json/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -588,6 +588,15 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) }; EXPECT_THROW(query(), std::invalid_argument); } + + { + auto const input = cudf::test::strings_column_wrapper{R"({"a": "b"})"}; + auto const json_path = std::string{"${a}"}; + auto const query = [&]() { + auto const result = cudf::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), std::invalid_argument); + } } // queries that are legal, but reference invalid parts of the input @@ -1018,4 +1027,33 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls) do_test("$.tup[*].a.x", "[\"5\"]", "[null,null,null,\"5\"]"); } +TEST_F(JsonPathTests, QueriesContainingQuotes) +{ + std::string input_string = R"({"AB": 1, "A.B": 2, "'A": {"B'": 3}, "A": {"B": 4} })"; + + auto do_test = [&input_string](auto const& json_path_string, + auto const& expected_string, + bool const& expect_null = false) { + auto const input = cudf::test::strings_column_wrapper{input_string}; + auto const json_path = std::string{json_path_string}; + cudf::get_json_object_options options; + options.set_allow_single_quotes(true); + auto const result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); + auto const expected = + cudf::test::strings_column_wrapper{std::initializer_list{expected_string}, + std::initializer_list{!expect_null}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + }; + + // Set 1 + do_test(R"($.AB)", "1"); + do_test(R"($['A.B'])", "2"); + do_test(R"($.'A.B')", "3"); + do_test(R"($.A.B)", "4"); + + // Set 2 + do_test(R"($.'A)", R"({"B'": 3})"); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 1c4eb8a83ab..dd3859a4160 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -2452,7 +2452,15 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject( options.set_allow_single_quotes(allow_single_quotes); options.set_strip_quotes_from_single_strings(strip_quotes_from_single_strings); options.set_missing_fields_as_nulls(missing_fields_as_nulls); - return release_as_jlong(cudf::get_json_object(n_strings_col_view, *n_scalar_path, options)); + auto result_col_ptr = [&]() { + try { + return cudf::get_json_object(n_strings_col_view, *n_scalar_path, options); + } catch (std::invalid_argument const &err) { + auto const null_scalar = cudf::string_scalar(std::string(""), false); + return cudf::make_column_from_scalar(null_scalar, n_strings_col_view.size()); + } catch (...) { throw; } + }(); + return release_as_jlong(result_col_ptr); } CATCH_STD(env, 0) } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 75573046af2..bac4d1e4b3e 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -6405,6 +6405,22 @@ void testGetJSONObjectWithSingleQuotes() { } } +@Test +void testGetJSONObjectWithInvalidQueries() { + String jsonString = "{" + + "\'a\': \'A\"\'" + + "}"; + + GetJsonObjectOptions options = GetJsonObjectOptions.builder().allowSingleQuotes(true).build(); + try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString); + Scalar nullString = Scalar.fromString(null); + ColumnVector expectedAuthors = ColumnVector.fromScalar(nullString, 2); + Scalar path = Scalar.fromString("."); + ColumnVector gotAuthors = json.getJSONObject(path, options)) { + assertColumnsAreEqual(expectedAuthors, gotAuthors); + } +} + @Test void testMakeStructEmpty() { final int numRows = 10;