From 34fc68d481fea20368b6a9e0e1df698de98072cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sodr=C3=A9?= Date: Mon, 12 Apr 2021 10:32:25 -0400 Subject: [PATCH 1/3] Reproduce UTF8 substring issue Ref: #1064 --- .../json/modifier/functions/stringsTests.json | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/jolt-core/src/test/resources/json/modifier/functions/stringsTests.json b/jolt-core/src/test/resources/json/modifier/functions/stringsTests.json index 47aa33e5..ed60763a 100644 --- a/jolt-core/src/test/resources/json/modifier/functions/stringsTests.json +++ b/jolt-core/src/test/resources/json/modifier/functions/stringsTests.json @@ -3,7 +3,8 @@ "string": "the QuIcK brOwn fox", "zeroIndex" : 0, "threeIndex" : 3, - "trimMe" : " tuna " + "trimMe" : " tuna ", + "utf8String": "\uD83D\uDC47\uD83D\uDD34 UTF-8 String" }, "spec": { @@ -44,7 +45,8 @@ "custom2": "=substring('the quick brown fox', 16, 19)", // // verify that we can actually lookup start and end indices - "advancedLookupRanges" : "=substring(@(2,string), @(2,zeroIndex), @(2,threeIndex))" + "advancedLookupRanges" : "=substring(@(2,string), @(2,zeroIndex), @(2,threeIndex))", + "utf8Substring": "=substring(@(2,utf8String), 3, 15)" }, "trim" :{ "trimed" : "=trim(@(2,trimMe))" @@ -61,6 +63,7 @@ "string" : "the QuIcK brOwn fox", "zeroIndex" : 0, "threeIndex" : 3, + "utf8String": "\uD83D\uDC47\uD83D\uDD34 UTF-8 String", // // from the input, but overwritten by modify "trimMe" : "tuna", @@ -88,7 +91,8 @@ "basic2": "QuIcK", "custom1": "the quick brown", "custom2": "fox", - "advancedLookupRanges": "the" + "advancedLookupRanges": "the", + "utf8Substring": "UTF-8 String" }, "trim" :{ "trimed" : "tuna" From a4adb218ced3f2fa1f64a2e49fff0731262cf50b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sodr=C3=A9?= Date: Mon, 12 Apr 2021 11:42:47 -0400 Subject: [PATCH 2/3] Fix UTF8String substring issue Close #1064 ref: https://stackoverflow.com/questions/17524432/substring-or-characterat-method-for-utf8-strings-with-2-bytes-in-java --- .../java/com/bazaarvoice/jolt/modifier/function/Strings.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jolt-core/src/main/java/com/bazaarvoice/jolt/modifier/function/Strings.java b/jolt-core/src/main/java/com/bazaarvoice/jolt/modifier/function/Strings.java index 3af56224..73a48d31 100644 --- a/jolt-core/src/main/java/com/bazaarvoice/jolt/modifier/function/Strings.java +++ b/jolt-core/src/main/java/com/bazaarvoice/jolt/modifier/function/Strings.java @@ -101,8 +101,8 @@ protected Optional applyList(List argList) { // If we get here, then all these casts should work. String tuna = (String) argList.get(0); - int start = (Integer) argList.get(1); - int end = (Integer) argList.get(2); + int start = tuna.offsetByCodePoints(0, (Integer) argList.get(1)); + int end = tuna.offsetByCodePoints(0, (Integer) argList.get(2)); // do start and end make sense? if ( start >= end || start < 0 || end < 1 || end > tuna.length() ) { From 7d5b69deb54466d729403f81d82f489e00ff5448 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sodr=C3=A9?= Date: Mon, 12 Apr 2021 12:08:37 -0400 Subject: [PATCH 3/3] Fix size/length issue for UTF-8 Strings --- .../com/bazaarvoice/jolt/modifier/function/Objects.java | 2 +- .../test/resources/json/modifier/functions/sizeTests.json | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/jolt-core/src/main/java/com/bazaarvoice/jolt/modifier/function/Objects.java b/jolt-core/src/main/java/com/bazaarvoice/jolt/modifier/function/Objects.java index 382cf73b..1656ea42 100644 --- a/jolt-core/src/main/java/com/bazaarvoice/jolt/modifier/function/Objects.java +++ b/jolt-core/src/main/java/com/bazaarvoice/jolt/modifier/function/Objects.java @@ -286,7 +286,7 @@ else if(args[0] instanceof List ) { return Optional.of(((List) args[0]).size()); } else if(args[0] instanceof String) { - return Optional.of( ((String) args[0]).length() ); + return Optional.of(((String) args[0]).codePointCount(0, ((String) args[0]).length())); } else if(args[0] instanceof Map) { return Optional.of( ((Map) args[0]).size() ); diff --git a/jolt-core/src/test/resources/json/modifier/functions/sizeTests.json b/jolt-core/src/test/resources/json/modifier/functions/sizeTests.json index 07cdf5e1..bc923870 100644 --- a/jolt-core/src/test/resources/json/modifier/functions/sizeTests.json +++ b/jolt-core/src/test/resources/json/modifier/functions/sizeTests.json @@ -8,6 +8,7 @@ "legitList" : [ 1, "foo" ], "legitMap" : { "a":"b", "d":"e"}, "legitString" : "foo", + "legitUtf8String": "\uD83D\uDC47", "legitNumber" : 3.1415 }, @@ -20,6 +21,7 @@ "legitListSize": "=size(@(1,legitList))", "legitMapSize": "=size(@(1,legitMap))", "legitStringSize": "=size(@(1,legitString))", + "legitUtf8StringSize": "=size(@(1,legitUtf8String))", "legitNumberSize": "=size(@(1,legitNumber))" }, "context": {}, @@ -39,11 +41,13 @@ "legitList" : [ 1, "foo" ], "legitMap" : { "a":"b", "d":"e"}, "legitString" : "foo", + "legitUtf8String": "\uD83D\uDC47", "legitNumber" : 3.1415, "legitListSize": 2, "legitMapSize": 2, - "legitStringSize": 3 + "legitStringSize": 3, + "legitUtf8StringSize": 1 // legitNumberSize does not get created because there can be no value for it } }