Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mochijson:decode() doesn't handle surrogate pairs correctly #196

Open
sigsergv opened this issue Dec 28, 2017 · 5 comments
Open

mochijson:decode() doesn't handle surrogate pairs correctly #196

sigsergv opened this issue Dec 28, 2017 · 5 comments

Comments

@sigsergv
Copy link

mochijson:decode doesn't convert surrogate pairs in JSON into proper unicode characters.

For example (character 💩):

59> mochijson:decode("\"\\ud83d\\udca9\"").
[56489,55357]

But it should instead return:

59> mochijson:decode("\"\\ud83d\\udca9\"").
[128169]
@sigsergv
Copy link
Author

sigsergv commented Dec 28, 2017

Quick and dirty patch:

diff -uNr ChicagoBoss-orig/deps/mochiweb/src/mochijson.erl ChicagoBoss/deps/mochiweb/src/mochijson.erl
--- ChicagoBoss-orig/deps/mochiweb/src/mochijson.erl    2017-12-28 16:57:09.557338445 +0700
+++ ChicagoBoss/deps/mochiweb/src/mochijson.erl 2017-12-28 17:45:17.673802239 +0700
@@ -306,6 +306,21 @@
             decode_array(Rest, S1#decoder{state=any}, Acc)
     end.
 
+%% Called after high surrogate SP1 was read. A following \uDC00-\uDFFF escape is
+%% merged with SP1 into one code point; otherwise SP1 is kept as a lone unit
+%% and the remaining input is handed back to tokenize_string unchanged.
+tokenize_string_surrogate_pair(SP1, [$\\, $u, C3, C2, C1, C0 | Rest]=In, S, Acc) ->
+    C = dehex(C0) bor (dehex(C1) bsl 4) bor (dehex(C2) bsl 8) bor (dehex(C3) bsl 12),
+    if
+        C >= 16#DC00 andalso C =< 16#DFFF ->
+            Char = 16#10000 + ((SP1 - 16#D800) bsl 10) + (C - 16#DC00),
+            tokenize_string(Rest, ?ADV_COL(S, 6), [Char | Acc]);
+        true ->
+            tokenize_string(In, S, [SP1 | Acc])
+    end;
+tokenize_string_surrogate_pair(SP1, In, S, Acc) ->
+    tokenize_string(In, S, [SP1 | Acc]).
+
 tokenize_string(IoList=[C | _], S=#decoder{input_encoding=utf8}, Acc)
   when is_list(C); is_binary(C); C >= 16#7f ->
     List = xmerl_ucs:from_utf8(iolist_to_binary(IoList)),
@@ -334,7 +349,13 @@
         (dehex(C1) bsl 4) bor
         (dehex(C2) bsl 8) bor 
         (dehex(C3) bsl 12),
-    tokenize_string(Rest, ?ADV_COL(S, 6), [C | Acc]);
+    if
+        C >= 16#D800 andalso C =< 16#DBFF ->
+            %% Surrogate pair
+            tokenize_string_surrogate_pair(C, Rest, ?ADV_COL(S, 6), Acc);
+        true ->
+            tokenize_string(Rest, ?ADV_COL(S, 6), [C | Acc])
+    end;
 tokenize_string([C | Rest], S, Acc) when C >= $\s; C < 16#10FFFF ->
     tokenize_string(Rest, ?ADV_COL(S, 1), [C | Acc]).

@etrepum
Copy link
Member

etrepum commented Dec 28, 2017

A pull request with a test would be the preferred method of contribution for this, if you have the time

@sigsergv
Copy link
Author

I'm not sure that's actually a proper fix. I don't know Unicode that well.

@etrepum
Copy link
Member

etrepum commented Dec 28, 2017

Have you tried using mochijson2? UTF8 binaries are generally better to work with than lists of code points. mochijson exists only for compatibility reasons.

@sigsergv
Copy link
Author

We are planning to migrate to mochijson2 but at this moment we heavily depend upon unicode strings (with unicode characters).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants