Merge pull request skulpt#1541 from s-cork/feature/bytes-decode-encodings

bytes: support decode encodings that the browser supports #noa
l3dlp-sandbox · Feb 29, 2024 · e2289c9 · e2289c9
2 parents 4b05db8 + b76858b
commit e2289c9
Show file tree

Hide file tree

Showing 3 changed files with 95 additions and 18 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -12,7 +12,7 @@ jobs:
       - name: Use Node.js
         uses: actions/setup-node@v1
         with:
-          node-version: "12.x"
+          node-version: "16.x"
       - run: npm ci
       - run: npm run build
       - run: npm test

diff --git a/src/bytes.js b/src/bytes.js
@@ -5,7 +5,10 @@ const supportedEncodings = {
     utf: "utf-8",
     utf8: "utf-8",
     utf_8: "utf-8",
+    latin_1: "latin1", // browser spec
     ascii: "ascii",
+    utf16: "utf-16",
+    utf_16: "utf-16",
 };
 
 var space_reg = /\s+/g;
@@ -19,8 +22,8 @@ function normalizeEncoding(encoding) {
         return supported;
     }
 }
-const Encoder = new TextEncoder();
-const Decoder = new TextDecoder();
+const UtfEncoder = new TextEncoder();
+const UtfDecoder = new TextDecoder();
 
 /**
  * @constructor
@@ -77,7 +80,12 @@ Sk.builtin.bytes = Sk.abstr.buildNativeClass("bytes", {
             if (args.length <= 1 && +kwargs.length === 0) {
                 pySource = args[0];
             } else {
-                [pySource, encoding, errors] = Sk.abstr.copyKeywordsToNamedArgs("bytes", [null, "pySource", "errors"], args, kwargs);
+                [pySource, encoding, errors] = Sk.abstr.copyKeywordsToNamedArgs(
+                    "bytes",
+                    [null, "encoding", "errors"],
+                    args,
+                    kwargs
+                );
                 ({ encoding, errors } = checkGetEncodingErrors("bytes", encoding, errors));
                 if (!Sk.builtin.checkString(pySource)) {
                     throw new Sk.builtin.TypeError("encoding or errors without a string argument");
@@ -965,19 +973,25 @@ function checkGetEncodingErrors(funcname, encoding, errors) {
     return { encoding: encoding, errors: errors };
 }
 
+function checkErrorsIsValid(errors) {
+    if (!(errors === "strict" || errors === "ignore" || errors === "replace")) {
+        throw new Sk.builtin.LookupError(
+            "Unsupported or invalid error type '" + errors + "'"
+        );
+    }
+}
+
 function strEncode(pyStr, encoding, errors) {
     const source = pyStr.$jsstr();
     encoding = normalizeEncoding(encoding);
-    if (!(errors === "strict" || errors === "ignore" || errors === "replace")) {
-        throw new Sk.builtin.NotImplementedError("'" + errors + "' error handling not implemented in Skulpt");
-    }
+    checkErrorsIsValid(errors);
     let uint8;
     if (encoding === "ascii") {
         uint8 = encodeAscii(source, errors);
     } else if (encoding === "utf-8") {
-        uint8 = Encoder.encode(source);
+        uint8 = UtfEncoder.encode(source);
     } else {
-        throw new Sk.builtin.LookupError("unknown encoding: " + encoding);
+        throw new Sk.builtin.LookupError("Unsupported or unknown encoding: '" + encoding + "'");
     }
     return new Sk.builtin.bytes(uint8);
 }
@@ -1040,8 +1054,8 @@ function decodeAscii(source, errors) {
     return final;
 }
 
-function decodeUtf(source, errors) {
-    const string = Decoder.decode(source);
+function decode(decoder, source, errors, encoding) {
+    const string = decoder.decode(source);
     if (errors === "replace") {
         return string;
     } else if (errors === "strict") {
@@ -1050,7 +1064,7 @@ function decodeUtf(source, errors) {
             return string;
         }
         throw new Sk.builtin.UnicodeDecodeError(
-            "'utf-8' codec can't decode byte 0x" + source[i].toString(16) + " in position " + i + ": invalid start byte"
+            `'${encoding}' codec can't decode byte 0x${source[i].toString(16)} in position ${i}: invalid start byte`
         );
     }
     return string.replace(/�/g, "");
@@ -1060,17 +1074,21 @@ function bytesDecode(encoding, errors) {
     ({ encoding, errors } = checkGetEncodingErrors("decode", encoding, errors));
     encoding = normalizeEncoding(encoding);
 
-    if (!(errors === "strict" || errors === "ignore" || errors === "replace")) {
-        throw new Sk.builtin.NotImplementedError("'" + errors + "' error handling not implemented in Skulpt");
-    }
+    checkErrorsIsValid(errors);
 
     let jsstr;
     if (encoding === "ascii") {
         jsstr = decodeAscii(this.v, errors);
     } else if (encoding === "utf-8") {
-        jsstr = decodeUtf(this.v, errors);
+        jsstr = decode(UtfDecoder, this.v, errors, encoding);
     } else {
-        throw new Sk.builtin.LookupError("unknown encoding: " + encoding);
+        let decoder;
+        try {
+            decoder = new TextDecoder(encoding);
+        } catch (e) {
+            throw new Sk.builtin.LookupError(`Unsupported or unknown encoding: ${encoding}. ${e.message}`);
+        }
+        jsstr = decode(decoder, this.v, errors, encoding);
     }
     return new Sk.builtin.str(jsstr);
 }

diff --git a/test/unit3/test_bytes.py b/test/unit3/test_bytes.py
@@ -206,7 +206,7 @@ def test_compare_bytes_to_bytearray(self):
         self.assertEqual(bytes(b"abc") < b"ab", False)
         self.assertEqual(bytes(b"abc") <= b"ab", False)
 
-    def test_decode(self):
+    def test_decode_basic(self):
         a = bytes("abc", "ascii")
         b0 = [67,127,102]
         b = bytes(b0)
@@ -217,6 +217,65 @@ def test_decode(self):
         self.assertRaises(TypeError, a.decode, [], "strict")
         self.assertRaises(TypeError, a.decode, "ascii", [])
 
+    def test_decode(self):
+        sample = "Hello world\n\u1234\u5678\u9abc"
+        encodings = {
+            "utf-8": b'Hello world\n\xe1\x88\xb4\xe5\x99\xb8\xe9\xaa\xbc',
+            "utf-16": b'\xff\xfeH\x00e\x00l\x00l\x00o\x00 \x00w\x00o\x00r\x00l\x00d\x00\n\x004\x12xV\xbc\x9a',
+        }
+        for enc in ("utf-8", "utf-16"):
+            # b = self.type2test(sample, enc)
+            b = encodings[enc]
+            self.assertEqual(b.decode(enc), sample)
+        # sample = "Hello world\n\x80\x81\xfe\xff"
+        # b = self.type2test(sample, "latin-1")
+        sample = "Hello world\nÆ"
+        b = b'Hello world\n\xc6'
+        self.assertRaises(UnicodeDecodeError, b.decode, "utf-8")
+        self.assertEqual(b.decode("utf-8", "ignore"), "Hello world\n")
+        self.assertEqual(b.decode(errors="ignore", encoding="utf-8"),
+                         "Hello world\n")
+        # Default encoding is utf-8
+        self.assertEqual(self.type2test(b'\xe2\x98\x83').decode(), '\u2603')
+        self.assertEqual(b.decode("latin-1"), sample)
+
+    def test_check_encoding_errors(self):
+        # bpo-37388: bytes(str) and bytes.encode() must check encoding
+        # and errors arguments in dev mode
+        invalid = 'Boom, Shaka Laka, Boom!'
+        encodings = ('ascii', 'utf8', 'latin1')
+        type2test = self.type2test
+        for data in ('', 'short string'):
+            with self.assertRaises(LookupError):
+                type2test(data, encoding=invalid)
+
+            for encoding in encodings:
+                try:
+                    type2test(data, encoding=encoding, errors=invalid)
+                except LookupError:
+                    pass
+                else:
+                    self.fail()
+
+        for data in (b'', b'short string'):
+            data = type2test(data)
+            with self.assertRaises(LookupError):
+                data.decode(encoding=invalid)
+            try:
+                data.decode(errors=invalid)
+            except LookupError:
+                pass
+            else:
+                self.fail()
+
+            for encoding in encodings:
+                try:
+                    data.decode(encoding=encoding, errors=invalid)
+                except LookupError:
+                    pass
+                else:
+                    self.fail()
+
     def test_encode(self):
         a = "abc".encode("ascii")
         self.assertEqual(list(a), [97, 98, 99])