diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 275a0458eb..68f89af70f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,7 +12,7 @@ jobs: - name: Use Node.js uses: actions/setup-node@v1 with: - node-version: "12.x" + node-version: "16.x" - run: npm ci - run: npm run build - run: npm test diff --git a/src/bytes.js b/src/bytes.js index 92b825516d..041907c288 100644 --- a/src/bytes.js +++ b/src/bytes.js @@ -5,7 +5,10 @@ const supportedEncodings = { utf: "utf-8", utf8: "utf-8", utf_8: "utf-8", + latin_1: "latin1", // browser spec ascii: "ascii", + utf16: "utf-16", + utf_16: "utf-16", }; var space_reg = /\s+/g; @@ -19,8 +22,8 @@ function normalizeEncoding(encoding) { return supported; } } -const Encoder = new TextEncoder(); -const Decoder = new TextDecoder(); +const UtfEncoder = new TextEncoder(); +const UtfDecoder = new TextDecoder(); /** * @constructor @@ -77,7 +80,12 @@ Sk.builtin.bytes = Sk.abstr.buildNativeClass("bytes", { if (args.length <= 1 && +kwargs.length === 0) { pySource = args[0]; } else { - [pySource, encoding, errors] = Sk.abstr.copyKeywordsToNamedArgs("bytes", [null, "pySource", "errors"], args, kwargs); + [pySource, encoding, errors] = Sk.abstr.copyKeywordsToNamedArgs( + "bytes", + [null, "encoding", "errors"], + args, + kwargs + ); ({ encoding, errors } = checkGetEncodingErrors("bytes", encoding, errors)); if (!Sk.builtin.checkString(pySource)) { throw new Sk.builtin.TypeError("encoding or errors without a string argument"); @@ -965,19 +973,25 @@ function checkGetEncodingErrors(funcname, encoding, errors) { return { encoding: encoding, errors: errors }; } +function checkErrorsIsValid(errors) { + if (!(errors === "strict" || errors === "ignore" || errors === "replace")) { + throw new Sk.builtin.LookupError( + "Unsupported or invalid error type '" + errors + "'" + ); + } +} + function strEncode(pyStr, encoding, errors) { const source = pyStr.$jsstr(); encoding = normalizeEncoding(encoding); - if (!(errors === "strict" || errors === "ignore" || errors === "replace")) { - throw new Sk.builtin.NotImplementedError("'" + errors + "' error handling not implemented in Skulpt"); - } + checkErrorsIsValid(errors); let uint8; if (encoding === "ascii") { uint8 = encodeAscii(source, errors); } else if (encoding === "utf-8") { - uint8 = Encoder.encode(source); + uint8 = UtfEncoder.encode(source); } else { - throw new Sk.builtin.LookupError("unknown encoding: " + encoding); + throw new Sk.builtin.LookupError("Unsupported or unknown encoding: '" + encoding + "'"); } return new Sk.builtin.bytes(uint8); } @@ -1040,8 +1054,8 @@ function decodeAscii(source, errors) { return final; } -function decodeUtf(source, errors) { - const string = Decoder.decode(source); +function decode(decoder, source, errors, encoding) { + const string = decoder.decode(source); if (errors === "replace") { return string; } else if (errors === "strict") { @@ -1050,7 +1064,7 @@ function decodeUtf(source, errors) { return string; } throw new Sk.builtin.UnicodeDecodeError( - "'utf-8' codec can't decode byte 0x" + source[i].toString(16) + " in position " + i + ": invalid start byte" + `'${encoding}' codec can't decode byte 0x ${source[i].toString(16)} in position ${i}: invalid start byte` ); } return string.replace(/�/g, ""); @@ -1060,17 +1074,21 @@ function bytesDecode(encoding, errors) { ({ encoding, errors } = checkGetEncodingErrors("decode", encoding, errors)); encoding = normalizeEncoding(encoding); - if (!(errors === "strict" || errors === "ignore" || errors === "replace")) { - throw new Sk.builtin.NotImplementedError("'" + errors + "' error handling not implemented in Skulpt"); - } + checkErrorsIsValid(errors); let jsstr; if (encoding === "ascii") { jsstr = decodeAscii(this.v, errors); } else if (encoding === "utf-8") { - jsstr = decodeUtf(this.v, errors); + jsstr = decode(UtfDecoder, this.v, errors, encoding); } else { - throw new Sk.builtin.LookupError("unknown encoding: " + encoding); + let decoder; + try { + decoder = new TextDecoder(encoding); + } catch (e) { + throw new Sk.builtin.LookupError(`Unsupported or unknown encoding: ${encoding}. ${e.message}`); + } + jsstr = decode(decoder, this.v, errors, encoding); } return new Sk.builtin.str(jsstr); } diff --git a/test/unit3/test_bytes.py b/test/unit3/test_bytes.py index e08bca5585..5c3bea6375 100644 --- a/test/unit3/test_bytes.py +++ b/test/unit3/test_bytes.py @@ -206,7 +206,7 @@ def test_compare_bytes_to_bytearray(self): self.assertEqual(bytes(b"abc") < b"ab", False) self.assertEqual(bytes(b"abc") <= b"ab", False) - def test_decode(self): + def test_decode_basic(self): a = bytes("abc", "ascii") b0 = [67,127,102] b = bytes(b0) @@ -217,6 +217,65 @@ def test_decode(self): self.assertRaises(TypeError, a.decode, [], "strict") self.assertRaises(TypeError, a.decode, "ascii", []) + def test_decode(self): + sample = "Hello world\n\u1234\u5678\u9abc" + encodings = { + "utf-8": b'Hello world\n\xe1\x88\xb4\xe5\x99\xb8\xe9\xaa\xbc', + "utf-16": b'\xff\xfeH\x00e\x00l\x00l\x00o\x00 \x00w\x00o\x00r\x00l\x00d\x00\n\x004\x12xV\xbc\x9a', + } + for enc in ("utf-8", "utf-16"): + # b = self.type2test(sample, enc) + b = encodings[enc] + self.assertEqual(b.decode(enc), sample) + # sample = "Hello world\n\x80\x81\xfe\xff" + # b = self.type2test(sample, "latin-1") + sample = "Hello world\nÆ" + b = b'Hello world\n\xc6' + self.assertRaises(UnicodeDecodeError, b.decode, "utf-8") + self.assertEqual(b.decode("utf-8", "ignore"), "Hello world\n") + self.assertEqual(b.decode(errors="ignore", encoding="utf-8"), + "Hello world\n") + # Default encoding is utf-8 + self.assertEqual(self.type2test(b'\xe2\x98\x83').decode(), '\u2603') + self.assertEqual(b.decode("latin-1"), sample) + + def test_check_encoding_errors(self): + # bpo-37388: bytes(str) and bytes.encode() must check encoding + # and errors arguments in dev mode + invalid = 'Boom, Shaka Laka, Boom!' + encodings = ('ascii', 'utf8', 'latin1') + type2test = self.type2test + for data in ('', 'short string'): + with self.assertRaises(LookupError): + type2test(data, encoding=invalid) + + for encoding in encodings: + try: + type2test(data, encoding=encoding, errors=invalid) + except LookupError: + pass + else: + self.fail() + + for data in (b'', b'short string'): + data = type2test(data) + with self.assertRaises(LookupError): + data.decode(encoding=invalid) + try: + data.decode(errors=invalid) + except LookupError: + pass + else: + self.fail() + + for encoding in encodings: + try: + data.decode(encoding=encoding, errors=invalid) + except LookupError: + pass + else: + self.fail() + def test_encode(self): a = "abc".encode("ascii") self.assertEqual(list(a), [97, 98, 99])