Skip to content

Commit

Permalink
Merge pull request skulpt#1541 from s-cork/feature/bytes-decode-encod…
Browse files Browse the repository at this point in the history
…ings

bytes: support decode encodings that the browser supports #noa
  • Loading branch information
s-cork authored Feb 29, 2024
2 parents 4b05db8 + b76858b commit e2289c9
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 18 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
- name: Use Node.js
uses: actions/setup-node@v1
with:
node-version: "12.x"
node-version: "16.x"
- run: npm ci
- run: npm run build
- run: npm test
Expand Down
50 changes: 34 additions & 16 deletions src/bytes.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ const supportedEncodings = {
utf: "utf-8",
utf8: "utf-8",
utf_8: "utf-8",
latin_1: "latin1", // browser spec
ascii: "ascii",
utf16: "utf-16",
utf_16: "utf-16",
};

var space_reg = /\s+/g;
Expand All @@ -19,8 +22,8 @@ function normalizeEncoding(encoding) {
return supported;
}
}
// Module-level TextEncoder / TextDecoder instances, created once with no
// constructor arguments. Within this listing they are used by the "utf-8"
// branches of strEncode and bytesDecode.
// NOTE(review): `Encoder`/`Decoder` and `UtfEncoder`/`UtfDecoder` look like the
// before/after names of a rename shown side by side — confirm which pair the
// current file actually declares.
const Encoder = new TextEncoder();
const Decoder = new TextDecoder();
const UtfEncoder = new TextEncoder();
const UtfDecoder = new TextDecoder();

/**
* @constructor
Expand Down Expand Up @@ -77,7 +80,12 @@ Sk.builtin.bytes = Sk.abstr.buildNativeClass("bytes", {
if (args.length <= 1 && +kwargs.length === 0) {
pySource = args[0];
} else {
[pySource, encoding, errors] = Sk.abstr.copyKeywordsToNamedArgs("bytes", [null, "pySource", "errors"], args, kwargs);
[pySource, encoding, errors] = Sk.abstr.copyKeywordsToNamedArgs(
"bytes",
[null, "encoding", "errors"],
args,
kwargs
);
({ encoding, errors } = checkGetEncodingErrors("bytes", encoding, errors));
if (!Sk.builtin.checkString(pySource)) {
throw new Sk.builtin.TypeError("encoding or errors without a string argument");
Expand Down Expand Up @@ -965,19 +973,25 @@ function checkGetEncodingErrors(funcname, encoding, errors) {
return { encoding: encoding, errors: errors };
}

/**
 * Guard for the `errors` argument of the encode/decode helpers.
 *
 * Only the exact strings "strict", "ignore" and "replace" are accepted; the
 * comparison is strict, so differently-cased names and non-string values are
 * rejected as well.
 *
 * @param {*} errors - error-handler name to validate
 * @returns {undefined} nothing; returns normally when the name is accepted
 * @throws {Sk.builtin.LookupError} when `errors` is any other value
 */
function checkErrorsIsValid(errors) {
    switch (errors) {
        case "strict":
        case "ignore":
        case "replace":
            return;
        default:
            throw new Sk.builtin.LookupError("Unsupported or invalid error type '" + errors + "'");
    }
}

/**
 * Encode a Python str object into a new Sk.builtin.bytes.
 *
 * The JS string is read from `pyStr.$jsstr()`, the encoding name is passed
 * through normalizeEncoding, and the `errors` name is validated before any
 * encoding happens. Only the normalized names "ascii" and "utf-8" have an
 * encoder here; every other name ends in a LookupError.
 *
 * NOTE(review): this listing appears to show removed and added lines of a
 * change side by side (two `errors` checks, two consecutive `uint8 = ...`
 * assignments, two consecutive `throw`s). Confirm which line of each pair is
 * current before relying on the exact exception type or message.
 *
 * @param pyStr - object exposing `$jsstr()`; presumably a Sk.builtin.str — confirm against callers
 * @param encoding - encoding name, normalized via normalizeEncoding
 * @param errors - one of "strict", "ignore", "replace"
 * @returns a Sk.builtin.bytes built from the encoded Uint8Array
 */
function strEncode(pyStr, encoding, errors) {
const source = pyStr.$jsstr();
encoding = normalizeEncoding(encoding);
// Inline validation of `errors`; as written it runs before the helper call
// below and raises NotImplementedError rather than LookupError.
if (!(errors === "strict" || errors === "ignore" || errors === "replace")) {
throw new Sk.builtin.NotImplementedError("'" + errors + "' error handling not implemented in Skulpt");
}
// Helper-based validation of the same three names (raises LookupError).
checkErrorsIsValid(errors);
let uint8;
if (encoding === "ascii") {
// `errors` is forwarded only on the ascii path.
uint8 = encodeAscii(source, errors);
} else if (encoding === "utf-8") {
// `errors` is not consulted on the utf-8 path.
uint8 = Encoder.encode(source);
uint8 = UtfEncoder.encode(source);
} else {
// No encoder for any other name. The second throw is unreachable as written.
throw new Sk.builtin.LookupError("unknown encoding: " + encoding);
throw new Sk.builtin.LookupError("Unsupported or unknown encoding: '" + encoding + "'");
}
return new Sk.builtin.bytes(uint8);
}
Expand Down Expand Up @@ -1040,8 +1054,8 @@ function decodeAscii(source, errors) {
return final;
}

function decodeUtf(source, errors) {
const string = Decoder.decode(source);
function decode(decoder, source, errors, encoding) {
const string = decoder.decode(source);
if (errors === "replace") {
return string;
} else if (errors === "strict") {
Expand All @@ -1050,7 +1064,7 @@ function decodeUtf(source, errors) {
return string;
}
throw new Sk.builtin.UnicodeDecodeError(
"'utf-8' codec can't decode byte 0x" + source[i].toString(16) + " in position " + i + ": invalid start byte"
`'${encoding}' codec can't decode byte 0x ${source[i].toString(16)} in position ${i}: invalid start byte`
);
}
return string.replace(/\uFFFD/g, "");
Expand All @@ -1060,17 +1074,21 @@ function bytesDecode(encoding, errors) {
({ encoding, errors } = checkGetEncodingErrors("decode", encoding, errors));
encoding = normalizeEncoding(encoding);

if (!(errors === "strict" || errors === "ignore" || errors === "replace")) {
throw new Sk.builtin.NotImplementedError("'" + errors + "' error handling not implemented in Skulpt");
}
checkErrorsIsValid(errors);

let jsstr;
if (encoding === "ascii") {
jsstr = decodeAscii(this.v, errors);
} else if (encoding === "utf-8") {
jsstr = decodeUtf(this.v, errors);
jsstr = decode(UtfDecoder, this.v, errors, encoding);
} else {
throw new Sk.builtin.LookupError("unknown encoding: " + encoding);
let decoder;
try {
decoder = new TextDecoder(encoding);
} catch (e) {
throw new Sk.builtin.LookupError(`Unsupported or unknown encoding: ${encoding}. ${e.message}`);
}
jsstr = decode(decoder, this.v, errors, encoding);
}
return new Sk.builtin.str(jsstr);
}
Expand Down
61 changes: 60 additions & 1 deletion test/unit3/test_bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def test_compare_bytes_to_bytearray(self):
self.assertEqual(bytes(b"abc") < b"ab", False)
self.assertEqual(bytes(b"abc") <= b"ab", False)

def test_decode(self):
def test_decode_basic(self):
a = bytes("abc", "ascii")
b0 = [67,127,102]
b = bytes(b0)
Expand All @@ -217,6 +217,65 @@ def test_decode(self):
self.assertRaises(TypeError, a.decode, [], "strict")
self.assertRaises(TypeError, a.decode, "ascii", [])

def test_decode(self):
# Checks bytes.decode() against fixed utf-8 and utf-16 byte strings, then the
# strict and "ignore" error modes, the default encoding, and latin-1.
# NOTE(review): indentation is missing in this listing; the loop below
# presumably covers only the three lines up to the first assertEqual — confirm.
sample = "Hello world\n\u1234\u5678\u9abc"
# Expected encodings of `sample` are hard-coded here instead of being produced
# with self.type2test(sample, enc) (see the commented-out line in the loop).
encodings = {
"utf-8": b'Hello world\n\xe1\x88\xb4\xe5\x99\xb8\xe9\xaa\xbc',
"utf-16": b'\xff\xfeH\x00e\x00l\x00l\x00o\x00 \x00w\x00o\x00r\x00l\x00d\x00\n\x004\x12xV\xbc\x9a',
}
for enc in ("utf-8", "utf-16"):
# b = self.type2test(sample, enc)
b = encodings[enc]
self.assertEqual(b.decode(enc), sample)
# sample = "Hello world\n\x80\x81\xfe\xff"
# b = self.type2test(sample, "latin-1")
# Second scenario: a single trailing 0xC6 byte after ASCII text.
sample = "Hello world\nÆ"
b = b'Hello world\n\xc6'
# Strict utf-8 decoding is expected to reject the trailing byte ...
self.assertRaises(UnicodeDecodeError, b.decode, "utf-8")
# ... while "ignore" is expected to drop it, by position or by keyword.
self.assertEqual(b.decode("utf-8", "ignore"), "Hello world\n")
self.assertEqual(b.decode(errors="ignore", encoding="utf-8"),
"Hello world\n")
# Default encoding is utf-8
self.assertEqual(self.type2test(b'\xe2\x98\x83').decode(), '\u2603')
# The same bytes are expected to decode to the 'Æ' sample under latin-1.
self.assertEqual(b.decode("latin-1"), sample)

def test_check_encoding_errors(self):
# Verifies that an invalid `encoding` or `errors` name raises LookupError, both
# when constructing self.type2test from a str and when calling decode().
# NOTE(review): indentation is missing in this listing; each
# `for encoding in encodings` loop uses `data`, so it is presumably nested
# inside the preceding `for data in ...` loop — confirm.
# bpo-37388: bytes(str) and bytes.encode() must check encoding
# and errors arguments in dev mode
invalid = 'Boom, Shaka Laka, Boom!'
encodings = ('ascii', 'utf8', 'latin1')
type2test = self.type2test
# Construction from str: an empty and a non-empty input are both exercised.
for data in ('', 'short string'):
with self.assertRaises(LookupError):
type2test(data, encoding=invalid)

# Valid encoding name paired with an invalid errors name.
for encoding in encodings:
try:
type2test(data, encoding=encoding, errors=invalid)
except LookupError:
pass
else:
# Reached only when no LookupError was raised.
self.fail()

# Decoding: the same checks on empty and non-empty byte strings.
for data in (b'', b'short string'):
data = type2test(data)
with self.assertRaises(LookupError):
data.decode(encoding=invalid)
# Invalid errors name with the default encoding.
try:
data.decode(errors=invalid)
except LookupError:
pass
else:
self.fail()

# Invalid errors name with each explicitly named encoding.
for encoding in encodings:
try:
data.decode(encoding=encoding, errors=invalid)
except LookupError:
pass
else:
self.fail()

def test_encode(self):
a = "abc".encode("ascii")
self.assertEqual(list(a), [97, 98, 99])
Expand Down

0 comments on commit e2289c9

Please sign in to comment.