Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Buffer#toString throw on unsupported encodings #2418

Open
wants to merge 2 commits into
base: v2-maintenance
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ and agreed to irrevocably license their contributions under the Duktape
* Luis de Bethencourt (https://github.com/luisbg)
* Ian Whyman (https://github.com/v00d00)
* Rick Sayre (https://github.com/whorfin)
* SheetJS (https://github.com/SheetJS)

Other contributions
===================
Expand Down
50 changes: 46 additions & 4 deletions src-input/duk_bi_buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,42 @@ static duk_uint16_t duk__buffer_elemtype_copy_compatible[9] = {
};
#endif /* !DUK_USE_PREFER_SIZE */

#if defined(DUK_USE_BUFFEROBJECT_SUPPORT)
/* Buffer supported encodings */

#define DUK_BUF_ENC_UNKNOWN 0
#define DUK_BUF_ENC_UTF8 1

/* longest encoding string + 1 -- should be updated when longer strings are added */
#define DUK_BUFFER_ENCODING_MAX_LEN 7

#define DUK_BUFFER_ENCODING_COUNT 2
DUK_LOCAL const char * const duk__buffer_encoding_names[DUK_BUFFER_ENCODING_COUNT] = {
"utf8",
"utf-8"
};

DUK_LOCAL const duk_int_t duk__buffer_encoding_type_from_name[DUK_BUFFER_ENCODING_COUNT] = {
DUK_BUF_ENC_UTF8,
DUK_BUF_ENC_UTF8
};

DUK_LOCAL duk_int_t duk__parse_string_encoding(const char *encoding) {
duk_uint8_t i;
char buf[DUK_BUFFER_ENCODING_MAX_LEN];
/* the valid nodejs buffer encodings only contain letters numbers and hyphens */
for (i = 0; i < DUK_BUFFER_ENCODING_MAX_LEN; ++i) {
if (encoding[i] == 0) { buf[i] = 0; break; }
buf[i] = (char) (encoding[i] | 0x20);
}
Copy link
Owner

@svaarala svaarala Sep 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This loop works incorrectly if i reaches the maximum encoding length without finding a NUL terminator in the input: buf will remain unterminated and it's used for a strcmp() later.

Inputs like utf8\u0000bar are (technically) handled incorrectly, as the loop will terminate on the NUL.

ORing 0x20 on all input chars also results in some incorrect inputs accepted, e.g. "utf\u000d8" will be accepted because 0x0d | 0x20 = 0x2d which is '-'.

So overall I don't think approach is easy to make work.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if it'd produce less code to just short circuit compare the input, e.g.:

if ((p[0] | 0x20) == 'u' && (p[1] | 0x20) == 't' && (p[2] | 0x20) == 'f') {
    if (p[3] == '8' && encoding_len == 4) {
        /* utf8 */
    } else if (p[3] == '-' && p[4] == '8' && encoding_len == 5) {
        /* utf8 */
    }
}
/* invalid */

for (i = 0; i < DUK_BUFFER_ENCODING_COUNT; ++i) {
if(DUK_STRCMP((const char *)buf, duk__buffer_encoding_names[i]) == 0) return duk__buffer_encoding_type_from_name[i];
}
return DUK_BUF_ENC_UNKNOWN;
}
#undef DUK_BUFFER_ENCODING_COUNT
#endif /* DUK_USE_BUFFEROBJECT_SUPPORT */

DUK_LOCAL duk_hbufobj *duk__hbufobj_promote_this(duk_hthread *thr) {
duk_tval *tv_dst;
duk_hbufobj *res;
Expand Down Expand Up @@ -1183,11 +1219,14 @@ DUK_INTERNAL duk_ret_t duk_bi_uint8array_plainof(duk_hthread *thr) {

#if defined(DUK_USE_BUFFEROBJECT_SUPPORT)
DUK_INTERNAL duk_ret_t duk_bi_nodejs_buffer_tostring(duk_hthread *thr) {
const char *encoding;
duk_int_t encoding_type;
duk_hbufobj *h_this;
duk_int_t start_offset, end_offset;
duk_uint8_t *buf_slice;
duk_size_t slice_length;


h_this = duk__get_bufobj_this(thr);
if (h_this == NULL) {
/* XXX: happens e.g. when evaluating: String(Buffer.prototype). */
Expand All @@ -1196,7 +1235,11 @@ DUK_INTERNAL duk_ret_t duk_bi_nodejs_buffer_tostring(duk_hthread *thr) {
}
DUK_HBUFOBJ_ASSERT_VALID(h_this);

/* Ignore encoding for now. */
encoding = duk_opt_string(thr, 0, "utf8");
encoding_type = duk__parse_string_encoding(encoding);
if(encoding_type == DUK_BUF_ENC_UNKNOWN) {
DUK_DCERROR_TYPE_INVALID_ARGS(thr);
}
SheetJSDev marked this conversation as resolved.
Show resolved Hide resolved

duk__clamp_startend_nonegidx_noshift(thr,
(duk_int_t) h_this->length,
Expand Down Expand Up @@ -1232,6 +1275,7 @@ DUK_INTERNAL duk_ret_t duk_bi_nodejs_buffer_tostring(duk_hthread *thr) {
*/
duk_replace(thr, 0);
duk_set_top(thr, 1);
/* TODO: support other encodings. currently only 'utf8' is supported. */
return duk_textdecoder_decode_utf8_nodejs(thr);
}
#endif /* DUK_USE_BUFFEROBJECT_SUPPORT */
Expand Down Expand Up @@ -2060,11 +2104,9 @@ DUK_INTERNAL duk_ret_t duk_bi_buffer_slice_shared(duk_hthread *thr) {
DUK_INTERNAL duk_ret_t duk_bi_nodejs_buffer_is_encoding(duk_hthread *thr) {
const char *encoding;

/* only accept lowercase 'utf8' now. */

encoding = duk_to_string(thr, 0);
DUK_ASSERT(duk_is_string(thr, 0)); /* guaranteed by duk_to_string() */
duk_push_boolean(thr, DUK_STRCMP(encoding, "utf8") == 0);
duk_push_boolean(thr, duk__parse_string_encoding(encoding) != DUK_BUF_ENC_UNKNOWN);
return 1;
}
#endif /* DUK_USE_BUFFEROBJECT_SUPPORT */
Expand Down
17 changes: 9 additions & 8 deletions tests/ecmascript/test-bi-nodejs-buffer-isencoding.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,17 @@ isEncoding test
empty: false
undefined: false
utf8: true
utf-8: false
UTF8: false
UTF-8: false
Utf8: false
Utf-8: false
uTf8: false
uTf-8: false
utf-8: true
UTF8: true
UTF-8: true
Utf8: true
Utf-8: true
uTf8: true
uTf-8: true
ascii: false
ASCII: false
AsCiI: false
binary: false
dummy: false
undefined: false
null: false
Expand All @@ -41,14 +42,14 @@ function isEncodingTest() {

[
// Any capitalization (and dash / no dash) is accepted by Node.js.
// Duktape accepts 'utf8' only for now.
'utf8', 'utf-8', 'UTF8', 'UTF-8',
'Utf8', 'Utf-8', 'uTf8', 'uTf-8',

// Duktape doesn't support 'ascii' etc.
'ascii',
'ASCII',
'AsCiI',
'binary',
'dummy',

// Non-string values
Expand Down
24 changes: 20 additions & 4 deletions tests/ecmascript/test-bi-nodejs-buffer-tostring.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ true
"ABC"
"ABC"
"ABC"
"ABC"
"ABC"
"ABC"
"TypeError"
"TypeError"
"DEFG"
"EFG"
"E"
Expand Down Expand Up @@ -241,20 +246,31 @@ function nodejsBufferToStringTest() {
// buf.toString([encoding], [start], [end])

// Without arguments encoding defaults to UTF-8 and the entire
// buffer is converted to string. At least undefined and null
// buffer is converted to string. At least undefined
// are accepted as "not defined" for encoding.
b = new Buffer('ABC');
safePrintString(b.toString());
safePrintString(b.toString(undefined));
safePrintString(b.toString(null));

// supported encodings
safePrintString(b.toString("utf8"));
safePrintString(b.toString("utf-8"));

// encodings are case insensitive
safePrintString(b.toString("UtF8"));
safePrintString(b.toString("uTf-8"));

// invalid encodings should throw a TypeError
try { safePrintString(b.toString(null)); } catch(e) { safePrintString(e.name); }
try { safePrintString(b.toString("wtf")); } catch(e) { safePrintString(e.name); }

// If the buffer is a slice of an underlying buffer, only that slice
// is string converted. Offsets are relative to the slice.
b = new Buffer('ABCDEFGH');
b = b.slice(3, 7); // DEFG
safePrintString(b.toString());
safePrintString(b.toString(null, 1));
safePrintString(b.toString(null, 1, 2));
safePrintString(b.toString(undefined, 1));
safePrintString(b.toString(undefined, 1, 2));

// When the buffer data is legal UTF-8 and the chosen encoding
// is UTF-8 (default), Duktape internal representation is correct
Expand Down