From e2e9d21e6e5500dbb96045ffe8c1ee8efd1b0339 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 21 Feb 2023 17:50:36 +0000 Subject: [PATCH 1/2] Proposed fix for [d19fe0a5b]: Handling incomplete byte sequences for utf-16/utf-32 --- generic/tclEncoding.c | 27 ++++++++++++++++++++++++--- tests/encoding.test | 6 ++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index dfa79071179..ecec6e97edb 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2457,21 +2457,27 @@ UnicodeToUtfProc( } result = TCL_OK; - /* check alignment with utf-16 (2 == sizeof(UTF-16)) */ + /* + * Check alignment with utf-16 (2 == sizeof(UTF-16)) + */ + if ((srcLen % 2) != 0) { result = TCL_CONVERT_MULTIBYTE; srcLen--; } +#if TCL_UTF_MAX > 3 /* - * If last code point is a high surrogate, we cannot handle that yet. + * If last code point is a high surrogate, we cannot handle that yet, + * unless we are at the end. */ - if ((srcLen >= 2) && + if (!(flags & TCL_ENCODING_END) && (srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { result = TCL_CONVERT_MULTIBYTE; srcLen-= 2; } +#endif srcStart = src; srcEnd = src + srcLen; @@ -2504,6 +2510,21 @@ UnicodeToUtfProc( src += sizeof(unsigned short); } + if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { + /* We have a single byte left-over at the end */ + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + } else { + /* destination is not full, so we really are at the end now */ + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_SYNTAX; + } else { + dst += Tcl_UniCharToUtf(0xFFFD, dst); + numChars++; + src++; + } + } + } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; diff --git a/tests/encoding.test b/tests/encoding.test index f558e01004e..f6f9abcc56b 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -419,6 +419,12 @@ test encoding-16.3 {UnicodeToUtfProc} -body { set val [encoding convertfrom unicode "\xDC\xDC"] list $val [format %X [scan $val %c]] } -result "\uDCDC DCDC" +test encoding-16.4 {UnicodeToUtfProc, bug [d19fe0a5b]} -body { + encoding convertfrom unicode "\x41\x41\x41" +} -result \u4141\uFFFD +test encoding-16.5 {UnicodeToUtfProc, bug [d19fe0a5b]} -constraints ucs2 -body { + encoding convertfrom unicode "\xD8\xD8" +} -result \uD8D8 test encoding-17.1 {UtfToUnicodeProc} -constraints fullutf -body { encoding convertto unicode "\U460DC" From 6c757731b2aae0ae8a452d3875d26f6b1a172ba9 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 22 Feb 2023 20:44:13 +0000 Subject: [PATCH 2/2] minor bug-fix in utf-16/utf-32: 2 testcases failed in Tcl 9 compatibility mode (-DTCL_NO_DEPRECATED) --- generic/tclEncoding.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 0490831ee09..8e13b43e48b 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -237,8 +237,13 @@ static Tcl_EncodingConvertProc Iso88591ToUtfProc; */ static const Tcl_ObjType encodingType = { - "encoding", FreeEncodingInternalRep, DupEncodingInternalRep, NULL, NULL + "encoding", + FreeEncodingInternalRep, + DupEncodingInternalRep, + NULL, + NULL }; + #define EncodingSetInternalRep(objPtr, encoding) \ do { \ Tcl_ObjInternalRep ir; \ @@ -461,7 +466,7 @@ FillEncodingFileMap(void) map = Tcl_NewDictObj(); Tcl_IncrRefCount(map); - for (i = numDirs-1; i >= 0; i--) { + for (i = numDirs-1; i != TCL_INDEX_NONE; i--) { /* * Iterate backwards through the search path so as we overwrite * entries found, we favor files earlier on the search path. @@ -1182,7 +1187,7 @@ Tcl_ExternalToUtfDString( * Tcl_ExternalToUtfDStringEx -- * * Convert a source buffer from the specified encoding into UTF-8. -* The parameter flags controls the behavior, if any of the bytes in + * The parameter flags controls the behavior, if any of the bytes in * the source buffer are invalid or cannot be represented in utf-8. * Possible flags values: * TCL_ENCODING_STOPONERROR: don't replace invalid characters/bytes but @@ -1458,8 +1463,9 @@ Tcl_UtfToExternalDStringEx( char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; - int dstLen, result, soFar, srcRead, dstWrote, dstChars; + int result, soFar, srcRead, dstWrote, dstChars; const char *srcStart = src; + int dstLen; Tcl_DStringInit(dstPtr); dst = Tcl_DStringValue(dstPtr); @@ -2627,9 +2633,10 @@ Utf32ToUtfProc( result = TCL_CONVERT_NOSPACE; } else { /* destination is not full, so we really are at the end now */ - if (flags & TCL_ENCODING_STOPONERROR) { + if ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) { result = TCL_CONVERT_SYNTAX; } else { + result = TCL_OK; dst += Tcl_UniCharToUtf(0xFFFD, dst); numChars++; src += bytesLeft; @@ -2854,9 +2861,10 @@ Utf16ToUtfProc( result = TCL_CONVERT_NOSPACE; } else { /* destination is not full, so we really are at the end now */ - if (flags & TCL_ENCODING_STOPONERROR) { + if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { result = TCL_CONVERT_SYNTAX; } else { + result = TCL_OK; dst += Tcl_UniCharToUtf(0xFFFD, dst); numChars++; src++;