Skip to content

Commit

Permalink
Merge 8.7
Browse files Browse the repository at this point in the history
  • Loading branch information
apnadkarni committed Feb 23, 2023
2 parents 9b7ebcf + 6c75773 commit 64d96db
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 40 deletions.
68 changes: 42 additions & 26 deletions generic/tclEncoding.c
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,13 @@ static Tcl_EncodingConvertProc Iso88591ToUtfProc;
*/

static const Tcl_ObjType encodingType = {
"encoding", FreeEncodingInternalRep, DupEncodingInternalRep, NULL, NULL
"encoding",
FreeEncodingInternalRep,
DupEncodingInternalRep,
NULL,
NULL
};

#define EncodingSetInternalRep(objPtr, encoding) \
do { \
Tcl_ObjInternalRep ir; \
Expand Down Expand Up @@ -488,7 +493,7 @@ FillEncodingFileMap(void)
map = Tcl_NewDictObj();
Tcl_IncrRefCount(map);

for (i = numDirs-1; i >= 0; i--) {
for (i = numDirs-1; i != TCL_INDEX_NONE; i--) {
/*
* Iterate backwards through the search path so that, as we overwrite
* entries found, we favor files earlier on the search path.
Expand Down Expand Up @@ -1209,7 +1214,7 @@ Tcl_ExternalToUtfDString(
* Tcl_ExternalToUtfDStringEx --
*
* Convert a source buffer from the specified encoding into UTF-8.
* The parameter flags controls the behavior, if any of the bytes in
* The parameter flags controls the behavior, if any of the bytes in
* the source buffer are invalid or cannot be represented in utf-8.
* Possible flags values are composed by OR-ing the following:
Expand Down Expand Up @@ -1482,8 +1487,9 @@ Tcl_UtfToExternalDStringEx(
char *dst;
Tcl_EncodingState state;
const Encoding *encodingPtr;
int dstLen, result, soFar, srcRead, dstWrote, dstChars;
int result, soFar, srcRead, dstWrote, dstChars;
const char *srcStart = src;
int dstLen;

Tcl_DStringInit(dstPtr);
dst = Tcl_DStringValue(dstPtr);
Expand Down Expand Up @@ -2594,8 +2600,8 @@ Utf32ToUtfProc(
{
const char *srcStart, *srcEnd;
const char *dstEnd, *dstStart;
int result, extra, numChars, charLimit = INT_MAX;
int ch = 0;
int result, numChars, charLimit = INT_MAX;
int ch = 0, bytesLeft = srcLen % 4;

flags |= PTR2INT(clientData);
if (flags & TCL_ENCODING_CHAR_LIMIT) {
Expand All @@ -2606,11 +2612,10 @@ Utf32ToUtfProc(
/*
* Check alignment with utf-32 (4 == sizeof(UTF-32))
*/
extra = srcLen % 4;
if (extra != 0) {
/* We have a truncated code unit */
if (bytesLeft != 0) {
/* We have a truncated code unit */
result = TCL_CONVERT_MULTIBYTE;
srcLen &= -4;
srcLen -= bytesLeft;
}

/*
Expand Down Expand Up @@ -2648,7 +2653,7 @@ Utf32ToUtfProc(
/* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
dst += Tcl_UniCharToUtf(-1, dst);
}

if ((unsigned)ch > 0x10FFFF || SURROGATE(ch)) {
if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_SYNTAX;
Expand Down Expand Up @@ -2679,16 +2684,22 @@ Utf32ToUtfProc(
}
/*
* If we had a truncated code unit at the end AND this is the last
* fragment AND profile is "replace", stick FFFD in its place.
* fragment AND profile is not "strict", stick FFFD in its place.
*/
if (extra && (flags & TCL_ENCODING_END) && PROFILE_REPLACE(flags)) {
src += extra; /* Go past truncated code unit */
if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
if (dst > dstEnd) {
result = TCL_CONVERT_NOSPACE;
} else {
dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
result = TCL_OK;
}
if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_SYNTAX;
} else {
/* PROFILE_REPLACE or PROFILE_TCL8 */
result = TCL_OK;
dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
numChars++;
src += bytesLeft; /* Go past truncated code unit */
}
}
}

*srcReadPtr = src - srcStart;
Expand Down Expand Up @@ -2837,7 +2848,7 @@ Utf16ToUtfProc(
{
const char *srcStart, *srcEnd;
const char *dstEnd, *dstStart;
int result, extra, numChars, charLimit = INT_MAX;
int result, numChars, charLimit = INT_MAX;
unsigned short ch = 0;

flags |= PTR2INT(clientData);
Expand All @@ -2850,8 +2861,7 @@ Utf16ToUtfProc(
* Check alignment with utf-16 (2 == sizeof(UTF-16))
*/

extra = srcLen % 2;
if (extra != 0) {
if ((srcLen % 2) != 0) {
result = TCL_CONVERT_MULTIBYTE;
srcLen--;
}
Expand Down Expand Up @@ -2909,16 +2919,22 @@ Utf16ToUtfProc(
}
/*
* If we had a truncated code unit at the end AND this is the last
* fragment AND profile is "replace", stick FFFD in its place.
* fragment AND profile is not "strict", stick FFFD in its place.
*/
if (extra && (flags & TCL_ENCODING_END) && PROFILE_REPLACE(flags)) {
++src;/* Go past the truncated code unit */
if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
if (dst > dstEnd) {
result = TCL_CONVERT_NOSPACE;
} else {
dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
result = TCL_OK;
}
if (PROFILE_STRICT(flags)) {
result = TCL_CONVERT_SYNTAX;
} else {
/* PROFILE_REPLACE or PROFILE_TCL8 */
result = TCL_OK;
dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
numChars++;
src++; /* Go past truncated code unit */
}
}
}

*srcReadPtr = src - srcStart;
Expand Down
14 changes: 7 additions & 7 deletions tests/cmdAH.test
Original file line number Diff line number Diff line change
Expand Up @@ -703,7 +703,7 @@ lappend encInvalidBytes {*}{
# happen when the sequence is at the end (including by itself). Thus {solo tail}
# in some cases.
lappend encInvalidBytes {*}{
utf-16le 41 tcl8 {} -1 {solo tail} {Truncated}
utf-16le 41 tcl8 \uFFFD -1 {solo tail} {Truncated}
utf-16le 41 replace \uFFFD -1 {solo tail} {Truncated}
utf-16le 41 strict {} 0 {solo tail} {Truncated}
utf-16le 00D8 tcl8 \uD800 -1 {} {Missing low surrogate}
Expand All @@ -719,13 +719,13 @@ lappend encInvalidBytes {*}{
# happen when the sequence is at the end (including by itself). Thus {solo tail}
# in some cases.
lappend encInvalidBytes {*}{
utf-32le 41 tcl8 {} -1 {solo tail} {Truncated}
utf-32le 41 tcl8 \uFFFD -1 {solo tail} {Truncated}
utf-32le 41 replace \uFFFD -1 {solo} {Truncated}
utf-32le 41 strict {} 0 {solo tail} {Truncated}
utf-32le 4100 tcl8 {} -1 {solo tail} {Truncated}
utf-32le 4100 tcl8 \uFFFD -1 {solo tail} {Truncated}
utf-32le 4100 replace \uFFFD -1 {solo} {Truncated}
utf-32le 4100 strict {} 0 {solo tail} {Truncated}
utf-32le 410000 tcl8 {} -1 {solo tail} {Truncated}
utf-32le 410000 tcl8 \uFFFD -1 {solo tail} {Truncated}
utf-32le 410000 replace \uFFFD -1 {solo} {Truncated}
utf-32le 410000 strict {} 0 {solo tail} {Truncated}
utf-32le 00D80000 tcl8 \uD800 -1 {} {High-surrogate}
Expand All @@ -744,9 +744,9 @@ lappend encInvalidBytes {*}{
utf-32le FFFFFFFF replace \UFFFD -1 {} {Out of range}
utf-32le FFFFFFFF strict {} 0 {} {Out of range}

utf-32be 41 tcl8 {} -1 {solo tail} {Truncated}
utf-32be 0041 tcl8 {} -1 {solo tail} {Truncated}
utf-32be 000041 tcl8 {} -1 {solo tail} {Truncated}
utf-32be 41 tcl8 \uFFFD -1 {solo tail} {Truncated}
utf-32be 0041 tcl8 \uFFFD -1 {solo tail} {Truncated}
utf-32be 000041 tcl8 \uFFFD -1 {solo tail} {Truncated}
utf-32be 0000D800 tcl8 \uD800 -1 {} {High-surrogate}
utf-32be 0000D800 replace \uFFFD -1 {} {High-surrogate}
utf-32be 0000D800 strict {} 0 {} {High-surrogate}
Expand Down
19 changes: 12 additions & 7 deletions tests/encoding.test
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ test encoding-16.17 {Utf32ToUtfProc} -body {
list [encoding convertfrom -profile strict -failindex idx utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00] [set idx]
} -result {A 4}

test encoding-16.9 {
test encoding-16.18 {
Utf16ToUtfProc, Tcl_UniCharToUtf, surrogate pairs in utf-16
} -body {
apply [list {} {
Expand All @@ -553,10 +553,15 @@ test encoding-16.9 {
return done
} [namespace current]]
} -result done




# Bug [d19fe0a5b]: three input bytes form one complete 16-bit code unit plus
# one leftover byte. Both bytes of the complete unit are 0x41, so the expected
# \u4141 is the same for either byte order; the leftover byte is expected to
# come back as U+FFFD. No -profile option is given, so this pins the behavior
# of whatever profile is the default.
test encoding-16.19 {UnicodeToUtfProc, bug [d19fe0a5b]} -body {
encoding convertfrom utf-16 "\x41\x41\x41"
} -result \u4141\uFFFD
# Bug [d19fe0a5b]: a single code unit whose two bytes are both 0xD8 decodes to
# U+D8D8, which lies in the high-surrogate range with no low surrogate after
# it. The expected result passes that value through unchanged. The test only
# runs when the "deprecated" constraint is enabled -- presumably because
# passing a lone surrogate through is legacy behavior; confirm.
test encoding-16.20 {UnicodeToUtfProc, bug [d19fe0a5b]} -constraints deprecated -body {
encoding convertfrom utf-16 "\xD8\xD8"
} -result \uD8D8
# Bug [d19fe0a5b]: six input bytes form one complete 32-bit code unit (all
# zero bytes, i.e. U+0000) plus two leftover bytes. The expected result is the
# NUL character followed by a single U+FFFD for the truncated tail.
# NOTE(review): the description says UnicodeToUtfProc although the input
# encoding is utf-32 -- confirm the label is intended.
test encoding-16.21 {UnicodeToUtfProc, bug [d19fe0a5b]} -body {
encoding convertfrom utf-32 "\x00\x00\x00\x00\x41\x41"
} -result \x00\uFFFD

test encoding-17.1 {UtfToUtf16Proc} -body {
encoding convertto utf-16 "\U460DC"
Expand Down Expand Up @@ -783,10 +788,10 @@ test encoding-24.19 {Parse valid or invalid utf-8} -constraints deprecated -body
} -result ZX\xED\xA0\x80
test encoding-24.20 {Parse with -profile tcl8 but without providing encoding} -body {
encoding convertfrom -profile tcl8 "\x20"
} -result {wrong # args: should be "::tcl::encoding::convertfrom ??-profile profile? ?-failindex var? ?encoding?? data"} -returnCodes error
} -result {wrong # args: should be "::tcl::encoding::convertfrom ? ?-profile profile? ?-failindex var? encoding ? data"} -returnCodes error
test encoding-24.21 {Parse with -profile tcl8 but without providing encoding} -body {
string length [encoding convertto -profile tcl8 "\x20"]
} -result {wrong # args: should be "::tcl::encoding::convertto ??-profile profile? ?-failindex var? ?encoding?? data"} -returnCodes error
} -result {wrong # args: should be "::tcl::encoding::convertto ? ?-profile profile? ?-failindex var? encoding ? data"} -returnCodes error
test encoding-24.22 {Syntax error, two encodings} -body {
encoding convertfrom iso8859-1 utf-8 "ZX\uD800"
} -result {bad option "iso8859-1": must be -profile or -failindex} -returnCodes error
Expand Down

0 comments on commit 64d96db

Please sign in to comment.