From 62b996d09df36eae147362511cabfc3b2368dedd Mon Sep 17 00:00:00 2001 From: Reuben Thomas Date: Tue, 15 Feb 2022 16:09:35 +0000 Subject: [PATCH] Fix error handling with iconv (fix #38) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When --strict is given, add //IGNORE to the encoding (remove manually set “-ignore” encodings). Return an error when the input is untranslatable, unless --force is also given. Only modify the to-encoding in this way, not the from-encoding, as that doesn’t do anything. --- doc/recode.texi | 19 +++++++++---- src/iconv.c | 69 +++++++++++++++++++++++++----------------------- src/main.c | 12 ++++++--- src/outer.c | 2 ++ src/recode.h | 4 ++- src/recodext.h | 7 +++++ tables.py | 4 +-- tests/Recode.pyx | 4 +++ 8 files changed, 76 insertions(+), 45 deletions(-) diff --git a/doc/recode.texi b/doc/recode.texi index 9d0cd02..236b264 100644 --- a/doc/recode.texi +++ b/doc/recode.texi @@ -1803,9 +1803,20 @@ external @code{iconv} library. This means that the charsets and aliases provided by the @code{iconv} external library and not by Recode itself are not available. +@item RECODE_STRICT_MAPPING_FLAG + +When this flag is set (corresponding to the @samp{--strict} command-line +option), untranslatable characters are discarded, but an error is +returned on completion unless @code{RECODE_FORCE_FLAG} is also set. + +@item RECODE_FORCE_FLAG + +When this flag is set (corresponding to the @samp{--force} command-line +option), errors caused by untranslatable characters are ignored. + @end table -In previous incatations of the Recode library, @var{flags} +In previous incarnations of the Recode library, @var{flags} was a Boolean instead of a collection of flags, meant to set @code{RECODE_AUTO_ABORT_FLAG}. This still works, but is deprecated. @@ -2945,10 +2956,8 @@ an external @code{iconv} library, as they likely share many charsets. 
We discuss, here, the issues related to this duplication, and other peculiarities specific to the @code{iconv} library. -If the string @code{-ignore} is appended to the @var{after} encoding, -characters that cannot be converted are discarded and an error is -printed after conversion. This corresponds to the @code{iconv} option -@code{//IGNORE}. +The @code{strict_mapping} request option is implemented by adding +the @code{iconv} option @code{//IGNORE} to the @var{after} encoding. If the string @code{-translit} is appended to the @var{after} encoding, characters being converted are transliterated when needed and possible. diff --git a/src/iconv.c b/src/iconv.c index 543e4f4..f514f24 100644 --- a/src/iconv.c +++ b/src/iconv.c @@ -1,5 +1,5 @@ /* Conversion of files between different charsets and surfaces. - Copyright © 1999, 2000, 2001, 2008 Free Software Foundation, Inc. + Copyright © 1999-2022 Free Software Foundation, Inc. Contributed by François Pinard , 1999, and Bruno Haible , 2000. @@ -28,6 +28,18 @@ | Use `iconv' to handle a double step. | `--------------------------------------*/ +static void +do_iconv (RECODE_OUTER outer, + iconv_t conversion, + char **input, size_t *input_left, + char **output, size_t *output_left, + int *saved_errno) +{ + size_t converted = iconv (conversion, input, input_left, output, output_left); + if (converted == (size_t) -1 && !(errno == EILSEQ && outer->force)) + *saved_errno = errno; +} + #define BUFFER_SIZE 2048 static bool @@ -48,15 +60,16 @@ wrapped_transform (iconv_t conversion, RECODE_SUBTASK subtask) size_t input_left = 0; size_t output_left = BUFFER_SIZE; int saved_errno = 0; - size_t converted; if (drain_first) { /* Drain all accumulated partial state and emit output to return to the initial shift state. 
*/ - converted = iconv (conversion, NULL, NULL, &output, &output_left); - if (converted == (size_t) -1) - saved_errno = errno; + do_iconv (subtask->task->request->outer, + conversion, + NULL, NULL, + &output, &output_left, + &saved_errno); } if (saved_errno == 0) @@ -84,11 +97,11 @@ wrapped_transform (iconv_t conversion, RECODE_SUBTASK subtask) /* Convert accumulated input and add it to the output buffer. */ input = input_buffer; input_left = cursor - input_buffer; - converted = iconv (conversion, - &input, &input_left, - &output, &output_left); - if (converted == (size_t) -1) - saved_errno = errno; + do_iconv (subtask->task->request->outer, + conversion, + &input, &input_left, + &output, &output_left, + &saved_errno); } } @@ -166,30 +179,21 @@ ends_with (const char *s, size_t s_len, const char *suff, size_t suff_len) } static char * -iconv_fix_options (const char *charset) +iconv_fix_options (RECODE_OUTER outer, const char *charset) { size_t charset_len = strlen (charset); - bool ignore = false, translit = false; - - do { - if (ends_with (charset, charset_len, "-translit", strlen ("-translit"))) - { - translit = true; - charset_len -= strlen ("-translit"); - } - else if (ends_with (charset, charset_len, "-ignore", strlen ("-ignore"))) - { - ignore = true; - charset_len -= strlen ("-ignore"); - } - else - break; - } while (true); + bool translit = false; + + if (ends_with (charset, charset_len, "-translit", strlen ("-translit"))) + { + translit = true; + charset_len -= strlen ("-translit"); + } char *result; if (asprintf (&result, "%.*s%s%s", (int) charset_len, charset, translit ? "//TRANSLIT" : "", - ignore ? "//IGNORE": "") + outer->strict_mapping ? 
"//IGNORE": "") == -1) return NULL; return result; @@ -198,24 +202,23 @@ iconv_fix_options (const char *charset) bool transform_with_iconv (RECODE_SUBTASK subtask) { + RECODE_OUTER outer = subtask->task->request->outer; RECODE_CONST_STEP step = subtask->step; - char *tocode = iconv_fix_options (step->after->iconv_name); - char *fromcode = iconv_fix_options (step->before->iconv_name); + char *tocode = iconv_fix_options (outer, step->after->iconv_name); + const char *fromcode = step->before->iconv_name; iconv_t conversion = (iconv_t) -1; - if (tocode && fromcode) + if (tocode) conversion = iconv_open (tocode, fromcode); if (conversion == (iconv_t) -1) { recode_if_nogo (RECODE_SYSTEM_ERROR, subtask); - free (fromcode); free (tocode); SUBTASK_RETURN (subtask); } bool status = wrapped_transform (conversion, subtask); iconv_close (conversion); - free (fromcode); free (tocode); return status; } diff --git a/src/main.c b/src/main.c index eb96a8d..43efeff 100644 --- a/src/main.c +++ b/src/main.c @@ -333,13 +333,14 @@ new_outer(unsigned flags) { /* Register all modules and build internal tables. */ - RECODE_OUTER outer = recode_new_outer (flags | RECODE_AUTO_ABORT_FLAG); + RECODE_OUTER outer = + recode_new_outer (flags | RECODE_AUTO_ABORT_FLAG); if (!outer) abort (); - /* Set strict mapping. */ + /* If using strict mapping, remove fallbacks. 
*/ - if (strict_mapping) + if (outer->strict_mapping) for (RECODE_SINGLE single = outer->single_list; single; single = single->next) @@ -494,6 +495,7 @@ main (int argc, char *const *argv) case 'f': task_option.fail_level = RECODE_SYSTEM_ERROR; task_option.abort_level = RECODE_USER_ERROR; + force_flag = true; break; case 'g': @@ -647,6 +649,10 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"), if ((ignored_name && *ignored_name == ':') || request_option.make_header_flag) flags |= RECODE_NO_ICONV_FLAG; + if (strict_mapping) + flags |= RECODE_STRICT_MAPPING_FLAG; + if (force_flag) + flags |= RECODE_FORCE_FLAG; RECODE_OUTER outer = new_outer (flags); /* Process charset listing options. */ diff --git a/src/outer.c b/src/outer.c index 8a62653..6f0f9fa 100644 --- a/src/outer.c +++ b/src/outer.c @@ -475,6 +475,8 @@ recode_new_outer (unsigned flags) outer->auto_abort = (flags & RECODE_AUTO_ABORT_FLAG) != 0; outer->use_iconv = (flags & RECODE_NO_ICONV_FLAG) == 0; + outer->strict_mapping = (flags & RECODE_STRICT_MAPPING_FLAG) != 0; + outer->force = (flags & RECODE_FORCE_FLAG) != 0; if (!register_all_modules (outer) || !make_argmatch_arrays (outer)) { diff --git a/src/recode.h b/src/recode.h index c872fd9..a41ccd0 100644 --- a/src/recode.h +++ b/src/recode.h @@ -1,5 +1,5 @@ /* Conversion of files between different charsets and surfaces. - Copyright © 1990, 93, 94, 96, 97, 98, 99, 00 Free Software Foundation, Inc. + Copyright © 1990-2022 Free Software Foundation, Inc. Contributed by François Pinard , 1988. 
This library is free software; you can redistribute it and/or @@ -56,6 +56,8 @@ extern "C" { #define RECODE_AUTO_ABORT_FLAG 1 #define RECODE_NO_ICONV_FLAG 2 +#define RECODE_STRICT_MAPPING_FLAG 4 +#define RECODE_FORCE_FLAG 8 RECODE_OUTER recode_new_outer (unsigned); bool recode_delete_outer (RECODE_OUTER); diff --git a/src/recodext.h b/src/recodext.h index d5dd04d..a836d52 100644 --- a/src/recodext.h +++ b/src/recodext.h @@ -114,6 +114,13 @@ struct recode_outer /* If the external `iconv' library should be initialized and used. */ bool use_iconv; + /* If we should discard untranslatable input and return an error, + unless 'force' is set (see below). */ + bool strict_mapping; + + /* If we should ignore untranslatable input altogether. */ + bool force; + /* charset.c */ /* --------- */ diff --git a/tables.py b/tables.py index 3d97b65..2666ab8 100755 --- a/tables.py +++ b/tables.py @@ -489,15 +489,13 @@ def digest(self): def complete(self, french): def write_charset(format, charset): write(format % charset) - write(format % (charset + "-ignore")) write(format % (charset + "-translit")) - write(format % (charset + "-translit-ignore")) if not self.do_sources: return write = Output(self.SOURCES).write count = 1 for charset, aliases in self.data: - versions = 4 # Normal, //IGNORE, //TRANSLIT, //TRANSLIT//IGNORE + versions = 2 # Normal, //TRANSLIT count = count + (versions + len(aliases)) * versions write('\n' "/* This is derived from Bruno Haible's `libiconv' package. 
*/" diff --git a/tests/Recode.pyx b/tests/Recode.pyx index 545c1a1..374a8df 100644 --- a/tests/Recode.pyx +++ b/tests/Recode.pyx @@ -414,6 +414,7 @@ cdef extern from "common.h": enum: RECODE_AUTO_ABORT_FLAG RECODE_NO_ICONV_FLAG + RECODE_STRICT_MAPPING_FLAG RECODE_OUTER recode_new_outer(unsigned) bool recode_delete_outer(RECODE_OUTER) @@ -514,6 +515,7 @@ BYTE_ORDER_MARK_SWAPPED = BYTE_ORDER_MARK_SWAPPED_ AUTO_ABORT_FLAG = RECODE_AUTO_ABORT_FLAG NO_ICONV_FLAG = RECODE_NO_ICONV_FLAG +STRICT_MAPPING_FLAG = RECODE_STRICT_MAPPING_FLAG ## Recode library at OUTER level. @@ -528,6 +530,8 @@ cdef class Outer: flags = flags | RECODE_AUTO_ABORT_FLAG if not iconv: flags = flags | RECODE_NO_ICONV_FLAG + if strict: + flags = flags | RECODE_STRICT_MAPPING_FLAG self.outer = recode_new_outer(flags) if strict: single = self.outer.single_list