Branch: refs/heads/master Author: Colomban Wendling ban@herbesfolles.org Committer: GitHub noreply@github.com Date: Sun, 21 Apr 2024 19:33:21 UTC Commit: 9fff385604685e6707b0741ed9a737aaa5c23248 https://github.com/geany/geany/commit/9fff385604685e6707b0741ed9a737aaa5c232...
Log Message: ----------- Merge pull request #3716 from b4n/encodings-fixes
Various encodings conversion fixes
Modified Paths: -------------- src/document.c src/encodings.c src/encodingsprivate.h src/libmain.c src/templates.c tests/Makefile.am tests/meson.build tests/test_encodings.c
Modified: src/document.c 16 lines changed, 6 insertions(+), 10 deletions(-) =================================================================== @@ -998,19 +998,15 @@ static gboolean load_text_file(const gchar *locale_filename, const gchar *displa }
if (! encodings_convert_to_utf8_auto(&filedata->data, &filedata->len, forced_enc, - &filedata->enc, &filedata->bom, &filedata->readonly)) + &filedata->enc, &filedata->bom, &filedata->readonly, &err)) { if (forced_enc) - { - ui_set_statusbar(TRUE, _("The file \"%s\" is not valid %s."), - display_filename, forced_enc); - } + ui_set_statusbar(TRUE, _("Failed to load file \"%s\" as %s: %s."), + display_filename, forced_enc, err->message); else - { - ui_set_statusbar(TRUE, - _("The file \"%s\" does not look like a text file or the file encoding is not supported."), - display_filename); - } + ui_set_statusbar(TRUE, _("Failed to load file \"%s\": %s."), + display_filename, err->message); + g_error_free(err); g_free(filedata->data); return FALSE; }
Modified: src/encodings.c 194 lines changed, 103 insertions(+), 91 deletions(-) =================================================================== @@ -195,37 +195,27 @@ static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset) { - gint i; - if (charset == NULL) return GEANY_ENCODING_UTF_8;
- i = 0; - while (i < GEANY_ENCODINGS_MAX) + for (gint i = 0; i < GEANY_ENCODINGS_MAX; i++) { if (encodings_charset_equals(charset, encodings[i].charset)) return i; - - ++i; } return GEANY_ENCODING_UTF_8; }
const GeanyEncoding *encodings_get_from_charset(const gchar *charset) { - gint i; - if (charset == NULL) return &encodings[GEANY_ENCODING_UTF_8];
- i = 0; - while (i < GEANY_ENCODINGS_MAX) + for (gint i = 0; i < GEANY_ENCODINGS_MAX; i++) { if (encodings_charset_equals(charset, encodings[i].charset)) return &encodings[i]; - - ++i; }
return NULL; @@ -303,12 +293,10 @@ void encodings_select_radio_item(const gchar *charset)
g_return_if_fail(charset != NULL);
- i = 0; - while (i < GEANY_ENCODINGS_MAX) + for (i = 0; i < GEANY_ENCODINGS_MAX; i++) { if (utils_str_equal(charset, encodings[i].charset)) break; - i++; } if (i == GEANY_ENCODINGS_MAX) i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */ @@ -326,7 +314,7 @@ void encodings_select_radio_item(const gchar *charset) static GRegex *regex_compile(const gchar *pattern) { GError *error = NULL; - GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS, 0, &error); + GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS | G_REGEX_RAW, 0, &error);
if (!regex) { @@ -405,11 +393,31 @@ void encodings_finalize(void) }
+/* initialization of non-UI parts */ +void encodings_init_headless(void) +{ + static gboolean initialized = FALSE; + + if (initialized) + return; + + init_encodings(); + + if (! pregs_loaded) + { + pregs[0] = regex_compile(PATTERN_HTMLMETA); + pregs[1] = regex_compile(PATTERN_CODING); + pregs_loaded = TRUE; + } + + initialized = TRUE; +} + + void encodings_init(void) { GtkWidget *menu[2]; GCallback cb_func[2]; - gint group_sizes[GEANY_ENCODING_GROUPS_MAX] = { 0 }; const gchar *const groups[GEANY_ENCODING_GROUPS_MAX] = { [NONE] = NULL, @@ -421,24 +429,14 @@ void encodings_init(void) [UNICODE] = N_("_Unicode"), };
- init_encodings(); - - if (! pregs_loaded) - { - pregs[0] = regex_compile(PATTERN_HTMLMETA); - pregs[1] = regex_compile(PATTERN_CODING); - pregs_loaded = TRUE; - } + encodings_init_headless();
/* create encodings submenu in document menu */ menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu"); menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu"); cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb); cb_func[1] = G_CALLBACK(encodings_reload_radio_item_change_cb);
- for (guint i = 0; i < G_N_ELEMENTS(encodings); i++) - group_sizes[encodings[i].group]++; - for (guint k = 0; k < 2; k++) { GSList *group = NULL; @@ -612,21 +610,9 @@ void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout, }
-/** - * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset. - * If @a fast is not set, additional checks to validate the converted string are performed. - * - * @param buffer The input string to convert. - * @param size The length of the string, or -1 if the string is nul-terminated. - * @param charset The charset to be used for conversion. - * @param fast @c TRUE to only convert the input and skip extended checks on the converted string. - * - * @return If the conversion was successful, a newly allocated nul-terminated string, - * which must be freed with @c g_free(). Otherwise @c NULL. - **/ -GEANY_API_SYMBOL -gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size, - const gchar *charset, gboolean fast) +static gchar *convert_to_utf8_from_charset(const gchar *buffer, gssize size, + const gchar *charset, gboolean fast, + gsize *utf8_size, GError **error) { gchar *utf8_content = NULL; GError *conv_error = NULL; @@ -642,18 +628,22 @@ gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size, if (fast) { utf8_content = converted_contents; - if (conv_error != NULL) g_error_free(conv_error); + if (conv_error != NULL) g_propagate_error(error, conv_error); } else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL)) { if (conv_error != NULL) { geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message); - g_error_free(conv_error); + g_propagate_error(error, conv_error); conv_error = NULL; } else + { geany_debug("Couldn't convert from %s to UTF-8.", charset); + g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + _("Data contains NULs")); + }
utf8_content = NULL; g_free(converted_contents); @@ -664,10 +654,35 @@ gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size, utf8_content = converted_contents; }
+ if (utf8_content && utf8_size) + *utf8_size = bytes_written; + return utf8_content; }
+/** + * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset. + * If @a fast is not set, additional checks to validate the converted string are performed. + * + * @param buffer The input string to convert. + * @param size The length of the string, or -1 if the string is nul-terminated. + * @param charset The charset to be used for conversion. + * @param fast @c TRUE to only convert the input and skip extended checks on the converted string. + * + * @return If the conversion was successful, a newly allocated nul-terminated string, + * which must be freed with @c g_free(). Otherwise @c NULL. + **/ +GEANY_API_SYMBOL +gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size, + const gchar *charset, gboolean fast) +{ + /* If fast=FALSE, we can safely ignore the size as the output cannot contain NULs. + * Otherwise, the caller already agrees on partial data anyway. */ + return convert_to_utf8_from_charset(buffer, size, charset, fast, NULL, NULL); +} + + static gchar *encodings_check_regexes(const gchar *buffer, gsize size) { guint i; @@ -684,7 +699,7 @@ static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gssize size, - const gchar *suggested_charset, gchar **used_encoding) + const gchar *suggested_charset, gchar **used_encoding, gsize *utf8_size, GError **error) { const gchar *locale_charset = NULL; const gchar *charset; @@ -751,7 +766,7 @@ static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gss
geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.", size, charset); - utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE); + utf8_content = convert_to_utf8_from_charset(buffer, size, charset, FALSE, utf8_size, NULL);
if (G_LIKELY(utf8_content != NULL)) { @@ -768,6 +783,9 @@ static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gss } }
+ g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, + _("Data contains NULs or the encoding is not supported")); + return NULL; }
@@ -791,7 +809,8 @@ gchar *encodings_convert_to_utf8(const gchar *buffer, gssize size, gchar **used_
/* first try to read the encoding from the file content */ regex_charset = encodings_check_regexes(buffer, size); - utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding); + /* we know this cannot succeed if there are NULs in the output, so ignoring the size is OK */ + utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding, NULL, NULL); g_free(regex_charset);
return utf8; @@ -870,38 +889,37 @@ typedef struct { gchar *data; /* null-terminated data */ gsize size; /* actual data size */ - gsize len; /* string length of data */ gchar *enc; gboolean bom; - gboolean partial; } BufferData;
/* convert data with the specified encoding */ static gboolean -handle_forced_encoding(BufferData *buffer, const gchar *forced_enc) +handle_forced_encoding(BufferData *buffer, const gchar *forced_enc, GError **error) { GeanyEncodingIndex enc_idx;
if (utils_str_equal(forced_enc, "UTF-8")) { - if (! g_utf8_validate(buffer->data, buffer->len, NULL)) + if (! g_utf8_validate(buffer->data, buffer->size, NULL)) { + g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + _("Data contains NULs or is not valid UTF-8")); return FALSE; } } else { - gchar *converted_text = encodings_convert_to_utf8_from_charset( - buffer->data, buffer->size, forced_enc, FALSE); + gchar *converted_text = convert_to_utf8_from_charset( + buffer->data, buffer->size, forced_enc, FALSE, &buffer->size, error); if (converted_text == NULL) { return FALSE; } else { SETPTR(buffer->data, converted_text); - buffer->len = strlen(converted_text); } } enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL); @@ -913,15 +931,14 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
/* detect encoding and convert to UTF-8 if necessary */ static gboolean -handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) +handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx, GError **error) { g_return_val_if_fail(buffer->enc == NULL, FALSE); g_return_val_if_fail(buffer->bom == FALSE, FALSE);
if (buffer->size == 0) { - /* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty - * e.g. UTF32 file with a BOM(so size is 4, len is 0) */ + /* we have no data so assume UTF-8 */ buffer->enc = g_strdup("UTF-8"); } else @@ -932,14 +949,22 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) buffer->enc = g_strdup(encodings[enc_idx].charset); buffer->bom = TRUE;
- if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */ + if (enc_idx == GEANY_ENCODING_UTF_8) { - gchar *converted_text = encodings_convert_to_utf8_from_charset( - buffer->data, buffer->size, buffer->enc, FALSE); + if (! g_utf8_validate(buffer->data, buffer->size, NULL)) + { + /* this is not actually valid UTF-8 */ + SETPTR(buffer->enc, NULL); + buffer->bom = FALSE; + } + } + else /* the BOM indicated something else than UTF-8 */ + { + gchar *converted_text = convert_to_utf8_from_charset( + buffer->data, buffer->size, buffer->enc, FALSE, &buffer->size, NULL); if (converted_text != NULL) { SETPTR(buffer->data, converted_text); - buffer->len = strlen(converted_text); } else { @@ -957,23 +982,22 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
/* try UTF-8 first */ if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 && - (buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL)) + g_utf8_validate(buffer->data, buffer->size, NULL)) { buffer->enc = g_strdup("UTF-8"); } else { /* detect the encoding */ gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data, - buffer->size, regex_charset, &buffer->enc); + buffer->size, regex_charset, &buffer->enc, &buffer->size, error);
if (converted_text == NULL) { g_free(regex_charset); return FALSE; } SETPTR(buffer->data, converted_text); - buffer->len = strlen(converted_text); } g_free(regex_charset); } @@ -990,33 +1014,23 @@ handle_bom(BufferData *buffer) encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len); g_return_if_fail(bom_len != 0);
- /* use filedata->len here because the contents are already converted into UTF-8 */ - buffer->len -= bom_len; + /* the contents are already converted into UTF-8 here */ + buffer->size -= bom_len; /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */ - memmove(buffer->data, buffer->data + bom_len, buffer->len + 1); - buffer->data = g_realloc(buffer->data, buffer->len + 1); + memmove(buffer->data, buffer->data + bom_len, buffer->size + 1); + buffer->data = g_realloc(buffer->data, buffer->size + 1); }
/* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */ -static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc) +static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc, GError **error) { GeanyEncodingIndex tmp_enc_idx;
/* temporarily retrieve the encoding idx based on the BOM to suppress the following warning * if we have a BOM */ tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
- /* check whether the size of the loaded data is equal to the size of the file in the - * filesystem file size may be 0 to allow opening files in /proc/ which have typically a - * file size of 0 bytes */ - if (buffer->len != buffer->size && buffer->size != 0 && ( - tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */ - tmp_enc_idx == GEANY_ENCODING_UTF_7)) /* filter UTF-7/8 where no NULL bytes are allowed */ - { - buffer->partial = TRUE; - } - /* Determine character encoding and convert to UTF-8 */ if (forced_enc != NULL) { @@ -1026,12 +1040,12 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc) buffer->bom = FALSE; buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset); } - else if (! handle_forced_encoding(buffer, forced_enc)) + else if (! handle_forced_encoding(buffer, forced_enc, error)) { return FALSE; } } - else if (! handle_encoding(buffer, tmp_enc_idx)) + else if (! handle_encoding(buffer, tmp_enc_idx, error)) { return FALSE; } @@ -1053,35 +1067,33 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc) * @param forced_enc forced encoding to use, or @c NULL * @param used_encoding return location for the actually used encoding, or @c NULL * @param has_bom return location to store whether the data had a BOM, or @c NULL - * @param partial return location to store whether the conversion may be partial, or @c NULL + * @param has_nuls return location to store whether the converted data contains NULs, or @c NULL * * @return @C TRUE if the conversion succeeded, @c FALSE otherwise. */ +GEANY_EXPORT_SYMBOL gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc, - gchar **used_encoding, gboolean *has_bom, gboolean *partial) + gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls, GError **error) { BufferData buffer;
buffer.data = *buf; buffer.size = *size; - /* use strlen to check for null chars */ - buffer.len = strlen(buffer.data); buffer.enc = NULL; buffer.bom = FALSE; - buffer.partial = FALSE;
- if (! handle_buffer(&buffer, forced_enc)) + if (! handle_buffer(&buffer, forced_enc, error)) return FALSE;
- *size = buffer.len; + *size = buffer.size; if (used_encoding) *used_encoding = buffer.enc; else g_free(buffer.enc); if (has_bom) *has_bom = buffer.bom; - if (partial) - *partial = buffer.partial; + if (has_nuls) + *has_nuls = strlen(buffer.data) != buffer.size;
*buf = buffer.data; return TRUE;
Modified: src/encodingsprivate.h 4 lines changed, 3 insertions(+), 1 deletions(-) =================================================================== @@ -57,6 +57,7 @@ const gchar* encodings_get_charset(const GeanyEncoding* enc);
void encodings_select_radio_item(const gchar *charset);
+void encodings_init_headless(void); void encodings_init(void); void encodings_finalize(void);
@@ -72,7 +73,8 @@ void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout, GtkCell gboolean encodings_is_unicode_charset(const gchar *string);
gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc, - gchar **used_encoding, gboolean *has_bom, gboolean *partial); + gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls, + GError **error);
GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len);
Modified: src/libmain.c 2 lines changed, 2 insertions(+), 0 deletions(-) =================================================================== @@ -1033,6 +1033,8 @@ void main_init_headless(void) memset(&template_prefs, 0, sizeof(GeanyTemplatePrefs)); memset(&ui_prefs, 0, sizeof(UIPrefs)); memset(&ui_widgets, 0, sizeof(UIWidgets)); + + encodings_init_headless(); }
Modified: src/templates.c 7 lines changed, 5 insertions(+), 2 deletions(-) =================================================================== @@ -70,15 +70,18 @@ static gchar *read_file(const gchar *locale_fname) gchar *contents; gsize length; GString *str; + GError *err = NULL;
if (! g_file_get_contents(locale_fname, &contents, &length, NULL)) return NULL;
- if (! encodings_convert_to_utf8_auto(&contents, &length, NULL, NULL, NULL, NULL)) + if (! encodings_convert_to_utf8_auto(&contents, &length, NULL, NULL, NULL, NULL, &err)) { gchar *utf8_fname = utils_get_utf8_from_locale(locale_fname);
- ui_set_statusbar(TRUE, _("Failed to convert template file \"%s\" to UTF-8"), utf8_fname); + ui_set_statusbar(TRUE, _("Failed to convert template file \"%s\" to UTF-8: %s"), + utf8_fname, err->message); + g_error_free(err); g_free(utf8_fname); g_free(contents); return NULL;
Modified: tests/Makefile.am 3 lines changed, 2 insertions(+), 1 deletions(-) =================================================================== @@ -7,9 +7,10 @@ AM_CPPFLAGS += -I$(top_srcdir)/src/tagmanager -I$(top_srcdir)/src AM_CFLAGS = $(GTK_CFLAGS) AM_LDFLAGS = $(GTK_LIBS) $(INTLLIBS) -no-install
-check_PROGRAMS = test_utils test_sidebar +check_PROGRAMS = test_utils test_sidebar test_encodings
test_utils_LDADD = $(top_builddir)/src/libgeany.la test_sidebar_LDADD = $(top_builddir)/src/libgeany.la +test_encodings_LDADD = $(top_builddir)/src/libgeany.la
TESTS = $(check_PROGRAMS)
Modified: tests/meson.build 1 lines changed, 1 insertions(+), 0 deletions(-) =================================================================== @@ -372,3 +372,4 @@ test('ctags/processing-order', runner, env: ['top_srcdir='+meson.source_root(), 'top_builddir='+meson.build_root()]) test('utils', executable('test_utils', 'test_utils.c', dependencies: test_deps)) test('sidebar', executable('test_sidebar', 'test_sidebar.c', dependencies: test_deps)) +test('encodings', executable('test_encodings', 'test_encodings.c', dependencies: test_deps))
Modified: tests/test_encodings.c 289 lines changed, 289 insertions(+), 0 deletions(-) =================================================================== @@ -0,0 +1,289 @@ +/* + * Copyright 2023 The Geany contributors + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "encodingsprivate.h" +#include "main.h" + + +/* Asserts 2 bytes buffers are identical, trying to provide a somewhat useful + * error if not. 
*/ +static void assert_cmpmem_eq_impl(const char *p1, const char *p2, gsize len, + const char *domain, const char *file, int line, const char *func, + const char *expr) +{ + gchar *msg; + gsize i; + + for (i = 0; i < len && p1[i] == p2[i]; i++) + ; + if (i == len) + return; + + msg = g_strdup_printf("assertion failed (%s): bytes %#x and %#x differ at offset %lu (at \"%s\" and \"%s\")", + expr, (guint) (guchar) p1[i], (guint) (guchar) p2[i], i, p1 + i, p2 + i); + g_assertion_message(domain, file, line, func, msg); + g_free(msg); +} + +#define assert_cmpmem_eq_with_caller(p1, p2, len, domain, file, line, func) \ + assert_cmpmem_eq_impl(p1, p2, len, domain, file, line, func, #p1 " == " #p2) + +#define assert_cmpmem_eq(p1, p2, len) assert_cmpmem_eq_impl(p1, p2, len, \ + G_LOG_DOMAIN, __FILE__, __LINE__, G_STRFUNC, #p1 " == " #p2) + +/* + * @brief More convenient test API for encodings_convert_to_utf8_auto() + * @param input Input buffer, NUL-terminated (well, at least there should be a + * trailing NUL). 
+ * @param input_size Actual size of @p input buffer, without the trailing NUL + * @param disk_size Size on disk (as reported by e.g stat -- that may be 0 for + * virtual files, otherwise should be input_size) + * @param forced_enc Forced encoding, or NULL + * @param expected_output Expected output data + * @param expected_size Expected output size + * @param expected_encoding Expected output encoding + * @param expected_has_bom Whether the input contains a BOM + * @param expected_partial Whether the output is expected to be truncated + * @returns Whether the conversion succeeded and followed the parameters + */ +static gboolean assert_convert_to_utf8_auto_impl( + const char *domain, const char *file, int line, const char *func, + const gchar *input, gsize input_size, + const gsize disk_size, const gchar *forced_enc, + const gchar *expected_output, gsize expected_size, const gchar *expected_encoding, + gboolean expected_has_bom, gboolean expected_partial) +{ + gchar *buf = g_memdup(input, input_size + 1); + gsize size = disk_size; + gchar *used_encoding = NULL; + gboolean has_bom = FALSE; + gboolean partial = FALSE; + gboolean ret; + GError *err = NULL; + + g_log(domain, G_LOG_LEVEL_INFO, "%s:%d:%s: converting %lu bytes", file, line, func, input_size); + ret = encodings_convert_to_utf8_auto(&buf, &size, forced_enc, &used_encoding, &has_bom, &partial, &err); + fflush(stdout); + if (! 
ret) + { + g_log(domain, G_LOG_LEVEL_INFO, "%s:%d:%s: conversion failed: %s", file, line, func, err->message); + g_error_free(err); + } + else + { + assert_cmpmem_eq_with_caller(buf, expected_output, MIN(size, expected_size), + domain, file, line, func); + g_assert_cmpuint(size, ==, expected_size); + if (expected_encoding) + g_assert_cmpstr(expected_encoding, ==, used_encoding); + g_assert_cmpint(has_bom, ==, expected_has_bom); + g_assert_cmpint(partial, ==, expected_partial); + + g_free(used_encoding); + } + + g_free(buf); + + return ret; +} + + +#define assert_convert_to_utf8_auto(input, input_size, disk_size, forced_enc, \ + expected_output, expected_size, expected_encoding, expected_has_bom, expected_partial) \ + assert_convert_to_utf8_auto_impl(G_LOG_DOMAIN, __FILE__, __LINE__, G_STRFUNC, \ + input, input_size, disk_size, forced_enc, \ + expected_output, expected_size, expected_encoding, expected_has_bom, expected_partial) + + +static void test_encodings_convert_ascii_to_utf8_auto(void) +{ +#define TEST_ASCII(success, str, forced_enc) \ + g_assert(success == assert_convert_to_utf8_auto(str, G_N_ELEMENTS(str) - 1, G_N_ELEMENTS(str) - 1, \ + forced_enc, str, G_N_ELEMENTS(str) - 1, forced_enc, FALSE, \ + strlen(str) != G_N_ELEMENTS(str) - 1)) + + TEST_ASCII(TRUE, "This is a very basic ASCII test", NULL); + TEST_ASCII(TRUE, "This is a very basic ASCII test", "None"); + TEST_ASCII(TRUE, "This is a very basic ASCII test", "ASCII"); + TEST_ASCII(TRUE, "This is a very basic ASCII test", "UTF-8"); + TEST_ASCII(TRUE, "S\till ve\ry \b\asic", NULL); + TEST_ASCII(FALSE, "With\0some\0NULs\0", NULL); + TEST_ASCII(TRUE, "With\0some\0NULs\0", "None"); + TEST_ASCII(FALSE, "With\0some\0NULs\0", "UTF-8"); + +#undef TEST_ASCII +} + + +static void test_encodings_convert_utf8_to_utf8_auto(void) +{ +#define UTF8_BOM "\xef\xbb\xbf" +#define TEST_UTF8(success, str, forced_enc) \ + G_STMT_START { \ + gboolean has_bom = strncmp(str, UTF8_BOM, 3) == 0; \ + g_assert(success == 
assert_convert_to_utf8_auto(str, G_N_ELEMENTS(str) - 1, G_N_ELEMENTS(str) - 1, \ + forced_enc, str + (has_bom ? 3 : 0), G_N_ELEMENTS(str) - 1 - (has_bom ? 3 : 0), \ + forced_enc, has_bom, strlen(str) != G_N_ELEMENTS(str) - 1)); \ + } G_STMT_END + + TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", NULL); + TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "None"); + TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "UTF-8"); + TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", NULL); + TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "UTF-8"); /* the NUL doesn't pass the UTF-8 check */ + TEST_UTF8(TRUE, "Wíťh\0søme\0NÙLs\0", "None"); /* with None we do no data validation, but report partial output */ + + /* with the inline hint */ + TEST_UTF8(TRUE, "coding:utf-8 bãśïč", NULL); + TEST_UTF8(FALSE, "coding:utf-8 Wíťh\0søme\0NÙLs", NULL); + + TEST_UTF8(TRUE, UTF8_BOM"With BOM", NULL); + /* These won't pass the UTF-8 validation despite the BOM, so we fallback to + * testing other options, and it will succeed with UTF-16 so there's no real + * point in verifying this */ + /*TEST_UTF8(FALSE, UTF8_BOM"With BOM\0and NULs", NULL);*/ + /*TEST_UTF8(FALSE, UTF8_BOM"Wíth BØM\0añd NÙLs", NULL);*/ + + /* non-UTF-8 */ + TEST_UTF8(FALSE, "Th\xec""s", "UTF-8"); + TEST_UTF8(FALSE, "Th\xec""s\0", "UTF-8"); + TEST_UTF8(FALSE, "\0Th\xec""s", "UTF-8"); + +#undef TEST_UTF8 +#undef UTF8_BOM +} + + +static void test_encodings_convert_utf_other_to_utf8_auto(void) +{ +#define UTF16_LE_BOM "\xff\xfe" +#define UTF16_BE_BOM "\xfe\xff" +#define UTF32_LE_BOM "\xff\xfe\x00\x00" +#define UTF32_BE_BOM "\x00\x00\xfe\xff" +#define TEST_ENC(success, input, output, has_bom, forced_enc, expected_encoding) \ + g_assert(success == assert_convert_to_utf8_auto(input, G_N_ELEMENTS(input) - 1, G_N_ELEMENTS(input) - 1, \ + forced_enc, output, G_N_ELEMENTS(output) - 1, expected_encoding, has_bom, \ + strlen(output) != G_N_ELEMENTS(output) - 1)) +#define TEST(success, input, output, has_bom, forced_enc) \ + TEST_ENC(success, input, 
output, has_bom, forced_enc, forced_enc) + + TEST(TRUE, "N\000o\000 \000B\000O\000M\000", "No BOM", FALSE, NULL); + TEST(TRUE, "N\000o\000 \000B\000\330\000M\000", "No BØM", FALSE, NULL); + /* doesn't accept the NULs */ + TEST(FALSE, "N\000o\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s\000", "No BOM\0and NULs", FALSE, NULL); + TEST(FALSE, "N\000o\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s\000", "No BØM\0añd NÙLs", FALSE, NULL); + + TEST(TRUE, UTF16_LE_BOM"W\000i\000t\000h\000 \000B\000O\000M\000", "With BOM", TRUE, NULL); + TEST(TRUE, UTF16_LE_BOM"W\000i\000t\000h\000 \000B\000\330\000M\000", "With BØM", TRUE, NULL); + /* doesn't accept the NULs */ + TEST(FALSE, UTF16_LE_BOM"W\000i\000t\000h\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s\000", "With BOM\0and NULs", TRUE, NULL); + TEST(FALSE, UTF16_LE_BOM"W\000\355\000t\000h\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s\000", "Wíth BØM\0añd NÙLs", TRUE, NULL); + + /* We should actually be smarter in our selection of encoding introducing + * probability scores, because this loads as UTF-16LE but is "圀椀琀栀 䈀伀䴀" + * which doesn't seem to be real Chinese */ + TEST(TRUE, "\000N\000o\000 \000B\000O\000M", "No BOM", FALSE, "UTF-16BE"); + TEST(TRUE, "\000N\000o\000 \000B\000\330\000M", "No BØM", FALSE, NULL); + /* doesn't accept the NULs -- and see above for the encoding choice */ + TEST(FALSE, "\000N\000o\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s", "No BOM\0and NULs", FALSE, "UTF-16BE"); + TEST(FALSE, "\000N\000o\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s", "No BØM\0añd NÙLs", FALSE, NULL); + + TEST(TRUE, UTF16_BE_BOM"\000W\000i\000t\000h\000 \000B\000O\000M", "With BOM", TRUE, NULL); + TEST(TRUE, UTF16_BE_BOM"\000W\000i\000t\000h\000 \000B\000\330\000M", "With BØM", TRUE, NULL); + /* doesn't accept the NULs */ + TEST(FALSE, 
UTF16_BE_BOM"\000W\000i\000t\000h\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s", "With BOM\0and NULs", TRUE, NULL); + TEST(FALSE, UTF16_BE_BOM"\000W\000\355\000t\000h\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s", "Wíth BØM\0añd NÙLs", TRUE, NULL); + + TEST(TRUE, UTF32_LE_BOM"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000", "With BOM", TRUE, NULL); + TEST(TRUE, UTF32_LE_BOM"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000", "With BØM", TRUE, NULL); + /* doesn't accept the NULs */ + TEST(FALSE, UTF32_LE_BOM"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000\000\000\000\000a\000\000\000n\000\000\000d\000\000\000 \000\000\000N\000\000\000U\000\000\000L\000\000\000s\000\000\000", "With BOM\0and NULs", TRUE, NULL); + TEST(FALSE, UTF32_LE_BOM"W\000\000\000\355\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000\000\000\000\000a\000\000\000\361\000\000\000d\000\000\000 \000\000\000N\000\000\000\331\000\000\000L\000\000\000s\000\000\000", "Wíth BØM\0añd NÙLs", TRUE, NULL); + + TEST(TRUE, UTF32_BE_BOM"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M", "With BOM", TRUE, NULL); + TEST(TRUE, UTF32_BE_BOM"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M", "With BØM", TRUE, NULL); + /* doesn't accept the NULs */ + TEST(FALSE, UTF32_BE_BOM"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000\000\000\000\000a\000\000\000n\000\000\000d\000\000\000 \000\000\000N\000\000\000U\000\000\000L\000\000\000s", "With BOM\0and NULs", TRUE, NULL); + TEST(FALSE, UTF32_BE_BOM"\000\000\000W\000\000\000\355\000\000\000t\000\000\000h\000\000\000 
\000\000\000B\000\000\000\330\000\000\000M\000\000\000\000\000\000\000a\000\000\000\361\000\000\000d\000\000\000 \000\000\000N\000\000\000\331\000\000\000L\000\000\000s", "Wíth BØM\0añd NÙLs", TRUE, NULL); + + /* meh, UTF-7 */ + TEST(TRUE, "No B+ANg-M", "No BØM", FALSE, "UTF-7"); + TEST(TRUE, "+/v8-With B+ANg-M", "With BØM", TRUE, NULL); + TEST(FALSE, "No B+ANg-M+AAA-but NULs", "No BØM\0but NULs", FALSE, "UTF-7"); + /* Fails to load as UTF-7 because of the NUL, and succeeds as UTF-8 but + * obviously doesn't match expectations */ + /*TEST(FALSE, "+/v8-With B+ANg-M+AAA-and NULs", "With BØM\0and NULs", TRUE, NULL);*/ + + /* empty data with BOMs */ + TEST_ENC(TRUE, "+/v8-", "", TRUE, NULL, "UTF-7"); /* UTF-7 */ + TEST_ENC(TRUE, UTF16_BE_BOM, "", TRUE, NULL, "UTF-16BE"); + TEST_ENC(TRUE, UTF16_LE_BOM, "", TRUE, NULL, "UTF-16LE"); + TEST_ENC(TRUE, UTF32_BE_BOM, "", TRUE, NULL, "UTF-32BE"); + TEST_ENC(TRUE, UTF32_LE_BOM, "", TRUE, NULL, "UTF-32LE"); + +#undef TEST +#undef TEST_ENC +#undef UTF32_BE_BOM +#undef UTF32_LE_BOM +#undef UTF16_BE_BOM +#undef UTF16_LE_BOM +} + + +static void test_encodings_convert_iso8859_to_utf8_auto(void) +{ +#define TEST(success, input, output, forced_enc) \ + g_assert(success == assert_convert_to_utf8_auto(input, G_N_ELEMENTS(input) - 1, G_N_ELEMENTS(input) - 1, \ + forced_enc, output, G_N_ELEMENTS(output) - 1, forced_enc, FALSE, \ + strlen(output) != G_N_ELEMENTS(output) - 1)) + + TEST(TRUE, "Th\xec""s", "Thìs", NULL); + TEST(TRUE, "Th\xec""s", "Thìs", "ISO-8859-1"); + TEST(TRUE, "Th\xec""s", "Thìs", "ISO-8859-15"); + TEST(TRUE, "\xa4""uro", "¤uro", "ISO-8859-1"); + TEST(TRUE, "\xa4""uro", "€uro", "ISO-8859-15"); + TEST(TRUE, "\xd8""ed", "Řed", "ISO-8859-2"); + /* make-believe UTF-8 BOM followed by non-UTF-8 data */ + TEST(TRUE, "\xef\xbb\xbf""not B\xd3M", "not BÓM", NULL); + TEST(TRUE, "coding:iso-8859-2 \xd8""ed", "coding:iso-8859-2 Řed", NULL); + /* with NULs */ + TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-1"); + 
TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-15"); + /* This parses as UTF-16, but that's not really what we'd expect */ + /*TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", NULL);*/ + + /* UTF-8 BOM with non-UTF-8 data, we should fallback */ + TEST(TRUE, "\xef\xbb\xbfW\xec""th\xf8""ut BOM", "Wìthøut BOM", NULL); + +#undef TEST +} + + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + gtk_init_check(&argc, &argv); + main_init_headless(); + + g_test_add_func("/encodings/ascii/convert_to_utf8_auto", test_encodings_convert_ascii_to_utf8_auto); + g_test_add_func("/encodings/utf8/convert_to_utf8_auto", test_encodings_convert_utf8_to_utf8_auto); + g_test_add_func("/encodings/utf_other/convert_to_utf_other_auto", test_encodings_convert_utf_other_to_utf8_auto); + g_test_add_func("/encodings/iso8859/convert_to_utf8_auto", test_encodings_convert_iso8859_to_utf8_auto); + + return g_test_run(); +}
-------------- This E-Mail was brought to you by github_commit_mail.py (Source: https://github.com/geany/infrastructure).