[geany/geany] 9fff38: Merge pull request #3716 from b4n/encodings-fixes - Commits

21 Apr 2024


      Branch:      refs/heads/master
Author:      Colomban Wendling <ban@herbesfolles.org>
Committer:   GitHub <noreply@github.com>
Date:        Sun, 21 Apr 2024 19:33:21 UTC
Commit:      9fff385604685e6707b0741ed9a737aaa5c23248
             https://github.com/geany/geany/commit/9fff385604685e6707b0741ed9a737aaa5c23248
Log Message:
-----------
Merge pull request #3716 from b4n/encodings-fixes
Various encodings conversion fixes
Modified Paths:
--------------
    src/document.c
    src/encodings.c
    src/encodingsprivate.h
    src/libmain.c
    src/templates.c
    tests/Makefile.am
    tests/meson.build
    tests/test_encodings.c
Modified: src/document.c
16 lines changed, 6 insertions(+), 10 deletions(-)
===================================================================
@@ -998,19 +998,15 @@ static gboolean load_text_file(const gchar *locale_filename, const gchar *displa
    }
if (! encodings_convert_to_utf8_auto(&filedata->data, &filedata->len, forced_enc,
-				&filedata->enc, &filedata->bom, &filedata->readonly))
+				&filedata->enc, &filedata->bom, &filedata->readonly, &err))
    {
    	if (forced_enc)
-		{
-			ui_set_statusbar(TRUE, _("The file "%s" is not valid %s."),
-				display_filename, forced_enc);
-		}
+			ui_set_statusbar(TRUE, _("Failed to load file "%s" as %s: %s."),
+				display_filename, forced_enc, err->message);
    	else
-		{
-			ui_set_statusbar(TRUE,
-	_("The file "%s" does not look like a text file or the file encoding is not supported."),
-			display_filename);
-		}
+			ui_set_statusbar(TRUE, _("Failed to load file "%s": %s."),
+				display_filename, err->message);
+		g_error_free(err);
    	g_free(filedata->data);
    	return FALSE;
    }
Modified: src/encodings.c
194 lines changed, 103 insertions(+), 91 deletions(-)
===================================================================
@@ -195,37 +195,27 @@ static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
 {
-	gint i;
-
    if (charset == NULL)
    	return GEANY_ENCODING_UTF_8;
-	i = 0;
-	while (i < GEANY_ENCODINGS_MAX)
+	for (gint i = 0; i < GEANY_ENCODINGS_MAX; i++)
    {
    	if (encodings_charset_equals(charset, encodings[i].charset))
    		return i;
-
-		++i;
    }
    return GEANY_ENCODING_UTF_8;
 }
const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
 {
-	gint i;
-
    if (charset == NULL)
    	return &encodings[GEANY_ENCODING_UTF_8];
-	i = 0;
-	while (i < GEANY_ENCODINGS_MAX)
+	for (gint i = 0; i < GEANY_ENCODINGS_MAX; i++)
    {
    	if (encodings_charset_equals(charset, encodings[i].charset))
    		return &encodings[i];
-
-		++i;
    }
return NULL;
@@ -303,12 +293,10 @@ void encodings_select_radio_item(const gchar *charset)
g_return_if_fail(charset != NULL);
-	i = 0;
-	while (i < GEANY_ENCODINGS_MAX)
+	for (i = 0; i < GEANY_ENCODINGS_MAX; i++)
    {
    	if (utils_str_equal(charset, encodings[i].charset))
    		break;
-		i++;
    }
    if (i == GEANY_ENCODINGS_MAX)
    	i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
@@ -326,7 +314,7 @@ void encodings_select_radio_item(const gchar *charset)
 static GRegex *regex_compile(const gchar *pattern)
 {
    GError *error = NULL;
-	GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS, 0, &error);
+	GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS | G_REGEX_RAW, 0, &error);
if (!regex)
    {
@@ -405,11 +393,31 @@ void encodings_finalize(void)
 }
+/* initialization of non-UI parts */
+void encodings_init_headless(void)
+{
+	static gboolean initialized = FALSE;
+
+	if (initialized)
+		return;
+
+	init_encodings();
+
+	if (! pregs_loaded)
+	{
+		pregs[0] = regex_compile(PATTERN_HTMLMETA);
+		pregs[1] = regex_compile(PATTERN_CODING);
+		pregs_loaded = TRUE;
+	}
+
+	initialized = TRUE;
+}
+
+
 void encodings_init(void)
 {
    GtkWidget *menu[2];
    GCallback cb_func[2];
-	gint group_sizes[GEANY_ENCODING_GROUPS_MAX] = { 0 };
    const gchar *const groups[GEANY_ENCODING_GROUPS_MAX] =
    {
    	[NONE]			= NULL,
@@ -421,24 +429,14 @@ void encodings_init(void)
    	[UNICODE]		= N_("_Unicode"),
    };
-	init_encodings();
-
-	if (! pregs_loaded)
-	{
-		pregs[0] = regex_compile(PATTERN_HTMLMETA);
-		pregs[1] = regex_compile(PATTERN_CODING);
-		pregs_loaded = TRUE;
-	}
+	encodings_init_headless();
/* create encodings submenu in document menu */
    menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
    menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
    cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
    cb_func[1] = G_CALLBACK(encodings_reload_radio_item_change_cb);
-	for (guint i = 0; i < G_N_ELEMENTS(encodings); i++)
-		group_sizes[encodings[i].group]++;
-
    for (guint k = 0; k < 2; k++)
    {
    	GSList *group = NULL;
@@ -612,21 +610,9 @@ void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout,
 }
-/**
- *  Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
- *  If @a fast is not set, additional checks to validate the converted string are performed.
- *
- *  @param buffer The input string to convert.
- *  @param size The length of the string, or -1 if the string is nul-terminated.
- *  @param charset The charset to be used for conversion.
- *  @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
- *
- *  @return If the conversion was successful, a newly allocated nul-terminated string,
- *    which must be freed with @c g_free(). Otherwise @c NULL.
- **/
-GEANY_API_SYMBOL
-gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size,
-											  const gchar *charset, gboolean fast)
+static gchar *convert_to_utf8_from_charset(const gchar *buffer, gssize size,
+										   const gchar *charset, gboolean fast,
+										   gsize *utf8_size, GError **error)
 {
    gchar *utf8_content = NULL;
    GError *conv_error = NULL;
@@ -642,18 +628,22 @@ gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size,
    if (fast)
    {
    	utf8_content = converted_contents;
-		if (conv_error != NULL) g_error_free(conv_error);
+		if (conv_error != NULL) g_propagate_error(error, conv_error);
    }
    else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
    {
    	if (conv_error != NULL)
    	{
    		geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
-			g_error_free(conv_error);
+			g_propagate_error(error, conv_error);
    		conv_error = NULL;
    	}
    	else
+		{
    		geany_debug("Couldn't convert from %s to UTF-8.", charset);
+			g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+					_("Data contains NULs"));
+		}
utf8_content = NULL;
    	g_free(converted_contents);
@@ -664,10 +654,35 @@ gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size,
    	utf8_content = converted_contents;
    }
+	if (utf8_content && utf8_size)
+		*utf8_size = bytes_written;
+
    return utf8_content;
 }
+/**
+ *  Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
+ *  If @a fast is not set, additional checks to validate the converted string are performed.
+ *
+ *  @param buffer The input string to convert.
+ *  @param size The length of the string, or -1 if the string is nul-terminated.
+ *  @param charset The charset to be used for conversion.
+ *  @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
+ *
+ *  @return If the conversion was successful, a newly allocated nul-terminated string,
+ *    which must be freed with @c g_free(). Otherwise @c NULL.
+ **/
+GEANY_API_SYMBOL
+gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size,
+											  const gchar *charset, gboolean fast)
+{
+	/* If fast=FALSE, we can safely ignore the size as the output cannot contain NULs.
+	 * Otherwise, the caller already agrees on partial data anyway. */
+	return convert_to_utf8_from_charset(buffer, size, charset, fast, NULL, NULL);
+}
+
+
 static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
 {
    guint i;
@@ -684,7 +699,7 @@ static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gssize size,
-		const gchar *suggested_charset, gchar **used_encoding)
+		const gchar *suggested_charset, gchar **used_encoding, gsize *utf8_size, GError **error)
 {
    const gchar *locale_charset = NULL;
    const gchar *charset;
@@ -751,7 +766,7 @@ static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gss
geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.",
    		size, charset);
-		utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);
+		utf8_content = convert_to_utf8_from_charset(buffer, size, charset, FALSE, utf8_size, NULL);
if (G_LIKELY(utf8_content != NULL))
    	{
@@ -768,6 +783,9 @@ static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gss
    	}
    }
+	g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
+			_("Data contains NULs or the encoding is not supported"));
+
    return NULL;
 }
@@ -791,7 +809,8 @@ gchar *encodings_convert_to_utf8(const gchar *buffer, gssize size, gchar **used_
/* first try to read the encoding from the file content */
    regex_charset = encodings_check_regexes(buffer, size);
-	utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding);
+	/* we know this cannot succeed if there are NULs in the output, so ignoring the size is OK */
+	utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding, NULL, NULL);
    g_free(regex_charset);
return utf8;
@@ -870,38 +889,37 @@ typedef struct
 {
    gchar		*data;	/* null-terminated data */
    gsize		 size;	/* actual data size */
-	gsize		 len;	/* string length of data */
    gchar		*enc;
    gboolean	 bom;
-	gboolean	 partial;
 } BufferData;
/* convert data with the specified encoding */
 static gboolean
-handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
+handle_forced_encoding(BufferData *buffer, const gchar *forced_enc, GError **error)
 {
    GeanyEncodingIndex enc_idx;
if (utils_str_equal(forced_enc, "UTF-8"))
    {
-		if (! g_utf8_validate(buffer->data, buffer->len, NULL))
+		if (! g_utf8_validate(buffer->data, buffer->size, NULL))
    	{
+			g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+					_("Data contains NULs or is not valid UTF-8"));
    		return FALSE;
    	}
    }
    else
    {
-		gchar *converted_text = encodings_convert_to_utf8_from_charset(
-										buffer->data, buffer->size, forced_enc, FALSE);
+		gchar *converted_text = convert_to_utf8_from_charset(
+										buffer->data, buffer->size, forced_enc, FALSE, &buffer->size, error);
    	if (converted_text == NULL)
    	{
    		return FALSE;
    	}
    	else
    	{
    		SETPTR(buffer->data, converted_text);
-			buffer->len = strlen(converted_text);
    	}
    }
    enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
@@ -913,15 +931,14 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
/* detect encoding and convert to UTF-8 if necessary */
 static gboolean
-handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
+handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx, GError **error)
 {
    g_return_val_if_fail(buffer->enc == NULL, FALSE);
    g_return_val_if_fail(buffer->bom == FALSE, FALSE);
if (buffer->size == 0)
    {
-		/* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty
-		 * e.g. UTF32 file with a BOM(so size is 4, len is 0) */
+		/* we have no data so assume UTF-8 */
    	buffer->enc = g_strdup("UTF-8");
    }
    else
@@ -932,14 +949,22 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
    		buffer->enc = g_strdup(encodings[enc_idx].charset);
    		buffer->bom = TRUE;
-			if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
+			if (enc_idx == GEANY_ENCODING_UTF_8)
    		{
-				gchar *converted_text = encodings_convert_to_utf8_from_charset(
-										buffer->data, buffer->size, buffer->enc, FALSE);
+				if (! g_utf8_validate(buffer->data, buffer->size, NULL))
+				{
+					/* this is not actually valid UTF-8 */
+					SETPTR(buffer->enc, NULL);
+					buffer->bom = FALSE;
+				}
+			}
+			else /* the BOM indicated something else than UTF-8 */
+			{
+				gchar *converted_text = convert_to_utf8_from_charset(
+										buffer->data, buffer->size, buffer->enc, FALSE, &buffer->size, NULL);
    			if (converted_text != NULL)
    			{
    				SETPTR(buffer->data, converted_text);
-					buffer->len = strlen(converted_text);
    			}
    			else
    			{
@@ -957,23 +982,22 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
/* try UTF-8 first */
    		if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
-				(buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
+				g_utf8_validate(buffer->data, buffer->size, NULL))
    		{
    			buffer->enc = g_strdup("UTF-8");
    		}
    		else
    		{
    			/* detect the encoding */
    			gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
-					buffer->size, regex_charset, &buffer->enc);
+					buffer->size, regex_charset, &buffer->enc, &buffer->size, error);
if (converted_text == NULL)
    			{
    				g_free(regex_charset);
    				return FALSE;
    			}
    			SETPTR(buffer->data, converted_text);
-				buffer->len = strlen(converted_text);
    		}
    		g_free(regex_charset);
    	}
@@ -990,33 +1014,23 @@ handle_bom(BufferData *buffer)
    encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
    g_return_if_fail(bom_len != 0);
-	/* use filedata->len here because the contents are already converted into UTF-8 */
-	buffer->len -= bom_len;
+	/* the contents are already converted into UTF-8 here */
+	buffer->size -= bom_len;
    /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
-	memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
-	buffer->data = g_realloc(buffer->data, buffer->len + 1);
+	memmove(buffer->data, buffer->data + bom_len, buffer->size + 1);
+	buffer->data = g_realloc(buffer->data, buffer->size + 1);
 }
/* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
-static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
+static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc, GError **error)
 {
    GeanyEncodingIndex tmp_enc_idx;
/* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
     * if we have a BOM */
    tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
-	/* check whether the size of the loaded data is equal to the size of the file in the
-	 * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
-	 * file size of 0 bytes */
-	if (buffer->len != buffer->size && buffer->size != 0 && (
-		tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
-		tmp_enc_idx == GEANY_ENCODING_UTF_7))  /* filter UTF-7/8 where no NULL bytes are allowed */
-	{
-		buffer->partial = TRUE;
-	}
-
    /* Determine character encoding and convert to UTF-8 */
    if (forced_enc != NULL)
    {
@@ -1026,12 +1040,12 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
    		buffer->bom = FALSE;
    		buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
    	}
-		else if (! handle_forced_encoding(buffer, forced_enc))
+		else if (! handle_forced_encoding(buffer, forced_enc, error))
    	{
    		return FALSE;
    	}
    }
-	else if (! handle_encoding(buffer, tmp_enc_idx))
+	else if (! handle_encoding(buffer, tmp_enc_idx, error))
    {
    	return FALSE;
    }
@@ -1053,35 +1067,33 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
  * @param forced_enc forced encoding to use, or @c NULL
  * @param used_encoding return location for the actually used encoding, or @c NULL
  * @param has_bom return location to store whether the data had a BOM, or @c NULL
- * @param partial return location to store whether the conversion may be partial, or @c NULL
+ * @param has_nuls return location to store whether the converted data contains NULs, or @c NULL
  *
  * @return @C TRUE if the conversion succeeded, @c FALSE otherwise.
  */
+GEANY_EXPORT_SYMBOL
 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
-		gchar **used_encoding, gboolean *has_bom, gboolean *partial)
+		gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls, GError **error)
 {
    BufferData buffer;
buffer.data = *buf;
    buffer.size = *size;
-	/* use strlen to check for null chars */
-	buffer.len = strlen(buffer.data);
    buffer.enc = NULL;
    buffer.bom = FALSE;
-	buffer.partial = FALSE;
-	if (! handle_buffer(&buffer, forced_enc))
+	if (! handle_buffer(&buffer, forced_enc, error))
    	return FALSE;
-	*size = buffer.len;
+	*size = buffer.size;
    if (used_encoding)
    	*used_encoding = buffer.enc;
    else
    	g_free(buffer.enc);
    if (has_bom)
    	*has_bom = buffer.bom;
-	if (partial)
-		*partial = buffer.partial;
+	if (has_nuls)
+		*has_nuls = strlen(buffer.data) != buffer.size;
*buf = buffer.data;
    return TRUE;
Modified: src/encodingsprivate.h
4 lines changed, 3 insertions(+), 1 deletions(-)
===================================================================
@@ -57,6 +57,7 @@ const gchar* encodings_get_charset(const GeanyEncoding* enc);
void encodings_select_radio_item(const gchar *charset);
+void encodings_init_headless(void);
 void encodings_init(void);
 void encodings_finalize(void);
@@ -72,7 +73,8 @@ void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout, GtkCell
 gboolean encodings_is_unicode_charset(const gchar *string);
gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
-                                        gchar **used_encoding, gboolean *has_bom, gboolean *partial);
+                                        gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls,
+                                        GError **error);
GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len);
Modified: src/libmain.c
2 lines changed, 2 insertions(+), 0 deletions(-)
===================================================================
@@ -1033,6 +1033,8 @@ void main_init_headless(void)
    memset(&template_prefs, 0, sizeof(GeanyTemplatePrefs));
    memset(&ui_prefs, 0, sizeof(UIPrefs));
    memset(&ui_widgets, 0, sizeof(UIWidgets));
+
+	encodings_init_headless();
 }
Modified: src/templates.c
7 lines changed, 5 insertions(+), 2 deletions(-)
===================================================================
@@ -70,15 +70,18 @@ static gchar *read_file(const gchar *locale_fname)
    gchar *contents;
    gsize length;
    GString *str;
+	GError *err = NULL;
if (! g_file_get_contents(locale_fname, &contents, &length, NULL))
    	return NULL;
-	if (! encodings_convert_to_utf8_auto(&contents, &length, NULL, NULL, NULL, NULL))
+	if (! encodings_convert_to_utf8_auto(&contents, &length, NULL, NULL, NULL, NULL, &err))
    {
    	gchar *utf8_fname = utils_get_utf8_from_locale(locale_fname);
-		ui_set_statusbar(TRUE, _("Failed to convert template file "%s" to UTF-8"), utf8_fname);
+		ui_set_statusbar(TRUE, _("Failed to convert template file "%s" to UTF-8: %s"),
+				utf8_fname, err->message);
+		g_error_free(err);
    	g_free(utf8_fname);
    	g_free(contents);
    	return NULL;
Modified: tests/Makefile.am
3 lines changed, 2 insertions(+), 1 deletions(-)
===================================================================
@@ -7,9 +7,10 @@ AM_CPPFLAGS += -I$(top_srcdir)/src/tagmanager -I$(top_srcdir)/src
 AM_CFLAGS = $(GTK_CFLAGS)
 AM_LDFLAGS = $(GTK_LIBS) $(INTLLIBS) -no-install
-check_PROGRAMS = test_utils test_sidebar
+check_PROGRAMS = test_utils test_sidebar test_encodings
test_utils_LDADD = $(top_builddir)/src/libgeany.la
 test_sidebar_LDADD = $(top_builddir)/src/libgeany.la
+test_encodings_LDADD = $(top_builddir)/src/libgeany.la
TESTS = $(check_PROGRAMS)
Modified: tests/meson.build
1 lines changed, 1 insertions(+), 0 deletions(-)
===================================================================
@@ -372,3 +372,4 @@ test('ctags/processing-order', runner,
      env: ['top_srcdir='+meson.source_root(), 'top_builddir='+meson.build_root()])
 test('utils', executable('test_utils', 'test_utils.c', dependencies: test_deps))
 test('sidebar', executable('test_sidebar', 'test_sidebar.c', dependencies: test_deps))
+test('encodings', executable('test_encodings', 'test_encodings.c', dependencies: test_deps))
Modified: tests/test_encodings.c
289 lines changed, 289 insertions(+), 0 deletions(-)
===================================================================
@@ -0,0 +1,289 @@
+/*
+ *      Copyright 2023 The Geany contributors
+ *
+ *      This program is free software; you can redistribute it and/or modify
+ *      it under the terms of the GNU General Public License as published by
+ *      the Free Software Foundation; either version 2 of the License, or
+ *      (at your option) any later version.
+ *
+ *      This program is distributed in the hope that it will be useful,
+ *      but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *      GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along
+ *      with this program; if not, write to the Free Software Foundation, Inc.,
+ *      51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "encodingsprivate.h"
+#include "main.h"
+
+
+/* Asserts 2 bytes buffers are identical, trying to provide a somewhat useful
+ * error if not. */
+static void assert_cmpmem_eq_impl(const char *p1, const char *p2, gsize len,
+		const char *domain, const char *file, int line, const char *func,
+		const char *expr)
+{
+	gchar *msg;
+	gsize i;
+
+	for (i = 0; i < len && p1[i] == p2[i]; i++)
+		;
+	if (i == len)
+		return;
+
+	msg = g_strdup_printf("assertion failed (%s): bytes %#x and %#x differ at offset %lu (at "%s" and "%s")",
+			expr, (guint) (guchar) p1[i], (guint) (guchar) p2[i], i, p1 + i, p2 + i);
+	g_assertion_message(domain, file, line, func, msg);
+	g_free(msg);
+}
+
+#define assert_cmpmem_eq_with_caller(p1, p2, len, domain, file, line, func) \
+	assert_cmpmem_eq_impl(p1, p2, len, domain, file, line, func, #p1 " == " #p2)
+
+#define assert_cmpmem_eq(p1, p2, len) assert_cmpmem_eq_impl(p1, p2, len, \
+		G_LOG_DOMAIN, __FILE__, __LINE__, G_STRFUNC, #p1 " == " #p2)
+
+/*
+ * @brief More convenient test API for encodings_convert_to_utf8_auto()
+ * @param input Input buffer, NUL-terminated (well, at least there should be a
+ *        trailing NUL).
+ * @param input_size Actual size of @p input buffer, without the trailing NUL
+ * @param disk_size Size on disk (as reported by e.g stat -- that may be 0 for
+ *                  virtual files, otherwise should be input_size)
+ * @param forced_enc Forced encoding, or NULL
+ * @param expected_output Expected output data
+ * @param expected_size Expected output size
+ * @param expected_encoding Expected output encoding
+ * @param expected_has_bom Whether the input contains a BOM
+ * @param expected_partial Whether the output is expected to be truncated
+ * @returns Whether the conversion succeeded and followed the parameters
+ */
+static gboolean assert_convert_to_utf8_auto_impl(
+		const char *domain, const char *file, int line, const char *func,
+		const gchar *input, gsize input_size,
+		const gsize disk_size, const gchar *forced_enc,
+		const gchar *expected_output, gsize expected_size, const gchar *expected_encoding,
+		gboolean expected_has_bom, gboolean expected_partial)
+{
+	gchar *buf = g_memdup(input, input_size + 1);
+	gsize size = disk_size;
+	gchar *used_encoding = NULL;
+	gboolean has_bom = FALSE;
+	gboolean partial = FALSE;
+	gboolean ret;
+	GError *err = NULL;
+
+	g_log(domain, G_LOG_LEVEL_INFO, "%s:%d:%s: converting %lu bytes", file, line, func, input_size);
+	ret = encodings_convert_to_utf8_auto(&buf, &size, forced_enc, &used_encoding, &has_bom, &partial, &err);
+	fflush(stdout);
+	if (! ret)
+	{
+		g_log(domain, G_LOG_LEVEL_INFO, "%s:%d:%s: conversion failed: %s", file, line, func, err->message);
+		g_error_free(err);
+	}
+	else
+	{
+		assert_cmpmem_eq_with_caller(buf, expected_output, MIN(size, expected_size),
+				domain, file, line, func);
+		g_assert_cmpuint(size, ==, expected_size);
+		if (expected_encoding)
+			g_assert_cmpstr(expected_encoding, ==, used_encoding);
+		g_assert_cmpint(has_bom, ==, expected_has_bom);
+		g_assert_cmpint(partial, ==, expected_partial);
+
+		g_free(used_encoding);
+	}
+
+	g_free(buf);
+
+	return ret;
+}
+
+
+#define assert_convert_to_utf8_auto(input, input_size, disk_size, forced_enc, \
+		expected_output, expected_size, expected_encoding, expected_has_bom, expected_partial) \
+	assert_convert_to_utf8_auto_impl(G_LOG_DOMAIN, __FILE__, __LINE__, G_STRFUNC, \
+			input, input_size, disk_size, forced_enc, \
+			expected_output, expected_size, expected_encoding, expected_has_bom, expected_partial)
+
+
+static void test_encodings_convert_ascii_to_utf8_auto(void)
+{
+#define TEST_ASCII(success, str, forced_enc) \
+		g_assert(success == assert_convert_to_utf8_auto(str, G_N_ELEMENTS(str) - 1, G_N_ELEMENTS(str) - 1, \
+				forced_enc, str, G_N_ELEMENTS(str) - 1, forced_enc, FALSE, \
+				strlen(str) != G_N_ELEMENTS(str) - 1))
+
+	TEST_ASCII(TRUE, "This is a very basic ASCII test", NULL);
+	TEST_ASCII(TRUE, "This is a very basic ASCII test", "None");
+	TEST_ASCII(TRUE, "This is a very basic ASCII test", "ASCII");
+	TEST_ASCII(TRUE, "This is a very basic ASCII test", "UTF-8");
+	TEST_ASCII(TRUE, "S\till ve\ry \b\asic", NULL);
+	TEST_ASCII(FALSE, "With\0some\0NULs\0", NULL);
+	TEST_ASCII(TRUE, "With\0some\0NULs\0", "None");
+	TEST_ASCII(FALSE, "With\0some\0NULs\0", "UTF-8");
+
+#undef TEST_ASCII
+}
+
+
+static void test_encodings_convert_utf8_to_utf8_auto(void)
+{
+#define UTF8_BOM "\xef\xbb\xbf"
+#define TEST_UTF8(success, str, forced_enc)																	\
+	G_STMT_START {																							\
+		gboolean has_bom = strncmp(str, UTF8_BOM, 3) == 0;													\
+		g_assert(success == assert_convert_to_utf8_auto(str, G_N_ELEMENTS(str) - 1, G_N_ELEMENTS(str) - 1,	\
+				forced_enc, str + (has_bom ? 3 : 0), G_N_ELEMENTS(str) - 1 - (has_bom ? 3 : 0),				\
+				forced_enc, has_bom, strlen(str) != G_N_ELEMENTS(str) - 1));								\
+	} G_STMT_END
+
+	TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", NULL);
+	TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "None");
+	TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "UTF-8");
+	TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", NULL);
+	TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "UTF-8"); /* the NUL doesn't pass the UTF-8 check */
+	TEST_UTF8(TRUE, "Wíťh\0søme\0NÙLs\0", "None"); /* with None we do no data validation, but report partial output */
+
+	/* with the inline hint */
+	TEST_UTF8(TRUE, "coding:utf-8 bãśïč", NULL);
+	TEST_UTF8(FALSE, "coding:utf-8 Wíťh\0søme\0NÙLs", NULL);
+
+	TEST_UTF8(TRUE, UTF8_BOM"With BOM", NULL);
+	/* These won't pass the UTF-8 validation despite the BOM, so we fallback to
+	 * testing other options, and it will succeed with UTF-16 so there's no real
+	 * point in verifying this */
+	/*TEST_UTF8(FALSE, UTF8_BOM"With BOM\0and NULs", NULL);*/
+	/*TEST_UTF8(FALSE, UTF8_BOM"Wíth BØM\0añd NÙLs", NULL);*/
+
+	/* non-UTF-8 */
+	TEST_UTF8(FALSE, "Th\xec""s", "UTF-8");
+	TEST_UTF8(FALSE, "Th\xec""s\0", "UTF-8");
+	TEST_UTF8(FALSE, "\0Th\xec""s", "UTF-8");
+
+#undef TEST_UTF8
+#undef UTF8_BOM
+}
+
+
+static void test_encodings_convert_utf_other_to_utf8_auto(void)
+{
+#define UTF16_LE_BOM "\xff\xfe"
+#define UTF16_BE_BOM "\xfe\xff"
+#define UTF32_LE_BOM "\xff\xfe\x00\x00"
+#define UTF32_BE_BOM "\x00\x00\xfe\xff"
+#define TEST_ENC(success, input, output, has_bom, forced_enc, expected_encoding) \
+		g_assert(success == assert_convert_to_utf8_auto(input, G_N_ELEMENTS(input) - 1, G_N_ELEMENTS(input) - 1, \
+				forced_enc, output, G_N_ELEMENTS(output) - 1, expected_encoding, has_bom, \
+				strlen(output) != G_N_ELEMENTS(output) - 1))
+#define TEST(success, input, output, has_bom, forced_enc) \
+		TEST_ENC(success, input, output, has_bom, forced_enc, forced_enc)
+
+	TEST(TRUE, "N\000o\000 \000B\000O\000M\000", "No BOM", FALSE, NULL);
+	TEST(TRUE, "N\000o\000 \000B\000\330\000M\000", "No BØM", FALSE, NULL);
+	/* doesn't accept the NULs */
+	TEST(FALSE, "N\000o\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s\000", "No BOM\0and NULs", FALSE, NULL);
+	TEST(FALSE, "N\000o\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s\000", "No BØM\0añd NÙLs", FALSE, NULL);
+
+	TEST(TRUE, UTF16_LE_BOM"W\000i\000t\000h\000 \000B\000O\000M\000", "With BOM", TRUE, NULL);
+	TEST(TRUE, UTF16_LE_BOM"W\000i\000t\000h\000 \000B\000\330\000M\000", "With BØM", TRUE, NULL);
+	/* doesn't accept the NULs */
+	TEST(FALSE, UTF16_LE_BOM"W\000i\000t\000h\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s\000", "With BOM\0and NULs", TRUE, NULL);
+	TEST(FALSE, UTF16_LE_BOM"W\000\355\000t\000h\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s\000", "Wíth BØM\0añd NÙLs", TRUE, NULL);
+
+	/* We should actually be smarter in our selection of encoding introducing
+	 * probability scores, because this loads as UTF-16LE but is "圀椀琀栀 䈀伀䴀"
+	 * which doesn't seem to be real Chinese */
+	TEST(TRUE, "\000N\000o\000 \000B\000O\000M", "No BOM", FALSE, "UTF-16BE");
+	TEST(TRUE, "\000N\000o\000 \000B\000\330\000M", "No BØM", FALSE, NULL);
+	/* doesn't accept the NULs -- and see above for the encoding choice */
+	TEST(FALSE, "\000N\000o\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s", "No BOM\0and NULs", FALSE, "UTF-16BE");
+	TEST(FALSE, "\000N\000o\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s", "No BØM\0añd NÙLs", FALSE, NULL);
+
+	TEST(TRUE, UTF16_BE_BOM"\000W\000i\000t\000h\000 \000B\000O\000M", "With BOM", TRUE, NULL);
+	TEST(TRUE, UTF16_BE_BOM"\000W\000i\000t\000h\000 \000B\000\330\000M", "With BØM", TRUE, NULL);
+	/* doesn't accept the NULs */
+	TEST(FALSE, UTF16_BE_BOM"\000W\000i\000t\000h\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s", "With BOM\0and NULs", TRUE, NULL);
+	TEST(FALSE, UTF16_BE_BOM"\000W\000\355\000t\000h\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s", "Wíth BØM\0añd NÙLs", TRUE, NULL);
+
+	TEST(TRUE, UTF32_LE_BOM"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000", "With BOM", TRUE, NULL);
+	TEST(TRUE, UTF32_LE_BOM"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000", "With BØM", TRUE, NULL);
+	/* doesn't accept the NULs */
+	TEST(FALSE, UTF32_LE_BOM"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000\000\000\000\000a\000\000\000n\000\000\000d\000\000\000 \000\000\000N\000\000\000U\000\000\000L\000\000\000s\000\000\000", "With BOM\0and NULs", TRUE, NULL);
+	TEST(FALSE, UTF32_LE_BOM"W\000\000\000\355\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000\000\000\000\000a\000\000\000\361\000\000\000d\000\000\000 \000\000\000N\000\000\000\331\000\000\000L\000\000\000s\000\000\000", "Wíth BØM\0añd NÙLs", TRUE, NULL);
+
+	TEST(TRUE, UTF32_BE_BOM"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M", "With BOM", TRUE, NULL);
+	TEST(TRUE, UTF32_BE_BOM"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M", "With BØM", TRUE, NULL);
+	/* doesn't accept the NULs */
+	TEST(FALSE, UTF32_BE_BOM"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000\000\000\000\000a\000\000\000n\000\000\000d\000\000\000 \000\000\000N\000\000\000U\000\000\000L\000\000\000s", "With BOM\0and NULs", TRUE, NULL);
+	TEST(FALSE, UTF32_BE_BOM"\000\000\000W\000\000\000\355\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000\000\000\000\000a\000\000\000\361\000\000\000d\000\000\000 \000\000\000N\000\000\000\331\000\000\000L\000\000\000s", "Wíth BØM\0añd NÙLs", TRUE, NULL);
+
+	/* meh, UTF-7 */
+	TEST(TRUE, "No B+ANg-M", "No BØM", FALSE, "UTF-7");
+	TEST(TRUE, "+/v8-With B+ANg-M", "With BØM", TRUE, NULL);
+	TEST(FALSE, "No B+ANg-M+AAA-but NULs", "No BØM\0but NULs", FALSE, "UTF-7");
+	/* Fails to load as UTF-7 because of the NUL, and succeeds as UTF-8 but
+	 * obviously doesn't match expectations */
+	/*TEST(FALSE, "+/v8-With B+ANg-M+AAA-and NULs", "With BØM\0and NULs", TRUE, NULL);*/
+
+	/* empty data with BOMs */
+	TEST_ENC(TRUE, "+/v8-", "", TRUE, NULL, "UTF-7"); /* UTF-7 */
+	TEST_ENC(TRUE, UTF16_BE_BOM, "", TRUE, NULL, "UTF-16BE");
+	TEST_ENC(TRUE, UTF16_LE_BOM, "", TRUE, NULL, "UTF-16LE");
+	TEST_ENC(TRUE, UTF32_BE_BOM, "", TRUE, NULL, "UTF-32BE");
+	TEST_ENC(TRUE, UTF32_LE_BOM, "", TRUE, NULL, "UTF-32LE");
+
+#undef TEST
+#undef TEST_ENC
+#undef UTF32_BE_BOM
+#undef UTF32_LE_BOM
+#undef UTF16_BE_BOM
+#undef UTF16_LE_BOM
+}
+
+
+static void test_encodings_convert_iso8859_to_utf8_auto(void)
+{
+#define TEST(success, input, output, forced_enc) \
+		g_assert(success == assert_convert_to_utf8_auto(input, G_N_ELEMENTS(input) - 1, G_N_ELEMENTS(input) - 1, \
+				forced_enc, output, G_N_ELEMENTS(output) - 1, forced_enc, FALSE, \
+				strlen(output) != G_N_ELEMENTS(output) - 1))
+
+	TEST(TRUE, "Th\xec""s", "Thìs", NULL);
+	TEST(TRUE, "Th\xec""s", "Thìs", "ISO-8859-1");
+	TEST(TRUE, "Th\xec""s", "Thìs", "ISO-8859-15");
+	TEST(TRUE, "\xa4""uro", "¤uro", "ISO-8859-1");
+	TEST(TRUE, "\xa4""uro", "€uro", "ISO-8859-15");
+	TEST(TRUE, "\xd8""ed", "Řed", "ISO-8859-2");
+	/* make-believe UTF-8 BOM followed by non-UTF-8 data */
+	TEST(TRUE, "\xef\xbb\xbf""not B\xd3M", "ï»¿not BÓM", NULL);
+	TEST(TRUE, "coding:iso-8859-2 \xd8""ed", "coding:iso-8859-2 Řed", NULL);
+	/* with NULs */
+	TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-1");
+	TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-15");
+	/* This parses as UTF-16, but that's not really what we'd expect */
+	/*TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", NULL);*/
+
+	/* UTF-8 BOM with non-UTF-8 data, we should fallback */
+	TEST(TRUE, "\xef\xbb\xbfW\xec""th\xf8""ut BOM", "ï»¿Wìthøut BOM", NULL);
+
+#undef TEST
+}
+
+
+int main(int argc, char **argv)
+{
+	g_test_init(&argc, &argv, NULL);
+	gtk_init_check(&argc, &argv);
+	main_init_headless();
+
+	g_test_add_func("/encodings/ascii/convert_to_utf8_auto", test_encodings_convert_ascii_to_utf8_auto);
+	g_test_add_func("/encodings/utf8/convert_to_utf8_auto", test_encodings_convert_utf8_to_utf8_auto);
+	g_test_add_func("/encodings/utf_other/convert_to_utf_other_auto", test_encodings_convert_utf_other_to_utf8_auto);
+	g_test_add_func("/encodings/iso8859/convert_to_utf8_auto", test_encodings_convert_iso8859_to_utf8_auto);
+
+	return g_test_run();
+}
--------------
This E-Mail was brought to you by github_commit_mail.py (Source: https://github.com/geany/infrastructure).