[geany/geany] ae01d3: Add some basic tests for encoding detection and conversion - Commits

8 Dec 2023


      Branch:      refs/heads/encodings-fixes
Author:      Colomban Wendling ban@herbesfolles.org
Committer:   Colomban Wendling ban@herbesfolles.org
Date:        Fri, 08 Dec 2023 20:41:54 UTC
Commit:      ae01d302425e53b82a90d87c4445d98441a77414
             https://github.com/geany/geany/commit/ae01d302425e53b82a90d87c4445d98441a77414
Log Message:
-----------
Add some basic tests for encoding detection and conversion
Modified Paths:
--------------
    src/encodings.c
    src/encodingsprivate.h
    src/libmain.c
    tests/Makefile.am
    tests/test_encodings.c
Modified: src/encodings.c
31 lines changed, 23 insertions(+), 8 deletions(-)
===================================================================
@@ -402,6 +402,27 @@ void encodings_finalize(void)
 }
+/* One-time setup of everything in this module that does not need the UI:
+ * the encodings table and the compiled detection patterns.
+ * Calling it again after the first time is a no-op. */
+void encodings_init_headless(void)
+{
+	static gboolean initialized = FALSE;
+
+	if (! initialized)
+	{
+		init_encodings();
+
+		/* the patterns are compiled only once, guarded by their own flag */
+		if (! pregs_loaded)
+		{
+			pregs[0] = regex_compile(PATTERN_HTMLMETA);
+			pregs[1] = regex_compile(PATTERN_CODING);
+			pregs_loaded = TRUE;
+		}
+
+		initialized = TRUE;
+	}
+}
+
+
 void encodings_init(void)
 {
    GtkWidget *menu[2];
@@ -418,14 +439,7 @@ void encodings_init(void)
    	[UNICODE]		= N_("_Unicode"),
    };
-	init_encodings();
-
-	if (! pregs_loaded)
-	{
-		pregs[0] = regex_compile(PATTERN_HTMLMETA);
-		pregs[1] = regex_compile(PATTERN_CODING);
-		pregs_loaded = TRUE;
-	}
+	encodings_init_headless();
/* create encodings submenu in document menu */
    menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
@@ -1054,6 +1068,7 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
  *
  * @return @c TRUE if the conversion succeeded, @c FALSE otherwise.
  */
+GEANY_EXPORT_SYMBOL
 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
    	gchar **used_encoding, gboolean *has_bom, gboolean *partial)
 {
Modified: src/encodingsprivate.h
1 lines changed, 1 insertions(+), 0 deletions(-)
===================================================================
@@ -57,6 +57,7 @@ const gchar* encodings_get_charset(const GeanyEncoding* enc);
void encodings_select_radio_item(const gchar *charset);
+void encodings_init_headless(void);
 void encodings_init(void);
 void encodings_finalize(void);
Modified: src/libmain.c
2 lines changed, 2 insertions(+), 0 deletions(-)
===================================================================
@@ -1033,6 +1033,8 @@ void main_init_headless(void)
    memset(&template_prefs, 0, sizeof(GeanyTemplatePrefs));
    memset(&ui_prefs, 0, sizeof(UIPrefs));
    memset(&ui_widgets, 0, sizeof(UIWidgets));
+
+	encodings_init_headless();
 }
Modified: tests/Makefile.am
3 lines changed, 2 insertions(+), 1 deletions(-)
===================================================================
@@ -7,9 +7,10 @@ AM_CPPFLAGS += -I$(top_srcdir)/src/tagmanager -I$(top_srcdir)/src
 AM_CFLAGS = $(GTK_CFLAGS)
 AM_LDFLAGS = $(GTK_LIBS) $(INTLLIBS) -no-install
-check_PROGRAMS = test_utils test_sidebar
+check_PROGRAMS = test_utils test_sidebar test_encodings
test_utils_LDADD = $(top_builddir)/src/libgeany.la
 test_sidebar_LDADD = $(top_builddir)/src/libgeany.la
+test_encodings_LDADD = $(top_builddir)/src/libgeany.la
TESTS = $(check_PROGRAMS)
Modified: tests/test_encodings.c
217 lines changed, 217 insertions(+), 0 deletions(-)
===================================================================
@@ -0,0 +1,217 @@
+/*
+ *      Copyright 2023 The Geany contributors
+ *
+ *      This program is free software; you can redistribute it and/or modify
+ *      it under the terms of the GNU General Public License as published by
+ *      the Free Software Foundation; either version 2 of the License, or
+ *      (at your option) any later version.
+ *
+ *      This program is distributed in the hope that it will be useful,
+ *      but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *      GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along
+ *      with this program; if not, write to the Free Software Foundation, Inc.,
+ *      51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "encodingsprivate.h"
+#include "main.h"
+
+
+/* Asserts that two byte buffers are identical over their first @p len bytes,
+ * trying to provide a somewhat useful error if not.
+ *
+ * @param p1, p2 Buffers to compare.  On mismatch the remainder of each buffer
+ *        is printed with "%s", so both are expected to be NUL-terminated.
+ * @param len Number of bytes to compare
+ * @param domain, file, line, func Location passed on to g_assertion_message()
+ * @param expr Textual form of the comparison, used in the failure message */
+static void assert_cmpmem_eq_impl(const char *p1, const char *p2, gsize len,
+		const char *domain, const char *file, int line, const char *func,
+		const char *expr)
+{
+	gchar *msg;
+	gsize i;
+
+	/* find the offset of the first differing byte, if any */
+	for (i = 0; i < len && p1[i] == p2[i]; i++)
+		;
+	if (i == len)
+		return;
+
+	/* G_GSIZE_FORMAT rather than %lu: gsize is not always unsigned long */
+	msg = g_strdup_printf("assertion failed (%s): bytes %#x and %#x differ at offset %" G_GSIZE_FORMAT
+			" (at \"%s\" and \"%s\")",
+			expr, (guint) (guchar) p1[i], (guint) (guchar) p2[i], i, p1 + i, p2 + i);
+	g_assertion_message(domain, file, line, func, msg);
+	g_free(msg);
+}
+/* Byte-buffer equality assertions built on assert_cmpmem_eq_impl().
+ * assert_cmpmem_eq_with_caller() forwards a location supplied by the caller,
+ * while assert_cmpmem_eq() reports the location where the macro is expanded.
+ * Both stringify their p1/p2 arguments to build the reported expression. */
+#define assert_cmpmem_eq_with_caller(p1, p2, len, domain, file, line, func) \
+	assert_cmpmem_eq_impl(p1, p2, len, domain, file, line, func, #p1 " == " #p2)
+
+#define assert_cmpmem_eq(p1, p2, len) assert_cmpmem_eq_impl(p1, p2, len, \
+		G_LOG_DOMAIN, __FILE__, __LINE__, G_STRFUNC, #p1 " == " #p2)
+
+/*
+ * @brief More convenient test API for encodings_convert_to_utf8_auto()
+ * @param domain Log domain, used for the info message and the buffer
+ *        comparison failure report
+ * @param file Caller's source file, used likewise
+ * @param line Caller's source line, used likewise
+ * @param func Caller's function name, used likewise
+ * @param input Input buffer, NUL-terminated (well, at least there should be a
+ *        trailing NUL).
+ * @param input_size Actual size of @p input buffer, without the trailing NUL
+ * @param disk_size Size on disk (as reported by e.g stat -- that may be 0 for
+ *                  virtual files, otherwise should be input_size)
+ * @param forced_enc Forced encoding, or NULL
+ * @param expected_output Expected output data
+ * @param expected_size Expected output size
+ * @param expected_encoding Expected output encoding, or NULL not to check it
+ * @param expected_has_bom Whether the input contains a BOM
+ * @param expected_partial Whether the output is expected to be truncated
+ * @returns Whether the conversion succeeded and followed the parameters
+ */
+static gboolean assert_convert_to_utf8_auto_impl(
+		const char *domain, const char *file, int line, const char *func,
+		const gchar *input, gsize input_size,
+		const gsize disk_size, const gchar *forced_enc,
+		const gchar *expected_output, gsize expected_size, const gchar *expected_encoding,
+		gboolean expected_has_bom, gboolean expected_partial)
+{
+	/* private copy, trailing NUL included; g_malloc() + memcpy() rather than
+	 * the deprecated g_memdup(), whose size argument is only a guint */
+	gchar *buf = g_malloc(input_size + 1);
+	gsize size = disk_size;
+	gchar *used_encoding = NULL;
+	gboolean has_bom = FALSE;
+	gboolean partial = FALSE;
+	gboolean ret;
+
+	memcpy(buf, input, input_size + 1);
+
+	/* G_GSIZE_FORMAT rather than %lu: gsize is not always unsigned long */
+	g_log(domain, G_LOG_LEVEL_INFO, "%s:%d:%s: converting %" G_GSIZE_FORMAT " bytes",
+			file, line, func, input_size);
+	ret = encodings_convert_to_utf8_auto(&buf, &size, forced_enc, &used_encoding, &has_bom, &partial);
+	fflush(stdout);
+	if (ret)
+	{
+		/* FIXME: that's probably a bug in encodings_convert_to_utf8_auto() */
+		if (size != expected_size && expected_partial)
+			expected_size = strlen(expected_output);
+
+		g_assert_cmpuint(size, ==, expected_size);
+		assert_cmpmem_eq_with_caller(buf, expected_output, MIN(size, expected_size),
+				domain, file, line, func);
+		if (expected_encoding)
+			g_assert_cmpstr(expected_encoding, ==, used_encoding);
+		g_assert_cmpint(has_bom, ==, expected_has_bom);
+		g_assert_cmpint(partial, ==, expected_partial);
+	}
+
+	/* g_free() accepts NULL, so release unconditionally: nothing leaks even
+	 * if an encoding name was stored on a failed conversion */
+	g_free(used_encoding);
+	g_free(buf);
+
+	return ret;
+}
+
+/* Calls assert_convert_to_utf8_auto_impl() with the log domain, file, line
+ * and function name of the place where this macro is expanded. */
+#define assert_convert_to_utf8_auto(input, input_size, disk_size, forced_enc, \
+		expected_output, expected_size, expected_encoding, expected_has_bom, expected_partial) \
+	assert_convert_to_utf8_auto_impl(G_LOG_DOMAIN, __FILE__, __LINE__, G_STRFUNC, \
+			input, input_size, disk_size, forced_enc, \
+			expected_output, expected_size, expected_encoding, expected_has_bom, expected_partial)
+
+
+/* Checks conversion of plain ASCII input, which is expected to come out
+ * unchanged and without a BOM.
+ *
+ * TEST_ASCII(success, str, forced_enc) converts the string literal @p str and
+ * requires the conversion result to equal @p success.  The expected encoding
+ * is @p forced_enc (not checked when NULL), and the output is expected to be
+ * partial when @p str contains an embedded NUL. */
+static void test_encodings_convert_ascii_to_utf8_auto(void)
+{
+/* g_assert_true() rather than g_assert(): the latter is compiled out, call
+ * under test included, when G_DISABLE_ASSERT is defined */
+#define TEST_ASCII(success, str, forced_enc) \
+		g_assert_true(success == assert_convert_to_utf8_auto(str, G_N_ELEMENTS(str) - 1, G_N_ELEMENTS(str) - 1, \
+				forced_enc, str, G_N_ELEMENTS(str) - 1, forced_enc, FALSE, \
+				strlen(str) != G_N_ELEMENTS(str) - 1))
+
+	TEST_ASCII(TRUE, "This is a very basic ASCII test", NULL);
+	TEST_ASCII(TRUE, "This is a very basic ASCII test", "None");
+	TEST_ASCII(TRUE, "This is a very basic ASCII test", "ASCII");
+	TEST_ASCII(TRUE, "This is a very basic ASCII test", "UTF-8");
+	TEST_ASCII(TRUE, "S\till ve\ry \b\asic", NULL);
+	TEST_ASCII(FALSE, "With\0some\0NULs\0", NULL);
+	/* these fail to report partial output! */
+	/*TEST_ASCII(FALSE, "With\0some\0NULs\0", "None");*/
+	/*TEST_ASCII(FALSE, "With\0some\0NULs\0", "UTF-8");*/
+
+#undef TEST_ASCII
+}
+
+
+/* Checks conversion of UTF-8 input, with and without a leading BOM.
+ *
+ * TEST_UTF8(success, str, forced_enc) converts the string literal @p str and
+ * requires the conversion result to equal @p success.  The expected output is
+ * @p str without its BOM, if any; the expected encoding is @p forced_enc (not
+ * checked when NULL), and the output is expected to be partial when @p str
+ * contains an embedded NUL. */
+static void test_encodings_convert_utf8_to_utf8_auto(void)
+{
+#define UTF8_BOM "\xef\xbb\xbf"
+/* g_assert_true() rather than g_assert(): the latter is compiled out, call
+ * under test included, when G_DISABLE_ASSERT is defined */
+#define TEST_UTF8(success, str, forced_enc)																	\
+	G_STMT_START {																							\
+		gboolean has_bom = strncmp(str, UTF8_BOM, 3) == 0;													\
+		g_assert_true(success == assert_convert_to_utf8_auto(str, G_N_ELEMENTS(str) - 1, G_N_ELEMENTS(str) - 1,	\
+				forced_enc, str + (has_bom ? 3 : 0), G_N_ELEMENTS(str) - 1 - (has_bom ? 3 : 0),				\
+				forced_enc, has_bom, strlen(str) != G_N_ELEMENTS(str) - 1));								\
+	} G_STMT_END
+
+	TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", NULL);
+	TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "None");
+	TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "UTF-8");
+	TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", NULL);
+	/* these fail to report partial output! */
+	/*TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "UTF-8");*/
+	/*TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "None");*/
+
+	/* with the inline hint */
+	TEST_UTF8(TRUE, "coding:utf-8 bãśïč", NULL);
+	TEST_UTF8(FALSE, "coding:utf-8 Wíťh\0søme\0NÙLs", NULL);
+
+	TEST_UTF8(TRUE, UTF8_BOM"With BOM", NULL);
+	TEST_UTF8(TRUE, UTF8_BOM"With BOM\0and NULs", NULL);
+	TEST_UTF8(TRUE, UTF8_BOM"Wíth BØM\0añd NÙLs", NULL);
+
+	/* non-UTF-8 */
+	TEST_UTF8(FALSE, "Th\xec""s", "UTF-8");
+	TEST_UTF8(FALSE, "Th\xec""s\0", "UTF-8");
+	/* erroneously succeeds and fails to report partial */
+	/*TEST_UTF8(FALSE, "\0Th\xec""s", "UTF-8");*/
+
+#undef TEST_UTF8
+#undef UTF8_BOM
+}
+
+
+/* Checks conversion of ISO-8859-* input to UTF-8.
+ *
+ * TEST(success, input, output, forced_enc) converts the string literal
+ * @p input and requires the conversion result to equal @p success.  The
+ * expected data is the string literal @p output, the expected encoding is
+ * @p forced_enc (not checked when NULL), no BOM is expected, and the output
+ * is expected to be partial when @p output contains an embedded NUL. */
+static void test_encodings_convert_iso8859_to_utf8_auto(void)
+{
+/* g_assert_true() rather than g_assert(): the latter is compiled out, call
+ * under test included, when G_DISABLE_ASSERT is defined */
+#define TEST(success, input, output, forced_enc) \
+		g_assert_true(success == assert_convert_to_utf8_auto(input, G_N_ELEMENTS(input) - 1, G_N_ELEMENTS(input) - 1, \
+				forced_enc, output, G_N_ELEMENTS(output) - 1, forced_enc, FALSE, \
+				strlen(output) != G_N_ELEMENTS(output) - 1))
+
+	TEST(TRUE, "Th\xec""s", "Thìs", NULL);
+	TEST(TRUE, "Th\xec""s", "Thìs", "ISO-8859-1");
+	TEST(TRUE, "Th\xec""s", "Thìs", "ISO-8859-15");
+	TEST(TRUE, "\xa4""uro", "¤uro", "ISO-8859-1");
+	TEST(TRUE, "\xa4""uro", "€uro", "ISO-8859-15");
+	TEST(TRUE, "\xd8""ed", "Řed", "ISO-8859-2");
+	/* huh?  the UTF-8 BOM takes over, although \xd3 is NOT valid UTF-8!?
+	 * - file(1) says "iso8859 text", OK
+	 * - kate(1) loads as ISO-8859-15
+	 * - vim(1) loads as "latin1" whatever that means (but looks OK)
+	 * - chardet(1) wrongly reports "UTF-8-SIG with confidence 1.0", which is
+	 *   a tad sad for a tool whose only purpose IS detecting encoding...
+	 * - pluma(1) doesn't open it and asks for encoding input
+	 * - gedit(1) opens as broken UTF-8, but warns about it and asks
+	 * - gnome-text-editor(1) is just broken, opens as gedit, but says I don't
+	 *   have permission to open that file :)  looks like a generic error. */
+	/*TEST(TRUE, "\xef\xbb\xbf""not B\xd3M", "ï»¿not BÓM", NULL);*/
+	/* fails to detect the suggested encoding */
+	/*TEST(TRUE, "coding:iso-8859-2 \xd8""ed", "coding:iso-8859-2 Řed", NULL);*/
+	/* with NULs */
+	TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-1");
+	TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-15");
+	/* This parses as UTF-16, but that's not really what we'd expect */
+	/*TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", NULL);*/
+
+#undef TEST
+}
+
+
+/* Test program entry point: sets up the GLib test framework, GTK and the
+ * headless parts of Geany, registers the encoding test cases and runs them. */
+int main(int argc, char **argv)
+{
+	static const struct
+	{
+		const gchar *path;
+		GTestFunc func;
+	}
+	test_cases[] = {
+		{ "/encodings/ascii/convert_to_utf8_auto", test_encodings_convert_ascii_to_utf8_auto },
+		{ "/encodings/utf8/convert_to_utf8_auto", test_encodings_convert_utf8_to_utf8_auto },
+		{ "/encodings/iso8859/convert_to_utf8_auto", test_encodings_convert_iso8859_to_utf8_auto }
+	};
+	guint n;
+
+	g_test_init(&argc, &argv, NULL);
+	gtk_init_check(&argc, &argv);
+	main_init_headless();
+
+	/* register in table order, the same order as the functions are defined */
+	for (n = 0; n < G_N_ELEMENTS(test_cases); n++)
+		g_test_add_func(test_cases[n].path, test_cases[n].func);
+
+	return g_test_run();
+}
--------------
This E-Mail was brought to you by github_commit_mail.py (Source: https://github.com/geany/infrastructure).