[geany/geany] 3d4761: Fix silently truncating files with NULs - Commits

21 Apr 2024


      Branch:      refs/heads/master
Author:      Colomban Wendling ban@herbesfolles.org
Committer:   Colomban Wendling ban@herbesfolles.org
Date:        Fri, 08 Dec 2023 20:57:05 UTC
Commit:      3d4761788ab284687f46ae4c1285363a6b73ad22
             https://github.com/geany/geany/commit/3d4761788ab284687f46ae4c1285363a6b73ad22
Log Message:
-----------
Fix silently truncating files with NULs
This is slightly brittle, so it might require some refactoring.
Modified Paths:
--------------
    src/encodings.c
    src/encodingsprivate.h
    tests/test_encodings.c
Modified: src/encodings.c
63 lines changed, 31 insertions(+), 32 deletions(-)
===================================================================
@@ -877,10 +877,9 @@ typedef struct
 {
    gchar		*data;	/* null-terminated data */
    gsize		 size;	/* actual data size */
-	gsize		 len;	/* string length of data */
    gchar		*enc;
    gboolean	 bom;
-	gboolean	 partial;
+	gboolean	 has_nuls;
 } BufferData;
@@ -892,7 +891,7 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
if (utils_str_equal(forced_enc, "UTF-8"))
    {
-		if (! g_utf8_validate(buffer->data, buffer->len, NULL))
+		if (! g_utf8_validate(buffer->data, buffer->size, NULL))
    	{
    		return FALSE;
    	}
@@ -908,7 +907,8 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
    	else
    	{
    		SETPTR(buffer->data, converted_text);
-			buffer->len = strlen(converted_text);
+			/* we can't succeed with NULs, so this is OK */
+			buffer->size = strlen(converted_text);
    	}
    }
    enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
@@ -927,8 +927,7 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
if (buffer->size == 0)
    {
-		/* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty
-		 * e.g. UTF32 file with a BOM(so size is 4, len is 0) */
+		/* we have no data so assume UTF-8 */
    	buffer->enc = g_strdup("UTF-8");
    }
    else
@@ -939,14 +938,24 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
    		buffer->enc = g_strdup(encodings[enc_idx].charset);
    		buffer->bom = TRUE;
-			if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
+			if (enc_idx == GEANY_ENCODING_UTF_8)
+			{
+				if (! g_utf8_validate(buffer->data, buffer->size, NULL))
+				{
+					/* this is not actually valid UTF-8 */
+					SETPTR(buffer->enc, NULL);
+					buffer->bom = FALSE;
+				}
+			}
+			else /* the BOM indicated something else than UTF-8 */
    		{
    			gchar *converted_text = encodings_convert_to_utf8_from_charset(
    									buffer->data, buffer->size, buffer->enc, FALSE);
    			if (converted_text != NULL)
    			{
    				SETPTR(buffer->data, converted_text);
-					buffer->len = strlen(converted_text);
+					/* we can't succeed with NULs, so this is OK */
+					buffer->size = strlen(converted_text);
    			}
    			else
    			{
@@ -964,7 +973,7 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
/* try UTF-8 first */
    		if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
-				(buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
+				g_utf8_validate(buffer->data, buffer->size, NULL))
    		{
    			buffer->enc = g_strdup("UTF-8");
    		}
@@ -980,7 +989,8 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
    				return FALSE;
    			}
    			SETPTR(buffer->data, converted_text);
-				buffer->len = strlen(converted_text);
+				/* we can't succeed with NULs, so this is OK */
+				buffer->size = strlen(converted_text);
    		}
    		g_free(regex_charset);
    	}
@@ -997,11 +1007,11 @@ handle_bom(BufferData *buffer)
    encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
    g_return_if_fail(bom_len != 0);
-	/* use filedata->len here because the contents are already converted into UTF-8 */
-	buffer->len -= bom_len;
+	/* the contents are already converted into UTF-8 here */
+	buffer->size -= bom_len;
    /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
-	memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
-	buffer->data = g_realloc(buffer->data, buffer->len + 1);
+	memmove(buffer->data, buffer->data + bom_len, buffer->size + 1);
+	buffer->data = g_realloc(buffer->data, buffer->size + 1);
 }
@@ -1014,16 +1024,6 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
     * if we have a BOM */
    tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
-	/* check whether the size of the loaded data is equal to the size of the file in the
-	 * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
-	 * file size of 0 bytes */
-	if (buffer->len != buffer->size && buffer->size != 0 && (
-		tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
-		tmp_enc_idx == GEANY_ENCODING_UTF_7))  /* filter UTF-7/8 where no NULL bytes are allowed */
-	{
-		buffer->partial = TRUE;
-	}
-
    /* Determine character encoding and convert to UTF-8 */
    if (forced_enc != NULL)
    {
@@ -1032,6 +1032,7 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
    	{
    		buffer->bom = FALSE;
    		buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
+			buffer->has_nuls = strlen(buffer->data) != buffer->size;
    	}
    	else if (! handle_forced_encoding(buffer, forced_enc))
    	{
@@ -1060,36 +1061,34 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
  * @param forced_enc forced encoding to use, or @c NULL
  * @param used_encoding return location for the actually used encoding, or @c NULL
  * @param has_bom return location to store whether the data had a BOM, or @c NULL
- * @param partial return location to store whether the conversion may be partial, or @c NULL
+ * @param has_nuls return location to store whether the converted data contains NULs, or @c NULL
  *
  * @return @C TRUE if the conversion succeeded, @c FALSE otherwise.
  */
 GEANY_EXPORT_SYMBOL
 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
-		gchar **used_encoding, gboolean *has_bom, gboolean *partial)
+		gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls)
 {
    BufferData buffer;
buffer.data = *buf;
    buffer.size = *size;
-	/* use strlen to check for null chars */
-	buffer.len = strlen(buffer.data);
    buffer.enc = NULL;
    buffer.bom = FALSE;
-	buffer.partial = FALSE;
+	buffer.has_nuls = FALSE;
if (! handle_buffer(&buffer, forced_enc))
    	return FALSE;
-	*size = buffer.len;
+	*size = buffer.size;
    if (used_encoding)
    	*used_encoding = buffer.enc;
    else
    	g_free(buffer.enc);
    if (has_bom)
    	*has_bom = buffer.bom;
-	if (partial)
-		*partial = buffer.partial;
+	if (has_nuls)
+		*has_nuls = buffer.has_nuls;
*buf = buffer.data;
    return TRUE;
Modified: src/encodingsprivate.h
2 lines changed, 1 insertions(+), 1 deletions(-)
===================================================================
@@ -73,7 +73,7 @@ void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout, GtkCell
 gboolean encodings_is_unicode_charset(const gchar *string);
gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
-                                        gchar **used_encoding, gboolean *has_bom, gboolean *partial);
+                                        gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls);
GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len);
Modified: tests/test_encodings.c
47 lines changed, 18 insertions(+), 29 deletions(-)
===================================================================
@@ -80,13 +80,9 @@ static gboolean assert_convert_to_utf8_auto_impl(
    fflush(stdout);
    if (ret)
    {
-		/* FIXME: that's probably a bug in encodings_convert_to_utf8_auto() */
-		if (size != expected_size && expected_partial)
-			expected_size = strlen(expected_output);
-
-		g_assert_cmpuint(size, ==, expected_size);
    	assert_cmpmem_eq_with_caller(buf, expected_output, MIN(size, expected_size),
    			domain, file, line, func);
+		g_assert_cmpuint(size, ==, expected_size);
    	if (expected_encoding)
    		g_assert_cmpstr(expected_encoding, ==, used_encoding);
    	g_assert_cmpint(has_bom, ==, expected_has_bom);
@@ -121,9 +117,8 @@ static void test_encodings_convert_ascii_to_utf8_auto(void)
    TEST_ASCII(TRUE, "This is a very basic ASCII test", "UTF-8");
    TEST_ASCII(TRUE, "S\till ve\ry \b\asic", NULL);
    TEST_ASCII(FALSE, "With\0some\0NULs\0", NULL);
-	/* these fails to report partial output! */
-	/*TEST_ASCII(FALSE, "With\0some\0NULs\0", "None");*/
-	/*TEST_ASCII(FALSE, "With\0some\0NULs\0", "UTF-8");*/
+	TEST_ASCII(TRUE, "With\0some\0NULs\0", "None");
+	TEST_ASCII(FALSE, "With\0some\0NULs\0", "UTF-8");
#undef TEST_ASCII
 }
@@ -144,23 +139,24 @@ static void test_encodings_convert_utf8_to_utf8_auto(void)
    TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "None");
    TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "UTF-8");
    TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", NULL);
-	/* these fails to report partial output! */
-	/*TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "UTF-8");*/
-	/*TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "None");*/
+	TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "UTF-8"); /* the NUL doesn't pass the UTF-8 check */
+	TEST_UTF8(TRUE, "Wíťh\0søme\0NÙLs\0", "None"); /* with None we do no data validation, but report partial output */
/* with the inline hint */
    TEST_UTF8(TRUE, "coding:utf-8 bãśïč", NULL);
    TEST_UTF8(FALSE, "coding:utf-8 Wíťh\0søme\0NÙLs", NULL);
TEST_UTF8(TRUE, UTF8_BOM"With BOM", NULL);
-	TEST_UTF8(TRUE, UTF8_BOM"With BOM\0and NULs", NULL);
-	TEST_UTF8(TRUE, UTF8_BOM"Wíth BØM\0añd NÙLs", NULL);
+	/* These won't pass the UTF-8 validation despite the BOM, so we fallback to
+	 * testing other options, and it will succeed with UTF-16 so there's no real
+	 * point in verifying this */
+	/*TEST_UTF8(FALSE, UTF8_BOM"With BOM\0and NULs", NULL);*/
+	/*TEST_UTF8(FALSE, UTF8_BOM"Wíth BØM\0añd NÙLs", NULL);*/
/* non-UTF-8 */
    TEST_UTF8(FALSE, "Th\xec""s", "UTF-8");
    TEST_UTF8(FALSE, "Th\xec""s\0", "UTF-8");
-	/* erroneously succeeds and fails to report partial */
-	/*TEST_UTF8(FALSE, "\0Th\xec""s", "UTF-8");*/
+	TEST_UTF8(FALSE, "\0Th\xec""s", "UTF-8");
#undef TEST_UTF8
 #undef UTF8_BOM
@@ -229,9 +225,8 @@ static void test_encodings_convert_utf_other_to_utf8_auto(void)
/* empty data with BOMs */
    TEST_ENC(TRUE, "+/v8-", "", TRUE, NULL, "UTF-7"); /* UTF-7 */
-	/* these two actually lead to reading past the buffer's bounds */
-	/*TEST_ENC(TRUE, UTF16_BE_BOM, "", TRUE, NULL, "UTF-16BE");*/
-	/*TEST_ENC(TRUE, UTF16_LE_BOM, "", TRUE, NULL, "UTF-16LE");*/
+	TEST_ENC(TRUE, UTF16_BE_BOM, "", TRUE, NULL, "UTF-16BE");
+	TEST_ENC(TRUE, UTF16_LE_BOM, "", TRUE, NULL, "UTF-16LE");
    TEST_ENC(TRUE, UTF32_BE_BOM, "", TRUE, NULL, "UTF-32BE");
    TEST_ENC(TRUE, UTF32_LE_BOM, "", TRUE, NULL, "UTF-32LE");
@@ -257,24 +252,18 @@ static void test_encodings_convert_iso8859_to_utf8_auto(void)
    TEST(TRUE, "\xa4""uro", "¤uro", "ISO-8859-1");
    TEST(TRUE, "\xa4""uro", "€uro", "ISO-8859-15");
    TEST(TRUE, "\xd8""ed", "Řed", "ISO-8859-2");
-	/* huh?  the UTF-8 BOM takes over, although \xd3 is NOT valid UTF-8!?
-	 * - file(1) says "iso8859 text", OK
-	 * - kate(1) loads as ISO-8859-15
-	 * - vim(1) loads as "latin1" whatever that means (but looks OK)
-	 * - chardet(1) wrongly reports "UTF-8-SIG with confidence 1.0", which is
-	 *   a tad sad for a tool which only purpose IS detecting encoding...
-	 * - pluma(1) doesn't open it and asks for encoding input
-	 * - gedit(1) opens as broken UTF-8, but warns about it and asks
-	 * - gnome-text-editor(1) is just broken, opens as gedit, but says I don't
-	 *   have permission to open that file :)  looks like a generic error. */
-	/*TEST(TRUE, "\xef\xbb\xbf""not B\xd3M", "ï»¿not BÓM", NULL);*/
+	/* make-believe UTF-8 BOM followed by non-UTF-8 data */
+	TEST(TRUE, "\xef\xbb\xbf""not B\xd3M", "ï»¿not BÓM", NULL);
    TEST(TRUE, "coding:iso-8859-2 \xd8""ed", "coding:iso-8859-2 Řed", NULL);
    /* with NULs */
    TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-1");
    TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-15");
    /* This parses as UTF-16, but that's not really what we'd expect */
    /*TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", NULL);*/
+	/* UTF-8 BOM with non-UTF-8 data, we should fallback */
+	TEST(TRUE, "\xef\xbb\xbfW\xec""th\xf8""ut BOM", "ï»¿Wìthøut BOM", NULL);
+
 #undef TEST
 }
--------------
This E-Mail was brought to you by github_commit_mail.py (Source: https://github.com/geany/infrastructure).