SF.net SVN: geany: [1167] trunk

Sun Jan 7 16:22:41 UTC 2007

Revision: 1167
          http://svn.sourceforge.net/geany/?rev=1167&view=rev
Author:   ntrel
Date:     2007-01-07 08:22:41 -0800 (Sun, 07 Jan 2007)

Log Message:
-----------
Fix memory leak when using utils_scan_unicode_bom().
Prevent invalid memory read in utils_scan_unicode_bom() when text
length is < 4.
Move utils_scan_unicode_bom(), utils_is_unicode_charset() to
encodings.c.
Read the BOM length in handle_bom().

Modified Paths:
--------------
    trunk/ChangeLog
    trunk/src/dialogs.c
    trunk/src/document.c
    trunk/src/encodings.c
    trunk/src/encodings.h
    trunk/src/ui_utils.c
    trunk/src/utils.c
    trunk/src/utils.h

Modified: trunk/ChangeLog
===================================================================

--- trunk/ChangeLog	2007-01-07 14:04:13 UTC (rev 1166)
+++ trunk/ChangeLog	2007-01-07 16:22:41 UTC (rev 1167)
@@ -1,3 +1,15 @@
+2007-01-07  Nick Treleaven  <nick.treleaven at btinternet.com>
+
+ * src/utils.c, src/utils.h, src/encodings.c, src/document.c,
+   src/encodings.h, src/dialogs.c, src/ui_utils.c:
+   Fix memory leak when using utils_scan_unicode_bom().
+   Prevent invalid memory read in utils_scan_unicode_bom() when text
+   length is < 4.
+   Move utils_scan_unicode_bom(), utils_is_unicode_charset() to
+   encodings.c.
+   Read the BOM length in handle_bom().
+
+
 2007-01-07  Enrico Tröger  <enrico.troeger at uvena.de>
 
  * geany.glade, src/interface.c: Fixed typo.

Modified: trunk/src/dialogs.c
===================================================================
--- trunk/src/dialogs.c	2007-01-07 14:04:13 UTC (rev 1166)
+++ trunk/src/dialogs.c	2007-01-07 16:22:41 UTC (rev 1167)
@@ -1013,8 +1013,9 @@
 	gtk_misc_set_alignment(GTK_MISC(label), 1, 0);
 
 	enctext = g_strdup_printf("%s %s",
-	doc_list[idx].encoding,
-	(utils_is_unicode_charset(doc_list[idx].encoding)) ? ((doc_list[idx].has_bom) ? _("(with BOM)") : _("(without BOM)")) : "");
+		doc_list[idx].encoding,
+		(encodings_is_unicode_charset(doc_list[idx].encoding)) ?
+			((doc_list[idx].has_bom) ? _("(with BOM)") : _("(without BOM)")) : "");
 
 	label = gtk_label_new(enctext);
 	g_free(enctext);

Modified: trunk/src/document.c
===================================================================
--- trunk/src/document.c	2007-01-07 14:04:13 UTC (rev 1166)
+++ trunk/src/document.c	2007-01-07 16:22:41 UTC (rev 1167)
@@ -436,17 +436,14 @@
 static gboolean
 handle_forced_encoding(FileData *filedata, const gchar *forced_enc)
 {
+	GeanyEncodingIndex enc_idx;
+
 	if (utils_str_equal(forced_enc, "UTF-8"))
 	{
 		if (! g_utf8_validate(filedata->data, filedata->len, NULL))
 		{
 			return FALSE;
 		}
-		else
-		{
-			filedata->bom = utils_str_equal(utils_scan_unicode_bom(filedata->data), "UTF-8");
-			filedata->enc = g_strdup(forced_enc);
-		}
 	}
 	else
 	{
@@ -461,53 +458,68 @@
 			g_free(filedata->data);
 			filedata->data = converted_text;
 			filedata->len = strlen(converted_text);
-			filedata->bom = utils_str_equal(utils_scan_unicode_bom(filedata->data), "UTF-8");
-			filedata->enc = g_strdup(forced_enc);
 		}
 	}
+	enc_idx = encodings_scan_unicode_bom(filedata->data, filedata->len, NULL);
+	filedata->bom = (enc_idx == GEANY_ENCODING_UTF_8);
+	filedata->enc = g_strdup(forced_enc);
 	return TRUE;
 }
 
 
+// detect encoding and convert to UTF-8 if necessary
 static gboolean
 handle_encoding(FileData *filedata)
 {
-	if (filedata->len > 0)
-	{	// the usual way to detect encoding and convert to UTF-8
-		if (filedata->len >= 4)
+	g_return_val_if_fail(filedata->enc == NULL, FALSE);
+	g_return_val_if_fail(filedata->bom == FALSE, FALSE);
+
+	if (filedata->len == 0)
+	{
+		// we have no data so assume UTF-8
+		filedata->enc = g_strdup("UTF-8");
+	}
+	else
+	{
+		// first check for a BOM
+		GeanyEncodingIndex enc_idx =
+			encodings_scan_unicode_bom(filedata->data, filedata->len, NULL);
+
+		if (enc_idx != GEANY_ENCODING_NONE)
 		{
-			filedata->enc = utils_scan_unicode_bom(filedata->data);
-		}
-		if (filedata->enc != NULL)
-		{
+			filedata->enc = g_strdup(encodings[enc_idx].charset);
 			filedata->bom = TRUE;
-			if ((filedata->enc)[4] != '8') // the BOM indicated something else than UTF-8
+
+			if (enc_idx != GEANY_ENCODING_UTF_8) // the BOM indicated something else than UTF-8
 			{
 				gchar *converted_text = encodings_convert_to_utf8_from_charset(
-															filedata->data, filedata->len, filedata->enc, FALSE);
-				if (converted_text == NULL)
+										filedata->data, filedata->len, filedata->enc, FALSE);
+				if (converted_text != NULL)
 				{
+					g_free(filedata->data);
+					filedata->data = converted_text;
+					filedata->len = strlen(converted_text);
+				}
+				else
+				{
+					// there was a problem converting data from BOM encoding type
 					g_free(filedata->enc);
 					filedata->enc = NULL;
 					filedata->bom = FALSE;
 				}
-				else
-				{
-					g_free(filedata->data);
-					filedata->data = converted_text;
-					filedata->len = strlen(converted_text);
-				}
 			}
 		}
-		// this if is important, else doesn't work because enc can be altered in the above block
-		if (filedata->enc == NULL)
+
+		if (filedata->enc == NULL)	// either there was no BOM or the BOM encoding failed
 		{
+			// try UTF-8 first
 			if (g_utf8_validate(filedata->data, filedata->len, NULL))
 			{
 				filedata->enc = g_strdup("UTF-8");
 			}
 			else
 			{
+				// detect the encoding
 				gchar *converted_text = encodings_convert_to_utf8(filedata->data,
 					filedata->len, &filedata->enc);
 
@@ -515,19 +527,12 @@
 				{
 					return FALSE;
 				}
-				else
-				{
-					g_free(filedata->data);
-					filedata->data = converted_text;
-					filedata->len = strlen(converted_text);
-				}
+				g_free(filedata->data);
+				filedata->data = converted_text;
+				filedata->len = strlen(converted_text);
 			}
 		}
 	}
-	else
-	{
-		filedata->enc = g_strdup("UTF-8");
-	}
 	return TRUE;
 }
 
@@ -535,14 +540,15 @@
 static void
 handle_bom(FileData *filedata)
 {
-	gchar *data_without_bom;
+	guint bom_len;
 
-	g_return_if_fail(filedata->len >= 3);
+	encodings_scan_unicode_bom(filedata->data, filedata->len, &bom_len);
+	g_return_if_fail(bom_len != 0);
 
-	data_without_bom = g_strdup(filedata->data + 3);
-	g_free(filedata->data);
-	filedata->data = data_without_bom;
-	filedata->len -= 3;
+	filedata->len -= bom_len;
+	// overwrite the BOM with the remainder of the file contents, plus the NUL terminator.
+	g_memmove(filedata->data, filedata->data + bom_len, filedata->len + 1);
+	filedata->data = g_realloc(filedata->data, filedata->len + 1);	// shrink; g_realloc() may move the block, so keep its result
 }
 
 
@@ -871,7 +877,7 @@
 	sci_convert_eols(doc_list[idx].sci, sci_get_eol_mode(doc_list[idx].sci));
 
 	len = sci_get_length(doc_list[idx].sci) + 1;
-	if (doc_list[idx].has_bom && utils_is_unicode_charset(doc_list[idx].encoding))
+	if (doc_list[idx].has_bom && encodings_is_unicode_charset(doc_list[idx].encoding))
 	{
 		data = (gchar*) g_malloc(len + 3);	// 3 chars for BOM
 		data[0] = 0xef;
@@ -1642,7 +1648,7 @@
 
 	ui_update_statusbar(idx, -1);
 	gtk_widget_set_sensitive(lookup_widget(app->window, "menu_write_unicode_bom1"),
-			utils_is_unicode_charset(doc_list[idx].encoding));
+			encodings_is_unicode_charset(doc_list[idx].encoding));
 }
 
 

Modified: trunk/src/encodings.c
===================================================================
--- trunk/src/encodings.c	2007-01-07 14:04:13 UTC (rev 1166)
+++ trunk/src/encodings.c	2007-01-07 16:22:41 UTC (rev 1167)
@@ -39,12 +39,12 @@
 
 
 
-#define fill(v, w, x, y, z) \
-		encodings[x].idx = x; \
-		encodings[x].order = v; \
-		encodings[x].group = w; \
-		encodings[x].charset = y; \
-		encodings[x].name = z;
+#define fill(Order, Group, Idx, Charset, Name) \
+		encodings[Idx].idx = Idx; \
+		encodings[Idx].order = Order; \
+		encodings[Idx].group = Group; \
+		encodings[Idx].charset = Charset; \
+		encodings[Idx].name = Name;
 
 static void init_encodings(void)
 {
@@ -377,3 +377,72 @@
 
 	return NULL;
 }
+
+
+/* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
+ * otherwise GEANY_ENCODING_NONE. If bom_len is non-NULL, it receives the
+ * BOM length in bytes (0 when no BOM is found). */
+GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
+{
+	if (len >= 3)
+	{
+		if (bom_len)
+			*bom_len = 3;
+
+		if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
+			(guchar)string[2] == 0xbf)
+		{
+			return GEANY_ENCODING_UTF_8;
+		}
+	}
+	if (len >= 4)
+	{
+		if (bom_len)
+			*bom_len = 4;
+
+		if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
+				 (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
+		{
+			return GEANY_ENCODING_UTF_32BE; // Big endian
+		}
+		if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
+				 (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
+		{
+			return GEANY_ENCODING_UTF_32LE; // Little endian
+		}
+		if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
+				 (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
+		{
+			 return GEANY_ENCODING_UTF_7;
+		}
+	}
+	if (len >= 2)
+	{
+		if (bom_len)
+			*bom_len = 2;
+
+		if ((guchar)string[0]==0xfe && (guchar)string[1] == 0xff)
+		{
+			return GEANY_ENCODING_UTF_16BE; // Big endian
+		}
+		if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
+		{
+			return GEANY_ENCODING_UTF_16LE; // Little endian
+		}
+	}
+	if (bom_len)
+		*bom_len = 0;
+	return GEANY_ENCODING_NONE;
+}
+
+
+gboolean encodings_is_unicode_charset(const gchar *string)
+{
+	if (string != NULL && (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
+	{
+		return TRUE;
+	}
+	return FALSE;
+}
+
+

Modified: trunk/src/encodings.h
===================================================================
--- trunk/src/encodings.h	2007-01-07 14:04:13 UTC (rev 1166)
+++ trunk/src/encodings.h	2007-01-07 16:22:41 UTC (rev 1167)
@@ -74,7 +74,9 @@
 gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gsize size,
 											  const gchar *charset, gboolean fast);
 
+gboolean encodings_is_unicode_charset(const gchar *string);
 
+
 /*
  * The original versions of the following tables are taken from profterm
  *
@@ -152,7 +154,7 @@
 	GEANY_ENCODING_WINDOWS_1256,
 	GEANY_ENCODING_WINDOWS_1257,
 	GEANY_ENCODING_WINDOWS_1258,
-	
+
 	GEANY_ENCODING_NONE,
 
 	GEANY_ENCODINGS_MAX
@@ -161,4 +163,7 @@
 
 GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
 
+
+GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len);
+
 #endif

Modified: trunk/src/ui_utils.c
===================================================================
--- trunk/src/ui_utils.c	2007-01-07 14:04:13 UTC (rev 1166)
+++ trunk/src/ui_utils.c	2007-01-07 16:22:41 UTC (rev 1167)
@@ -120,7 +120,8 @@
 			(doc_list[idx].readonly) ? ", read only" : "",
 			cur_tag,
 			(doc_list[idx].encoding) ? doc_list[idx].encoding : _("unknown"),
-			(utils_is_unicode_charset(doc_list[idx].encoding)) ? ((doc_list[idx].has_bom) ? _("(with BOM)") : _("(without BOM)")) : "",
+			(encodings_is_unicode_charset(doc_list[idx].encoding)) ?
+				((doc_list[idx].has_bom) ? _("(with BOM)") : _("(without BOM)")) : "",
 			(doc_list[idx].file_type) ? doc_list[idx].file_type->title : _("unknown"));
 		set_statusbar(text, TRUE);	// can be overridden by status messages
 		g_free(text);
@@ -706,7 +707,7 @@
 																					TRUE);
 
 	gtk_widget_set_sensitive(lookup_widget(app->window, "menu_write_unicode_bom1"),
-			utils_is_unicode_charset(doc_list[idx].encoding));
+			encodings_is_unicode_charset(doc_list[idx].encoding));
 
 	encodings_select_radio_item(doc_list[idx].encoding);
 	filetypes_select_radio_item(doc_list[idx].file_type);

Modified: trunk/src/utils.c
===================================================================
--- trunk/src/utils.c	2007-01-07 14:04:13 UTC (rev 1166)
+++ trunk/src/utils.c	2007-01-07 16:22:41 UTC (rev 1167)
@@ -1403,50 +1403,6 @@
 }
 
 
-gchar *utils_scan_unicode_bom(const gchar *string)
-{
-	if ((unsigned char)string[0] == 0xef && (unsigned char)string[1] == 0xbb &&
-		(unsigned char)string[2] == 0xbf)
-	{
-		return g_strdup("UTF-8");
-	}
-	else if ((unsigned char)string[0] == 0x00 && (unsigned char)string[1] == 0x00 &&
-			 (unsigned char)string[2] == 0xfe && (unsigned char)string[3] == 0xff)
-	{
-		return g_strdup("UTF-32BE"); // Big endian
-	}
-	else if ((unsigned char)string[0] == 0xff && (unsigned char)string[1] == 0xfe &&
-			 (unsigned char)string[2] == 0x00 && (unsigned char)string[3] == 0x00)
-	{
-		return g_strdup("UTF-32LE"); // Little endian
-	}
-	else if ((unsigned char)string[0]==0xfe && (unsigned char)string[1] == 0xff)
-	{
-		return g_strdup("UTF-16BE"); // Big endian
-	}
-	else if ((unsigned char)string[0] == 0xff && (unsigned char)string[1] == 0xfe)
-	{
-		return g_strdup("UTF-16LE"); // Little endian
-	}
-	else if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
-			 (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
-	{
-		 return g_strdup("UTF-7");
-	}
-	return NULL;
-}
-
-
-gboolean utils_is_unicode_charset(const gchar *string)
-{
-	if (string != NULL && (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
-	{
-		return TRUE;
-	}
-	return FALSE;
-}
-
-
 /* Wraps a string in place, replacing a space with a newline character.
  * wrapstart is the minimum position to start wrapping or -1 for default */
 gboolean utils_wrap_string(gchar *string, gint wrapstart)

Modified: trunk/src/utils.h
===================================================================
--- trunk/src/utils.h	2007-01-07 14:04:13 UTC (rev 1166)
+++ trunk/src/utils.h	2007-01-07 16:22:41 UTC (rev 1167)
@@ -135,10 +135,6 @@
  * Replaces \\, \r, \n, \t and \uXXX by their real counterparts */
 gboolean utils_str_replace_escape(gchar *string);
 
-gchar *utils_scan_unicode_bom(const gchar *string);
-
-gboolean utils_is_unicode_charset(const gchar *string);
-
 /* Wraps a string in place, replacing a space with a newline character.
  * wrapstart is the minimum position to start wrapping or -1 for default */
 gboolean utils_wrap_string(gchar *string, gint wrapstart);


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.