Revision: 1167 http://svn.sourceforge.net/geany/?rev=1167&view=rev Author: ntrel Date: 2007-01-07 08:22:41 -0800 (Sun, 07 Jan 2007)
Log Message: ----------- Fix memory leak when using utils_scan_unicode_bom(). Prevent invalid memory read in utils_scan_unicode_bom() when text length is < 4. Move utils_scan_unicode_bom(), utils_is_unicode_charset() to encodings.c. Read the BOM length in handle_bom().
Modified Paths: -------------- trunk/ChangeLog trunk/src/dialogs.c trunk/src/document.c trunk/src/encodings.c trunk/src/encodings.h trunk/src/ui_utils.c trunk/src/utils.c trunk/src/utils.h
Modified: trunk/ChangeLog =================================================================== --- trunk/ChangeLog 2007-01-07 14:04:13 UTC (rev 1166) +++ trunk/ChangeLog 2007-01-07 16:22:41 UTC (rev 1167) @@ -1,3 +1,15 @@ +2007-01-07 Nick Treleaven <nick.treleaven@btinternet.com> + + * src/utils.c, src/utils.h, src/encodings.c, src/document.c, + src/encodings.h, src/dialogs.c, src/ui_utils.c: + Fix memory leak when using utils_scan_unicode_bom(). + Prevent invalid memory read in utils_scan_unicode_bom() when text + length is < 4. + Move utils_scan_unicode_bom(), utils_is_unicode_charset() to + encodings.c. + Read the BOM length in handle_bom(). + + 2007-01-07 Enrico Tröger <enrico.troeger@uvena.de>
* geany.glade, src/interface.c: Fixed typo.
Modified: trunk/src/dialogs.c =================================================================== --- trunk/src/dialogs.c 2007-01-07 14:04:13 UTC (rev 1166) +++ trunk/src/dialogs.c 2007-01-07 16:22:41 UTC (rev 1167) @@ -1013,8 +1013,9 @@ gtk_misc_set_alignment(GTK_MISC(label), 1, 0);
enctext = g_strdup_printf("%s %s", - doc_list[idx].encoding, - (utils_is_unicode_charset(doc_list[idx].encoding)) ? ((doc_list[idx].has_bom) ? _("(with BOM)") : _("(without BOM)")) : ""); + doc_list[idx].encoding, + (encodings_is_unicode_charset(doc_list[idx].encoding)) ? + ((doc_list[idx].has_bom) ? _("(with BOM)") : _("(without BOM)")) : "");
label = gtk_label_new(enctext); g_free(enctext);
Modified: trunk/src/document.c =================================================================== --- trunk/src/document.c 2007-01-07 14:04:13 UTC (rev 1166) +++ trunk/src/document.c 2007-01-07 16:22:41 UTC (rev 1167) @@ -436,17 +436,14 @@ static gboolean handle_forced_encoding(FileData *filedata, const gchar *forced_enc) { + GeanyEncodingIndex enc_idx; + if (utils_str_equal(forced_enc, "UTF-8")) { if (! g_utf8_validate(filedata->data, filedata->len, NULL)) { return FALSE; } - else - { - filedata->bom = utils_str_equal(utils_scan_unicode_bom(filedata->data), "UTF-8"); - filedata->enc = g_strdup(forced_enc); - } } else { @@ -461,53 +458,68 @@ g_free(filedata->data); filedata->data = converted_text; filedata->len = strlen(converted_text); - filedata->bom = utils_str_equal(utils_scan_unicode_bom(filedata->data), "UTF-8"); - filedata->enc = g_strdup(forced_enc); } } + enc_idx = encodings_scan_unicode_bom(filedata->data, filedata->len, NULL); + filedata->bom = (enc_idx == GEANY_ENCODING_UTF_8); + filedata->enc = g_strdup(forced_enc); return TRUE; }
+// detect encoding and convert to UTF-8 if necessary static gboolean handle_encoding(FileData *filedata) { - if (filedata->len > 0) - { // the usual way to detect encoding and convert to UTF-8 - if (filedata->len >= 4) + g_return_val_if_fail(filedata->enc == NULL, FALSE); + g_return_val_if_fail(filedata->bom == FALSE, FALSE); + + if (filedata->len == 0) + { + // we have no data so assume UTF-8 + filedata->enc = g_strdup("UTF-8"); + } + else + { + // first check for a BOM + GeanyEncodingIndex enc_idx = + encodings_scan_unicode_bom(filedata->data, filedata->len, NULL); + + if (enc_idx != GEANY_ENCODING_NONE) { - filedata->enc = utils_scan_unicode_bom(filedata->data); - } - if (filedata->enc != NULL) - { + filedata->enc = g_strdup(encodings[enc_idx].charset); filedata->bom = TRUE; - if ((filedata->enc)[4] != '8') // the BOM indicated something else than UTF-8 + + if (enc_idx != GEANY_ENCODING_UTF_8) // the BOM indicated something else than UTF-8 { gchar *converted_text = encodings_convert_to_utf8_from_charset( - filedata->data, filedata->len, filedata->enc, FALSE); - if (converted_text == NULL) + filedata->data, filedata->len, filedata->enc, FALSE); + if (converted_text != NULL) { + g_free(filedata->data); + filedata->data = converted_text; + filedata->len = strlen(converted_text); + } + else + { + // there was a problem converting data from BOM encoding type g_free(filedata->enc); filedata->enc = NULL; filedata->bom = FALSE; } - else - { - g_free(filedata->data); - filedata->data = converted_text; - filedata->len = strlen(converted_text); - } } } - // this if is important, else doesn't work because enc can be altered in the above block - if (filedata->enc == NULL) + + if (filedata->enc == NULL) // either there was no BOM or the BOM encoding failed { + // try UTF-8 first if (g_utf8_validate(filedata->data, filedata->len, NULL)) { filedata->enc = g_strdup("UTF-8"); } else { + // detect the encoding gchar *converted_text = encodings_convert_to_utf8(filedata->data, 
filedata->len, &filedata->enc);
@@ -515,19 +527,12 @@ { return FALSE; } - else - { - g_free(filedata->data); - filedata->data = converted_text; - filedata->len = strlen(converted_text); - } + g_free(filedata->data); + filedata->data = converted_text; + filedata->len = strlen(converted_text); } } } - else - { - filedata->enc = g_strdup("UTF-8"); - } return TRUE; }
@@ -535,14 +540,15 @@ static void handle_bom(FileData *filedata) { - gchar *data_without_bom; + guint bom_len;
- g_return_if_fail(filedata->len >= 3); + encodings_scan_unicode_bom(filedata->data, filedata->len, &bom_len); + g_return_if_fail(bom_len != 0);
- data_without_bom = g_strdup(filedata->data + 3); - g_free(filedata->data); - filedata->data = data_without_bom; - filedata->len -= 3; + filedata->len -= bom_len; + // overwrite the BOM with the remainder of the file contents, plus the NULL terminator. + g_memmove(filedata->data, filedata->data + bom_len, filedata->len + 1); + g_realloc(filedata->data, filedata->len + 1); }
@@ -871,7 +877,7 @@ sci_convert_eols(doc_list[idx].sci, sci_get_eol_mode(doc_list[idx].sci));
len = sci_get_length(doc_list[idx].sci) + 1; - if (doc_list[idx].has_bom && utils_is_unicode_charset(doc_list[idx].encoding)) + if (doc_list[idx].has_bom && encodings_is_unicode_charset(doc_list[idx].encoding)) { data = (gchar*) g_malloc(len + 3); // 3 chars for BOM data[0] = 0xef; @@ -1642,7 +1648,7 @@
ui_update_statusbar(idx, -1); gtk_widget_set_sensitive(lookup_widget(app->window, "menu_write_unicode_bom1"), - utils_is_unicode_charset(doc_list[idx].encoding)); + encodings_is_unicode_charset(doc_list[idx].encoding)); }
Modified: trunk/src/encodings.c =================================================================== --- trunk/src/encodings.c 2007-01-07 14:04:13 UTC (rev 1166) +++ trunk/src/encodings.c 2007-01-07 16:22:41 UTC (rev 1167) @@ -39,12 +39,12 @@
-#define fill(v, w, x, y, z) \ - encodings[x].idx = x; \ - encodings[x].order = v; \ - encodings[x].group = w; \ - encodings[x].charset = y; \ - encodings[x].name = z; +#define fill(Order, Group, Idx, Charset, Name) \ + encodings[Idx].idx = Idx; \ + encodings[Idx].order = Order; \ + encodings[Idx].group = Group; \ + encodings[Idx].charset = Charset; \ + encodings[Idx].name = Name;
static void init_encodings(void) { @@ -377,3 +377,72 @@
return NULL; } + + +/* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index, + * otherwise GEANY_ENCODING_NONE. + * */ +GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len) +{ + if (len >= 3) + { + if (bom_len) + *bom_len = 3; + + if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb && + (guchar)string[2] == 0xbf) + { + return GEANY_ENCODING_UTF_8; + } + } + if (len >= 4) + { + if (bom_len) + *bom_len = 4; + + if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 && + (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff) + { + return GEANY_ENCODING_UTF_32BE; // Big endian + } + if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe && + (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00) + { + return GEANY_ENCODING_UTF_32LE; // Little endian + } + if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) && + (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f)) + { + return GEANY_ENCODING_UTF_7; + } + } + if (len >= 2) + { + if (bom_len) + *bom_len = 2; + + if ((guchar)string[0]==0xfe && (guchar)string[1] == 0xff) + { + return GEANY_ENCODING_UTF_16BE; // Big endian + } + if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe) + { + return GEANY_ENCODING_UTF_16LE; // Little endian + } + } + if (bom_len) + *bom_len = 0; + return GEANY_ENCODING_NONE; +} + + +gboolean encodings_is_unicode_charset(const gchar *string) +{ + if (string != NULL && (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0)) + { + return TRUE; + } + return FALSE; +} + +
Modified: trunk/src/encodings.h =================================================================== --- trunk/src/encodings.h 2007-01-07 14:04:13 UTC (rev 1166) +++ trunk/src/encodings.h 2007-01-07 16:22:41 UTC (rev 1167) @@ -74,7 +74,9 @@ gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gsize size, const gchar *charset, gboolean fast);
+gboolean encodings_is_unicode_charset(const gchar *string);
+ /* * The original versions of the following tables are taken from profterm * @@ -152,7 +154,7 @@ GEANY_ENCODING_WINDOWS_1256, GEANY_ENCODING_WINDOWS_1257, GEANY_ENCODING_WINDOWS_1258, - + GEANY_ENCODING_NONE,
GEANY_ENCODINGS_MAX @@ -161,4 +163,7 @@
GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
+ +GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len); + #endif
Modified: trunk/src/ui_utils.c =================================================================== --- trunk/src/ui_utils.c 2007-01-07 14:04:13 UTC (rev 1166) +++ trunk/src/ui_utils.c 2007-01-07 16:22:41 UTC (rev 1167) @@ -120,7 +120,8 @@ (doc_list[idx].readonly) ? ", read only" : "", cur_tag, (doc_list[idx].encoding) ? doc_list[idx].encoding : _("unknown"), - (utils_is_unicode_charset(doc_list[idx].encoding)) ? ((doc_list[idx].has_bom) ? _("(with BOM)") : _("(without BOM)")) : "", + (encodings_is_unicode_charset(doc_list[idx].encoding)) ? + ((doc_list[idx].has_bom) ? _("(with BOM)") : _("(without BOM)")) : "", (doc_list[idx].file_type) ? doc_list[idx].file_type->title : _("unknown")); set_statusbar(text, TRUE); // can be overridden by status messages g_free(text); @@ -706,7 +707,7 @@ TRUE);
gtk_widget_set_sensitive(lookup_widget(app->window, "menu_write_unicode_bom1"), - utils_is_unicode_charset(doc_list[idx].encoding)); + encodings_is_unicode_charset(doc_list[idx].encoding));
encodings_select_radio_item(doc_list[idx].encoding); filetypes_select_radio_item(doc_list[idx].file_type);
Modified: trunk/src/utils.c =================================================================== --- trunk/src/utils.c 2007-01-07 14:04:13 UTC (rev 1166) +++ trunk/src/utils.c 2007-01-07 16:22:41 UTC (rev 1167) @@ -1403,50 +1403,6 @@ }
-gchar *utils_scan_unicode_bom(const gchar *string) -{ - if ((unsigned char)string[0] == 0xef && (unsigned char)string[1] == 0xbb && - (unsigned char)string[2] == 0xbf) - { - return g_strdup("UTF-8"); - } - else if ((unsigned char)string[0] == 0x00 && (unsigned char)string[1] == 0x00 && - (unsigned char)string[2] == 0xfe && (unsigned char)string[3] == 0xff) - { - return g_strdup("UTF-32BE"); // Big endian - } - else if ((unsigned char)string[0] == 0xff && (unsigned char)string[1] == 0xfe && - (unsigned char)string[2] == 0x00 && (unsigned char)string[3] == 0x00) - { - return g_strdup("UTF-32LE"); // Little endian - } - else if ((unsigned char)string[0]==0xfe && (unsigned char)string[1] == 0xff) - { - return g_strdup("UTF-16BE"); // Big endian - } - else if ((unsigned char)string[0] == 0xff && (unsigned char)string[1] == 0xfe) - { - return g_strdup("UTF-16LE"); // Little endian - } - else if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) && - (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f)) - { - return g_strdup("UTF-7"); - } - return NULL; -} - - -gboolean utils_is_unicode_charset(const gchar *string) -{ - if (string != NULL && (strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0)) - { - return TRUE; - } - return FALSE; -} - - /* Wraps a string in place, replacing a space with a newline character. * wrapstart is the minimum position to start wrapping or -1 for default */ gboolean utils_wrap_string(gchar *string, gint wrapstart)
Modified: trunk/src/utils.h =================================================================== --- trunk/src/utils.h 2007-01-07 14:04:13 UTC (rev 1166) +++ trunk/src/utils.h 2007-01-07 16:22:41 UTC (rev 1167) @@ -135,10 +135,6 @@ * Replaces \, \r, \n, \t and \uXXX by their real counterparts */ gboolean utils_str_replace_escape(gchar *string);
-gchar *utils_scan_unicode_bom(const gchar *string); - -gboolean utils_is_unicode_charset(const gchar *string); - /* Wraps a string in place, replacing a space with a newline character. * wrapstart is the minimum position to start wrapping or -1 for default */ gboolean utils_wrap_string(gchar *string, gint wrapstart);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.