SF.net SVN: geany:[5667] trunk

Thu Mar 31 23:28:11 UTC 2011

Revision: 5667
          http://geany.svn.sourceforge.net/geany/?rev=5667&view=rev
Author:   colombanw
Date:     2011-03-31 23:28:10 +0000 (Thu, 31 Mar 2011)

Log Message:
-----------
Always try to honor the charset found in the document

Don't assume a file that is valid as UTF-8 is actually UTF-8 if
information found in the document (a regex match) tells us
otherwise.

Also fix check for locale encoding.

Modified Paths:
--------------
    trunk/ChangeLog
    trunk/src/encodings.c

Modified: trunk/ChangeLog
===================================================================

--- trunk/ChangeLog	2011-03-31 23:27:58 UTC (rev 5666)
+++ trunk/ChangeLog	2011-03-31 23:28:10 UTC (rev 5667)
@@ -9,6 +9,13 @@
    This also makes encodings_get_idx_from_charset() and
    encodings_get_from_charset() more permissive regarding the passed-in
    encoding name.
+ * src/encodings.c:
+   Always try to honor the charset found in the document, even if the
+   document can be loaded as UTF-8.
+   This makes files encoded with e.g. ISO-8859-1 that have the proper
+   information in them but only use the UTF-8 compatible part of
+   ISO-8859-1 load properly as ISO-8859-1, rather than UTF-8.
+   Also fix check for locale encoding.
 
 
 2011-03-31  Nick Treleaven  <nick(dot)treleaven(at)btinternet(dot)com>

Modified: trunk/src/encodings.c
===================================================================
--- trunk/src/encodings.c	2011-03-31 23:27:58 UTC (rev 5666)
+++ trunk/src/encodings.c	2011-03-31 23:28:10 UTC (rev 5667)
@@ -562,42 +562,38 @@
 }
 
 
-/**
- *  Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
- *  @a used_encoding.
- *
- *  @param buffer the input string to convert.
- *  @param size the length of the string, or -1 if the string is nul-terminated.
- *  @param used_encoding return location of the detected encoding of the input string, or @c NULL.
- *
- *  @return If the conversion was successful, a newly allocated nul-terminated string,
- *    which must be freed with @c g_free(). Otherwise @c NULL.
- **/
-gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
+static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
 {
-	gchar *locale_charset = NULL;
-	gchar *regex_charset = NULL;
+	guint i;
+
+	for (i = 0; i < G_N_ELEMENTS(pregs); i++)
+	{
+		gchar *charset;
+
+		if ((charset = regex_match(&pregs[i], buffer, size)) != NULL)
+			return charset;
+	}
+	return NULL;
+}
+
+
+static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gsize size,
+		const gchar *suggested_charset, gchar **used_encoding)
+{
+	const gchar *locale_charset = NULL;
 	const gchar *charset;
 	gchar *utf8_content;
-	gboolean check_regex = FALSE;
+	gboolean check_suggestion = suggested_charset != NULL;
 	gboolean check_locale = FALSE;
-	gint i, len, preferred_charset;
+	gint i, preferred_charset;
 
 	if ((gint)size == -1)
 	{
 		size = strlen(buffer);
 	}
 
-	/* first try to read the encoding from the file content */
-	len = (gint) G_N_ELEMENTS(pregs);
-	for (i = 0; i < len && ! check_regex; i++)
-	{
-		if ((regex_charset = regex_match(&pregs[i], buffer, size)) != NULL)
-			check_regex = TRUE;
-	}
-
 	/* current locale is not UTF-8, we have to check this charset */
-	check_locale = ! g_get_charset((const gchar**) &charset);
+	check_locale = ! g_get_charset(&locale_charset);
 
 	/* First check for preferred charset, if specified */
 	preferred_charset = file_prefs.default_open_encoding;
@@ -615,12 +611,12 @@
 		if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
 			continue;
 
-		if (check_regex)
+		if (check_suggestion)
 		{
-			check_regex = FALSE;
-			charset = encodings_normalize_charset(regex_charset);
-			if (! charset) /* we found a regex encoding that we can't normalize, try it as is */
-				charset = regex_charset;
+			check_suggestion = FALSE;
+			charset = encodings_normalize_charset(suggested_charset);
+			if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
+				charset = suggested_charset;
 			i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 		}
 		else if (check_locale)
@@ -662,16 +658,39 @@
 				}
 				*used_encoding = g_strdup(charset);
 			}
-			g_free(regex_charset);
 			return utf8_content;
 		}
 	}
-	g_free(regex_charset);
 
 	return NULL;
 }
 
 
+/**
+ *  Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
+ *  @a used_encoding.
+ *
+ *  @param buffer the input string to convert.
+ *  @param size the length of the string, or -1 if the string is nul-terminated.
+ *  @param used_encoding return location of the detected encoding of the input string, or @c NULL.
+ *
+ *  @return If the conversion was successful, a newly allocated nul-terminated string,
+ *    which must be freed with @c g_free(). Otherwise @c NULL.
+ **/
+gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
+{
+	gchar *regex_charset;
+	gchar *utf8;
+
+	/* first try to read the encoding from the file content */
+	regex_charset = encodings_check_regexes(buffer, size);
+	utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding);
+	g_free(regex_charset);
+
+	return utf8;
+}
+
+
 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
  * otherwise GEANY_ENCODING_NONE.
  * */
@@ -826,25 +845,30 @@
 
 		if (buffer->enc == NULL)	/* either there was no BOM or the BOM encoding failed */
 		{
+			/* first try to read the encoding from the file content */
+			gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
+
 			/* try UTF-8 first */
-			if ((buffer->size == buffer->len) &&
-				g_utf8_validate(buffer->data, buffer->len, NULL))
+			if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
+				(buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
 			{
 				buffer->enc = g_strdup("UTF-8");
 			}
 			else
 			{
 				/* detect the encoding */
-				gchar *converted_text = encodings_convert_to_utf8(buffer->data,
-					buffer->size, &buffer->enc);
+				gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
+					buffer->size, regex_charset, &buffer->enc);
 
 				if (converted_text == NULL)
 				{
+					g_free(regex_charset);
 					return FALSE;
 				}
 				setptr(buffer->data, converted_text);
 				buffer->len = strlen(converted_text);
 			}
+			g_free(regex_charset);
 		}
 	}
 	return TRUE;


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.