SF.net SVN: geany:[5667] trunk

1 Apr 2011

Revision: 5667
          http://geany.svn.sourceforge.net/geany/?rev=5667&view=rev
Author:   colombanw
Date:     2011-03-31 23:28:10 +0000 (Thu, 31 Mar 2011)
Log Message:
-----------
Always try to honor the charset found in the document
Don't assume a file that is valid as UTF-8 is actually UTF-8 if
information found in the document (a regex match) tells us
otherwise.
Also fix check for locale encoding.
Modified Paths:
--------------
    trunk/ChangeLog
    trunk/src/encodings.c
Modified: trunk/ChangeLog
===================================================================

--- trunk/ChangeLog	2011-03-31 23:27:58 UTC (rev 5666)
+++ trunk/ChangeLog	2011-03-31 23:28:10 UTC (rev 5667)
@@ -9,6 +9,13 @@
    This also makes encodings_get_idx_from_charset() and
    encodings_get_from_charset() more permissive regarding the passed-in
    encoding name.
+ * src/encodings.c:
+   Always try to honor the charset found in the document, even if the
+   document can be loaded as UTF-8.
+   This makes files encoded with e.g. ISO-8859-1 that carry the proper
+   charset information but only use the UTF-8 compatible part of
+   ISO-8859-1 load properly as ISO-8859-1, rather than as UTF-8.
+   Also fix check for locale encoding.
2011-03-31  Nick Treleaven  <nick(dot)treleaven(at)btinternet(dot)com>
Modified: trunk/src/encodings.c
===================================================================
--- trunk/src/encodings.c	2011-03-31 23:27:58 UTC (rev 5666)
+++ trunk/src/encodings.c	2011-03-31 23:28:10 UTC (rev 5667)
@@ -562,42 +562,38 @@
 }
-/**
- *  Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
- *  @a used_encoding.
- *
- *  @param buffer the input string to convert.
- *  @param size the length of the string, or -1 if the string is nul-terminated.
- *  @param used_encoding return location of the detected encoding of the input string, or @c NULL.
- *
- *  @return If the conversion was successful, a newly allocated nul-terminated string,
- *    which must be freed with @c g_free(). Otherwise @c NULL.
- **/
-gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
+static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
 {
-	gchar *locale_charset = NULL;
-	gchar *regex_charset = NULL;
+	guint i;
+
+	for (i = 0; i < G_N_ELEMENTS(pregs); i++)
+	{
+		gchar *charset;
+
+		if ((charset = regex_match(&pregs[i], buffer, size)) != NULL)
+			return charset;
+	}
+	return NULL;
+}
+
+
+static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gsize size,
+		const gchar *suggested_charset, gchar **used_encoding)
+{
+	const gchar *locale_charset = NULL;
    const gchar *charset;
    gchar *utf8_content;
-	gboolean check_regex = FALSE;
+	gboolean check_suggestion = suggested_charset != NULL;
    gboolean check_locale = FALSE;
-	gint i, len, preferred_charset;
+	gint i, preferred_charset;
if ((gint)size == -1)
    {
    	size = strlen(buffer);
    }
-	/* first try to read the encoding from the file content */
-	len = (gint) G_N_ELEMENTS(pregs);
-	for (i = 0; i < len && ! check_regex; i++)
-	{
-		if ((regex_charset = regex_match(&pregs[i], buffer, size)) != NULL)
-			check_regex = TRUE;
-	}
-
    /* current locale is not UTF-8, we have to check this charset */
-	check_locale = ! g_get_charset((const gchar**) &charset);
+	check_locale = ! g_get_charset(&locale_charset);
/* First check for preferred charset, if specified */
    preferred_charset = file_prefs.default_open_encoding;
@@ -615,12 +611,12 @@
    	if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
    		continue;
-		if (check_regex)
+		if (check_suggestion)
    	{
-			check_regex = FALSE;
-			charset = encodings_normalize_charset(regex_charset);
-			if (! charset) /* we found a regex encoding that we can't normalize, try it as is */
-				charset = regex_charset;
+			check_suggestion = FALSE;
+			charset = encodings_normalize_charset(suggested_charset);
+			if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
+				charset = suggested_charset;
    		i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
    	}
    	else if (check_locale)
@@ -662,16 +658,39 @@
    			}
    			*used_encoding = g_strdup(charset);
    		}
-			g_free(regex_charset);
    		return utf8_content;
    	}
    }
-	g_free(regex_charset);
return NULL;
 }
+/**
+ *  Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
+ *  @a used_encoding.
+ *
+ *  @param buffer the input string to convert.
+ *  @param size the length of the string, or -1 if the string is nul-terminated.
+ *  @param used_encoding return location of the detected encoding of the input string, or @c NULL.
+ *
+ *  @return If the conversion was successful, a newly allocated nul-terminated string,
+ *    which must be freed with @c g_free(). Otherwise @c NULL.
+ **/
+gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
+{
+	gchar *regex_charset;
+	gchar *utf8;
+
+	/* first try to read the encoding from the file content */
+	regex_charset = encodings_check_regexes(buffer, size);
+	utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding);
+	g_free(regex_charset);
+
+	return utf8;
+}
+
+
 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
  * otherwise GEANY_ENCODING_NONE.
  * */
@@ -826,25 +845,30 @@
if (buffer->enc == NULL)	/* either there was no BOM or the BOM encoding failed */
    	{
+			/* first try to read the encoding from the file content */
+			gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
+
    		/* try UTF-8 first */
-			if ((buffer->size == buffer->len) &&
-				g_utf8_validate(buffer->data, buffer->len, NULL))
+			if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
+				(buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
    		{
    			buffer->enc = g_strdup("UTF-8");
    		}
    		else
    		{
    			/* detect the encoding */
-				gchar *converted_text = encodings_convert_to_utf8(buffer->data,
-					buffer->size, &buffer->enc);
+				gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
+					buffer->size, regex_charset, &buffer->enc);
if (converted_text == NULL)
    			{
+					g_free(regex_charset);
    				return FALSE;
    			}
    			setptr(buffer->data, converted_text);
    			buffer->len = strlen(converted_text);
    		}
+			g_free(regex_charset);
    	}
    }
    return TRUE;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

    

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

SF.net SVN: geany:[5667] trunk