SF.net SVN: geany:[5667] trunk
colombanw at users.sourceforge.net
colombanw at xxxxx
Thu Mar 31 23:28:11 UTC 2011
Revision: 5667
http://geany.svn.sourceforge.net/geany/?rev=5667&view=rev
Author: colombanw
Date: 2011-03-31 23:28:10 +0000 (Thu, 31 Mar 2011)
Log Message:
-----------
Always try to honor the charset found in the document
Don't assume a file that is valid as UTF-8 is actually UTF-8 if we
have an information telling us otherwise in the document (a regex
match).
Also fix check for locale encoding.
Modified Paths:
--------------
trunk/ChangeLog
trunk/src/encodings.c
Modified: trunk/ChangeLog
===================================================================
--- trunk/ChangeLog 2011-03-31 23:27:58 UTC (rev 5666)
+++ trunk/ChangeLog 2011-03-31 23:28:10 UTC (rev 5667)
@@ -9,6 +9,13 @@
This also makes encodings_get_idx_from_charset() and
encodings_get_from_charset() more permissive regarding the passed-in
encoding name.
+ * src/encodings.c:
+ Always try to honor the charset found in the document, even if the
+ document can be loaded as UTF-8.
+ This make files encoded with e.g. ISO-8859-1 that have the proper
+ information in them but only use the UTF-8 compatible part of
+ ISO-8859-1 to be properly loaded as ISO-8859-1, rather than UTF-8.
+ Also fix check for locale encoding.
2011-03-31 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
Modified: trunk/src/encodings.c
===================================================================
--- trunk/src/encodings.c 2011-03-31 23:27:58 UTC (rev 5666)
+++ trunk/src/encodings.c 2011-03-31 23:28:10 UTC (rev 5667)
@@ -562,42 +562,38 @@
}
-/**
- * Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
- * @a used_encoding.
- *
- * @param buffer the input string to convert.
- * @param size the length of the string, or -1 if the string is nul-terminated.
- * @param used_encoding return location of the detected encoding of the input string, or @c NULL.
- *
- * @return If the conversion was successful, a newly allocated nul-terminated string,
- * which must be freed with @c g_free(). Otherwise @c NULL.
- **/
-gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
+static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
{
- gchar *locale_charset = NULL;
- gchar *regex_charset = NULL;
+ guint i;
+
+ for (i = 0; i < G_N_ELEMENTS(pregs); i++)
+ {
+ gchar *charset;
+
+ if ((charset = regex_match(&pregs[i], buffer, size)) != NULL)
+ return charset;
+ }
+ return NULL;
+}
+
+
+static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gsize size,
+ const gchar *suggested_charset, gchar **used_encoding)
+{
+ const gchar *locale_charset = NULL;
const gchar *charset;
gchar *utf8_content;
- gboolean check_regex = FALSE;
+ gboolean check_suggestion = suggested_charset != NULL;
gboolean check_locale = FALSE;
- gint i, len, preferred_charset;
+ gint i, preferred_charset;
if ((gint)size == -1)
{
size = strlen(buffer);
}
- /* first try to read the encoding from the file content */
- len = (gint) G_N_ELEMENTS(pregs);
- for (i = 0; i < len && ! check_regex; i++)
- {
- if ((regex_charset = regex_match(&pregs[i], buffer, size)) != NULL)
- check_regex = TRUE;
- }
-
/* current locale is not UTF-8, we have to check this charset */
- check_locale = ! g_get_charset((const gchar**) &charset);
+ check_locale = ! g_get_charset(&locale_charset);
/* First check for preferred charset, if specified */
preferred_charset = file_prefs.default_open_encoding;
@@ -615,12 +611,12 @@
if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
continue;
- if (check_regex)
+ if (check_suggestion)
{
- check_regex = FALSE;
- charset = encodings_normalize_charset(regex_charset);
- if (! charset) /* we found a regex encoding that we can't normalize, try it as is */
- charset = regex_charset;
+ check_suggestion = FALSE;
+ charset = encodings_normalize_charset(suggested_charset);
+ if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
+ charset = suggested_charset;
i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
}
else if (check_locale)
@@ -662,16 +658,39 @@
}
*used_encoding = g_strdup(charset);
}
- g_free(regex_charset);
return utf8_content;
}
}
- g_free(regex_charset);
return NULL;
}
+/**
+ * Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
+ * @a used_encoding.
+ *
+ * @param buffer the input string to convert.
+ * @param size the length of the string, or -1 if the string is nul-terminated.
+ * @param used_encoding return location of the detected encoding of the input string, or @c NULL.
+ *
+ * @return If the conversion was successful, a newly allocated nul-terminated string,
+ * which must be freed with @c g_free(). Otherwise @c NULL.
+ **/
+gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
+{
+ gchar *regex_charset;
+ gchar *utf8;
+
+ /* first try to read the encoding from the file content */
+ regex_charset = encodings_check_regexes(buffer, size);
+ utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding);
+ g_free(regex_charset);
+
+ return utf8;
+}
+
+
/* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
* otherwise GEANY_ENCODING_NONE.
* */
@@ -826,25 +845,30 @@
if (buffer->enc == NULL) /* either there was no BOM or the BOM encoding failed */
{
+ /* first try to read the encoding from the file content */
+ gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
+
/* try UTF-8 first */
- if ((buffer->size == buffer->len) &&
- g_utf8_validate(buffer->data, buffer->len, NULL))
+ if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
+ (buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
{
buffer->enc = g_strdup("UTF-8");
}
else
{
/* detect the encoding */
- gchar *converted_text = encodings_convert_to_utf8(buffer->data,
- buffer->size, &buffer->enc);
+ gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
+ buffer->size, regex_charset, &buffer->enc);
if (converted_text == NULL)
{
+ g_free(regex_charset);
return FALSE;
}
setptr(buffer->data, converted_text);
buffer->len = strlen(converted_text);
}
+ g_free(regex_charset);
}
}
return TRUE;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
More information about the Commits
mailing list