Revision: 5667
http://geany.svn.sourceforge.net/geany/?rev=5667&view=rev
Author: colombanw
Date: 2011-03-31 23:28:10 +0000 (Thu, 31 Mar 2011)
Log Message:
-----------
Always try to honor the charset found in the document
Don't assume a file that is valid as UTF-8 is actually UTF-8 if we
have information telling us otherwise in the document (a regex
match).
Also fix the check for the locale encoding: the g_get_charset() result
is now stored in locale_charset instead of charset.
Modified Paths:
--------------
trunk/ChangeLog
trunk/src/encodings.c
Modified: trunk/ChangeLog
===================================================================
--- trunk/ChangeLog 2011-03-31 23:27:58 UTC (rev 5666)
+++ trunk/ChangeLog 2011-03-31 23:28:10 UTC (rev 5667)
@@ -9,6 +9,13 @@
This also makes encodings_get_idx_from_charset() and
encodings_get_from_charset() more permissive regarding the passed-in
encoding name.
+ * src/encodings.c:
+ Always try to honor the charset found in the document, even if the
+ document can be loaded as UTF-8.
+ This make files encoded with e.g. ISO-8859-1 that have the proper
+ information in them but only use the UTF-8 compatible part of
+ ISO-8859-1 to be properly loaded as ISO-8859-1, rather than UTF-8.
+ Also fix check for locale encoding.
2011-03-31 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
Modified: trunk/src/encodings.c
===================================================================
--- trunk/src/encodings.c 2011-03-31 23:27:58 UTC (rev 5666)
+++ trunk/src/encodings.c 2011-03-31 23:28:10 UTC (rev 5667)
@@ -562,42 +562,38 @@
}
-/**
- * Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
- * @a used_encoding.
- *
- * @param buffer the input string to convert.
- * @param size the length of the string, or -1 if the string is nul-terminated.
- * @param used_encoding return location of the detected encoding of the input string, or @c NULL.
- *
- * @return If the conversion was successful, a newly allocated nul-terminated string,
- * which must be freed with @c g_free(). Otherwise @c NULL.
- **/
-gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
+static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
{
- gchar *locale_charset = NULL;
- gchar *regex_charset = NULL;
+ guint i;
+
+ for (i = 0; i < G_N_ELEMENTS(pregs); i++)
+ {
+ gchar *charset;
+
+ if ((charset = regex_match(&pregs[i], buffer, size)) != NULL)
+ return charset;
+ }
+ return NULL;
+}
+
+
+static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gsize size,
+ const gchar *suggested_charset, gchar **used_encoding)
+{
+ const gchar *locale_charset = NULL;
const gchar *charset;
gchar *utf8_content;
- gboolean check_regex = FALSE;
+ gboolean check_suggestion = suggested_charset != NULL;
gboolean check_locale = FALSE;
- gint i, len, preferred_charset;
+ gint i, preferred_charset;
if ((gint)size == -1)
{
size = strlen(buffer);
}
- /* first try to read the encoding from the file content */
- len = (gint) G_N_ELEMENTS(pregs);
- for (i = 0; i < len && ! check_regex; i++)
- {
- if ((regex_charset = regex_match(&pregs[i], buffer, size)) != NULL)
- check_regex = TRUE;
- }
-
/* current locale is not UTF-8, we have to check this charset */
- check_locale = ! g_get_charset((const gchar**) &charset);
+ check_locale = ! g_get_charset(&locale_charset);
/* First check for preferred charset, if specified */
preferred_charset = file_prefs.default_open_encoding;
@@ -615,12 +611,12 @@
if (G_UNLIKELY(i == encodings[GEANY_ENCODING_NONE].idx))
continue;
- if (check_regex)
+ if (check_suggestion)
{
- check_regex = FALSE;
- charset = encodings_normalize_charset(regex_charset);
- if (! charset) /* we found a regex encoding that we can't normalize, try it as is */
- charset = regex_charset;
+ check_suggestion = FALSE;
+ charset = encodings_normalize_charset(suggested_charset);
+ if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
+ charset = suggested_charset;
i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
}
else if (check_locale)
@@ -662,16 +658,39 @@
}
*used_encoding = g_strdup(charset);
}
- g_free(regex_charset);
return utf8_content;
}
}
- g_free(regex_charset);
return NULL;
}
+/**
+ * Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
+ * @a used_encoding.
+ *
+ * @param buffer the input string to convert.
+ * @param size the length of the string, or -1 if the string is nul-terminated.
+ * @param used_encoding return location of the detected encoding of the input string, or @c NULL.
+ *
+ * @return If the conversion was successful, a newly allocated nul-terminated string,
+ * which must be freed with @c g_free(). Otherwise @c NULL.
+ **/
+gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_encoding)
+{
+ gchar *regex_charset;
+ gchar *utf8;
+
+ /* first try to read the encoding from the file content */
+ regex_charset = encodings_check_regexes(buffer, size);
+ utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding);
+ g_free(regex_charset);
+
+ return utf8;
+}
+
+
/* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
* otherwise GEANY_ENCODING_NONE.
* */
@@ -826,25 +845,30 @@
if (buffer->enc == NULL) /* either there was no BOM or the BOM encoding failed */
{
+ /* first try to read the encoding from the file content */
+ gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
+
/* try UTF-8 first */
- if ((buffer->size == buffer->len) &&
- g_utf8_validate(buffer->data, buffer->len, NULL))
+ if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
+ (buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
{
buffer->enc = g_strdup("UTF-8");
}
else
{
/* detect the encoding */
- gchar *converted_text = encodings_convert_to_utf8(buffer->data,
- buffer->size, &buffer->enc);
+ gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
+ buffer->size, regex_charset, &buffer->enc);
if (converted_text == NULL)
{
+ g_free(regex_charset);
return FALSE;
}
setptr(buffer->data, converted_text);
buffer->len = strlen(converted_text);
}
+ g_free(regex_charset);
}
}
return TRUE;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
Revision: 5666
http://geany.svn.sourceforge.net/geany/?rev=5666&view=rev
Author: colombanw
Date: 2011-03-31 23:27:58 +0000 (Thu, 31 Mar 2011)
Log Message:
-----------
Better handle badly-written encoding names
Make encoding name comparison more permissive, finding names that are
very likely to refer to the same encoding.
For example, "utf8" now matches "UTF-8", and "iso8859_1" matches
"ISO-8859-1".
This makes encodings_get_idx_from_charset() and
encodings_get_from_charset() more permissive, and allows normalizing
an encoding name.
It is used to better handle user-provided encodings (e.g. ones found by
a regex search) by normalizing them to the Geany names.
Modified Paths:
--------------
trunk/ChangeLog
trunk/src/encodings.c
Modified: trunk/ChangeLog
===================================================================
--- trunk/ChangeLog 2011-03-31 23:27:45 UTC (rev 5665)
+++ trunk/ChangeLog 2011-03-31 23:27:58 UTC (rev 5666)
@@ -3,6 +3,12 @@
* src/encodings.c:
Update regex used to find encodings for it to allow the encoding to
be quoted, adding support for XML (closes #3183506).
+ * src/encodings.c:
+ Implement charset name normalization in order to better deal with
+ badly-written encoding names (i.e. names found by regex search).
+ This also makes encodings_get_idx_from_charset() and
+ encodings_get_from_charset() more permissive regarding the passed-in
+ encoding name.
2011-03-31 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
Modified: trunk/src/encodings.c
===================================================================
--- trunk/src/encodings.c 2011-03-31 23:27:45 UTC (rev 5665)
+++ trunk/src/encodings.c 2011-03-31 23:27:58 UTC (rev 5666)
@@ -147,6 +147,56 @@
}
+/* compares two encoding names in a permissive fashion.
+ * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
+static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
+{
+ gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */
+ gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */
+
+ while (*a && *b)
+ {
+ gboolean is_alpha;
+
+ if (g_ascii_toupper(*a) == g_ascii_toupper(*b) &&
+ ((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a)))
+ {
+ /* either there was a real separator, or we need a implicit one (a chage from alpha to
+ * numeric or so) */
+ if (! need_sep || (was_alpha != is_alpha))
+ {
+ a++;
+ b++;
+ was_alpha = is_alpha;
+ need_sep = FALSE;
+ }
+ else
+ return FALSE;
+ }
+ else
+ {
+ guint n_sep = 0;
+
+ if (! g_ascii_isalnum(*a))
+ {
+ a++;
+ n_sep++;
+ }
+ if (! g_ascii_isalnum(*b))
+ {
+ b++;
+ n_sep++;
+ }
+ if (n_sep < 1)
+ return FALSE;
+ else if (n_sep < 2)
+ need_sep = TRUE;
+ }
+ }
+ return *a == *b;
+}
+
+
GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
{
gint i;
@@ -157,7 +207,7 @@
i = 0;
while (i < GEANY_ENCODINGS_MAX)
{
- if (strcmp(charset, encodings[i].charset) == 0)
+ if (encodings_charset_equals(charset, encodings[i].charset))
return i;
++i;
@@ -176,7 +226,7 @@
i = 0;
while (i < GEANY_ENCODINGS_MAX)
{
- if (strcmp(charset, encodings[i].charset) == 0)
+ if (encodings_charset_equals(charset, encodings[i].charset))
return &encodings[i];
++i;
@@ -186,6 +236,18 @@
}
+static const gchar *encodings_normalize_charset(const gchar *charset)
+{
+ const GeanyEncoding *encoding;
+
+ encoding = encodings_get_from_charset(charset);
+ if (encoding != NULL)
+ return encoding->charset;
+
+ return NULL;
+}
+
+
const GeanyEncoding *encodings_get_from_index(gint idx)
{
g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
@@ -556,7 +618,9 @@
if (check_regex)
{
check_regex = FALSE;
- charset = regex_charset;
+ charset = encodings_normalize_charset(regex_charset);
+ if (! charset) /* we found a regex encoding that we can't normalize, try it as is */
+ charset = regex_charset;
i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
}
else if (check_locale)
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
Revision: 5658
http://geany.svn.sourceforge.net/geany/?rev=5658&view=rev
Author: colombanw
Date: 2011-03-30 15:04:42 +0000 (Wed, 30 Mar 2011)
Log Message:
-----------
Reflow a few paragraphs in the ChangeLog so they do not go beyond the 72nd column
Modified Paths:
--------------
trunk/ChangeLog
Modified: trunk/ChangeLog
===================================================================
--- trunk/ChangeLog 2011-03-30 14:59:25 UTC (rev 5657)
+++ trunk/ChangeLog 2011-03-30 15:04:42 UTC (rev 5658)
@@ -62,11 +62,12 @@
2011-03-26 Colomban Wendling <colomban(at)geany(dot)org>
* src/sidebar.c:
- Fix the sidebar popup menu to properly use the currently selected item
- rather than the previous one.
- Also change the hack used for the selection to be updated in the input
- handlers to call the GtkTreeView's handler manually rather than doing
- the actual job in IDLE callbacks for the TreeView's handler to have run.
+ Fix the sidebar popup menu to properly use the currently selected
+ item rather than the previous one.
+ Also change the hack used for the selection to be updated in the
+ input handlers to call the GtkTreeView's handler manually rather than
+ doing the actual job in IDLE callbacks for the TreeView's handler to
+ have run.
* doc/geany.html, doc/geany.txt, geany.glade, src/document.c,
src/editor.h, src/interface.c, src/interface.h, src/keyfile.c,
src/project.c:
@@ -113,8 +114,8 @@
* plugins/filebrowser.c, plugins/saveactions.c, src/callbacks.c,
src/dialogs.c, src/document.c, src/document.h, src/editor.c,
src/encodings.c, src/filetypes.c, src/highlighting.c, src/log.c,
- src/main.c, src/plugins.c, src/printing.c, src/project.c, src/search.c,
- src/socket.c, src/toolbar.c, src/utils.c, src/utils.h:
+ src/main.c, src/plugins.c, src/printing.c, src/project.c,
+ src/search.c, src/socket.c, src/toolbar.c, src/utils.c, src/utils.h:
Improve usage of G_LIKELY() and G_UNLIKELY() macros.
@@ -147,15 +148,16 @@
2011-03-19 Colomban Wendling <colomban(at)geany(dot)org>
* src/callbacks.c:
- Create a new undo action when inserting templates, making sure the user
- can undo the template insertion without also undoing a previous action.
+ Create a new undo action when inserting templates, making sure the
+ user can undo the template insertion without also undoing a previous
+ action.
2011-03-18 Colomban Wendling <colomban(at)geany(dot)org>
* src/document.c, src/encodings.c, src/encodings.h:
- Move document encoding conversion with BOM support to encodings.[ch] as
- encodings_convert_to_utf8_auto().
+ Move document encoding conversion with BOM support to encodings.[ch]
+ as encodings_convert_to_utf8_auto().
* src/templates.c:
Properly convert template files to UTF-8 on loading, fixing encoding
issues if templates files are not encoded in UTF-8.
@@ -246,9 +248,10 @@
2011-03-06 Colomban Wendling <colomban(at)geany(dot)org>
* src/symbols.c:
- When sorting tags by line, also sort by scope if line is the same, avoiding
- wrong sorting if a parent tag is on the same line than its children, and one
- of it's children would be sorted before alphabetically (closes #3193982).
+ When sorting tags by line, also sort by scope if line is the same,
+ avoiding wrong sorting if a parent tag is on the same line than its
+ children, and one of it's children would be sorted before
+ alphabetically (closes #3193982).
2011-03-06 Enrico Tröger <enrico(dot)troeger(at)uvena(dot)de>
@@ -273,16 +276,17 @@
* configure.ac, wscript, makefile.win32, src/Makefile.am,
src/makefile.win32, tagmanager/Makefile.am, tagmanager/c.c,
tagmanager/entry.c, tagmanager/entry.h, tagmanager/fortran.c,
- tagmanager/get.c, tagmanager/get.h, tagmanager/js.c, tagmanager/lregex.c,
- tagmanager/makefile.win32, tagmanager/parse.c, tagmanager/read.c,
- tagmanager/read.h, tagmanager/sort.c, tagmanager/sql.c,
- tagmanager/strlist.c, tagmanager/mio/*:
+ tagmanager/get.c, tagmanager/get.h, tagmanager/js.c,
+ tagmanager/lregex.c, tagmanager/makefile.win32, tagmanager/parse.c,
+ tagmanager/read.c, tagmanager/read.h, tagmanager/sort.c,
+ tagmanager/sql.c, tagmanager/strlist.c, tagmanager/mio/*:
Move most of TagManager's I/O to MIO.
* src/document.c:
Enable in-memory tag parsing.
- * geany.glade, src/editor.c, src/editor.h, src/interface.c, src/keyfile.c:
- Add possibility to update symbol list in IDLE time, enabled by default
- using a minimal delay of 250ms.
+ * geany.glade, src/editor.c, src/editor.h, src/interface.c,
+ src/keyfile.c:
+ Add possibility to update symbol list in IDLE time, enabled by
+ default using a minimal delay of 250ms.
* src/symbols.c:
Improve implementation of hide_empty_rows() to do all in one shot.
* tagmanager/include/tm_tag.h, tagmanager/tm_project.c,
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.