SF.net SVN: geany:[5666] trunk

Thu Mar 31 23:27:58 UTC 2011

Revision: 5666
          http://geany.svn.sourceforge.net/geany/?rev=5666&view=rev
Author:   colombanw
Date:     2011-03-31 23:27:58 +0000 (Thu, 31 Mar 2011)

Log Message:
-----------
Better handle badly-written encoding names

Make encoding name comparison more permissive, finding names that are
very likely to refer to the same encoding.
For example, "utf8" now matches "UTF-8", and "iso8859_1" matches
"ISO-8859-1".

This makes encodings_get_idx_from_charset() and
encodings_get_from_charset() more permissive, and makes it possible to
normalize an encoding name.
This is used to better handle a user-provided encoding (e.g. one found
by a regex search) by normalizing it to the name Geany uses.

Modified Paths:
--------------
    trunk/ChangeLog
    trunk/src/encodings.c

Modified: trunk/ChangeLog
===================================================================

--- trunk/ChangeLog	2011-03-31 23:27:45 UTC (rev 5665)
+++ trunk/ChangeLog	2011-03-31 23:27:58 UTC (rev 5666)
@@ -3,6 +3,12 @@
  * src/encodings.c:
    Update regex used to find encodings for it to allow the encoding to
    be quoted, adding support for XML (closes #3183506).
+ * src/encodings.c:
+   Implement charset name normalization in order to better deal with
+   badly-written encoding names (e.g. names found by regex search).
+   This also makes encodings_get_idx_from_charset() and
+   encodings_get_from_charset() more permissive regarding the passed-in
+   encoding name.
 
 
 2011-03-31  Nick Treleaven  <nick(dot)treleaven(at)btinternet(dot)com>

Modified: trunk/src/encodings.c
===================================================================
--- trunk/src/encodings.c	2011-03-31 23:27:45 UTC (rev 5665)
+++ trunk/src/encodings.c	2011-03-31 23:27:58 UTC (rev 5666)
@@ -147,6 +147,56 @@
 }
 
 
+/* compares two encoding names in a permissive fashion.
+ * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
+static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
+{
+	gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */
+	gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */
+
+	while (*a && *b)
+	{
+		gboolean is_alpha;
+
+		if (g_ascii_toupper(*a) == g_ascii_toupper(*b) &&
+			((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a)))
+		{
+			/* either there was a real separator, or we need an implicit one (a change from alpha to
+			 * numeric or so) */
+			if (! need_sep || (was_alpha != is_alpha))
+			{
+				a++;
+				b++;
+				was_alpha = is_alpha;
+				need_sep = FALSE;
+			}
+			else
+				return FALSE;
+		}
+		else
+		{
+			guint n_sep = 0;
+
+			if (! g_ascii_isalnum(*a))
+			{
+				a++;
+				n_sep++;
+			}
+			if (! g_ascii_isalnum(*b))
+			{
+				b++;
+				n_sep++;
+			}
+			if (n_sep < 1)
+				return FALSE;
+			else if (n_sep < 2)
+				need_sep = TRUE;
+		}
+	}
+	return *a == *b;
+}
+
+
 GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
 {
 	gint i;
@@ -157,7 +207,7 @@
 	i = 0;
 	while (i < GEANY_ENCODINGS_MAX)
 	{
-		if (strcmp(charset, encodings[i].charset) == 0)
+		if (encodings_charset_equals(charset, encodings[i].charset))
 			return i;
 
 		++i;
@@ -176,7 +226,7 @@
 	i = 0;
 	while (i < GEANY_ENCODINGS_MAX)
 	{
-		if (strcmp(charset, encodings[i].charset) == 0)
+		if (encodings_charset_equals(charset, encodings[i].charset))
 			return &encodings[i];
 
 		++i;
@@ -186,6 +236,18 @@
 }
 
 
+static const gchar *encodings_normalize_charset(const gchar *charset)
+{
+	const GeanyEncoding *encoding;
+
+	encoding = encodings_get_from_charset(charset);
+	if (encoding != NULL)
+		return encoding->charset;
+
+	return NULL;
+}
+
+
 const GeanyEncoding *encodings_get_from_index(gint idx)
 {
 	g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
@@ -556,7 +618,9 @@
 		if (check_regex)
 		{
 			check_regex = FALSE;
-			charset = regex_charset;
+			charset = encodings_normalize_charset(regex_charset);
+			if (! charset) /* we found a regex encoding that we can't normalize, try it as is */
+				charset = regex_charset;
 			i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
 		}
 		else if (check_locale)


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.