[geany/geany-plugins] 4a0ce3: Improve stripping of whitespace and punctuation characters (fixes #98) - Plugins-Commits

13 Oct 2013


      Branch:      refs/heads/master
Author:      Enrico Tröger enrico.troeger@uvena.de
Committer:   Enrico Tröger enrico.troeger@uvena.de
Date:        Sun, 13 Oct 2013 11:16:41 UTC
Commit:      4a0ce3edc3effcd8e050484c464201ac9d0027e8
             https://github.com/geany/geany-plugins/commit/4a0ce3edc3effcd8e050484c464201...
Log Message:
-----------
Improve stripping of whitespace and punctuation characters (fixes #98)
More expensive stripping of whitespace and punctuation characters, including
Unicode characters before checking a word. However, this is rather a workaround
than a solution. The proper way would be to not read words using Scintilla but
to split a line into words manually with correct handling of Unicode characters.
Modified Paths:
--------------
    spellcheck/src/speller.c
Modified: spellcheck/src/speller.c
101 lines changed, 90 insertions(+), 11 deletions(-)
===================================================================
@@ -52,10 +52,81 @@ static void dict_describe(const gchar* const lang, const gchar* const name,
 }
+/* Check whether c is a word separator: any Unicode white space or punctuation
+ * character. The apostrophe is deliberately excluded (presumably so that words
+ * such as "don't" are kept in one piece). */
+static gboolean is_word_sep(gunichar c)
+{
+	if (c == (gunichar) '\'')
+		return FALSE;
+
+	return g_unichar_isspace(c) || g_unichar_ispunct(c);
+}
+
+
+/* Strip leading and trailing white space and punctuation from word_to_check,
+ * more or less Unicode-safe (separators are decided by is_word_sep()).
+ *
+ * word_to_check: NUL-terminated string to strip; may be NULL.
+ * result_offset: if non-NULL, receives the byte offset of the start of the
+ *                stripped word within word_to_check (0 if NULL is returned).
+ *
+ * Returns: a newly allocated string which must be freed with g_free(), or NULL
+ * if the input is NULL, empty or consists only of separators. */
+static gchar *strip_word(const gchar *word_to_check, gint *result_offset)
+{
+	gunichar c;
+	gchar *word;
+	gchar *word_start;
+	gchar *word_end;
+	gint offset;
+	gint new_word_len;
+
+	/* give the caller a defined value on every early return */
+	if (result_offset != NULL)
+		*result_offset = 0;
+
+	if (word_to_check == NULL)
+		return NULL;
+
+	word = g_strdup(word_to_check);
+	word_start = word;
+
+	/* strip from the left */
+	while (*word != '\0')
+	{
+		c = g_utf8_get_char_validated(word, -1);
+		/* invalid or incomplete sequences are never separators, so they stop
+		 * the loop before g_utf8_next_char() could be applied to them */
+		if (! is_word_sep(c))
+			break;
+		word = g_utf8_next_char(word);
+	}
+	offset = (gint) (word - word_start);
+	new_word_len = (gint) strlen(word);
+
+	if (new_word_len <= 0)
+	{	/* empty or only punctuation in input string */
+		g_free(word_start);
+		return NULL;
+	}
+	/* move the string in-place to the start of the buffer, including the
+	 * terminating NUL (the regions may overlap, hence memmove) */
+	memmove(word_start, word, (size_t) new_word_len + 1);
+	word = word_start;
+
+	/* strip from the right; g_utf8_find_prev_char() never steps before the
+	 * start of the buffer, even if the string is not valid UTF-8 */
+	word_end = word + new_word_len;
+	while ((word_end = g_utf8_find_prev_char(word, word_end)) != NULL)
+	{
+		c = g_utf8_get_char_validated(word_end, -1);
+		if (! is_word_sep(c))
+			break;
+		/* cut off this character */
+		*word_end = '\0';
+	}
+
+	if (result_offset != NULL)
+		*result_offset = offset;
+
+	return word;
+}
+
+
 static gint sc_speller_check_word(GeanyDocument *doc, gint line_number, const gchar *word,
    					   gint start_pos, gint end_pos)
 {
    gsize n_suggs = 0;
+	gchar *word_to_check;
+	gint offset;
g_return_val_if_fail(sc_speller_dict != NULL, 0);
    g_return_val_if_fail(doc != NULL, 0);
@@ -73,9 +144,24 @@ static gint sc_speller_check_word(GeanyDocument *doc, gint line_number, const gc
    if (! sc_speller_is_text(doc, start_pos))
    	return 0;
+	/* strip punctuation and white space */
+	word_to_check = strip_word(word, &offset);
+	if (! NZV(word_to_check))
+	{
+		g_free(word_to_check);
+		return 0;
+	}
+
+	/* recalculate start_pos and end_pos */
+	start_pos += offset;
+	end_pos = start_pos + strlen(word_to_check);
+
    /* early out if the word is spelled correctly */
-	if (enchant_dict_check(sc_speller_dict, word, -1) == 0)
+	if (enchant_dict_check(sc_speller_dict, word_to_check, -1) == 0)
+	{
+		g_free(word_to_check);
    	return 0;
+	}
editor_indicator_set_on_range(doc->editor, GEANY_INDICATOR_ERROR, start_pos, end_pos);
@@ -86,10 +172,10 @@ static gint sc_speller_check_word(GeanyDocument *doc, gint line_number, const gc
    	GString *str;
str = g_string_sized_new(256);
-		suggs = enchant_dict_suggest(sc_speller_dict, word, -1, &n_suggs);
+		suggs = enchant_dict_suggest(sc_speller_dict, word_to_check, -1, &n_suggs);
    	if (suggs != NULL)
    	{
-			g_string_append_printf(str, "line %d: %s | ",  line_number + 1, word);
+			g_string_append_printf(str, "line %d: %s | ",  line_number + 1, word_to_check);
g_string_append(str, _("Try: "));
@@ -108,6 +194,7 @@ static gint sc_speller_check_word(GeanyDocument *doc, gint line_number, const gc
    	g_string_free(str, TRUE);
    }
+	g_free(word_to_check);
    return n_suggs;
 }
@@ -118,7 +205,6 @@ gint sc_speller_process_line(GeanyDocument *doc, gint line_number, const gchar *
    gint wstart, wend;
    GString *str;
    gint suggestions_found = 0;
-	gchar c;
g_return_val_if_fail(sc_speller_dict != NULL, 0);
    g_return_val_if_fail(doc != NULL, 0);
@@ -135,13 +221,6 @@ gint sc_speller_process_line(GeanyDocument *doc, gint line_number, const gchar *
    	wend = scintilla_send_message(doc->editor->sci, SCI_WORDENDPOSITION, wstart, FALSE);
    	if (wstart == wend)
    		break;
-		c = sci_get_char_at(doc->editor->sci, wstart);
-		/* hopefully it's enough to check for these both */
-		if (ispunct(c) || isspace(c))
-		{
-			pos_start++;
-			continue;
-		}
/* ensure the string has enough allocated memory */
    	if (str->len < (guint)(wend - wstart))
--------------
This E-Mail was brought to you by github_commit_mail.py (Source: https://github.com/geany/infrastructure).