SF.net SVN: geany:[5606] trunk

Fri Mar 18 15:57:02 UTC 2011

Revision: 5606
          http://geany.svn.sourceforge.net/geany/?rev=5606&view=rev
Author:   colombanw
Date:     2011-03-18 15:57:02 +0000 (Fri, 18 Mar 2011)

Log Message:
-----------
Move document encoding conversion with BOM support to encodings.[ch]

Adds the new function encodings_convert_to_utf8_auto(). This makes it easy
to convert file data to UTF-8 in a high-level fashion, with BOM support
and everything.

Modified Paths:
--------------
    trunk/ChangeLog
    trunk/src/document.c
    trunk/src/encodings.c
    trunk/src/encodings.h

Modified: trunk/ChangeLog
===================================================================

--- trunk/ChangeLog	2011-03-18 09:35:31 UTC (rev 5605)
+++ trunk/ChangeLog	2011-03-18 15:57:02 UTC (rev 5606)
@@ -1,3 +1,10 @@
+2011-03-18  Colomban Wendling  <colomban(at)geany(dot)org>
+
+ * src/document.c, src/encodings.c, src/encodings.h:
+   Move document encoding conversion with BOM support to encodings.[ch] as
+   encodings_convert_to_utf8_auto().
+
+
 2011-03-18  Enrico Tröger  <enrico(dot)troeger(at)uvena(dot)de>
 
  * data/templates/files/main.vala:

Modified: trunk/src/document.c
===================================================================
--- trunk/src/document.c	2011-03-18 09:35:31 UTC (rev 5605)
+++ trunk/src/document.c	2011-03-18 15:57:02 UTC (rev 5606)
@@ -819,7 +819,6 @@
 typedef struct
 {
 	gchar		*data;	/* null-terminated file data */
-	gsize		 size;	/* actual file size on disk */
 	gsize		 len;	/* string length of data */
 	gchar		*enc;
 	gboolean	 bom;
@@ -828,133 +827,12 @@
 } FileData;
 
 
-/* reload file with specified encoding */
-static gboolean
-handle_forced_encoding(FileData *filedata, const gchar *forced_enc)
-{
-	GeanyEncodingIndex enc_idx;
-
-	if (utils_str_equal(forced_enc, "UTF-8"))
-	{
-		if (! g_utf8_validate(filedata->data, filedata->len, NULL))
-		{
-			return FALSE;
-		}
-	}
-	else
-	{
-		gchar *converted_text = encodings_convert_to_utf8_from_charset(
-										filedata->data, filedata->size, forced_enc, FALSE);
-		if (converted_text == NULL)
-		{
-			return FALSE;
-		}
-		else
-		{
-			g_free(filedata->data);
-			filedata->data = converted_text;
-			filedata->len = strlen(converted_text);
-		}
-	}
-	enc_idx = encodings_scan_unicode_bom(filedata->data, filedata->size, NULL);
-	filedata->bom = (enc_idx == GEANY_ENCODING_UTF_8);
-	filedata->enc = g_strdup(forced_enc);
-	return TRUE;
-}
-
-
-/* detect encoding and convert to UTF-8 if necessary */
-static gboolean
-handle_encoding(FileData *filedata, GeanyEncodingIndex enc_idx)
-{
-	g_return_val_if_fail(filedata->enc == NULL, FALSE);
-	g_return_val_if_fail(filedata->bom == FALSE, FALSE);
-
-	if (filedata->size == 0)
-	{
-		/* we have no data so assume UTF-8, filedata->len can be 0 even we have an empty
-		 * e.g. UTF32 file with a BOM(so size is 4, len is 0) */
-		filedata->enc = g_strdup("UTF-8");
-	}
-	else
-	{
-		/* first check for a BOM */
-		if (enc_idx != GEANY_ENCODING_NONE)
-		{
-			filedata->enc = g_strdup(encodings[enc_idx].charset);
-			filedata->bom = TRUE;
-
-			if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
-			{
-				gchar *converted_text = encodings_convert_to_utf8_from_charset(
-										filedata->data, filedata->size, filedata->enc, FALSE);
-				if (converted_text != NULL)
-				{
-					g_free(filedata->data);
-					filedata->data = converted_text;
-					filedata->len = strlen(converted_text);
-				}
-				else
-				{
-					/* there was a problem converting data from BOM encoding type */
-					g_free(filedata->enc);
-					filedata->enc = NULL;
-					filedata->bom = FALSE;
-				}
-			}
-		}
-
-		if (filedata->enc == NULL)	/* either there was no BOM or the BOM encoding failed */
-		{
-			/* try UTF-8 first */
-			if ((filedata->size == filedata->len) &&
-				g_utf8_validate(filedata->data, filedata->len, NULL))
-			{
-				filedata->enc = g_strdup("UTF-8");
-			}
-			else
-			{
-				/* detect the encoding */
-				gchar *converted_text = encodings_convert_to_utf8(filedata->data,
-					filedata->size, &filedata->enc);
-
-				if (converted_text == NULL)
-				{
-					return FALSE;
-				}
-				g_free(filedata->data);
-				filedata->data = converted_text;
-				filedata->len = strlen(converted_text);
-			}
-		}
-	}
-	return TRUE;
-}
-
-
-static void
-handle_bom(FileData *filedata)
-{
-	guint bom_len;
-
-	encodings_scan_unicode_bom(filedata->data, filedata->size, &bom_len);
-	g_return_if_fail(bom_len != 0);
-
-	/* use filedata->len here because the contents are already converted into UTF-8 */
-	filedata->len -= bom_len;
-	/* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
-	g_memmove(filedata->data, filedata->data + bom_len, filedata->len + 1);
-	filedata->data = g_realloc(filedata->data, filedata->len + 1);
-}
-
-
 /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
 static gboolean load_text_file(const gchar *locale_filename, const gchar *display_filename,
 	FileData *filedata, const gchar *forced_enc)
 {
 	GError *err = NULL;
 	struct stat st;
-	GeanyEncodingIndex tmp_enc_idx;
 
 	filedata->data = NULL;
 	filedata->len = 0;
@@ -978,20 +856,26 @@
 		return FALSE;
 	}
 
-	/* use strlen to check for null chars */
-	filedata->size = (gsize) st.st_size;
-	filedata->len = strlen(filedata->data);
+	filedata->len = (gsize) st.st_size;
+	if (! encodings_convert_to_utf8_auto(&filedata->data, &filedata->len, forced_enc,
+				&filedata->enc, &filedata->bom, &filedata->readonly))
+	{
+		if (forced_enc)
+		{
+			ui_set_statusbar(TRUE, _("The file \"%s\" is not valid %s."),
+				display_filename, forced_enc);
+		}
+		else
+		{
+			ui_set_statusbar(TRUE,
+	_("The file \"%s\" does not look like a text file or the file encoding is not supported."),
+			display_filename);
+		}
+		g_free(filedata->data);
+		return FALSE;
+	}
 
-	/* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
-	 * if we have a BOM */
-	tmp_enc_idx = encodings_scan_unicode_bom(filedata->data, filedata->size, NULL);
-
-	/* check whether the size of the loaded data is equal to the size of the file in the
-	 * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
-	 * file size of 0 bytes */
-	if (filedata->len != filedata->size && filedata->size != 0 && (
-		tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
-		tmp_enc_idx == GEANY_ENCODING_UTF_7))  /* filter UTF-7/8 where no NULL bytes are allowed */
+	if (filedata->readonly)
 	{
 		const gchar *warn_msg = _(
 			"The file \"%s\" could not be opened properly and has been truncated. " \
@@ -1002,43 +886,8 @@
 			dialogs_show_msgbox(GTK_MESSAGE_WARNING, warn_msg, display_filename);
 
 		ui_set_statusbar(TRUE, warn_msg, display_filename);
-
-		/* set the file to read-only mode because saving it is probably dangerous */
-		filedata->readonly = TRUE;
 	}
 
-	/* Determine character encoding and convert to UTF-8 */
-	if (forced_enc != NULL)
-	{
-		/* the encoding should be ignored(requested by user), so open the file "as it is" */
-		if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
-		{
-			filedata->bom = FALSE;
-			filedata->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
-		}
-		else if (! handle_forced_encoding(filedata, forced_enc))
-		{
-			/* For translators: the second wildcard is an encoding name, e.g.
-			 * The file \"test.txt\" is not valid UTF-8. */
-			ui_set_statusbar(TRUE, _("The file \"%s\" is not valid %s."),
-				display_filename, forced_enc);
-			utils_beep();
-			g_free(filedata->data);
-			return FALSE;
-		}
-	}
-	else if (! handle_encoding(filedata, tmp_enc_idx))
-	{
-		ui_set_statusbar(TRUE,
-	_("The file \"%s\" does not look like a text file or the file encoding is not supported."),
-			display_filename);
-		utils_beep();
-		g_free(filedata->data);
-		return FALSE;
-	}
-
-	if (filedata->bom)
-		handle_bom(filedata);
 	return TRUE;
 }
 

Modified: trunk/src/encodings.c
===================================================================
--- trunk/src/encodings.c	2011-03-18 09:35:31 UTC (rev 5605)
+++ trunk/src/encodings.c	2011-03-18 15:57:02 UTC (rev 5606)
@@ -676,3 +676,216 @@
 }
 
 
+typedef struct
+{
+	gchar		*data;	/* null-terminated data; replaced by the converted text when a conversion happens */
+	gsize		 size;	/* actual data size, as passed in by the caller */
+	gsize		 len;	/* string length of data (strlen(), so it stops at the first null byte) */
+	gchar		*enc;	/* charset name of the encoding used, NULL until it is determined */
+	gboolean	 bom;	/* whether a BOM was detected; if set, handle_buffer() strips it */
+	gboolean	 partial;	/* set when len != size for data starting with a UTF-7/UTF-8 BOM */
+} BufferData;
+
+
+/* convert buffer->data from forced_enc to UTF-8 (only validated if forced_enc is "UTF-8"); FALSE on failure */
+static gboolean
+handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
+{
+	GeanyEncodingIndex enc_idx;
+
+	if (utils_str_equal(forced_enc, "UTF-8"))
+	{
+		if (! g_utf8_validate(buffer->data, buffer->len, NULL))
+		{
+			return FALSE;
+		}
+	}
+	else
+	{
+		gchar *converted_text = encodings_convert_to_utf8_from_charset(
+										buffer->data, buffer->size, forced_enc, FALSE);
+		if (converted_text == NULL)
+		{
+			return FALSE;	/* buffer is left unchanged */
+		}
+		else
+		{
+			setptr(buffer->data, converted_text);	/* presumably frees the old data; setptr() is defined elsewhere */
+			buffer->len = strlen(converted_text);
+		}
+	}
+	enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);	/* NOTE(review): size is still the pre-conversion size although data may now be the converted text -- confirm this cannot over-read */
+	buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);	/* only a UTF-8 BOM is recorded here */
+	buffer->enc = g_strdup(forced_enc);
+	return TRUE;
+}
+
+
+/* detect the encoding (BOM given by enc_idx, then plain UTF-8, then auto-detection) and convert to UTF-8 if necessary */
+static gboolean
+handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
+{
+	g_return_val_if_fail(buffer->enc == NULL, FALSE);
+	g_return_val_if_fail(buffer->bom == FALSE, FALSE);
+
+	if (buffer->size == 0)
+	{
+		/* we have no data so assume UTF-8; size is checked rather than len because len can be
+		 * 0 even for non-empty data, e.g. an empty UTF-32 file with a BOM (size is 4, len is 0) */
+		buffer->enc = g_strdup("UTF-8");
+	}
+	else
+	{
+		/* first check for a BOM */
+		if (enc_idx != GEANY_ENCODING_NONE)
+		{
+			buffer->enc = g_strdup(encodings[enc_idx].charset);
+			buffer->bom = TRUE;
+
+			if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
+			{
+				gchar *converted_text = encodings_convert_to_utf8_from_charset(
+										buffer->data, buffer->size, buffer->enc, FALSE);
+				if (converted_text != NULL)
+				{
+					setptr(buffer->data, converted_text);
+					buffer->len = strlen(converted_text);
+				}
+				else
+				{
+					/* there was a problem converting data from the BOM encoding; reset and fall back to the detection below */
+					setptr(buffer->enc, NULL);
+					buffer->bom = FALSE;
+				}
+			}
+		}
+
+		if (buffer->enc == NULL)	/* either there was no BOM or the conversion from the BOM encoding failed */
+		{
+			/* try UTF-8 first (size == len means the data has no null bytes before its end) */
+			if ((buffer->size == buffer->len) &&
+				g_utf8_validate(buffer->data, buffer->len, NULL))
+			{
+				buffer->enc = g_strdup("UTF-8");
+			}
+			else
+			{
+				/* detect the encoding; &buffer->enc is passed to receive the charset used */
+				gchar *converted_text = encodings_convert_to_utf8(buffer->data,
+					buffer->size, &buffer->enc);
+
+				if (converted_text == NULL)
+				{
+					return FALSE;
+				}
+				setptr(buffer->data, converted_text);
+				buffer->len = strlen(converted_text);
+			}
+		}
+	}
+	return TRUE;
+}
+
+
+static void
+handle_bom(BufferData *buffer)	/* removes the leading BOM from buffer->data in place and adjusts buffer->len */
+{
+	guint bom_len;
+
+	encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);	/* only the BOM length is needed here */
+	g_return_if_fail(bom_len != 0);
+
+	/* use buffer->len here because the contents are already converted into UTF-8 */
+	buffer->len -= bom_len;
+	/* overwrite the BOM with the remainder of the buffer contents, plus the null terminator. */
+	g_memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
+	buffer->data = g_realloc(buffer->data, buffer->len + 1);
+}
+
+
+/* verifies buffer data and converts it from forced_enc, or from a detected encoding, to UTF-8. Also handles BOM. */
+static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
+{
+	GeanyEncodingIndex tmp_enc_idx;
+
+	/* retrieve the encoding idx based on the BOM; it is used both for the partial check below
+	 * and, when no encoding is forced, as the starting point of handle_encoding() */
+	tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
+
+	/* check whether the string length of the data is equal to its actual size, i.e. whether it
+	 * contains null bytes; size may be 0 to allow opening files in /proc/ which have typically
+	 * a file size of 0 bytes */
+	if (buffer->len != buffer->size && buffer->size != 0 && (
+		tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
+		tmp_enc_idx == GEANY_ENCODING_UTF_7))  /* filter UTF-7/8 where no NULL bytes are allowed */
+	{
+		buffer->partial = TRUE;
+	}
+
+	/* Determine character encoding and convert to UTF-8 */
+	if (forced_enc != NULL)
+	{
+		/* the encoding should be ignored (requested by user), so keep the data "as it is" */
+		if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
+		{
+			buffer->bom = FALSE;
+			buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
+		}
+		else if (! handle_forced_encoding(buffer, forced_enc))
+		{
+			return FALSE;
+		}
+	}
+	else if (! handle_encoding(buffer, tmp_enc_idx))
+	{
+		return FALSE;
+	}
+
+	if (buffer->bom)	/* strip the BOM from the converted data */
+		handle_bom(buffer);
+	return TRUE;
+}
+
+
+/*
+ * Tries to convert @a buf into UTF-8 encoding. Unlike encodings_convert_to_utf8()
+ * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
+ *
+ * @param buf a pointer to modifiable null-terminated buffer to convert.
+ *   It may or may not be modified, and should be freed whatever happens.
+ * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
+ *   file size). On success it is updated to the string length of the converted data.
+ * @param forced_enc forced encoding to use, or @c NULL to detect the encoding
+ * @param used_encoding return location for the actually used encoding, or @c NULL
+ * @param has_bom return location to store whether the data had a BOM, or @c NULL
+ * @param partial return location to store whether the conversion may be partial, or @c NULL
+ *
+ * @return @c TRUE if the conversion succeeded, @c FALSE otherwise.
+ */
+gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
+		gchar **used_encoding, gboolean *has_bom, gboolean *partial)
+{
+	BufferData buffer;
+
+	buffer.data = *buf;
+	buffer.size = *size;
+	/* use strlen so that len differs from size when the data contains null chars */
+	buffer.len = strlen(buffer.data);
+	buffer.enc = NULL;
+	buffer.bom = FALSE;
+	buffer.partial = FALSE;
+
+	if (! handle_buffer(&buffer, forced_enc))
+		return FALSE;	/* *buf, *size and the return locations are not written on failure */
+
+	*size = buffer.len;
+	if (used_encoding)	/* NOTE(review): when this is NULL, buffer.enc is not freed here -- looks like a leak, confirm */
+		*used_encoding = buffer.enc;
+	if (has_bom)
+		*has_bom = buffer.bom;
+	if (partial)
+		*partial = buffer.partial;
+
+	*buf = buffer.data;	/* may be a newly allocated buffer if a conversion took place */
+	return TRUE;
+}

Modified: trunk/src/encodings.h
===================================================================
--- trunk/src/encodings.h	2011-03-18 09:35:31 UTC (rev 5605)
+++ trunk/src/encodings.h	2011-03-18 15:57:02 UTC (rev 5606)
@@ -90,6 +90,8 @@
 
 gboolean encodings_is_unicode_charset(const gchar *string);
 
+gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
+		gchar **used_encoding, gboolean *has_bom, gboolean *partial);
 
 /*
  * The original versions of the following tables are taken from profterm


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.