SF.net SVN: geany:[5606] trunk

colombanw at users.sourceforge.net colombanw at xxxxx
Fri Mar 18 15:57:02 UTC 2011


Revision: 5606
          http://geany.svn.sourceforge.net/geany/?rev=5606&view=rev
Author:   colombanw
Date:     2011-03-18 15:57:02 +0000 (Fri, 18 Mar 2011)

Log Message:
-----------
Move document encoding conversion with BOM support to encodings.[ch]

Adds the new function encodings_convert_to_utf8_auto(). This makes it easy
to convert file data to UTF-8 in a high-level fashion, with BOM support
and everything.

Modified Paths:
--------------
    trunk/ChangeLog
    trunk/src/document.c
    trunk/src/encodings.c
    trunk/src/encodings.h

Modified: trunk/ChangeLog
===================================================================
--- trunk/ChangeLog	2011-03-18 09:35:31 UTC (rev 5605)
+++ trunk/ChangeLog	2011-03-18 15:57:02 UTC (rev 5606)
@@ -1,3 +1,10 @@
+2011-03-18  Colomban Wendling  <colomban(at)geany(dot)org>
+
+ * src/document.c, src/encodings.c, src/encodings.h:
+   Move document encoding conversion with BOM support to encodings.[ch] as
+   encodings_convert_to_utf8_auto().
+
+
 2011-03-18  Enrico Tröger  <enrico(dot)troeger(at)uvena(dot)de>
 
  * data/templates/files/main.vala:

Modified: trunk/src/document.c
===================================================================
--- trunk/src/document.c	2011-03-18 09:35:31 UTC (rev 5605)
+++ trunk/src/document.c	2011-03-18 15:57:02 UTC (rev 5606)
@@ -819,7 +819,6 @@
 typedef struct
 {
 	gchar		*data;	/* null-terminated file data */
-	gsize		 size;	/* actual file size on disk */
 	gsize		 len;	/* string length of data */
 	gchar		*enc;
 	gboolean	 bom;
@@ -828,133 +827,12 @@
 } FileData;
 
 
-/* reload file with specified encoding */
-static gboolean
-handle_forced_encoding(FileData *filedata, const gchar *forced_enc)
-{
-	GeanyEncodingIndex enc_idx;
-
-	if (utils_str_equal(forced_enc, "UTF-8"))
-	{
-		if (! g_utf8_validate(filedata->data, filedata->len, NULL))
-		{
-			return FALSE;
-		}
-	}
-	else
-	{
-		gchar *converted_text = encodings_convert_to_utf8_from_charset(
-										filedata->data, filedata->size, forced_enc, FALSE);
-		if (converted_text == NULL)
-		{
-			return FALSE;
-		}
-		else
-		{
-			g_free(filedata->data);
-			filedata->data = converted_text;
-			filedata->len = strlen(converted_text);
-		}
-	}
-	enc_idx = encodings_scan_unicode_bom(filedata->data, filedata->size, NULL);
-	filedata->bom = (enc_idx == GEANY_ENCODING_UTF_8);
-	filedata->enc = g_strdup(forced_enc);
-	return TRUE;
-}
-
-
-/* detect encoding and convert to UTF-8 if necessary */
-static gboolean
-handle_encoding(FileData *filedata, GeanyEncodingIndex enc_idx)
-{
-	g_return_val_if_fail(filedata->enc == NULL, FALSE);
-	g_return_val_if_fail(filedata->bom == FALSE, FALSE);
-
-	if (filedata->size == 0)
-	{
-		/* we have no data so assume UTF-8, filedata->len can be 0 even we have an empty
-		 * e.g. UTF32 file with a BOM(so size is 4, len is 0) */
-		filedata->enc = g_strdup("UTF-8");
-	}
-	else
-	{
-		/* first check for a BOM */
-		if (enc_idx != GEANY_ENCODING_NONE)
-		{
-			filedata->enc = g_strdup(encodings[enc_idx].charset);
-			filedata->bom = TRUE;
-
-			if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
-			{
-				gchar *converted_text = encodings_convert_to_utf8_from_charset(
-										filedata->data, filedata->size, filedata->enc, FALSE);
-				if (converted_text != NULL)
-				{
-					g_free(filedata->data);
-					filedata->data = converted_text;
-					filedata->len = strlen(converted_text);
-				}
-				else
-				{
-					/* there was a problem converting data from BOM encoding type */
-					g_free(filedata->enc);
-					filedata->enc = NULL;
-					filedata->bom = FALSE;
-				}
-			}
-		}
-
-		if (filedata->enc == NULL)	/* either there was no BOM or the BOM encoding failed */
-		{
-			/* try UTF-8 first */
-			if ((filedata->size == filedata->len) &&
-				g_utf8_validate(filedata->data, filedata->len, NULL))
-			{
-				filedata->enc = g_strdup("UTF-8");
-			}
-			else
-			{
-				/* detect the encoding */
-				gchar *converted_text = encodings_convert_to_utf8(filedata->data,
-					filedata->size, &filedata->enc);
-
-				if (converted_text == NULL)
-				{
-					return FALSE;
-				}
-				g_free(filedata->data);
-				filedata->data = converted_text;
-				filedata->len = strlen(converted_text);
-			}
-		}
-	}
-	return TRUE;
-}
-
-
-static void
-handle_bom(FileData *filedata)
-{
-	guint bom_len;
-
-	encodings_scan_unicode_bom(filedata->data, filedata->size, &bom_len);
-	g_return_if_fail(bom_len != 0);
-
-	/* use filedata->len here because the contents are already converted into UTF-8 */
-	filedata->len -= bom_len;
-	/* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
-	g_memmove(filedata->data, filedata->data + bom_len, filedata->len + 1);
-	filedata->data = g_realloc(filedata->data, filedata->len + 1);
-}
-
-
 /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
 static gboolean load_text_file(const gchar *locale_filename, const gchar *display_filename,
 	FileData *filedata, const gchar *forced_enc)
 {
 	GError *err = NULL;
 	struct stat st;
-	GeanyEncodingIndex tmp_enc_idx;
 
 	filedata->data = NULL;
 	filedata->len = 0;
@@ -978,20 +856,26 @@
 		return FALSE;
 	}
 
-	/* use strlen to check for null chars */
-	filedata->size = (gsize) st.st_size;
-	filedata->len = strlen(filedata->data);
+	filedata->len = (gsize) st.st_size;
+	if (! encodings_convert_to_utf8_auto(&filedata->data, &filedata->len, forced_enc,
+				&filedata->enc, &filedata->bom, &filedata->readonly))
+	{
+		if (forced_enc)
+		{
+			ui_set_statusbar(TRUE, _("The file \"%s\" is not valid %s."),
+				display_filename, forced_enc);
+		}
+		else
+		{
+			ui_set_statusbar(TRUE,
+	_("The file \"%s\" does not look like a text file or the file encoding is not supported."),
+			display_filename);
+		}
+		g_free(filedata->data);
+		return FALSE;
+	}
 
-	/* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
-	 * if we have a BOM */
-	tmp_enc_idx = encodings_scan_unicode_bom(filedata->data, filedata->size, NULL);
-
-	/* check whether the size of the loaded data is equal to the size of the file in the
-	 * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
-	 * file size of 0 bytes */
-	if (filedata->len != filedata->size && filedata->size != 0 && (
-		tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
-		tmp_enc_idx == GEANY_ENCODING_UTF_7))  /* filter UTF-7/8 where no NULL bytes are allowed */
+	if (filedata->readonly)
 	{
 		const gchar *warn_msg = _(
 			"The file \"%s\" could not be opened properly and has been truncated. " \
@@ -1002,43 +886,8 @@
 			dialogs_show_msgbox(GTK_MESSAGE_WARNING, warn_msg, display_filename);
 
 		ui_set_statusbar(TRUE, warn_msg, display_filename);
-
-		/* set the file to read-only mode because saving it is probably dangerous */
-		filedata->readonly = TRUE;
 	}
 
-	/* Determine character encoding and convert to UTF-8 */
-	if (forced_enc != NULL)
-	{
-		/* the encoding should be ignored(requested by user), so open the file "as it is" */
-		if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
-		{
-			filedata->bom = FALSE;
-			filedata->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
-		}
-		else if (! handle_forced_encoding(filedata, forced_enc))
-		{
-			/* For translators: the second wildcard is an encoding name, e.g.
-			 * The file \"test.txt\" is not valid UTF-8. */
-			ui_set_statusbar(TRUE, _("The file \"%s\" is not valid %s."),
-				display_filename, forced_enc);
-			utils_beep();
-			g_free(filedata->data);
-			return FALSE;
-		}
-	}
-	else if (! handle_encoding(filedata, tmp_enc_idx))
-	{
-		ui_set_statusbar(TRUE,
-	_("The file \"%s\" does not look like a text file or the file encoding is not supported."),
-			display_filename);
-		utils_beep();
-		g_free(filedata->data);
-		return FALSE;
-	}
-
-	if (filedata->bom)
-		handle_bom(filedata);
 	return TRUE;
 }
 

Modified: trunk/src/encodings.c
===================================================================
--- trunk/src/encodings.c	2011-03-18 09:35:31 UTC (rev 5605)
+++ trunk/src/encodings.c	2011-03-18 15:57:02 UTC (rev 5606)
@@ -676,3 +676,216 @@
 }
 
 
+typedef struct
+{
+	gchar		*data;	/* null-terminated data */
+	gsize		 size;	/* actual data size */
+	gsize		 len;	/* string length of data */
+	gchar		*enc;
+	gboolean	 bom;
+	gboolean	 partial;
+} BufferData;
+
+
+/* convert data with the specified encoding */
+static gboolean
+handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
+{
+	GeanyEncodingIndex enc_idx;
+
+	if (utils_str_equal(forced_enc, "UTF-8"))
+	{
+		if (! g_utf8_validate(buffer->data, buffer->len, NULL))
+		{
+			return FALSE;
+		}
+	}
+	else
+	{
+		gchar *converted_text = encodings_convert_to_utf8_from_charset(
+										buffer->data, buffer->size, forced_enc, FALSE);
+		if (converted_text == NULL)
+		{
+			return FALSE;
+		}
+		else
+		{
+			setptr(buffer->data, converted_text);
+			buffer->len = strlen(converted_text);
+		}
+	}
+	enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
+	buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);
+	buffer->enc = g_strdup(forced_enc);
+	return TRUE;
+}
+
+
+/* detect encoding and convert to UTF-8 if necessary */
+static gboolean
+handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
+{
+	g_return_val_if_fail(buffer->enc == NULL, FALSE);
+	g_return_val_if_fail(buffer->bom == FALSE, FALSE);
+
+	if (buffer->size == 0)
+	{
+		/* we have no data so assume UTF-8; buffer->len can be 0 even if we have an empty
+		 * e.g. UTF-32 file with a BOM (so size is 4, len is 0) */
+		buffer->enc = g_strdup("UTF-8");
+	}
+	else
+	{
+		/* first check for a BOM */
+		if (enc_idx != GEANY_ENCODING_NONE)
+		{
+			buffer->enc = g_strdup(encodings[enc_idx].charset);
+			buffer->bom = TRUE;
+
+			if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
+			{
+				gchar *converted_text = encodings_convert_to_utf8_from_charset(
+										buffer->data, buffer->size, buffer->enc, FALSE);
+				if (converted_text != NULL)
+				{
+					setptr(buffer->data, converted_text);
+					buffer->len = strlen(converted_text);
+				}
+				else
+				{
+					/* there was a problem converting data from BOM encoding type */
+					setptr(buffer->enc, NULL);
+					buffer->bom = FALSE;
+				}
+			}
+		}
+
+		if (buffer->enc == NULL)	/* either there was no BOM or the BOM encoding failed */
+		{
+			/* try UTF-8 first */
+			if ((buffer->size == buffer->len) &&
+				g_utf8_validate(buffer->data, buffer->len, NULL))
+			{
+				buffer->enc = g_strdup("UTF-8");
+			}
+			else
+			{
+				/* detect the encoding */
+				gchar *converted_text = encodings_convert_to_utf8(buffer->data,
+					buffer->size, &buffer->enc);
+
+				if (converted_text == NULL)
+				{
+					return FALSE;
+				}
+				setptr(buffer->data, converted_text);
+				buffer->len = strlen(converted_text);
+			}
+		}
+	}
+	return TRUE;
+}
+
+
+static void
+handle_bom(BufferData *buffer)
+{
+	guint bom_len;
+
+	encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
+	g_return_if_fail(bom_len != 0);
+
+	/* use buffer->len here because the contents are already converted into UTF-8 */
+	buffer->len -= bom_len;
+	/* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
+	g_memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
+	buffer->data = g_realloc(buffer->data, buffer->len + 1);
+}
+
+
+/* verifies buffer data and converts it from forced_enc or a detected encoding to UTF-8. Also handles BOM. */
+static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
+{
+	GeanyEncodingIndex tmp_enc_idx;
+
+	/* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
+	 * if we have a BOM */
+	tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
+
+	/* check whether the size of the loaded data is equal to the size of the file in the
+	 * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
+	 * file size of 0 bytes */
+	if (buffer->len != buffer->size && buffer->size != 0 && (
+		tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
+		tmp_enc_idx == GEANY_ENCODING_UTF_7))  /* filter UTF-7/8 where no NULL bytes are allowed */
+	{
+		buffer->partial = TRUE;
+	}
+
+	/* Determine character encoding and convert to UTF-8 */
+	if (forced_enc != NULL)
+	{
+		/* the encoding should be ignored(requested by user), so open the file "as it is" */
+		if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
+		{
+			buffer->bom = FALSE;
+			buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
+		}
+		else if (! handle_forced_encoding(buffer, forced_enc))
+		{
+			return FALSE;
+		}
+	}
+	else if (! handle_encoding(buffer, tmp_enc_idx))
+	{
+		return FALSE;
+	}
+
+	if (buffer->bom)
+		handle_bom(buffer);
+	return TRUE;
+}
+
+
+/*
+ * Tries to convert @a buf into UTF-8 encoding. Unlike encodings_convert_to_utf8()
+ * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
+ *
+ * @param buf a pointer to modifiable null-terminated buffer to convert.
+ *   It may or may not be modified, and should be freed whatever happens.
+ * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
+ *   file size). It will be updated to the new size.
+ * @param forced_enc forced encoding to use, or @c NULL
+ * @param used_encoding return location for the actually used encoding, or @c NULL
+ * @param has_bom return location to store whether the data had a BOM, or @c NULL
+ * @param partial return location to store whether the conversion may be partial, or @c NULL
+ *
+ * @return @c TRUE if the conversion succeeded, @c FALSE otherwise.
+ */
+gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
+		gchar **used_encoding, gboolean *has_bom, gboolean *partial)
+{
+	BufferData buffer;
+
+	buffer.data = *buf;
+	buffer.size = *size;
+	/* use strlen to check for null chars */
+	buffer.len = strlen(buffer.data);
+	buffer.enc = NULL;
+	buffer.bom = FALSE;
+	buffer.partial = FALSE;
+
+	if (! handle_buffer(&buffer, forced_enc))
+		return FALSE;
+
+	*size = buffer.len;
+	if (used_encoding)
+		*used_encoding = buffer.enc;
+	if (has_bom)
+		*has_bom = buffer.bom;
+	if (partial)
+		*partial = buffer.partial;
+
+	*buf = buffer.data;
+	return TRUE;
+}

Modified: trunk/src/encodings.h
===================================================================
--- trunk/src/encodings.h	2011-03-18 09:35:31 UTC (rev 5605)
+++ trunk/src/encodings.h	2011-03-18 15:57:02 UTC (rev 5606)
@@ -90,6 +90,8 @@
 
 gboolean encodings_is_unicode_charset(const gchar *string);
 
+gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
+		gchar **used_encoding, gboolean *has_bom, gboolean *partial);
 
 /*
  * The original versions of the following tables are taken from profterm


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.



More information about the Commits mailing list