[geany/geany] a14aa9: c++: Fix parsing of C++11 raw string literals

Colomban Wendling git-noreply at xxxxx
Thu Feb 11 14:36:04 UTC 2016


Branch:      refs/heads/master
Author:      Colomban Wendling <ban at herbesfolles.org>
Committer:   Colomban Wendling <ban at herbesfolles.org>
Date:        Sat, 23 Jan 2016 20:52:40 UTC
Commit:      a14aa908c5f0a945f6d817a1aa46173b492193e6
             https://github.com/geany/geany/commit/a14aa908c5f0a945f6d817a1aa46173b492193e6

Log Message:
-----------
c++: Fix parsing of C++11 raw string literals

See http://en.cppreference.com/w/cpp/language/string_literal

Closes #877.

---

This contains a pretty ugly hack to fetch the previous character, in
order not to get fooled by string concatenation hidden behind a macro,
like in `FOUR"five"`, which is not a raw string literal but simply the
identifier `FOUR` followed by the string `"five"`.

While this may sound uncommon, it is not, and it led to complaints [2][3]
when Scintilla [1] broke this while introducing C++11 raw string
literal support themselves.

The implementation here still contains a bug with line continuations: a
raw literal of the form:

```c
const char *str = R\
"xxx(...)xxx";
```

is not properly recognized as such, although it's perfectly valid (yet
probably very uncommon).  For the record, Scintilla also suffers
from this, but nobody has complained about it yet.

[1] http://scintilla.org/
[2] https://sourceforge.net/p/scintilla/bugs/1207/
[3] https://sourceforge.net/p/scintilla/bugs/1454/


Modified Paths:
--------------
    tagmanager/ctags/c.c
    tagmanager/ctags/get.c
    tagmanager/ctags/get.h
    tests/ctags/Makefile.am
    tests/ctags/cxx11-raw-strings.cpp
    tests/ctags/cxx11-raw-strings.cpp.tags

Modified: tagmanager/ctags/c.c
2 lines changed, 1 insertions(+), 1 deletions(-)
===================================================================
@@ -3120,7 +3120,7 @@ static boolean findCTags (const unsigned int passCount)
 	contextual_fake_count = 0;
 
 	Assert (passCount < 3);
-	cppInit ((boolean) (passCount > 1), isLanguage (Lang_csharp));
+	cppInit ((boolean) (passCount > 1), isLanguage (Lang_csharp), isLanguage (Lang_cpp));
 
 	exception = (exception_t) setjmp (Exception);
 	retry = FALSE;


Modified: tagmanager/ctags/get.c
92 lines changed, 91 insertions(+), 1 deletions(-)
===================================================================
@@ -62,6 +62,7 @@ typedef struct sCppState {
 	int		ungetch, ungetch2;   /* ungotten characters, if any */
 	boolean resolveRequired;     /* must resolve if/else/elif/endif branch */
 	boolean hasAtLiteralStrings; /* supports @"c:\" strings */
+	boolean hasCxxRawLiteralStrings; /* supports R"xxx(...)xxx" strings */
 	struct sDirective {
 		enum eState state;       /* current directive being processed */
 		boolean	accept;          /* is a directive syntactically permitted? */
@@ -83,6 +84,7 @@ static cppState Cpp = {
 	'\0', '\0',  /* ungetch characters */
 	FALSE,       /* resolveRequired */
 	FALSE,       /* hasAtLiteralStrings */
+	FALSE,       /* hasCxxRawLiteralStrings */
 	{
 		DRCTV_NONE,  /* state */
 		FALSE,       /* accept */
@@ -106,7 +108,8 @@ extern unsigned int getDirectiveNestLevel (void)
 	return Cpp.directive.nestLevel;
 }
 
-extern void cppInit (const boolean state, const boolean hasAtLiteralStrings)
+extern void cppInit (const boolean state, const boolean hasAtLiteralStrings,
+                     const boolean hasCxxRawLiteralStrings)
 {
 	BraceFormat = state;
 
@@ -114,6 +117,7 @@ extern void cppInit (const boolean state, const boolean hasAtLiteralStrings)
 	Cpp.ungetch2        = '\0';
 	Cpp.resolveRequired = FALSE;
 	Cpp.hasAtLiteralStrings = hasAtLiteralStrings;
+	Cpp.hasCxxRawLiteralStrings = hasCxxRawLiteralStrings;
 
 	Cpp.directive.state     = DRCTV_NONE;
 	Cpp.directive.accept    = TRUE;
@@ -533,6 +537,55 @@ static int skipToEndOfString (boolean ignoreBackslash)
 	return STRING_SYMBOL;  /* symbolic representation of string */
 }
 
+static int isCxxRawLiteralDelimiterChar (int c)
+{
+	return (c != ' ' && c != '\f' && c != '\n' && c != '\r' && c != '\t' && c != '\v' &&
+	        c != '(' && c != ')' && c != '\\');
+}
+
+static int skipToEndOfCxxRawLiteralString (void)
+{
+	int c = fileGetc ();
+
+	if (c != '(' && ! isCxxRawLiteralDelimiterChar (c))
+	{
+		fileUngetc (c);
+		c = skipToEndOfString (FALSE);
+	}
+	else
+	{
+		char delim[16];
+		unsigned int delimLen = 0;
+		boolean collectDelim = TRUE;
+
+		do
+		{
+			if (collectDelim)
+			{
+				if (isCxxRawLiteralDelimiterChar (c) &&
+				    delimLen < (sizeof delim / sizeof *delim))
+					delim[delimLen++] = c;
+				else
+					collectDelim = FALSE;
+			}
+			else if (c == ')')
+			{
+				unsigned int i = 0;
+
+				while ((c = fileGetc ()) != EOF && i < delimLen && delim[i] == c)
+					i++;
+				if (i == delimLen && c == DOUBLE_QUOTE)
+					break;
+				else
+					fileUngetc (c);
+			}
+		}
+		while ((c = fileGetc ()) != EOF);
+		c = STRING_SYMBOL;
+	}
+	return c;
+}
+
 /*  Skips to the end of the three (possibly four) 'c' sequence, returning a
  *  special character to symbolically represent a generic character.
  *  Also detects Vera numbers that include a base specifier (ie. 'b1010).
@@ -729,6 +782,43 @@ extern int cppGetc (void)
 					else
 						fileUngetc (next);
 				}
+				else if (c == 'R' && Cpp.hasCxxRawLiteralStrings)
+				{
+					/* OMG!11 HACK!!11  Get the previous character.
+					 *
+					 * We need to know whether the previous character was an identifier or not,
+					 * because "R" has to be on its own, not part of an identifier.  This allows
+					 * for constructs like:
+					 *
+					 * 	#define FOUR "4"
+					 * 	const char *p = FOUR"5";
+					 *
+					 * which is not a raw literal, but a preprocessor concatenation.
+					 *
+					 * FIXME: handle
+					 *
+					 * 	const char *p = R\
+					 * 	"xxx(raw)xxx";
+					 *
+					 * which is perfectly valid (yet probably very unlikely). */
+					const unsigned char *base = (unsigned char *) vStringValue (File.line);
+					int prev = '\n';
+					if (File.currentLine - File.ungetchIdx - 2 >= base)
+						prev = (int) *(File.currentLine - File.ungetchIdx - 2);
+
+					if (! isident (prev))
+					{
+						int next = fileGetc ();
+						if (next != DOUBLE_QUOTE)
+							fileUngetc (next);
+						else
+						{
+							Cpp.directive.accept = FALSE;
+							c = skipToEndOfCxxRawLiteralString ();
+							break;
+						}
+					}
+				}
 			enter:
 				Cpp.directive.accept = FALSE;
 				if (directive)


Modified: tagmanager/ctags/get.h
3 lines changed, 2 insertions(+), 1 deletions(-)
===================================================================
@@ -36,7 +36,8 @@
 */
 extern boolean isBraceFormat (void);
 extern unsigned int getDirectiveNestLevel (void);
-extern void cppInit (const boolean state, const boolean hasAtLiteralStrings);
+extern void cppInit (const boolean state, const boolean hasAtLiteralStrings,
+                     const boolean hasCxxRawLiteralStrings);
 extern void cppTerminate (void);
 extern void cppBeginStatement (void);
 extern void cppEndStatement (void);


Modified: tests/ctags/Makefile.am
1 lines changed, 1 insertions(+), 0 deletions(-)
===================================================================
@@ -135,6 +135,7 @@ test_sources = \
 	cxx11-final.cpp					\
 	cxx11-noexcept.cpp				\
 	cxx11-override.cpp				\
+	cxx11-raw-strings.cpp			\
 	cxx14-combined.cpp				\
 	db-trig.sql						\
 	debian_432872.f90				\


Modified: tests/ctags/cxx11-raw-strings.cpp
21 lines changed, 21 insertions(+), 0 deletions(-)
===================================================================
@@ -0,0 +1,21 @@
+
+static const char* str1 = R"blah(
+lots
+of text
+)blah";
+
+struct typ1 { int memb1; };
+
+static const char* str2 = R"blah(
+lots
+of text including a quote"
+)blah";
+
+struct typ2 { int memb2; };
+
+/* check we don't get confused by string concatenation */
+#define FOUR "four"
+
+static const char* str3 = FOUR"f(iv)e";
+
+struct typ3 { int memb3; };


Modified: tests/ctags/cxx11-raw-strings.cpp.tags
11 lines changed, 11 insertions(+), 0 deletions(-)
===================================================================
@@ -0,0 +1,11 @@
+# format=tagmanager
+FOUR�65536�0
+memb1�64�typ1�0�int
+memb2�64�typ2�0�int
+memb3�64�typ3�0�int
+str1�16384�0�char
+str2�16384�0�char
+str3�16384�0�char
+typ1�2048�0
+typ2�2048�0
+typ3�2048�0



--------------
This E-Mail was brought to you by github_commit_mail.py (Source: https://github.com/geany/infrastructure).


More information about the Commits mailing list