[geany/geany] 9bf6ac: Use the upstream Markdown parser

Thu May 12 22:56:23 UTC 2022

Branch:      refs/heads/master
Author:      Jiří Techet <techet at gmail.com>
Committer:   Jiří Techet <techet at gmail.com>
Date:        Tue, 12 Apr 2022 18:16:06 UTC
Commit:      9bf6ac286dc6e99a0b75216f0c81c98283e0523d
             https://github.com/geany/geany/commit/9bf6ac286dc6e99a0b75216f0c81c98283e0523d

Log Message:
-----------
Use the upstream Markdown parser

This is a new Markdown parser supporting scope generation.


Modified Paths:
--------------
    ctags/Makefile.am
    ctags/parsers/geany_markdown.c
    ctags/parsers/markdown.c
    ctags/parsers/markdown.h
    meson.build
    src/tagmanager/tm_parser.c
    tests/ctags/simple.md.tags

Modified: ctags/Makefile.am
3 lines changed, 2 insertions(+), 1 deletions(-)
===================================================================
@@ -72,7 +72,8 @@ parsers = \
 	parsers/lua.c \
 	parsers/make.c \
 	parsers/make.h \
-	parsers/geany_markdown.c \
+	parsers/markdown.c \
+	parsers/markdown.h \
 	parsers/geany_matlab.c \
 	parsers/nsis.c \
 	parsers/objc.c \


Modified: ctags/parsers/geany_markdown.c
103 lines changed, 0 insertions(+), 103 deletions(-)
===================================================================
@@ -1,103 +0,0 @@
-/*
-*
-*   Copyright (c) 2009, Jon Strait
-*
-*   This source code is released for free distribution under the terms of the
-*   GNU General Public License.
-*
-*   This module contains functions for generating tags for Markdown files.
-*/
-
-/*
-*   INCLUDE FILES
-*/
-#include "general.h"	/* must always come first */
-
-#include <ctype.h>
-#include <string.h>
-
-#include "parse.h"
-#include "read.h"
-#include "vstring.h"
-#include "routines.h"
-#include "entry.h"
-
-/*
-*   DATA DEFINITIONS
-*/
-
-static kindDefinition MarkdownKinds[] = {
-	{ true, 'v', "variable", "sections" }
-};
-
-/*
-*   FUNCTION DEFINITIONS
-*/
-
-/* checks if str is all the same character */
-static bool issame(const char *str)
-{
-	char first = *str;
-
-	while (*(++str))
-	{
-		if (*str && *str != first)
-			return false;
-	}
-	return true;
-}
-
-static void makeMarkdownTag (const vString* const name, bool name_before)
-{
-	tagEntryInfo e;
-	initTagEntry (&e, vStringValue(name), 0);
-
-	if (name_before)
-		e.lineNumber--;	/* we want the line before the underline chars */
-
-	makeTagEntry(&e);
-}
-
-
-static void findMarkdownTags (void)
-{
-	vString *name = vStringNew();
-	const unsigned char *line;
-
-	while ((line = readLineFromInputFile()) != NULL)
-	{
-		int name_len = vStringLength(name);
-
-		/* underlines must be the same length or more */
-		if (name_len > 0 &&	(line[0] == '=' || line[0] == '-') && issame((const char*) line))
-		{
-			makeMarkdownTag(name, true);
-		}
-		else if (line[0] == '#') {
-			vStringClear(name);
-			vStringCatS(name, (const char *) line);
-			makeMarkdownTag(name, false);
-		}
-		else {
-			vStringClear (name);
-			if (! isspace(*line))
-				vStringCatS(name, (const char*) line);
-		}
-	}
-	vStringDelete (name);
-}
-
-extern parserDefinition* MarkdownParser (void)
-{
-	static const char *const patterns [] = { "*.md", NULL };
-	static const char *const extensions [] = { "md", NULL };
-	parserDefinition* const def = parserNew ("Markdown");
-
-	def->kindTable = MarkdownKinds;
-	def->kindCount = ARRAY_SIZE (MarkdownKinds);
-	def->patterns = patterns;
-	def->extensions = extensions;
-	def->parser = findMarkdownTags;
-	return def;
-}
-


Modified: ctags/parsers/markdown.c
420 lines changed, 420 insertions(+), 0 deletions(-)
===================================================================
@@ -0,0 +1,420 @@
+/*
+ *
+ *  Copyright (c) 2007-2011, Nick Treleaven
+ *  Copyright (c) 2012, Lex Trotman
+ *  Copyright (c) 2021, Jiri Techet
+ *
+ *   This source code is released for free distribution under the terms of the
+ *   GNU General Public License version 2 or (at your option) any later version.
+ *
+ * This module contains functions for generating tags for markdown files.
+ *
+ * This parser was based on the asciidoc parser.
+ *
+ * Extended syntax like footnotes is described in
+ * https://www.markdownguide.org/extended-syntax/
+ */
+
+/*
+ *   INCLUDE FILES
+ */
+#include "general.h"	/* must always come first */
+
+#include <ctype.h>
+#include <string.h>
+
+#include "debug.h"
+#include "entry.h"
+#include "parse.h"
+#include "read.h"
+#include "vstring.h"
+#include "nestlevel.h"
+#include "routines.h"
+#include "promise.h"
+#include "htable.h"
+
+#include "markdown.h"
+
+/*
+ *   DATA DEFINITIONS
+ */
+typedef enum {	/* kind indices, in the same order as the rows of MarkdownKinds[] */
+	K_CHAPTER = 0,
+	K_SECTION,
+	K_SUBSECTION,
+	K_SUBSUBSECTION,
+	K_LEVEL4SECTION,
+	K_LEVEL5SECTION,
+	K_SECTION_COUNT,	/* number of heading kinds listed above */
+	K_FOOTNOTE = K_SECTION_COUNT,	/* same value as K_SECTION_COUNT: the first index after the headings */
+} markdownKind;
+
+static kindDefinition MarkdownKinds[] = {	/* one row per markdownKind value; the letters are the keys used by map_MARKDOWN in tm_parser.c */
+	{ true, 'c', "chapter",       "chapters"},
+	{ true, 's', "section",       "sections" },
+	{ true, 'S', "subsection",    "level 2 sections" },
+	{ true, 't', "subsubsection", "level 3 sections" },
+	{ true, 'T', "l4subsection",  "level 4 sections" },
+	{ true, 'u', "l5subsection",  "level 5 sections" },
+	{ true, 'n', "footnote",      "footnotes" },
+};
+
+static fieldDefinition MarkdownFields [] = {	/* parser-specific fields, indexed by markdownField */
+	{
+	  .enabled     = false,
+	  .name        = "sectionMarker",
+	  .description = "character used for declaring section(#, ##, =, or -)",
+	},
+};
+
+typedef enum {	/* indices into MarkdownFields[] */
+	F_MARKER,	/* the "sectionMarker" field */
+} markdownField;
+
+static NestingLevels *nestingLevels = NULL;	/* stack of heading entries; created and freed inside findMarkdownTags() */
+
+/*
+*   FUNCTION DEFINITIONS
+*/
+/* Pops levels off the nesting stack while the current one has no entry or an entry whose kind index is >= kind; returns the level left on top. */
+static NestingLevel *getNestingLevel (const int kind, unsigned long adjustmentWhenPop)
+{
+	NestingLevel *nl;
+	tagEntryInfo *e;
+	unsigned long line = getInputLineNumber ();
+
+	line = (line > adjustmentWhenPop)? (line - adjustmentWhenPop): 0;	/* NOTE(review): the result is not read again in this function */
+
+	while (1)
+	{
+		nl = nestingLevelsGetCurrent (nestingLevels);
+		e = getEntryOfNestingLevel (nl);
+		if ((nl && (e == NULL)) || (e && (e->kindIndex >= kind)))
+			nestingLevelsPop (nestingLevels);
+		else
+			break;	/* no level left, or its entry has a kind index below kind */
+	}
+	return nl;
+}
+
+/* Makes a tag of the given kind for name, scoped under the enclosing nesting level; returns makeTagEntry()'s result, or CORK_NIL when name is empty. */
+static int makeMarkdownTag (const vString* const name, const int kind, const bool twoLine)
+{
+	int r = CORK_NIL;
+
+	if (vStringLength (name) > 0)
+	{
+		const NestingLevel *const nl = getNestingLevel (kind, twoLine? 2: 1);	/* also pops levels of the same or a deeper kind */
+		tagEntryInfo *parent = getEntryOfNestingLevel (nl);
+		tagEntryInfo e;
+
+		initTagEntry (&e, vStringValue (name), kind);
+
+		if (twoLine)
+		{
+			/* we want the line before the '---' underline chars */
+			const unsigned long line = getInputLineNumber ();
+			Assert (line > 0);
+			if (line > 0)
+			{
+				e.lineNumber--;
+				e.filePosition = getInputFilePositionForLine (line - 1);
+			}
+		}
+
+		if (parent && (parent->kindIndex < kind))	/* only a shallower heading becomes the scope */
+			e.extensionFields.scopeIndex = nl->corkIndex;
+
+		r = makeTagEntry (&e);
+	}
+	return r;
+}
+
+/* Makes a heading tag, attaches marker as its sectionMarker field and pushes the entry on the nesting stack; returns the tag's cork index. */
+static int makeSectionMarkdownTag (const vString* const name, const int kind, const char *marker)
+{
+	int r = makeMarkdownTag (name, kind, marker[0] != '#');	/* a marker other than '#' means a two-line (underlined) title */
+	attachParserFieldToCorkEntry (r, MarkdownFields [F_MARKER].ftype, marker);
+
+	nestingLevelsPush (nestingLevels, r);
+	return r;
+}
+
+/* Returns a new vString with the title text of a one-line heading; *delimited is set when a trailing run of marker characters was stripped. */
+static vString *getHeading (const int kind, const unsigned char *line,
+	const int lineLen, bool *delimited)
+{
+	int pos = 0;
+	int start = kind + 1;	/* index just past kind + 1 marker characters, counted from index 0 of line */
+	int end = lineLen - 1;
+	vString *name = vStringNew ();	/* the caller in this file releases it with vStringDelete() */
+
+	Assert (kind >= 0 && kind < K_SECTION_COUNT);
+	Assert (lineLen > start);
+
+	*delimited = false;
+	while (isspace (line[pos])) ++pos;	/* line[pos] is now the first non-blank character */
+	while (line[end] == line[pos] && end - 1 >= 0 && line[end - 1] != '\\')	/* drop trailing marker characters not preceded by a backslash */
+	{
+		--end;
+		*delimited = true;
+	}
+	while (isspace (line[start])) ++start;
+	while (isspace (line[end])) --end;
+
+	if (start <= end)
+		vStringNCatS (name, (const char*)(&(line[start])), end - start + 1);
+
+	return name;
+}
+
+/* Returns the index of the first non-whitespace character (lineLen if there is none); *indented is set when the leading whitespace is at least 4 columns wide. */
+static int getFirstCharPos (const unsigned char *line, int lineLen, bool *indented)
+{
+	int indent = 0;
+	int i;
+	for (i = 0; i < lineLen && isspace (line[i]); i++)
+		indent += line[i] == '\t' ? 4 : 1;	/* a tab counts as 4 columns, any other whitespace as 1 */
+	*indented = indent >= 4;
+	return i;
+}
+
+/* Emits a K_FOOTNOTE tag for the first "[^name]:" found on the line, scoped to the current nesting level if there is one; does nothing otherwise. */
+static void getFootnoteMaybe (const char *line)
+{
+	const char *start = strstr (line, "[^");
+	const char *end = start? strstr(start + 2, "]:"): NULL;
+
+	if (! (start && end))
+		return;
+	if (! (end > (start + 2)))	/* "[^]:" has an empty name: no tag */
+		return;
+
+	vString * footnote = vStringNewNInit (start + 2, end - (start + 2));	/* the text between "[^" and "]:" */
+	const NestingLevel *const nl = nestingLevelsGetCurrent (nestingLevels);
+	tagEntryInfo e;
+
+	initTagEntry (&e, vStringValue (footnote), K_FOOTNOTE);
+	if (nl)
+		e.extensionFields.scopeIndex = nl->corkIndex;
+	makeTagEntry (&e);
+
+	vStringDelete (footnote);
+}
+/* Offers langMarker to each subparser's extractLanguageForCodeBlock callback in turn; returns true as soon as one callback returns true. */
+static bool extractLanguageForCodeBlock (const char *langMarker,
+										 vString *codeLang)
+{
+	subparser *s;
+	bool r = false;
+
+	foreachSubparser (s, false)
+	{
+		markdownSubparser *m = (markdownSubparser *)s;
+		enterSubparser(s);
+		if (m->extractLanguageForCodeBlock)	/* the callback may be left unset */
+			r = m->extractLanguageForCodeBlock (m, langMarker, codeLang);
+		leaveSubparser();
+		if (r)
+			break;
+	}
+
+	return r;
+}
+/* Parser entry point: scans the input line by line, tagging headings and footnotes and scheduling guest parsers (makePromise) for front matter and fenced code blocks. */
+static void findMarkdownTags (void)
+{
+	vString *prevLine = vStringNew ();	/* last plain text line: the title candidate for a two-line heading */
+	vString *codeLang = vStringNew ();	/* language of the fenced code block being read */
+	const unsigned char *line;
+	char inCodeChar = 0;	/* fence character of the open code block, 0 when outside one */
+	long startSourceLineNumber = 0;
+	long startLineNumber = 0;
+	bool inPreambule = false;	/* true between the opening and the closing "---" of the front matter */
+	bool inComment = false;	/* true inside a <!-- ... --> comment spanning several lines */
+
+	subparser *sub = getSubparserRunningBaseparser();
+	if (sub)
+		chooseExclusiveSubparser (sub, NULL);
+
+	nestingLevels = nestingLevelsNew (0);
+
+	while ((line = readLineFromInputFile ()) != NULL)
+	{
+		int lineLen = strlen ((const char*) line);
+		bool lineProcessed = false;
+		bool indented;
+		int pos = getFirstCharPos (line, lineLen, &indented);
+		const int lineNum = getInputLineNumber ();
+
+		if (lineNum == 1 || inPreambule)
+		{
+			if (line[pos] == '-' && line[pos + 1] == '-' && line[pos + 2] == '-')
+			{
+				if (inPreambule)
+				{
+					long endLineNumber = lineNum;
+					if (startLineNumber < endLineNumber)
+						makePromise ("FrontMatter", startLineNumber, 0,
+									 endLineNumber, 0, startSourceLineNumber);
+				}
+				else
+					startSourceLineNumber = startLineNumber = lineNum;
+				inPreambule = !inPreambule;
+			}
+		}
+
+		if (inPreambule)
+			continue;
+
+		/* fenced code block */
+		if (line[pos] == '`' || line[pos] == '~')
+		{
+			char c = line[pos];
+			char otherC = c == '`' ? '~' : '`';
+			int nSame;
+			for (nSame = 1; line[pos + nSame] == line[pos]; ++nSame);	/* length of the fence run, counted from pos so indented fences work */
+
+			if (inCodeChar != otherC && nSame >= 3)
+			{
+				inCodeChar = inCodeChar ? 0 : c;
+				if (inCodeChar == c && strstr ((const char *)(line + pos + nSame), "```") != NULL)	/* NOTE(review): the literal is "```" even when c is '~' */
+					inCodeChar = 0;
+				else if (inCodeChar)
+				{
+					const char *langMarker = (const char *)(line + pos + nSame);	/* text following the opening fence */
+					startLineNumber = startSourceLineNumber = lineNum + 1;
+
+					vStringClear (codeLang);
+					if (! extractLanguageForCodeBlock (langMarker, codeLang))
+					{
+						vStringCopyS (codeLang, langMarker);
+						vStringStripLeading (codeLang);
+						vStringStripTrailing (codeLang);
+					}
+				}
+				else
+				{
+					long endLineNumber = lineNum;
+					if (vStringLength (codeLang) > 0
+						&& startLineNumber < endLineNumber)
+						makePromise (vStringValue (codeLang), startLineNumber, 0,
+							endLineNumber, 0, startSourceLineNumber);
+				}
+
+				lineProcessed = true;
+			}
+		}
+		/* XML comment start */
+		else if (lineLen >= pos + 4 && line[pos] == '<' && line[pos + 1] == '!' &&
+			line[pos + 2] == '-' && line[pos + 3] == '-')
+		{
+			if (strstr ((const char *)(line + pos + 4), "-->") == NULL)
+				inComment = true;
+			lineProcessed = true;
+		}
+		/* XML comment end */
+		else if (inComment && strstr ((const char *)(line + pos), "-->"))
+		{
+			inComment = false;
+			lineProcessed = true;
+		}
+
+		/* code block or comment */
+		if (inCodeChar || inComment)
+			lineProcessed = true;
+
+		/* code block using indent */
+		else if (indented)
+			lineProcessed = true;
+
+		/* if it's a title underline, or a delimited block marking character */
+		else if (line[pos] == '=' || line[pos] == '-' || line[pos] == '#' || line[pos] == '>')
+		{
+			int nSame;
+			for (nSame = 1; line[pos + nSame] == line[pos]; ++nSame);	/* run length counted from pos, matching the pos + nSame uses below */
+
+			/* quote */
+			if (line[pos] == '>')
+				;  /* just to make sure lineProcessed = true so it won't be in a heading */
+			/* is it a two line title */
+			else if (line[pos] == '=' || line[pos] == '-')
+			{
+				char marker[2] = { line[pos], '\0' };
+				int kind = line[pos] == '=' ? K_CHAPTER : K_SECTION;
+				bool whitespaceTerminated = true;
+
+				for (int i = pos + nSame; i < lineLen; i++)
+				{
+					if (!isspace (line[i]))
+					{
+						whitespaceTerminated = false;
+						break;
+					}
+				}
+
+				vStringStripLeading (prevLine);
+				vStringStripTrailing (prevLine);
+				if (whitespaceTerminated && vStringLength (prevLine) > 0)
+					makeSectionMarkdownTag (prevLine, kind, marker);
+			}
+			/* otherwise is it a one line title */
+			else if (line[pos] == '#' && nSame <= K_SECTION_COUNT && isspace (line[pos + nSame]))
+			{
+				int kind = nSame - 1;
+				bool delimited = false;
+				vString *name = getHeading (kind, line + pos, lineLen - pos, &delimited);	/* hand over the line from its first '#' */
+				if (vStringLength (name) > 0)
+					makeSectionMarkdownTag (name, kind, delimited ? "##" : "#");
+				vStringDelete (name);
+			}
+
+			lineProcessed = true;
+		}
+
+		vStringClear (prevLine);
+		if (!lineProcessed)
+		{
+			getFootnoteMaybe ((const char *)line);
+			vStringCatS (prevLine, (const char*) line);
+		}
+	}
+	vStringDelete (prevLine);
+	vStringDelete (codeLang);
+	{
+		unsigned int line = (unsigned int)getInputLineNumber ();	/* NOTE(review): not read in this version */
+		nestingLevelsFree (nestingLevels);
+	}
+}
+/* Builds and returns the parser definition for Markdown: extensions, kind and field tables, scope separator and the findMarkdownTags callback. */
+extern parserDefinition* MarkdownParser (void)
+{
+	parserDefinition* const def = parserNew ("Markdown");
+	static const char *const extensions [] = { "md", "markdown", NULL };
+
+	def->enabled  = true;
+	def->extensions = extensions;
+	def->useCork = CORK_QUEUE;
+	def->kindTable = MarkdownKinds;
+	def->kindCount = ARRAY_SIZE (MarkdownKinds);
+	def->fieldTable = MarkdownFields;
+	def->fieldCount = ARRAY_SIZE (MarkdownFields);
+	def->defaultScopeSeparator = "\"\"";
+	def->parser = findMarkdownTags;
+
+	/*
+	 * This setting (useMemoryStreamInput) is for running
+	 * Yaml parser from YamlFrontMatter as subparser.
+	 * YamlFrontMatter is run from FrontMatter as a guest parser.
+	 * FrontMatter is run from Markdown as a guest parser.
+	 * This stacked structure hits the limitation of the main
+	 * part: subparser's requirement for memory based input stream
+	 * is not propagated to the main part.
+	 *
+	 * TODO: instead of setting useMemoryStreamInput here, we
+	 * should remove the limitation.
+	 */
+	def->useMemoryStreamInput = true;
+
+	return def;
+}


Modified: ctags/parsers/markdown.h
29 lines changed, 29 insertions(+), 0 deletions(-)
===================================================================
@@ -0,0 +1,29 @@
+/*
+*   Copyright (c) 2022, Masatake YAMATO
+*
+*   This source code is released for free distribution under the terms of the
+*   GNU General Public License version 2 or (at your option) any later version.
+*
+*   The interface for subparsers of Markdown
+*/
+#ifndef CTAGS_PARSER_MARKDOWN_H
+#define CTAGS_PARSER_MARKDOWN_H
+
+/*
+*   INCLUDE FILES
+*/
+#include "general.h"  /* must always come first */
+
+#include "subparser.h"
+#include "vstring.h"
+
+typedef struct sMarkdownSubparser markdownSubparser;
+
+struct sMarkdownSubparser {	/* what a subparser of Markdown provides */
+	subparser subparser;	/* first member: markdown.c casts a subparser pointer to markdownSubparser */
+	bool (* extractLanguageForCodeBlock) (markdownSubparser *s,	/* may be NULL; markdown.c stops asking further subparsers once one returns true */
+										  const char *langMarker,
+										  vString *langName);
+};
+
+#endif


Modified: meson.build
3 lines changed, 2 insertions(+), 1 deletions(-)
===================================================================
@@ -636,7 +636,6 @@ ctags = static_library('ctags',
 	'ctags/parsers/geany_fortran.c',
 	'ctags/parsers/geany_lcpp.c',
 	'ctags/parsers/geany_lcpp.h',
-	'ctags/parsers/geany_markdown.c',
 	'ctags/parsers/geany_matlab.c',
 	'ctags/parsers/geany_tcl.c',
 	'ctags/parsers/geany_tex.c',
@@ -653,6 +652,8 @@ ctags = static_library('ctags',
 	'ctags/parsers/lua.c',
 	'ctags/parsers/make.c',
 	'ctags/parsers/make.h',
+	'ctags/parsers/markdown.c',
+	'ctags/parsers/markdown.h',
 	'ctags/parsers/nsis.c',
 	'ctags/parsers/objc.c',
 	'ctags/parsers/pascal.c',


Modified: src/tagmanager/tm_parser.c
20 lines changed, 17 insertions(+), 3 deletions(-)
===================================================================
@@ -694,10 +694,21 @@ static TMParserMapGroup group_NSIS[] = {
 };
 
 static TMParserMapEntry map_MARKDOWN[] = {
-	{'v', tm_tag_variable_t},
+	{'c', tm_tag_namespace_t},  //chapter
+	{'s', tm_tag_member_t},     //section
+	{'S', tm_tag_macro_t},      //subsection
+	{'t', tm_tag_variable_t},   //subsubsection
+	{'T', tm_tag_struct_t},     //l4subsection
+	{'u', tm_tag_union_t},      //l5subsection
+	{'n', tm_tag_undef_t},      //footnote
 };
 static TMParserMapGroup group_MARKDOWN[] = {
-	{_("Variables"), TM_ICON_VAR, tm_tag_variable_t},
+	{_("Chapters"), TM_ICON_NONE, tm_tag_namespace_t},
+	{_("Sections"), TM_ICON_NONE, tm_tag_member_t},
+	{_("Subsections"), TM_ICON_NONE, tm_tag_macro_t},
+	{_("Subsubsections"), TM_ICON_NONE, tm_tag_variable_t},
+	{_("Level 4 sections"), TM_ICON_NONE, tm_tag_struct_t},
+	{_("Level 5 sections"), TM_ICON_NONE, tm_tag_union_t},
 };
 
 static TMParserMapEntry map_TXT2TAGS[] = {
@@ -1435,6 +1446,7 @@ const gchar *tm_parser_scope_separator(TMParserType lang)
 		case TM_PARSER_ZEPHIR:
 			return "::";
 
+		case TM_PARSER_MARKDOWN:
 		case TM_PARSER_TXT2TAGS:
 			return "\"\"";
 
@@ -1455,10 +1467,11 @@ const gchar *tm_parser_scope_separator_printable(TMParserType lang)
 {
 	switch (lang)
 	{
-		case TM_PARSER_TXT2TAGS:
 		case TM_PARSER_ASCIIDOC:
 		case TM_PARSER_CONF:
+		case TM_PARSER_MARKDOWN:
 		case TM_PARSER_REST:
+		case TM_PARSER_TXT2TAGS:
 			return " > ";
 
 		default:
@@ -1485,6 +1498,7 @@ gboolean tm_parser_has_full_scope(TMParserType lang)
 		case TM_PARSER_JAVASCRIPT:
 		case TM_PARSER_JSON:
 		case TM_PARSER_LUA:
+		case TM_PARSER_MARKDOWN:
 		case TM_PARSER_PHP:
 		case TM_PARSER_POWERSHELL:
 		case TM_PARSER_PYTHON:


Modified: tests/ctags/simple.md.tags
54 lines changed, 27 insertions(+), 27 deletions(-)
===================================================================
@@ -1,28 +1,28 @@
 # format=tagmanager
-# a�16384�0
-# g #�16384�0
-# h ##�16384�0
-## b�16384�0
-## i #�16384�0
-## j ##�16384�0
-## k ###�16384�0
-### c�16384�0
-### l #�16384�0
-### m ##�16384�0
-### n ###�16384�0
-### o ###�16384�0
-#### d�16384�0
-#### p #�16384�0
-#### q #####�16384�0
-##### e�16384�0
-##### r #�16384�0
-##### s ######�16384�0
-###### f�16384�0
-###### t #�16384�0
-###### u #######�16384�0
-A�16384�0
-B�16384�0
-C�16384�0
-D�16384�0
-E�16384�0
-F�16384�0
+A�256�0
+B�256�0
+C�256�0
+D�64�C�0
+E�64�C�0
+F�64�C�0
+a�256�0
+b�64�a�0
+c�65536�a""b�0
+d�16384�a""b""c�0
+e�2048�a""b""c""d�0
+f�8192�a""b""c""d""e�0
+g�256�0
+h�256�0
+i�64�h�0
+j�64�h�0
+k�64�h�0
+l�65536�h""k�0
+m�65536�h""k�0
+n�65536�h""k�0
+o�65536�h""k�0
+p�16384�h""k""o�0
+q�16384�h""k""o�0
+r�2048�h""k""o""q�0
+s�2048�h""k""o""q�0
+t�8192�h""k""o""q""s�0
+u�8192�h""k""o""q""s�0



--------------
This E-Mail was brought to you by github_commit_mail.py (Source: https://github.com/geany/infrastructure).