Revision: 2053 http://geany.svn.sourceforge.net/geany/?rev=2053&view=rev Author: eht16 Date: 2007-11-17 11:27:50 -0800 (Sat, 17 Nov 2007)
Log Message: ----------- Fix two more compiler warnings. Use php.c and lregex.c from CTags SVN (closes #1795810). Add regex.c and regex.h (GNU regex) for regex support on Windows.
Modified Paths: -------------- trunk/ChangeLog trunk/tagmanager/Makefile.am trunk/tagmanager/include/Makefile.am trunk/tagmanager/makefile.win32 trunk/tagmanager/parse.c trunk/tagmanager/parse.h trunk/tagmanager/php.c trunk/tagmanager/regex.c trunk/tagmanager/tm_tag.c
Added Paths: ----------- trunk/tagmanager/include/regex.h trunk/tagmanager/lregex.c
Modified: trunk/ChangeLog =================================================================== --- trunk/ChangeLog 2007-11-17 17:03:25 UTC (rev 2052) +++ trunk/ChangeLog 2007-11-17 19:27:50 UTC (rev 2053) @@ -11,6 +11,13 @@ Add native GTK printing support. * src/printing.c: Set line width for page header, handle empty filename in page header correctly. + * tagmanager/Makefile.am, tagmanager/lregex.c, + tagmanager/makefile.win32, tagmanager/parse.c, tagmanager/parse.h, + tagmanager/php.c, tagmanager/regex.c, tagmanager/tm_tag.c, + tagmanager/include/Makefile.am, tagmanager/include/regex.h: + Fix two more compiler warnings. + Use php.c and lregex.c from CTags SVN (closes #1795810). + Add regex.c and regex.h (GNU regex) for regex support on Windows.
2007-11-14 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
Modified: trunk/tagmanager/Makefile.am =================================================================== --- trunk/tagmanager/Makefile.am 2007-11-17 17:03:25 UTC (rev 2052) +++ trunk/tagmanager/Makefile.am 2007-11-17 19:27:50 UTC (rev 2053) @@ -7,6 +7,8 @@ # -DGDK_PIXBUF_DEPRECATED \ # -DGTK_DISABLE_DEPRECATED -DGNOME_DISABLE_DEPRECATED
+# regex.c is the GNU regex implementation needed for Windows +EXTRA_DIST = regex.c
noinst_LIBRARIES = libtagmanager.a libtagmanager_a_SOURCES =\ @@ -39,6 +41,7 @@ make.c\ asm.c\ latex.c\ + lregex.c\ pascal.c\ perl.c\ rest.c\ @@ -46,7 +49,6 @@ sql.c\ php.c\ python.c\ - regex.c\ tcl.c\ sh.c\ vhdl.c\
Modified: trunk/tagmanager/include/Makefile.am =================================================================== --- trunk/tagmanager/include/Makefile.am 2007-11-17 17:03:25 UTC (rev 2052) +++ trunk/tagmanager/include/Makefile.am 2007-11-17 19:27:50 UTC (rev 2053) @@ -1,4 +1,5 @@ noinst_HEADERS = \ + regex.h \ tm_project.h\ tm_source_file.h\ tm_tag.h\
Added: trunk/tagmanager/include/regex.h =================================================================== --- trunk/tagmanager/include/regex.h (rev 0) +++ trunk/tagmanager/include/regex.h 2007-11-17 19:27:50 UTC (rev 2053) @@ -0,0 +1,490 @@ +/* Definitions for data structures and routines for the regular + expression library, version 0.12. + + Copyright (C) 1985, 1989, 1990, 1991, 1992, 1993 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#ifndef __REGEXP_LIBRARY_H__ +#define __REGEXP_LIBRARY_H__ + +/* POSIX says that <sys/types.h> must be included (by the caller) before + <regex.h>. */ + +#ifdef VMS +/* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it + should be there. */ +#include <stddef.h> +#endif + + +/* The following bits are used to determine the regexp syntax we + recognize. The set/not-set meanings are chosen so that Emacs syntax + remains the value 0. The bits are given in alphabetical order, and + the definitions shifted by one from the previous bit; thus, when we + add or remove a bit, only one other definition need change. */ +typedef unsigned reg_syntax_t; + +/* If this bit is not set, then \ inside a bracket expression is literal. + If set, then such a \ quotes the following character. */ +#define RE_BACKSLASH_ESCAPE_IN_LISTS (1) + +/* If this bit is not set, then + and ? 
are operators, and \+ and \? are + literals. + If set, then \+ and \? are operators and + and ? are literals. */ +#define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) + +/* If this bit is set, then character classes are supported. They are: + [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], + [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. + If not set, then character classes are not supported. */ +#define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) + +/* If this bit is set, then ^ and $ are always anchors (outside bracket + expressions, of course). + If this bit is not set, then it depends: + ^ is an anchor if it is at the beginning of a regular + expression or after an open-group or an alternation operator; + $ is an anchor if it is at the end of a regular expression, or + before a close-group or an alternation operator. + + This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because + POSIX draft 11.2 says that * etc. in leading positions is undefined. + We already implemented a previous draft which made those constructs + invalid, though, so we haven't changed the code back. */ +#define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) + +/* If this bit is set, then special characters are always special + regardless of where they are in the pattern. + If this bit is not set, then special characters are special only in + some contexts; otherwise they are ordinary. Specifically, + * + ? and intervals are only special when not after the beginning, + open-group, or alternation operator. */ +#define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) + +/* If this bit is set, then *, +, ?, and { cannot be first in an re or + immediately after an alternation or begin-group operator. */ +#define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) + +/* If this bit is set, then . matches newline. + If not set, then it doesn't. */ +#define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) + +/* If this bit is set, then . doesn't match NUL. 
+ If not set, then it does. */ +#define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) + +/* If this bit is set, nonmatching lists [^...] do not match newline. + If not set, they do. */ +#define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) + +/* If this bit is set, either \{...\} or {...} defines an + interval, depending on RE_NO_BK_BRACES. + If not set, \{, \}, {, and } are literals. */ +#define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) + +/* If this bit is set, +, ? and | aren't recognized as operators. + If not set, they are. */ +#define RE_LIMITED_OPS (RE_INTERVALS << 1) + +/* If this bit is set, newline is an alternation operator. + If not set, newline is literal. */ +#define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) + +/* If this bit is set, then `{...}' defines an interval, and \{ and \} + are literals. + If not set, then `\{...\}' defines an interval. */ +#define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) + +/* If this bit is set, (...) defines a group, and \( and \) are literals. + If not set, \(...\) defines a group, and ( and ) are literals. */ +#define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) + +/* If this bit is set, then \<digit> matches <digit>. + If not set, then \<digit> is a back-reference. */ +#define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) + +/* If this bit is set, then | is an alternation operator, and \| is literal. + If not set, then \| is an alternation operator, and | is literal. */ +#define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) + +/* If this bit is set, then an ending range point collating higher + than the starting range point, as in [z-a], is invalid. + If not set, then when ending range point collates higher than the + starting range point, the range is ignored. */ +#define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) + +/* If this bit is set, then an unmatched ) is ordinary. + If not set, then an unmatched ) is invalid. 
*/ +#define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) + +/* This global variable defines the particular regexp syntax to use (for + some interfaces). When a regexp is compiled, the syntax used is + stored in the pattern buffer, so changing this does not affect + already-compiled regexps. */ +extern reg_syntax_t re_syntax_options; + +/* Define combinations of the above bits for the standard possibilities. + (The [[[ comments delimit what gets put into the Texinfo file, so + don't delete them!) */ +/* [[[begin syntaxes]]] */ +#define RE_SYNTAX_EMACS 0 + +#define RE_SYNTAX_AWK \ + (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + +#define RE_SYNTAX_POSIX_AWK \ + (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) + +#define RE_SYNTAX_GREP \ + (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ + | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ + | RE_NEWLINE_ALT) + +#define RE_SYNTAX_EGREP \ + (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ + | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ + | RE_NO_BK_VBAR) + +#define RE_SYNTAX_POSIX_EGREP \ + (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) + +/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ +#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC + +#define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC + +/* Syntax bits common to both basic and extended POSIX regex syntax. */ +#define _RE_SYNTAX_POSIX_COMMON \ + (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ + | RE_INTERVALS | RE_NO_EMPTY_RANGES) + +#define RE_SYNTAX_POSIX_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) + +/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes + RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this + isn't minimal, since other operators, such as \`, aren't disabled. 
*/ +#define RE_SYNTAX_POSIX_MINIMAL_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) + +#define RE_SYNTAX_POSIX_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + +/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS + replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ +#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) +/* [[[end syntaxes]]] */ + +/* Maximum number of duplicates an interval can allow. Some systems + (erroneously) define this in other header files, but we want our + value, so remove any previous define. */ +#ifdef RE_DUP_MAX +#undef RE_DUP_MAX +#endif +#define RE_DUP_MAX ((1 << 15) - 1) + + +/* POSIX `cflags' bits (i.e., information for `regcomp'). */ + +/* If this bit is set, then use extended regular expression syntax. + If not set, then use basic regular expression syntax. */ +#define REG_EXTENDED 1 + +/* If this bit is set, then ignore case when matching. + If not set, then case is significant. */ +#define REG_ICASE (REG_EXTENDED << 1) + +/* If this bit is set, then anchors do not match at newline + characters in the string. + If not set, then anchors do match at newlines. */ +#define REG_NEWLINE (REG_ICASE << 1) + +/* If this bit is set, then report only success or fail in regexec. + If not set, then returns differ between not matching and errors. */ +#define REG_NOSUB (REG_NEWLINE << 1) + + +/* POSIX `eflags' bits (i.e., information for regexec). */ + +/* If this bit is set, then the beginning-of-line operator doesn't match + the beginning of the string (presumably because it's not the + beginning of a line). + If not set, then the beginning-of-line operator does match the + beginning of the string. 
*/ +#define REG_NOTBOL 1 + +/* Like REG_NOTBOL, except for the end-of-line. */ +#define REG_NOTEOL (1 << 1) + + +/* If any error codes are removed, changed, or added, update the + `re_error_msg' table in regex.c. */ +typedef enum +{ + REG_NOERROR = 0, /* Success. */ + REG_NOMATCH, /* Didn't find a match (for regexec). */ + + /* POSIX regcomp return error codes. (In the order listed in the + standard.) */ + REG_BADPAT, /* Invalid pattern. */ + REG_ECOLLATE, /* Not implemented. */ + REG_ECTYPE, /* Invalid character class name. */ + REG_EESCAPE, /* Trailing backslash. */ + REG_ESUBREG, /* Invalid back reference. */ + REG_EBRACK, /* Unmatched left bracket. */ + REG_EPAREN, /* Parenthesis imbalance. */ + REG_EBRACE, /* Unmatched {. */ + REG_BADBR, /* Invalid contents of {}. */ + REG_ERANGE, /* Invalid range end. */ + REG_ESPACE, /* Ran out of memory. */ + REG_BADRPT, /* No preceding re for repetition op. */ + + /* Error codes we've added. */ + REG_EEND, /* Premature end. */ + REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ + REG_ERPAREN /* Unmatched ) or ); not returned from regcomp. */ +} reg_errcode_t; + +/* This data structure represents a compiled pattern. Before calling + the pattern compiler, the fields `buffer', `allocated', `fastmap', + `translate', and `no_sub' can be set. After the pattern has been + compiled, the `re_nsub' field is available. All other fields are + private to the regex routines. */ + +struct re_pattern_buffer +{ +/* [[[begin pattern_buffer]]] */ + /* Space that holds the compiled pattern. It is declared as + `unsigned char *' because its elements are + sometimes used as array indexes. */ + unsigned char *buffer; + + /* Number of bytes to which `buffer' points. */ + unsigned long allocated; + + /* Number of bytes actually used in `buffer'. */ + unsigned long used; + + /* Syntax setting with which the pattern was compiled. */ + reg_syntax_t syntax; + + /* Pointer to a fastmap, if any, otherwise zero. 
re_search uses + the fastmap, if there is one, to skip over impossible + starting points for matches. */ + char *fastmap; + + /* Either a translate table to apply to all characters before + comparing them, or zero for no translation. The translation + is applied to a pattern when it is compiled and to a string + when it is matched. */ + char *translate; + + /* Number of subexpressions found by the compiler. */ + size_t re_nsub; + + /* Zero if this pattern cannot match the empty string, one else. + Well, in truth it's used only in `re_search_2', to see + whether or not we should use the fastmap, so we don't set + this absolutely perfectly; see `re_compile_fastmap' (the + `duplicate' case). */ + unsigned can_be_null : 1; + + /* If REGS_UNALLOCATED, allocate space in the `regs' structure + for `max (RE_NREGS, re_nsub + 1)' groups. + If REGS_REALLOCATE, reallocate space if necessary. + If REGS_FIXED, use what's there. */ +#define REGS_UNALLOCATED 0 +#define REGS_REALLOCATE 1 +#define REGS_FIXED 2 + unsigned regs_allocated : 2; + + /* Set to zero when `regex_compile' compiles a pattern; set to one + by `re_compile_fastmap' if it updates the fastmap. */ + unsigned fastmap_accurate : 1; + + /* If set, `re_match_2' does not return information about + subexpressions. */ + unsigned no_sub : 1; + + /* If set, a beginning-of-line anchor doesn't match at the + beginning of the string. */ + unsigned not_bol : 1; + + /* Similarly for an end-of-line anchor. */ + unsigned not_eol : 1; + + /* If true, an anchor at a newline matches. */ + unsigned newline_anchor : 1; + +/* [[[end pattern_buffer]]] */ +}; + +typedef struct re_pattern_buffer regex_t; + + +/* search.c (search_buffer) in Emacs needs this one opcode value. It is + defined both in `regex.c' and here. */ +#define RE_EXACTN_VALUE 1 + +/* Type for byte offsets within the string. POSIX mandates this. */ +typedef int regoff_t; + + +/* This is the structure we store register match data in. 
See + regex.texinfo for a full description of what registers match. */ +struct re_registers +{ + unsigned num_regs; + regoff_t *start; + regoff_t *end; +}; + + +/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, + `re_match_2' returns information about at least this many registers + the first time a `regs' structure is passed. */ +#ifndef RE_NREGS +#define RE_NREGS 30 +#endif + + +/* POSIX specification for registers. Aside from the different names than + `re_registers', POSIX uses an array of structures, instead of a + structure of arrays. */ +typedef struct +{ + regoff_t rm_so; /* Byte offset from string's start to substring's start. */ + regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ +} regmatch_t; + +/* Declarations for routines. */ + +/* To avoid duplicating every routine declaration -- once with a + prototype (if we are ANSI), and once without (if we aren't) -- we + use the following macro to declare argument types. This + unfortunately clutters up the declarations a bit, but I think it's + worth it. */ + +#if __STDC__ + +#define _RE_ARGS(args) args + +#else /* not __STDC__ */ + +#define _RE_ARGS(args) () + +#endif /* not __STDC__ */ + +/* Sets the current default syntax to SYNTAX, and return the old syntax. + You can also simply assign to the `re_syntax_options' variable. */ +extern reg_syntax_t re_set_syntax _RE_ARGS ((reg_syntax_t syntax)); + +/* Compile the regular expression PATTERN, with length LENGTH + and syntax given by the global `re_syntax_options', into the buffer + BUFFER. Return NULL if successful, and an error string if not. */ +extern const char *re_compile_pattern + _RE_ARGS ((const char *pattern, int length, + struct re_pattern_buffer *buffer)); + + +/* Compile a fastmap for the compiled pattern in BUFFER; used to + accelerate searches. Return 0 if successful and -2 if was an + internal error. 
*/ +extern int re_compile_fastmap _RE_ARGS ((struct re_pattern_buffer *buffer)); + + +/* Search in the string STRING (with length LENGTH) for the pattern + compiled into BUFFER. Start searching at position START, for RANGE + characters. Return the starting position of the match, -1 for no + match, or -2 for an internal error. Also return register + information in REGS (if REGS and BUFFER->no_sub are nonzero). */ +extern int re_search + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, + int length, int start, int range, struct re_registers *regs)); + + +/* Like `re_search', but search in the concatenation of STRING1 and + STRING2. Also, stop searching at index START + STOP. */ +extern int re_search_2 + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, int range, struct re_registers *regs, int stop)); + + +/* Like `re_search', but return how many characters in STRING the regexp + in BUFFER matched, starting at position START. */ +extern int re_match + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, + int length, int start, struct re_registers *regs)); + + +/* Relates to `re_match' as `re_search_2' relates to `re_search'. */ +extern int re_match_2 + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, struct re_registers *regs, int stop)); + + +/* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using BUFFER and REGS will use this memory + for recording register information. STARTS and ENDS must be + allocated with malloc, and must each be at least `NUM_REGS * sizeof + (regoff_t)' bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. + + Unless this function is called, the first search or match using + PATTERN_BUFFER will allocate its own register data, without + freeing the old data. 
*/ +extern void re_set_registers + _RE_ARGS ((struct re_pattern_buffer *buffer, struct re_registers *regs, + unsigned num_regs, regoff_t *starts, regoff_t *ends)); + +/* 4.2 bsd compatibility. */ +extern char *re_comp _RE_ARGS ((const char *)); +extern int re_exec _RE_ARGS ((const char *)); + +/* POSIX compatibility. */ +extern int regcomp _RE_ARGS ((regex_t *preg, const char *pattern, int cflags)); +extern int regexec + _RE_ARGS ((const regex_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int eflags)); +extern size_t regerror + _RE_ARGS ((int errcode, const regex_t *preg, char *errbuf, + size_t errbuf_size)); +extern void regfree _RE_ARGS ((regex_t *preg)); + +#endif /* not __REGEXP_LIBRARY_H__ */ + +/* +Local variables: +make-backup-files: t +version-control: t +trim-versions-without-asking: nil +End: +*/
Added: trunk/tagmanager/lregex.c =================================================================== --- trunk/tagmanager/lregex.c (rev 0) +++ trunk/tagmanager/lregex.c 2007-11-17 19:27:50 UTC (rev 2053) @@ -0,0 +1,704 @@ +/* +* $Id: lregex.c 576 2007-06-30 04:16:23Z elliotth $ +* +* Copyright (c) 2000-2003, Darren Hiebert +* +* This source code is released for free distribution under the terms of the +* GNU General Public License. +* +* This module contains functions for applying regular expression matching. +* +* The code for utlizing the Gnu regex package with regards to processing the +* regex option and checking for regex matches was adapted from routines in +* Gnu etags. +*/ + +/* +* INCLUDE FILES +*/ +#include "general.h" /* must always come first */ + +#include <string.h> +#include <glib.h> + +#ifdef HAVE_REGCOMP +# include <ctype.h> +# include <stddef.h> +# ifdef HAVE_SYS_TYPES_H +# include <sys/types.h> /* declare off_t (not known to regex.h on FreeBSD) */ +# endif +# include <regex.h> +#endif + +#include "main.h" +#include "entry.h" +#include "parse.h" +#include "read.h" + +#ifdef HAVE_REGEX + +/* +* MACROS +*/ + +/* Back-references \0 through \9 */ +#define BACK_REFERENCE_COUNT 10 + +#if defined (HAVE_REGCOMP) && !defined (REGCOMP_BROKEN) +# define POSIX_REGEX +#endif + +#define REGEX_NAME "Regex" + +/* +* DATA DECLARATIONS +*/ +#if defined (POSIX_REGEX) + +struct sKind { + boolean enabled; + char letter; + char* name; + char* description; +}; + +enum pType { PTRN_TAG, PTRN_CALLBACK }; + +typedef struct { + regex_t *pattern; + enum pType type; + union { + struct { + char *name_pattern; + struct sKind kind; + } tag; + struct { + regexCallback function; + } callback; + } u; +} regexPattern; + +#endif + +typedef struct { + regexPattern *patterns; + unsigned int count; +} patternSet; + +/* +* DATA DEFINITIONS +*/ + +static boolean regexBroken = FALSE; + +/* Array of pattern sets, indexed by language */ +static patternSet* Sets = NULL; +static int SetUpper = 
-1; /* upper language index in list */ + +/* +* FUNCTION DEFINITIONS +*/ + +static void clearPatternSet (const langType language) +{ + if (language <= SetUpper) + { + patternSet* const set = Sets + language; + unsigned int i; + for (i = 0 ; i < set->count ; ++i) + { + regexPattern *p = &set->patterns [i]; +#if defined (POSIX_REGEX) + regfree (p->pattern); +#endif + eFree (p->pattern); + p->pattern = NULL; + + if (p->type == PTRN_TAG) + { + eFree (p->u.tag.name_pattern); + p->u.tag.name_pattern = NULL; + eFree (p->u.tag.kind.name); + p->u.tag.kind.name = NULL; + if (p->u.tag.kind.description != NULL) + { + eFree (p->u.tag.kind.description); + p->u.tag.kind.description = NULL; + } + } + } + if (set->patterns != NULL) + eFree (set->patterns); + set->patterns = NULL; + set->count = 0; + } +} + +/* +* Regex psuedo-parser +*/ + +static void makeRegexTag ( + const vString* const name, const struct sKind* const kind) +{ + if (kind->enabled) + { + tagEntryInfo e; + Assert (name != NULL && vStringLength (name) > 0); + Assert (kind != NULL); + initTagEntry (&e, vStringValue (name)); + e.kind = kind->letter; + e.kindName = kind->name; + makeTagEntry (&e); + } +} + +/* +* Regex pattern definition +*/ + +/* Take a string like "/blah/" and turn it into "blah", making sure + * that the first and last characters are the same, and handling + * quoted separator characters. Actually, stops on the occurrence of + * an unquoted separator. Also turns "\t" into a Tab character. + * Returns pointer to terminating separator. Works in place. Null + * terminates name string. + */ +static char* scanSeparators (char* name) +{ + char sep = name [0]; + char *copyto = name; + boolean quoted = FALSE; + + for (++name ; *name != '\0' ; ++name) + { + if (quoted) + { + if (*name == sep) + *copyto++ = sep; + else if (*name == 't') + *copyto++ = '\t'; + else + { + /* Something else is quoted, so preserve the quote. 
*/ + *copyto++ = '\'; + *copyto++ = *name; + } + quoted = FALSE; + } + else if (*name == '\') + quoted = TRUE; + else if (*name == sep) + { + break; + } + else + *copyto++ = *name; + } + *copyto = '\0'; + return name; +} + +/* Parse `regexp', in form "/regex/name/[k,Kind/]flags" (where the separator + * character is whatever the first character of `regexp' is), by breaking it + * up into null terminated strings, removing the separators, and expanding + * '\t' into tabs. When complete, `regexp' points to the line matching + * pattern, a pointer to the name matching pattern is written to `name', a + * pointer to the kinds is written to `kinds' (possibly NULL), and a pointer + * to the trailing flags is written to `flags'. If the pattern is not in the + * correct format, a false value is returned. + */ +static boolean parseTagRegex ( + char* const regexp, char** const name, + char** const kinds, char** const flags) +{ + boolean result = FALSE; + const int separator = (unsigned char) regexp [0]; + + *name = scanSeparators (regexp); + if (*regexp == '\0') + printf ("regex: empty regexp"); + else if (**name != separator) + printf ("regex: %s: incomplete regexp", regexp); + else + { + char* const third = scanSeparators (*name); + if (**name == '\0') + printf ("regex: %s: regexp missing name pattern", regexp); + if ((*name) [strlen (*name) - 1] == '\') + printf ("regex: error in name pattern: "%s"", *name); + if (*third != separator) + printf ("regex: %s: regexp missing final separator", regexp); + else + { + char* const fourth = scanSeparators (third); + if (*fourth == separator) + { + *kinds = third; + scanSeparators (fourth); + *flags = fourth; + } + else + { + *flags = third; + *kinds = NULL; + } + result = TRUE; + } + } + return result; +} + +static void addCompiledTagPattern ( + const langType language, regex_t* const pattern, + char* const name, const char kind, char* const kindName, + char *const description) +{ + patternSet* set; + regexPattern *ptrn; + if 
(language > SetUpper) + { + int i; + Sets = xRealloc (Sets, (language + 1), patternSet); + for (i = SetUpper + 1 ; i <= language ; ++i) + { + Sets [i].patterns = NULL; + Sets [i].count = 0; + } + SetUpper = language; + } + set = Sets + language; + set->patterns = xRealloc (set->patterns, (set->count + 1), regexPattern); + ptrn = &set->patterns [set->count]; + set->count += 1; + + ptrn->pattern = pattern; + ptrn->type = PTRN_TAG; + ptrn->u.tag.name_pattern = name; + ptrn->u.tag.kind.enabled = TRUE; + ptrn->u.tag.kind.letter = kind; + ptrn->u.tag.kind.name = kindName; + ptrn->u.tag.kind.description = description; +} + +static void addCompiledCallbackPattern ( + const langType language, regex_t* const pattern, + const regexCallback callback) +{ + patternSet* set; + regexPattern *ptrn; + if (language > SetUpper) + { + int i; + Sets = xRealloc (Sets, (language + 1), patternSet); + for (i = SetUpper + 1 ; i <= language ; ++i) + { + Sets [i].patterns = NULL; + Sets [i].count = 0; + } + SetUpper = language; + } + set = Sets + language; + set->patterns = xRealloc (set->patterns, (set->count + 1), regexPattern); + ptrn = &set->patterns [set->count]; + set->count += 1; + + ptrn->pattern = pattern; + ptrn->type = PTRN_CALLBACK; + ptrn->u.callback.function = callback; +} + +#if defined (POSIX_REGEX) + +static regex_t* compileRegex (const char* const regexp, const char* const flags) +{ + int cflags = REG_EXTENDED | REG_NEWLINE; + regex_t *result = NULL; + int errcode; + int i; + for (i = 0 ; flags != NULL && flags [i] != '\0' ; ++i) + { + switch ((int) flags [i]) + { + case 'b': cflags &= ~REG_EXTENDED; break; + case 'e': cflags |= REG_EXTENDED; break; + case 'i': cflags |= REG_ICASE; break; + default: printf ("regex: unknown regex flag: '%c'", *flags); break; + } + } + result = xMalloc (1, regex_t); + errcode = regcomp (result, regexp, cflags); + if (errcode != 0) + { + char errmsg[256]; + regerror (errcode, result, errmsg, 256); + printf ("regex: regcomp %s: %s", regexp, 
errmsg); + regfree (result); + eFree (result); + result = NULL; + } + return result; +} + +#endif + +static void parseKinds ( + const char* const kinds, char* const kind, char** const kindName, + char **description) +{ + *kind = '\0'; + *kindName = NULL; + *description = NULL; + if (kinds == NULL || kinds [0] == '\0') + { + *kind = 'r'; + *kindName = eStrdup ("regex"); + } + else if (kinds [0] != '\0') + { + const char* k = kinds; + if (k [0] != ',' && (k [1] == ',' || k [1] == '\0')) + *kind = *k++; + else + *kind = 'r'; + if (*k == ',') + ++k; + if (k [0] == '\0') + *kindName = eStrdup ("regex"); + else + { + const char *const comma = strchr (k, ','); + if (comma == NULL) + *kindName = eStrdup (k); + else + { + *kindName = (char*) eMalloc (comma - k + 1); + strncpy (*kindName, k, comma - k); + (*kindName) [comma - k] = '\0'; + k = comma + 1; + if (k [0] != '\0') + *description = eStrdup (k); + } + } + } +} + +static void printRegexKind (const regexPattern *pat, unsigned int i, boolean indent) +{ + const struct sKind *const kind = &pat [i].u.tag.kind; + const char *const indentation = indent ? " " : ""; + Assert (pat [i].type == PTRN_TAG); + printf ("%s%c %s %s\n", indentation, + kind->letter != '\0' ? kind->letter : '?', + kind->description != NULL ? kind->description : kind->name, + kind->enabled ? "" : " [off]"); +} + +static void processLanguageRegex (const langType language, + const char* const parameter) +{ + if (parameter == NULL || parameter [0] == '\0') + clearPatternSet (language); + else if (parameter [0] != '@') + addLanguageRegex (language, parameter); + else if (! 
doesFileExist (parameter + 1)) + printf ("regex: cannot open regex file"); + else + { + const char* regexfile = parameter + 1; + FILE* const fp = fopen (regexfile, "r"); + if (fp == NULL) + printf ("regex: %s", regexfile); + else + { + vString* const regex = vStringNew (); + while (readLine (regex, fp)) + addLanguageRegex (language, vStringValue (regex)); + fclose (fp); + vStringDelete (regex); + } + } +} + +/* +* Regex pattern matching +*/ + +#if defined (POSIX_REGEX) + +static vString* substitute ( + const char* const in, const char* out, + const int nmatch, const regmatch_t* const pmatch) +{ + vString* result = vStringNew (); + const char* p; + for (p = out ; *p != '\0' ; p++) + { + if (*p == '\' && isdigit ((int) *++p)) + { + const int dig = *p - '0'; + if (0 < dig && dig < nmatch && pmatch [dig].rm_so != -1) + { + const int diglen = pmatch [dig].rm_eo - pmatch [dig].rm_so; + vStringNCatS (result, in + pmatch [dig].rm_so, diglen); + } + } + else if (*p != '\n' && *p != '\r') + vStringPut (result, *p); + } + vStringTerminate (result); + return result; +} + +static void matchTagPattern (const vString* const line, + const regexPattern* const patbuf, + const regmatch_t* const pmatch) +{ + vString *const name = substitute (vStringValue (line), + patbuf->u.tag.name_pattern, BACK_REFERENCE_COUNT, pmatch); + vStringStripLeading (name); + vStringStripTrailing (name); + if (vStringLength (name) > 0) + makeRegexTag (name, &patbuf->u.tag.kind); + else + error (WARNING, "%s:%ld: null expansion of name pattern "%s"", + getInputFileName (), getInputLineNumber (), + patbuf->u.tag.name_pattern); + vStringDelete (name); +} + +static void matchCallbackPattern ( + const vString* const line, const regexPattern* const patbuf, + const regmatch_t* const pmatch) +{ + regexMatch matches [BACK_REFERENCE_COUNT]; + unsigned int count = 0; + int i; + for (i = 0 ; i < BACK_REFERENCE_COUNT && pmatch [i].rm_so != -1 ; ++i) + { + matches [i].start = pmatch [i].rm_so; + matches [i].length = 
pmatch [i].rm_eo - pmatch [i].rm_so; + ++count; + } + patbuf->u.callback.function (vStringValue (line), matches, count); +} + +static boolean matchRegexPattern (const vString* const line, + const regexPattern* const patbuf) +{ + boolean result = FALSE; + regmatch_t pmatch [BACK_REFERENCE_COUNT]; + const int match = regexec (patbuf->pattern, vStringValue (line), + BACK_REFERENCE_COUNT, pmatch, 0); + if (match == 0) + { + result = TRUE; + if (patbuf->type == PTRN_TAG) + matchTagPattern (line, patbuf, pmatch); + else if (patbuf->type == PTRN_CALLBACK) + matchCallbackPattern (line, patbuf, pmatch); + else + { + Assert ("invalid pattern type" == NULL); + result = FALSE; + } + } + return result; +} + +#endif + +/* PUBLIC INTERFACE */ + +/* Match against all patterns for specified language. Returns true if at least + * on pattern matched. + */ +extern boolean matchRegex (const vString* const line, const langType language) +{ + boolean result = FALSE; + if (language != LANG_IGNORE && language <= SetUpper && + Sets [language].count > 0) + { + const patternSet* const set = Sets + language; + unsigned int i; + for (i = 0 ; i < set->count ; ++i) + if (matchRegexPattern (line, set->patterns + i)) + result = TRUE; + } + return result; +} + +extern void findRegexTags (void) +{ + /* merely read all lines of the file */ + while (fileReadLine () != NULL) + ; +} + +#endif /* HAVE_REGEX */ + +extern void addTagRegex ( + const langType language __unused__, + const char* const regex __unused__, + const char* const name __unused__, + const char* const kinds __unused__, + const char* const flags __unused__) +{ +#ifdef HAVE_REGEX + Assert (regex != NULL); + Assert (name != NULL); + if (! 
regexBroken) + { + regex_t* const cp = compileRegex (regex, flags); + if (cp != NULL) + { + char kind; + char* kindName; + char* description; + parseKinds (kinds, &kind, &kindName, &description); + addCompiledTagPattern (language, cp, eStrdup (name), + kind, kindName, description); + } + } +#endif +} + +extern void addCallbackRegex ( + const langType language __unused__, + const char* const regex __unused__, + const char* const flags __unused__, + const regexCallback callback __unused__) +{ +#ifdef HAVE_REGEX + Assert (regex != NULL); + if (! regexBroken) + { + regex_t* const cp = compileRegex (regex, flags); + if (cp != NULL) + addCompiledCallbackPattern (language, cp, callback); + } +#endif +} + +extern void addLanguageRegex ( + const langType language __unused__, const char* const regex __unused__) +{ +#ifdef HAVE_REGEX + if (! regexBroken) + { + char *const regex_pat = eStrdup (regex); + char *name, *kinds, *flags; + if (parseTagRegex (regex_pat, &name, &kinds, &flags)) + { + addTagRegex (language, regex_pat, name, kinds, flags); + eFree (regex_pat); + } + } +#endif +} + +/* +* Regex option parsing +*/ + +extern boolean processRegexOption (const char *const option, + const char *const parameter __unused__) +{ + boolean handled = FALSE; + const char* const dash = strchr (option, '-'); + if (dash != NULL && strncmp (option, "regex", dash - option) == 0) + { +#ifdef HAVE_REGEX + langType language; + language = getNamedLanguage (dash + 1); + if (language == LANG_IGNORE) + printf ("regex: unknown language "%s" in --%s option", (dash + 1), option); + else + processLanguageRegex (language, parameter); +#else + printf ("regex: regex support not available; required for --%s option", + option); +#endif + handled = TRUE; + } + return handled; +} + +extern void disableRegexKinds (const langType language __unused__) +{ +#ifdef HAVE_REGEX + if (language <= SetUpper && Sets [language].count > 0) + { + patternSet* const set = Sets + language; + unsigned int i; + for (i = 0 ; i 
< set->count ; ++i) + if (set->patterns [i].type == PTRN_TAG) + set->patterns [i].u.tag.kind.enabled = FALSE; + } +#endif +} + +extern boolean enableRegexKind ( + const langType language __unused__, + const int kind __unused__, const boolean mode __unused__) +{ + boolean result = FALSE; +#ifdef HAVE_REGEX + if (language <= SetUpper && Sets [language].count > 0) + { + patternSet* const set = Sets + language; + unsigned int i; + for (i = 0 ; i < set->count ; ++i) + if (set->patterns [i].type == PTRN_TAG && + set->patterns [i].u.tag.kind.letter == kind) + { + set->patterns [i].u.tag.kind.enabled = mode; + result = TRUE; + } + } +#endif + return result; +} + +extern void printRegexKinds (const langType language __unused__, boolean indent __unused__) +{ +#ifdef HAVE_REGEX + if (language <= SetUpper && Sets [language].count > 0) + { + patternSet* const set = Sets + language; + unsigned int i; + for (i = 0 ; i < set->count ; ++i) + if (set->patterns [i].type == PTRN_TAG) + printRegexKind (set->patterns, i, indent); + } +#endif +} + +extern void freeRegexResources (void) +{ +#ifdef HAVE_REGEX + int i; + for (i = 0 ; i <= SetUpper ; ++i) + clearPatternSet (i); + if (Sets != NULL) + eFree (Sets); + Sets = NULL; + SetUpper = -1; +#endif +} + +/* Check for broken regcomp() on Cygwin */ +extern void checkRegex (void) +{ +#if defined (HAVE_REGEX) && defined (CHECK_REGCOMP) + regex_t patbuf; + int errcode; + if (regcomp (&patbuf, "/hello/", 0) != 0) + { + error (WARNING, "Disabling broken regex"); + regexBroken = TRUE; + } +#endif +} + +/* vi:set tabstop=4 shiftwidth=4: */
Modified: trunk/tagmanager/makefile.win32 =================================================================== --- trunk/tagmanager/makefile.win32 2007-11-17 17:03:25 UTC (rev 2052) +++ trunk/tagmanager/makefile.win32 2007-11-17 19:27:50 UTC (rev 2053) @@ -9,6 +9,8 @@
COMPLIB=tagmanager.a
+REGEX_DEFINES = -DHAVE_REGCOMP -DREGEX_MALLOC -DSTDC_HEADERS=1 + GTK_INCLUDES= \ -I$(PREFIX)/include/gtk-2.0 \ -I$(PREFIX)/lib/gtk-2.0/include \ @@ -25,7 +27,7 @@ CCFLAGS=-Wall -O2 -g -mms-bitfields -DPACKAGE="geany" -DG_OS_WIN32 -Wno-missing-braces -Wno-char-subscripts $(INCLUDEDIRS)
.c.o: - $(CC) $(CCFLAGS) -w -c $< + $(CC) $(REGEX_DEFINES) $(CCFLAGS) -w -c $<
all: $(COMPLIB)
@@ -33,9 +35,9 @@ -$(RM) deps.mak *.o $(COMPLIB)
$(COMPLIB): args.o c.o fortran.o make.o conf.o pascal.o perl.o php.o diff.o vhdl.o lua.o js.o \ -haskell.o haxe.o python.o regex.o rest.o sh.o ctags.o entry.o get.o keyword.o options.o parse.o basic.o \ -read.o sort.o strlist.o latex.o docbook.o tcl.o ruby.o asm.o sql.o css.o vstring.o tm_workspace.o tm_work_object.o \ -tm_source_file.o tm_project.o tm_tag.o tm_symbol.o tm_file_entry.o \ +haskell.o haxe.o python.o lregex.o rest.o sh.o ctags.o entry.o get.o keyword.o options.o parse.o basic.o \ +read.o sort.o strlist.o latex.o docbook.o tcl.o ruby.o asm.o sql.o css.o vstring.o regex.o \ +tm_workspace.o tm_work_object.o tm_source_file.o tm_project.o tm_tag.o tm_symbol.o tm_file_entry.o \ tm_tagmanager.o $(AR) rc $@ $^ $(RANLIB) $@
Modified: trunk/tagmanager/parse.c =================================================================== --- trunk/tagmanager/parse.c 2007-11-17 17:03:25 UTC (rev 2052) +++ trunk/tagmanager/parse.c 2007-11-17 19:27:50 UTC (rev 2053) @@ -56,7 +56,7 @@ extern void makeSimpleScopedTag (const vString* const name, kindOption* const kinds, const int kind, const char* scope, const char *scope2, - const char *access) + const char *laccess) { if (name != NULL && vStringLength (name) > 0) { @@ -67,7 +67,7 @@ e.kind = kinds [kind].letter; e.extensionFields.scope[0] = scope; e.extensionFields.scope[1] = scope2; - e.extensionFields.access = access; + e.extensionFields.access = laccess;
makeTagEntry (&e); } @@ -541,7 +541,7 @@ for (i = 0 ; i < lang->kindCount ; ++i) printLangugageKindOption (lang->kinds + i); #ifdef HAVE_REGEX - printRegexKindOptions (language); + // printRegexKindOptions (language); // unused #endif } }
Modified: trunk/tagmanager/parse.h =================================================================== --- trunk/tagmanager/parse.h 2007-11-17 17:03:25 UTC (rev 2052) +++ trunk/tagmanager/parse.h 2007-11-17 19:27:50 UTC (rev 2053) @@ -113,7 +113,7 @@ /* Regex interface */ #ifdef HAVE_REGEX extern void findRegexTags (void); -extern void matchRegex (const vString* const line, const langType language); +extern boolean matchRegex (const vString* const line, const langType language); #endif extern boolean processRegexOption (const char *const option, const char *const parameter); extern void addLanguageRegex (const langType language, const char* const regex);
Modified: trunk/tagmanager/php.c =================================================================== --- trunk/tagmanager/php.c 2007-11-17 17:03:25 UTC (rev 2052) +++ trunk/tagmanager/php.c 2007-11-17 19:27:50 UTC (rev 2053) @@ -31,16 +31,74 @@ K_CLASS, K_DEFINE, K_FUNCTION, K_VARIABLE } phpKind;
+#if 0 static kindOption PhpKinds [] = { { TRUE, 'c', "class", "classes" }, - { TRUE, 'd', "macro", "constant definitions" }, + { TRUE, 'd', "define", "constant definitions" }, { TRUE, 'f', "function", "functions" }, { TRUE, 'v', "variable", "variables" } }; +#endif
/* * FUNCTION DEFINITIONS */ + +/* JavaScript patterns are duplicated in jscript.c */ + +/* + * Cygwin doesn't support non-ASCII characters in character classes. + * This isn't a good solution to the underlying problem, because we're still + * making assumptions about the character encoding. + * Really, these regular expressions need to concentrate on what marks the + * end of an identifier, and we need something like iconv to take into + * account the user's locale (or an override on the command-line.) + */ +#ifdef __CYGWIN__ +#define ALPHA "[:alpha:]" +#define ALNUM "[:alnum:]" +#else +#define ALPHA "A-Za-z\x7f-\xff" +#define ALNUM "0-9A-Za-z\x7f-\xff" +#endif + +static void installPHPRegex (const langType language) +{ + addTagRegex(language, "(^|[ \t])class[ \t]+([" ALPHA "_][" ALNUM "_]*)", + "\2", "c,class,classes", NULL); + addTagRegex(language, "(^|[ \t])interface[ \t]+([" ALPHA "_][" ALNUM "_]*)", + "\2", "i,interface,interfaces", NULL); + addTagRegex(language, "(^|[ \t])define[ \t]*\([ \t]*['"]?([" ALPHA "_][" ALNUM "_]*)", + "\2", "d,define,constant definitions", NULL); + addTagRegex(language, "(^|[ \t])function[ \t]+&?[ \t]*([" ALPHA "_][" ALNUM "_]*)", + "\2", "f,function,functions", NULL); + addTagRegex(language, "(^|[ \t])(\$|::\$|\$this->)([" ALPHA "_][" ALNUM "_]*)[ \t]*=", + "\3", "v,variable,variables", NULL); + addTagRegex(language, "(^|[ \t])(var|public|protected|private|static)[ \t]+\$([" ALPHA "_][" ALNUM "_]*)[ \t]*[=;]", + "\3", "v,variable,variables", NULL); + + /* function regex is covered by PHP regex */ + addTagRegex (language, "(^|[ \t])([A-Za-z0-9_]+)[ \t]*[=:][ \t]*function[ \t]*\(", + "\2", "j,jsfunction,javascript functions", NULL); + addTagRegex (language, "(^|[ \t])([A-Za-z0-9_.]+)\.([A-Za-z0-9_]+)[ \t]*=[ \t]*function[ \t]*\(", + "\2.\3", "j,jsfunction,javascript functions", NULL); + addTagRegex (language, "(^|[ \t])([A-Za-z0-9_.]+)\.([A-Za-z0-9_]+)[ \t]*=[ \t]*function[ \t]*\(", + "\3", "j,jsfunction,javascript functions", 
NULL); +} + +/* Create parser definition structure */ +extern parserDefinition* PhpParser (void) +{ + static const char *const extensions [] = { "php", "php3", "phtml", NULL }; + parserDefinition* def = parserNew ("PHP"); + def->extensions = extensions; + def->initialize = installPHPRegex; + def->regex = TRUE; + return def; +} + +#if 0 + static boolean isLetter(const int c) { return (boolean)(isalpha(c) || (c >= 127 && c <= 255)); @@ -101,7 +159,7 @@ cp++;
while (isspace ((int) *cp)) - ++cp; + ++cp; }
vStringClear (name); @@ -113,12 +171,11 @@ vStringTerminate (name); makeSimpleTag (name, PhpKinds, K_FUNCTION); vStringClear (name); - } - else if ((f = strstr ((const char*) cp, "class")) != NULL && - (f == (const char*) cp || isspace ((int) f [-1])) && - isspace ((int) f [5])) + } + else if (strncmp ((const char*) cp, "class", (size_t) 5) == 0 && + isspace ((int) cp [5])) { - cp = ((const unsigned char *) f) + 5; + cp += 5;
while (isspace ((int) *cp)) ++cp; @@ -149,7 +206,7 @@ ++cp; else if (! ((*cp == '_') || isalnum ((int) *cp))) continue; - + vStringClear (name); while (isalnum ((int) *cp) || *cp == '_') { @@ -175,5 +232,6 @@ return def; }
+#endif
/* vi:set tabstop=4 shiftwidth=4: */
Modified: trunk/tagmanager/regex.c =================================================================== --- trunk/tagmanager/regex.c 2007-11-17 17:03:25 UTC (rev 2052) +++ trunk/tagmanager/regex.c 2007-11-17 19:27:50 UTC (rev 2053) @@ -1,654 +1,4952 @@ -/* -* -* Copyright (c) 2000-2001, Darren Hiebert -* -* This source code is released for free distribution under the terms of the -* GNU General Public License. -* -* This module contains functions for applying regular expression matching. -* -* The code for utlizing the Gnu regex package with regards to processing the -* regex option and checking for regex matches was adapted from routines in -* Gnu etags. -*/ +/* Extended regular expression matching and search library, + version 0.12, with minor changes by Darren Hiebert. + (Implements POSIX draft P10003.2/D11.2, except for + internationalization features.)
-/* -* INCLUDE FILES -*/ -#include "general.h" /* must always come first */ + Copyright (C) 1993 Free Software Foundation, Inc.
-#include <string.h> + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version.
-#if defined (HAVE_REGCOMP) || defined (HAVE_RE_COMPILE_PATTERN) -# include <ctype.h> -# include <stddef.h> -# ifdef HAVE_SYS_TYPES_H -# include <sys/types.h> /* declare off_t (not known to regex.h on FreeBSD) */ -# endif -# include "regex.h" -#endif -#include <glib/gstdio.h> + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details.
-#include "entry.h" -#include "main.h" -#include "parse.h" -#include "read.h" + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
-#ifdef HAVE_REGEX +/* AIX requires this to be the first thing in the file. */ +#if defined (_AIX) && !defined (REGEX_MALLOC) + #pragma alloca +#endif
-/* -* MACROS -*/ +#define _GNU_SOURCE
-/* Back-references \0 through \9 */ -#define BACK_REFERENCE_COUNT 10 +/* We need this for `regex.h', and perhaps for the Emacs include files. */ +#include <sys/types.h>
-#if defined (HAVE_REGCOMP) && !defined (REGCOMP_BROKEN) -# define POSIX_REGEX +#ifdef HAVE_CONFIG_H +#include "config.h" #endif
-#define REGEX_NAME "Regex" +/* The `emacs' switch turns on certain matching commands + that make sense only in Emacs. */ +#ifdef emacs
-/* -* DATA DECLARATIONS -*/ -#if defined (POSIX_REGEX) +#include "lisp.h" +#include "buffer.h" +#include "syntax.h"
-struct sKind { - boolean enabled; - char letter; - char* name; -}; +/* Emacs uses `NULL' as a predicate. */ +#undef NULL
-enum pType { PTRN_TAG, PTRN_CALLBACK }; +#else /* not emacs */
-typedef struct { - regex_t *pattern; - enum pType type; - union { - struct { - char *name_pattern; - struct sKind kind; - } tag; - struct { - regexCallback function; - } callback; - } u; -} regexPattern; +/* We used to test for `BSTRING' here, but only GCC and Emacs define + `BSTRING', as far as I know, and neither of them use this code. */ +#if HAVE_STRING_H || STDC_HEADERS +#include <string.h> +#ifndef bcmp +#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) +#endif +#ifndef bcopy +#define bcopy(s, d, n) memcpy ((d), (s), (n)) +#endif +#ifndef bzero +#define bzero(s, n) memset ((s), 0, (n)) +#endif +#else +#include <strings.h> +#endif
+#ifdef STDC_HEADERS +#include <stdlib.h> +#else +char *malloc (); +char *realloc (); #endif
-typedef struct { - regexPattern *patterns; - unsigned int count; -} patternSet;
-/* -* DATA DEFINITIONS -*/ +/* Define the syntax stuff for <, >, etc. */
-static boolean regexBroken = FALSE; +/* This must be nonzero for the wordchar and notwordchar pattern + commands in re_match_2. */ +#ifndef Sword +#define Sword 1 +#endif
-/* Array of pattern sets, indexed by language */ -static patternSet* Sets = NULL; -static int SetUpper = -1; /* upper language index in list */ +#ifdef SYNTAX_TABLE
-/* -* FUNCTION DEFINITIONS -*/ +extern char *re_syntax_table;
-static void clearPatternSet (const langType language) +#else /* not SYNTAX_TABLE */ + +/* How many characters in the character set. */ +#define CHAR_SET_SIZE 256 + +static char re_syntax_table[CHAR_SET_SIZE]; + +static void +init_syntax_once () { - if (language < SetUpper) - { - patternSet* const set = Sets + language; - unsigned int i; - for (i = 0 ; i < set->count ; ++i) - { -#if defined (POSIX_REGEX) - regfree (set->patterns [i].pattern); + register int c; + static int done = 0; + + if (done) + return; + + bzero (re_syntax_table, sizeof re_syntax_table); + + for (c = 'a'; c <= 'z'; c++) + re_syntax_table[c] = Sword; + + for (c = 'A'; c <= 'Z'; c++) + re_syntax_table[c] = Sword; + + for (c = '0'; c <= '9'; c++) + re_syntax_table[c] = Sword; + + re_syntax_table['_'] = Sword; + + done = 1; +} + +#endif /* not SYNTAX_TABLE */ + +#define SYNTAX(c) re_syntax_table[c] + +#endif /* not emacs */ + +/* Get the interface, including the syntax bits. */ +#include "regex.h" + +/* isalpha etc. are used for the character classes. */ +#include <ctype.h> + +#ifndef isascii +#define isascii(c) 1 #endif - eFree (set->patterns [i].pattern); - set->patterns [i].pattern = NULL;
- if (set->patterns [i].type == PTRN_TAG) - { - eFree (set->patterns [i].u.tag.name_pattern); - set->patterns [i].u.tag.name_pattern = NULL; - } - } - if (set->patterns != NULL) - eFree (set->patterns); - set->patterns = NULL; - set->count = 0; - } +#ifdef isblank +#define ISBLANK(c) (isascii (c) && isblank (c)) +#else +#define ISBLANK(c) ((c) == ' ' || (c) == '\t') +#endif +#ifdef isgraph +#define ISGRAPH(c) (isascii (c) && isgraph (c)) +#else +#define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c)) +#endif + +#define ISPRINT(c) (isascii (c) && isprint (c)) +#define ISDIGIT(c) (isascii (c) && isdigit (c)) +#define ISALNUM(c) (isascii (c) && isalnum (c)) +#define ISALPHA(c) (isascii (c) && isalpha (c)) +#define ISCNTRL(c) (isascii (c) && iscntrl (c)) +#define ISLOWER(c) (isascii (c) && islower (c)) +#define ISPUNCT(c) (isascii (c) && ispunct (c)) +#define ISSPACE(c) (isascii (c) && isspace (c)) +#define ISUPPER(c) (isascii (c) && isupper (c)) +#define ISXDIGIT(c) (isascii (c) && isxdigit (c)) + +#ifndef NULL +#define NULL 0 +#endif + +/* We remove any previous definition of `SIGN_EXTEND_CHAR', + since ours (we hope) works properly with all combinations of + machines, compilers, `char' and `unsigned char' argument types. + (Per Bothner suggested the basic approach.) */ +#undef SIGN_EXTEND_CHAR +#if __STDC__ +#define SIGN_EXTEND_CHAR(c) ((signed char) (c)) +#else /* not __STDC__ */ +/* As in Harbison and Steele. */ +#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) +#endif + +/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we + use `alloca' instead of `malloc'. This is because using malloc in + re_search* or re_match* could cause memory leaks when C-g is used in + Emacs; also, malloc is slower and causes storage fragmentation. On + the other hand, malloc is more portable, and easier to debug. 
+ + Because we sometimes use alloca, some routines have to be macros, + not functions -- `alloca'-allocated space disappears at the end of the + function it is called in. */ + +#ifdef REGEX_MALLOC + +#define REGEX_ALLOCATE malloc +#define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) + +#else /* not REGEX_MALLOC */ + +/* Emacs already defines alloca, sometimes. */ +#ifndef alloca + +/* Make alloca work the best possible way. */ +#ifdef __GNUC__ +#define alloca __builtin_alloca +#else /* not __GNUC__ */ +#if HAVE_ALLOCA_H +#include <alloca.h> +#else /* not __GNUC__ or HAVE_ALLOCA_H */ +#ifndef _AIX /* Already did AIX, up at the top. */ +char *alloca (); +#endif /* not _AIX */ +#endif /* not HAVE_ALLOCA_H */ +#endif /* not __GNUC__ */ + +#endif /* not alloca */ + +#define REGEX_ALLOCATE alloca + +/* Assumes a `char *destination' variable. */ +#define REGEX_REALLOCATE(source, osize, nsize) \ + (destination = (char *) alloca (nsize), \ + bcopy (source, destination, osize), \ + destination) + +#endif /* not REGEX_MALLOC */ + + +/* True if `size1' is non-NULL and PTR is pointing anywhere inside + `string1' or just past its end. This works if PTR is NULL, which is + a good thing. */ +#define FIRST_STRING_P(ptr) \ + (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) + +/* (Re)Allocate N items of type T using malloc, or fail. */ +#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) +#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) +#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) + +#define BYTEWIDTH 8 /* In bits. */ + +#define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +typedef char boolean; +#define false 0 +#define true 1 + +/* These are the command codes that appear in compiled regular + expressions. Some opcodes are followed by argument bytes. 
A + command code can specify any interpretation whatsoever for its + arguments. Zero bytes may appear in the compiled regular expression. + + The value of `exactn' is needed in search.c (search_buffer) in Emacs. + So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of + `exactn' we use here must also be 1. */ + +typedef enum +{ + no_op = 0, + + /* Followed by one byte giving n, then by n literal bytes. */ + exactn = 1, + + /* Matches any (more or less) character. */ + anychar, + + /* Matches any one char belonging to specified set. First + following byte is number of bitmap bytes. Then come bytes + for a bitmap saying which chars are in. Bits in each byte + are ordered low-bit-first. A character is in the set if its + bit is 1. A character too large to have a bit in the map is + automatically not in the set. */ + charset, + + /* Same parameters as charset, but match any character that is + not one of those specified. */ + charset_not, + + /* Start remembering the text that is matched, for storing in a + register. Followed by one byte with the register number, in + the range 0 to one less than the pattern buffer's re_nsub + field. Then followed by one byte with the number of groups + inner to this one. (This last has to be part of the + start_memory only because we need it in the on_failure_jump + of re_match_2.) */ + start_memory, + + /* Stop remembering the text that is matched and store it in a + memory register. Followed by one byte with the register + number, in the range 0 to one less than `re_nsub' in the + pattern buffer, and one byte with the number of inner groups, + just like `start_memory'. (We need the number of inner + groups here because we don't have any easy way of finding the + corresponding start_memory when we're at a stop_memory.) */ + stop_memory, + + /* Match a duplicate of something remembered. Followed by one + byte containing the register number. */ + duplicate, + + /* Fail unless at beginning of line. 
*/ + begline, + + /* Fail unless at end of line. */ + endline, + + /* Succeeds if at beginning of buffer (if emacs) or at beginning + of string to be matched (if not). */ + begbuf, + + /* Analogously, for end of buffer/string. */ + endbuf, + + /* Followed by two byte relative address to which to jump. */ + jump, + + /* Same as jump, but marks the end of an alternative. */ + jump_past_alt, + + /* Followed by two-byte relative address of place to resume at + in case of failure. */ + on_failure_jump, + + /* Like on_failure_jump, but pushes a placeholder instead of the + current string position when executed. */ + on_failure_keep_string_jump, + + /* Throw away latest failure point and then jump to following + two-byte relative address. */ + pop_failure_jump, + + /* Change to pop_failure_jump if know won't have to backtrack to + match; otherwise change to jump. This is used to jump + back to the beginning of a repeat. If what follows this jump + clearly won't match what the repeat does, such that we can be + sure that there is no use backtracking out of repetitions + already matched, then we change it to a pop_failure_jump. + Followed by two-byte address. */ + maybe_pop_jump, + + /* Jump to following two-byte address, and push a dummy failure + point. This failure point will be thrown away if an attempt + is made to use it for a failure. A `+' construct makes this + before the first repeat. Also used as an intermediary kind + of jump when compiling an alternative. */ + dummy_failure_jump, + + /* Push a dummy failure point and continue. Used at the end of + alternatives. */ + push_dummy_failure, + + /* Followed by two-byte relative address and two-byte number n. + After matching N times, jump to the address upon failure. */ + succeed_n, + + /* Followed by two-byte relative address, and two-byte number n. + Jump to the address N times, then fail. */ + jump_n, + + /* Set the following two-byte relative address to the + subsequent two-byte number. 
The address *includes* the two + bytes of number. */ + set_number_at, + + wordchar, /* Matches any word-constituent character. */ + notwordchar, /* Matches any char that is not a word-constituent. */ + + wordbeg, /* Succeeds if at word beginning. */ + wordend, /* Succeeds if at word end. */ + + wordbound, /* Succeeds if at a word boundary. */ + notwordbound /* Succeeds if not at a word boundary. */ + +#ifdef emacs + ,before_dot, /* Succeeds if before point. */ + at_dot, /* Succeeds if at point. */ + after_dot, /* Succeeds if after point. */ + + /* Matches any character whose syntax is specified. Followed by + a byte which contains a syntax code, e.g., Sword. */ + syntaxspec, + + /* Matches any character whose syntax is not that specified. */ + notsyntaxspec +#endif /* emacs */ +} re_opcode_t; + +/* Common operations on the compiled pattern. */ + +/* Store NUMBER in two contiguous bytes starting at DESTINATION. */ + +#define STORE_NUMBER(destination, number) \ + do { \ + (destination)[0] = (number) & 0377; \ + (destination)[1] = (number) >> 8; \ + } while (0) + +/* Same as STORE_NUMBER, except increment DESTINATION to + the byte after where the number is stored. Therefore, DESTINATION + must be an lvalue. */ + +#define STORE_NUMBER_AND_INCR(destination, number) \ + do { \ + STORE_NUMBER (destination, number); \ + (destination) += 2; \ + } while (0) + +/* Put into DESTINATION a number stored in two contiguous bytes starting + at SOURCE. */ + +#define EXTRACT_NUMBER(destination, source) \ + do { \ + (destination) = *(source) & 0377; \ + (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ + } while (0) + +#ifdef DEBUG +static void +extract_number (dest, source) + int *dest; + unsigned char *source; +{ + int temp = SIGN_EXTEND_CHAR (*(source + 1)); + *dest = *source & 0377; + *dest += temp << 8; }
-/* -* Regex psuedo-parser -*/ +#ifndef EXTRACT_MACROS /* To debug the macros. */ +#undef EXTRACT_NUMBER +#define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) +#endif /* not EXTRACT_MACROS */
-static void makeRegexTag (const vString* const name, - const struct sKind* const kind) +#endif /* DEBUG */ + +/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. + SOURCE must be an lvalue. */ + +#define EXTRACT_NUMBER_AND_INCR(destination, source) \ + do { \ + EXTRACT_NUMBER (destination, source); \ + (source) += 2; \ + } while (0) + +#ifdef DEBUG +static void +extract_number_and_incr (destination, source) + int *destination; + unsigned char **source; { - if (kind->enabled) - { - tagEntryInfo e; - Assert (name != NULL && vStringLength (name) > 0); - Assert (kind != NULL); - initTagEntry (&e, vStringValue (name)); - e.kind = kind->letter; - e.kindName = kind->name; - makeTagEntry (&e); - } + extract_number (destination, *source); + *source += 2; }
-/* -* Regex pattern definition -*/ +#ifndef EXTRACT_MACROS +#undef EXTRACT_NUMBER_AND_INCR +#define EXTRACT_NUMBER_AND_INCR(dest, src) \ + extract_number_and_incr (&dest, &src) +#endif /* not EXTRACT_MACROS */
-/* Take a string like "/blah/" and turn it into "blah", making sure - * that the first and last characters are the same, and handling - * quoted separator characters. Actually, stops on the occurrence of - * an unquoted separator. Also turns "\t" into a Tab character. - * Returns pointer to terminating separator. Works in place. Null - * terminates name string. - */ -static char* scanSeparators (char* name) +#endif /* DEBUG */ + +/* If DEBUG is defined, Regex prints many voluminous messages about what + it is doing (if the variable `debug' is nonzero). If linked with the + main program in `iregex.c', you can enter patterns and strings + interactively. And if linked with the main program in `main.c' and + the other test files, you can run the already-written tests. */ + +#ifdef DEBUG + +/* We use standard I/O for debugging. */ +#include <stdio.h> + +/* It is useful to test things that ``must'' be true when debugging. */ +#include <assert.h> + +static int debug = 0; + +#define DEBUG_STATEMENT(e) e +#define DEBUG_PRINT1(x) if (debug) printf (x) +#define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) +#define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) +#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) +#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ + if (debug) print_partial_compiled_pattern (s, e) +#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ + if (debug) print_double_string (w, s1, sz1, s2, sz2) + + +extern void printchar (); + +/* Print the fastmap in human-readable form. */ + +void +print_fastmap (fastmap) + char *fastmap; { - char sep = name [0]; - char *copyto = name; - boolean quoted = FALSE; + unsigned was_a_range = 0; + unsigned i = 0;
- for (++name ; *name != '\0' ; ++name) + while (i < (1 << BYTEWIDTH)) { - if (quoted) + if (fastmap[i++]) { - if (*name == sep) - *copyto++ = sep; - else if (*name == 't') - *copyto++ = '\t'; - else - { - /* Something else is quoted, so preserve the quote. */ - *copyto++ = '\'; - *copyto++ = *name; - } - quoted = FALSE; - } - else if (*name == '\') - quoted = TRUE; - else if (*name == sep) - { - break; - } - else - *copyto++ = *name; + was_a_range = 0; + printchar (i - 1); + while (i < (1 << BYTEWIDTH) && fastmap[i]) + { + was_a_range = 1; + i++; + } + if (was_a_range) + { + printf ("-"); + printchar (i - 1); + } + } } - *copyto = '\0'; - return name; + putchar ('\n'); }
-/* Parse `regexp', in form "/regex/name/[k,Kind/]flags" (where the separator - * character is whatever the first character of `regexp' is), by breaking it - * up into null terminated strings, removing the separators, and expanding - * '\t' into tabs. When complete, `regexp' points to the line matching - * pattern, a pointer to the name matching pattern is written to `name', a - * pointer to the kinds is written to `kinds' (possibly NULL), and a pointer - * to the trailing flags is written to `flags'. If the pattern is not in the - * correct format, a false value is returned. - */ -static boolean parseTagRegex (char* const regexp, char** const name, - char** const kinds, char** const flags) + +/* Print a compiled pattern string in human-readable form, starting at + the START pointer into it and ending just before the pointer END. */ + +void +print_partial_compiled_pattern (start, end) + unsigned char *start; + unsigned char *end; { - boolean result = FALSE; - const int separator = (unsigned char) regexp [0]; + int mcnt, mcnt2; + unsigned char *p = start; + unsigned char *pend = end;
- *name = scanSeparators (regexp); - if (*regexp == '\0') - error (WARNING, "empty regexp"); - else if (**name != separator) - error (WARNING, "%s: incomplete regexp", regexp); - else + if (start == NULL) { - char* const third = scanSeparators (*name); - if (**name == '\0') - error (WARNING, "%s: regexp missing name pattern", regexp); - if ((*name) [strlen (*name) - 1] == '\') - error (WARNING, "error in name pattern: "%s"", *name); - if (*third != separator) - error (WARNING, "%s: regexp missing final separator", regexp); - else + printf ("(null)\n"); + return; + } + + /* Loop over pattern commands. */ + while (p < pend) + { + switch ((re_opcode_t) *p++) { - char* const fourth = scanSeparators (third); - if (*fourth == separator) + case no_op: + printf ("/no_op"); + break; + + case exactn: + mcnt = *p++; + printf ("/exactn/%d", mcnt); + do { - *kinds = third; - scanSeparators (fourth); - *flags = fourth; - } - else - { - *flags = third; - *kinds = NULL; - } - result = TRUE; + putchar ('/'); + printchar (*p++); + } + while (--mcnt); + break; + + case start_memory: + mcnt = *p++; + printf ("/start_memory/%d/%d", mcnt, *p++); + break; + + case stop_memory: + mcnt = *p++; + printf ("/stop_memory/%d/%d", mcnt, *p++); + break; + + case duplicate: + printf ("/duplicate/%d", *p++); + break; + + case anychar: + printf ("/anychar"); + break; + + case charset: + case charset_not: + { + register int c; + + printf ("/charset%s", + (re_opcode_t) *(p - 1) == charset_not ? 
"_not" : ""); + + assert (p + *p < pend); + + for (c = 0; c < *p; c++) + { + unsigned bit; + unsigned char map_byte = p[1 + c]; + + putchar ('/'); + + for (bit = 0; bit < BYTEWIDTH; bit++) + if (map_byte & (1 << bit)) + printchar (c * BYTEWIDTH + bit); + } + p += 1 + *p; + break; + } + + case begline: + printf ("/begline"); + break; + + case endline: + printf ("/endline"); + break; + + case on_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/on_failure_jump/0/%d", mcnt); + break; + + case on_failure_keep_string_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/on_failure_keep_string_jump/0/%d", mcnt); + break; + + case dummy_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/dummy_failure_jump/0/%d", mcnt); + break; + + case push_dummy_failure: + printf ("/push_dummy_failure"); + break; + + case maybe_pop_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/maybe_pop_jump/0/%d", mcnt); + break; + + case pop_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/pop_failure_jump/0/%d", mcnt); + break; + + case jump_past_alt: + extract_number_and_incr (&mcnt, &p); + printf ("/jump_past_alt/0/%d", mcnt); + break; + + case jump: + extract_number_and_incr (&mcnt, &p); + printf ("/jump/0/%d", mcnt); + break; + + case succeed_n: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2); + break; + + case jump_n: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2); + break; + + case set_number_at: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2); + break; + + case wordbound: + printf ("/wordbound"); + break; + + case notwordbound: + printf ("/notwordbound"); + break; + + case wordbeg: + printf ("/wordbeg"); + break; + + case wordend: + printf ("/wordend"); + +#ifdef emacs + case before_dot: + printf 
("/before_dot"); + break; + + case at_dot: + printf ("/at_dot"); + break; + + case after_dot: + printf ("/after_dot"); + break; + + case syntaxspec: + printf ("/syntaxspec"); + mcnt = *p++; + printf ("/%d", mcnt); + break; + + case notsyntaxspec: + printf ("/notsyntaxspec"); + mcnt = *p++; + printf ("/%d", mcnt); + break; +#endif /* emacs */ + + case wordchar: + printf ("/wordchar"); + break; + + case notwordchar: + printf ("/notwordchar"); + break; + + case begbuf: + printf ("/begbuf"); + break; + + case endbuf: + printf ("/endbuf"); + break; + + default: + printf ("?%d", *(p-1)); } } - return result; + printf ("/\n"); }
-static void addCompiledTagPattern (const langType language, - regex_t* const pattern, char* const name, - const char kind, char* const kindName) + +void +print_compiled_pattern (bufp) + struct re_pattern_buffer *bufp; { - patternSet* set; - regexPattern *ptrn; - if (language > SetUpper) + unsigned char *buffer = bufp->buffer; + + print_partial_compiled_pattern (buffer, buffer + bufp->used); + printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); + + if (bufp->fastmap_accurate && bufp->fastmap) { - int i; - Sets = xRealloc (Sets, (language + 1), patternSet); - for (i = SetUpper + 1 ; i <= language ; ++i) - { - Sets [i].patterns = NULL; - Sets [i].count = 0; - } - SetUpper = language; + printf ("fastmap: "); + print_fastmap (bufp->fastmap); } - set = Sets + language; - set->patterns = xRealloc (set->patterns, (set->count + 1), regexPattern); - ptrn = &set->patterns [set->count]; - set->count += 1;
- ptrn->pattern = pattern; - ptrn->type = PTRN_TAG; - ptrn->u.tag.name_pattern = name; - ptrn->u.tag.kind.enabled = TRUE; - ptrn->u.tag.kind.letter = kind; - ptrn->u.tag.kind.name = kindName; + printf ("re_nsub: %d\t", bufp->re_nsub); + printf ("regs_alloc: %d\t", bufp->regs_allocated); + printf ("can_be_null: %d\t", bufp->can_be_null); + printf ("newline_anchor: %d\n", bufp->newline_anchor); + printf ("no_sub: %d\t", bufp->no_sub); + printf ("not_bol: %d\t", bufp->not_bol); + printf ("not_eol: %d\t", bufp->not_eol); + printf ("syntax: %d\n", bufp->syntax); + /* Perhaps we should print the translate table? */ }
-static void addCompiledCallbackPattern (const langType language, - regex_t* const pattern, - const regexCallback callback) + +void +print_double_string (where, string1, size1, string2, size2) + const char *where; + const char *string1; + const char *string2; + int size1; + int size2; { - patternSet* set; - regexPattern *ptrn; - if (language > SetUpper) + unsigned this_char; + + if (where == NULL) + printf ("(null)"); + else { - int i; - Sets = xRealloc (Sets, (language + 1), patternSet); - for (i = SetUpper + 1 ; i <= language ; ++i) - { - Sets [i].patterns = NULL; - Sets [i].count = 0; - } - SetUpper = language; + if (FIRST_STRING_P (where)) + { + for (this_char = where - string1; this_char < size1; this_char++) + printchar (string1[this_char]); + + where = string2; + } + + for (this_char = where - string2; this_char < size2; this_char++) + printchar (string2[this_char]); } - set = Sets + language; - set->patterns = xRealloc (set->patterns, (set->count + 1), regexPattern); - ptrn = &set->patterns [set->count]; - set->count += 1; +}
- ptrn->pattern = pattern; - ptrn->type = PTRN_CALLBACK; - ptrn->u.callback.function = callback; +#else /* not DEBUG */ + +#undef assert +#define assert(e) + +#define DEBUG_STATEMENT(e) +#define DEBUG_PRINT1(x) +#define DEBUG_PRINT2(x1, x2) +#define DEBUG_PRINT3(x1, x2, x3) +#define DEBUG_PRINT4(x1, x2, x3, x4) +#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) +#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) + +#endif /* not DEBUG */ + +/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can + also be assigned to arbitrarily: each pattern buffer stores its own + syntax, so it can be changed between regex compilations. */ +reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS; + + +/* Specify the precise syntax of regexps for compilation. This provides + for compatibility for various utilities which historically have + different, incompatible syntaxes. + + The argument SYNTAX is a bit mask comprised of the various bits + defined in regex.h. We return the old syntax. */ + +reg_syntax_t +re_set_syntax (syntax) + reg_syntax_t syntax; +{ + reg_syntax_t ret = re_syntax_options; + + re_syntax_options = syntax; + return ret; } + +/* This table gives an error message for each of the error codes listed + in regex.h. Obviously the order here has to be same as there. */
-#if defined (POSIX_REGEX) +static const char *re_error_msg[] = + { NULL, /* REG_NOERROR */ + "No match", /* REG_NOMATCH */ + "Invalid regular expression", /* REG_BADPAT */ + "Invalid collation character", /* REG_ECOLLATE */ + "Invalid character class name", /* REG_ECTYPE */ + "Trailing backslash", /* REG_EESCAPE */ + "Invalid back reference", /* REG_ESUBREG */ + "Unmatched [ or [^", /* REG_EBRACK */ + "Unmatched ( or \\(", /* REG_EPAREN */ + "Unmatched \\{", /* REG_EBRACE */ + "Invalid content of \\{\\}", /* REG_BADBR */ + "Invalid range end", /* REG_ERANGE */ + "Memory exhausted", /* REG_ESPACE */ + "Invalid preceding regular expression", /* REG_BADRPT */ + "Premature end of regular expression", /* REG_EEND */ + "Regular expression too big", /* REG_ESIZE */ + "Unmatched ) or \\)", /* REG_ERPAREN */ + }; + +/* Subroutine declarations and macros for regex_compile. */
-static regex_t* compileRegex (const char* const regexp, const char* const flags) +static void store_op1 (), store_op2 (); +static void insert_op1 (), insert_op2 (); +static boolean at_begline_loc_p (), at_endline_loc_p (); +static boolean group_in_compile_stack (); +static reg_errcode_t compile_range (); + +/* Fetch the next character in the uncompiled pattern---translating it + if necessary. Also cast from a signed character in the constant + string passed to us by the user to an unsigned char that we can use + as an array index (in, e.g., `translate'). */ +#define PATFETCH(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + if (translate) c = translate[c]; \ + } while (0) + +/* Fetch the next character in the uncompiled pattern, with no + translation. */ +#define PATFETCH_RAW(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + } while (0) + +/* Go backwards one character in the pattern. */ +#define PATUNFETCH p-- + + +/* If `translate' is non-null, return translate[D], else just D. We + cast the subscript to translate because some data is declared as + `char *', to avoid warnings when a string constant is passed. But + when we use a character as a subscript we must make it unsigned. */ +#define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) + + +/* Macros for outputting the compiled pattern into `buffer'. */ + +/* If the buffer isn't allocated when it comes in, use this. */ +#define INIT_BUF_SIZE 32 + +/* Make sure we have at least N more bytes of space in buffer. */ +#define GET_BUFFER_SPACE(n) \ + while ((unsigned long)(b - bufp->buffer + (n)) > bufp->allocated) \ + EXTEND_BUFFER () + +/* Make sure we have one more byte of buffer space and then add C to it. */ +#define BUF_PUSH(c) \ + do { \ + GET_BUFFER_SPACE (1); \ + *b++ = (unsigned char) (c); \ + } while (0) + + +/* Ensure we have two more bytes of buffer space and then append C1 and C2. 
*/ +#define BUF_PUSH_2(c1, c2) \ + do { \ + GET_BUFFER_SPACE (2); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + } while (0) + + +/* As with BUF_PUSH_2, except for three bytes. */ +#define BUF_PUSH_3(c1, c2, c3) \ + do { \ + GET_BUFFER_SPACE (3); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + *b++ = (unsigned char) (c3); \ + } while (0) + + +/* Store a jump with opcode OP at LOC to location TO. We store a + relative address offset by the three bytes the jump itself occupies. */ +#define STORE_JUMP(op, loc, to) \ + store_op1 (op, loc, (to) - (loc) - 3) + +/* Likewise, for a two-argument jump. */ +#define STORE_JUMP2(op, loc, to, arg) \ + store_op2 (op, loc, (to) - (loc) - 3, arg) + +/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ +#define INSERT_JUMP(op, loc, to) \ + insert_op1 (op, loc, (to) - (loc) - 3, b) + +/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ +#define INSERT_JUMP2(op, loc, to, arg) \ + insert_op2 (op, loc, (to) - (loc) - 3, arg, b) + + +/* This is not an arbitrary limit: the arguments which represent offsets + into the pattern are two bytes long. So if 2^16 bytes turns out to + be too small, many things would have to change. */ +#define MAX_BUF_SIZE (1L << 16) + + +/* Extend the buffer by twice its current size via realloc and + reset the pointers that pointed into the old block to point to the + correct places in the new one. If extending the buffer results in it + being larger than MAX_BUF_SIZE, then flag memory exhausted. 
*/ +#define EXTEND_BUFFER() \ + do { \ + unsigned char *old_buffer = bufp->buffer; \ + if (bufp->allocated == MAX_BUF_SIZE) \ + return REG_ESIZE; \ + bufp->allocated <<= 1; \ + if (bufp->allocated > MAX_BUF_SIZE) \ + bufp->allocated = MAX_BUF_SIZE; \ + bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ + if (bufp->buffer == NULL) \ + return REG_ESPACE; \ + /* If the buffer moved, move all the pointers into it. */ \ + if (old_buffer != bufp->buffer) \ + { \ + b = (b - old_buffer) + bufp->buffer; \ + begalt = (begalt - old_buffer) + bufp->buffer; \ + if (fixup_alt_jump) \ + fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ + if (laststart) \ + laststart = (laststart - old_buffer) + bufp->buffer; \ + if (pending_exact) \ + pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ + } \ + } while (0) + + +/* Since we have one byte reserved for the register number argument to + {start,stop}_memory, the maximum number of groups we can report + things about is what fits in that byte. */ +#define MAX_REGNUM 255 + +/* But patterns can have more than `MAX_REGNUM' registers. We just + ignore the excess. */ +typedef unsigned regnum_t; + + +/* Macros for the compile stack. */ + +/* Since offsets can go either forwards or backwards, this type needs to + be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. 
*/ +typedef int pattern_offset_t; + +typedef struct { - int cflags = REG_EXTENDED | REG_NEWLINE; - regex_t *result = NULL; - int errcode; - int i; - for (i = 0 ; flags != NULL && flags [i] != '\0' ; ++i) - { - switch ((int) flags [i]) - { - case 'b': cflags &= ~REG_EXTENDED; break; - case 'e': cflags |= REG_EXTENDED; break; - case 'i': cflags |= REG_ICASE; break; - default: error (WARNING, "unknown regex flag: '%c'", *flags); break; - } + pattern_offset_t begalt_offset; + pattern_offset_t fixup_alt_jump; + pattern_offset_t inner_group_offset; + pattern_offset_t laststart_offset; + regnum_t regnum; +} compile_stack_elt_t; + + +typedef struct +{ + compile_stack_elt_t *stack; + unsigned size; + unsigned avail; /* Offset of next open position. */ +} compile_stack_type; + + +#define INIT_COMPILE_STACK_SIZE 32 + +#define COMPILE_STACK_EMPTY (compile_stack.avail == 0) +#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) + +/* The next available element. */ +#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) + + +/* Set the bit for character C in a list. */ +#define SET_LIST_BIT(c) \ + (b[((unsigned char) (c)) / BYTEWIDTH] \ + |= 1 << (((unsigned char) c) % BYTEWIDTH)) + + +/* Get the next unsigned number in the uncompiled pattern. */ +#define GET_UNSIGNED_NUMBER(num) \ + { if (p != pend) \ + { \ + PATFETCH (c); \ + while (ISDIGIT (c)) \ + { \ + if (num < 0) \ + num = 0; \ + num = num * 10 + c - '0'; \ + if (p == pend) \ + break; \ + PATFETCH (c); \ + } \ + } \ } - result = xMalloc (1, regex_t); - errcode = regcomp (result, regexp, cflags); - if (errcode != 0) + +#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. 
*/ + +#define IS_CHAR_CLASS(string) \ + (STREQ (string, "alpha") || STREQ (string, "upper") \ + || STREQ (string, "lower") || STREQ (string, "digit") \ + || STREQ (string, "alnum") || STREQ (string, "xdigit") \ + || STREQ (string, "space") || STREQ (string, "print") \ + || STREQ (string, "punct") || STREQ (string, "graph") \ + || STREQ (string, "cntrl") || STREQ (string, "blank")) + +/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. + Returns one of error codes defined in `regex.h', or zero for success. + + Assumes the `allocated' (and perhaps `buffer') and `translate' + fields are set in BUFP on entry. + + If it succeeds, results are put in BUFP (if it returns an error, the + contents of BUFP are undefined): + `buffer' is the compiled pattern; + `syntax' is set to SYNTAX; + `used' is set to the length of the compiled pattern; + `fastmap_accurate' is zero; + `re_nsub' is the number of subexpressions in PATTERN; + `not_bol' and `not_eol' are zero; + + The `fastmap' and `newline_anchor' fields are neither + examined nor set. */ + +static reg_errcode_t +regex_compile (pattern, size, syntax, bufp) + const char *pattern; + int size; + reg_syntax_t syntax; + struct re_pattern_buffer *bufp; +{ + /* We fetch characters from PATTERN here. Even though PATTERN is + `char *' (i.e., signed), we declare these variables as unsigned, so + they can be reliably used as array indices. */ + register unsigned char c, c1; + + /* A random tempory spot in PATTERN. */ + const char *p1; + + /* Points to the end of the buffer, where we should append. */ + register unsigned char *b; + + /* Keeps track of unclosed groups. */ + compile_stack_type compile_stack; + + /* Points to the current (ending) position in the pattern. */ + const char *p = pattern; + const char *pend = pattern + size; + + /* How to translate the characters in the pattern. */ + char *translate = bufp->translate; + + /* Address of the count-byte of the most recently inserted `exactn' + command. 
This makes it possible to tell if a new exact-match + character can be added to that command or if the character requires + a new `exactn' command. */ + unsigned char *pending_exact = 0; + + /* Address of start of the most recently finished expression. + This tells, e.g., postfix * where to find the start of its + operand. Reset at the beginning of groups and alternatives. */ + unsigned char *laststart = 0; + + /* Address of beginning of regexp, or inside of last group. */ + unsigned char *begalt; + + /* Place in the uncompiled pattern (i.e., the {) to + which to go back if the interval is invalid. */ + const char *beg_interval; + + /* Address of the place where a forward jump should go to the end of + the containing expression. Each alternative of an `or' -- except the + last -- ends with a forward jump of this sort. */ + unsigned char *fixup_alt_jump = 0; + + /* Counts open-groups as they are encountered. Remembered for the + matching close-group on the compile stack, so the same register + number is put in the stop_memory as the start_memory. */ + regnum_t regnum = 0; + +#ifdef DEBUG + DEBUG_PRINT1 ("\nCompiling pattern: "); + if (debug) { - char errmsg[256]; - regerror (errcode, result, errmsg, 256); - error (WARNING, "%s", errmsg); - regfree (result); - eFree (result); - result = NULL; + unsigned debug_count; + + for (debug_count = 0; debug_count < size; debug_count++) + printchar (pattern[debug_count]); + putchar ('\n'); } - return result; -} +#endif /* DEBUG */
+ /* Initialize the compile stack. */ + compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); + if (compile_stack.stack == NULL) + return REG_ESPACE; + + compile_stack.size = INIT_COMPILE_STACK_SIZE; + compile_stack.avail = 0; + + /* Initialize the pattern buffer. */ + bufp->syntax = syntax; + bufp->fastmap_accurate = 0; + bufp->not_bol = bufp->not_eol = 0; + + /* Set `used' to zero, so that if we return an error, the pattern + printer (for debugging) will think there's no pattern. We reset it + at the end. */ + bufp->used = 0; + + /* Always count groups, whether or not bufp->no_sub is set. */ + bufp->re_nsub = 0; + +#if !defined (emacs) && !defined (SYNTAX_TABLE) + /* Initialize the syntax table. */ + init_syntax_once (); #endif
-static void parseKinds (const char* const kinds, - char* const kind, char** const kindName) -{ - *kind = '\0'; - *kindName = NULL; - if (kinds == NULL) + if (bufp->allocated == 0) { - *kind = 'r'; - *kindName = eStrdup ("regex"); + if (bufp->buffer) + { /* If zero allocated, but buffer is non-null, try to realloc + enough space. This loses if buffer's address is bogus, but + that is the user's responsibility. */ + RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); + } + else + { /* Caller did not allocate a buffer. Do it for them. */ + bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); + } + if (!bufp->buffer) return REG_ESPACE; + + bufp->allocated = INIT_BUF_SIZE; } - else if (kinds [0] != '\0') + + begalt = b = bufp->buffer; + + /* Loop through the uncompiled pattern until we're at the end. */ + while (p != pend) { - const char* k = kinds; - if (k [1] == ',' || k [1] == '\0') - *kind = *k++; - if (*k == ',') - ++k; - if (*k != '\0') - *kindName = eStrdup (k); + PATFETCH (c); + + switch (c) + { + case '^': + { + if ( /* If at start of pattern, it's an operator. */ + p == pattern + 1 + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's come before. */ + || at_begline_loc_p (pattern, p, syntax)) + BUF_PUSH (begline); + else + goto normal_char; + } + break; + + + case '$': + { + if ( /* If at end of pattern, it's an operator. */ + p == pend + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's next. */ + || at_endline_loc_p (p, pend, syntax)) + BUF_PUSH (endline); + else + goto normal_char; + } + break; + + + case '+': + case '?': + if ((syntax & RE_BK_PLUS_QM) + || (syntax & RE_LIMITED_OPS)) + goto normal_char; + handle_plus: + case '*': + /* If there is no previous pattern... 
*/ + if (!laststart) + { + if (syntax & RE_CONTEXT_INVALID_OPS) + return REG_BADRPT; + else if (!(syntax & RE_CONTEXT_INDEP_OPS)) + goto normal_char; + } + + { + /* Are we optimizing this jump? */ + boolean keep_string_p = false; + + /* 1 means zero (many) matches is allowed. */ + char zero_times_ok = 0, many_times_ok = 0; + + /* If there is a sequence of repetition chars, collapse it + down to just one (the right one). We can't combine + interval operators with these because of, e.g., `a{2}*', + which should only match an even number of `a's. */ + + for (;;) + { + zero_times_ok |= c != '+'; + many_times_ok |= c != '?'; + + if (p == pend) + break; + + PATFETCH (c); + + if (c == '*' + || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) + ; + + else if (syntax & RE_BK_PLUS_QM && c == '\\') + { + if (p == pend) return REG_EESCAPE; + + PATFETCH (c1); + if (!(c1 == '+' || c1 == '?')) + { + PATUNFETCH; + PATUNFETCH; + break; + } + + c = c1; + } + else + { + PATUNFETCH; + break; + } + + /* If we get here, we found another repeat character. */ + } + + /* Star, etc. applied to an empty pattern is equivalent + to an empty pattern. */ + if (!laststart) + break; + + /* Now we know whether or not zero matches is allowed + and also whether or not two or more matches is allowed. */ + if (many_times_ok) + { /* More than one repetition is allowed, so put in at the + end a backward relative jump from `b' to before the next + jump we're going to put in below (which jumps from + laststart to after this jump). + + But if we are at the `*' in the exact sequence `.*\n', + insert an unconditional jump backwards to the ., + instead of the beginning of the loop. This way we only + push a failure point once, instead of every time + through the loop. */ + assert (p - 1 > pattern); + + /* Allocate the space for the jump. */ + GET_BUFFER_SPACE (3); + + /* We know we are not at the first character of the pattern, + because laststart was nonzero. 
And we've already + incremented `p', by the way, to be the character after + the `*'. Do we have to do something analogous here + for null bytes, because of RE_DOT_NOT_NULL? */ + if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') + && zero_times_ok + && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') + && !(syntax & RE_DOT_NEWLINE)) + { /* We have .*\n. */ + STORE_JUMP (jump, b, laststart); + keep_string_p = true; + } + else + /* Anything else. */ + STORE_JUMP (maybe_pop_jump, b, laststart - 3); + + /* We've added more stuff to the buffer. */ + b += 3; + } + + /* On failure, jump from laststart to b + 3, which will be the + end of the buffer after this jump is inserted. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump + : on_failure_jump, + laststart, b + 3); + pending_exact = 0; + b += 3; + + if (!zero_times_ok) + { + /* At least one repetition is required, so insert a + `dummy_failure_jump' before the initial + `on_failure_jump' instruction of the loop. This + effects a skip over that instruction the first time + we hit that loop. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); + b += 3; + } + } + break; + + + case '.': + laststart = b; + BUF_PUSH (anychar); + break; + + + case '[': + { + boolean had_char_class = false; + + if (p == pend) return REG_EBRACK; + + /* Ensure that we have enough space to push a charset: the + opcode, the length count, and the bitset; 34 bytes in all. */ + GET_BUFFER_SPACE (34); + + laststart = b; + + /* We test `*p == '^' twice, instead of using an if + statement, so we only need one BUF_PUSH. */ + BUF_PUSH (*p == '^' ? charset_not : charset); + if (*p == '^') + p++; + + /* Remember the first position in the bracket expression. */ + p1 = p; + + /* Push the number of bytes in the bitmap. */ + BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); + + /* Clear the whole map. */ + bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); + + /* charset_not matches newline according to a syntax bit. 
*/ + if ((re_opcode_t) b[-2] == charset_not + && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) + SET_LIST_BIT ('\n'); + + /* Read in characters and ranges, setting map bits. */ + for (;;) + { + if (p == pend) return REG_EBRACK; + + PATFETCH (c); + + /* \ might escape characters inside [...] and [^...]. */ + if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') + { + if (p == pend) return REG_EESCAPE; + + PATFETCH (c1); + SET_LIST_BIT (c1); + continue; + } + + /* Could be the end of the bracket expression. If it's + not (i.e., when the bracket expression is `[]' so + far), the ']' character bit gets set way below. */ + if (c == ']' && p != p1 + 1) + break; + + /* Look ahead to see if it's a range when the last thing + was a character class. */ + if (had_char_class && c == '-' && *p != ']') + return REG_ERANGE; + + /* Look ahead to see if it's a range when the last thing + was a character: if this is a hyphen not at the + beginning or the end of a list, then it's the range + operator. */ + if (c == '-' + && !(p - 2 >= pattern && p[-2] == '[') + && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') + && *p != ']') + { + reg_errcode_t ret + = compile_range (&p, pend, translate, syntax, b); + if (ret != REG_NOERROR) return ret; + } + + else if (p[0] == '-' && p[1] != ']') + { /* This handles ranges made up of characters only. */ + reg_errcode_t ret; + + /* Move past the `-'. */ + PATFETCH (c1); + + ret = compile_range (&p, pend, translate, syntax, b); + if (ret != REG_NOERROR) return ret; + } + + /* See if we're at the beginning of a possible character + class. */ + + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') + { /* Leave room for the null. */ + char str[CHAR_CLASS_MAX_LENGTH + 1]; + + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[:'. 
*/ + if (p == pend) return REG_EBRACK; + + for (;;) + { + PATFETCH (c); + if (c == ':' || c == ']' || p == pend + || c1 == CHAR_CLASS_MAX_LENGTH) + break; + str[c1++] = c; + } + str[c1] = '\0'; + + /* If isn't a word bracketed by `[:' and:`]': + undo the ending character, the letters, and leave + the leading `:' and `[' (but set bits for them). */ + if (c == ':' && *p == ']') + { + int ch; + boolean is_alnum = STREQ (str, "alnum"); + boolean is_alpha = STREQ (str, "alpha"); + boolean is_blank = STREQ (str, "blank"); + boolean is_cntrl = STREQ (str, "cntrl"); + boolean is_digit = STREQ (str, "digit"); + boolean is_graph = STREQ (str, "graph"); + boolean is_lower = STREQ (str, "lower"); + boolean is_print = STREQ (str, "print"); + boolean is_punct = STREQ (str, "punct"); + boolean is_space = STREQ (str, "space"); + boolean is_upper = STREQ (str, "upper"); + boolean is_xdigit = STREQ (str, "xdigit"); + + if (!IS_CHAR_CLASS (str)) return REG_ECTYPE; + + /* Throw away the ] at the end of the character + class. */ + PATFETCH (c); + + if (p == pend) return REG_EBRACK; + + for (ch = 0; ch < 1 << BYTEWIDTH; ch++) + { + if ( (is_alnum && ISALNUM (ch)) + || (is_alpha && ISALPHA (ch)) + || (is_blank && ISBLANK (ch)) + || (is_cntrl && ISCNTRL (ch)) + || (is_digit && ISDIGIT (ch)) + || (is_graph && ISGRAPH (ch)) + || (is_lower && ISLOWER (ch)) + || (is_print && ISPRINT (ch)) + || (is_punct && ISPUNCT (ch)) + || (is_space && ISSPACE (ch)) + || (is_upper && ISUPPER (ch)) + || (is_xdigit && ISXDIGIT (ch))) + SET_LIST_BIT (ch); + } + had_char_class = true; + } + else + { + c1++; + while (c1--) + PATUNFETCH; + SET_LIST_BIT ('['); + SET_LIST_BIT (':'); + had_char_class = false; + } + } + else + { + had_char_class = false; + SET_LIST_BIT (c); + } + } +
@@ Diff output truncated at 100000 characters. @@
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.