[geany/geany] d350da: Rewrite Python standard library tags creation script for Python 3 - Commits

21 May 2023


Branch:      refs/heads/master
Author:      Enrico Tröger enrico.troeger@uvena.de
Committer:   Enrico Tröger enrico.troeger@uvena.de
Date:        Sun, 07 May 2023 07:34:16 UTC
Commit:      d350dad27a45656cd9c84a0ac9dfb3a0412d6604
             https://github.com/geany/geany/commit/d350dad27a45656cd9c84a0ac9dfb3a0412d6604
Log Message:
-----------
Rewrite Python standard library tags creation script for Python 3
Modified Paths:
--------------
    data/tags/std.py.tags
    scripts/create_py_tags.py
Modified: data/tags/std.py.tags
21550 lines changed, 15586 insertions(+), 5964 deletions(-)
===================================================================
No diff available, check online
Modified: scripts/create_py_tags.py
514 lines changed, 286 insertions(+), 228 deletions(-)
===================================================================
@@ -7,310 +7,368 @@
 #
 # This script should be run in the top source directory.
 #
-# Parses all files given on command line for Python classes or functions and write
-# them into data/tags/std.py.tags (internal tagmanager format).
+# Parses all files in the directories given on command line for Python classes or functions and
+# write them into data/tags/std.py.tags (internal tagmanager format).
 # If called without command line arguments, a preset of common Python libs is used.
 #
 # WARNING
-# Be aware that running this script will actually *import* modules in the specified directory
+# Be aware that running this script will actually *import* all modules given on the command line
 # or in the standard library path of your Python installation. Dependent on what Python modules
 # you have installed, this might not be want you want and can have weird side effects.
 # You have been warned.
 #
 # It should be however relatively safe to execute this script from a fresh Python installation
-# installed into a dedicated prefix. Then nothing else is necessary as to change the interpreter
-# with which you start this script.
+# installed into a dedicated prefix or from an empty virtualenv or ideally in a Docker container
+# in the Geany project directory:
+# docker run --rm -it --user $(id -u):$(id -g) -v $(pwd):/data --workdir /data python:3.11-alpine python scripts/create_py_tags.py
 #
 import datetime
-import imp
+import importlib.util
 import inspect
 import os
+import platform
 import re
 import sys
-
-PYTHON_LIB_DIRECTORY = os.path.dirname(os.__file__)
-PYTHON_LIB_IGNORE_PACKAGES = ('test', 'dist-packages', 'site-packages', 'Tools')
-# some modules execute funky code when they are imported which we really don't want here
-# (though if you feel funny, try: 'import antigravity')
-PYTHON_LIB_IGNORE_MODULES = ('antigravity.py', 'idlelib/idle.py', 'multiprocessing/util.py')
-PYTHON_KEYWORDS = ('and', 'as', 'assert', 'break', 'class', 'continue', 'def', 'del', 'elif',
-                   'else', 'except', 'exec', 'finally', 'for', 'from', 'global', 'if', 'import',
-                   'in', 'is', 'lambda', 'not', 'or', 'pass', 'print', 'raise', 'return', 'try',
-                   'while', 'with', 'yield', 'False', 'None', 'True')
-
-# (from tagmanager/tm_tag.c:32)
-TA_NAME = '%c' % 200,
-TA_TYPE = '%c' % 204
-TA_ARGLIST = '%c' % 205
-TA_SCOPE = '%c' % 206
-
-# TMTagType (tagmanager/tm_tag.h:47)
-TYPE_CLASS = '%d' % 1
-TYPE_FUNCTION = '%d' % 128
-
-tag_filename = 'data/tags/std.py.tags'
-tag_regexp = '^[ \t]*(def|class)[ \t]+([a-zA-Z0-9_]+)[ \t]*((.*))[:]'
-
-def joinseq(seq):
-    if len(seq) == 1:
-        return '(' + seq[0] + ',)'
-    else:
-        return '(' + ', '.join(seq) + ')'
-
-def strseq(object, convert, join=joinseq):
-    """Recursively walk a sequence, stringifying each element."""
-    if type(object) in (list, tuple):
-        return join(map(lambda o, c=convert, j=join: strseq(o, c, j), object))
-    else:
-        return convert(object)
+import sysconfig
+import warnings
+from pathlib import Path
+
+# treat all DeprecationWarnings as errors so we can catch them to ignore the corresponding modules
+warnings.filterwarnings('error', category=DeprecationWarning)
+
+PYTHON_LIB_DIRECTORY = Path(os.__file__).parent
+PYTHON_LIB_IGNORE_PACKAGES = ['dist-packages', 'distutils', 'encodings', 'idlelib', 'lib2to3',
+                              'site-packages', 'test', 'turtledemo', 'Tools']
+# some modules/classes are deprecated or execute funky code when they are imported
+# which we really don't want here (though if you feel funny, try: 'import antigravity')
+PYTHON_LIB_IGNORE_MODULES = ('__phello__.foo', 'antigravity', 'asyncio.windows_events',
+                             'asyncio.windows_utils', 'ctypes.wintypes', 'ensurepip._bundled',
+                             'lib2to3', 'multiprocessing.popen_spawn_win32', 'this', 'turtle')
+PYTHON_LIB_IGNORE_CLASSES = ('typing.io', 'typing.re')
+
+# Python kinds
+KIND_CLASS = 'class'
+KIND_FUNCTION = 'function'
+KIND_MEMBER = 'member'
+
+TAG_FILENAME = 'data/tags/std.py.tags'
+TAG_REGEXP = re.compile(r'^[ \t]*(def|class)[ \t]+([a-zA-Z0-9_]+)[ \t]*((.*))[:]')
+OBJECT_MEMORY_ADDRESS_REGEXP = re.compile(r'<(.+?) at 0x[0-9a-f]+(?:.+)>', flags=re.IGNORECASE)
+
+CTAGS_FILE_HEADER = f'''!_TAG_FILE_FORMAT	2	/extended format; --format=1 will not append ;" to lines/
+!_TAG_FILE_SORTED	1	/0=unsorted, 1=sorted, 2=foldcase/
+!_TAG_OUTPUT_EXCMD	mixed	/number, pattern, mixed, or combineV2/
+!_TAG_OUTPUT_FILESEP	slash	/slash or backslash/
+!_TAG_OUTPUT_MODE	u-ctags	/u-ctags or e-ctags/
+!_TAG_PATTERN_LENGTH_LIMIT	96	/0 for no limit/
+!_TAG_PROGRAM_NAME scripts/create_py_tags.py Automatically generated file - do not edit (created on {datetime.datetime.now().ctime()} with Python {platform.python_version()})
+'''
+
+# pylint: disable=no-else-return,no-self-use
-########################################################################
 class Parser:
-    #----------------------------------------------------------------------
     def __init__(self):
         self.tags = {}
-        self.re_matcher = re.compile(tag_regexp)
-    #----------------------------------------------------------------------
-    def _get_superclass(self, _object):
+    def _add_tag(self, object_name, object_, kind, module_path=None, parent=''):
         """
-        Python class base-finder
-        (found on http://mail.python.org/pipermail/python-list/2002-November/173949.html)
+        Verify the found tag name and if it is valid, add it to the list
-        @param _object (object)
-        @return superclass (object)
+        @param object_ (instance)
+        @param tag_type (str)
+        @param parent (str)
         """
-        try:
-            #~ TODO print inspect.getmro(c)
-            if isinstance(_object, type):
-                return _object.__bases__[0].__name__
-            else:
-                return _object.__mro__[1].__name__
-        except IndexError:
-            return ''
+        if len(object_name) < 4 or is_private_identifier(object_name):
+            return  # skip short and private tags
+        if object_ is not None and not is_relevant_identifier(object_):
+            return
+
+        tag_key = (module_path, parent, object_name)
+        if tag_key not in self.tags:
+            signature = self._create_signature(object_) if object_ is not None else None
+            self.tags[tag_key] = self._format_tag(object_name, kind, signature, parent)
-    #----------------------------------------------------------------------
-    def _formatargspec(self, args, varargs=None, varkw=None, defaults=None,
-                      formatarg=str,
-                      formatvarargs=lambda name: '*' + name,
-                      formatvarkw=lambda name: '**' + name,
-                      formatvalue=lambda value: '=' + repr(value),
-                      join=joinseq):
-        """Format an argument spec from the 4 values returned by getargspec.
-
-        The first four arguments are (args, varargs, varkw, defaults).  The
-        other four arguments are the corresponding optional formatting functions
-        that are called to turn names and values into strings.  The ninth
-        argument is an optional function to format the sequence of arguments."""
-        specs = []
-        if defaults:
-            firstdefault = len(args) - len(defaults)
-        for i in range(len(args)):
-            spec = strseq(args[i], formatarg, join)
-            if defaults and i >= firstdefault:
-                d = defaults[i - firstdefault]
-                # this is the difference from the original formatargspec() function
-                # to use nicer names then the default repr() output
-                if hasattr(d, '__name__'):
-                    d = d.__name__
-                spec = spec + formatvalue(d)
-            specs.append(spec)
-        if varargs is not None:
-            specs.append(formatvarargs(varargs))
-        if varkw is not None:
-            specs.append(formatvarkw(varkw))
-        return ', '.join(specs)
-
-    #----------------------------------------------------------------------
-    def _add_tag(self, obj, tag_type, parent=''):
+    def _format_tag(self, tagname, kind, signature, parent):
+        signature_field = f'\tsignature:{signature}' if signature else ''
+        parent_field = f'\tclass:{parent}' if parent else ''
+
+        return f'{tagname}\t/unknown\t1;"\tkind:{kind}{parent_field}{signature_field}\n'
+
+    def _get_safe_parameter_default_value(self, value):
         """
-        Verify the found tag name and if it is valid, add it to the list
+        Replace possibly sensitive or just much information from the default value
+        """
+        # prevent evaluating of `os.environ` in cgi.print_environ(environ=os.environ) which
+        # would lead to include the current full environment variables to be included
+        # in the tags file
+        if isinstance(value, (dict, os._Environ)) and value:  # pylint: disable=protected-access
+            return f'<default-value-stripped {type(value)}>'
+        if isinstance(value, str):
+            # remove interpreter paths
+            if sys.executable in value:
+                return '/nonexistent/bin/python3'
+            # remove interpreter paths
+            if sys.prefix in value:
+                return '/nonexistent'
+
+        # for all other default values, return the string representation,
+        # assuming it is shorter than repr()
+        value_str = str(value)
+
+        # remove object hex addresses, e.g
+        # subTest(self, msg='<object object at 0x7f14bdfcd5a0>', **params)
+        if OBJECT_MEMORY_ADDRESS_REGEXP.search(value_str):
+            return OBJECT_MEMORY_ADDRESS_REGEXP.sub(r'<\1>', value_str)
+
+        return value_str
+
+    def _stringify_parameter_default_if_necessary(self, parameter):
+        """
+        Replace default values of the parameters with their string variants if they are not
+        basic types. This is to avoid signatures like (`ssl.SSLContext.load_default_certs`):
+        create_default_contextÌ128Í(purpose=<Purpose.SERVER_AUTH: _ASN1Object(nid=129, shortname='serverAuth', longname='TLS Web Server Authentication', oid='1.3.6.1.5.5.7.3.1')>, *, cafile=None, capath=None, cadata=None)ÎSSLContext  # noqa pylint: disable=line-too-long
+        and create instead:
+        create_default_contextÌ128Í(purpose='Purpose.SERVER_AUTH', *, cafile=None, capath=None, cadata=None)
+
+        This is not perfect as it might suggest that the `purpose` parameter accepts a string.
+        But having the full `repr()` result is even worse.
+        """
+        if not parameter.default or parameter.default is parameter.empty:
+            return parameter
+        if isinstance(parameter.default, (bool, int, float)):
+            return parameter
-        @param obj (instance)
-        @param tag_type (str)
-        @param parent (str)
+        new_default = self._get_safe_parameter_default_value(parameter.default)
+        return parameter.replace(default=new_default)
+
+    def _create_signature(self, object_):
+        """
+        Create signature for the given `object_`.
         """
-        args = ''
-        scope = ''
         try:
-            args = self._formatargspec(inspect.getfullargspec(obj))
-        except (TypeError, KeyError):
-            pass
-        if parent:
-            if tag_type == TYPE_CLASS:
-                args = '(%s)' % parent
+            signature = inspect.signature(object_)
+        except (ValueError, TypeError):
+            # inspect.signature() throws ValueError and TypeError for unsupported callables,
+            # so we need to ignore the signature for this callable
+            return ''
+
+        new_parameters = []
+        for parameter_name in signature.parameters:
+            parameter = signature.parameters[parameter_name]
+            if parameter.default and not isinstance(parameter.default, parameter.empty):
+                new_parameter = self._stringify_parameter_default_if_necessary(parameter)
+                new_parameters.append(new_parameter)
             else:
-                scope = '%s%s' % (TA_SCOPE, parent)
-        if isinstance(obj, str):
-            tagname = obj
-        else:
-            tagname = obj.__name__
-        # check for duplicates
-        if len(tagname) < 4:
-            # skip short tags
-            return
-        tag = '%s%s%s%s%s%s\n' % (tagname, TA_TYPE, tag_type, TA_ARGLIST, args, scope)
+                new_parameters.append(parameter)
-        if not tagname in self.tags and not tagname_is_like_keyword(tagname):
-            self.tags[tagname] = tag
+        return signature.replace(parameters=new_parameters)
-    #----------------------------------------------------------------------
-    def process_file(self, filename):
+    def process_module(self, module_path, module_filename):
         """
-        Read the file specified by filename and look for class and function definitions
-
-        @param filename (str)
+        Import the given module path and look for class and function definitions
         """
+        module = None
+        symbols = None
+        module_error = None
+
+        if module_path.endswith('__main__'):
+            return  # ignore any executable modules, importing them would execute the module
+
         try:
-            module = imp.load_source('tags_file_module', filename)
-        except IOError as e:
-            # file not found
-            print('%s: %s' % (filename, e))
+            module = importlib.import_module(module_path)
+        except DeprecationWarning as exc:
+            print(f'Ignoring deprecated module "{module_path}" (reason: {exc})')
             return
-        except Exception:
-            module = None
-
-        if module:
-            symbols = inspect.getmembers(module, callable)
-            for obj_name, obj in symbols:
-                try:
-                    name = obj.__name__
-                except AttributeError:
-                    name = obj_name
-                if not name or not isinstance(name, str) or is_private_identifier(name):
-                    # skip non-public tags
-                    continue
-                if inspect.isfunction(obj):
-                    self._add_tag(obj, TYPE_FUNCTION)
-                elif inspect.isclass(obj):
-                    self._add_tag(obj, TYPE_CLASS, self._get_superclass(obj))
-                    try:
-                        methods = inspect.getmembers(obj, inspect.ismethod)
-                    except (TypeError, AttributeError):
-                        methods = []
-                    for m_name, m_obj in methods:
-                        # skip non-public tags
-                        if is_private_identifier(m_name) or not inspect.ismethod(m_obj):
-                            continue
-                        self._add_tag(m_obj, TYPE_FUNCTION, name)
+        except Exception as exc:
+            module_error = str(exc)
+        else:
+            symbols = inspect.getmembers(module)
+
+        if symbols:
+            self._process_module_with_inspect(symbols, module_path)
         else:
-            # plain regular expression based parsing
-            filep = open(filename)
-            for line in filep:
-                m = self.re_matcher.match(line)
-                if m:
-                    tag_type_str, tagname, args = m.groups()
+            # If error is empty, there are probably just no symbols in the module, e.g. on empty
+            # __init__.py files. Try to parse them anyway. But log module_errors.
+            if module_error:
+                print(f'Using fallback parser for: {module_path} ({module_filename}, reason: {module_error})')
+
+            self._process_module_with_fallback_parser(module_filename)
+
+    def _process_module_with_inspect(self, symbols, module_path):
+        """
+        Try to analyse all symbols in the module as found by `inspect.getmembers`.
+        """
+        for obj_name, obj in symbols:
+            if is_import(obj, module_path):
+                continue
+
+            # function and similar callables
+            if inspect.isroutine(obj):
+                self._add_tag(obj_name, obj, KIND_FUNCTION, module_path)
+            # class
+            elif inspect.isclass(obj):
+                if _ignore_class(module_path, obj_name):
+                    continue
+                self._add_tag(obj_name, obj, KIND_CLASS, module_path)
+                methods = inspect.getmembers(obj)
+                # methods
+                for m_name, m_obj in methods:
+                    self._add_tag(m_name, m_obj, KIND_MEMBER, module_path, parent=obj_name)
+
+    def _process_module_with_fallback_parser(self, module_filename):
+        """
+        Plain regular expression based parsing, used as fallback if `inspect`'ing the module is not possible
+        """
+        with open(module_filename, encoding='utf-8') as filep:
+            for line_number, line in enumerate(filep):
+                match = TAG_REGEXP.match(line)
+                if match:
+                    tag_type_str, tagname, args = match.groups()
                     if not tagname or is_private_identifier(tagname):
-                        # skip non-public tags
                         continue
-                    if tag_type_str == 'class':
-                        tag_type = TYPE_CLASS
-                    else:
-                        tag_type = TYPE_FUNCTION
-                    args = args.strip()
-                    tag = '%s%s%s%s%s\n' % (tagname, TA_TYPE, tag_type, TA_ARGLIST, args)
-                    if not tagname in self.tags and not tagname_is_like_keyword(tagname):
-                        self.tags[tagname] = tag
-            filep.close()
-
-    #----------------------------------------------------------------------
+                    if tagname in self.tags:
+                        continue
+
+                    kind = KIND_CLASS if tag_type_str == 'class' else KIND_FUNCTION
+                    signature = args.strip()
+                    self.tags[tagname] = self._format_tag(tagname, kind, signature, parent=None)
+
     def add_builtins(self):
         """
         Add the contents of __builtins__ as simple tags
         """
-        for tag_name in dir(__builtins__):
-            # check if the tag name starts with upper case, then we assume it is a class
-            # note that this is a very very simple heuristic to determine the type and will give
-            # false positives
-            if tag_name[0].isupper():
-                tag_type = TYPE_CLASS
-            else:
-                tag_type = TYPE_FUNCTION
-
-            self._add_tag(tag_name, tag_type)
+        builtins = inspect.getmembers(__builtins__)
+        for b_name, b_obj in builtins:
+            if inspect.isclass(b_obj):
+                self._add_tag(b_name, b_obj, KIND_CLASS)
+            elif is_relevant_identifier(b_obj):
+                self._add_tag(b_name, b_obj, KIND_FUNCTION)
-    #----------------------------------------------------------------------
     def write_to_file(self, filename):
         """
         Sort the found tags and write them into the file specified by filename
         @param filename (str)
         """
-        result = list(self.tags.values())
-        # sort the tags
-        result.sort()
+        result = sorted(self.tags.values())
         # write them
-        with open(filename, 'wb') as target_file:
-            target_file.write(
-                ('# format=tagmanager - Automatically generated file - do not edit (created on %s)\n' % \
-                datetime.datetime.now().ctime()).encode('latin-1'))
+        with open(filename, 'w') as target_file:
+            target_file.write(CTAGS_FILE_HEADER)
             for symbol in result:
-                if not symbol == '\n': # skip empty lines
-                    target_file.write(symbol.encode('latin-1'))
+                if symbol != '\n':  # skip empty lines
+                    target_file.write(symbol)
-#----------------------------------------------------------------------
-def tagname_is_like_keyword(tagname):
-    """ignore tags which start with a keyword to avoid annoying completions of 'pass_' and similar ones"""
-    # this is not really efficient but in this script speed doesn't really matter
-    for keyword in PYTHON_KEYWORDS:
-        if tagname.startswith(keyword):
-            return True
+def is_import(object_, module_path):
+    object_module = getattr(object_, '__module__', None)
+    if object_module and object_module != module_path:
+        return True
+
     return False
-#----------------------------------------------------------------------
 def is_private_identifier(tagname):
     return tagname.startswith('_') or tagname.endswith('_')
-#----------------------------------------------------------------------
-def get_module_filenames(path):
-    def ignore_package(package):
-        for ignore in PYTHON_LIB_IGNORE_PACKAGES:
-            if ignore in package:
-                return True
-        return False
+def is_relevant_identifier(object_):
+    # TODO add "inspect.isdatadescriptor" for properties
+    # TODO maybe also consider attributes, e.g. by checking against __dict__ or so
+    return \
+        inspect.ismethod(object_) or \
+        inspect.isclass(object_) or \
+        inspect.isfunction(object_) or \
+        inspect.isgeneratorfunction(object_) or \
+        inspect.isgenerator(object_) or \
+        inspect.iscoroutinefunction(object_) or \
+        inspect.iscoroutine(object_) or \
+        inspect.isawaitable(object_) or \
+        inspect.isasyncgenfunction(object_) or \
+        inspect.isasyncgen(object_) or \
+        inspect.isroutine(object_) or \
+        inspect.isabstract(object_)
+
+
+def _setup_global_package_ignore_list():
+    """Read the python-config path from LIBPL and strip the prefix part
+       (e.g. /usr/lib/python3.8/config-3.8-x86_64-linux-gnu gets config-3.8-x86_64-linux-gnu)
+    """
+    python_config_dir = Path(sysconfig.get_config_var('LIBPL'))
+    try:
+        python_config_package = python_config_dir.relative_to(PYTHON_LIB_DIRECTORY)
+    except ValueError:
+        python_config_package = python_config_dir
+
+    PYTHON_LIB_IGNORE_PACKAGES.append(python_config_package.as_posix())
+
+
+def _ignore_package(package):
+    for ignore in PYTHON_LIB_IGNORE_PACKAGES:
+        if ignore in package:
+            return True
+    return False
+
+
+def _ignore_module(module):
+    return module in PYTHON_LIB_IGNORE_MODULES
+
+
+def _ignore_class(module, class_):
+    return f'{module}.{class_}' in PYTHON_LIB_IGNORE_CLASSES
+
+def _get_module_list(*paths):
     # the loop is quite slow but it doesn't matter for this script
-    filenames = list()
-    python_lib_directory_len = len(PYTHON_LIB_DIRECTORY)
-    for base, dirs, files in os.walk(path):
-        package = base[(python_lib_directory_len + 1):]
-        if ignore_package(package):
-            continue
-        for filename in files:
-            module_name = os.path.join(package, filename)
-            if module_name in PYTHON_LIB_IGNORE_MODULES:
+    modules = []
+    for path in paths:
+        for module_filename in path.rglob('*.py'):
+            module_name = module_filename.stem
+            package_path = module_filename.relative_to(path)
+            package = '.'.join(package_path.parent.parts)
+            # construct full module path (e.g. xml.sax.xmlreader)
+            if module_name == '__init__':
+                module_path = package
+            elif package:
+                module_path = f'{package}.{module_name}'
+            else:
+                module_path = module_name
+
+            # ignore unwanted modules and packages
+            if _ignore_package(package):
+                continue
+            if _ignore_module(module_path):
                 continue
-            if filename.endswith('.py'):
-                module_filename = os.path.join(base, filename)
-                filenames.append(module_filename)
-    return filenames
+
+            modules.append((module_path, module_filename))
+
+    # sort module list for nicer output
+    return sorted(modules)
-#----------------------------------------------------------------------
 def main():
-    # process files given on command line
+    _setup_global_package_ignore_list()
+    # process modules given on command line
     args = sys.argv[1:]
-    if not args:
-        args = get_module_filenames(PYTHON_LIB_DIRECTORY)
+    if args:
+        modules = _get_module_list(*args)
+    else:
+        modules = _get_module_list(PYTHON_LIB_DIRECTORY)
     parser = Parser()
     parser.add_builtins()
-    for filename in args:
+    for module_path, module_filename in modules:
         try:
-            parser.process_file(filename)
-        except (SystemExit, ImportError, TypeError):
-            continue
+            parser.process_module(module_path, module_filename)
+        except Exception as exc:
+            print(f'{exc.__class__.__name__} in {module_path}: {exc}')
+            raise
-    parser.write_to_file(tag_filename)
+    parser.write_to_file(TAG_FILENAME)
 if __name__ == '__main__':
     main()
-
--------------
This E-Mail was brought to you by github_commit_mail.py (Source: https://github.com/geany/infrastructure).