Lift some code out of parse_identifiers

Make parse_identifiers less complex. Pylint was complaining that it had too many local variables, and it had a point.

* Lift the constants identifier_regex and exclusion_lines to class constants (renamed to uppercase because they're constants).
* Lift the per-file loop into a new function parse_identifiers_in_file.

No intended behavior change.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
2025-04-01 04:20:45 +00:00 · 2021-11-16 20:56:47 +01:00 · 2021-11-16 20:56:47 +01:00 · 152de23518
commit 152de23518
parent c8794202e6
1 changed files with 100 additions and 88 deletions
--- a/tests/scripts/check_names.py
+++ b/tests/scripts/check_names.py
@ -457,6 +457,105 @@ class CodeParser():
        return enum_consts
    # Compiled pattern that parse_identifiers_in_file applies with search()
    # to each candidate declaration line. It is an alternation of four
    # sub-patterns, each of which has exactly one capturing group holding the
    # identifier name. The groups belonging to alternatives that did not
    # match are None, so the caller picks whichever group is non-empty.
    IDENTIFIER_REGEX = re.compile(
        # Match " something(a" or " *something(a". Functions.
        # Assumptions:
        # - function definition from return type to one of its arguments is
        #   all on one line
        # - function definition line only contains alphanumeric, asterisk,
        #   underscore, and open bracket
        r".* \**(\w+) *\( *\w|"
        # Match "(*something)(". Function pointers.
        r".*\( *\* *(\w+) *\) *\(|"
        # Match names of named data structures: an optional "typedef", then
        # struct/union/enum, the name, and an optional "{" ending the line.
        r"(?:typedef +)?(?:struct|union|enum) +(\w+)(?: *{)?$|"
        # Match names of typedef instances, after closing bracket: a word
        # immediately followed by ";" or "[".
        r"}? *(\w+)[;[].*"
    )
    # Compiled pattern for lines that parse_identifiers_in_file skips
    # entirely. The pattern is anchored at the start of the line and excludes,
    # in order:
    # - extern "C" lines
    # - struct/union/enum openers that carry no name (optionally preceded by
    #   "typedef" and optionally followed by "{")
    # - lines consisting of a closing brace with an optional semicolon
    # - empty lines
    # - lines starting with a "//" comment
    # - lines starting with "#" (preprocessor directives)
    # The regex below is indented for clarity.
    EXCLUSION_LINES = re.compile(
        r"^("
            r"extern +\"C\"|" # pylint: disable=bad-continuation
            r"(typedef +)?(struct|union|enum)( *{)?$|"
            r"} *;?$|"
            r"$|"
            r"//|"
            r"#"
        r")"
    )
    def parse_identifiers_in_file(self, header_file, identifiers):
        """
        Parse all lines of a header where a function/enum/struct/union/typedef
        identifier is declared, based on some regex and heuristics. Highly
        dependent on formatting style.

        Comments and string literals are stripped before a line is examined.
        A block comment may start anywhere on a line and may span any number
        of lines; everything up to the closing ``*/`` is ignored.

        Append found matches to the list ``identifiers``.

        Args:
        * header_file: path of the file to parse; it is opened as UTF-8 text.
        * identifiers: list that receives one Match object per identifier
          found. It is modified in place; nothing is returned.
        """
        with open(header_file, "r", encoding="utf-8") as header:
            in_block_comment = False
            # The previous line variable is used for concatenating lines
            # when identifiers are formatted and spread across multiple
            # lines.
            previous_line = ""

            for line_no, line in enumerate(header):
                # Terminate current multiline comment?
                if in_block_comment:
                    m = re.search(r"\*/", line)
                    if m:
                        in_block_comment = False
                        line = line[m.end(0):]
                    else:
                        # The comment does not end on this line, so none of
                        # it is code. An empty line is then dropped by
                        # EXCLUSION_LINES below, which also resets
                        # previous_line.
                        line = ""

                # Remove full comments and string literals. They are handled
                # in a single substitution so that a "/*" inside a string
                # literal is not mistaken for the start of a comment.
                line = re.sub(r'/\*.*?\*/|(")(?:[^\\\"]|\\.)*"',
                              lambda s: '""' if s.group(1) else ' ',
                              line)

                # Start an unfinished comment? Any "/*" that was part of a
                # complete comment has already been removed. Look for
                # whichever of "//" and "/*" comes first: a "/*" located
                # inside a line comment does not open a block comment.
                m = re.search(r"//|/\*", line)
                if m and m.group(0) == "/*":
                    in_block_comment = True
                    line = line[:m.start(0)]

                if self.EXCLUSION_LINES.search(line):
                    previous_line = ""
                    continue

                # If the line contains only space-separated alphanumeric
                # characters (or underscore, asterisk, or, open bracket),
                # and nothing else, high chance it's a declaration that
                # continues on the next line
                if re.search(r"^([\w\*\(]+\s+)+$", line):
                    previous_line += line
                    continue

                # If previous line seemed to start an unfinished declaration
                # (as above), concat and treat them as one.
                if previous_line:
                    line = previous_line.strip() + " " + line.strip() + "\n"
                    previous_line = ""

                # Skip parsing if line has a space in front = heuristic to
                # skip function argument lines (highly subject to formatting
                # changes)
                if line.startswith(" "):
                    continue

                identifier = self.IDENTIFIER_REGEX.search(line)
                if not identifier:
                    continue

                # Find the group that matched, and append it
                for group in identifier.groups():
                    if not group:
                        continue

                    identifiers.append(Match(
                        header_file,
                        line,
                        line_no,
                        identifier.span(),
                        group))
    def parse_identifiers(self, include, exclude=None):
        """
        Parse all lines of a header where a function/enum/struct/union/typedef
@ -469,100 +568,13 @@ class CodeParser():
        Returns a List of Match objects with identifiers.
        """
        identifier_regex = re.compile(
            # Match " something(a" or " *something(a". Functions.
            # Assumptions:
            # - function definition from return type to one of its arguments is
            #   all on one line
            # - function definition line only contains alphanumeric, asterisk,
            #   underscore, and open bracket
            r".* \**(\w+) *\( *\w|"
            # Match "(*something)(".
            r".*\( *\* *(\w+) *\) *\(|"
            # Match names of named data structures.
            r"(?:typedef +)?(?:struct|union|enum) +(\w+)(?: *{)?$|"
            # Match names of typedef instances, after closing bracket.
            r"}? *(\w+)[;[].*"
        )
        # The regex below is indented for clarity.
        exclusion_lines = re.compile(
            r"^("
                r"extern +\"C\"|" # pylint: disable=bad-continuation
                r"(typedef +)?(struct|union|enum)( *{)?$|"
                r"} *;?$|"
                r"$|"
                r"//|"
                r"#"
            r")"
        )
        files = self.get_files(include, exclude)
        self.log.debug("Looking for identifiers in {} files".format(len(files)))
        identifiers = []
        for header_file in files:
-            with open(header_file, "r", encoding="utf-8") as header:
+            self.parse_identifiers_in_file(header_file, identifiers)
                in_block_comment = False
                # The previous line variable is used for concatenating lines
                # when identifiers are formatted and spread across multiple
                # lines.
                previous_line = ""
                for line_no, line in enumerate(header):
                    # Terminate current comment?
                    if in_block_comment:
                        line = re.sub(r".*?\*/", r"", line, 1)
                        in_block_comment = False
                    # Remove full comments and string literals
                    line = re.sub(r'/\*.*?\*/|(")(?:[^\\\"]|\\.)*"',
                                  lambda s: '""' if s.group(1) else ' ',
                                  line)
                    # Start an unfinished comment?
                    m = re.match(r"/\*", line)
                    if m:
                        in_block_comment = True
                        line = line[:m.end(0)]
                    if exclusion_lines.search(line):
                        previous_line = ""
                        continue
                    # If the line contains only space-separated alphanumeric
                    # characters (or underscore, asterisk, or, open bracket),
                    # and nothing else, high chance it's a declaration that
                    # continues on the next line
                    if re.search(r"^([\w\*\(]+\s+)+$", line):
                        previous_line += line
                        continue
                    # If previous line seemed to start an unfinished declaration
                    # (as above), concat and treat them as one.
                    if previous_line:
                        line = previous_line.strip() + " " + line.strip() + "\n"
                        previous_line = ""
                    # Skip parsing if line has a space in front = heuristic to
                    # skip function argument lines (highly subject to formatting
                    # changes)
                    if line[0] == " ":
                        continue
                    identifier = identifier_regex.search(line)
                    if not identifier:
                        continue
                    # Find the group that matched, and append it
                    for group in identifier.groups():
                        if not group:
                            continue
                        identifiers.append(Match(
                            header_file,
                            line,
                            line_no,
                            identifier.span(),
                            group))
        return identifiers