diff --git a/tests/scripts/check_files.py b/tests/scripts/check_files.py index 03400ab92d..d20ec2e9ec 100755 --- a/tests/scripts/check_files.py +++ b/tests/scripts/check_files.py @@ -263,6 +263,45 @@ class Utf8BomIssueTracker(FileIssueTracker): self.files_with_issues[filepath] = None +class UnicodeIssueTracker(LineIssueTracker): + """Track lines with invalid characters or invalid text encoding.""" + + heading = "Invalid UTF-8 or forbidden character:" + + # Only allow valid UTF-8, and only white-listed characters. + # We deliberately exclude all characters that aren't a simple non-blank, + # non-zero-width glyph, apart from a very small set (tab, ordinary space, + # line breaks, "basic" no-break space and soft hyphen). In particular, + # non-ASCII control characters, combinig characters, and Unicode state + # changes (e.g. right-to-left text) are forbidden. + # Note that we do allow some characters with a risk of visual confusion, + # for example '-' (U+002D HYPHEN-MINUS) vs '' (U+00AD SOFT HYPHEN) vs + # '‐' (U+2010 HYPHEN), or 'A' (U+0041 LATIN CAPITAL LETTER A) vs + # 'Α' (U+0391 GREEK CAPITAL LETTER ALPHA). + GOOD_CHARACTERS = ''.join([ + '\t\n\r -~', # ASCII (tabs and line endings are checked separately) + '\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation) + '\u2010-\u2027\u2030-\u205E', # General Punctuation (printable) + '\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts + '\u2190-\u21FF', # Arrows + '\u2200-\u22FF', # Mathematical Symbols + ]) + # Allow any of the characters and ranges above, and anything classified + # as a word constituent. + GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS)) + + def issue_with_line(self, line, _filepath, line_number): + try: + text = line.decode('utf-8') + except UnicodeDecodeError: + return True + if line_number == 1 and text.startswith('\uFEFF'): + # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning. + # Which files are allowed to have a BOM is handled in + # Utf8BomIssueTracker. + text = text[1:] + return not self.GOOD_CHARACTERS_RE.match(text) + class UnixLineEndingIssueTracker(LineIssueTracker): """Track files with non-Unix line endings (i.e. files with CR).""" @@ -350,6 +389,7 @@ class IntegrityChecker: ShebangIssueTracker(), EndOfFileNewlineIssueTracker(), Utf8BomIssueTracker(), + UnicodeIssueTracker(), UnixLineEndingIssueTracker(), WindowsLineEndingIssueTracker(), TrailingWhitespaceIssueTracker(),