From d11bb47fe08fe633826ff35c64319cd2ce65fcc8 Mon Sep 17 00:00:00 2001 From: Gilles Peskine Date: Thu, 5 Jan 2023 20:28:57 +0100 Subject: [PATCH] Reject invalid UTF-8 and weird characters in text files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reject "weird" characters in text files, especially control characters that might be escape sequences or that might cause other text to appear garbled (as in https://trojansource.codes/). Also reject byte sequences that aren't valid UTF-8. Accept only ASCII (except most control characters), letters, some non-ASCII punctuation and some mathematical and technical symbols. This covers everything that's currently present in Mbed TLS ( §áèéëñóöüłŽ–—’“”…≥). Signed-off-by: Gilles Peskine --- tests/scripts/check_files.py | 40 ++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/scripts/check_files.py b/tests/scripts/check_files.py index 03400ab92d..d20ec2e9ec 100755 --- a/tests/scripts/check_files.py +++ b/tests/scripts/check_files.py @@ -263,6 +263,45 @@ class Utf8BomIssueTracker(FileIssueTracker): self.files_with_issues[filepath] = None +class UnicodeIssueTracker(LineIssueTracker): + """Track lines with invalid characters or invalid text encoding.""" + + heading = "Invalid UTF-8 or forbidden character:" + + # Only allow valid UTF-8, and only white-listed characters. + # We deliberately exclude all characters that aren't a simple non-blank, + # non-zero-width glyph, apart from a very small set (tab, ordinary space, + # line breaks, "basic" no-break space and soft hyphen). In particular, + # non-ASCII control characters, combining characters, and Unicode state + # changes (e.g. right-to-left text) are forbidden. 
+ # Note that we do allow some characters with a risk of visual confusion, + # for example '-' (U+002D HYPHEN-MINUS) vs '­' (U+00AD SOFT HYPHEN) vs + # '‐' (U+2010 HYPHEN), or 'A' (U+0041 LATIN CAPITAL LETTER A) vs + # 'Α' (U+0391 GREEK CAPITAL LETTER ALPHA). + GOOD_CHARACTERS = ''.join([ + '\t\n\r -~', # ASCII (tabs and line endings are checked separately) + '\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation) + '\u2010-\u2027\u2030-\u205E', # General Punctuation (printable) + '\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts + '\u2190-\u21FF', # Arrows + '\u2200-\u22FF', # Mathematical Symbols + ]) + # Allow any of the characters and ranges above, and anything classified + # as a word constituent. + GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS)) + + def issue_with_line(self, line, _filepath, line_number): + try: + text = line.decode('utf-8') + except UnicodeDecodeError: + return True + if line_number == 1 and text.startswith('\uFEFF'): + # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning. + # Which files are allowed to have a BOM is handled in + # Utf8BomIssueTracker. + text = text[1:] + return not self.GOOD_CHARACTERS_RE.match(text) + class UnixLineEndingIssueTracker(LineIssueTracker): """Track files with non-Unix line endings (i.e. files with CR).""" @@ -350,6 +389,7 @@ class IntegrityChecker: ShebangIssueTracker(), EndOfFileNewlineIssueTracker(), Utf8BomIssueTracker(), + UnicodeIssueTracker(), UnixLineEndingIssueTracker(), WindowsLineEndingIssueTracker(), TrailingWhitespaceIssueTracker(),