From d11bb47fe08fe633826ff35c64319cd2ce65fcc8 Mon Sep 17 00:00:00 2001
From: Gilles Peskine <Gilles.Peskine@arm.com>
Date: Thu, 5 Jan 2023 20:28:57 +0100
Subject: [PATCH] Reject invalid UTF-8 and weird characters in text files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reject "weird" characters in text files, especially control characters that
might be escape sequences or that might cause other text to appear garbled
(as in https://trojansource.codes/).

Also reject byte sequences that aren't valid UTF-8.

Accept only ASCII (except most control characters), letters, some non-ASCII
punctuation and some mathematical and technical symbols. This covers
everything that's currently present in Mbed TLS ( §áèéëñóöüłŽ–—’“”…≥).

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
---
 tests/scripts/check_files.py | 40 ++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tests/scripts/check_files.py b/tests/scripts/check_files.py
index 03400ab92d..d20ec2e9ec 100755
--- a/tests/scripts/check_files.py
+++ b/tests/scripts/check_files.py
@@ -263,6 +263,45 @@ class Utf8BomIssueTracker(FileIssueTracker):
                 self.files_with_issues[filepath] = None
 
 
+class UnicodeIssueTracker(LineIssueTracker):
+    """Track lines with invalid characters or invalid text encoding."""
+
+    heading = "Invalid UTF-8 or forbidden character:"
+
+    # Only allow valid UTF-8, and only white-listed characters.
+    # We deliberately exclude all characters that aren't a simple non-blank,
+    # non-zero-width glyph, apart from a very small set (tab, ordinary space,
+    # line breaks, "basic" no-break space and soft hyphen). In particular,
+    # non-ASCII control characters, combinig characters, and Unicode state
+    # changes (e.g. right-to-left text) are forbidden.
+    # Note that we do allow some characters with a risk of visual confusion,
+    # for example '-' (U+002D HYPHEN-MINUS) vs '­' (U+00AD SOFT HYPHEN) vs
+    # '‐' (U+2010 HYPHEN), or 'A' (U+0041 LATIN CAPITAL LETTER A) vs
+    # 'Α' (U+0391 GREEK CAPITAL LETTER ALPHA).
+    GOOD_CHARACTERS = ''.join([
+        '\t\n\r -~', # ASCII (tabs and line endings are checked separately)
+        '\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
+        '\u2010-\u2027\u2030-\u205E', # General Punctuation (printable)
+        '\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts
+        '\u2190-\u21FF', # Arrows
+        '\u2200-\u22FF', # Mathematical Symbols
+    ])
+    # Allow any of the characters and ranges above, and anything classified
+    # as a word constituent.
+    GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS))
+
+    def issue_with_line(self, line, _filepath, line_number):
+        try:
+            text = line.decode('utf-8')
+        except UnicodeDecodeError:
+            return True
+        if line_number == 1 and text.startswith('\uFEFF'):
+            # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning.
+            # Which files are allowed to have a BOM is handled in
+            # Utf8BomIssueTracker.
+            text = text[1:]
+        return not self.GOOD_CHARACTERS_RE.match(text)
+
 class UnixLineEndingIssueTracker(LineIssueTracker):
     """Track files with non-Unix line endings (i.e. files with CR)."""
 
@@ -350,6 +389,7 @@ class IntegrityChecker:
             ShebangIssueTracker(),
             EndOfFileNewlineIssueTracker(),
             Utf8BomIssueTracker(),
+            UnicodeIssueTracker(),
             UnixLineEndingIssueTracker(),
             WindowsLineEndingIssueTracker(),
             TrailingWhitespaceIssueTracker(),