From 0ed9e78bf7541959aa0cd90715cb49543bb174e8 Mon Sep 17 00:00:00 2001 From: Gilles Peskine Date: Thu, 5 Jan 2023 20:27:18 +0100 Subject: [PATCH 1/3] Treat more *.bin files as binary Signed-off-by: Gilles Peskine --- tests/scripts/check_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/scripts/check_files.py b/tests/scripts/check_files.py index 42f2e82c99..e1c6478fd9 100755 --- a/tests/scripts/check_files.py +++ b/tests/scripts/check_files.py @@ -122,7 +122,7 @@ BINARY_FILE_PATH_RE_LIST = [ r'tests/data_files/.*\.req\.[^/]+\Z', r'tests/data_files/.*malformed[^/]+\Z', r'tests/data_files/format_pkcs12\.fmt\Z', - r'tests/data_files/pkcs7_data.*\.bin\Z', + r'tests/data_files/.*\.bin\Z', ] BINARY_FILE_PATH_RE = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST)) From b389743ace411d580a68795767ddaa7a3d5d1fcd Mon Sep 17 00:00:00 2001 From: Gilles Peskine Date: Thu, 5 Jan 2023 20:28:30 +0100 Subject: [PATCH 2/3] Pass line number to issue_with_line Signed-off-by: Gilles Peskine --- tests/scripts/check_files.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/scripts/check_files.py b/tests/scripts/check_files.py index e1c6478fd9..03400ab92d 100755 --- a/tests/scripts/check_files.py +++ b/tests/scripts/check_files.py @@ -136,7 +136,7 @@ class LineIssueTracker(FileIssueTracker): # Exclude binary files. path_exemptions = BINARY_FILE_PATH_RE - def issue_with_line(self, line, filepath): + def issue_with_line(self, line, filepath, line_number): """Check the specified line for the issue that this class is for. Subclasses must implement this method. 
@@ -144,7 +144,7 @@ class LineIssueTracker(FileIssueTracker): raise NotImplementedError def check_file_line(self, filepath, line, line_number): - if self.issue_with_line(line, filepath): + if self.issue_with_line(line, filepath, line_number): self.record_issue(filepath, line_number) def check_file_for_issue(self, filepath): @@ -273,7 +273,7 @@ class UnixLineEndingIssueTracker(LineIssueTracker): return False return not is_windows_file(filepath) - def issue_with_line(self, line, _filepath): + def issue_with_line(self, line, _filepath, _line_number): return b"\r" in line @@ -287,7 +287,7 @@ class WindowsLineEndingIssueTracker(LineIssueTracker): return False return is_windows_file(filepath) - def issue_with_line(self, line, _filepath): + def issue_with_line(self, line, _filepath, _line_number): return not line.endswith(b"\r\n") or b"\r" in line[:-2] @@ -297,7 +297,7 @@ class TrailingWhitespaceIssueTracker(LineIssueTracker): heading = "Trailing whitespace:" suffix_exemptions = frozenset([".dsp", ".md"]) - def issue_with_line(self, line, _filepath): + def issue_with_line(self, line, _filepath, _line_number): return line.rstrip(b"\r\n") != line.rstrip() @@ -313,7 +313,7 @@ class TabIssueTracker(LineIssueTracker): "/generate_visualc_files.pl", ]) - def issue_with_line(self, line, _filepath): + def issue_with_line(self, line, _filepath, _line_number): return b"\t" in line @@ -323,7 +323,7 @@ class MergeArtifactIssueTracker(LineIssueTracker): heading = "Merge artifact:" - def issue_with_line(self, line, _filepath): + def issue_with_line(self, line, _filepath, _line_number): # Detect leftover git conflict markers. 
if line.startswith(b'<<<<<<< ') or line.startswith(b'>>>>>>> '): return True From d11bb47fe08fe633826ff35c64319cd2ce65fcc8 Mon Sep 17 00:00:00 2001 From: Gilles Peskine Date: Thu, 5 Jan 2023 20:28:57 +0100 Subject: [PATCH 3/3] Reject invalid UTF-8 and weird characters in text files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reject "weird" characters in text files, especially control characters that might be escape sequences or that might cause other text to appear garbled (as in https://trojansource.codes/). Also reject byte sequences that aren't valid UTF-8. Accept only ASCII (except most control characters), letters, some non-ASCII punctuation and some mathematical and technical symbols. This covers everything that's currently present in Mbed TLS ( §áèéëñóöüłŽ–—’“”…≥). Signed-off-by: Gilles Peskine --- tests/scripts/check_files.py | 40 ++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/scripts/check_files.py b/tests/scripts/check_files.py index 03400ab92d..d20ec2e9ec 100755 --- a/tests/scripts/check_files.py +++ b/tests/scripts/check_files.py @@ -263,6 +263,45 @@ class Utf8BomIssueTracker(FileIssueTracker): self.files_with_issues[filepath] = None +class UnicodeIssueTracker(LineIssueTracker): + """Track lines with invalid characters or invalid text encoding.""" + + heading = "Invalid UTF-8 or forbidden character:" + + # Only allow valid UTF-8, and only white-listed characters. + # We deliberately exclude all characters that aren't a simple non-blank, + # non-zero-width glyph, apart from a very small set (tab, ordinary space, + # line breaks, "basic" no-break space and soft hyphen). In particular, + # non-ASCII control characters, combining characters, and Unicode state + # changes (e.g. right-to-left text) are forbidden. 
+ # Note that we do allow some characters with a risk of visual confusion, + # for example '-' (U+002D HYPHEN-MINUS) vs '­' (U+00AD SOFT HYPHEN) vs + # '‐' (U+2010 HYPHEN), or 'A' (U+0041 LATIN CAPITAL LETTER A) vs + # 'Α' (U+0391 GREEK CAPITAL LETTER ALPHA). + GOOD_CHARACTERS = ''.join([ + '\t\n\r -~', # ASCII (tabs and line endings are checked separately) + '\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation) + '\u2010-\u2027\u2030-\u205E', # General Punctuation (printable) + '\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts + '\u2190-\u21FF', # Arrows + '\u2200-\u22FF', # Mathematical Symbols + ]) + # Allow any of the characters and ranges above, and anything classified + # as a word constituent. + GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS)) + + def issue_with_line(self, line, _filepath, line_number): + try: + text = line.decode('utf-8') + except UnicodeDecodeError: + return True + if line_number == 1 and text.startswith('\uFEFF'): + # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning. + # Which files are allowed to have a BOM is handled in + # Utf8BomIssueTracker. + text = text[1:] + return not self.GOOD_CHARACTERS_RE.match(text) + class UnixLineEndingIssueTracker(LineIssueTracker): """Track files with non-Unix line endings (i.e. files with CR).""" @@ -350,6 +389,7 @@ class IntegrityChecker: ShebangIssueTracker(), EndOfFileNewlineIssueTracker(), Utf8BomIssueTracker(), + UnicodeIssueTracker(), UnixLineEndingIssueTracker(), WindowsLineEndingIssueTracker(), TrailingWhitespaceIssueTracker(),