Merge pull request #6884 from gilles-peskine-arm/check-files-unicode

Reject bad characters in source code
This commit is contained in:
Gilles Peskine 2023-01-11 13:46:59 +01:00 committed by GitHub
commit c55c343670
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -122,7 +122,7 @@ BINARY_FILE_PATH_RE_LIST = [
r'tests/data_files/.*\.req\.[^/]+\Z', r'tests/data_files/.*\.req\.[^/]+\Z',
r'tests/data_files/.*malformed[^/]+\Z', r'tests/data_files/.*malformed[^/]+\Z',
r'tests/data_files/format_pkcs12\.fmt\Z', r'tests/data_files/format_pkcs12\.fmt\Z',
r'tests/data_files/pkcs7_data.*\.bin\Z', r'tests/data_files/.*\.bin\Z',
] ]
BINARY_FILE_PATH_RE = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST)) BINARY_FILE_PATH_RE = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST))
@ -136,7 +136,7 @@ class LineIssueTracker(FileIssueTracker):
# Exclude binary files. # Exclude binary files.
path_exemptions = BINARY_FILE_PATH_RE path_exemptions = BINARY_FILE_PATH_RE
def issue_with_line(self, line, filepath): def issue_with_line(self, line, filepath, line_number):
"""Check the specified line for the issue that this class is for. """Check the specified line for the issue that this class is for.
Subclasses must implement this method. Subclasses must implement this method.
@ -144,7 +144,7 @@ class LineIssueTracker(FileIssueTracker):
raise NotImplementedError raise NotImplementedError
def check_file_line(self, filepath, line, line_number): def check_file_line(self, filepath, line, line_number):
if self.issue_with_line(line, filepath): if self.issue_with_line(line, filepath, line_number):
self.record_issue(filepath, line_number) self.record_issue(filepath, line_number)
def check_file_for_issue(self, filepath): def check_file_for_issue(self, filepath):
@ -263,6 +263,45 @@ class Utf8BomIssueTracker(FileIssueTracker):
self.files_with_issues[filepath] = None self.files_with_issues[filepath] = None
class UnicodeIssueTracker(LineIssueTracker):
    """Track lines with invalid characters or invalid text encoding.

    A line is reported when it cannot be decoded as UTF-8, or when it
    contains at least one character that is not matched by
    GOOD_CHARACTERS_RE.
    """

    heading = "Invalid UTF-8 or forbidden character:"

    # Only allow valid UTF-8, and only white-listed characters.
    # We deliberately exclude all characters that aren't a simple non-blank,
    # non-zero-width glyph, apart from a very small set (tab, ordinary space,
    # line breaks, "basic" no-break space and soft hyphen). In particular,
    # non-ASCII control characters, combining characters, and Unicode state
    # changes (e.g. right-to-left text) are forbidden.
    # Note that we do allow some characters with a risk of visual confusion,
    # for example '-' (U+002D HYPHEN-MINUS) vs U+00AD SOFT HYPHEN vs
    # '‐' (U+2010 HYPHEN), or 'A' (U+0041 LATIN CAPITAL LETTER A) vs
    # 'Α' (U+0391 GREEK CAPITAL LETTER ALPHA).
    GOOD_CHARACTERS = ''.join([
        '\t\n\r -~', # ASCII (tabs and line endings are checked separately)
        '\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
        '\u2010-\u2027\u2030-\u205E', # General Punctuation (printable)
        '\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts
        '\u2190-\u21FF', # Arrows
        '\u2200-\u22FF', # Mathematical Symbols
    ])
    # Allow any of the characters and ranges above, and anything classified
    # as a word constituent.
    # The quantifier is `*` rather than `+` so that empty text (for example
    # a first line that consists solely of a BOM) is accepted: it contains
    # no forbidden character. The trailing \Z forces the whole text to be
    # made of allowed characters.
    GOOD_CHARACTERS_RE = re.compile(r'[\w{}]*\Z'.format(GOOD_CHARACTERS))

    def issue_with_line(self, line, _filepath, line_number):
        """Tell whether the given line has an encoding or character issue.

        line is the raw content of the line as a bytes object.
        line_number is compared against 1 to recognize the first line,
        where a leading BOM is tolerated.

        Return True if line is not valid UTF-8 or contains a character
        outside the allowed set, False otherwise.
        """
        try:
            text = line.decode('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8 at all.
            return True
        if line_number == 1 and text.startswith('\uFEFF'):
            # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning.
            # Which files are allowed to have a BOM is handled in
            # Utf8BomIssueTracker.
            text = text[1:]
        return self.GOOD_CHARACTERS_RE.match(text) is None
class UnixLineEndingIssueTracker(LineIssueTracker): class UnixLineEndingIssueTracker(LineIssueTracker):
"""Track files with non-Unix line endings (i.e. files with CR).""" """Track files with non-Unix line endings (i.e. files with CR)."""
@ -273,7 +312,7 @@ class UnixLineEndingIssueTracker(LineIssueTracker):
return False return False
return not is_windows_file(filepath) return not is_windows_file(filepath)
def issue_with_line(self, line, _filepath): def issue_with_line(self, line, _filepath, _line_number):
return b"\r" in line return b"\r" in line
@ -287,7 +326,7 @@ class WindowsLineEndingIssueTracker(LineIssueTracker):
return False return False
return is_windows_file(filepath) return is_windows_file(filepath)
def issue_with_line(self, line, _filepath): def issue_with_line(self, line, _filepath, _line_number):
return not line.endswith(b"\r\n") or b"\r" in line[:-2] return not line.endswith(b"\r\n") or b"\r" in line[:-2]
@ -297,7 +336,7 @@ class TrailingWhitespaceIssueTracker(LineIssueTracker):
heading = "Trailing whitespace:" heading = "Trailing whitespace:"
suffix_exemptions = frozenset([".dsp", ".md"]) suffix_exemptions = frozenset([".dsp", ".md"])
def issue_with_line(self, line, _filepath): def issue_with_line(self, line, _filepath, _line_number):
return line.rstrip(b"\r\n") != line.rstrip() return line.rstrip(b"\r\n") != line.rstrip()
@ -313,7 +352,7 @@ class TabIssueTracker(LineIssueTracker):
"/generate_visualc_files.pl", "/generate_visualc_files.pl",
]) ])
def issue_with_line(self, line, _filepath): def issue_with_line(self, line, _filepath, _line_number):
return b"\t" in line return b"\t" in line
@ -323,7 +362,7 @@ class MergeArtifactIssueTracker(LineIssueTracker):
heading = "Merge artifact:" heading = "Merge artifact:"
def issue_with_line(self, line, _filepath): def issue_with_line(self, line, _filepath, _line_number):
# Detect leftover git conflict markers. # Detect leftover git conflict markers.
if line.startswith(b'<<<<<<< ') or line.startswith(b'>>>>>>> '): if line.startswith(b'<<<<<<< ') or line.startswith(b'>>>>>>> '):
return True return True
@ -350,6 +389,7 @@ class IntegrityChecker:
ShebangIssueTracker(), ShebangIssueTracker(),
EndOfFileNewlineIssueTracker(), EndOfFileNewlineIssueTracker(),
Utf8BomIssueTracker(), Utf8BomIssueTracker(),
UnicodeIssueTracker(),
UnixLineEndingIssueTracker(), UnixLineEndingIssueTracker(),
WindowsLineEndingIssueTracker(), WindowsLineEndingIssueTracker(),
TrailingWhitespaceIssueTracker(), TrailingWhitespaceIssueTracker(),