fmt/support/printable.py

202 lines
6.0 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
2021-08-22 15:15:34 +00:00
# This script is based on
# https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py
# distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT.
2021-08-22 15:15:34 +00:00
# This script uses the following Unicode tables:
# - UnicodeData.txt
from collections import namedtuple
import csv
import os
import subprocess
NUM_CODEPOINTS=0x110000
def to_ranges(iter):
current = None
for i in iter:
if current is None or i != current[1] or i in (0x10000, 0x20000):
if current is not None:
yield tuple(current)
current = [i, i + 1]
else:
current[1] += 1
if current is not None:
yield tuple(current)
def get_escaped(codepoints):
for c in codepoints:
if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '):
yield c.value
def get_file(f):
try:
return open(os.path.basename(f))
except FileNotFoundError:
subprocess.run(["curl", "-O", f], check=True)
return open(os.path.basename(f))
Codepoint = namedtuple('Codepoint', 'value class_')
def get_codepoints(f):
r = csv.reader(f, delimiter=";")
prev_codepoint = 0
class_first = None
for row in r:
codepoint = int(row[0], 16)
name = row[1]
class_ = row[2]
if class_first is not None:
if not name.endswith("Last>"):
raise ValueError("Missing Last after First")
for c in range(prev_codepoint + 1, codepoint):
yield Codepoint(c, class_first)
class_first = None
if name.endswith("First>"):
class_first = class_
yield Codepoint(codepoint, class_)
prev_codepoint = codepoint
if class_first is not None:
raise ValueError("Missing Last after First")
for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
yield Codepoint(c, None)
def compress_singletons(singletons):
uppers = [] # (upper, # items in lowers)
lowers = []
for i in singletons:
upper = i >> 8
lower = i & 0xff
if len(uppers) == 0 or uppers[-1][0] != upper:
uppers.append((upper, 1))
else:
upper, count = uppers[-1]
uppers[-1] = upper, count + 1
lowers.append(lower)
return uppers, lowers
def compress_normal(normal):
# lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
# lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]
prev_start = 0
for start, count in normal:
truelen = start - prev_start
falselen = count
prev_start = start + count
assert truelen < 0x8000 and falselen < 0x8000
entry = []
if truelen > 0x7f:
entry.append(0x80 | (truelen >> 8))
entry.append(truelen & 0xff)
else:
entry.append(truelen & 0x7f)
if falselen > 0x7f:
entry.append(0x80 | (falselen >> 8))
entry.append(falselen & 0xff)
else:
entry.append(falselen & 0x7f)
compressed.append(entry)
return compressed
def print_singletons(uppers, lowers, uppersname, lowersname):
print(" static constexpr singleton {}[] = {{".format(uppersname))
2021-08-22 15:15:34 +00:00
for u, c in uppers:
print(" {{{:#04x}, {}}},".format(u, c))
print(" };")
print(" static constexpr unsigned char {}[] = {{".format(lowersname))
2021-08-22 15:15:34 +00:00
for i in range(0, len(lowers), 8):
print(" {}".format(" ".join("{:#04x},".format(l) for l in lowers[i:i+8])))
print(" };")
2021-08-22 15:15:34 +00:00
def print_normal(normal, normalname):
print(" static constexpr unsigned char {}[] = {{".format(normalname))
2021-08-22 15:15:34 +00:00
for v in normal:
print(" {}".format(" ".join("{:#04x},".format(i) for i in v)))
print(" };")
2021-08-22 15:15:34 +00:00
def main():
file = get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
codepoints = get_codepoints(file)
CUTOFF=0x10000
singletons0 = []
singletons1 = []
normal0 = []
normal1 = []
extra = []
for a, b in to_ranges(get_escaped(codepoints)):
if a > 2 * CUTOFF:
extra.append((a, b - a))
elif a == b - 1:
if a & CUTOFF:
singletons1.append(a & ~CUTOFF)
else:
singletons0.append(a)
elif a == b - 2:
if a & CUTOFF:
singletons1.append(a & ~CUTOFF)
singletons1.append((a + 1) & ~CUTOFF)
else:
singletons0.append(a)
singletons0.append(a + 1)
else:
if a >= 2 * CUTOFF:
extra.append((a, b - a))
elif a & CUTOFF:
normal1.append((a & ~CUTOFF, b - a))
else:
normal0.append((a, b - a))
singletons0u, singletons0l = compress_singletons(singletons0)
singletons1u, singletons1l = compress_singletons(singletons1)
normal0 = compress_normal(normal0)
normal1 = compress_normal(normal1)
print("""\
FMT_FUNC auto is_printable(uint32_t cp) -> bool {\
""")
2021-08-22 22:14:13 +00:00
print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower')
print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower')
print_normal(normal0, 'normal0')
print_normal(normal1, 'normal1')
print("""\
auto lower = static_cast<uint16_t>(cp);
if (cp < 0x10000) {
2021-08-22 22:14:13 +00:00
return is_printable(lower, singletons0,
sizeof(singletons0) / sizeof(*singletons0),
2021-08-22 20:23:38 +00:00
singletons0_lower, normal0, sizeof(normal0));
}
if (cp < 0x20000) {
2021-08-22 22:14:13 +00:00
return is_printable(lower, singletons1,
sizeof(singletons1) / sizeof(*singletons1),
2021-08-22 20:23:38 +00:00
singletons1_lower, normal1, sizeof(normal1));
}\
2021-08-22 15:15:34 +00:00
""")
for a, b in extra:
print(" if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(a, a + b))
2021-08-22 15:15:34 +00:00
print("""\
2021-08-22 22:14:13 +00:00
return cp < 0x{:x};
}}\
""".format(NUM_CODEPOINTS))
2021-08-22 15:15:34 +00:00
if __name__ == '__main__':
main()