katie/scripts/genutf.py
Ivailo Monev 439647d272 update unicode data tables to v15.1
Signed-off-by: Ivailo Monev <xakepa10@gmail.com>
2024-03-13 22:10:30 +02:00

164 lines
4.5 KiB
Python
Executable file

#!/usr/bin/python
import sys
def readlines(fromfile):
    """Return the data part of every meaningful line of a UCD-style text file.

    Everything from the first '#' onwards is treated as a comment and cut off.
    Lines that carry no data once the comment is removed are dropped: empty
    lines, full-line comments, whitespace-only lines and indented comments.
    Callers therefore never receive a blank entry.

    fromfile -- path of the text file to read
    Returns a list of strings in file order. The text in front of the '#' is
    returned unmodified, surrounding whitespace included.
    """
    lines = []
    with open(fromfile, 'r') as f:
        content = f.read()
    for line in content.splitlines():
        data = line.split('#')[0]
        if not data.strip():
            # nothing but whitespace in front of the comment (or no text at all)
            continue
        lines.append(data)
    return lines
def mapinsert(tomap, key, value):
    """Append value to the list stored under key, creating the list on first use.

    tomap -- dict mapping keys to lists; modified in place
    key   -- key the value is filed under
    value -- item appended to the list for key
    """
    # setdefault does the lookup-or-create in one step, no scan of keys() needed
    tomap.setdefault(key, []).append(value)
def mapdecideinsert(toswitchmap, torangemap, value, key):
    """File a code point entry under value in the switch map or the range map.

    toswitchmap -- dict collecting single code points per value
    torangemap  -- dict collecting 'MIN..MAX' ranges per value
    value       -- the map key the entry is stored under
    key         -- a hexadecimal code point, or a 'MIN..MAX' range of them

    A single code point goes to the switch map. A range whose two ends are
    exactly one apart is expanded into two switch entries; any other range is
    stored unsplit in the range map.
    """
    if '..' not in key:
        mapinsert(toswitchmap, value, key)
        return

    bounds = key.split('..')
    first = bounds[0]
    last = bounds[1]
    upper = int(last, 16)
    lower = int(first, 16)
    if abs(upper - lower) != 1:
        mapinsert(torangemap, value, key)
        return

    # the range covers just its two ends: list both instead of a range check
    for endpoint in (first, last):
        mapinsert(toswitchmap, value, endpoint)
def printswitch(frommap):
    """Print a C switch statement over ucs4 built from frommap.

    frommap maps the expression to return onto the list of hexadecimal code
    points (written without the 0x prefix) that select it. All case labels of
    one entry are printed first and share the single return that follows.
    """
    print(' switch (ucs4) {')
    for result, codepoints in frommap.items():
        for codepoint in codepoints:
            print(' case 0x%s:' % codepoint)
        print(' return %s;' % result)
    print(' }')
def printifrange(frommap):
    """Print one C range check on ucs4, followed by its return, per range.

    frommap maps the expression to return onto a list of 'MIN..MAX' strings
    holding hexadecimal code points written without the 0x prefix.
    """
    for result, ranges in frommap.items():
        for span in ranges:
            bounds = span.split('..')
            # the upper bound is tested first in the generated condition
            print(' if (ucs4 <= 0x%s && ucs4 >= 0x%s)' % (bounds[1], bounds[0]))
            print(' return %s;' % result)
def _printtable(datafile, prefix, supported=None, skipped=()):
    """Print the switch and range lookups generated from one property file.

    datafile  -- path of the data file; each line is split on ';' into the
                 code point (or 'MIN..MAX' range) and the property value
    prefix    -- text put in front of the property value (underscores removed)
                 to form the expression returned by the generated code
    supported -- if given, only these property values are emitted
    skipped   -- property values that are left out
    """
    switchmap = {}
    rangemap = {}
    for line in readlines(datafile):
        tablesplit = line.split(';')
        codepoint = tablesplit[0].strip()
        value = tablesplit[1].strip()
        if value in skipped:
            continue
        if supported is not None and value not in supported:
            continue
        value = prefix + value.replace('_', '')
        mapdecideinsert(switchmap, rangemap, value, codepoint)
    printswitch(switchmap)
    printifrange(rangemap)


def _printspecial(datafile):
    """Print one array entry per script: the first code point listed for it.

    Common is printed first, the remaining scripts follow sorted by name.
    Common and Inherited are both emitted as 0.
    """
    scriptsmap = {}
    for line in readlines(datafile):
        tablesplit = line.split(';')
        codepoint = tablesplit[0].strip()
        value = tablesplit[1].strip()
        if value in scriptsmap:
            # only one per script
            continue
        if value in ('Inherited', 'Common'):
            # both are treated differently
            codepoint = '0'
        else:
            # a range is represented by its first code point
            codepoint = codepoint.split('..')[0]
        scriptsmap[value] = codepoint

    print(' 0x%s, // Common' % scriptsmap['Common'])
    for value in sorted(scriptsmap):
        if value == 'Common':
            continue
        print(' 0x%s, // %s' % (scriptsmap[value], value.replace('_', '')))


# The mode is picked by looking for its name anywhere on the command line;
# the data files are opened relative to the current directory.
if 'grapheme' in sys.argv:
    _printtable(
        'auxiliary/GraphemeBreakProperty.txt',
        'QUnicodeTables::GraphemeBreak_',
        # only some are supported by harfbuzz
        supported=(
            'CR',
            'LF',
            'Control',
            'Extend',
            'L',
            'V',
            'T',
            'LV',
            'LVT',
        ),
    )
elif 'line' in sys.argv:
    _printtable(
        'LineBreak.txt',
        'QUnicodeTables::LineBreak_',
        # only some are supported by harfbuzz
        supported=(
            'OP',
            'CL',
            'QU',
            'GL',
            'NS',
            'EX',
            'SY',
            'IS',
            'PR',
            'PO',
            'NU',
            'AL',
            'ID',
            'IN',
            'HY',
            'BA',
            'BB',
            'B2',
            'ZW',
            'CM',
            'WJ',
            'H2',
            'H3',
            'JL',
            'JV',
            'JT',
            'SA',
            'SG',
            'SP',
            'CR',
            'LF',
            'BK',
        ),
    )
elif 'script' in sys.argv:
    # default is Common, so it needs no entry of its own
    _printtable('Scripts.txt', 'QUnicodeTables::', skipped=('Common',))
elif 'special' in sys.argv:
    _printspecial('Scripts.txt')
else:
    print('usage: <grapheme|line|script|special>\n'
          'Data is from https://unicode.org/Public/15.1.0/ucd/UCD.zip')
    sys.exit(1)