katie/scripts/genutf.py
Ivailo Monev ab1464f104 update unicode data tables to v13.0
Combining types that harfbuzz does not support are also no longer added to
the tables; this should make obtaining properties of text faster.

The special language table used to probe fonts for script support should be
reviewed once a new CLDR version is released, so that proper language entries
can be added if they are present in likelySubtags.xml.

Signed-off-by: Ivailo Monev <xakepa10@laimg.moc>
2020-03-12 21:42:56 +00:00

251 lines
7.6 KiB
Python
Executable file

#!/usr/bin/python
import os, sys
def readlines(fromfile):
    """Return the data lines of a '#'-commented text file.

    fromfile -- path of the file to read

    Empty lines and lines starting with '#' are dropped. Every remaining
    line is cut at its first '#'; the text before it is returned unchanged,
    so surrounding whitespace is left for the caller to strip.
    """
    with open(fromfile, 'r') as source:
        rawlines = source.read().splitlines()
    return [
        raw.partition('#')[0]
        for raw in rawlines
        if raw and not raw.startswith('#')
    ]
def mapinsert(tomap, key, value):
    """Append value to the list stored under key in tomap.

    tomap -- dict of lists, modified in place
    key   -- entry to append to; created as a new list when missing
    value -- item appended to the list
    """
    # setdefault looks the key up and inserts the empty list in one step,
    # instead of testing membership against keys() first
    tomap.setdefault(key, []).append(value)
def mapdecideinsert(toswitchmap, torangemap, value, key):
    """File key under value in either the switch map or the range map.

    toswitchmap -- dict of lists receiving single code points
    torangemap  -- dict of lists receiving 'MIN..MAX' range strings
    value       -- map entry the code point(s) are appended to
    key         -- one hexadecimal code point or a 'MIN..MAX' range of them

    A key without '..' goes to toswitchmap as is. A range whose two ends are
    adjacent is split and both ends go to toswitchmap; any other range goes
    to torangemap unsplit.
    """
    if '..' not in key:
        mapinsert(toswitchmap, value, key)
        return

    bounds = key.split('..')
    first, last = bounds[0], bounds[1]
    # ranges made of just two neighbouring code points become switch cases
    distance = int(last, 16) - int(first, 16)
    if distance in (1, -1):
        for codepoint in (first, last):
            mapinsert(toswitchmap, value, codepoint)
        return

    mapinsert(torangemap, value, key)
def printswitch(frommap):
    """Print a C-style switch statement over ucs4 built from frommap.

    frommap -- dict mapping a return expression to the list of hexadecimal
               code point strings that select it

    Each code point is printed as a case label; the labels of one map entry
    are followed by a single return of that entry's key.
    """
    print(' switch (ucs4) {')
    for result, codepoints in frommap.items():
        for codepoint in codepoints:
            print(' case 0x%s:' % codepoint)
        # all labels of this entry share the return below
        print(' return %s;' % result)
    print(' }')
def printifrange(frommap):
    """Print one C-style range test on ucs4 per range stored in frommap.

    frommap -- dict mapping a return expression to a list of 'MIN..MAX'
               strings of hexadecimal code points

    Every range is printed as an if statement checking that ucs4 lies
    between MIN and MAX (both inclusive), followed by a return of the key.
    """
    for result, spans in frommap.items():
        for span in spans:
            bounds = span.split('..')
            print(' if (ucs4 <= 0x%s && ucs4 >= 0x%s)' % (bounds[1], bounds[0]))
            print(' return %s;' % result)
# Tables filled by the selected mode and printed afterwards:
#   switchmap -- return value -> single code points, printed as switch cases
#   rangemap  -- return value -> 'MIN..MAX' ranges, printed as if statements
switchmap = {}
rangemap = {}


def _printbreaktable(datafile, prefix, supported):
    """Print the lookup code for one break property data file.

    datafile  -- path of the data file with 'codepoints ; value' lines
    prefix    -- text the property value is appended to (underscores in the
                 value are removed) to form the returned name
    supported -- property values to emit; lines with other values are skipped
    """
    for line in readlines(datafile):
        tablesplit = line.split(';')
        codepoint = tablesplit[0].strip()
        value = tablesplit[1].strip()
        if value not in supported:
            continue
        mapdecideinsert(switchmap, rangemap, prefix + value.replace('_', ''), codepoint)
    printswitch(switchmap)
    printifrange(rangemap)


if 'combining' in sys.argv:
    # canonical combining class number -> enum name, only some are supported
    # by harfbuzz
    combiningnames = {
        '202': 'QUnicodeTables::Combining_AttachedBelow',
        '214': 'QUnicodeTables::Combining_AttachedAbove',
        '216': 'QUnicodeTables::Combining_AttachedAboveRight',
        '218': 'QUnicodeTables::Combining_BelowLeft',
        '220': 'QUnicodeTables::Combining_Below',
        '222': 'QUnicodeTables::Combining_BelowRight',
        '224': 'QUnicodeTables::Combining_Left',
        '226': 'QUnicodeTables::Combining_Right',
        '228': 'QUnicodeTables::Combining_AboveLeft',
        '230': 'QUnicodeTables::Combining_Above',
        '232': 'QUnicodeTables::Combining_AboveRight',
        '233': 'QUnicodeTables::Combining_DoubleBelow',
        '234': 'QUnicodeTables::Combining_DoubleAbove',
        '240': 'QUnicodeTables::Combining_IotaSubscript',
    }
    # classes that must not show up in the data; abort if one does
    deprecated = ('200', '204', '208', '210', '212')
    for line in readlines('extracted/DerivedCombiningClass.txt'):
        tablesplit = line.split(';')
        codepoint = tablesplit[0].strip()
        value = tablesplit[1].strip()
        if value in deprecated:
            print('Unhandled deprecated combining type: %s' % value)
            sys.exit(2)
        if value not in combiningnames:
            # '0' (not ordered) is the default and the remaining classes are
            # not supported by harfbuzz
            continue
        mapdecideinsert(switchmap, rangemap, combiningnames[value], codepoint)
    printswitch(switchmap)
    printifrange(rangemap)
elif 'grapheme' in sys.argv:
    # only some are supported by harfbuzz
    # NOTE: every entry needs its trailing comma, adjacent string literals
    # are silently concatenated otherwise
    _printbreaktable(
        'auxiliary/GraphemeBreakProperty.txt',
        'QUnicodeTables::GraphemeBreak_',
        (
            'CR',
            'LF',
            'Control',
            'Extend',
            'L',
            'V',
            'T',
            'LV',
            'LVT',
        ),
    )
elif 'word' in sys.argv:
    # only some are supported by harfbuzz
    _printbreaktable(
        'auxiliary/WordBreakProperty.txt',
        'QUnicodeTables::WordBreak_',
        (
            'Format',
            'Katakana',
            'ALetter',
            'MidLetter',
            'MidNum',
            'Numeric',
            'ExtendNumLet',
        ),
    )
elif 'sentence' in sys.argv:
    # only some are supported by harfbuzz
    _printbreaktable(
        'auxiliary/SentenceBreakProperty.txt',
        'QUnicodeTables::SentenceBreak_',
        (
            'Sep',
            'Format',
            'Sp',
            'Lower',
            'Upper',
            'OLetter',
            'Numeric',
            'ATerm',
            'STerm',
            'Close',
        ),
    )
elif 'line' in sys.argv:
    # only some are supported by harfbuzz
    _printbreaktable(
        'LineBreak.txt',
        'QUnicodeTables::LineBreak_',
        (
            'OP',
            'CL',
            'QU',
            'GL',
            'NS',
            'EX',
            'SY',
            'IS',
            'PR',
            'PO',
            'NU',
            'AL',
            'ID',
            'IN',
            'HY',
            'BA',
            'BB',
            'B2',
            'ZW',
            'CM',
            'WJ',
            'H2',
            'H3',
            'JL',
            'JV',
            'JT',
            'SA',
            'SG',
            'SP',
            'CR',
            'LF',
            'BK',
        ),
    )
elif 'script' in sys.argv:
    for line in readlines('Scripts.txt'):
        tablesplit = line.split(';')
        codepoint = tablesplit[0].strip()
        value = tablesplit[1].strip()
        if value == 'Common':
            # default is Common
            continue
        value = 'QUnicodeTables::%s' % value.replace('_', '')
        mapdecideinsert(switchmap, rangemap, value, codepoint)
    printswitch(switchmap)
    printifrange(rangemap)
elif 'special' in sys.argv:
    # prints one code point per script, in order of first appearance
    scriptslist = []
    for line in readlines('Scripts.txt'):
        tablesplit = line.split(';')
        codepoint = tablesplit[0].strip()
        value = tablesplit[1].strip()
        if value in ('Inherited', 'Common'):
            # both are treated differently
            codepoint = '0'
        if value in scriptslist:
            # only one per script
            continue
        # for a range only its first code point is printed
        if '..' in codepoint:
            codepoint = codepoint.split('..')[0]
        print(' 0x%s, // %s' % (codepoint, value.replace('_', '')))
        scriptslist.append(value)
else:
    print('usage: <combining|grapheme|word|sentence|line|script|special>\n'
          'Data is from https://unicode.org/Public/13.0.0/ucd/UCD.zip')
    sys.exit(1)