katie/scripts/genlocale.py

996 lines
38 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/python3
#-*- coding: UTF-8 -*-
# Data is from https://unicode.org/Public/cldr/44/core.zip
import os, sys, glob, re
import xml.etree.ElementTree as ET
# command line switches, each one is set when the matching argument was passed to the script
printenumsandexit = '--printenums' in sys.argv
printdocsandexit = '--printdocs' in sys.argv
def mapcopy(frommap, tomap):
    """Copy every entry of frommap into tomap, replacing entries that share a key."""
    tomap.update(frommap)
def mapmerge(frommap, tomap, defaultmap):
    """Copy into tomap only those entries of frommap that differ from defaultmap.

    Every key of frommap has to be present in defaultmap, KeyError is raised otherwise.
    """
    for key, value in frommap.items():
        if not value == defaultmap[key]:
            tomap[key] = value
def listcopy(fromlist, tolist):
    """Append every entry of fromlist to the end of tolist, in order."""
    tolist.extend(fromlist)
def stripxmltext(fromxmltext):
    """Return fromxmltext as a single line of single-space separated text.

    Newlines and tabs are removed (not replaced), every run of two or more spaces is collapsed
    into one space and leading/trailing whitespace is stripped.
    """
    result = fromxmltext.replace('\n', '').replace('\t', '')
    # collapse runs of any length in one go instead of relying on a fixed number of passes
    result = re.sub(' {2,}', ' ', result)
    return result.strip()
def normalizestring(fromstring):
    """Return fromstring reduced to plain text that can be used as an enum name.

    Spaces and punctuation are removed, '&' is spelled out, two abbreviations are expanded and
    the accented letters listed below are replaced with their unaccented form. The replacements
    are applied one after another in the order they are listed.
    """
    replacements = (
        (' ', ''),
        ('-', ''),
        (',', ''),
        ("'", ''),
        ('&', 'And'),
        ('(', ''),
        (')', ''),
        ('St.', 'St'),
        ('U.S.', 'UnitedStates'),
        # UTF-8 chars
        ('\u02bc', ''), # modifier letter apostrophe
        # NOTE(review): this entry had an empty pattern (i.e. did nothing), the character was
        # apparently lost - assumed to be the right single quotation mark, confirm
        ('\u2019', ''),
        ('ü', 'u'),
        ('å', 'a'),
        ('ç', 'c'),
        ('õ', 'o'),
        ('Å', 'A'),
        ('ô', 'o'),
        ('ã', 'a'),
        ('é', 'e'),
        ('í', 'i'),
        ('ā', 'a'),
        ('á', 'a'),
    )
    result = fromstring
    for original, substitute in replacements:
        result = result.replace(original, substitute)
    return result
def xmlmerge(fromxml, fromxml2):
    """Return the root element of fromxml2 with the top-level elements of fromxml put in front.

    Each element is inserted at position zero, so the elements of fromxml end up in reversed
    order but always before the elements that fromxml2 already had.
    """
    sourceroot = ET.parse(fromxml).getroot()
    mergedroot = ET.parse(fromxml2).getroot()
    for child in sourceroot:
        mergedroot.insert(0, child)
    return mergedroot
def touint(fromstring):
    """Return the Unicode code point of the first character of fromstring.

    Raises TypeError if fromstring is empty.
    """
    # NOTE: symbols (plus, minus, etc.) are assumed to be single character which is not true for
    # many of the locales, however the API for those does not handle them as strings thus the first
    # character only is used
    return ord(fromstring[:1])
def tochar(fromstring):
    """Return fromstring as double-quoted literal ending with \\0, or nullptr if it is empty."""
    if not fromstring:
        return 'nullptr'
    return '"%s\\0"' % fromstring
def tochararray(fromstringlist):
    """Return the strings of fromstringlist as brace-enclosed, comma separated literals.

    Each entry is converted with tochar(). Joining the entries means there is no trailing comma
    to clean up afterwards and the text of the entries is never modified.
    """
    return '{ %s }' % ', '.join(tochar(string) for string in fromstringlist)
def todayenum(day):
    """Return the Qt day enum for three-letter day name, exits the script if it is unknown."""
    dayenums = {
        'mon': 'Qt::Monday',
        'tue': 'Qt::Tuesday',
        'wed': 'Qt::Wednesday',
        'thu': 'Qt::Thursday',
        'fri': 'Qt::Friday',
        'sat': 'Qt::Saturday',
        'sun': 'Qt::Sunday',
    }
    if day in dayenums:
        return dayenums[day]
    print('Unknown day: %s' % day)
    sys.exit(1)
def todatetimeformat(fromformat):
    """Return the date/time pattern fromformat converted to the tags listed below.

    The conversion is done in three steps: unsupported tags are removed (along with adjacent
    separator, if any), then the regular expression replacements are applied and finally the
    plain tag replacements. The replacements are textual, quoted literal text in the pattern is
    not treated specially.
    """
    # valid are y, m, M, d, h, H, s, a, A, z and t
    unsupportedtags = [
        'g',
        'u',
        'q',
        'l',
        'w',
        'f',
        'j',
    ]
    replacementtags = {
        'MMMMM' : 'MMM', # narrow month name
        'LLLLL' : 'MMM', # stand-alone narrow month name
        'E' : 'ddd', 'EE' : 'ddd', 'EEE' : 'ddd', 'EEEEE' : 'ddd', 'EEEE' : 'dddd', # day of week
        'e' : 'ddd', 'ee' : 'ddd', 'eee' : 'ddd', 'eeeee' : 'ddd', 'eeee' : 'dddd', # local day of week
        'c' : 'ddd', 'cc' : 'ddd', 'ccc' : 'ddd', 'ccccc' : 'ddd', 'cccc' : 'dddd', # stand-alone local day of week
        'K' : 'h', # Hour 0-11
        'k' : 'H', # Hour 1-24
        'z' : 'Z', 'zz' : 'Z', 'zzz' : 'Z', 'zzzz' : 'Z', # timezone
        'Z' : 'Z', 'ZZ' : 'Z', 'ZZZ' : 'Z', 'ZZZZ' : 'Z', # timezone
        'v' : 'Z', 'vv' : 'Z', 'vvv' : 'Z', 'vvvv' : 'Z', # timezone
        'V' : 'Z', 'VV' : 'Z', 'VVV' : 'Z', 'VVVV' : 'Z', # timezone
        'L' : 'M', # stand-alone month names. not supported
    }
    # applied in the order listed, e.g. A is removed before a is replaced with AP
    replacementregex = {
        r'y' : 'yyyy', # four-digit year without leading zeroes
        r'yyy{3,}' : 'yyyy', # more that three digits hence convert to four-digit year
        r'S{1,}' : '', # fractional seconds. not supported.
        r'A{1,}' : '', # milliseconds in day. not supported.
        r'a' : 'AP', # AM/PM
    }
    # the bare tag is last so that the tag is removed together with its separator when possible
    possibleoccurences = [
        '%s, ',
        ', %s',
        '%s.',
        '.%s',
        '%s-',
        '-%s',
        '(%s)',
        "('%s')",
        '%s ',
        ' %s',
        '%s',
    ]
    result = fromformat
    # NOTE(review): the upper-case form of each unsupported tag is removed too, that includes L
    # which means the L and LLLLL replacements never see any input - confirm that is intended
    for tag in unsupportedtags:
        for occurence in possibleoccurences:
            for letter in (tag, tag.upper()):
                for count in (4, 3, 2, 1):
                    result = result.replace(occurence % (letter * count), '')
    for key in replacementregex.keys():
        result = re.sub(key, replacementregex[key], result)
    # longest tags first, otherwise the single letter tags consume the letters of the longer ones
    # (EEEE would be replaced letter by letter and never match its own entry)
    for key in sorted(replacementtags.keys(), key=len, reverse=True):
        result = result.replace(key, replacementtags[key])
    return result
def tomonthslist(fromxmlelements, initialvalues):
    """Return copy of initialvalues with the text of each month element at its month position.

    The position is picked by the type attribute of the element which has to be one of "1" to
    "12", the script exits if it is anything else. initialvalues itself is not modified.
    """
    monthtypes = ('1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12')
    result = list(initialvalues)
    for month in fromxmlelements:
        monthtype = month.get('type')
        if monthtype not in monthtypes:
            print('Unknown month: %s' % monthtype)
            sys.exit(1)
        result[monthtypes.index(monthtype)] = month.text
    return result
def todayslist(fromxmlelements, initialvalues):
    """Return copy of initialvalues with the text of each day element at its day position.

    The position is picked by the type attribute of the element, Monday ("mon") is first and
    Sunday ("sun") is last. The script exits if the type is not one of the three-letter day
    names. initialvalues itself is not modified.
    """
    daytypes = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
    result = list(initialvalues)
    for day in fromxmlelements:
        daytype = day.get('type')
        if daytype not in daytypes:
            print('Unknown day: %s' % daytype)
            sys.exit(1)
        result[daytypes.index(daytype)] = day.text
    return result
def tolanguageenum(fromstring):
    """Return the language enum of the first languagemap entry with code equal to fromstring.

    None is returned if there is no such entry, callers have to deal with that.
    """
    for name, entry in languagemap.items():
        if entry['code'] == fromstring:
            return 'QLocale::Language::%s' % name
    # unknown language is not an error
    return None
def toscriptenum(fromstring):
    """Return the script enum of the first scriptmap entry with code equal to fromstring.

    None is returned if there is no such entry, callers have to deal with that.
    """
    for name, entry in scriptmap.items():
        if entry['code'] == fromstring:
            return 'QLocale::Script::%s' % name
    # unknown script is not an error
    return None
def tocountryenum(fromstring):
    """Return the country enum of the first countrymap entry with code equal to fromstring.

    None is returned if there is no such entry, callers have to deal with that.
    """
    for name, entry in countrymap.items():
        if entry['code'] == fromstring:
            return 'QLocale::Country::%s' % name
    # unknown country is not an error
    return None
# printenum prints mapped values that have unique code only, the rest are set to the enum of the
# first occurrence. the reason for doing so is that the table lookups for figuring out language,
# script and country required for constructing QLocale from string (named locales) rely on the
# fact that there is only one code for each, if that is not the case constructing a copy of a
# locale from its name will not copy it correctly. printtable skips duplicate code entries entirely
def printenum(frommap, prefix):
    """Print the enum named prefix with one enumerator for each key of frommap.

    The Any<prefix> and C keys are printed first (in the order they are in frommap), then the
    rest sorted by name and numbered sequentially. Keys the code of which was already printed
    are printed after that as aliases of the first key (in sorted order) with the same code.
    Last<prefix> is set to the last key that got a number of its own.
    """
    special = ('Any%s' % prefix, 'C')
    keyscount = 0
    print(' enum %s {' % prefix)
    # print Default and C first
    for key in frommap.keys():
        if key not in special:
            continue
        print(' %s = %d,' % (key, keyscount))
        keyscount += 1
    # what each alias is going to refer to - the first key, in sorted order, with the code
    firstbycode = {}
    for key in sorted(frommap.keys()):
        firstbycode.setdefault(frommap[key]['code'], key)
    # now everything except those, save last key for later
    aliases = []
    seencodes = set()
    lastkey = ''
    for key in sorted(frommap.keys()):
        if key in special:
            continue
        code = frommap[key]['code']
        if code in seencodes:
            aliases.append(key)
            continue
        seencodes.add(code)
        print(' %s = %d,' % (key, keyscount))
        lastkey = key
        keyscount += 1
    # now aliases
    print('')
    for alias in sorted(aliases):
        print(' %s = %s,' % (alias, firstbycode[frommap[alias]['code']]))
    # print last key
    print('\n Last%s = %s' % (prefix, lastkey))
    print(' };\n')
def printdoc(frommap, prefix):
    """Print \\value line for each key of frommap, sorted, except for Any<prefix> and C."""
    skipped = ('Any%s' % prefix, 'C')
    print('// %s' % prefix)
    for key in sorted(frommap.keys()):
        if key not in skipped:
            print(' \\value %s' % key)
    print('')
def printtable(frommap, prefix):
    """Print the lookup table (enum, name and code) for the entries of frommap.

    The Any<prefix> and C entries are printed first, then the rest sorted by key. Only the first
    entry for each code is printed, entries with code that was already printed are skipped.
    """
    lowerprefix = prefix.lower()
    special = ('Any%s' % prefix, 'C')

    def printrow(key):
        entry = frommap[key]
        print(' { QLocale::%s::%s, %s, %s },' % (prefix, key, tochar(entry['name']), tochar(entry['code'])))

    print('''static const struct %sTblData {
const QLocale::%s %s;
const char* name;
const char* code;
} %sTbl[] = {''' % (lowerprefix, prefix, lowerprefix, lowerprefix))
    # print Default and C first
    for key in frommap.keys():
        if key in special:
            printrow(key)
    # now everything except those but only unique code values
    printedcodes = set()
    for key in sorted(frommap.keys()):
        if key in special:
            continue
        code = frommap[key]['code']
        if code in printedcodes:
            continue
        printedcodes.add(code)
        printrow(key)
    print('};')
    print('static const qint16 %sTblSize = sizeof(%sTbl) / sizeof(%sTblData);\n' % (lowerprefix, lowerprefix, lowerprefix))
def printlocaledata(frommap, key):
    """Print the locale table entry for the locale named key, unless it is one to be skipped.

    The values are printed in fixed order: the enums as they are, the symbols converted with
    touint(), the strings with tochar() and the name arrays with tochararray(). The locale name
    is printed as trailing comment.
    """
    value = frommap[key]
    # skip table entries without country (non-territory), unless it is artificial, this is done to
    # preserve the assumption in QLocalePrivate::findLocale that "AnyCountry" means "find me a
    # language, no matter what country it is spoken in" if "AnyCountry" is passed to it as argument
    # and also shrinks the table
    if value['country'] == 'QLocale::Country::AnyCountry' and not key == 'C':
        return
    # HACK: skip table entries the language of which is unknown
    if key in ('apc_SY', 'skr_PK'):
        return
    # HACK: skip table entries with and without specific script
    if key == 'ha_Arab_NG':
        return
    enumfields = (
        'language', 'script', 'country',
        'first_day_of_week', 'weekend_start', 'weekend_end',
    )
    symbolfields = (
        'decimal', 'group', 'list', 'percent', 'minus', 'plus', 'exponential', 'zero',
    )
    stringfields = (
        'short_date_format', 'long_date_format', 'short_time_format', 'long_time_format',
        'am', 'pm',
    )
    arrayfields = (
        'standalone_short_month_names',
        'standalone_long_month_names',
        'standalone_narrow_month_names',
        'short_month_names',
        'long_month_names',
        'narrow_month_names',
        'standalone_short_day_names',
        'standalone_long_day_names',
        'standalone_narrow_day_names',
        'short_day_names',
        'long_day_names',
        'narrow_day_names',
    )
    # one column for each placeholder of the template below, in the same order
    columns = [value[field] for field in enumfields]
    columns.extend([touint(value[field]) for field in symbolfields])
    columns.extend([tochar(value[field]) for field in stringfields])
    columns.extend([tochararray(value[field]) for field in arrayfields])
    columns.append(key)
    print(''' {
%s, %s, %s,
%s, %s, %s,
%s, %s, %s, %s, %s, %s, %s, %s,
%s, %s, %s, %s,
%s, %s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s
}, // %s''' % tuple(columns))
def printaliastable(frommap, prefix):
    """Print the alias table named after prefix, one row for each original and substitute pair.

    The keys of frommap are the originals and are printed sorted, the values are the substitutes
    separated by space.
    """
    print('''static const struct %sAliasTblData {
const char* original;
const char* substitute;
} %sAliasTbl[] = {''' % (prefix, prefix))
    for original in sorted(frommap):
        # territories and scripts entries can contain multiple replacements, add one for each
        for substitute in frommap[original].split(' '):
            print(' { "%s", "%s" },' % (original, substitute))
    print('};')
    print('static const qint16 %sAliasTblSize = sizeof(%sAliasTbl) / sizeof(%sAliasTblData);\n' % (prefix, prefix, prefix))
# main maps, the language, country and script ones start with the artificial entries that stand
# for "any" (empty code) and, for languages only, the C locale
languagemap = {
    'AnyLanguage': {
        'code': '',
        'name': 'Default',
    },
    'C': {
        'code': 'C',
        'name': 'C',
    },
}
countrymap = {
    'AnyCountry': {
        'code': '',
        'name': 'Default',
    },
}
scriptmap = {
    'AnyScript': {
        'code': '',
        'name': 'Default',
    },
}
localemap = {}
likelysubtagsmap = {}
languagealiasmap = {}
countryaliasmap = {}
scriptaliasmap = {}
# cross-reference maps
localeparentmap = {}
localeparentvaluesmap = {}
localescriptmap = {}
localefirstdaymap = {}
localeweekendstartmap = {}
localeweekendendmap = {}
localenumberingmap = {}
# regular expressions
# splits locale name into its parts, the separators are underscore, pipe, dash, dot and at sign.
# raw string so that the backslashes reach the regular expression engine as they are and are
# not interpreted as (invalid) string escapes
localeregex = re.compile(r'([^_|\-|\.|@]+)+')
# locale to parent parsing
tree = ET.parse('common/supplemental/supplementalData.xml')
root = tree.getroot()
for parentlocales in root.findall('./parentLocales'):
    if parentlocales.get('component'):
        # NOTE(review): presumably inheritance that applies to the named component only (e.g.
        # collations) and not to the locale data read by this script, verify against the LDML
        # specification
        continue
    for parentlocale in parentlocales.findall('./parentLocale'):
        parentlocaleparent = parentlocale.get('parent')
        parentlocalelocales = parentlocale.get('locales')
        # the same parent may be listed more than once, do not drop the earlier locales
        localeparentmap.setdefault(parentlocaleparent, []).extend(parentlocalelocales.split(' '))
# locale to script parsing
# only languages with one primary script are mapped because if there are multiple it should be
# specified in the locale data, see:
# https://sites.google.com/site/cldr/development/updating-codes/update-language-script-info/language-script-description
# secondary scripts are not taken into account at all
for suppllanguage in root.findall('./languageData/language'):
    suppllanguagetype = suppllanguage.get('type')
    suppllanguagescripts = suppllanguage.get('scripts')
    suppllanguagealt = suppllanguage.get('alt')
    if not suppllanguagescripts or suppllanguagealt == 'secondary':
        # alternative entry, skip it
        continue
    if not len(suppllanguagescripts.split(' ')) == 1:
        # skip entries without definitive primary script
        continue
    suppllanguageterritories = suppllanguage.get('territories')
    if not suppllanguageterritories:
        # territories is optional, if not specified use artificial value to map all languages of
        # that type to the script
        suppllanguageterritories = 'AnyTerritory'
    localescriptmap[suppllanguagetype] = {
        'script': suppllanguagescripts,
        'territories': suppllanguageterritories.split(' '),
    }
# locale to first day, weekend start and weekend end parsing
weekdatamaps = (
    ('./weekData/firstDay', localefirstdaymap),
    ('./weekData/weekendStart', localeweekendstartmap),
    ('./weekData/weekendEnd', localeweekendendmap),
)
for weekdatapath, weekdatamap in weekdatamaps:
    for weekdata in root.findall(weekdatapath):
        if weekdata.get('alt'):
            # alternative value for territories that are (or should be) listed by a regular
            # entry, taking it into account would override the regular value
            continue
        weekdataday = todayenum(weekdata.get('day'))
        weekdataterritories = stripxmltext(weekdata.get('territories')).split(' ')
        # the maps are keyed by day and the same day may be listed by more than one element, do
        # not drop the territories of the earlier ones
        weekdatamap.setdefault(weekdataday, []).extend(weekdataterritories)
# locale to numbering system parsing
tree = ET.parse('common/supplemental/numberingSystems.xml')
root = tree.getroot()
for numberingsystem in root.findall('./numberingSystems/numberingSystem'):
    numberingsystemid = numberingsystem.get('id')
    numberingsystemdigits = numberingsystem.get('digits')
    if numberingsystemdigits:
        # either digits or rules is set
        localenumberingmap[numberingsystemid] = stripxmltext(numberingsystemdigits)
def _printmap(frommap, prefix):
    """Print frommap as enum, documentation values or lookup table depending on the switches."""
    if printenumsandexit:
        printenum(frommap, prefix)
    elif printdocsandexit:
        printdoc(frommap, prefix)
    else:
        printtable(frommap, prefix)

# language parsing, the names of everything are the English ones
tree = ET.parse('common/main/en.xml')
root = tree.getroot()
for language in root.findall('./localeDisplayNames/languages/language'):
    normallanguage = normalizestring(language.text)
    if normallanguage in ('Nauru', 'Tokelau', 'Tuvalu'):
        # country and language are the same, suffix to solve enum clashes
        normallanguage = '%sLanguage' % normallanguage
    languagemap[normallanguage] = {
        'code': language.get('type'),
        'name': language.text,
    }
_printmap(languagemap, 'Language')
# country parsing
for country in root.findall('./localeDisplayNames/territories/territory'):
    countrymap[normalizestring(country.text)] = {
        'code': country.get('type'),
        'name': country.text,
    }
_printmap(countrymap, 'Country')
# scripts parsing
for script in root.findall('./localeDisplayNames/scripts/script'):
    normalscript = normalizestring(script.text)
    if not normalscript.endswith('Script'):
        # suffix script if needed
        normalscript = '%sScript' % normalscript
    if normalscript in ('UnknownScript', 'CommonScript'):
        # only interested in specific scripts
        continue
    scriptmap[normalscript] = {
        'code': script.get('type'),
        'name': script.text,
    }
_printmap(scriptmap, 'Script')
if printenumsandexit or printdocsandexit:
    # nothing but the enums or the documentation values was asked for
    sys.exit(0)
# names shared by the stand-alone and the regular variants of the defaults below
_shortmonthnames = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
_longmonthnames = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
_shortdaynames = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
_longdaynames = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# these defaults are used as parent locales fallback, C uses them as actual values because root
# contains UTF-8 characters and for compatibility. for the rest defaults are set from root
localedefaults = {
    # enums
    'language': 'QLocale::Language::AnyLanguage',
    'script': 'QLocale::Script::AnyScript',
    'country': 'QLocale::Country::AnyCountry',
    'first_day_of_week': 'Qt::Monday',
    'weekend_start': 'Qt::Saturday',
    'weekend_end': 'Qt::Sunday',
    # characters
    'decimal': '.',
    'group': ',',
    'list': ';',
    'percent': '%',
    'zero': '0',
    'minus': '-',
    'plus': '+',
    'exponential': 'e', # default in CLDR is E
    # strings
    'short_date_format': 'd MMM yyyy', # default in CLDR is y-MM-dd
    'long_date_format': 'd MMMM yyyy',
    'short_time_format': 'HH:mm:ss', # default in CLDR is HH:mm
    'long_time_format': 'HH:mm:ss z',
    'am': 'AM',
    'pm': 'PM',
    # arrays, every entry gets list of its own
    'standalone_short_month_names': list(_shortmonthnames),
    'standalone_long_month_names': list(_longmonthnames),
    'standalone_narrow_month_names': ['J', 'F', 'M', 'A', 'M', 'J', 'J', 'A', 'S', 'O', 'N', 'D'],
    'short_month_names': list(_shortmonthnames),
    'long_month_names': list(_longmonthnames),
    'narrow_month_names': [str(month) for month in range(1, 13)],
    'standalone_short_day_names': list(_shortdaynames),
    'standalone_long_day_names': list(_longdaynames),
    'standalone_narrow_day_names': ['M', 'T', 'W', 'T', 'F', 'S', 'S'],
    'short_day_names': list(_shortdaynames),
    'long_day_names': list(_longdaynames),
    'narrow_day_names': [str(day) for day in range(1, 8)],
}
# artificial entries, C is the defaults with language of its own
localemap['C'] = dict(localedefaults)
localemap['C']['language'] = 'QLocale::Language::C'
# locales parsing
# TODO: accept only "contributed" or "approved" values
def _readnumbersymbols(root, numbertype, data):
    """Set the symbol fields of data from the first symbols element for numbertype in root."""
    symbolfields = (
        ('decimal', './decimal'),
        ('group', './group'),
        ('list', './list'),
        ('percent', './percentSign'),
        ('minus', './minusSign'),
        ('plus', './plusSign'),
        ('exponential', './exponential'),
    )
    for symbol in root.findall('./numbers/symbols'):
        if not symbol.get('numberSystem') == numbertype:
            # should be the locale numeric system
            continue
        for field, symbolpath in symbolfields:
            element = symbol.find(symbolpath)
            # only single character symbols are used, elements without text are ignored
            if element is not None and element.text and len(element.text) == 1:
                data[field] = element.text
        # zero is from cross-reference numeric system map,
        # taking the first character works even for UTF-8 chars
        data['zero'] = localenumberingmap[numbertype][0]
        # locale numeric system was found, break
        break

def _readnames(calendar, kind, tolist, data):
    """Set the name array fields of data for kind ('month' or 'day') from calendar.

    tolist is the function that converts the elements to list (tomonthslist or todayslist),
    the current value of the field is passed to it as the initial values.
    """
    contextprefixes = {'stand-alone': 'standalone_', 'format': ''}
    widthnames = {'wide': 'long', 'abbreviated': 'short', 'narrow': 'narrow'}
    for context in calendar.findall('./%ss/%sContext' % (kind, kind)):
        contexttype = context.get('type')
        if contexttype not in contextprefixes:
            continue
        for width in context.findall('./%sWidth' % kind):
            widthtype = width.get('type')
            if widthtype not in widthnames:
                continue
            field = '%s%s_%s_names' % (contextprefixes[contexttype], widthnames[widthtype], kind)
            data[field] = tolist(width.findall('./%s' % kind), data[field])

def _readcalendar(root, data):
    """Set the format, day period and name fields of data from the first gregorian calendar."""
    for calendar in root.findall('./dates/calendars/calendar'):
        if not calendar.get('type') == 'gregorian':
            # all values should be from gregorian calendar
            continue
        # date formats first, then time formats
        for kind in ('date', 'time'):
            for formatlength in calendar.findall('./%sFormats/%sFormatLength' % (kind, kind)):
                lengthtype = formatlength.get('type')
                if lengthtype in ('short', 'long'):
                    pattern = formatlength.find('./%sFormat/pattern' % kind)
                    data['%s_%s_format' % (lengthtype, kind)] = todatetimeformat(pattern.text)
        for dayperiodwidth in calendar.findall('./dayPeriods/dayPeriodContext/dayPeriodWidth'):
            if not dayperiodwidth.get('type') == 'wide':
                # all values should be in wide format
                continue
            for dayperiod in dayperiodwidth.findall('dayPeriod'):
                dayperiodtype = dayperiod.get('type')
                if dayperiodtype in ('am', 'pm'):
                    data[dayperiodtype] = dayperiod.text
        # month/day names
        _readnames(calendar, 'month', tomonthslist, data)
        _readnames(calendar, 'day', todayslist, data)
        # gregorian calendar was found, break
        break

def readlocale(fromxml, tomap, isparent):
    """Read the locale data file fromxml and store its values in tomap under the locale name.

    The locale name is the name of the file without the extension. If isparent is true the
    values start from localedefaults and the country and script are left as they are, otherwise
    they start from the values of root, then those of the parent locales and the main
    (non-territory) locale. Nothing is stored for locales with variant.
    """
    locale = os.path.basename(fromxml)
    locale = locale.replace('.xml', '')
    # check the name of the locale and not the path, directories may contain underscore too
    if '_' in locale:
        # merge parent locales (non-territory) into the current one so that data can be read
        # from them. they do not have territory and currency data is based on territory so
        # cross-reference is not possible without doing so
        localeparent = locale.split('_')[0]
        xmlparent = os.path.join(os.path.dirname(fromxml), '%s.xml' % localeparent)
        root = xmlmerge(fromxml, xmlparent)
    else:
        root = ET.parse(fromxml).getroot()
    if root.find('./identity/variant') is not None:
        # TODO: variants are not supported by QLocale
        return
    langtype = root.find('./identity/language').get('type')
    country = root.find('./identity/territory')
    countrytype = None
    scripttype = None
    numbertype = 'latn' # CLDR default
    # everything below writes to the entry of the locale in tomap
    data = tomap[locale] = {}
    if isparent:
        mapcopy(localedefaults, data)
    else:
        mapcopy(localeparentvaluesmap['root'], data)
    # set defaults from parent locale if territory is specified
    if country is not None:
        for parent, children in localeparentmap.items():
            # parents without data are references to locale without data
            if locale in children and parent in localeparentvaluesmap:
                mapcopy(localeparentvaluesmap[parent], data)
    # then from main locale (non-territory) filling the blanks that even parent locales do not fill
    if not isparent:
        mapmerge(localemap[langtype], data, localedefaults)
    # find the enums from mapped values
    data['language'] = tolanguageenum(langtype)
    if not isparent and country is not None:
        countrytype = country.get('type')
        data['country'] = tocountryenum(countrytype)
    else:
        # territory often is not specified, use language code as fallback
        countrytype = langtype.upper()
    # script is specified either in the locale or supplemental data
    script = root.find('./identity/script')
    if script is not None:
        scripttype = script.get('type')
    elif not isparent and langtype in localescriptmap:
        # scripts map is partial, pick from what is mapped
        scriptterritories = localescriptmap[langtype]['territories']
        if 'AnyTerritory' in scriptterritories or countrytype in scriptterritories:
            scripttype = localescriptmap[langtype]['script']
    defaultnumbersystem = root.find('./numbers/defaultNumberingSystem')
    if defaultnumbersystem is not None:
        numbertype = defaultnumbersystem.text
    # find values from supplemental maps
    if not isparent and scripttype:
        data['script'] = toscriptenum(scripttype)
    supplementalmaps = (
        ('first_day_of_week', localefirstdaymap),
        ('weekend_start', localeweekendstartmap),
        ('weekend_end', localeweekendendmap),
    )
    for field, daymap in supplementalmaps:
        # all days are checked, if the territory is listed by several the last one is used
        for dayenum, territories in daymap.items():
            if countrytype in territories:
                data[field] = dayenum
    # find from locale data
    _readnumbersymbols(root, numbertype, data)
    _readcalendar(root, data)
# read parent locales first. the files are read in sorted order because parent locales get
# values from the parent locales that were read before them and glob does not return the files
# in any particular order, the result would depend on the order otherwise
for xml in sorted(glob.glob('common/main/*.xml')):
    xmlbase = os.path.basename(xml)
    xmlbase = xmlbase.replace('.xml', '')
    if xmlbase not in localeparentmap:
        continue
    readlocale(xml, localeparentvaluesmap, True)
# now everything including those
for xml in sorted(glob.glob('common/main/*.xml')):
    if os.path.basename(xml) == 'root.xml':
        # root is not actual locale
        continue
    readlocale(xml, localemap, False)
print('''static const QLocalePrivate localeTbl[] = {''')
# print C first
printlocaledata(localemap, 'C')
# now everything except that
for key in sorted(localemap.keys()):
    if key == 'C':
        continue
    printlocaledata(localemap, key)
print('};')
print('static const qint16 localeTblSize = sizeof(localeTbl) / sizeof(QLocalePrivate);\n')
# likely subtags parsing
tree = ET.parse('common/supplemental/likelySubtags.xml')
root = tree.getroot()
for likelysubtag in root.findall('./likelySubtags/likelySubtag'):
    likelysubtagfrom = likelysubtag.get('from')
    likelysubtagto = likelysubtag.get('to')
    # split code into language, script and country to make it possible to match against regardless
    # of separators and remap to enums so that it is possible to substitute both named and enumed
    # locale searches at the cost of not covering all named cases
    fromparts = localeregex.findall(likelysubtagfrom)
    toparts = localeregex.findall(likelysubtagto)
    frompartscount = len(fromparts)
    likelyfrom = (None, None, None)
    if frompartscount == 1:
        # language only
        likelyfrom = (
            tolanguageenum(fromparts[0]),
            'QLocale::Script::AnyScript',
            'QLocale::Country::AnyCountry',
        )
    elif frompartscount == 2:
        # the second part is looked up as country
        likelyfrom = (
            tolanguageenum(fromparts[0]),
            'QLocale::Script::AnyScript',
            tocountryenum(fromparts[1]),
        )
    elif frompartscount == 3:
        likelyfrom = (
            tolanguageenum(fromparts[0]),
            toscriptenum(fromparts[1]),
            tocountryenum(fromparts[2]),
        )
    elif frompartscount > 3 or len(toparts) > 3:
        # the regular expression is intentionally greedy, if there are more than 3 group matches
        # then it is likely a variant and that is not supported case yet
        print(likelysubtagfrom, fromparts)
        print(likelysubtagto, toparts)
        sys.exit(1)
    likelyto = (
        tolanguageenum(toparts[0]),
        toscriptenum(toparts[1]),
        tocountryenum(toparts[2]),
    )
    if not all(likelyfrom + likelyto):
        # if there are no enums for the codes skip the entry
        continue
    likelysubtagsmap[likelysubtagfrom] = {
        'fromlanguage' : likelyfrom[0],
        'fromscript' : likelyfrom[1],
        'fromcountry' : likelyfrom[2],
        'tolanguage' : likelyto[0],
        'toscript' : likelyto[1],
        'tocountry' : likelyto[2],
    }
print('''static const struct subtagAliasTblData {
const QLocale::Language fromlanguage;
const QLocale::Script fromscript;
const QLocale::Country fromcountry;
const QLocale::Language tolanguage;
const QLocale::Script toscript;
const QLocale::Country tocountry;
} subtagAliasTbl[] = {''')
# the order matches the members of the struct above
subtagfields = ('fromlanguage', 'fromscript', 'fromcountry', 'tolanguage', 'toscript', 'tocountry')
for key in sorted(likelysubtagsmap):
    value = likelysubtagsmap[key]
    print(''' {
%s, %s, %s,
%s, %s, %s
},''' % tuple(value[field] for field in subtagfields))
print('};')
print('static const qint16 subtagAliasTblSize = sizeof(subtagAliasTbl) / sizeof(subtagAliasTblData);\n')
# language alias parsing
tree = ET.parse('common/supplemental/supplementalMetadata.xml')
root = tree.getroot()
for languagealias in root.findall('./metadata/alias/languageAlias'):
    languagealiastype = languagealias.get('type')
    languagealiasreplacement = languagealias.get('replacement')
    if any(separator in code
           for code in (languagealiastype, languagealiasreplacement)
           for separator in ('_', '-')):
        # if either the original or the substitute is BCP47 code (language and script/country
        # included) skip it because QLocalePrivate::codeToLanguage() should be dealing with
        # language codes only
        continue
    languagealiasmap[languagealiastype] = languagealiasreplacement
printaliastable(languagealiasmap, 'language')
# country alias parsing, then script alias parsing. all of the entries are used as they are
aliastables = (
    ('./metadata/alias/territoryAlias', countryaliasmap, 'country'),
    ('./metadata/alias/scriptAlias', scriptaliasmap, 'script'),
)
for aliaspath, aliasmap, aliasprefix in aliastables:
    for alias in root.findall(aliaspath):
        aliasmap[alias.get('type')] = alias.get('replacement')
    printaliastable(aliasmap, aliasprefix)