kde-l10n/sr@ijekavian/scripts/ts-pmap-compile.py

296 lines
8.5 KiB
Python

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Script that compiles Transcript property maps from text to binary format.
# Binary format greatly speeds up loading of property maps at runtime.
# http://techbase.kde.org/Localization/Concepts/Transcript
#
# Usage:
# ts-pmap-compile.py file.pmap file.pmapc
#
# Works with Python >= 2.6 and >= 3.0.
import locale
import os
import re
import struct
import sys
# Name under which the script was invoked, used to prefix messages.
cmdname = os.path.basename(sys.argv[0])
# Preferred locale encoding, needed only to encode Python 2 unicode output.
lenc = locale.getpreferredencoding()


def error (msg, code=1):
    """Report a fatal error on standard error and terminate the program.

    Parameters:
        msg: text of the error message, without the command name prefix.
        code: process exit status passed to sys.exit(); defaults to 1.

    Raises:
        SystemExit: always, carrying the given exit code.
    """
    text = "%s: error: %s\n" % (cmdname, msg)
    # On Python 3 the text stream wants str, and writing encoded bytes
    # raises TypeError. Only a Python 2 unicode object (which is not
    # an instance of str there) has to be encoded by hand.
    if not isinstance(text, str):
        text = text.encode(lenc)
    sys.stderr.write(text)
    sys.exit(code)
def count_lines (text, tolen):
    """Return the 1-based line number of offset tolen within text.

    The result is one more than the number of newline characters found
    in the first tolen characters of text.
    """
    prefix = text[:tolen]
    newlines_seen = prefix.count("\n")
    return newlines_seen + 1
def norm_keystr (text):
    """Return text normalized for use as a key.

    All whitespace characters and ampersands are removed and the
    remainder is lowercased.
    """
    # Must do the same as normKeystr() in kdelibs/kdecore/ktranscript.cpp
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.sub(r"[\s&]", "", text).lower()
def trim_smart (text):
    """Strip the blank lead-in and tail of a multi-line value.

    Removes whitespace at the very start of text up to and including
    a newline, and a newline followed only by whitespace at the very end.
    Whitespace elsewhere, including indentation of the first remaining
    line, is kept.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.sub(r"^\s*\n|\n\s*$", "", text)
def read_pmap (fname):
    """Parse a text-format property map file into a list of entries.

    Where an entry is expected, leading whitespace is skipped and text
    starting with '#' is a comment running to the end of the line.
    An entry opens with two characters, the key separator followed by
    the property separator, neither of which may be a letter. After that,
    a segment ending in the property separator is an entry key, a segment
    ending in the key separator is a property key whose value runs up to
    the next property separator, and an empty segment ending in the
    property separator closes the entry.

    Parameters:
        fname: path of the UTF-8 encoded text property map.

    Returns:
        A list of (ekeys, props) tuples in file order, where ekeys is
        the list of normalized entry keys and props is the list of
        (normalized property key, trimmed property value) tuples.

    Malformed input is reported through error(), which terminates
    the program.
    """
    # Adapted directly from C++ code.

    with open(fname, "rb") as fh:
        s = fh.read().decode("utf8")
    slen = len(s)

    pmap = []

    # An empty file is a map without entries. The scanner below always
    # looks at s[i] first, so it needs at least one character to work on.
    if slen == 0:
        return pmap

    s_nextEntry, s_nextKey, s_nextValue = 1, 2, 3

    class END_PROP_PARSE (Exception): pass

    state = s_nextEntry
    try:
        ekeys = [] # holds keys for current entry
        props = [] # holds properties for current entry
        pkey = "" # holds current property key
        i = 0
        while True:
            i_checkpoint = i

            if state == s_nextEntry:
                while s[i].isspace():
                    i += 1
                    if i >= slen: raise END_PROP_PARSE

                if s[i] != '#':
                    # An entry needs both of its separator characters.
                    if i + 1 >= slen:
                        error("unexpected end of file %s" % fname)
                    # Separator characters for this entry.
                    key_sep = s[i]
                    prop_sep = s[i + 1]
                    if key_sep.isalpha() or prop_sep.isalpha():
                        error("separator characters must not be letters "
                              "at %s:%d" % (fname, count_lines(s, i)))

                    # Reset all data for current entry.
                    ekeys = []
                    props = []
                    pkey = ""

                    i += 2
                    state = s_nextKey
                else:
                    # This is a comment, skip to EOL, don't change state.
                    while s[i] != '\n':
                        i += 1
                        if i >= slen: raise END_PROP_PARSE

            elif state == s_nextKey:
                ip = i
                # Proceed up to next key or property separator.
                while s[i] != key_sep and s[i] != prop_sep:
                    i += 1
                    if i >= slen: raise END_PROP_PARSE

                if s[i] == key_sep:
                    # This is a property key,
                    # record for when the value gets parsed.
                    pkey = norm_keystr(s[ip:i])

                    i += 1
                    state = s_nextValue
                else: # if (s[i] == prop_sep
                    # This is an entry key, or end of entry.
                    ekey = norm_keystr(s[ip:i])
                    if ekey:
                        # An entry key.
                        ekeys.append(ekey)

                        i += 1
                        state = s_nextKey
                    else:
                        # End of entry.
                        if len(ekeys) < 1:
                            error("no entry key for entry ending "
                                  "at %s:%d" % (fname, count_lines(s, i)))

                        # Put collected properties into global store.
                        pmap.append((ekeys, props))

                        i += 1
                        state = s_nextEntry

            elif state == s_nextValue:
                ip = i
                # Proceed up to next property separator.
                while s[i] != prop_sep:
                    i += 1
                    if i >= slen: raise END_PROP_PARSE
                    if s[i] == key_sep:
                        error("property separator inside property value "
                              "at %s:%d" % (fname, count_lines(s, i)))

                # Extract the property value and store the property.
                pval = trim_smart(s[ip:i])
                props.append((pkey, pval))

                i += 1
                state = s_nextKey

            else:
                error("internal error 10 "
                      "at %s:%d" % (fname, count_lines(s, i)))

            # To avoid infinite looping: every step must consume input.
            if i == i_checkpoint:
                error("internal error 20 "
                      "at %s:%d" % (fname, count_lines(s, i)))
            # To avoid stepping out: running out of input right after a
            # step is a regular end of parsing, e.g. when the last entry
            # terminator is the final character of the file. Whether the
            # input ended at a valid point is decided below from the state.
            if i >= slen:
                raise END_PROP_PARSE

    except END_PROP_PARSE:
        # Input may only end between entries.
        if state != s_nextEntry:
            error("unexpected end of file in %s" % fname)

    return pmap
def int_bin_32 (val):
    """Convert integer to 32-bit big-endian byte sequence."""
    # ">i" is a signed 32-bit big-endian integer, always 4 bytes long.
    packed = struct.pack(">i", val)
    return packed
def int_bin_64 (val):
    """Convert integer to 64-bit big-endian byte sequence."""
    # ">q" is a signed 64-bit big-endian integer, always 8 bytes long.
    packed = struct.pack(">q", val)
    return packed
def str_bin_32 (val):
    """Convert string to UTF-8 byte sequence with a length prefix.

    The result is the length of the encoded string in bytes, as 32-bit
    big-endian integer, followed by the encoded string itself.
    """
    payload = val.encode("utf8")
    length_prefix = int_bin_32(len(payload))
    return length_prefix + payload
def catb (seq):
    """Concatenate an iterable of byte sequences into one byte sequence."""
    return b"".join(seq)
def write_map_bin_00 (fh, pmap):
    """Write the property map in binary map format 00.

    Layout: magic bytes "TSPMAP00", the number of entries, then for
    each entry the number of entry keys followed by the keys, and
    the number of properties followed by key/value pairs. Counts are
    32-bit big-endian integers and strings are length-prefixed UTF-8.

    Parameters:
        fh: binary file-like object providing write().
        pmap: list of (ekeys, props) tuples as returned by read_pmap().
    """
    emit = fh.write

    # Magic bytes.
    emit("TSPMAP00".encode("ascii"))

    # Number of entries.
    emit(int_bin_32(len(pmap)))

    for entry in pmap:
        ekeys, props = entry

        # Number of phrase keys and all phrase keys.
        emit(int_bin_32(len(ekeys)))
        for ekey in ekeys:
            emit(str_bin_32(ekey))

        # Number of properties and all properties.
        emit(int_bin_32(len(props)))
        for pkey, pval in props:
            emit(str_bin_32(pkey))
            emit(str_bin_32(pval))
def write_map_bin_01 (fh, pmap):
    """Write the property map in binary map format 01.

    Layout, in order:
      - magic bytes "TSPMAP01";
      - number of entry keys (32-bit), byte length of the entry keys
        section (64-bit), then every entry key as length-prefixed string
        followed by the 64-bit offset of its entry's property blob;
      - number of unique property keys (32-bit), byte length of the
        property keys section (64-bit), then the unique property keys
        in sorted order as length-prefixed strings;
      - number of entries (32-bit), then one property blob per entry:
        number of properties (32-bit), byte length of the property data
        (32-bit), and the property data as alternating length-prefixed
        keys and values.
    Offsets count bytes from the first magic byte. All integers are
    big-endian.

    Parameters:
        fh: binary file-like object providing write().
        pmap: list of (ekeys, props) tuples as returned by read_pmap().
    """
    int32_size = len(int_bin_32(0))
    int64_size = len(int_bin_64(0))

    # Magic bytes.
    magic = "TSPMAP01".encode("ascii")

    # Size of the entry keys section; each key carries a 64-bit offset
    # to the property blob of its entry.
    ekeys_size = 0
    for ekeys, _ in pmap:
        for ekey in ekeys:
            ekeys_size += len(str_bin_32(ekey)) + int64_size

    # Binary representation of all unique property keys.
    unique_pkeys = set()
    for _, props in pmap:
        for prop in props:
            unique_pkeys.add(prop[0])
    pkeys_blob = catb([str_bin_32(pkey) for pkey in sorted(unique_pkeys)])

    # Position of the first property blob: everything written before it
    # is the magic, the two counted sections (each preceded by a 32-bit
    # count and a 64-bit length), and the 32-bit number of entries.
    first_blob_offset = (len(magic)
                         + int32_size + int64_size + ekeys_size
                         + int32_size + int64_size + len(pkeys_blob)
                         + int32_size)

    # Property blobs for each entry, and the offset at which each starts.
    prop_blobs = []
    prop_offsets = []
    next_offset = first_blob_offset
    for _, props in pmap:
        data = catb([str_bin_32(field) for prop in props for field in prop])
        blob = catb([int_bin_32(len(props)), int_bin_32(len(data)), data])
        prop_blobs.append(blob)
        prop_offsets.append(next_offset)
        next_offset += len(blob)

    # Entry keys, each followed by the offset of its property blob.
    ekey_chunks = []
    ekey_count = 0
    for (ekeys, _), offset in zip(pmap, prop_offsets):
        offset_bin = int_bin_64(offset)
        for ekey in ekeys:
            ekey_chunks.append(str_bin_32(ekey) + offset_bin)
        ekey_count += len(ekeys)
    ekeys_blob = catb(ekey_chunks)
    # The offsets above were computed from the predicted section size.
    assert(ekeys_size == len(ekeys_blob))

    # Write everything out.
    fh.write(magic)
    fh.write(int_bin_32(ekey_count))
    fh.write(int_bin_64(len(ekeys_blob)))
    fh.write(ekeys_blob)
    fh.write(int_bin_32(len(unique_pkeys)))
    fh.write(int_bin_64(len(pkeys_blob)))
    fh.write(pkeys_blob)
    fh.write(int_bin_32(len(pmap)))
    for blob in prop_blobs:
        fh.write(blob)
def main ():
    """Compile the text property map named on the command line.

    Expects exactly two command line arguments, the input text map and
    the output binary map; otherwise reports usage through error().
    The input is parsed completely before the output file is opened.
    """
    if len(sys.argv) != 3:
        error("usage: %s INPUT_FILE OUTPUT_FILE" % cmdname)

    # Psyco is used when it happens to be installed and skipped otherwise.
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass

    ifile = sys.argv[1]
    ofile = sys.argv[2]

    pmap = read_pmap(ifile)
    # The with-block closes the output file even if writing fails.
    with open(ofile, "wb") as ofh:
        write_map_bin_01(ofh, pmap)


if __name__ == '__main__':
    main()