kde-extraapps/lokalize/scripts/xliffmerge.py
2014-12-07 23:48:52 +00:00

384 lines
14 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#python scripts/xliffmerge.py -i tests/testxliffmerge_tr.xlf -t tests/testxliffmerge_en.xlf -o test_merged.xlf
#TODO: mark as 'needs adaptation' when only punctuation is changed
#check id's change after document update
from PyQt4.QtCore import *
from PyQt4.QtXml import *
import itertools
import math
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-I", "--sticky-ids", dest="stickyIds", default=False, action="store_true",
help="mark translations as needing review if IDs didn't match")
parser.add_option("-i", "--input", dest="oldFile",
help="file with translations")
parser.add_option("-t", "--template", dest="templateFile",
help="new template file")
parser.add_option("-o", "--output", dest="outFile",
help="where to store merged file")
(options, args) = parser.parse_args()
def saveElement(elem):
contents=QString()
stream=QTextStream(contents)
elem.save(stream,0)
return contents
def elementText(parent):
contents=QString()
n=parent.firstChild()
while not n.isNull():
if n.isCharacterData():
contents+=n.toCharacterData().data()
elif n.isElement():
contents+=elementText(n)
n=n.nextSibling()
return contents
strings={}
def getDocUnitsList(path):
doc=QDomDocument()
file=QFile(path)
file.open(QIODevice.ReadOnly)
reader=QXmlSimpleReader()
reader.setFeature('http://qtsoftware.com/xml/features/report-whitespace-only-CharData',True)
reader.setFeature('http://xml.org/sax/features/namespaces',False)
source=QXmlInputSource(file)
doc.setContent(source,reader)
file.close()
units=doc.elementsByTagName("trans-unit")
unitsList=[]
for i in range(units.count()):
unit=units.at(i)
if unit.firstChildElement("source").text() not in strings:
strings[unit.firstChildElement("source").text()]=len(strings)
unitsList.append(strings[unit.firstChildElement("source").text()])
return (doc, units, unitsList)
(templateDoc, templateUnits, templateUnitsList)=getDocUnitsList(options.templateFile)
(oldDoc, oldUnits, oldUnitsList)=getDocUnitsList(options.oldFile)
freezedOldUnits=[]
for i in range(oldUnits.size()):
freezedOldUnits.append(oldUnits.at(i))
def lcs_length(xs, ys):
ny = len(ys)
curr = list(itertools.repeat(0, 1 + ny))
for x in xs:
prev = list(curr)
for i, y in enumerate(ys):
if x == y:
curr[i+1] = prev[i] + 1
else:
curr[i+1] = max(curr[i], prev[i+1])
return curr[ny]
def LCS(X, Y):
m = len(X)
n = len(Y)
# An (m+1) times (n+1) matrix
C = [[0] * (n+1) for i in range(m+1)]
for i in range(1, m+1):
for j in range(1, n+1):
if X[i-1] == Y[j-1]:
C[i][j] = C[i-1][j-1] + 1
else:
C[i][j] = max(C[i][j-1], C[i-1][j])
return C
removedUnits=[]
def recordRemoved(C, X, Y, i, j):
if i > 0 and j > 0 and X[i-1] == Y[j-1]:
recordRemoved(C, X, Y, i-1, j-1)
else:
C[i-1][j]
if j > 0 and (i == 0 or C[i][j-1] >= C[i-1][j]):
recordRemoved(C, X, Y, i, j-1)
elif i > 0 and (j == 0 or C[i][j-1] < C[i-1][j]):
recordRemoved(C, X, Y, i-1, j)
removedUnits.append(i-1)
def inlineTags(parent):
result=[]
elem=parent.firstChildElement()
while not elem.isNull():
result.append(elem.tagName())
elem=elem.nextSiblingElement()
return result
def getIdsMap(parent):
result={}
elem=parent.firstChildElement()
while not elem.isNull():
result[elem.attribute('id')]=elem
elem=elem.nextSiblingElement()
return result
def removeAttributes(elem):
for attrNode in [elem.attributes().item(i) for i in range(elem.attributes().size())]:
elem.removeChild(attrNode)
def cloneToAltTrans(unitAltToBe, newUnit):
altUnit=unitAltToBe.cloneNode().toElement()
altUnit.setTagName('alt-trans')
altUnit.setAttribute('alttranstype','previous-version')
altUnit.setAttribute('phase-name',phaseName)
altUnit.removeAttribute('id')
altUnit.removeAttribute('approved')
refNode=newUnit.firstChildElement('alt-trans')
if refNode.isNull(): altUnit=newUnit.appendChild(altUnit)
else: altUnit=newUnit.insertBefore(altUnit,refNode)
subAltUnits=[]
subAltUnit=altUnit.firstChildElement('alt-trans')
refNode=altUnit
while not subAltUnit.isNull():
refNode=altUnit.parentNode().insertAfter(subAltUnit.cloneNode(),refNode)
subAltUnits.append(subAltUnit)
subAltUnit=subAltUnit.nextSiblingElement('alt-trans')
for subAltUnit in subAltUnits:
altUnit.removeChild(subAltUnit)
return altUnit
INLINE_MARKUP_ELEMENTS=['g', 'x', 'bx', 'ex', 'bpt' , 'ept', 'ph', 'it'] #, 'mrk' -- doesn't have id attribute
globals()['recentlyRemoved']=[]
globals()['lastCommon']=-1
def merge(C, X, Y, i, j):
if i > 0 and j > 0 and X[i-1] == Y[j-1]:
merge(C, X, Y, i-1, j-1)
globals()['recentlyRemoved']=[]
globals()['lastCommon']=i-1
templateUnit=templateUnits.at(j-1).toElement()
templateSource=templateUnit.firstChildElement("source")
commonUnit=freezedOldUnits[i-1].toElement()
commonTarget=commonUnit.firstChildElement("target")
commonSource=commonUnit.firstChildElement("source")
targetIdsMap=getIdsMap(commonTarget)
equalIds=False
# [only] inline markup differs?
completelyEqual=saveElement(commonSource)==saveElement(templateSource)
if not completelyEqual:
altUnit=cloneToAltTrans(commonUnit,commonUnit)
commonTarget.setAttribute('state','needs-review-adaptation')
commonUnit.setAttribute('phase-name',phaseName)
commonTarget.setAttribute('phase-name',phaseName)
#print ' '+templateSource.text()
# update inline markup attributes in target
for markupElement in INLINE_MARKUP_ELEMENTS:
templateElem=templateSource.firstChildElement(markupElement)
commonSourceElem=commonSource.firstChildElement(markupElement)
while not templateElem.isNull() and not commonSourceElem.isNull():
equalIds=equalIds and commonSourceElem.attribute('id')==templateElem.attribute('id')
if targetIdsMap.has_key(commonSourceElem.attribute('id')):
commonTargetElem=targetIdsMap[commonSourceElem.attribute('id')]
removeAttributes(commonTargetElem)
for attrNode in [templateElem.attributes().item(i).toAttr() for i in range(templateElem.attributes().size())]:
commonTargetElem.setAttribute(attrNode.name(), attrNode.value())
del targetIdsMap[commonSourceElem.attribute('id')]
templateElem=templateElem.nextSiblingElement(markupElement)
commonSourceElem=commonSourceElem.nextSiblingElement(markupElement)
# remove inline markup in target which doesn't have corresponding markup in source
for orphan in targetIdsMap.itervalues():
if orphan.tagName()=='mrk': continue
removeAttributes(orphan)
child=orphan.firstChild()
while not child.isNull():
newChild=child.cloneNode()
orphan.parentNode().insertAfter(newChild,orphan.previousSibling())
child=child.nextSibling()
orphan.parentNode().removeChild(orphan)
#copy templates source entirely
commonUnit.insertAfter(oldDoc.importNode(templateSource.cloneNode(),True), commonSource)
commonUnit.removeChild(commonSource)
#ids
equalIds=equalIds and commonUnit.attribute('id')==templateUnit.attribute('id')
if not equalIds and options.stickyIds:
commonUnit.removeAttribute('approved')
#if not commonTarget.attribute('state').contains('review'):
commonTarget.setAttribute('state','needs-review-l10n')
if not commonTarget.hasChildNodes(): commonTarget.parentNode().removeChild(commonTarget)
#if equalIds and completelyEqual:
#commonTarget.setAttribute('state-qualifier','id-match')
commonUnit.setAttribute('id',templateUnit.attribute('id'))
else:
if j > 0 and (i == 0 or C[i][j-1] >= C[i-1][j]):
merge(C, X, Y, i, j-1)
templateUnit=templateUnits.at(j-1)
#print '+'+templateUnit.firstChildElement("source").text()
newUnit=oldDoc.importNode(templateUnit, True).toElement()
newUnit.setAttribute('phase-name',phaseName)
if globals()['lastCommon']==-1:
refNode=freezedOldUnits[0]
refNode.parentNode().insertBefore(newUnit, refNode)
else:
refNode=freezedOldUnits[ globals()['lastCommon'] ]
refNode.parentNode().insertAfter(newUnit, refNode)
globals()['lastCommon']=i-1 #to preserve order
#look for alternate translations, neighbourhood first
#nonRecentlyRemoved=[x for x in removedUnits if x not in globals()['recentlyRemoved']]
maxUnits=[]
newUnitText=newUnit.firstChildElement("source").text()
newUnitWords=newUnitText.split(' ')
maxScore=0
scores={}
for x in removedUnits:
remNode=freezedOldUnits[x]
remNodeText=remNode.firstChildElement("source").text()
commonWordLen=lcs_length(newUnitWords,remNodeText.split(' '))
if (commonWordLen+1)<0.5*len(newUnitWords):
scores[x]=0
continue
commonLen=lcs_length(newUnitText,remNodeText)
remLen=newUnitText.size()-commonLen
addLen=remNodeText.size()-commonLen
if commonLen==0: score=0
else: score=99*math.exp(0.2*math.log(1.0*commonLen/newUnitText.size())) / (math.exp(0.015*addLen)*math.exp(0.01*remLen))
scores[x]=score
if maxScore<score:maxScore=score
if maxScore<80: return
for x in removedUnits:
if scores[x]==maxScore:
remNode=freezedOldUnits[x]
maxUnits.append((score+1*(x in globals()['recentlyRemoved']), remNode))
def count_compare_inverted(x, y): return int(y[0]-x[0])
maxUnits.sort(count_compare_inverted)
for maxUnit in maxUnits:
#print maxUnit[0],
#print newUnitText,
#print '------------',
#print maxUnit[1].firstChildElement("source").text()
cloneToAltTrans(maxUnit[1],newUnit)
elif i > 0 and (j == 0 or C[i][j-1] < C[i-1][j]):
merge(C, X, Y, i-1, j)
globals()['recentlyRemoved'].append(i-1)
#print '-'+elementText(freezedOldUnits[i-1].toElement().firstChildElement("source"))
def addPhase():
VERSION='0.1'
file=oldDoc.elementsByTagName("file").at(0).toElement()
header=file.firstChildElement("header")
phasegroup=header.firstChildElement("phase-group")
if phasegroup.isNull():
phasegroup=oldDoc.createElement("phase-group")
#order following XLIFF spec
skl=header.firstChildElement("skl")
if not skl.isNull(): header.insertAfter(phasegroup, skl)
else: header.insertBefore(phasegroup, header.firstChildElement())
phaseNames={}
phaseElem=phasegroup.firstChildElement("phase")
while not phaseElem.isNull():
phaseNames[phaseElem.attribute("phase-name")]=True
phaseElem=phaseElem.nextSiblingElement("phase")
i=1
while 'update-from-template-%d' % i in phaseNames:
i+=1
phaseElem=phasegroup.appendChild(oldDoc.createElement("phase")).toElement()
phaseElem.setAttribute("phase-name",'update-from-template-%d' % i)
phaseElem.setAttribute("process-name", 'update-from-template')
phaseElem.setAttribute("tool-id", 'xliffmerge-%s' % VERSION)
phaseElem.setAttribute("date", QDate.currentDate().toString(Qt.ISODate))
toolElem=header.firstChildElement("tool")
while not toolElem.isNull() and toolElem.attribute("tool-id")!='xliffmerge-%s' % VERSION:
toolElem=toolElem.nextSiblingElement("tool")
if toolElem.isNull():
toolElem=header.appendChild(oldDoc.createElement("tool")).toElement()
toolElem.setAttribute("tool-id",'xliffmerge-%s' % VERSION)
toolElem.setAttribute("tool-name","xliffmerge.py")
toolElem.setAttribute("tool-version",VERSION)
return 'update-from-template-%d' % i
phaseName=addPhase()
C = LCS(oldUnitsList, templateUnitsList)
recordRemoved(C, oldUnitsList, templateUnitsList, len(oldUnitsList), len(templateUnitsList))
merge(C, oldUnitsList, templateUnitsList, len(oldUnitsList), len(templateUnitsList))
for remNode in [freezedOldUnits[x] for x in removedUnits]:
remNode.parentNode().removeChild(remNode)
def fixWhiteSpace(elem):
first=elem.firstChildElement()
if not first.previousSibling().isCharacterData():
elem.insertBefore(oldDoc.createTextNode(''),first)
n=first
while not n.isNull():
if not n.nextSibling().isCharacterData():
elem.insertAfter(oldDoc.createTextNode(''),n)
n=n.nextSiblingElement()
def fixWhiteSpaceInList(nodeList):
for node in [nodeList.at(x) for x in range(nodeList.size())]:
fixWhiteSpace(node)
containers=["source", "seg-source","target","g","bpt","ept","ph","it","mrk"] #immediate containers allowing markup
for container in containers:
fixWhiteSpaceInList(oldDoc.elementsByTagName(container))
file=QFile(options.outFile)
file.open(QIODevice.WriteOnly)
stream=QTextStream(file)
oldDoc.save(stream,2)
stream.flush()
file.close()