2014-11-13 01:04:59 +02:00
|
|
|
/* This file is part of the KDE libraries
|
|
|
|
Copyright (C) 1999 Lars Knoll (knoll@kde.org)
|
|
|
|
Copyright (C) 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org>
|
|
|
|
Copyright (C) 2007 Nick Shaforostoff <shafff@ukr.net>
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Library General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Library General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Library General Public License
|
|
|
|
along with this library; see the file COPYING.LIB. If not, write to
|
|
|
|
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
|
|
|
Boston, MA 02110-1301, USA.
|
|
|
|
*/
|
|
|
|
#include "kcharsets.h"
|
|
|
|
|
|
|
|
#include "kfilterdev.h"
|
2015-09-04 01:57:00 +00:00
|
|
|
#include "kentities.cpp"
|
2014-11-13 01:04:59 +02:00
|
|
|
|
|
|
|
#include "kconfig.h"
|
|
|
|
#include "kdebug.h"
|
|
|
|
#include "kglobal.h"
|
|
|
|
#include "klocale.h"
|
|
|
|
|
|
|
|
#include <QtCore/QDir>
|
|
|
|
#include <QtCore/QRegExp>
|
2015-08-11 05:56:07 +03:00
|
|
|
#include <QtCore/qstring.h>
|
|
|
|
#include <QtCore/qstringlist.h>
|
2014-11-13 01:04:59 +02:00
|
|
|
#include <QtCore/QTextCodec>
|
|
|
|
|
|
|
|
#include <assert.h>
|
|
|
|
#include <QHash>
|
|
|
|
|
2019-11-22 20:18:41 +00:00
|
|
|
// Name of the catch-all group used for encodings that cannot be split into "group" and "set".
static const QLatin1String kOtherEncoding("Other");
|
|
|
|
|
|
|
|
static void splitEncoding(const QByteArray &encoding, QString &group, QString &set) {
|
2019-11-24 23:33:23 +00:00
|
|
|
int separatorindex = 0;
|
|
|
|
const char *data = encoding.constData();
|
|
|
|
for (int i = 0; i < encoding.size(); i++) {
|
|
|
|
if (data[i] == ' ' || data[i] == '-' || data[i] == '_') {
|
|
|
|
separatorindex = i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (separatorindex > 1) {
|
|
|
|
group = QString::fromLatin1(encoding.mid(0, separatorindex));
|
|
|
|
set = QString::fromLatin1(encoding.mid(separatorindex + 1, encoding.size() - separatorindex - 1));
|
2019-11-22 20:18:41 +00:00
|
|
|
} else {
|
|
|
|
group = kOtherEncoding;
|
|
|
|
set = QString::fromLatin1(encoding);
|
|
|
|
}
|
2014-11-13 01:04:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
class KCharsetsPrivate
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
KCharsetsPrivate(KCharsets* _kc)
|
|
|
|
{
|
|
|
|
kc = _kc;
|
|
|
|
codecForNameDict.reserve( 43 );
|
|
|
|
}
|
|
|
|
// Hash for the encoding names (sensitive case)
|
|
|
|
QHash<QByteArray,QTextCodec*> codecForNameDict;
|
|
|
|
KCharsets* kc;
|
|
|
|
|
|
|
|
//Cache list so QStrings can be implicitly shared
|
|
|
|
QList<QStringList> encodingsByScript;
|
|
|
|
};
|
|
|
|
|
|
|
|
// --------------------------------------------------------------------------
|
|
|
|
|
|
|
|
// Creates the charsets object together with its private data.
KCharsets::KCharsets()
    : d(new KCharsetsPrivate(this))
{
}
|
|
|
|
|
|
|
|
// Releases the private data allocated by the constructor.
KCharsets::~KCharsets()
{
    delete d;
}
|
|
|
|
|
|
|
|
// Converts an entity to the character it denotes.
//
// Accepted forms are a decimal reference ("#38"), a hexadecimal reference
// ("#x26" or "#X26"), both optionally preceded by '&', and an entity name
// that is looked up with EntitiesHash::kde_findEntity().
//
// Returns a null QChar when the string is empty, when the number cannot be
// parsed (this includes a trailing ';'), when the number does not fit into a
// single QChar (outside 0..0xFFFF) or when the name is unknown.
QChar KCharsets::fromEntity(const QString &str)
{
    if ( str.isEmpty() )
        return QChar();

    int pos = 0;
    if ( str.at(0) == QLatin1Char('&') )
        pos++;

    // Check for a '&#000' or '&#x0000' sequence.
    // The length is tested before the character so that a lone "&" does not
    // index past the end of the string.
    if ( str.length() - pos > 1 && str.at(pos) == QLatin1Char('#') ) {
        pos++;

        int base = 10; // '&#000', decimal character reference
        if ( str.at(pos) == QLatin1Char('x') || str.at(pos) == QLatin1Char('X') ) {
            // '&#x0000', hexadecimal character reference
            base = 16;
            pos++;
        }

        bool ok = false;
        const int code = str.mid( pos ).toInt( &ok, base );

        // A QChar holds 16 bits only; values outside of that range would be
        // truncated to an unrelated character, so they are rejected instead.
        if ( !ok || code < 0 || code > 0xFFFF )
            return QChar();

        return QChar( code );
    }

    // Named entity. The complete string is looked up, exactly as passed in.
    // NOTE(review): a leading '&' is not stripped here, unlike for the numeric
    // forms above - confirm whether kde_findEntity() expects the bare name.
    const QByteArray raw( str.toLatin1() );
    const entity *e = EntitiesHash::kde_findEntity( raw, raw.length() );
    if ( !e )
        return QChar();

    return QChar( e->code );
}
|
|
|
|
|
|
|
|
// Finds the longest prefix of str (at most 8 characters) that
// fromEntity(const QString &) can convert, and returns its character.
//
// On return len is the length of the prefix that matched, or 0 when no
// prefix matched, in which case a null QChar is returned.
QChar KCharsets::fromEntity(const QString &str, int &len)
{
    // entities are never longer than 8 chars... we start from
    // that length and work backwards. The start is clamped to the string
    // length so that len never exceeds the number of characters available
    // and the same prefix is not looked up more than once.
    len = qMin( 8, str.length() );
    while ( len > 0 )
    {
        const QChar res = fromEntity( str.left( len ) );
        if ( !res.isNull() )
            return res;
        len--;
    }
    return QChar();
}
|
|
|
|
|
|
|
|
|
|
|
|
// Returns ch written as a hexadecimal numeric character reference,
// for example "&#x26;" for '&'.
//
// The prefix is spelled out as "&#x" so that the result has the
// hexadecimal form that fromEntity() parses.
QString KCharsets::toEntity(const QChar &ch)
{
    return QLatin1String("&#x") + QString::number( ch.unicode(), 16 ) + QLatin1Char(';');
}
|
|
|
|
|
|
|
|
// Returns a copy of input in which every "&...;" sequence that
// fromEntity() can convert is replaced by the resulting character.
// Sequences that cannot be converted are left untouched.
QString KCharsets::resolveEntities( const QString &input )
{
    QString text = input;

    // Index of the most recent '&' that has not been closed by a ';' yet,
    // or -1 when there is none.
    int pendingAmpersand = -1;

    for ( int i = 0; i < text.length(); ++i ) {
        const QChar ch = text.at( i );

        if ( ch == QLatin1Char('&') ) {
            pendingAmpersand = i;
            continue;
        }

        if ( pendingAmpersand < 0 || ch != QLatin1Char(';') )
            continue;

        // A ';' closes the pending '&', whether or not the entity resolves.
        const int ampersandPos = pendingAmpersand;
        pendingAmpersand = -1;
        assert( ampersandPos >= 0 );

        const int entityBegin = ampersandPos + 1;
        const int entityLength = i - entityBegin;
        if ( entityLength == 0 )
            continue;

        const QChar entityValue = KCharsets::fromEntity( text.mid( entityBegin, entityLength ) );
        if ( entityValue.isNull() )
            continue;

        // Put the character in place of the '&' and drop the name and the ';'.
        text[ ampersandPos ] = entityValue;
        text.remove( ampersandPos + 1, entityLength + 1 );

        // Resume right after the replaced character, it is not scanned again.
        i = ampersandPos;
    }

    return text;
}
|
|
|
|
|
|
|
|
// Returns the names of all codecs reported by QTextCodec::availableCodecs(),
// sorted.
QStringList KCharsets::availableEncodingNames() const
{
    QStringList names;
    foreach (const QByteArray &codecName, QTextCodec::availableCodecs()) {
        names += QString::fromLatin1( codecName );
    }

    names.sort();
    return names;
}
|
|
|
|
|
|
|
|
|
|
|
|
// Returns a translated, human readable description of encoding:
// "group ( set )" when the name can be split by splitEncoding(),
// "Other encoding (name)" otherwise.
QString KCharsets::descriptionForEncoding( const QString& encoding ) const
{
    QString charset;
    QString variant;
    splitEncoding(encoding.toUtf8(), charset, variant);

    if ( charset == kOtherEncoding ) {
        return i18nc( "@item", "Other encoding (%1)", encoding );
    }

    return i18nc( "@item %1 character set, %2 encoding", "%1 ( %2 )", charset, variant );
}
|
|
|
|
|
|
|
|
// Extracts the encoding name from a descriptive name, i.e. the text inside
// the last pair of parentheses ("Western European ( ISO 8859-1 )" gives
// "ISO 8859-1").
//
// Without an opening parenthesis the whole string is assumed to be an
// encoding name. Without a closing parenthesis everything after the last
// '(' is used. The result has no leading or trailing whitespace in any case.
QString KCharsets::encodingForName( const QString &descriptiveName ) const
{
    const int left = descriptiveName.lastIndexOf( QLatin1Char('(') );
    if ( left < 0 ) // No parenthesis, so assume it is a normal encoding name
        return descriptiveName.trimmed();

    const QString name( descriptiveName.mid( left + 1 ) );

    const int right = name.lastIndexOf( QLatin1Char(')') );
    if ( right < 0 ) // Unterminated parenthesis, trimmed like the other cases
        return name.trimmed();

    return name.left( right ).trimmed();
}
|
|
|
|
|
|
|
|
// Returns a sorted list with one translated "group ( set )" description for
// each codec reported by QTextCodec::availableCodecs().
QStringList KCharsets::descriptiveEncodingNames() const
{
    QStringList descriptions;
    foreach (const QByteArray &codecName, QTextCodec::availableCodecs()) {
        QString charset;
        QString variant;
        splitEncoding(codecName, charset, variant);

        const QString description = i18nc( "@item Text encoding: %1 character set, %2 encoding",
                                           "%1 ( %2 )", charset, variant );
        descriptions += description;
    }

    descriptions.sort();
    return descriptions;
}
|
|
|
|
|
|
|
|
// Returns the available encodings grouped by the first part of their name
// (see splitEncoding()). Every entry is a list holding the group name
// followed by the encodings of that group. Groups that would contain a
// single encoding are dissolved and their encoding is listed in the
// kOtherEncoding group instead.
//
// The result is computed on the first call and cached in the private data.
QList<QStringList> KCharsets::encodingsByScript() const
{
    if (!d->encodingsByScript.isEmpty())
        return d->encodingsByScript;

    // First pass: bucket the encodings by group name, compared
    // case-insensitively. The spelling seen first names the group.
    QList<QStringList> groups;
    foreach (const QByteArray &encoding, QTextCodec::availableCodecs()) {
        QString group;
        QString set;
        splitEncoding(encoding, group, set);

        const QString encodingstring = QString::fromLatin1(encoding);
        const QString groupkey = group.toLower();

        int i = 0;
        for (; i < groups.size(); i++) {
            if (groups.at(i).at(0).toLower() == groupkey) {
                groups[i].append(encodingstring);
                break;
            }
        }
        if (i == groups.size()) {
            groups.append(QStringList() << group << encodingstring);
        }
    }

    // Second pass: remove groups with only one entry and move their entry to
    // the Other group. The Other group itself is never dissolved; dissolving
    // it while other entries are being moved into it would list its entry
    // twice.
    QList<QStringList> result;
    QStringList singles;
    foreach (const QStringList &list, groups) {
        if (list.size() == 2 && list.at(0) != kOtherEncoding) {
            singles.append(list.at(1));
        } else {
            result.append(list);
        }
    }

    if (!singles.isEmpty()) {
        int i = 0;
        for (; i < result.size(); i++) {
            if (result.at(i).at(0) == kOtherEncoding) {
                result[i] += singles;
                break;
            }
        }
        if (i == result.size()) {
            // there was no Other group so far, create it at the end
            result.append(QStringList() << kOtherEncoding << singles);
        }
    }

    d->encodingsByScript = result;
    return d->encodingsByScript;
}
|
|
|
|
|
|
|
|
// Returns the codec for the encoding name n.
// "gb2312" and "gbk" are mapped to the "gb18030" codec; when no codec can be
// found for the name, the "iso-8859-1" codec is returned.
QTextCodec* KCharsets::codecForName(const QString &n) const
{
    // Same lookup as the two-argument overload, the status is simply dropped.
    bool found;
    return codecForName( n, found );
}
|
|
|
|
|
|
|
|
// Returns the codec for the encoding name n and reports in ok whether the
// name could be resolved.
// "gb2312" and "gbk" are mapped to the "gb18030" codec (ok is true).
// When codecForNameOrNull() finds nothing, ok is false and the
// "iso-8859-1" codec is returned.
QTextCodec* KCharsets::codecForName(const QString &n, bool &ok) const
{
    if (n == QLatin1String("gb2312") || n == QLatin1String("gbk")) {
        ok = true;
        return QTextCodec::codecForName( "gb18030" );
    }

    QTextCodec* codec = codecForNameOrNull( n.toLatin1() );
    ok = ( codec != 0 );
    if ( !ok ) {
        return QTextCodec::codecForName( "iso-8859-1" );
    }
    return codec;
}
|
|
|
|
|
|
|
|
// Returns the codec for the encoding name n, or 0 when none can be found.
//
// An empty name stands for the codec of the KDE locale. Results are cached
// in the private hash, keyed by the name as it was passed in. If
// QTextCodec::codecForName() does not know the name, it is retried in lower
// case with a trailing "_charset" and a leading "x-" removed.
QTextCodec *KCharsets::codecForNameOrNull( const QByteArray& n ) const
{
    typedef QHash<QByteArray,QTextCodec*>::const_iterator CodecIterator;

    if (n.isEmpty()) {
        // No name, assume locale (KDE's, not Qt's)
        const QByteArray localeKey = "->locale<-";
        const CodecIterator cachedLocale = d->codecForNameDict.constFind( localeKey );
        if ( cachedLocale != d->codecForNameDict.constEnd() ) {
            return cachedLocale.value();
        }
        QTextCodec* localeCodec = KGlobal::locale()->codecForEncoding();
        d->codecForNameDict.insert( localeKey, localeCodec );
        return localeCodec;
    }

    // For a non-empty name, look it up in the dictionary, in a case-sensitive way.
    const CodecIterator cached = d->codecForNameDict.constFind( n );
    if ( cached != d->codecForNameDict.constEnd() ) {
        return cached.value();
    }

    // If the name is not in the hash table, call QTextCodec::codecForName directly.
    // We assume that QTextCodec is smarter and more maintained than this code.
    if ( QTextCodec* direct = QTextCodec::codecForName( n ) ) {
        d->codecForNameDict.insert( n, direct );
        return direct;
    }

    // We have had no luck with QTextCodec::codecForName, so we must now process
    // the name, so that QTextCodec::codecForName could work with it.
    QByteArray name = n.toLower();
    bool changed = false;
    if (name.endsWith("_charset")) { // krazy:exclude=strings
        name.chop( 8 );
        changed = true;
    }
    if ( name.startsWith( "x-" ) ) { // krazy:exclude=strings
        name.remove( 0, 2 ); // remove x- at start
        changed = true;
    }

    // Nothing is left of the name, or it is still the name that has already
    // failed above: there is no codec to return.
    if ( name.isEmpty() || !changed ) {
        return 0;
    }

    // We only need to check changed names. The result is cached under the
    // original name; a failed lookup is not cached.
    QTextCodec* codec = QTextCodec::codecForName( name );
    if ( codec ) {
        d->codecForNameDict.insert( n, codec );
    }
    return codec;
}
|