kdelibs/kdecore/localization/kcharsets.cpp

/* This file is part of the KDE libraries
    Copyright (C) 1999 Lars Knoll (knoll@kde.org)
    Copyright (C) 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org>
    Copyright (C) 2007 Nick Shaforostoff <shafff@ukr.net>

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.
*/
#include "kcharsets.h"

#include "kfilterdev.h"
#include "kentities.cpp"

#include "kconfig.h"
#include "kdebug.h"
#include "kglobal.h"
#include "klocale.h"

#include <QtCore/QDir>
#include <QtCore/QRegExp>
#include <QtCore/qstring.h>
#include <QtCore/qstringlist.h>
#include <QtCore/QTextCodec>

#include <assert.h>
#include <QHash>

/*
 * ### FIXME KDE4: the name of the encodings should mostly be uppercase
 * The names of this list are user-visible
 * Generate with generate_string_table.pl, input data:
ISO 8859-1
i18n:Western European
ISO 8859-15
i18n:Western European
ISO 8859-14
i18n:Western European
cp 1252
i18n:Western European
IBM850
i18n:Western European
ISO 8859-2
i18n:Central European
ISO 8859-3
i18n:Central European
ISO 8859-4
i18n:Baltic
ISO 8859-13
i18n:Baltic
ISO 8859-16
i18n:South-Eastern Europe
cp 1250
i18n:Central European
cp 1254
i18n:Turkish
cp 1257
i18n:Baltic
KOI8-R
i18n:Cyrillic
ISO 8859-5
i18n:Cyrillic
cp 1251
i18n:Cyrillic
KOI8-U
i18n:Cyrillic
IBM866
i18n:Cyrillic
Big5
i18n:Chinese Traditional
Big5-HKSCS
i18n:Chinese Traditional
GB18030
i18n:Chinese Simplified
GBK
i18n:Chinese Simplified
GB2312
i18n:Chinese Simplified
EUC-KR
i18n:Korean
cp 949
i18n:Korean
sjis
i18n:Japanese
jis7
i18n:Japanese
EUC-JP
i18n:Japanese
ISO 8859-7
i18n:Greek
cp 1253
i18n:Greek
ISO 8859-6
i18n:Arabic
cp 1256
i18n:Arabic
ISO 8859-8
i18n:Hebrew
ISO 8859-8-I
i18n:Hebrew
cp 1255
i18n:Hebrew
ISO 8859-9
i18n:Turkish
TIS620
i18n:Thai
ISO 8859-11
i18n:Thai
UTF-8
i18n:Unicode
UTF-16
i18n:Unicode
utf7
i18n:Unicode
ucs2
i18n:Unicode
ISO 10646-UCS-2
i18n:Unicode
winsami2
i18n:Northern Saami
windows-1258
i18n:Other
IBM874
i18n:Other
TSCII
i18n:Other
 */
/*
 * Notes about the table:
 *
 * - The following entries were disabled and removed from the table:
ibm852
i18n:Central European
pt 154
i18n:Cyrillic              // ### TODO "PT 154" seems to have been removed from Qt
 *
 * - ISO 8859-11 is the deprecated name of TIS-620
 * - utf7 is not in Qt
 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
 * - windows-1258: TODO
 * - IBM874: TODO
 * - TSCII: TODO
 */
// Pool of NUL-terminated strings: user-visible encoding names interleaved with
// the script/language description each group of encodings belongs to.  A
// description is stored only once, right after the first encoding that uses
// it; later encodings refer back to it through the index table below.
// NOTE(review): the offsets in language_for_encoding_indices only line up if
// I18N_NOOP2(context, text) expands to the bare "text" literal -- confirm
// against the klocale headers before editing this table by hand.
static const char language_for_encoding_string[] =
    "ISO 8859-1\0"
    I18N_NOOP2("@item Text character set", "Western European")"\0"
    "ISO 8859-15\0"
    "ISO 8859-14\0"
    "cp 1252\0"
    "IBM850\0"
    "ISO 8859-2\0"
    I18N_NOOP2("@item Text character set", "Central European")"\0"
    "ISO 8859-3\0"
    "ISO 8859-4\0"
    I18N_NOOP2("@item Text character set", "Baltic")"\0"
    "ISO 8859-13\0"
    "ISO 8859-16\0"
    I18N_NOOP2("@item Text character set", "South-Eastern Europe")"\0"
    "cp 1250\0"
    "cp 1254\0"
    I18N_NOOP2("@item Text character set", "Turkish")"\0"
    "cp 1257\0"
    "KOI8-R\0"
    I18N_NOOP2("@item Text character set", "Cyrillic")"\0"
    "ISO 8859-5\0"
    "cp 1251\0"
    "KOI8-U\0"
    "IBM866\0"
    "Big5\0"
    I18N_NOOP2("@item Text character set", "Chinese Traditional")"\0"
    "Big5-HKSCS\0"
    "GB18030\0"
    I18N_NOOP2("@item Text character set", "Chinese Simplified")"\0"
    "GBK\0"
    "GB2312\0"
    "EUC-KR\0"
    I18N_NOOP2("@item Text character set", "Korean")"\0"
    "cp 949\0"
    "sjis\0"
    I18N_NOOP2("@item Text character set", "Japanese")"\0"
    "jis7\0"
    "EUC-JP\0"
    "ISO 8859-7\0"
    I18N_NOOP2("@item Text character set", "Greek")"\0"
    "cp 1253\0"
    "ISO 8859-6\0"
    I18N_NOOP2("@item Text character set", "Arabic")"\0"
    "cp 1256\0"
    "ISO 8859-8\0"
    I18N_NOOP2("@item Text character set", "Hebrew")"\0"
    "ISO 8859-8-I\0"
    "cp 1255\0"
    "ISO 8859-9\0"
    "TIS620\0"
    I18N_NOOP2("@item Text character set", "Thai")"\0"
    "ISO 8859-11\0"
    "UTF-8\0"
    I18N_NOOP2("@item Text character set", "Unicode")"\0"
    "UTF-16\0"
    "utf7\0"
    "ucs2\0"
    "ISO 10646-UCS-2\0"
    "winsami2\0"
    I18N_NOOP2("@item Text character set", "Northern Saami")"\0"
    "windows-1258\0"
    I18N_NOOP2("@item Text character set", "Other")"\0"
    "IBM874\0"
    "TSCII\0"
    "\0";

// Byte offsets into language_for_encoding_string, stored as pairs:
// { offset of the encoding name, offset of its description }.
// The list is terminated by a single -1 in an "encoding name" slot.
static const int language_for_encoding_indices[] = {
       0,   11,   28,   11,   40,   11,   52,   11,
      60,   11,   67,   78,   95,   78,  106,  117,
     124,  117,  136,  148,  169,   78,  177,  185,
     193,  117,  201,  208,  217,  208,  228,  208,
     236,  208,  243,  208,  250,  255,  275,  255,
     286,  294,  313,  294,  317,  294,  324,  331,
     338,  331,  345,  350,  359,  350,  364,  350,
     371,  382,  388,  382,  396,  407,  414,  407,
     422,  433,  440,  433,  453,  433,  461,  185,
     472,  479,  484,  479,  496,  502,  510,  502,
     517,  502,  522,  502,  527,  502,  543,  552,
     567,  580,  586,  580,  593,  580,   -1
};

/*
 * defines some different names for codecs that are built into Qt.
 * The names in this list must be lower-case.
 * input data for generate_string_table.pl:
iso-ir-111
koi8-r
koi unified
koi8-r
us-ascii
iso 8859-1
usascii
iso 8859-1
ascii
iso 8859-1
unicode-1-1-utf-7
utf-7
ucs2
iso-10646-ucs-2
iso10646-1
iso-10646-ucs-2
gb18030.2000-1
gb18030
gb18030.2000-0
gb18030
gbk-0
gbk
gb2312
gbk
gb2312.1980-0
gbk
big5-0
big5
euc-kr
euckr
cp949
cp 949
euc-jp
eucjp
jisx0201.1976-0
eucjp
jisx0208.1983-0
eucjp
jisx0208.1990-0
eucjp
jisx0208.1997-0
eucjp
jisx0212.1990-0
eucjp
jisx0213.2000-1
eucjp
jisx0213.2000-2
eucjp
shift_jis
sjis
shift-jis
sjis
sjis
sjis
iso-2022-jp
jis7
windows850
ibm850
windows866
ibm866
windows-850
ibm850
windows-866
ibm866
cp-10000
apple roman
thai-tis620
iso 8859-11
windows-874
ibm874
windows874
ibm874
cp-874
ibm874
ksc5601.1987-0
euckr
ks_c_5601-1987
euckr
mac-roman
apple roman
macintosh
apple roman
mac
apple roman
csiso2022jp
iso-2022-jp
*/
/*
 * Notes about the table:
 * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set)
 * - utf7 is not in Qt
 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
 * - sjis: appears on the table for x-sjis
 * - jis7: ISO-2022-JP is now the default name in Qt4
 * - cp-874: is it really needed?
 * - mac-roman: appears on the table for x-mac-roman
 * - csiso2022jp: See bug #77243
 */
// Pool of NUL-terminated, lower-case codec names: alternative spellings and
// the names they are mapped to.  Each distinct string is stored once and is
// addressed through builtin_indices below.
static const char builtin_string[] =
    "iso-ir-111\0"
    "koi8-r\0"
    "koi unified\0"
    "us-ascii\0"
    "iso 8859-1\0"
    "usascii\0"
    "ascii\0"
    "unicode-1-1-utf-7\0"
    "utf-7\0"
    "ucs2\0"
    "iso-10646-ucs-2\0"
    "iso10646-1\0"
    "gb18030.2000-1\0"
    "gb18030\0"
    "gb18030.2000-0\0"
    "gbk-0\0"
    "gbk\0"
    "gb2312\0"
    "gb2312.1980-0\0"
    "big5-0\0"
    "big5\0"
    "euc-kr\0"
    "euckr\0"
    "cp949\0"
    "cp 949\0"
    "euc-jp\0"
    "eucjp\0"
    "jisx0201.1976-0\0"
    "jisx0208.1983-0\0"
    "jisx0208.1990-0\0"
    "jisx0208.1997-0\0"
    "jisx0212.1990-0\0"
    "jisx0213.2000-1\0"
    "jisx0213.2000-2\0"
    "shift_jis\0"
    "sjis\0"
    "shift-jis\0"
    "iso-2022-jp\0"
    "jis7\0"
    "windows850\0"
    "ibm850\0"
    "windows866\0"
    "ibm866\0"
    "windows-850\0"
    "windows-866\0"
    "cp-10000\0"
    "apple roman\0"
    "thai-tis620\0"
    "iso 8859-11\0"
    "windows-874\0"
    "ibm874\0"
    "windows874\0"
    "cp-874\0"
    "ksc5601.1987-0\0"
    "ks_c_5601-1987\0"
    "mac-roman\0"
    "macintosh\0"
    "mac\0"
    "csiso2022jp\0"
    "\0";

// Byte offsets into builtin_string, stored as pairs:
// { offset of the alternative name, offset of the name to look up instead }.
// The list is terminated by a single -1 in an "alternative name" slot.
static const int builtin_indices[] = {
       0,   11,   18,   11,   30,   39,   50,   39,
      58,   39,   64,   82,   88,   93,  109,   93,
     120,  135,  143,  135,  158,  164,  168,  164,
     175,  164,  189,  196,  201,  208,  214,  220,
     227,  234,  240,  234,  256,  234,  272,  234,
     288,  234,  304,  234,  320,  234,  336,  234,
     352,  362,  367,  362,  362,  362,  377,  389,
     394,  405,  412,  423,  430,  405,  442,  423,
     454,  463,  475,  487,  499,  511,  518,  511,
     529,  511,  536,  208,  551,  208,  566,  463,
     576,  463,  586,  463,  590,  377,   -1
};

/*
 * some last resort hints in case the charmap file couldn't be found.
 * This gives at least a partial conversion and helps making things readable.
 *
 * the name used as input here is already converted to the more canonical
 * name as defined in the aliases array.
 *
 * Input data:
cp1250
iso-8859-2
koi8-r
iso-8859-5
koi8-u
koi8-r
pt 154
windows-1251
paratype-154
windows-1251
pt-154
windows-1251
 */
/* Notes:
 * - KDE has always used "CP 1251" as the best fallback for PT 154. As Qt no longer offers PT 154, code page 1251 (windows-1251) is used as the fallback.
 */
// Pool of NUL-terminated codec names used for last-resort substitutions:
// names that may be requested and the codec names tried in their place.
// Each distinct string is stored once and addressed through
// conversion_hints_indices below.
static const char conversion_hints_string[] =
    "cp1250\0"
    "iso-8859-2\0"
    "koi8-r\0"
    "iso-8859-5\0"
    "koi8-u\0"
    "pt 154\0"
    "windows-1251\0"
    "paratype-154\0"
    "pt-154\0"
    "\0";

// Byte offsets into conversion_hints_string, stored as pairs:
// { offset of the requested name, offset of the substitute codec name }.
// The list is terminated by a single -1 in a "requested name" slot.
static const int conversion_hints_indices[] = {
       0,    7,   18,   25,   36,   18,   43,   50,
      63,   50,   76,   50,   -1
};

// Looks up `entry` in a packed string table.
//
// `indices` is a -1 terminated list of offset pairs into `start`: the first
// offset of each pair addresses a key string, the second one its value.
// Returns the value belonging to the first key that compares equal to
// `entry`, or 0 when no key matches.
static inline
const char *kcharsets_array_search(const char *start, const int *indices, const char *entry)
{
    const int *pair = indices;
    while (*pair != -1) {
        if (qstrcmp(start + pair[0], entry) == 0)
            return start + pair[1];
        pair += 2;
    }
    return 0;
}


// Private data of KCharsets.
class KCharsetsPrivate
{
public:
    KCharsetsPrivate(KCharsets* _kc)
        : kc(_kc)
    {
        // Pre-size the codec cache.
        codecForNameDict.reserve( 43 );
    }

    // Cache of codecs already looked up, keyed by the requested name
    // (the keys are compared case-sensitively)
    QHash<QByteArray,QTextCodec*> codecForNameDict;

    // The KCharsets instance this private data was created for
    KCharsets* kc;

    // Cached result of KCharsets::encodingsByScript(), kept so that the
    // QStrings it contains can be implicitly shared
    QList<QStringList> encodingsByScript;
};

// --------------------------------------------------------------------------

// Creates the private data object and hands it a pointer back to this instance.
KCharsets::KCharsets()
    :d(new KCharsetsPrivate(this))
{
}

// Destroys the private data object allocated by the constructor.
KCharsets::~KCharsets()
{
    delete d;
}

// Converts an entity to the character it stands for.
//
// str is the entity without the trailing ';'.  It may be a named entity
// ("amp"), a decimal character reference ("#38") or a hexadecimal one
// ("#x26"); one leading '&' is skipped if present.
//
// Returns the decoded character, or QChar::Null when str is empty, consists
// of a lone '&', is not a known entity, or is a numeric reference whose value
// cannot be stored in a single QChar (outside 0..0xFFFF).
QChar KCharsets::fromEntity(const QString &str)
{
    if ( str.isEmpty() )
        return QChar::Null;

    int pos = 0;
    if ( str[pos] == QLatin1Char('&') )
        pos++;

    // A lone "&" leaves nothing to decode; stop before indexing past the end.
    if ( pos >= str.length() )
        return QChar::Null;

    // Check for '&#000' or '&#x0000' sequence
    if ( str[pos] == QLatin1Char('#') && str.length() - pos > 1 ) {
        pos++;

        // Decimal by default, hexadecimal when introduced by 'x' or 'X'
        int base = 10;
        if ( str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X') ) {
            pos++;
            base = 16;
        }

        bool ok = false;
        const int code = str.mid( pos ).toInt( &ok, base );

        // Reject values a QChar cannot hold instead of silently truncating
        // them to an unrelated character.
        if ( !ok || code < 0 || code > 0xFFFF )
            return QChar::Null;
        return QChar( code );
    }

    // Named entity: look it up without the optional leading '&', which is
    // not part of the names stored in the entity table.
    const QByteArray raw( str.mid( pos ).toLatin1() );
    const entity *e = EntitiesHash::kde_findEntity( raw, raw.length() );
    if ( !e )
        return QChar::Null;

    return QChar( e->code );
}

// Tries to decode an entity found at the start of str.
//
// Prefixes of str are tried from the longest (at most 8 characters) down to
// a single character; the first prefix that fromEntity() can decode wins.
//
// On success the decoded character is returned and len is set to the length
// of the prefix that was decoded.  On failure QChar::Null is returned and
// len is 0.
QChar KCharsets::fromEntity(const QString &str, int &len)
{
    // entities are never longer than 8 chars... we start from
    // that length (or from the whole string if it is shorter, so that
    // len never exceeds what str actually contains) and work backwards...
    len = qMin( 8, str.length() );
    while ( len > 0 )
    {
        const QChar res = fromEntity( str.left( len ) );
        if ( !res.isNull() )
            return res;
        len--;
    }
    return QChar::Null;
}


// Converts a character to a hexadecimal character reference, including the
// leading '&' and the trailing ';' (for example "&#x41;" for 'A').
//
// The reference uses the "&#x" prefix; the previously emitted "&#0x" form is
// not a valid character reference and could not be decoded again by
// fromEntity() / resolveEntities().
QString KCharsets::toEntity(const QChar &ch)
{
    QString ent;
    ent.sprintf("&#x%x;", ch.unicode());
    return ent;
}

// Returns a copy of input in which every "&...;" sequence that fromEntity()
// can decode has been replaced by the decoded character.  Sequences that
// cannot be decoded are left untouched.
QString KCharsets::resolveEntities( const QString &input )
{
    QString result = input;
    const QChar *cursor = result.unicode();
    const QChar *stop = cursor + result.length();
    const QChar *lastAmpersand = 0;
    bool awaitingSemicolon = false;

    for ( ; cursor < stop; ++cursor ) {
        const QChar current = *cursor;

        // Remember the most recent '&'; it is the only candidate start.
        if ( current == QLatin1Char('&') ) {
            lastAmpersand = cursor;
            awaitingSemicolon = true;
            continue;
        }

        // Only a ';' that follows a pending '&' can close an entity.
        if ( !awaitingSemicolon || current != QLatin1Char(';') )
            continue;

        assert( lastAmpersand );

        awaitingSemicolon = false;

        // The entity body is what lies between '&' and ';'.
        const QChar *bodyBegin = lastAmpersand + 1;
        const uint bodyLength = cursor - bodyBegin;
        if ( bodyLength == 0 )
            continue;

        const QChar decoded = KCharsets::fromEntity( QString( bodyBegin, bodyLength ) );
        if ( decoded.isNull() )
            continue;

        // Compute the index before modifying the string: the write below may
        // make the string use a different buffer.
        const uint ampersandIndex = lastAmpersand - result.unicode();

        // Overwrite '&' with the decoded character, then drop body and ';'.
        result[ (int)ampersandIndex ] = decoded;
        result.remove( ampersandIndex + 1, bodyLength + 1 );

        // Re-anchor the pointers on the modified string.  The cursor is put
        // on the decoded character, so the loop increment continues after it.
        cursor = result.unicode() + ampersandIndex;
        stop = result.unicode() + result.length();
        lastAmpersand = 0;
    }

    return result;
}

// Returns the sorted list of all encoding names found in the
// language_for_encoding table.
QStringList KCharsets::availableEncodingNames() const
{
    QStringList names;
    // The table holds (name, description) offset pairs; only the name of
    // each pair is needed here.
    for ( int i = 0; language_for_encoding_indices[i] != -1; i += 2 ) {
        const char *encoding = language_for_encoding_string + language_for_encoding_indices[i];
        names << QString::fromUtf8( encoding );
    }
    names.sort();
    return names;
}


// Returns a translated, human-readable description for encoding, built from
// the description stored for it in the language_for_encoding table and the
// encoding name itself.  Encodings that are not in the table get a generic
// "Other encoding" description.
QString KCharsets::descriptionForEncoding( const QString& encoding ) const
{
    const QByteArray key = encoding.toUtf8();
    const char* lang = kcharsets_array_search( language_for_encoding_string,
                                               language_for_encoding_indices,
                                               key.constData() );
    if ( !lang )
        return i18nc( "@item", "Other encoding (%1)", encoding );

    const QString script = i18nc( "@item Text character set", lang );
    return i18nc( "@item %1 character set, %2 encoding", "%1 ( %2 )",
                  script, encoding );
}

// Extracts the encoding name from a descriptive name of the form
// "Description ( encoding )".
//
// Without any '(' the whole trimmed string is returned.  Otherwise the text
// after the last '(' is used: cut at its last ')' and trimmed, or returned
// as is when there is no ')'.
QString KCharsets::encodingForName( const QString &descriptiveName ) const
{
    const int openParen = descriptiveName.lastIndexOf( QLatin1Char('(') );
    if ( openParen < 0 ) {
        // No parenthesis, so assume it is a normal encoding name
        return descriptiveName.trimmed();
    }

    const QString tail = descriptiveName.mid( openParen + 1 );
    const int closeParen = tail.lastIndexOf( QLatin1Char(')') );

    return ( closeParen < 0 ) ? tail : tail.left( closeParen ).trimmed();
}

// Returns the sorted list of descriptive names, one per entry of the
// language_for_encoding table, each combining the translated description
// with the encoding name.
QStringList KCharsets::descriptiveEncodingNames() const
{
    QStringList result;
    for ( int i = 0; language_for_encoding_indices[i] != -1; i += 2 ) {
        // Each table entry is a pair: encoding name, then its description.
        const char *encoding = language_for_encoding_string + language_for_encoding_indices[i];
        const char *script = language_for_encoding_string + language_for_encoding_indices[i + 1];

        const QString name = QString::fromUtf8( encoding );
        const QString description = i18nc( "@item Text character set", script );
        result << i18nc( "@item Text encoding: %1 character set, %2 encoding", "%1 ( %2 )",
                         description, name );
    }
    result.sort();
    return result;
}

// Returns the encodings grouped by their translated description.
//
// Every inner list starts with the description, followed by the names of the
// encodings sharing it, in table order.  The result is built on the first
// call and cached in the private data; later calls return the cached list.
QList<QStringList> KCharsets::encodingsByScript() const
{
    QList<QStringList> &groups = d->encodingsByScript;
    if ( !groups.isEmpty() )
        return groups;

    for ( const int *entry = language_for_encoding_indices; *entry != -1; entry += 2 ) {
        const QString name = QString::fromUtf8( language_for_encoding_string + entry[0] );
        const QString description = i18nc("@item Text character set", language_for_encoding_string + entry[1] );

        // Add the encoding to the group that already has this description...
        bool grouped = false;
        for ( int g = 0; g < groups.size() && !grouped; ++g ) {
            if ( groups.at(g).at(0) == description ) {
                groups[g].append( name );
                grouped = true;
            }
        }

        // ... or open a new group headed by the description.
        if ( !grouped )
            groups.append( QStringList() << description << name );
    }
    return groups;
}

// Returns the codec for the encoding name n.
//
// "gb2312" and "gbk" are answered with the "gb18030" codec.  For any other
// name the result of codecForNameOrNull() is used, with the "iso-8859-1"
// codec as fallback when that lookup yields nothing.
QTextCodec* KCharsets::codecForName(const QString &n) const
{
    const bool mapsToGb18030 = ( n == QLatin1String("gb2312") ) || ( n == QLatin1String("gbk") );
    if ( mapsToGb18030 )
        return QTextCodec::codecForName( "gb18030" );

    QTextCodec* const found = codecForNameOrNull( n.toLatin1() );
    return found ? found : QTextCodec::codecForName( "iso-8859-1" );
}

// Returns the codec for the encoding name n and reports through ok whether
// the name could be resolved.
//
// "gb2312" and "gbk" are answered with the "gb18030" codec (ok is true).
// For any other name ok tells whether codecForNameOrNull() found a codec;
// if it did not, the "iso-8859-1" codec is returned instead.
QTextCodec* KCharsets::codecForName(const QString &n, bool &ok) const
{
    if ( n == QLatin1String("gb2312") || n == QLatin1String("gbk") ) {
        ok = true;
        return QTextCodec::codecForName( "gb18030" );
    }

    QTextCodec* codec = codecForNameOrNull( n.toLatin1() );
    ok = ( codec != 0 );
    if ( !ok )
        codec = QTextCodec::codecForName( "iso-8859-1" );
    return codec;
}

// Returns the codec for the encoding name n, or 0 if none can be found.
//
// An empty name stands for the encoding of the KDE locale.  Otherwise the
// following steps are tried in order, and the first codec found is cached
// under the name n exactly as it was passed in:
//   1. the cache of earlier lookups,
//   2. QTextCodec::codecForName() with the name as given,
//   3. the lower-cased name with a "_charset" suffix and an "x-" prefix
//      removed (only if that actually changed the name),
//   4. the table of alternative names for codecs built into Qt,
//   5. the table of last-resort substitute codecs.
// Failed lookups are not cached.
QTextCodec *KCharsets::codecForNameOrNull( const QByteArray& n ) const
{
    typedef QHash<QByteArray,QTextCodec*>::const_iterator CacheIterator;

    if ( n.isEmpty() ) {
        // No name, assume locale (KDE's, not Qt's)
        const QByteArray localeKey = "->locale<-";
        const CacheIterator cachedLocale = d->codecForNameDict.constFind( localeKey );
        if ( cachedLocale != d->codecForNameDict.constEnd() )
            return cachedLocale.value();

        QTextCodec* localeCodec = KGlobal::locale()->codecForEncoding();
        d->codecForNameDict.insert( localeKey, localeCodec );
        return localeCodec;
    }

    // For a non-empty name, look it up in the "dictionary", in a case-sensitive way.
    const CacheIterator cached = d->codecForNameDict.constFind( n );
    if ( cached != d->codecForNameDict.constEnd() )
        return cached.value();

    // If the name is not in the hash table, call QTextCodec::codecForName directly.
    // We assume that QTextCodec is smarter and more maintained than this code.
    if ( QTextCodec* direct = QTextCodec::codecForName( n ) ) {
        d->codecForNameDict.insert( n, direct );
        return direct;
    }

    // We have had no luck with QTextCodec::codecForName, so we must now process
    // the name, so that QTextCodec::codecForName could work with it.
    QByteArray name = n.toLower();

    const bool hadCharsetSuffix = name.endsWith("_charset"); // krazy:exclude=strings
    if ( hadCharsetSuffix )
        name.chop( 8 );

    const bool hadXPrefix = name.startsWith( "x-" ); // krazy:exclude=strings
    if ( hadXPrefix )
        name.remove( 0, 2 ); // remove x- at start

    // We have no name anymore, therefore the name is invalid.
    if ( name.isEmpty() )
        return 0;

    // We only need to check changed names.
    if ( hadCharsetSuffix || hadXPrefix ) {
        if ( QTextCodec* normalized = QTextCodec::codecForName( name ) ) {
            d->codecForNameDict.insert( n, normalized );
            return normalized;
        }
    }

    // Remaining fallbacks, tried in this order:
    //  - codecs that are built into Qt, but for which the name given is
    //    different, so QTextCodec did not recognize it;
    //  - as the last resort, some compatibility charmap.
    // ### TODO: while emergency conversions might be useful at read, it is not
    //     sure if they should be done if the application plans to write.
    struct NameTable {
        const char *strings;
        const int *indices;
    };
    const NameTable fallbackTables[] = {
        { builtin_string, builtin_indices },
        { conversion_hints_string, conversion_hints_indices }
    };

    for ( int t = 0; t < 2; ++t ) {
        const QByteArray cname = kcharsets_array_search( fallbackTables[t].strings,
                                                         fallbackTables[t].indices,
                                                         name );
        if ( cname.isEmpty() )
            continue;

        if ( QTextCodec* mapped = QTextCodec::codecForName( cname ) ) {
            d->codecForNameDict.insert( n, mapped );
            return mapped;
        }
    }

    // we could not assign a codec, therefore return NULL
    return 0;
}