mirror of
https://bitbucket.org/smil3y/kdelibs.git
synced 2025-02-24 19:02:48 +00:00
323 lines
11 KiB
C++
323 lines
11 KiB
C++
/*
    This file is part of the KDE libraries

    Copyright (C) 2008 Wang Hoi (zealot.hoi@gmail.com)

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB. If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.

*/
|
|
|
|
#include "kencodingprober.h"
|
|
|
|
#include "klocale.h"
|
|
|
|
#include "probers/nsCharSetProber.h"
|
|
#include "probers/nsUniversalDetector.h"
|
|
#include "probers/ChineseGroupProber.h"
|
|
#include "probers/JapaneseGroupProber.h"
|
|
#include "probers/UnicodeGroupProber.h"
|
|
#include "probers/nsSBCSGroupProber.h"
|
|
#include "probers/nsMBCSGroupProber.h"
|
|
|
|
#include <string.h>
|
|
|
|
class KEncodingProberPrivate
|
|
{
|
|
public:
|
|
KEncodingProberPrivate(): prober(NULL), mStart(true) {};
|
|
~KEncodingProberPrivate()
|
|
{
|
|
delete prober;
|
|
}
|
|
void setProberType(KEncodingProber::ProberType pType)
|
|
{
|
|
proberType = pType;
|
|
/* handle multi-byte encodings carefully , because they're hard to detect,
|
|
* and have to use some Stastics methods.
|
|
* for single-byte encodings (most western encodings), nsSBCSGroupProber is ok,
|
|
* because encoding state machine can detect many such encodings.
|
|
*/
|
|
|
|
delete prober;
|
|
|
|
switch (proberType) {
|
|
case KEncodingProber::None:
|
|
prober = NULL;
|
|
break;
|
|
case KEncodingProber::Arabic:
|
|
case KEncodingProber::Baltic:
|
|
case KEncodingProber::CentralEuropean:
|
|
case KEncodingProber::Cyrillic:
|
|
case KEncodingProber::Greek:
|
|
case KEncodingProber::Hebrew:
|
|
case KEncodingProber::NorthernSaami:
|
|
case KEncodingProber::Other:
|
|
case KEncodingProber::SouthEasternEurope:
|
|
case KEncodingProber::Thai:
|
|
case KEncodingProber::Turkish:
|
|
case KEncodingProber::WesternEuropean:
|
|
prober = new kencodingprober::nsSBCSGroupProber();
|
|
break;
|
|
case KEncodingProber::ChineseSimplified:
|
|
case KEncodingProber::ChineseTraditional:
|
|
prober = new kencodingprober::ChineseGroupProber();
|
|
break;
|
|
case KEncodingProber::Japanese:
|
|
prober = new kencodingprober::JapaneseGroupProber();
|
|
break;
|
|
case KEncodingProber::Korean:
|
|
prober = new kencodingprober::nsMBCSGroupProber();
|
|
break;
|
|
case KEncodingProber::Unicode:
|
|
prober = new kencodingprober::UnicodeGroupProber();
|
|
break;
|
|
case KEncodingProber::Universal:
|
|
prober = new kencodingprober::nsUniversalDetector();
|
|
break;
|
|
default:
|
|
prober = NULL;
|
|
}
|
|
}
|
|
void unicodeTest(const char *aBuf, int aLen)
|
|
{
|
|
if (mStart)
|
|
{
|
|
mStart = false;
|
|
if (aLen > 3)
|
|
switch (aBuf[0])
|
|
{
|
|
case '\xEF':
|
|
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
|
|
// EF BB BF UTF-8 encoded BOM
|
|
proberState = KEncodingProber::FoundIt;
|
|
break;
|
|
case '\xFE':
|
|
if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
|
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
|
proberState = KEncodingProber::FoundIt;
|
|
else if ('\xFF' == aBuf[1])
|
|
// FE FF UTF-16, big endian BOM
|
|
proberState = KEncodingProber::FoundIt;
|
|
break;
|
|
case '\x00':
|
|
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
|
|
// 00 00 FE FF UTF-32, big-endian BOM
|
|
proberState = KEncodingProber::FoundIt;
|
|
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
|
|
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
|
proberState = KEncodingProber::FoundIt;
|
|
break;
|
|
case '\xFF':
|
|
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
|
// FF FE 00 00 UTF-32, little-endian BOM
|
|
proberState = KEncodingProber::FoundIt;
|
|
else if ('\xFE' == aBuf[1])
|
|
// FF FE UTF-16, little endian BOM
|
|
proberState = KEncodingProber::FoundIt;
|
|
break;
|
|
} // switch
|
|
|
|
}
|
|
}
|
|
KEncodingProber::ProberType proberType;
|
|
KEncodingProber::ProberState proberState;
|
|
kencodingprober::nsCharSetProber *prober;
|
|
bool mStart;
|
|
};
|
|
|
|
/**
 * Creates a prober of the requested type.
 *
 * The private data is allocated first; the prober type is then applied
 * through setProberType(), which also puts the object into its initial
 * probing state.
 */
KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType)
    : d(new KEncodingProberPrivate())
{
    setProberType(proberType);
}
|
|
|
|
/**
 * Releases the private data; its destructor in turn deletes the owned
 * charset prober.
 */
KEncodingProber::~KEncodingProber()
{
    delete d;
}
|
|
|
|
void KEncodingProber::reset()
|
|
{
|
|
d->proberState = KEncodingProber::Probing;
|
|
d->mStart = true;
|
|
}
|
|
|
|
/**
 * Convenience overload: feeds the bytes held by @p data to the prober.
 *
 * @return the prober state after the data has been processed
 */
KEncodingProber::ProberState KEncodingProber::feed(const QByteArray &data)
{
    const char *bytes = data.constData();
    const int length = data.size();
    return feed(bytes, length);
}
|
|
|
|
/**
 * Feeds a chunk of raw data to the prober.
 *
 * While the state is Probing, the first non-empty chunk is checked for a
 * Unicode byte-order mark; if one is found the state becomes FoundIt and
 * the chunk is not handed to the charset prober. Otherwise the data is
 * passed to the charset prober and its verdict is mapped onto ProberState.
 * Once the state has left Probing, further data is ignored.
 *
 * @param data start of the chunk
 * @param len  number of bytes in @p data
 * @return the current prober state
 */
KEncodingProber::ProberState KEncodingProber::feed(const char* data, int len)
{
    // Nothing to analyse: no prober installed, or an empty/invalid chunk.
    // Returning early on empty input matters because the BOM check is
    // one-shot -- an empty first chunk must not use it up.
    if (!d->prober || !data || len <= 0)
        return d->proberState;

    if (d->proberState == Probing) {
        if (d->mStart) {
            d->unicodeTest(data, len);
            if (d->proberState == FoundIt)
                return d->proberState;
        }

        d->prober->HandleData(data, len);
        switch (d->prober->GetState()) {
        case kencodingprober::eNotMe:
            d->proberState = NotMe;
            break;
        case kencodingprober::eFoundIt:
            d->proberState = FoundIt;
            break;
        default:
            d->proberState = Probing;
            break;
        }
    }
#ifdef DEBUG_PROBE
    d->prober->DumpStatus();
#endif
    return d->proberState;
}
|
|
|
|
/**
 * @return the state stored by the most recent feed() or reset() call
 */
KEncodingProber::ProberState KEncodingProber::state() const
{
    const KEncodingProber::ProberState current = d->proberState;
    return current;
}
|
|
|
|
//DEPRECATED, do *not* use
|
|
#ifndef KDE_NO_DEPRECATED
|
|
const char* KEncodingProber::encodingName() const
|
|
{
|
|
return qstrdup(encoding().constData());
|
|
}
|
|
#endif
|
|
|
|
/**
 * @return the charset name reported by the active prober, or "UTF-8" when
 *         no prober is installed
 */
QByteArray KEncodingProber::encoding() const
{
    if (d->prober)
        return QByteArray(d->prober->GetCharSetName());

    return QByteArray("UTF-8");
}
|
|
|
|
float KEncodingProber::confidence() const
|
|
{
|
|
if (!d->prober)
|
|
return 0.0;
|
|
|
|
return d->prober->GetConfidence();
|
|
}
|
|
|
|
/**
 * @return the prober type last set through the constructor or setProberType()
 */
KEncodingProber::ProberType KEncodingProber::proberType() const
{
    const KEncodingProber::ProberType current = d->proberType;
    return current;
}
|
|
|
|
/**
 * Switches to the prober matching @p proberType and then calls reset(),
 * so detection starts over in the Probing state.
 */
void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
{
    // Order matters: install the new prober first, then re-arm the state.
    d->setProberType(proberType);
    reset();
}
|
|
|
|
/**
 * Maps a (translated) prober name back to its ProberType.
 *
 * The names compared against are the same translated labels that
 * nameForProberType() produces, including "Korean" and "Thai", so that a
 * name obtained there converts back to the type it came from.
 *
 * @param lang translated prober name
 * @return the matching type; KEncodingProber::Universal for an empty or
 *         unrecognised name
 */
KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString& lang)
{
    if (lang.isEmpty())
        return KEncodingProber::Universal;

    if (lang == i18nc("@item Text character set", "Disabled"))
        return KEncodingProber::None;
    if (lang == i18nc("@item Text character set", "Universal"))
        return KEncodingProber::Universal;
    if (lang == i18nc("@item Text character set", "Unicode"))
        return KEncodingProber::Unicode;
    if (lang == i18nc("@item Text character set", "Cyrillic"))
        return KEncodingProber::Cyrillic;
    if (lang == i18nc("@item Text character set", "Western European"))
        return KEncodingProber::WesternEuropean;
    if (lang == i18nc("@item Text character set", "Central European"))
        return KEncodingProber::CentralEuropean;
    if (lang == i18nc("@item Text character set", "Greek"))
        return KEncodingProber::Greek;
    if (lang == i18nc("@item Text character set", "Hebrew"))
        return KEncodingProber::Hebrew;
    if (lang == i18nc("@item Text character set", "Turkish"))
        return KEncodingProber::Turkish;
    if (lang == i18nc("@item Text character set", "Japanese"))
        return KEncodingProber::Japanese;
    if (lang == i18nc("@item Text character set", "Baltic"))
        return KEncodingProber::Baltic;
    if (lang == i18nc("@item Text character set", "Chinese Traditional"))
        return KEncodingProber::ChineseTraditional;
    if (lang == i18nc("@item Text character set", "Chinese Simplified"))
        return KEncodingProber::ChineseSimplified;
    if (lang == i18nc("@item Text character set", "Arabic"))
        return KEncodingProber::Arabic;
    // These two labels are produced by nameForProberType() but used to be
    // missing here, so they silently fell back to Universal.
    if (lang == i18nc("@item Text character set", "Korean"))
        return KEncodingProber::Korean;
    if (lang == i18nc("@item Text character set", "Thai"))
        return KEncodingProber::Thai;

    return KEncodingProber::Universal;
}
|
|
|
|
/**
 * Maps a ProberType to its translated, user-visible name.
 *
 * @param proberType the type to describe
 * @return the translated label, or a null QString for any type that has no
 *         entry in the table below
 */
QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
{
    switch (proberType) {
    case KEncodingProber::None:
        return i18nc("@item Text character set", "Disabled");
    case KEncodingProber::Universal:
        return i18nc("@item Text character set", "Universal");
    case KEncodingProber::Arabic:
        return i18nc("@item Text character set", "Arabic");
    case KEncodingProber::Baltic:
        return i18nc("@item Text character set", "Baltic");
    case KEncodingProber::CentralEuropean:
        return i18nc("@item Text character set", "Central European");
    case KEncodingProber::Cyrillic:
        return i18nc("@item Text character set", "Cyrillic");
    case KEncodingProber::Greek:
        return i18nc("@item Text character set", "Greek");
    case KEncodingProber::Hebrew:
        return i18nc("@item Text character set", "Hebrew");
    case KEncodingProber::Japanese:
        return i18nc("@item Text character set", "Japanese");
    case KEncodingProber::Turkish:
        return i18nc("@item Text character set", "Turkish");
    case KEncodingProber::WesternEuropean:
        return i18nc("@item Text character set", "Western European");
    case KEncodingProber::ChineseTraditional:
        return i18nc("@item Text character set", "Chinese Traditional");
    case KEncodingProber::ChineseSimplified:
        return i18nc("@item Text character set", "Chinese Simplified");
    case KEncodingProber::Korean:
        return i18nc("@item Text character set", "Korean");
    case KEncodingProber::Thai:
        return i18nc("@item Text character set", "Thai");
    case KEncodingProber::Unicode:
        return i18nc("@item Text character set", "Unicode");
    default:
        break;
    }

    // No label defined for this type.
    return QString();
}
|