diff --git a/kdecore/CMakeLists.txt b/kdecore/CMakeLists.txt index f3bea91a..3b05199a 100644 --- a/kdecore/CMakeLists.txt +++ b/kdecore/CMakeLists.txt @@ -181,8 +181,6 @@ set(kdecore_LIB_SRCS localization/kcatalog.cpp localization/kcharsets.cpp - localization/kencodingdetector.cpp - localization/guess_ja.cpp localization/klocale.cpp localization/klocale_kde.cpp localization/klocalizedstring.cpp @@ -337,7 +335,6 @@ install( #services/kserviceoffer.h: do not install, internal API services/kplugininfo.h localization/kcharsets.h - localization/kencodingdetector.h localization/klocale.h localization/klocalizedstring.h sycoca/kprotocolinfo.h diff --git a/kdecore/localization/guess_ja.cpp b/kdecore/localization/guess_ja.cpp deleted file mode 100644 index 06318154..00000000 --- a/kdecore/localization/guess_ja.cpp +++ /dev/null @@ -1,376 +0,0 @@ -/* - * This file is part of the KDE libraries - * - * Copyright 2000-2003 Shiro Kawai , All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the authors nor the names of its contributors - * may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ -/* - * original code is here. - * http://cvs.sourceforge.net/viewcvs.py/gauche/Gauche/ext/charconv/guess.c?view=markup - */ - -/* - * Maybe we should use QTextCodec::heuristicContentMatch() - * But it fails detection. It's not useful. - */ -#include "guess_ja_p.h" - -/* DFA tables */ -const dfa_table guess_eucj_st[] = { - { /* state init */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -1, - }, - { /* state jis0201_kana */ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - { /* state jis0213_1 */ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, -1, - }, - { /* state jis0213_2 */ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, -1, - }, -}; - -guess_arc guess_eucj_ar[7] = { - { 0, 1.0 }, /* init -> init */ - { 1, 0.8 }, /* init -> jis0201_kana */ - { 3, 0.95 }, /* init -> jis0213_2 */ - { 2, 1.0 }, /* init -> jis0213_1 */ - { 0, 1.0 }, /* jis0201_kana -> init */ - { 0, 1.0 }, /* jis0213_1 -> init */ - { 0, 1.0 }, /* jis0213_2 -> init */ -}; - -const dfa_table guess_sjis_st[] = { - { /* state init */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - -1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, - }, - { /* state jis0213 */ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, -1, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, -1, -1, -1, - }, -}; - -guess_arc guess_sjis_ar[6] = { - { 0, 1.0 }, /* init -> init */ - { 1, 1.0 }, /* init -> jis0213 */ - { 0, 0.8 }, /* init -> init */ - { 1, 0.95 }, /* init -> jis0213 */ - { 0, 0.8 }, /* init -> init */ - { 0, 1.0 }, /* jis0213 -> init */ -}; - -const dfa_table guess_utf8_st[] = { - { /* state init */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, -1, -1, - }, - { /* state 1byte_more */ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - { /* state 2byte_more */ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - { /* state 3byte_more */ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - { /* state 4byte_more */ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - { /* state 5byte_more */ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, -}; - -guess_arc guess_utf8_ar[11] = { - { 0, 1.0 }, /* init -> init */ - { 1, 1.0 }, /* init -> 1byte_more */ - { 2, 1.0 }, /* init -> 2byte_more */ - { 3, 1.0 }, /* init -> 3byte_more */ - { 4, 1.0 }, /* init -> 4byte_more */ - { 5, 1.0 }, /* init -> 5byte_more */ - { 0, 1.0 }, /* 1byte_more -> init */ - { 1, 1.0 }, /* 2byte_more -> 1byte_more */ - { 2, 1.0 }, /* 3byte_more -> 2byte_more */ - { 3, 1.0 }, /* 4byte_more -> 3byte_more */ - { 4, 1.0 }, /* 5byte_more -> 4byte_more */ -}; - -/* Guessing Routine */ -enum JapaneseCode::Type JapaneseCode::guess_jp(const char *buf, int buflen) -{ - int i; - guess_dfa *top = 0; - - for (i=0; iscore == 1.0 && sjis->score == 1.0 && utf8->score == 1.0) - return JapaneseCode::ASCII; - - /* Now, we have ambigous code. Pick the highest score. If more than - one candidate tie, pick the default encoding. */ - if (DFA_ALIVE(eucj)) top = eucj; - if (DFA_ALIVE(utf8)) { - if (top) { - if (top->score < utf8->score) top = utf8; - } else { - top = utf8; - } - } - if (DFA_ALIVE(sjis)) { - if (top) { - if (top->score <= sjis->score) top = sjis; - } else { - top = sjis; - } - } - - if (top == eucj) return JapaneseCode::EUC; - if (top == utf8) return JapaneseCode::UTF8; - if (top == sjis) return JapaneseCode::SJIS; - - return JapaneseCode::ASCII; -} diff --git a/kdecore/localization/guess_ja_p.h b/kdecore/localization/guess_ja_p.h deleted file mode 100644 index b529ebbe..00000000 --- a/kdecore/localization/guess_ja_p.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * This file is part of the KDE libraries - * - * Copyright 2000-2003 Shiro Kawai , All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the authors nor the names of its contributors - * may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ -/* - * original code is here. - * http://cvs.sourceforge.net/viewcvs.py/gauche/Gauche/ext/charconv/guess.c?view=markup - */ -#ifndef GUESS_JA_H -#define GUESS_JA_H - -#include -#ifdef SOLARIS -#undef UNICODE -#endif - -class guess_arc { -public: - unsigned int next; /* next state */ - double score; /* score */ -}; - -typedef signed char dfa_table[256]; - -/* DFA tables declared in guess_ja.cpp */ -extern const dfa_table guess_eucj_st[]; -extern guess_arc guess_eucj_ar[7]; -extern const dfa_table guess_sjis_st[]; -extern guess_arc guess_sjis_ar[6]; -extern const dfa_table guess_utf8_st[]; -extern guess_arc guess_utf8_ar[11]; - -class guess_dfa { -public: - const dfa_table *states; - const guess_arc *arcs; - int state; - double score; - - guess_dfa (const dfa_table stable[], const guess_arc *atable) : - states(stable), arcs(atable) - { - state = 0; - score = 1.0; - } -}; - -class JapaneseCode -{ -public: - enum Type {ASCII, JIS, EUC, SJIS, UNICODE, UTF8 }; - enum Type guess_jp(const char* buf, int buflen); - - JapaneseCode () { - eucj = new guess_dfa(guess_eucj_st, guess_eucj_ar); - sjis = new guess_dfa(guess_sjis_st, guess_sjis_ar); - utf8 = new guess_dfa(guess_utf8_st, guess_utf8_ar); - last_JIS_escape = false; - } - - ~JapaneseCode () { - delete eucj; - delete sjis; - delete utf8; - } - -protected: - guess_dfa *eucj; - guess_dfa *sjis; - guess_dfa *utf8; - - bool last_JIS_escape; -}; - -#define DFA_NEXT(dfa, ch) \ - do { \ - int arc__; \ - if (dfa->state >= 0) { \ - arc__ = dfa->states[dfa->state][ch]; \ - if (arc__ < 0) { \ - dfa->state = -1; \ - } else { \ - dfa->state = dfa->arcs[arc__].next; \ - dfa->score *= dfa->arcs[arc__].score; \ - } \ - } \ - } while (0) - -#define DFA_ALIVE(dfa) (dfa->state >= 0) - -#endif /* GUESS_JA_H */ diff --git a/kdecore/localization/kencodingdetector.cpp b/kdecore/localization/kencodingdetector.cpp deleted file mode 100644 index 1f20bec5..00000000 --- a/kdecore/localization/kencodingdetector.cpp +++ /dev/null @@ -1,1257 +0,0 @@ -/* - This file is part of the KDE libraries - - Copyright (C) 1999 Lars Knoll (knoll@kde.org) - Copyright (C) 2003 Dirk Mueller (mueller@kde.org) - Copyright (C) 2003 Apple Computer, Inc. - Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public License - along with this library; see the file COPYING.LIB. If not, write to - the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - Boston, MA 02110-1301, USA. -*/ -//---------------------------------------------------------------------------- -// -// decoder for input stream - -#include "kencodingdetector.h" - -#undef DECODE_DEBUG -//#define DECODE_DEBUG - -#define MAX_BUFFER 16*1024 - -#include - -#include "guess_ja_p.h" - -#include -#include - -#include -#include -#include -#include - -#include - -enum MIB -{ - MibLatin1 = 4, - Mib8859_8 = 85, - MibUtf8 = 106, - MibUcs2 = 1000, - MibUtf16 = 1015, - MibUtf16BE = 1013, - MibUtf16LE = 1014 -}; - -static bool is16Bit(QTextCodec* codec) -{ - switch (codec->mibEnum()) - { - case MibUtf16: - case MibUtf16BE: - case MibUtf16LE: - case MibUcs2: - return true; - default: - return false; - } -} - -class KEncodingDetectorPrivate -{ -public: - QTextCodec *m_codec; - QTextDecoder *m_decoder; // utf16 - QTextCodec *m_defaultCodec; - QByteArray m_storeDecoderName; - - KEncodingDetector::EncodingChoiceSource m_source; - KEncodingDetector::AutoDetectScript m_autoDetectLanguage; - - bool m_visualRTL : 1; - bool m_seenBody : 1; - bool m_writtingHappened : 1; - bool m_analyzeCalled : 1; //for decode() - int m_multiByte; - - QByteArray m_bufferForDefferedEncDetection; - - KEncodingDetectorPrivate() - : m_codec(QTextCodec::codecForMib(MibLatin1)) - , m_decoder(m_codec->makeDecoder()) - , m_defaultCodec(m_codec) - , m_source(KEncodingDetector::DefaultEncoding) - , m_autoDetectLanguage(KEncodingDetector::SemiautomaticDetection) - , m_visualRTL(false) - , m_seenBody(false) - , m_writtingHappened(false) - , m_analyzeCalled(false) - , m_multiByte(0) - { - } - - KEncodingDetectorPrivate(QTextCodec* codec,KEncodingDetector::EncodingChoiceSource source, KEncodingDetector::AutoDetectScript script) - : m_codec(codec) - , m_decoder(m_codec->makeDecoder()) - , m_defaultCodec(m_codec) - , m_source(source) - , m_autoDetectLanguage(script) - , m_visualRTL(false) - , m_seenBody(false) - , m_writtingHappened(false) - , m_analyzeCalled(false) - , m_multiByte(0) - { - } - - ~KEncodingDetectorPrivate() - { - delete m_decoder; - } - - // Returns true if the encoding was explicitly specified someplace. - bool isExplicitlySpecifiedEncoding() - { - return m_source != KEncodingDetector::DefaultEncoding && m_source != KEncodingDetector::AutoDetectedEncoding; - } -}; - - -static QByteArray automaticDetectionForArabic( const unsigned char* ptr, int size ) -{ - for ( int i = 0; i < size; ++i ) { - if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3 - || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA ) - || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0 - || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) { - return "cp1256"; - } - } - - return "iso-8859-6"; -} - -static QByteArray automaticDetectionForBaltic( const unsigned char* ptr, int size ) -{ - for ( int i = 0; i < size; ++i ) { - if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) ) - return "cp1257"; - - if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 ) - return "iso-8859-13"; - } - - return "iso-8859-13"; -} - -static QByteArray automaticDetectionForCentralEuropean(const unsigned char* ptr, int size ) -{ - QByteArray charset = QByteArray(); - for ( int i = 0; i < size; ++i ) { - if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) { - if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 ) - return "ibm852"; - - if ( i + 1 > size ) - return "cp1250"; - else { // maybe ibm852 ? - charset = "cp1250"; - continue; - } - } - if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) { - if ( i + 1 > size ) - return "iso-8859-2"; - else { // maybe ibm852 ? - if ( charset.isNull() ) - charset = "iso-8859-2"; - continue; - } - } - } - - if ( charset.isNull() ) - charset = "iso-8859-3"; - - return charset.data(); -} - -static QByteArray automaticDetectionForCyrillic( const unsigned char* ptr, int size) -{ -#ifdef DECODE_DEBUG - kWarning() << "KEncodingDetector: Cyr heuristics"; -#endif - -// if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf) -// return "utf8"; - int utf8_mark=0; - int koi_score=0; - int cp1251_score=0; - - int koi_st=0; - int cp1251_st=0; - -// int koi_na=0; -// int cp1251_na=0; - - int koi_o_capital=0; - int koi_o=0; - int cp1251_o_capital=0; - int cp1251_o=0; - - int koi_a_capital=0; - int koi_a=0; - int cp1251_a_capital=0; - int cp1251_a=0; - - int koi_s_capital=0; - int koi_s=0; - int cp1251_s_capital=0; - int cp1251_s=0; - - int koi_i_capital=0; - int koi_i=0; - int cp1251_i_capital=0; - int cp1251_i=0; - - int cp1251_small_range=0; - int koi_small_range=0; - int ibm866_small_range=0; - - int i; - for (i=1; (i0xdf) - { - ++cp1251_small_range; - - if (ptr[i]==0xee)//small o - ++cp1251_o; - else if (ptr[i]==0xe0)//small a - ++cp1251_a; - else if (ptr[i]==0xe8)//small i - ++cp1251_i; - else if (ptr[i]==0xf1)//small s - ++cp1251_s; - else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st - ++cp1251_st; - - else if (ptr[i]==0xef) - ++koi_o_capital; - else if (ptr[i]==0xe1) - ++koi_a_capital; - else if (ptr[i]==0xe9) - ++koi_i_capital; - else if (ptr[i]==0xf3) - ++koi_s_capital; - - } - else if (ptr[i]>0xbf) - { - ++koi_small_range; - - if (ptr[i]==0xd0||ptr[i]==0xd1)//small o - ++utf8_mark; - else if (ptr[i]==0xcf)//small o - ++koi_o; - else if (ptr[i]==0xc1)//small a - ++koi_a; - else if (ptr[i]==0xc9)//small i - ++koi_i; - else if (ptr[i]==0xd3)//small s - ++koi_s; - else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st - ++koi_st; - - else if (ptr[i]==0xce) - ++cp1251_o_capital; - else if (ptr[i]==0xc0) - ++cp1251_a_capital; - else if (ptr[i]==0xc8) - ++cp1251_i_capital; - else if (ptr[i]==0xd1) - ++cp1251_s_capital; - } - else if (ptr[i]>0x9f && ptr[i]<0xb0) //first 16 letterz is 60% - ++ibm866_small_range; - - } - - //cannot decide? - if (cp1251_small_range+koi_small_range+ibm866_small_range<8) - { - return ""; - } - - if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range) - { -#ifdef DECODE_DEBUG - kWarning() << "Cyr Enc Detection: UTF8"; -#endif - return "UTF-8"; - } - - if (ibm866_small_range>cp1251_small_range+koi_small_range) - return "ibm866"; - -// QByteArray koi_string = "koi8-u"; -// QByteArray cp1251_string = "cp1251"; - - if (cp1251_st==0 && koi_st>1) - koi_score+=10; - else if (koi_st==0 && cp1251_st>1) - cp1251_score+=10; - - if (cp1251_st && koi_st) - { - if (cp1251_st/koi_st>2) - cp1251_score+=20; - else if (koi_st/cp1251_st>2) - koi_score+=20; - } - - if (cp1251_a>koi_a) - cp1251_score+=10; - else if (cp1251_a || koi_a) - koi_score+=10; - - if (cp1251_o>koi_o) - cp1251_score+=10; - else if (cp1251_o || koi_o) - koi_score+=10; - - if (cp1251_i>koi_i) - cp1251_score+=10; - else if (cp1251_i || koi_i) - koi_score+=10; - - if (cp1251_s>koi_s) - cp1251_score+=10; - else if (cp1251_s || koi_s) - koi_score+=10; - - if (cp1251_a_capital>koi_a_capital) - cp1251_score+=9; - else if (cp1251_a_capital || koi_a_capital) - koi_score+=9; - - if (cp1251_o_capital>koi_o_capital) - cp1251_score+=9; - else if (cp1251_o_capital || koi_o_capital) - koi_score+=9; - - if (cp1251_i_capital>koi_i_capital) - cp1251_score+=9; - else if (cp1251_i_capital || koi_i_capital) - koi_score+=9; - - if (cp1251_s_capital>koi_s_capital) - cp1251_score+=9; - else if (cp1251_s_capital || koi_s_capital) - koi_score+=9; -#ifdef DECODE_DEBUG - kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score; -#endif - if (abs(koi_score-cp1251_score)<10) - { - //fallback... - cp1251_score=cp1251_small_range; - koi_score=koi_small_range; - } - if (cp1251_score>koi_score) - return "cp1251"; - else - return "koi8-u"; - - -// if (cp1251_score>koi_score) -// setEncoding("cp1251",AutoDetectedEncoding); -// else -// setEncoding("koi8-u",AutoDetectedEncoding); -// return true; - -} - -static QByteArray automaticDetectionForGreek( const unsigned char* ptr, int size ) -{ - for ( int i = 0; i < size; ++i ) { - if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B - || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4 - || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) { - return "cp1253"; - } - } - - return "iso-8859-7"; -} - -static QByteArray automaticDetectionForHebrew( const unsigned char* ptr, int size ) -{ - for ( int i = 0; i < size; ++i ) { - if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B - || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 ) - || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) { - return "cp1255"; - } - - if ( ptr[ i ] == 0xDF ) - return "iso-8859-8-i"; - } - - return "iso-8859-8-i"; -} - -static QByteArray automaticDetectionForJapanese( const unsigned char* ptr, int size ) -{ - JapaneseCode kc; - - switch ( kc.guess_jp( (const char*)ptr, size ) ) { - case JapaneseCode::JIS: - return "jis7"; - case JapaneseCode::EUC: - return "eucjp"; - case JapaneseCode::SJIS: - return "sjis"; - case JapaneseCode::UTF8: - return "utf8"; - default: - break; - } - - return ""; -} - -static QByteArray automaticDetectionForTurkish( const unsigned char* ptr, int size ) -{ - for ( int i = 0; i < size; ++i ) { - if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) { - return "cp1254"; - } - } - - return "iso-8859-9"; -} - -static QByteArray automaticDetectionForWesternEuropean( const unsigned char* ptr, int size ) -{ - --size; - uint nonansi_count=0; - for (int i=0; i0x79) - { - ++nonansi_count; - if ( ptr[i]>0xc1 && ptr[i]<0xf0 && ptr[i+1]>0x7f && ptr[i+1]<0xc0) - { - return "UTF-8"; - } - if (ptr[i] >= 0x78 && ptr[i]<=0x9F ) - { - return "cp1252"; - } - } - - } - - if (nonansi_count>0) - return "iso-8859-15"; - - return ""; -} - -// Other browsers allow comments in the head section, so we need to also. -// It's important not to look for tags inside the comments. -static void skipComment(const char *&ptr, const char *pEnd) -{ - const char *p = ptr; - // Allow ; other browsers do. - if (*p=='>') - { - p++; - } - else - { - while (p!=pEnd) - { - if (*p=='-') - { - // This is the real end of comment, "-->". - if (p[1]=='-' && p[2]=='>') - { - p += 3; - break; - } - // This is the incorrect end of comment that other browsers allow, "--!>". - if (p[1] == '-' && p[2] == '!' && p[3] == '>') - { - p += 4; - break; - } - } - p++; - } - } - ptr=p; -} - -// Returns the position of the encoding string. -static int findXMLEncoding(const QByteArray &str, int &encodingLength) -{ - int len = str.length(); - int pos = str.indexOf("encoding"); - if (pos == -1) - return -1; - pos += 8; - - // Skip spaces and stray control characters. - while (pos=len || str[pos] != '=') - return -1; - ++pos; - - // Skip spaces and stray control characters. - while (pos= len) - return -1; - - // Skip quotation mark. - char quoteMark = str[pos]; - if (quoteMark != '"' && quoteMark != '\'') - return -1; - ++pos; - - // Find the trailing quotation mark. - int end=pos; - while (end=len) - return -1; - - encodingLength = end-pos; - return pos; -} - -bool KEncodingDetector::processNull(char *data, int len) -{ - bool bin=false; - if(is16Bit(d->m_codec)) - { - for (int i=1; i < len; i+=2) - { - if ((data[i]=='\0') && (data[i-1]=='\0')) - { - bin=true; - data[i]=' '; - } - } - return bin; - } - // replace '\0' by spaces, for buggy pages - int i = len-1; - while(--i>=0) - { - if(data[i]==0) - { - bin=true; - data[i]=' '; - } - } - return bin; -} - - -bool KEncodingDetector::errorsIfUtf8 (const char* data, int length) -{ - if (d->m_codec->mibEnum()!=MibUtf8) - return false; //means no errors -// #define highest1Bits (unsigned char)0x80 -// #define highest2Bits (unsigned char)0xC0 -// #define highest3Bits (unsigned char)0xE0 -// #define highest4Bits (unsigned char)0xF0 -// #define highest5Bits (unsigned char)0xF8 -static const unsigned char highest1Bits = 0x80; -static const unsigned char highest2Bits = 0xC0; -static const unsigned char highest3Bits = 0xE0; -static const unsigned char highest4Bits = 0xF0; -static const unsigned char highest5Bits = 0xF8; - - for (int i=0; im_multiByte>0) - { - if ((c & highest2Bits) == 0x80) - { - --(d->m_multiByte); - continue; - } -#ifdef DECODE_DEBUG - kWarning() << "EncDetector: Broken UTF8"; -#endif - return true; - } - - // most significant bit zero, single char - if ((c & highest1Bits) == 0x00) - continue; - - // 110xxxxx => init 1 following bytes - if ((c & highest3Bits) == 0xC0) - { - d->m_multiByte = 1; - continue; - } - - // 1110xxxx => init 2 following bytes - if ((c & highest4Bits) == 0xE0) - { - d->m_multiByte = 2; - continue; - } - - // 11110xxx => init 3 following bytes - if ((c & highest5Bits) == 0xF0) - { - d->m_multiByte = 3; - continue; - } -#ifdef DECODE_DEBUG - kWarning() << "EncDetector:_Broken UTF8"; -#endif - return true; - } - return false; -} - - -KEncodingDetector::KEncodingDetector() : d(new KEncodingDetectorPrivate) -{ -} - -KEncodingDetector::KEncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) : - d(new KEncodingDetectorPrivate(codec,source,script)) -{ -} - -KEncodingDetector::~KEncodingDetector() -{ - delete d; -} - -void KEncodingDetector::setAutoDetectLanguage( KEncodingDetector::AutoDetectScript lang) -{ - d->m_autoDetectLanguage=lang; -} -KEncodingDetector::AutoDetectScript KEncodingDetector::autoDetectLanguage() const -{ - return d->m_autoDetectLanguage; -} - -KEncodingDetector::EncodingChoiceSource KEncodingDetector::encodingChoiceSource() const -{ - return d->m_source; -} - -const char* KEncodingDetector::encoding() const -{ - d->m_storeDecoderName = d->m_codec->name(); - return d->m_storeDecoderName.constData(); -} - -bool KEncodingDetector::visuallyOrdered() const -{ - return d->m_visualRTL; -} - -// const QTextCodec* KEncodingDetector::codec() const -// { -// return d->m_codec; -// } - -QTextDecoder* KEncodingDetector::decoder() -{ - return d->m_decoder; -} - -void KEncodingDetector::resetDecoder() -{ - assert(d->m_defaultCodec); - d->m_bufferForDefferedEncDetection.clear(); - d->m_writtingHappened = false; - d->m_analyzeCalled = false; - d->m_multiByte = 0; - delete d->m_decoder; - if (!d->m_codec) - d->m_codec = d->m_defaultCodec; - d->m_decoder = d->m_codec->makeDecoder(); -} - -bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type) -{ - QTextCodec *codec; - QByteArray enc(_encoding); - if(/*enc.isNull() || */enc.isEmpty()) - { - if (type==DefaultEncoding) - codec=d->m_defaultCodec; - else - return false; - } - else - { - //QString->QTextCodec - - enc = enc.toLower(); - // hebrew visually ordered - if(enc=="visual") - enc="iso8859-8"; - bool b; - codec = KGlobal::charsets()->codecForName(QLatin1String(enc), b); - if (!b) - return false; - } - - if (d->m_codec->mibEnum()==codec->mibEnum()) - { - // We already have the codec, but we still want to re-set the type, - // as we may have overwritten a default with a detected - d->m_source = type; - return true; - } - - if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec)) - { - //Sometimes the codec specified is absurd, i.e. UTF-16 despite - //us decoding a meta tag as ASCII. In that case, ignore it. - return false; - } - - if (codec->mibEnum() == Mib8859_8) - { - //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself. - codec = QTextCodec::codecForName("iso8859-8-i"); - - // visually ordered unless one of the following - if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical")) - d->m_visualRTL = true; - } - - d->m_codec = codec; - d->m_source = type; - delete d->m_decoder; - d->m_decoder = d->m_codec->makeDecoder(); -#ifdef DECODE_DEBUG - kDebug(6005) << "KEncodingDetector::encoding used is" << d->m_codec->name(); -#endif - return true; -} - -QString KEncodingDetector::decode(const char *data, int len) -{ - processNull(const_cast(data),len); - if (!d->m_analyzeCalled) - { - analyze(data,len); - d->m_analyzeCalled=true; - } - - return d->m_decoder->toUnicode(data,len); -} - -QString KEncodingDetector::decode(const QByteArray &data) -{ - processNull(const_cast(data.data()),data.size()); - if (!d->m_analyzeCalled) - { - analyze(data.data(),data.size()); - d->m_analyzeCalled=true; - } - - return d->m_decoder->toUnicode(data); -} - -QString KEncodingDetector::decodeWithBuffering(const char *data, int len) -{ -#ifdef DECODE_DEBUG - kWarning() << "KEncodingDetector: decoding "<m_writtingHappened) - { -#ifdef DECODE_DEBUG - kWarning() << "KEncodingDetector: d->m_writtingHappened "<< d->m_codec->name(); -#endif - processNull(const_cast(data),len); - return d->m_decoder->toUnicode(data, len); - } - else - { - if (d->m_bufferForDefferedEncDetection.isEmpty()) - { - // If encoding detection produced something, and we either got to the body or - // actually saw the encoding explicitly, we're done. - if (analyze(data,len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) - { -#ifdef DECODE_DEBUG - kWarning() << "KEncodingDetector: m_writtingHappened first time "<< d->m_codec->name(); -#endif - processNull(const_cast(data),len); - d->m_writtingHappened=true; - return d->m_decoder->toUnicode(data, len); - } - else - { -#ifdef DECODE_DEBUG - kWarning() << "KEncodingDetector: begin deffer"; -#endif - d->m_bufferForDefferedEncDetection=data; - } - } - else - { - d->m_bufferForDefferedEncDetection+=data; - // As above, but also limit the buffer size. We must use the entire buffer here, - // since the boundaries might split the meta tag, etc. - bool detected = analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length()); - if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) || - d->m_bufferForDefferedEncDetection.length() > MAX_BUFFER) - { - d->m_writtingHappened=true; - d->m_bufferForDefferedEncDetection.replace('\0',' '); - QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection)); - d->m_bufferForDefferedEncDetection.clear(); -#ifdef DECODE_DEBUG - kWarning() << "KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name(); -#endif - return result; - } - } - } - - return QString(); -} - -bool KEncodingDetector::decodedInvalidCharacters() const -{ - return d->m_decoder ? d->m_decoder->hasFailure() : false; -} - -QString KEncodingDetector::flush() -{ - if (d->m_bufferForDefferedEncDetection.isEmpty()) - return QString(); - - d->m_bufferForDefferedEncDetection.replace('\0',' '); - QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection)); - d->m_bufferForDefferedEncDetection.clear(); -#ifdef DECODE_DEBUG - kWarning() << "KEncodingDetector:flush() "<< d->m_bufferForDefferedEncDetection.length()<<" bytes "<< d->m_codec->name(); -#endif - return result; -} - -bool KEncodingDetector::analyze(const char *data, int len) -{ - // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. - // maximumBOMLength = 10 - // Even if the user has chosen utf16 we still need to auto-detect the endianness - if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) - { - // Extract the first three bytes. - const uchar *udata = (const uchar *)data; - uchar c1 = *udata++; - uchar c2 = *udata++; - uchar c3 = *udata++; - - // Check for the BOM - const char *autoDetectedEncoding; - if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) - { - autoDetectedEncoding = "UTF-16"; - } - else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) - { - autoDetectedEncoding = "UTF-8"; - } - else if (c1 == 0x00 || c2 == 0x00) - { - uchar c4 = *udata++; - uchar c5 = *udata++; - uchar c6 = *udata++; - uchar c7 = *udata++; - uchar c8 = *udata++; - uchar c9 = *udata++; - uchar c10 = *udata++; - - int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0); - int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0); - if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0)) - autoDetectedEncoding = "UTF-16"; - else - autoDetectedEncoding = 0; - } - else - { - autoDetectedEncoding = 0; - } - - // If we found a BOM, use the encoding it implies. - if (autoDetectedEncoding != 0) - { - d->m_source = BOM; - d->m_codec = QTextCodec::codecForName(autoDetectedEncoding); - assert(d->m_codec); - //enc = d->m_codec->name(); - delete d->m_decoder; - d->m_decoder = d->m_codec->makeDecoder(); -#ifdef DECODE_DEBUG - kWarning() << "Detection by BOM"; -#endif - if (is16Bit(d->m_codec) && c2==0x00) - { - // utf16LE, we need to put the decoder in LE mode - char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00}; - d->m_decoder->toUnicode(reverseUtf16, 2); - } - return true; - } - } - - //exit from routine in case it was called to only detect byte order for utf-16 - if (d->m_source==UserChosenEncoding) - { -#ifdef DECODE_DEBUG - kWarning() << "KEncodingDetector: UserChosenEncoding exit "; -#endif - - if (errorsIfUtf8(data, len)) - setEncoding("",DefaultEncoding); - return true; - } - - // HTTP header takes precedence over meta-type stuff - if (d->m_source==EncodingFromHTTPHeader) - return true; - - if (!d->m_seenBody) - { - // we still don't have an encoding, and are in the head - // the following tags are allowed in : - // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE - const char *ptr = data; - const char *pEnd = data+len; - - while(ptr != pEnd) - { - if(*ptr!='<') - { - ++ptr; - continue; - } - ++ptr; - // Handle comments. - if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') - { - ptr += 3; - skipComment(ptr, pEnd); - continue; - } - - // Handle XML header, which can have encoding in it. - if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l') - { - const char *end = ptr; - while (*end != '>' && end < pEnd) - end++; - if (*end == '\0' || end == pEnd) - break; - QByteArray str(ptr, end - ptr); // qbytearray provides the \0 terminator - int length; - int pos = findXMLEncoding(str, length); - // also handles the case when specified encoding aint correct - if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader)) - { - return true; - } - } - - //look for , stop if we reach - while ( - !(((*ptr >= 'a') && (*ptr <= 'z')) || - ((*ptr >= 'A') && (*ptr <= 'Z'))) - && ptr < pEnd - ) - ++ptr; - - char tmp[5]; - int length=0; - const char* max=ptr+4; - if (pEnd= 'a') && (*ptr <= 'z')) || - ((*ptr >= 'A') && (*ptr <= 'Z')) || - ((*ptr >= '0') && (*ptr <= '9'))) - && ptr < max - ) - { - tmp[length] = tolower( *ptr ); - ++ptr; - ++length; - } - tmp[length] = 0; - if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a') - { - // found a meta tag... - const char* end = ptr; - while(*end != '>' && *end != '\0' && end: " << str.mid(pos,endpos-pos).data(); - #endif - if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag)) - return true; - } - else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y') - { - d->m_seenBody=true; - break; - } - } - } - - if (len<20) - return false; - -#ifdef DECODE_DEBUG - kDebug( 6005 ) << "KEncodingDetector: using heuristics (" << strlen(data) << ")"; -#endif - - switch ( d->m_autoDetectLanguage) - { - case KEncodingDetector::Arabic: - return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding); -// break; - case KEncodingDetector::Baltic: - return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding); -// break; - case KEncodingDetector::CentralEuropean: - return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding); - break; - case KEncodingDetector::Cyrillic: - return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding); -// break; - case KEncodingDetector::Greek: - return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding); -// break; - case KEncodingDetector::Hebrew: - return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding); -// break; - case KEncodingDetector::Japanese: - return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding); -// break; - case KEncodingDetector::Turkish: - return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding); -// break; - case KEncodingDetector::WesternEuropean: - if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding)) - return true; - else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for khtml - { - return setEncoding("iso-8859-15",AutoDetectedEncoding); - } - else //use default provided by eg katepart - { - return setEncoding("",DefaultEncoding); - } -// break; - case KEncodingDetector::SemiautomaticDetection: - case KEncodingDetector::ChineseSimplified: - case KEncodingDetector::ChineseTraditional: - case KEncodingDetector::Korean: - case KEncodingDetector::Thai: - case KEncodingDetector::Unicode: - case KEncodingDetector::NorthernSaami: - case KEncodingDetector::SouthEasternEurope: - case KEncodingDetector::None: - // huh. somethings broken in this code ### FIXME - //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback. - break; - } - - return true; -} - - -KEncodingDetector::AutoDetectScript KEncodingDetector::scriptForName(const QString& lang) -{ - if (lang.isEmpty()) - return KEncodingDetector::None; - else if (lang==i18nc("@item Text character set", "Unicode")) - return KEncodingDetector::Unicode; - else if (lang==i18nc("@item Text character set", "Cyrillic")) - return KEncodingDetector::Cyrillic; - else if (lang==i18nc("@item Text character set", "Western European")) - return KEncodingDetector::WesternEuropean; - else if (lang==i18nc("@item Text character set", "Central European")) - return KEncodingDetector::CentralEuropean; - else if (lang==i18nc("@item Text character set", "Greek")) - return KEncodingDetector::Greek; - else if (lang==i18nc("@item Text character set", "Hebrew")) - return KEncodingDetector::Hebrew; - else if (lang==i18nc("@item Text character set", "Turkish")) - return KEncodingDetector::Turkish; - else if (lang==i18nc("@item Text character set", "Japanese")) - return KEncodingDetector::Japanese; - else if (lang==i18nc("@item Text character set", "Baltic")) - return KEncodingDetector::Baltic; - else if (lang==i18nc("@item Text character set", "Arabic")) - return KEncodingDetector::Arabic; - - return KEncodingDetector::None; -} - -bool KEncodingDetector::hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script) -{ - switch (script) - { - case KEncodingDetector::Arabic: - return true; - case KEncodingDetector::Baltic: - return true; - case KEncodingDetector::CentralEuropean: - return true; - case KEncodingDetector::Cyrillic: - return true; - case KEncodingDetector::Greek: - return true; - case KEncodingDetector::Hebrew: - return true; - case KEncodingDetector::Japanese: - return true; - case KEncodingDetector::Turkish: - return true; - case KEncodingDetector::WesternEuropean: - return true; - case KEncodingDetector::ChineseTraditional: - return true; - case KEncodingDetector::ChineseSimplified: - return true; - case KEncodingDetector::Unicode: - return true; - break; - default: - return false; - } -} - -QString KEncodingDetector::nameForScript(KEncodingDetector::AutoDetectScript script) -{ - switch (script) - { - case KEncodingDetector::Arabic: - return i18nc("@item Text character set", "Arabic"); - break; - case KEncodingDetector::Baltic: - return i18nc("@item Text character set", "Baltic"); - break; - case KEncodingDetector::CentralEuropean: - return i18nc("@item Text character set", "Central European"); - break; - case KEncodingDetector::Cyrillic: - return i18nc("@item Text character set", "Cyrillic"); - break; - case KEncodingDetector::Greek: - return i18nc("@item Text character set", "Greek"); - break; - case KEncodingDetector::Hebrew: - return i18nc("@item Text character set", "Hebrew"); - break; - case KEncodingDetector::Japanese: - return i18nc("@item Text character set", "Japanese"); - break; - case KEncodingDetector::Turkish: - return i18nc("@item Text character set", "Turkish"); - break; - case KEncodingDetector::WesternEuropean: - return i18nc("@item Text character set", "Western European"); - break; - case KEncodingDetector::ChineseTraditional: - return i18nc("@item Text character set", "Chinese Traditional"); - break; - case KEncodingDetector::ChineseSimplified: - return i18nc("@item Text character set", "Chinese Simplified"); - break; - case KEncodingDetector::Korean: - return i18nc("@item Text character set", "Korean"); - break; - case KEncodingDetector::Thai: - return i18nc("@item Text character set", "Thai"); - break; - case KEncodingDetector::Unicode: - return i18nc("@item Text character set", "Unicode"); - break; - //case KEncodingDetector::SemiautomaticDetection: - default: - return QString(); - - } -} - -#undef DECODE_DEBUG - diff --git a/kdecore/localization/kencodingdetector.h b/kdecore/localization/kencodingdetector.h deleted file mode 100644 index f0203340..00000000 --- a/kdecore/localization/kencodingdetector.h +++ /dev/null @@ -1,219 +0,0 @@ -/* - This file is part of the KDE libraries - - Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) - Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public License - along with this library; see the file COPYING.LIB. If not, write to - the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - Boston, MA 02110-1301, USA. - -*/ -#ifndef KENCODINGDETECTOR_H -#define KENCODINGDETECTOR_H - -#include -#include - -#include -#include -class KEncodingDetectorPrivate; - -/** - * @short Provides encoding detection capabilities. - * - * Searches for encoding declaration inside raw data -- meta and xml tags. - * In the case it can't find it, uses heuristics for specified language. - * - * If it finds unicode BOM marks, it changes encoding regardless of what the user has told - * - * Intended lifetime of the object: one instance per document. - * - * Typical use: - * \code - * QByteArray data; - * ... - * KEncodingDetector detector; - * detector.setAutoDetectLanguage(KEncodingDetector::Cyrillic); - * QString out=detector.decode(data); - * \endcode - * - * - * Do not mix decode() with decodeWithBuffering() - * - * @short Guess encoding of char array - * - */ -class KDECORE_EXPORT KEncodingDetector -{ -public: - enum EncodingChoiceSource - { - DefaultEncoding, - AutoDetectedEncoding, - BOM, - EncodingFromXMLHeader, - EncodingFromMetaTag, - EncodingFromHTTPHeader, - UserChosenEncoding - }; - - enum AutoDetectScript - { - None, - SemiautomaticDetection, - Arabic, - Baltic, - CentralEuropean, - ChineseSimplified, - ChineseTraditional, - Cyrillic, - Greek, - Hebrew, - Japanese, - Korean, - NorthernSaami, - SouthEasternEurope, - Thai, - Turkish, - Unicode, - WesternEuropean - }; - - /** - * Default codec is latin1 (as html spec says), EncodingChoiceSource is default, AutoDetectScript=Semiautomatic - */ - KEncodingDetector(); - - /** - * Allows to set Default codec, EncodingChoiceSource, AutoDetectScript - */ - KEncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script=None); - ~KEncodingDetector(); - - //const QTextCodec* codec() const; - - /** - * @returns true if specified encoding was recognized - */ - bool setEncoding(const char *encoding, EncodingChoiceSource type); - - /** - * Convenience method. - * @returns mime name of detected encoding - */ - const char* encoding() const; - - bool visuallyOrdered() const; - -// void setAutoDetectLanguage( const QString& ); -// const QString& autoDetectLanguage() const; - - void setAutoDetectLanguage( AutoDetectScript ); - AutoDetectScript autoDetectLanguage() const; - - EncodingChoiceSource encodingChoiceSource() const; - - /** - * The main class method - * - * Calls protected analyze() only the first time of the whole object life - * - * Replaces all null chars with spaces. - */ - QString decode(const char *data, int len); - QString decode(const QByteArray &data); - - //* You don't need to call analyze() if you use this method. - /** - * Convenience method that uses buffering. It waits for full html head to be buffered - * (i.e. calls analyze every time until it returns true). - * - * Replaces all null chars with spaces. - * - * @returns Decoded data, or empty string, if there was not enough data for accurate detection - * @see flush() - */ - QString decodeWithBuffering(const char *data, int len); - - /** - * This method checks whether invalid characters were found - * during a decoding operation. - * - * Note that this bit is never reset once invalid characters have been found. - * To force a reset, either change the encoding using setEncoding() or call - * resetDecoder() - * - * @returns a boolean reflecting said state. - * @since 4.3 - * @see resetDecoder() setEncoding() - */ - bool decodedInvalidCharacters() const; - - /** - * Resets the decoder. Any stateful decoding information (such as resulting from previous calls - * to decodeWithBuffering()) will be lost. - * Will Reset the state of decodedInvalidCharacters() as a side effect. - * - * @since 4.3 - * @see decodeWithBuffering() decodedInvalidCharacters() - * - */ - void resetDecoder(); - - /** - * Convenience method to be used with decodeForHtml. Flushes buffer. - * @see decodeForHtml() - */ - QString flush(); - - /** - * Takes lang name _after_ it were i18n()'ed - */ - static AutoDetectScript scriptForName(const QString& lang); - static QString nameForScript(AutoDetectScript); - static bool hasAutoDetectionForScript(AutoDetectScript); - -protected: - /** - * This nice method will kill all 0 bytes (or double bytes) - * and remember if this was a binary or not ;) - */ - bool processNull(char* data,int length); - - /** - * Check if we are really utf8. Taken from kate - * - * @returns true if current encoding is utf8 and the text cannot be in this encoding - * - * Please somebody read http://de.wikipedia.org/wiki/UTF-8 and check this code... - */ - bool errorsIfUtf8 (const char* data, int length); - - /** - * Analyze text data. - * @returns true if there was enough data for accurate detection - */ - bool analyze (const char *data, int len); - - /** - * @returns QTextDecoder for detected encoding - */ - QTextDecoder* decoder(); - -private: - KEncodingDetectorPrivate* const d; -}; - -#endif diff --git a/kdecore/tests/CMakeLists.txt b/kdecore/tests/CMakeLists.txt index 771b7103..835e849b 100644 --- a/kdecore/tests/CMakeLists.txt +++ b/kdecore/tests/CMakeLists.txt @@ -57,7 +57,6 @@ KDECORE_UNIT_TESTS( kconfigafterkglobaltest2 ksycocathreadtest kdebug_unittest - kencodingdetectortest qcoreapptest kdebug_qcoreapptest kmimetype_nomimetypes diff --git a/kdecore/tests/kencodingdetectortest.cpp b/kdecore/tests/kencodingdetectortest.cpp deleted file mode 100644 index 893d9704..00000000 --- a/kdecore/tests/kencodingdetectortest.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* This file is part of the KDE libraries - Copyright (c) 2009 Germain Garand - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License version 2 as published by the Free Software Foundation. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public License - along with this library; see the file COPYING.LIB. If not, write to - the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - Boston, MA 02110-1301, USA. -*/ - -#include "kencodingdetectortest.h" -#include "qtest_kde.h" -#include -#include - -#include "moc_kencodingdetectortest.cpp" - -static const char data1[] = "this should decode correctly"; -static const char data2[] = "this is an invalid utf-8 byte: \xBF and another one: \xBE"; - -static KEncodingDetector* ed = 0; - -void KEncodingDetectorTest::initTestCase() -{ - ed = new KEncodingDetector(); -} - -void KEncodingDetectorTest::testSetEncoding() -{ - QCOMPARE(ed->setEncoding( "iso8859-1", KEncodingDetector::UserChosenEncoding ), true); - QCOMPARE(ed->setEncoding( "utf-8", KEncodingDetector::UserChosenEncoding ), true); -} - -void KEncodingDetectorTest::testDecode() -{ - QString s = ed->decode( data1, sizeof(data1)-1); - QCOMPARE(ed->decodedInvalidCharacters(), false); - QString s2 = ed->decode( data2, sizeof(data2)-1); - QCOMPARE(ed->decodedInvalidCharacters(), true); - QCOMPARE( s == data1, true ); - - ed->resetDecoder(); - QCOMPARE(ed->decodedInvalidCharacters(), false); - - // set to automatic detection - ed->setEncoding( "", KEncodingDetector::DefaultEncoding ); - - // decodeWithBuffering should just accumulate the buffer here, - // waiting for some HTML/XML encoding tags - s = ed->decodeWithBuffering(data2, sizeof data2 -1); - - // shouldn't even decode anything yet, so: - QCOMPARE(s.isEmpty(), true); - QCOMPARE(ed->decodedInvalidCharacters(), false); - - // force encoding, as the high bytes must have switched the encoding - // to anything *but* utf-8 - QCOMPARE(QString("utf-8").startsWith(ed->encoding(), Qt::CaseInsensitive), false); - ed->setEncoding( "utf-8", KEncodingDetector::UserChosenEncoding ); - QCOMPARE(QString("utf-8").startsWith(ed->encoding(), Qt::CaseInsensitive), true); - - // force decoding now - s = ed->flush(); - QCOMPARE(s.isEmpty(), false); - QCOMPARE(ed->decodedInvalidCharacters(), true); - - // now check that resetDecoder() empties the buffer - s2 = ed->decodeWithBuffering(data1, sizeof data1 -1); - ed->resetDecoder(); - s2 = ed->flush(); - QCOMPARE(s2.isEmpty(), true); - - // check that buffered decoding with non-overridable specified codec decodes right away - ed->setEncoding( "utf-8", KEncodingDetector::EncodingFromHTTPHeader ); - s = ed->decodeWithBuffering(data2, sizeof data2 -1); - - QCOMPARE( s.isEmpty(), false ); - QCOMPARE( ed->decodedInvalidCharacters(), true ); -} - -QTEST_KDEMAIN_CORE(KEncodingDetectorTest) diff --git a/kdecore/tests/kencodingdetectortest.h b/kdecore/tests/kencodingdetectortest.h deleted file mode 100644 index a9eedec4..00000000 --- a/kdecore/tests/kencodingdetectortest.h +++ /dev/null @@ -1,33 +0,0 @@ -/* This file is part of the KDE libraries - Copyright (c) 2009 Germain Garand - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License version 2 as published by the Free Software Foundation. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public License - along with this library; see the file COPYING.LIB. If not, write to - the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - Boston, MA 02110-1301, USA. -*/ - -#ifndef KENCODINGDETECTORTEST_H -#define KENCODINGDETECTORTEST_H - -#include - -class KEncodingDetectorTest : public QObject -{ - Q_OBJECT -private Q_SLOTS: - void initTestCase(); - void testSetEncoding(); - void testDecode(); -}; - -#endif // KENCODINGDETECTORTEST_H diff --git a/kdecore/text/kstringhandler.h b/kdecore/text/kstringhandler.h index 6919778a..a3e1242c 100644 --- a/kdecore/text/kstringhandler.h +++ b/kdecore/text/kstringhandler.h @@ -194,7 +194,6 @@ namespace KStringHandler @param str the input string @return the (hopefully correctly guessed) QString representation of @p str - @see KEncodingDetector */ KDECORE_EXPORT QString from8Bit( const char *str ); diff --git a/kdeui/actions/kcodecaction.cpp b/kdeui/actions/kcodecaction.cpp index 701ccef0..db77d927 100644 --- a/kdeui/actions/kcodecaction.cpp +++ b/kdeui/actions/kcodecaction.cpp @@ -45,39 +45,37 @@ class KCodecAction::Private public: Private(KCodecAction *parent) : q(parent), - defaultAction(0), currentSubAction(0) { } - void init(bool); + void init(); void _k_subActionTriggered(QAction*); KCodecAction *q; - QAction *defaultAction; QAction *currentSubAction; }; -KCodecAction::KCodecAction(QObject *parent,bool showAutoOptions) +KCodecAction::KCodecAction(QObject *parent) : KSelectAction(parent) , d(new Private(this)) { - d->init(showAutoOptions); + d->init(); } -KCodecAction::KCodecAction(const QString &text, QObject *parent,bool showAutoOptions) +KCodecAction::KCodecAction(const QString &text, QObject *parent) : KSelectAction(text, parent) , d(new Private(this)) { - d->init(showAutoOptions); + d->init(); } -KCodecAction::KCodecAction(const KIcon &icon, const QString &text, QObject *parent,bool showAutoOptions) +KCodecAction::KCodecAction(const KIcon &icon, const QString &text, QObject *parent) : KSelectAction(icon, text, parent) , d(new Private(this)) { - d->init(showAutoOptions); + d->init(); } KCodecAction::~KCodecAction() @@ -85,26 +83,13 @@ KCodecAction::~KCodecAction() delete d; } -void KCodecAction::Private::init(bool showAutoOptions) +void KCodecAction::Private::init() { q->setToolBarMode(MenuMode); - defaultAction = q->addAction(i18nc("Encodings menu", "Default")); - int i; - foreach(const QStringList &encodingsForScript, KGlobal::charsets()->encodingsByScript()) - { - KSelectAction* tmp = new KSelectAction(encodingsForScript.at(0),q); - if (showAutoOptions) - { - KEncodingDetector::AutoDetectScript scri=KEncodingDetector::scriptForName(encodingsForScript.at(0)); - if (KEncodingDetector::hasAutoDetectionForScript(scri)) - { - tmp->addAction(i18nc("Encodings menu","Autodetect"))->setData(QVariant((uint)scri)); - tmp->menu()->addSeparator(); - } - } - for (i=1; iencodingsByScript()) { + KSelectAction* tmp = new KSelectAction(encodingsForScript.at(0), q); + for (int i = 1; iaddAction(encodingsForScript.at(i)); } q->connect(tmp,SIGNAL(triggered(QAction*)),q,SLOT(_k_subActionTriggered(QAction*))); @@ -122,26 +107,23 @@ int KCodecAction::mibForName(const QString &codecName, bool *ok) const int mib = MIB_DEFAULT; KCharsets *charsets = KGlobal::charsets(); - if (codecName == d->defaultAction->text()) - success = true; - else - { - QTextCodec *codec = charsets->codecForName(codecName, success); - if (!success) - { - // Maybe we got a description name instead - codec = charsets->codecForName(charsets->encodingForName(codecName), success); - } - - if (codec) - mib = codec->mibEnum(); + QTextCodec *codec = charsets->codecForName(codecName, success); + if (!success) { + // Maybe we got a description name instead + codec = charsets->codecForName(charsets->encodingForName(codecName), success); } - if (ok) - *ok = success; + if (codec) { + mib = codec->mibEnum(); + } - if (success) + if (ok) { + *ok = success; + } + + if (success) { return mib; + } kWarning() << "Invalid codec name: " << codecName; return MIB_DEFAULT; @@ -149,43 +131,24 @@ int KCodecAction::mibForName(const QString &codecName, bool *ok) const QTextCodec *KCodecAction::codecForMib(int mib) const { - if (mib == MIB_DEFAULT) - { - // FIXME offer to change the default codec + if (mib == MIB_DEFAULT) { return QTextCodec::codecForLocale(); } - else - return QTextCodec::codecForMib(mib); -} - -void KCodecAction::actionTriggered(QAction *action) -{ -//we don't want to emit any signals from top-level items -//except for the default one - if (action==d->defaultAction) - { - emit triggered(KEncodingDetector::SemiautomaticDetection); - emit defaultItemTriggered(); - } + return QTextCodec::codecForMib(mib); } void KCodecAction::Private::_k_subActionTriggered(QAction *action) { - if (currentSubAction==action) + if (currentSubAction == action) { return; - currentSubAction=action; + } + currentSubAction = action; bool ok = false; int mib = q->mibForName(action->text(), &ok); - if (ok) - { + if (ok) { emit q->triggered(action->text()); emit q->triggered(q->codecForMib(mib)); } - else - { - if (!action->data().isNull()) - emit q->triggered((KEncodingDetector::AutoDetectScript) action->data().toUInt()); - } } QTextCodec *KCodecAction::currentCodec() const @@ -195,21 +158,15 @@ QTextCodec *KCodecAction::currentCodec() const bool KCodecAction::setCurrentCodec( QTextCodec *codec ) { - if (!codec) + if (!codec) { return false; + } - int i,j; - for (i=0;imenu()) - { - for (j=0;jmenu()->actions().size();++j) - { - if (!j && !actions().at(i)->menu()->actions().at(j)->data().isNull()) - continue; - if (codec==KGlobal::charsets()->codecForName(actions().at(i)->menu()->actions().at(j)->text())) - { - d->currentSubAction=actions().at(i)->menu()->actions().at(j); + for (int i = 0; i menu()) { + for (int j = 0; j < actions().at(i)->menu()->actions().size(); ++j) { + if (codec == KGlobal::charsets()->codecForName(actions().at(i)->menu()->actions().at(j)->text())) { + d->currentSubAction = actions().at(i)->menu()->actions().at(j); d->currentSubAction->trigger(); return true; } @@ -225,7 +182,7 @@ QString KCodecAction::currentCodecName() const return d->currentSubAction->text(); } -bool KCodecAction::setCurrentCodec( const QString &codecName ) +bool KCodecAction::setCurrentCodec(const QString &codecName) { return setCurrentCodec(KGlobal::charsets()->codecForName(codecName)); } @@ -235,47 +192,9 @@ int KCodecAction::currentCodecMib() const return mibForName(currentCodecName()); } -bool KCodecAction::setCurrentCodec( int mib ) +bool KCodecAction::setCurrentCodec(int mib) { - if (mib == MIB_DEFAULT) - return setCurrentAction(d->defaultAction); - else - return setCurrentCodec(codecForMib(mib)); -} - -KEncodingDetector::AutoDetectScript KCodecAction::currentAutoDetectScript() const -{ - return d->currentSubAction->data().isNull()? - KEncodingDetector::None : - (KEncodingDetector::AutoDetectScript)d->currentSubAction->data().toUInt(); -} - -bool KCodecAction::setCurrentAutoDetectScript(KEncodingDetector::AutoDetectScript scri) -{ - if (scri==KEncodingDetector::SemiautomaticDetection) - { - d->currentSubAction=d->defaultAction; - d->currentSubAction->trigger(); - return true; - } - - int i; - for (i=0;imenu()) - { - if (!actions().at(i)->menu()->actions().isEmpty() - &&!actions().at(i)->menu()->actions().at(0)->data().isNull() - &&actions().at(i)->menu()->actions().at(0)->data().toUInt()==(uint)scri - ) - { - d->currentSubAction=actions().at(i)->menu()->actions().at(0); - d->currentSubAction->trigger(); - return true; - } - } - } - return false; + return setCurrentCodec(codecForMib(mib)); } #include "moc_kcodecaction.cpp" diff --git a/kdeui/actions/kcodecaction.h b/kdeui/actions/kcodecaction.h index 53e0f819..f5980d39 100644 --- a/kdeui/actions/kcodecaction.h +++ b/kdeui/actions/kcodecaction.h @@ -27,7 +27,7 @@ #ifndef KCODECACTION_H #define KCODECACTION_H -#include +#include #include /** @@ -35,81 +35,47 @@ * * This action shows up a submenu with a list of the available codecs on the system. */ -class KDEUI_EXPORT KCodecAction - : public KSelectAction +class KDEUI_EXPORT KCodecAction : public KSelectAction { - Q_OBJECT + Q_OBJECT - Q_PROPERTY(QString codecName READ currentCodecName WRITE setCurrentCodec) - Q_PROPERTY(int codecMib READ currentCodecMib) + Q_PROPERTY(QString codecName READ currentCodecName WRITE setCurrentCodec) + Q_PROPERTY(int codecMib READ currentCodecMib) public: - explicit KCodecAction(QObject *parent,bool showAutoOptions=false); + explicit KCodecAction(QObject *parent); + KCodecAction(const QString &text, QObject *parent); + KCodecAction(const KIcon &icon, const QString &text, QObject *parent); - KCodecAction(const QString &text, QObject *parent,bool showAutoOptions=false); - - KCodecAction(const KIcon &icon, const QString &text, QObject *parent,bool showAutoOptions=false); - - virtual ~KCodecAction(); + virtual ~KCodecAction(); public: - int mibForName(const QString &codecName, bool *ok = 0) const; - QTextCodec *codecForMib(int mib) const; + int mibForName(const QString &codecName, bool *ok = 0) const; + QTextCodec *codecForMib(int mib) const; - QTextCodec *currentCodec() const; - bool setCurrentCodec(QTextCodec *codec); + QTextCodec *currentCodec() const; + bool setCurrentCodec(QTextCodec *codec); - QString currentCodecName() const; - bool setCurrentCodec(const QString &codecName); - - int currentCodecMib() const; - bool setCurrentCodec(int mib); - - /** - * Applicable only if showAutoOptions in c'tor was true - * - * @returns KEncodingDetector::None if specific encoding is selected, not autodetection, otherwise... you know it! - */ - KEncodingDetector::AutoDetectScript currentAutoDetectScript() const; - /** - * Applicable only if showAutoOptions in c'tor was true - * - * KEncodingDetector::SemiautomaticDetection means 'Default' item - */ - bool setCurrentAutoDetectScript(KEncodingDetector::AutoDetectScript); + QString currentCodecName() const; + bool setCurrentCodec(const QString &codecName); + int currentCodecMib() const; + bool setCurrentCodec(int mib); Q_SIGNALS: - /** - * Specific (proper) codec was selected - * - * Note that triggered(const QString&) is emitted too (as defined in KSelectAction) - */ - void triggered(QTextCodec *codec); - - /** - * Autodetection has been selected. - * emits KEncodingDetector::SemiautomaticDetection if Default was selected. - * - * Applicable only if showAutoOptions in c'tor was true - */ - void triggered(KEncodingDetector::AutoDetectScript); - - /** - * If showAutoOptions==true, then better handle triggered(KEncodingDetector::AutoDetectScript) signal - */ - void defaultItemTriggered(); - - -protected Q_SLOTS: - virtual void actionTriggered(QAction*); + /** + * Specific (proper) codec was selected + * + * Note that triggered(const QString&) is emitted too (as defined in KSelectAction) + */ + void triggered(QTextCodec *codec); protected: - using KSelectAction::triggered; + using KSelectAction::triggered; private: - class Private; - Private* const d; + class Private; + Private* const d; Q_PRIVATE_SLOT( d, void _k_subActionTriggered(QAction*) ) };