kde-workspace/kate/part/buffer/katetextloader.h
Ivailo Monev c31839755f kate: adjust to Katie changes
Signed-off-by: Ivailo Monev <xakepa10@gmail.com>
2023-11-05 00:49:55 +02:00

349 lines
9.5 KiB
C++

/* This file is part of the Kate project.
*
* Copyright (C) 2010 Christoph Cullmann <cullmann@kde.org>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public License
* along with this library; see the file COPYING.LIB. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifndef KATE_TEXTLOADER_H
#define KATE_TEXTLOADER_H
#include <kateglobal.h>
#include <QtCore/QString>
#include <QtCore/QFile>
#include <QCryptographicHash>
namespace Kate {
/**
 * File loader: reads a file chunk by chunk, converts the bytes to unicode
 * and hands the text out line by line. While reading it looks for a byte
 * order mark, records the end-of-line mode it sees and feeds all raw bytes
 * into a hash.
 *
 * The loader owns its file device and its text converter (both are deleted
 * in the destructor), therefore instances must not be copied.
 */
class TextLoader
{
  public:
    /**
     * Construct file loader for given file.
     * The file is only opened once open() is called.
     * @param filename file to open
     */
    TextLoader (const QString &filename)
      : m_codec (0)
      , m_eof (false) // default to not eof
      , m_lastWasEndOfLine (true) // at start of file, we had a virtual newline
      , m_lastWasR (false) // we have not found a \r as last char
      , m_position (0)
      , m_lastLineStart (0)
      , m_eol (TextBuffer::eolUnknown) // no eol type detected atm
      , m_buffer (KATE_FILE_LOADER_BS, 0)
      , m_converter (0)
      , m_bomFound (false)
      , m_firstRead (true)
    {
      // construct file device
      m_file = new QFile (filename);
    }

    /**
     * Destructor, releases the file device and the converter.
     */
    ~TextLoader ()
    {
      delete m_file;
      delete m_converter;
    }

    /**
     * (Re)open the file with the given codec.
     * All state of a previous load (position, eol mode, converted text,
     * converter, BOM flag and hash) is discarded first, so the loader can
     * be reused for another attempt on the same file.
     * @param codec codec to use, if 0, the codec is taken from a byte order
     *              mark on the first read (see readLine())
     * @return success of opening the file read-only
     */
    bool open (QTextCodec *codec)
    {
      m_codec = codec;
      m_eof = false;
      m_lastWasEndOfLine = true;
      m_lastWasR = false;
      m_position = 0;
      m_lastLineStart = 0;
      m_eol = TextBuffer::eolUnknown;
      m_text.clear ();
      delete m_converter;
      m_converter = 0;
      m_bomFound = false;
      m_firstRead = true;

      // start a fresh hash, else bytes hashed by an earlier load attempt
      // would be mixed into the digest of this one
      m_digest.reset ();

      // if already opened, close the file...
      if (m_file->isOpen())
        m_file->close ();

      return m_file->open (QIODevice::ReadOnly);
    }

    /**
     * end of file reached?
     * True only once the device is exhausted, the last thing handed out was
     * not a line ending and no unread text is left in the internal buffer.
     * @return end of file reached
     */
    bool eof () const { return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length()); }

    /**
     * Detected end of line mode for this file.
     * Detected during reading, is valid after complete file is read.
     * @return eol mode of this file
     */
    TextBuffer::EndOfLineMode eol () const { return m_eol; }

    /**
     * BOM found?
     * @return byte order mark found?
     */
    bool byteOrderMarkFound () const { return m_bomFound; }

    /**
     * internal unicode data array
     * The offsets returned by readLine() index into this array. readLine()
     * modifies the underlying string, so the pointer should be fetched
     * again after each readLine() call.
     * @return internal unicode data
     */
    const QChar *unicode () const { return m_text.unicode(); }

    /**
     * Get codec for this loader
     * @return currently in use codec of this loader
     */
    QTextCodec *textCodec () const { return m_codec; }

    /**
     * read a line, return length + offset in unicode data
     * @param offset offset into internal unicode data for read line
     * @param length length of read line
     * @return true if no encoding errors occurred; false if the converted
     *         text contained a null character, if two reads in a row
     *         produced no text, or if no codec was given to open() and the
     *         first chunk carries no byte order mark to pick one from
     */
    bool readLine (int &offset, int &length)
    {
      length = 0;
      offset = 0;
      bool encodingError = false;

      static const QLatin1Char cr(QLatin1Char('\r'));
      static const QLatin1Char lf(QLatin1Char('\n'));

      /**
       * did we read two times but got no stuff? encoding error
       * fixes problem with one character latin-1 files, which lead to crash otherwise!
       * bug 272579
       */
      bool failedToConvertOnce = false;

      /**
       * reading loop
       */
      while (m_position <= m_text.length())
      {
        // all buffered text consumed: refill or finish
        if (m_position == m_text.length())
        {
          // try to load more text if something is around
          if (!m_eof)
          {
            // kill the old lines...
            m_text.remove (0, m_lastLineStart);

            // try to read new data
            const int c = m_file->read(m_buffer.data(), m_buffer.size());

            // if any text is there, append it....
            if (c > 0)
            {
              // update hash sum
              m_digest.addData (m_buffer.data(), c);

              // detect byte order marks & codec for byte order markers on first read
              int bomBytes = 0;
              if (m_firstRead) {
                // use first 16 bytes max to allow BOM detection of codec
                QByteArray bom (m_buffer.data(), qMin (16, c));
                QTextCodec *codecForByteOrderMark = QTextCodec::codecForUtfText (bom, 0);

                // if codec != null, we found a BOM!
                if (codecForByteOrderMark) {
                  m_bomFound = true;

                  // eat away the different boms!
                  int mib = codecForByteOrderMark->mibEnum ();
                  if (mib == 106) // utf8
                    bomBytes = 3;
                  if (mib == 1013 || mib == 1014 || mib == 1015) // utf16
                    bomBytes = 2;
                  if (mib == 1017 || mib == 1018 || mib == 1019) // utf32
                    bomBytes = 4;
                }

                /**
                 * if no codec given, do autodetection
                 */
                if (!m_codec) {
                  /**
                   * byte order said something about encoding?
                   */
                  if (codecForByteOrderMark)
                    m_codec = codecForByteOrderMark;
                  else {
                    // no codec and no BOM: nothing to decode with, report failure
                    return false;
                  }
                }

                m_firstRead = false;
              }

              Q_ASSERT (m_codec);

              // the converter is created lazily and kept across chunks
              if (!m_converter) {
                m_converter = new QTextConverter(m_codec->name());
                m_converter->setFlags(QTextConverter::ConvertInvalidToNull);
              }
              QString unicode = m_converter->toUnicode (m_buffer.constData() + bomBytes, c - bomBytes);

              // detect broken encoding: any null character counts as error
              for (int i = 0; i < unicode.size(); ++i) {
                  if (unicode[i] == 0) {
                      encodingError = true;
                      break;
                  }
              }

              m_text.append (unicode);
            }

            // is file completely read ?
            m_eof = (c == -1) || (c == 0);

            // recalc current pos and last pos
            m_position -= m_lastLineStart;
            m_lastLineStart = 0;
          }

          // oh oh, end of file, escape !
          if (m_eof && (m_position == m_text.length()))
          {
            m_lastWasEndOfLine = false;

            // line data
            offset = m_lastLineStart;
            length = m_position-m_lastLineStart;

            m_lastLineStart = m_position;

            return !encodingError && !failedToConvertOnce;
          }

          // empty? try again
          if (m_position == m_text.length()) {
            failedToConvertOnce = true;
            continue;
          }
        }

        if (m_text.at(m_position) == lf)
        {
          m_lastWasEndOfLine = true;

          if (m_lastWasR)
          {
            // \n directly after \r: the line was already returned for the
            // \r, only skip the \n and remember the dos eol mode
            m_lastLineStart++;
            m_lastWasR = false;
            m_eol = TextBuffer::eolDos;
          }
          else
          {
            // line data
            offset = m_lastLineStart;
            length = m_position-m_lastLineStart;

            m_lastLineStart = m_position+1;
            m_position++;

            // only win, if not dos!
            if (m_eol != TextBuffer::eolDos)
              m_eol = TextBuffer::eolUnix;

            return !encodingError;
          }
        }
        else if (m_text.at(m_position) == cr)
        {
          m_lastWasEndOfLine = true;
          m_lastWasR = true;

          // line data
          offset = m_lastLineStart;
          length = m_position-m_lastLineStart;

          m_lastLineStart = m_position+1;
          m_position++;

          // should only win of first time!
          if (m_eol == TextBuffer::eolUnknown)
            m_eol = TextBuffer::eolMac;

          return !encodingError;
        }
        else if (m_text.at(m_position) == QChar::LineSeparator)
        {
          // unicode line separator ends a line, but does not touch the eol mode
          m_lastWasEndOfLine = true;

          // line data
          offset = m_lastLineStart;
          length = m_position-m_lastLineStart;

          m_lastLineStart = m_position+1;
          m_position++;

          return !encodingError;
        }
        else
        {
          m_lastWasEndOfLine = false;
          m_lastWasR = false;
        }

        m_position++;
      }

      return !encodingError;
    }

    /**
     * Hash over the raw bytes read from the file since the last open().
     * @return current result of the hash
     */
    QByteArray digest ()
    {
      return m_digest.result ();
    }

  private:
    /**
     * Copying is not supported: the loader owns m_file and m_converter and
     * a copy would delete them twice. Declared, but not implemented.
     */
    TextLoader (const TextLoader &);
    TextLoader &operator= (const TextLoader &);

  private:
    // codec in use, either passed to open() or taken from the BOM
    QTextCodec *m_codec;
    // file device delivered no more data
    bool m_eof;
    // last thing handled by readLine() was a line ending
    bool m_lastWasEndOfLine;
    // last character handled was a \r
    bool m_lastWasR;
    // current scan position in m_text
    int m_position;
    // start of the not yet returned line in m_text
    int m_lastLineStart;
    // end-of-line mode detected so far
    TextBuffer::EndOfLineMode m_eol;
    // file device, owned
    QIODevice *m_file;
    // raw read buffer
    QByteArray m_buffer;
    // hash over the raw bytes read
    QCryptographicHash m_digest;
    // converted text not yet discarded
    QString m_text;
    // bytes to unicode converter, owned, created on first use
    QTextConverter *m_converter;
    // byte order mark seen in first chunk?
    bool m_bomFound;
    // next successful read is the first one after open()?
    bool m_firstRead;
};
}
#endif