kde-workspace/kate/part/buffer/katetextloader.h

/*  This file is part of the Kate project.
 *
 *  Copyright (C) 2010 Christoph Cullmann <cullmann@kde.org>
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Library General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Library General Public License for more details.
 *
 *  You should have received a copy of the GNU Library General Public License
 *  along with this library; see the file COPYING.LIB.  If not, write to
 *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 *  Boston, MA 02110-1301, USA.
 */

#ifndef KATE_TEXTLOADER_H
#define KATE_TEXTLOADER_H

#include <kateglobal.h>

#include <QtCore/QString>
#include <QtCore/QFile>
#include <QCryptographicHash>

namespace Kate {

/**
 * File Loader, will handle reading of files + detecting encoding
 */
class TextLoader
{
  public:
    /**
     * Construct file loader for given file.
     * @param filename file to open
     * @param proberType prober type
     */
    TextLoader (const QString &filename)
      : m_codec (0)
      , m_eof (false) // default to not eof
      , m_lastWasEndOfLine (true) // at start of file, we had a virtual newline
      , m_lastWasR (false) // we have not found a \r as last char
      , m_position (0)
      , m_lastLineStart (0)
      , m_eol (TextBuffer::eolUnknown) // no eol type detected atm
      , m_buffer (KATE_FILE_LOADER_BS, 0)
      , m_converter (0)
      , m_bomFound (false)
      , m_firstRead (true)
    {
      // construct file device
      m_file = new QFile (filename);
    }

    /**
     * Destructor
     */
    ~TextLoader ()
    {
      delete m_file;
      delete m_converter;
    }

    /**
     * open file with given codec
     * @param codec codec to use, if 0, will do some auto-dectect or fallback
     * @return success
     */
    bool open (QTextCodec *codec)
    {
      m_codec = codec;
      m_eof = false;
      m_lastWasEndOfLine = true;
      m_lastWasR = false;
      m_position = 0;
      m_lastLineStart = 0;
      m_eol = TextBuffer::eolUnknown;
      m_text.clear ();
      delete m_converter;
      m_converter = 0;
      m_bomFound = false;
      m_firstRead = true;

      // if already opened, close the file...
      if (m_file->isOpen())
        m_file->close ();

      return m_file->open (QIODevice::ReadOnly);
    }

    /**
     * end of file reached?
     * @return end of file reached
     */
    bool eof () const { return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length()); }

    /**
     * Detected end of line mode for this file.
     * Detected during reading, is valid after complete file is read.
     * @return eol mode of this file
     */
    TextBuffer::EndOfLineMode eol () const { return m_eol; }

    /**
     * BOM found?
     * @return byte order mark found?
     */
    bool byteOrderMarkFound () const { return m_bomFound; }

    /**
     * internal unicode data array
     * @return internal unicode data
     */
    const QChar *unicode () const { return m_text.unicode(); }

    /**
     * Get codec for this loader
     * @return currently in use codec of this loader
     */
    QTextCodec *textCodec () const { return m_codec; }

    /**
     * read a line, return length + offset in unicode data
     * @param offset offset into internal unicode data for read line
     * @param length length of read line
     * @return true if no encoding errors occurred
     */
    bool readLine (int &offset, int &length)
    {
      length = 0;
      offset = 0;
      bool encodingError = false;

      static const QLatin1Char cr(QLatin1Char('\r'));
      static const QLatin1Char lf(QLatin1Char('\n'));

      /**
       * did we read two time but got no stuff? encoding error
       * fixes problem with one character latin-1 files, which lead to crash otherwise!
       * bug 272579
       */
      bool failedToConvertOnce = false;

      /**
       * reading loop
       */
      while (m_position <= m_text.length())
      {
        if (m_position == m_text.length())
        {
          // try to load more text if something is around
          if (!m_eof)
          {
            // kill the old lines...
            m_text.remove (0, m_lastLineStart);

            // try to read new data
            const int c = m_file->read(m_buffer.data(), m_buffer.size());

            // if any text is there, append it....
            if (c > 0)
            {
              // update hash sum
              m_digest.addData (m_buffer.data(), c);

              // detect byte order marks & codec for byte order markers on first read
              int bomBytes = 0;
              if (m_firstRead) {
                // use first 16 bytes max to allow BOM detection of codec
                QByteArray bom (m_buffer.data(), qMin (16, c));
                QTextCodec *codecForByteOrderMark = QTextCodec::codecForUtfText (bom, 0);

                // if codec != null, we found a BOM!
                if (codecForByteOrderMark) {
                  m_bomFound = true;

                  // eat away the different boms!
                  int mib = codecForByteOrderMark->mibEnum ();
                  if (mib == 106) // utf8
                    bomBytes = 3;
                  if (mib == 1013 || mib == 1014 || mib == 1015) // utf16
                    bomBytes = 2;
                  if (mib == 1017 || mib == 1018 || mib == 1019) // utf32
                    bomBytes = 4;
                }

                /**
                 * if no codec given, do autodetection
                 */
                if (!m_codec) {
                  /**
                   * byte order said something about encoding?
                   */
                  if (codecForByteOrderMark)
                    m_codec = codecForByteOrderMark;
                  else {
                    return false;
                  }
                }

                m_firstRead = false;
              }

              Q_ASSERT (m_codec);
              if (!m_converter) {
                m_converter = new QTextConverter(m_codec->name());
                m_converter->setFlags(QTextConverter::ConvertInvalidToNull);
              }
              QString unicode = m_converter->toUnicode (m_buffer.constData() + bomBytes, c - bomBytes);

              // detect broken encoding
              for (int i = 0; i < unicode.size(); ++i) {
                  if (unicode[i] == 0) {
                    encodingError = true;
                    break;
                  }
              }

              m_text.append (unicode);
            }

            // is file completely read ?
            m_eof = (c == -1) || (c == 0);

            // recalc current pos and last pos
            m_position -= m_lastLineStart;
            m_lastLineStart = 0;
          }

          // oh oh, end of file, escape !
          if (m_eof && (m_position == m_text.length()))
          {
            m_lastWasEndOfLine = false;

            // line data
            offset = m_lastLineStart;
            length = m_position-m_lastLineStart;

            m_lastLineStart = m_position;

            return !encodingError && !failedToConvertOnce;
          }

          // empty? try again
          if (m_position == m_text.length()) {
	    failedToConvertOnce = true;
	    continue;
	  }
        }

        if (m_text.at(m_position) == lf)
        {
          m_lastWasEndOfLine = true;

          if (m_lastWasR)
          {
            m_lastLineStart++;
            m_lastWasR = false;
            m_eol = TextBuffer::eolDos;
          }
          else
          {
            // line data
            offset = m_lastLineStart;
            length = m_position-m_lastLineStart;

            m_lastLineStart = m_position+1;
            m_position++;

            // only win, if not dos!
            if (m_eol != TextBuffer::eolDos)
              m_eol = TextBuffer::eolUnix;

            return !encodingError;
          }
        }
        else if (m_text.at(m_position) == cr)
        {
          m_lastWasEndOfLine = true;
          m_lastWasR = true;

          // line data
          offset = m_lastLineStart;
          length = m_position-m_lastLineStart;

          m_lastLineStart = m_position+1;
          m_position++;

          // should only win of first time!
          if (m_eol == TextBuffer::eolUnknown)
            m_eol = TextBuffer::eolMac;

          return !encodingError;
        }
        else if (m_text.at(m_position) == QChar::LineSeparator)
        {
          m_lastWasEndOfLine = true;

          // line data
          offset = m_lastLineStart;
          length = m_position-m_lastLineStart;

          m_lastLineStart = m_position+1;
          m_position++;

          return !encodingError;
        }
        else
        {
          m_lastWasEndOfLine = false;
          m_lastWasR = false;
        }

        m_position++;
      }

      return !encodingError;
    }

    QByteArray digest ()
    {
      return m_digest.result ();
    }

  private:
    QTextCodec *m_codec;
    bool m_eof;
    bool m_lastWasEndOfLine;
    bool m_lastWasR;
    int m_position;
    int m_lastLineStart;
    TextBuffer::EndOfLineMode m_eol;
    QIODevice *m_file;
    QByteArray m_buffer;
    QCryptographicHash m_digest;
    QString m_text;
    QTextConverter *m_converter;
    bool m_bomFound;
    bool m_firstRead;
};

}

#endif