kde-playground/kdepimlibs/kmime/kmime_codec_qp.cpp

/*  -*- c++ -*-
    kmime_codec_qp.cpp

    KMime, the KDE Internet mail/usenet news message library.
    Copyright (c) 2002 Marc Mutz <mutz@kde.org>

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.
*/
/**
  @file
  This file is part of the API for handling @ref MIME data and
  defines the @ref QuotedPrintable, @ref  RFC2047Q, and
  @ref RFC2231 @ref Codec classes.

  @brief
  Defines the classes QuotedPrintableCodec, Rfc2047QEncodingCodec, and
  Rfc2231EncodingCodec.

  @authors Marc Mutz \<mutz@kde.org\>
*/

#include "kmime_codec_qp.h"
#include "kmime_util.h"

#include <kdebug.h>

#include <cassert>

using namespace KMime;

namespace KMime {

// some helpful functions:

/**
  Converts a 4-bit @p value into its hexadecimal characater representation.
  So input of value [0,15] returns ['0','1',... 'F'].  Input values
  greater than 15 will produce undesired results.
  @param value is an unsigned character containing the 4-bit input value.
*/
static inline char binToHex( uchar value )
{
  if ( value > 9 ) {
    return value + 'A' - 10;
  } else {
    return value + '0';
  }
}

/**
  Returns the high-order 4 bits of an 8-bit value in another 8-bit value.
  @param ch is an unsigned character containing the 8-bit input value.
*/
static inline uchar highNibble( uchar ch )
{
  return ch >> 4;
}

/**
  Returns the low-order 4 bits of an 8-bit value in another 8-bit value.
  @param ch is an unsigned character containing the 8-bit input value.
*/
static inline uchar lowNibble( uchar ch )
{
  return ch & 0xF;
}

/**
  Returns true if the specified value is a not Control character or
  question mark; else true.
  @param ch is an unsigned character containing the 8-bit input value.
*/
static inline bool keep( uchar ch )
{
  // no CTLs, except HT and not '?'
  return !( ( ch < ' ' && ch != '\t' ) || ch == '?' );
}

//
// QuotedPrintableCodec
//

class QuotedPrintableEncoder : public Encoder
{
  char mInputBuffer[16];
  uchar mCurrentLineLength; // 0..76
  uchar mAccu;
  uint mInputBufferReadCursor  : 4; // 0..15
  uint mInputBufferWriteCursor : 4; // 0..15
  enum {
    Never, AtBOL, Definitely
  } mAccuNeedsEncoding    : 2;
  bool mSawLineEnd        : 1;
  bool mSawCR             : 1;
  bool mFinishing         : 1;
  bool mFinished          : 1;
  protected:
  friend class QuotedPrintableCodec;
  QuotedPrintableEncoder( bool withCRLF=false )
    : Encoder( withCRLF ), mCurrentLineLength( 0 ), mAccu( 0 ),
      mInputBufferReadCursor( 0 ), mInputBufferWriteCursor( 0 ),
      mAccuNeedsEncoding( Never ),
      mSawLineEnd( false ), mSawCR( false ), mFinishing( false ),
      mFinished( false ) {}

  bool needsEncoding( uchar ch )
    { return ch > '~' || ( ch < ' ' && ch != '\t' ) || ch == '='; }
  bool needsEncodingAtEOL( uchar ch )
    { return ch == ' ' || ch == '\t'; }
  bool needsEncodingAtBOL( uchar ch )
    { return ch == 'F' || ch == '.' || ch == '-'; }
  bool fillInputBuffer( const char* &scursor, const char * const send );
  bool processNextChar();
  void createOutputBuffer( char* &dcursor, const char * const dend );
  public:
    virtual ~QuotedPrintableEncoder() {}

    bool encode( const char* &scursor, const char * const send,
                 char* &dcursor, const char * const dend );

    bool finish( char* &dcursor, const char * const dend );
};

class QuotedPrintableDecoder : public Decoder
{
  const char mEscapeChar;
  char mBadChar;
  /** @p accu holds the msb nibble of the hexchar or zero. */
  uchar mAccu;
  /** @p insideHexChar is true iff we're inside an hexchar (=XY).
      Together with @ref mAccu, we can build this states:
      @li @p insideHexChar == @p false:
      normal text
      @li @p insideHexChar == @p true, @p mAccu == 0:
      saw the leading '='
      @li @p insideHexChar == @p true, @p mAccu != 0:
      saw the first nibble '=X'
  */
  const bool mQEncoding;
  bool mInsideHexChar;
  bool mFlushing;
  bool mExpectLF;
  bool mHaveAccu;
  /** @p mLastChar holds the first char of an encoded char, so that
      we are able to keep the first char if the second char is invalid. */
  char mLastChar;
  protected:
  friend class QuotedPrintableCodec;
  friend class Rfc2047QEncodingCodec;
  friend class Rfc2231EncodingCodec;
  QuotedPrintableDecoder( bool withCRLF=false,
                          bool aQEncoding=false, char aEscapeChar='=' )
    : Decoder( withCRLF ),
      mEscapeChar( aEscapeChar ),
      mBadChar( 0 ),
      mAccu( 0 ),
      mQEncoding( aQEncoding ),
      mInsideHexChar( false ),
      mFlushing( false ),
      mExpectLF( false ),
      mHaveAccu( false ),
      mLastChar( 0 ) {}
  public:
    virtual ~QuotedPrintableDecoder() {}

    bool decode( const char* &scursor, const char * const send,
                 char* &dcursor, const char * const dend );
    bool finish( char* & dcursor, const char * const dend );
};

class Rfc2047QEncodingEncoder : public Encoder
{
  uchar      mAccu;
  uchar      mStepNo;
  const char mEscapeChar;
  bool       mInsideFinishing : 1;
  protected:
  friend class Rfc2047QEncodingCodec;
  friend class Rfc2231EncodingCodec;
  Rfc2047QEncodingEncoder( bool withCRLF=false, char aEscapeChar='=' )
    : Encoder( withCRLF ),
      mAccu( 0 ), mStepNo( 0 ), mEscapeChar( aEscapeChar ),
      mInsideFinishing( false )
    {
      // else an optimization in ::encode might break.
      assert( aEscapeChar == '=' || aEscapeChar == '%' );
    }

  // this code assumes that isEText( mEscapeChar ) == false!
  bool needsEncoding( uchar ch )
    {
      if ( ch > 'z' ) {
        return true; // {|}~ DEL and 8bit chars need
      }
      if ( !isEText( ch ) ) {
        return true; // all but a-zA-Z0-9!/*+- need, too
      }
      if ( mEscapeChar == '%' && ( ch == '*' || ch == '/' ) ) {
        return true; // not allowed in rfc2231 encoding
      }
      return false;
    }

  public:
    virtual ~Rfc2047QEncodingEncoder() {}

    bool encode( const char* & scursor, const char * const send,
                 char* & dcursor, const char * const dend );
    bool finish( char* & dcursor, const char * const dend );
};

// this doesn't access any member variables, so it can be defined static
// but then we can't call it from virtual functions
static int QuotedPrintableDecoder_maxDecodedSizeFor( int insize, bool withCRLF )
{
  // all chars unencoded:
  int result = insize;
  // but maybe all of them are \n and we need to make them \r\n :-o
  if ( withCRLF )
    result += insize;

  // there might be an accu plus escape
  result += 2;

  return result;
}

Encoder *QuotedPrintableCodec::makeEncoder( bool withCRLF ) const
{
  return new QuotedPrintableEncoder( withCRLF );
}

Decoder *QuotedPrintableCodec::makeDecoder( bool withCRLF ) const
{
  return new QuotedPrintableDecoder( withCRLF );
}

int QuotedPrintableCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const
{
  return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
}

Encoder *Rfc2047QEncodingCodec::makeEncoder( bool withCRLF ) const
{
  return new Rfc2047QEncodingEncoder( withCRLF );
}

Decoder *Rfc2047QEncodingCodec::makeDecoder( bool withCRLF ) const
{
  return new QuotedPrintableDecoder( withCRLF, true );
}

int Rfc2047QEncodingCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const
{
  return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
}

Encoder *Rfc2231EncodingCodec::makeEncoder( bool withCRLF ) const
{
  return new Rfc2047QEncodingEncoder( withCRLF, '%' );
}

Decoder *Rfc2231EncodingCodec::makeDecoder( bool withCRLF ) const
{
  return new QuotedPrintableDecoder( withCRLF, true, '%' );
}

int Rfc2231EncodingCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const
{
  return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
}

/********************************************************/
/********************************************************/
/********************************************************/

bool QuotedPrintableDecoder::decode( const char* &scursor,
                                     const char * const send,
                                     char* &dcursor, const char * const dend )
{
  if ( mWithCRLF ) {
    kWarning() << "CRLF output for decoders isn't yet supported!";
  }

  while ( scursor != send && dcursor != dend ) {
    if ( mFlushing ) {
      // we have to flush chars in the aftermath of an decoding
      // error. The way to request a flush is to
      // - store the offending character in mBadChar and
      // - set mFlushing to true.
      // The supported cases are (H: hexchar, X: bad char):
      // =X, =HX, CR
      // mBadChar is only written out if it is not by itself illegal in
      // quoted-printable (e.g. CTLs, 8Bits).
      // A fast way to suppress mBadChar output is to set it to NUL.
      if ( mInsideHexChar ) {
        // output '='
        *dcursor++ = mEscapeChar;
        mInsideHexChar = false;
      } else if ( mHaveAccu ) {
        // output the high nibble of the accumulator:
        *dcursor++ = mLastChar;
        mHaveAccu = false;
        mAccu = 0;
      } else {
        // output mBadChar
        assert( mAccu == 0 );
        if ( mBadChar ) {
          if ( mBadChar == '=' ) {
            mInsideHexChar = true;
          } else {
            *dcursor++ = mBadChar;
          }
          mBadChar = 0;
        }
        mFlushing = false;
      }
      continue;
    }
    assert( mBadChar == 0 );

    uchar ch = *scursor++;
    uchar value = 255;

    if ( mExpectLF && ch != '\n' ) {
      kWarning() << "QuotedPrintableDecoder:"
        "illegally formed soft linebreak or lonely CR!";
      mInsideHexChar = false;
      mExpectLF = false;
      assert( mAccu == 0 );
    }

    if ( mInsideHexChar ) {
      // next char(s) represent nibble instead of itself:
      if ( ch <= '9' ) {
        if ( ch >= '0' ) {
          value = ch - '0';
        } else {
          switch ( ch ) {
          case '\r':
            mExpectLF = true;
            break;
          case '\n':
            // soft line break, but only if mAccu is NUL.
            if ( !mHaveAccu ) {
              mExpectLF = false;
              mInsideHexChar = false;
              break;
            }
            // else fall through
          default:
            kWarning() << "QuotedPrintableDecoder:"
              "illegally formed hex char! Outputting verbatim.";
            mBadChar = ch;
            mFlushing = true;
          }
          continue;
        }
      } else { // ch > '9'
        if ( ch <= 'F' ) {
          if ( ch >= 'A' ) {
            value = 10 + ch - 'A';
          } else { // [:-@]
            mBadChar = ch;
            mFlushing = true;
            continue;
          }
        } else { // ch > 'F'
          if ( ch <= 'f' && ch >= 'a' ) {
            value = 10 + ch - 'a';
          } else {
            mBadChar = ch;
            mFlushing = true;
            continue;
          }
        }
      }

      assert( value < 16 );
      assert( mBadChar == 0 );
      assert( !mExpectLF );

      if ( mHaveAccu ) {
        *dcursor++ = char( mAccu | value );
        mAccu = 0;
        mHaveAccu = false;
        mInsideHexChar = false;
      } else {
        mHaveAccu = true;
        mAccu = value << 4;
        mLastChar = ch;
      }
    } else { // not mInsideHexChar
      if ( ( ch <= '~' && ch >= ' ' ) || ch == '\t' ) {
        if ( ch == mEscapeChar ) {
          mInsideHexChar = true;
        } else if ( mQEncoding && ch == '_' ) {
          *dcursor++ = char( 0x20 );
        } else {
          *dcursor++ = char( ch );
        }
      } else if ( ch == '\n' ) {
        *dcursor++ = '\n';
        mExpectLF = false;
      } else if ( ch == '\r' ) {
        mExpectLF = true;
      } else {
        //kWarning() << "QuotedPrintableDecoder:" << ch <<
        //  "illegal character in input stream!";
        *dcursor++ = char( ch );
      }
    }
  }

  return scursor == send;
}

bool QuotedPrintableDecoder::finish( char* &dcursor, const char * const dend )
{
  while ( ( mInsideHexChar || mHaveAccu || mFlushing ) && dcursor != dend ) {
    // we have to flush chars
    if ( mInsideHexChar ) {
      // output '='
      *dcursor++ = mEscapeChar;
      mInsideHexChar = false;
    }
    else if ( mHaveAccu ) {
      // output the high nibble of the accumulator:
      *dcursor++ = mLastChar;
      mHaveAccu = false;
      mAccu = 0;
    } else {
      // output mBadChar
      assert( mAccu == 0 );
      if ( mBadChar ) {
        *dcursor++ = mBadChar;
        mBadChar = 0;
      }
      mFlushing = false;
    }
  }

  // return false if we are not finished yet; note that mInsideHexChar is always false
  return !( mHaveAccu || mFlushing );
}

bool QuotedPrintableEncoder::fillInputBuffer( const char* &scursor,
                                              const char * const send ) {
  // Don't read more if there's still a tail of a line in the buffer:
  if ( mSawLineEnd ) {
    return true;
  }

  // Read until the buffer is full or we have found CRLF or LF (which
  // don't end up in the input buffer):
  for ( ; ( mInputBufferWriteCursor + 1 ) % 16 != mInputBufferReadCursor &&
          scursor != send ; mInputBufferWriteCursor++ ) {
    char ch = *scursor++;
    if ( ch == '\r' ) {
      mSawCR = true;
    } else if ( ch == '\n' ) {
      // remove the CR from the input buffer (if any) and return that
      // we found a line ending:
      if ( mSawCR ) {
        mSawCR = false;
        assert( mInputBufferWriteCursor != mInputBufferReadCursor );
        mInputBufferWriteCursor--;
      }
      mSawLineEnd = true;
      return true; // saw CRLF or LF
    } else {
      mSawCR = false;
    }
    mInputBuffer[ mInputBufferWriteCursor ] = ch;
  }
  mSawLineEnd = false;
  return false; // didn't see a line ending...
}

bool QuotedPrintableEncoder::processNextChar()
{

  // If we process a buffer which doesn't end in a line break, we
  // can't process all of it, since the next chars that will be read
  // could be a line break. So we empty the buffer only until a fixed
  // number of chars is left (except when mFinishing, which means that
  // the data doesn't end in newline):
  const int minBufferFillWithoutLineEnd = 4;

  assert( mOutputBufferCursor == 0 );

  int bufferFill =
    int( mInputBufferWriteCursor ) - int( mInputBufferReadCursor ) ;
  if ( bufferFill < 0 ) {
    bufferFill += 16;
  }

  assert( bufferFill >=0 && bufferFill <= 15 );

  if ( !mFinishing && !mSawLineEnd &&
       bufferFill < minBufferFillWithoutLineEnd ) {
    return false;
  }

  // buffer is empty, return false:
  if ( mInputBufferReadCursor == mInputBufferWriteCursor ) {
    return false;
  }

  // Real processing goes here:
  mAccu = mInputBuffer[ mInputBufferReadCursor++ ];
  if ( needsEncoding( mAccu ) ) { // always needs encoding or
    mAccuNeedsEncoding = Definitely;
  } else if ( ( mSawLineEnd || mFinishing ) && // needs encoding at end of line
              bufferFill == 1               && // or end of buffer
              needsEncodingAtEOL( mAccu ) ) {
    mAccuNeedsEncoding = Definitely;
  } else if ( needsEncodingAtBOL( mAccu ) ) {
    mAccuNeedsEncoding = AtBOL;
  } else {
    // never needs encoding
    mAccuNeedsEncoding = Never;
  }

  return true;
}

// Outputs processed (verbatim or hex-encoded) chars and inserts soft
// line breaks as necessary. Depends on processNextChar's directions
// on whether or not to encode the current char, and whether or not
// the current char is the last one in it's input line:
void QuotedPrintableEncoder::createOutputBuffer( char* &dcursor,
                                                 const char * const dend )
{
  const int maxLineLength = 76; // rfc 2045

  assert( mOutputBufferCursor == 0 );

  bool lastOneOnThisLine = mSawLineEnd
                           && mInputBufferReadCursor == mInputBufferWriteCursor;

  int neededSpace = 1;
  if ( mAccuNeedsEncoding == Definitely ) {
    neededSpace = 3;
  }

  // reserve space for the soft hyphen (=)
  if ( !lastOneOnThisLine ) {
    neededSpace++;
  }

  if ( mCurrentLineLength > maxLineLength - neededSpace ) {
    // current line too short, insert soft line break:
    write( '=', dcursor, dend );
    writeCRLF( dcursor, dend );
    mCurrentLineLength = 0;
  }

  if ( Never == mAccuNeedsEncoding ||
       ( AtBOL == mAccuNeedsEncoding && mCurrentLineLength != 0 ) ) {
    write( mAccu, dcursor, dend );
    mCurrentLineLength++;
  } else {
    write( '=', dcursor, dend );
    write( binToHex( highNibble( mAccu ) ), dcursor, dend );
    write( binToHex( lowNibble( mAccu ) ), dcursor, dend );
    mCurrentLineLength += 3;
  }
}

bool QuotedPrintableEncoder::encode( const char* &scursor,
                                     const char * const send,
                                     char* &dcursor, const char * const dend )
{
  // support probing by the caller:
  if ( mFinishing ) {
    return true;
  }

  while ( scursor != send && dcursor != dend ) {
    if ( mOutputBufferCursor && !flushOutputBuffer( dcursor, dend ) ) {
      return scursor == send;
    }

    assert( mOutputBufferCursor == 0 );

    // fill input buffer until eol has been reached or until the
    // buffer is full, whatever comes first:
    fillInputBuffer( scursor, send );

    if ( processNextChar() ) {
      // there was one...
      createOutputBuffer( dcursor, dend );
    } else if ( mSawLineEnd &&
                mInputBufferWriteCursor == mInputBufferReadCursor ) {
      // load a hard line break into output buffer:
      writeCRLF( dcursor, dend );
      // signal fillInputBuffer() we are ready for the next line:
      mSawLineEnd = false;
      mCurrentLineLength = 0;
    } else {
      // we are supposedly finished with this input block:
      break;
    }
  }

  // make sure we write as much as possible and don't stop _writing_
  // just because we have no more _input_:
  if ( mOutputBufferCursor ) {
    flushOutputBuffer( dcursor, dend );
  }

  return scursor == send;

} // encode

bool QuotedPrintableEncoder::finish( char* &dcursor, const char * const dend )
{
  mFinishing = true;

  if ( mFinished ) {
    return flushOutputBuffer( dcursor, dend );
  }

  while ( dcursor != dend ) {
    if ( mOutputBufferCursor && !flushOutputBuffer( dcursor, dend ) ) {
      return false;
    }

    assert( mOutputBufferCursor == 0 );

    if ( processNextChar() ) {
      // there was one...
      createOutputBuffer( dcursor, dend );
    } else if ( mSawLineEnd &&
                mInputBufferWriteCursor == mInputBufferReadCursor ) {
      // load a hard line break into output buffer:
      writeCRLF( dcursor, dend );
      mSawLineEnd = false;
      mCurrentLineLength = 0;
    } else {
      mFinished = true;
      return flushOutputBuffer( dcursor, dend );
    }
  }

  return mFinished && !mOutputBufferCursor;

} // finish

bool Rfc2047QEncodingEncoder::encode( const char* &scursor,
                                      const char * const send,
                                      char* &dcursor, const char * const dend )
{
  if ( mInsideFinishing ) {
    return true;
  }

  while ( scursor != send && dcursor != dend ) {
    uchar value = 0;
    switch ( mStepNo ) {
    case 0:
      // read the next char and decide if and how do encode:
      mAccu = *scursor++;
      if ( !needsEncoding( mAccu ) ) {
        *dcursor++ = char( mAccu );
      } else if ( mEscapeChar == '=' && mAccu == 0x20 ) {
        // shortcut encoding for 0x20 (latin-1/us-ascii SPACE)
        // (not for rfc2231 encoding)
        *dcursor++ = '_';
      } else {
        // needs =XY encoding - write escape char:
        *dcursor++ = mEscapeChar;
        mStepNo = 1;
      }
      continue;
    case 1:
      // extract hi-nibble:
      value = highNibble( mAccu );
      mStepNo = 2;
      break;
    case 2:
      // extract lo-nibble:
      value = lowNibble( mAccu );
      mStepNo = 0;
      break;
    default: assert( 0 );
    }

    // and write:
    *dcursor++ = binToHex( value );
  }

  return scursor == send;
} // encode

#include <QtCore/QString>

bool Rfc2047QEncodingEncoder::finish( char* &dcursor, const char * const dend )
{
  mInsideFinishing = true;

  // write the last bits of mAccu, if any:
  while ( mStepNo != 0 && dcursor != dend ) {
    uchar value = 0;
    switch ( mStepNo ) {
    case 1:
      // extract hi-nibble:
      value = highNibble( mAccu );
      mStepNo = 2;
      break;
    case 2:
      // extract lo-nibble:
      value = lowNibble( mAccu );
      mStepNo = 0;
      break;
    default: assert( 0 );
    }

    // and write:
    *dcursor++ = binToHex( value );
  }

  return mStepNo == 0;
}

} // namespace KMime