kdelibs/kioslave/http/parsinghelpers.cpp

/* This file is part of the KDE libraries
    Copyright (C) 2008 Andreas Hartmetz <ahartmetz@gmail.com>
    Copyright (C) 2010,2011 Rolf Eike Beer <kde@opensource.sf-tec.de>

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.
*/

#include <ctype.h>

#include <QDir>
#include <QMap>
#include <QTextCodec>
#include <QUrl>

#include <kcodecs.h>
#include <kdebug.h>

// Move *pos forward over a run of blanks (space or horizontal tab).
// The position never moves to or past end because of this function.
static void skipSpace(const char input[], int *pos, int end)
{
    int cursor = *pos;
    for (; cursor < end; ++cursor) {
        const char c = input[cursor];
        if (c != ' ' && c != '\t') {
            break;
        }
    }
    *pos = cursor;
}

// Move *pos to the first character of the following line, accepting any mix
// of CR and LF as line terminator.
// Returns true while further header lines follow; returns false when the input
// is exhausted or when the terminator contained two CRs or two LFs, i.e. an
// empty line marking the end of the header was consumed.
static bool nextLine(const char input[], int *pos, int end)
{
    int cursor = *pos;

    // run to the end of the current line
    for (; cursor < end; ++cursor) {
        if (input[cursor] == '\r' || input[cursor] == '\n') {
            break;
        }
    }

    // consume terminator characters until one kind has been seen twice
    int crSeen = 0;
    int lfSeen = 0;
    while (cursor < end && qMax(crSeen, lfSeen) < 2) {
        const char c = input[cursor];
        if (c == '\r') {
            ++crSeen;
        } else if (c == '\n') {
            ++lfSeen;
        } else {
            break;
        }
        ++cursor;
    }

    // One kind was seen twice and the other only once (e.g. "\r\n\r"): swallow
    // the missing partner too, so that a conforming "\r\n\r\n" (and also
    // "\n\r\n\r") is eaten completely.
    if (cursor < end && qMax(crSeen, lfSeen) == 2 && qMin(crSeen, lfSeen) == 1) {
        const char partner = (crSeen == 1) ? '\r' : '\n';
        if (input[cursor] == partner) {
            ++cursor;
        }
    }

    *pos = cursor;

    const bool headerEnded = (crSeen >= 2) || (lfSeen >= 2);
    return cursor < end && !headerEnded;
}

// QByteArray::fromPercentEncoding() does not notify us about encoding errors so we need
// to check here if this is valid at all.
// Returns true if every '%' in data is followed by two hexadecimal digits,
// false otherwise.
static bool isValidPercentEncoding(const QByteArray &data)
{
    const int last = data.length() - 1;
    const char *d = data.constData();

    int i = 0;
    while ((i = data.indexOf('%', i)) != -1) {
        // Both hex digits have to lie inside the data. An escape that ends
        // exactly on the last byte (e.g. "%41") is perfectly valid.
        if (i + 2 > last) {
            return false;
        }
        // isxdigit() is only defined for values representable as unsigned char
        if (!isxdigit(static_cast<unsigned char>(d[i + 1]))) {
            return false;
        }
        if (!isxdigit(static_cast<unsigned char>(d[i + 2]))) {
            return false;
        }
        i++;
    }

    return true;
}

// Returns the token at the current position and advances to the following one.
QByteArray TokenIterator::next()
{
    const QPair<int, int> range = m_tokens[m_currentToken];
    ++m_currentToken;

    // A deep copy is made on purpose. QByteArray::fromRawData() would be a bit
    // faster, but it requires the text buffer to outlive every copy of the
    // returned array, and with implicit sharing nobody knows where those copies
    // end up - too dangerous.
    const int length = range.second - range.first;
    return QByteArray(&m_buffer[range.first], length);
}

// Returns (as a deep copy) the token that the most recent next() call handed out.
QByteArray TokenIterator::current() const
{
    const QPair<int, int> range = m_tokens[m_currentToken - 1];
    const int length = range.second - range.first;
    return QByteArray(&m_buffer[range.first], length);
}

// Returns deep copies of all tokens, in order, independent of the current position.
QList<QByteArray> TokenIterator::all() const
{
    QList<QByteArray> tokens;
    const int count = m_tokens.count();
    for (int n = 0; n != count; ++n) {
        const QPair<int, int> range = m_tokens[n];
        const int length = range.second - range.first;
        tokens.append(QByteArray(&m_buffer[range.first], length));
    }
    return tokens;
}


// Sets up the tokenizer for the given header buffer and registers every header
// field it knows about, together with the information whether the field holds
// a single value or a comma-separated list of values.
HeaderTokenizer::HeaderTokenizer(char *buffer)
    : m_buffer(buffer)
{
    // Unless noted otherwise, these response header fields come from RFC 2616.
    // Hint: a web search for e.g. 'http "accept-ranges header"' turns up
    // information about a particular field.
    static const HeaderFieldTemplate headerFieldTemplates[] = {
        {"accept-ranges", false},
        {"age", false},
        {"cache-control", true},
        {"connection", true},
        {"content-disposition", false}, // multi-valued in a way, but separated by ";"
        {"content-encoding", true},
        {"content-language", true},
        {"content-length", false},
        {"content-location", false},
        {"content-md5", false},
        {"content-type", false},
        {"date", false},
        {"dav", true}, // RFC 2518
        {"etag", false},
        {"expires", false},
        {"keep-alive", true}, // RFC 2068
        {"last-modified", false},
        {"link", false}, // RFC 2068, multi-valued with ";" separator
        {"location", false},
        {"p3p", true}, // http://www.w3.org/TR/P3P/
        {"pragma", true},
        {"proxy-authenticate", false}, // complicated multi-valuedness: commas inside quotes
                                       // do not separate values; handled at a higher level.
        {"proxy-connection", true}, // unofficial but well-known; avoids misunderstandings
                                    // about "connection" when talking to a proxy.
        {"refresh", false}, // not sure, only some mailing list posts mention it
        {"set-cookie", false}, // RFC 2109; multiple values are usually sent as several
                               // instances of this field rather than as the usual
                               // comma-separated list.
        {"transfer-encoding", true},
        {"upgrade", true},
        {"warning", true},
        {"www-authenticate", false} // see proxy-authenticate
    };

    const HeaderFieldTemplate *const tableEnd =
        headerFieldTemplates + sizeof(headerFieldTemplates) / sizeof(headerFieldTemplates[0]);
    for (const HeaderFieldTemplate *entry = headerFieldTemplates; entry != tableEnd; ++entry) {
        insert(QByteArray(entry->name), HeaderField(entry->isMultiValued));
    }
}

// Splits the header lines stored in m_buffer between begin and end into values.
//
// For every line whose field name is a key known to this tokenizer, one or more
// (start, end) index pairs are appended to that field's beginEnd list. Lines
// without a colon and lines with unknown field names are skipped. A line that
// starts with a space or tab is a continuation of the preceding field.
//
// The buffer is modified in place: scanned field names are lower-cased, and the
// CR/LF in front of a continuation line of a recognized field is overwritten
// with spaces.
//
// Returns the buffer index at which parsing stopped.
int HeaderTokenizer::tokenize(int begin, int end)
{
    char *buf = m_buffer;  //keep line length in check :/
    int idx = begin;
    int startIdx = begin; //multi-purpose start of current token
    bool multiValuedEndedWithComma = false; //did the last multi-valued line end with a comma?
    QByteArray headerKey;
    do {

        if (buf[idx] == ' ' || buf[idx] == '\t') {
            // line continuation; preserve startIdx except (see below)
            if (headerKey.isEmpty()) {
                // continuation of a line that was ignored: ignore it as well.
                // ("continue" in a do-while jumps to the nextLine() condition)
                continue;
            }
            // turn CR/LF into spaces for later parsing convenience
            int backIdx = idx - 1;
            while (backIdx >= begin && (buf[backIdx] == '\r' || buf[backIdx] == '\n')) {
                buf[backIdx--] = ' ';
            }

            // multiple values, comma-separated: add new value or continue previous?
            if (operator[](headerKey).isMultiValued) {
                if (multiValuedEndedWithComma) {
                    // start new value; this is almost like no line continuation
                    skipSpace(buf, &idx, end);
                    startIdx = idx;
                } else {
                    // continue previous value; this is tricky. unit tests to the rescue!
                    // The list may still be empty if the field had no value so far;
                    // last() must not be called on an empty list.
                    if (!operator[](headerKey).beginEnd.isEmpty()
                        && operator[](headerKey).beginEnd.last().first == startIdx) {
                        // remove entry, it will be re-added because already idx != startIdx
                        operator[](headerKey).beginEnd.removeLast();
                    } else {
                        // no comma, no entry: the prev line was whitespace only - start new value
                        skipSpace(buf, &idx, end);
                        startIdx = idx;
                    }
                }
            }

        } else {
            // new field
            startIdx = idx;
            // also make sure that there is at least one char after the colon
            while (idx < (end - 1) && buf[idx] != ':' && buf[idx] != '\r' && buf[idx] != '\n') {
                // tolower() is only defined for values representable as unsigned char
                buf[idx] = tolower(static_cast<unsigned char>(buf[idx]));
                idx++;
            }
            if (buf[idx] != ':') {
                //malformed line: no colon
                headerKey.clear();
                continue;
            }
            headerKey = QByteArray(&buf[startIdx], idx - startIdx);
            if (!contains(headerKey)) {
                //we don't recognize this header line
                headerKey.clear();
                continue;
            }
            // skip colon & leading whitespace
            idx++;
            skipSpace(buf, &idx, end);
            startIdx = idx;
        }

        // we have the name/key of the field, now parse the value
        if (!operator[](headerKey).isMultiValued) {

            // scan to end of line
            while (idx < end && buf[idx] != '\r' && buf[idx] != '\n') {
                idx++;
            }
            if (!operator[](headerKey).beginEnd.isEmpty()) {
                // there already is an entry; are we just in a line continuation?
                if (operator[](headerKey).beginEnd.last().first == startIdx) {
                    // line continuation: delete previous entry and later insert a new, longer one.
                    operator[](headerKey).beginEnd.removeLast();
                }
            }
            operator[](headerKey).beginEnd.append(QPair<int, int>(startIdx, idx));

        } else {

            // comma-separated list
            while (true) {
                //skip one value
                while (idx < end && buf[idx] != '\r' && buf[idx] != '\n' && buf[idx] != ',') {
                    idx++;
                }
                if (idx != startIdx) {
                    operator[](headerKey).beginEnd.append(QPair<int, int>(startIdx, idx));
                }
                // only look at buf[idx] while still inside the range to parse
                multiValuedEndedWithComma = idx < end && buf[idx] == ',';
                //skip comma(s) and leading whitespace, if any respectively
                while (idx < end && buf[idx] == ',') {
                    idx++;
                }
                skipSpace(buf, &idx, end);
                //next value or end-of-line / end of header?
                // (the index, not the character value, has to be compared with end)
                if (idx >= end || buf[idx] == '\r' || buf[idx] == '\n') {
                    break;
                }
                //next value
                startIdx = idx;
            }
        }
    } while (nextLine(buf, &idx, end));
    return idx;
}


// Returns an iterator over the values recorded for the header field key.
// For a field name this tokenizer does not know, the iterator is built on
// m_nullTokens instead.
TokenIterator HeaderTokenizer::iterator(const char *key) const
{
    // no copy needed: the wrapper is only used for the lookups below
    const QByteArray lookupKey = QByteArray::fromRawData(key, strlen(key));

    if (!contains(lookupKey)) {
        return TokenIterator(m_nullTokens, m_buffer);
    }
    return TokenIterator(value(lookupKey).beginEnd, m_buffer);
}

static void skipLWS(const QString &str, int &pos)
{
    while (pos < str.length() && (str[pos] == QLatin1Char(' ') || str[pos] == QLatin1Char('\t'))) {
        ++pos;
    }
}

// Characters that are forbidden in the respective section of a
// Content-Disposition header: typeSpecials for the disposition type,
// attrSpecials for parameter names, valueSpecials for unquoted values.
// Each list is a suffix of the one above it. Keep the common ending, this
// allows the compiler to join them.
static const char typeSpecials[] =  "{}*'%()<>@,;:\\\"/[]?=";
static const char attrSpecials[] =     "'%()<>@,;:\\\"/[]?=";
static const char valueSpecials[] =      "()<>@,;:\\\"/[]?=";

// Returns true if ch must not appear in the section being parsed, i.e. if it
// is not printable or is one of the characters listed in specials.
static bool specialChar(const QChar &ch, const char *specials)
{
    // WORKAROUND: According to RFC 2616, any character other than ascii
    // characters should NOT be allowed in unquoted content-disposition file
    // names. However, since none of the major browsers follow this rule, we do
    // the same thing here and allow all printable unicode characters. See
    // https://bugs.kde.org/show_bug.cgi?id=261223 for the details.
    if (!ch.isPrint()) {
        return true;
    }

    const int count = qstrlen(specials);
    for (int n = 0; n < count; ++n) {
        if (ch == QLatin1Char(specials[n])) {
            return true;
        }
    }

    return false;
}

/**
 * read and parse the input up to the given terminator
 * @param str input string to parse
 * @param term terminator
 * @param pos position marker in the input string
 * @param specials characters forbidden in this section
 * @return the next section or an empty string if it was invalid
 *
 * Leading whitespace is skipped, then token-like input is collected until
 * the terminator char or the end of the string. The terminator itself is
 * skipped as well. Trailing whitespace is removed from the result; a
 * section that contains a forbidden character or an inner space is invalid.
 *
 * pos is correctly advanced even if this function returns an empty
 * string, so it can be used to skip over invalid parts and continue.
 */
static QString extractUntil(const QString &str, QChar term, int &pos, const char *specials)
{
    skipLWS(str, pos);

    QString section;
    bool sawSpecial = false;
    const int length = str.length();

    for (; pos < length; ++pos) {
        const QChar c = str[pos];
        if (c == term) {
            break;
        }
        section += c;
        if (!sawSpecial && specialChar(c, specials)) {
            sawSpecial = true;
        }
    }

    // the loop ended on the terminator: step over it
    if (pos < length) {
        ++pos;
    }

    if (sawSpecial) {
        return QString();
    }

    // cut off trailing linear whitespace
    int keep = section.length();
    while (keep > 0
           && (section.at(keep - 1) == QLatin1Char(' ') || section.at(keep - 1) == QLatin1Char('\t'))) {
        --keep;
    }
    section.truncate(keep);

    // whitespace inside the section makes it invalid
    if (section.contains(QLatin1Char(' '))) {
        return QString();
    }

    return section;
}

// Like extractUntil() with ';' as terminator and valueSpecials as forbidden
// characters, but additionally understands quoted strings.
// pos is set to -1 if a quoted string is malformed.
static QString extractMaybeQuotedUntil(const QString &str, int &pos)
{
    const QChar term = QLatin1Char(';');
    const QChar quote = QLatin1Char('"');

    skipLWS(str, pos);

    const bool quoted = pos < str.length() && str[pos] == quote;
    if (!quoted) {
        return extractUntil(str, term, pos, valueSpecials);
    }

    // step over the opening quote
    ++pos;

    QString out;
    bool closed = false; // a quoted string needs its closing quote

    while (!closed && pos < str.length()) {
        const QChar c = str[pos];
        if (c == QLatin1Char('\\') && pos + 1 < str.length()) {
            // quoted-pair = "\" CHAR: take the escaped char, skip both
            out += str[pos + 1];
            pos += 2;
        } else if (c == quote) {
            ++pos;
            closed = true;
        } else if (!c.isPrint()) {
            // CTLs are not allowed, RFC 2616 sec 2.2
            break;
        } else {
            out += c;
            ++pos;
        }
    }

    if (!closed) {
        pos = -1;
        return QString();
    }

    // between closing quote and terminator only whitespace is acceptable
    for (; pos < str.length() && str[pos] != term; ++pos) {
        const QChar c = str[pos];
        if (c != QLatin1Char(' ') && c != QLatin1Char('\t')) {
            pos = -1;
            return QString();
        }
    }

    // the loop ended on the terminator: step over it
    if (pos < str.length()) {
        ++pos;
    }

    return out;
}

// Parses the value of a Content-Disposition header.
//
// The returned map holds the lower-cased disposition type under the key "type"
// and every successfully parsed parameter under its lower-cased name.
// Continued parameters (name*0, name*1, ...) are joined, and parameters that
// carry a charset'language'percent-encoded-value payload (name*) are decoded.
// An invalid disposition type yields an empty map. A parse error stops the
// parsing but keeps what was found so far. A duplicate parameter removes that
// parameter and ends processing.
static QMap<QString, QString> contentDispositionParserInternal(const QString &disposition)
{
    kDebug(7113) << "disposition: " << disposition;
    int pos = 0;
    const QString strDisposition = extractUntil(disposition, QLatin1Char(';'), pos, typeSpecials).toLower();

    QMap<QString, QString> parameters;
    QMap<QString, QString> contparams;   // all parameters that contain continuations
    QMap<QString, QString> encparams;    // all parameters that have character encoding

    // the type is invalid, the complete header is junk
    if (strDisposition.isEmpty()) {
        return parameters;
    }

    parameters.insert(QLatin1String("type"), strDisposition);

    // first pass: split into key/value pairs and sort them into the three maps
    while (pos < disposition.length()) {
        QString key = extractUntil(disposition, QLatin1Char('='), pos, attrSpecials).toLower();

        if (key.isEmpty()) {
            // parse error in this key: do not parse more, but add up
            // everything we already got
            kDebug(7113) << "parse error in key, abort parsing";
            break;
        }

        QString val;
        if (key.endsWith(QLatin1Char('*'))) {
            // encoded values are never quoted
            val = extractUntil(disposition, QLatin1Char(';'), pos, valueSpecials);
        } else {
            val = extractMaybeQuotedUntil(disposition, pos);
        }

        if (val.isEmpty()) {
            if (pos == -1) {
                kDebug(7113) << "parse error in value, abort parsing";
                break;
            }
            continue;
        }

        const int spos = key.indexOf(QLatin1Char('*'));
        if (spos == key.length() - 1) {
            // "name*": encoded value
            key.chop(1);
            encparams.insert(key, val);
        } else if (spos >= 0) {
            // "name*N" or "name*N*": part of a continuation
            contparams.insert(key, val);
        } else if (parameters.contains(key)) {
            kDebug(7113) << "duplicate key" << key << "found, ignoring everything more";
            parameters.remove(key);
            return parameters;
        } else {
            parameters.insert(key, val);
        }
    }

    // second pass: join continuations, starting from each "name*0" / "name*0*"
    QMap<QString, QString>::iterator i = contparams.begin();
    while (i != contparams.end()) {
        QString key = i.key();
        int spos = key.indexOf(QLatin1Char('*'));
        bool hasencoding = false;

        // only the first part starts a continuation; other parts are picked up below
        if (key.at(spos + 1) != QLatin1Char('0')) {
            ++i;
            continue;
        }

        // no leading zeros allowed, so delete the junk
        int klen = key.length();
        if (klen > spos + 2) {
            // nothing but continuations and encodings may insert * into parameter name
            if ((klen > spos + 3) || ((klen == spos + 3) && (key.at(spos + 2) != QLatin1Char('*')))) {
                kDebug(7113) << "removing invalid key " << key << "with val" << i.value() << key.at(spos + 2);
                i = contparams.erase(i);
                continue;
            }
            hasencoding = true;
        }

        int seqnum = 1;
        QMap<QString, QString>::iterator partsi;
        // we do not need to care about encoding specifications: only the first
        // part is allowed to have one
        QString val = i.value();

        // strip "0" or "0*" so that key is "name*"
        key.chop(hasencoding ? 2 : 1);

        // append "name*1", "name*2", ... for as long as the sequence is unbroken
        while ((partsi = contparams.find(key + QString::number(seqnum))) != contparams.end()) {
            val += partsi.value();
            contparams.erase(partsi);
            ++seqnum;
        }

        i = contparams.erase(i);

        // strip the '*' so that key is the plain parameter name
        key.chop(1);
        if (hasencoding) {
            encparams.insert(key, val);
        } else {
            if (parameters.contains(key)) {
                kDebug(7113) << "duplicate key" << key << "found, ignoring everything more";
                parameters.remove(key);
                return parameters;
            }

            parameters.insert(key, val);
        }
    }

    // third pass: decode the encoded parameters; they replace plain ones of the same name
    for (QMap<QString, QString>::iterator i = encparams.begin(); i != encparams.end(); ++i) {
        QString val = i.value();

        // RfC 2231 encoded character set in filename
        int spos = val.indexOf(QLatin1Char('\''));
        if (spos == -1) {
            continue;
        }
        int npos = val.indexOf(QLatin1Char('\''), spos + 1);
        if (npos == -1) {
            continue;
        }

        const QString charset = val.left(spos);
        const QString lang = val.mid(spos + 1, npos - spos - 1);
        const QByteArray encodedVal = val.mid(npos + 1).toLatin1();

        if ( ! isValidPercentEncoding(encodedVal) )
            continue;

        const QByteArray rawval = QByteArray::fromPercentEncoding(encodedVal);

        if (charset.isEmpty() || (charset == QLatin1String("us-ascii"))) {
            // reject values containing control characters or non-ascii bytes
            bool valid = true;
            for (int j = rawval.length() - 1; (j >= 0) && valid; j--) {
                valid = (rawval.at(j) >= 32);
            }

            if (!valid)
                continue;
            val = QString::fromLatin1(rawval.constData());
        } else {
            QTextCodec *codec = QTextCodec::codecForName(charset.toLatin1());
            if (!codec)
                continue;
            val = codec->toUnicode(rawval);
        }

        parameters.insert(i.key(), val);
    }

    return parameters;
}

// Parses a Content-Disposition header value and strips any directory part
// from the "filename" parameter.
static QMap<QString, QString> contentDispositionParser(const QString &disposition)
{
    QMap<QString, QString> result = contentDispositionParserInternal(disposition);

    const QLatin1String filenameKey("filename");
    if (!result.contains(filenameKey)) {
        return result;
    }

    // Content-Disposition must not dictate a directory path, so only the
    // part behind the last separator is kept.
    const QString nativeName = QDir::toNativeSeparators(result[filenameKey]);
    const int lastSeparator = nativeName.lastIndexOf(QDir::separator());
    if (lastSeparator > -1) {
        result.insert(filenameKey, nativeName.mid(lastSeparator + 1));
    }

    return result;
}