/* This file is part of the KDE libraries Copyright (C) 2008 Andreas Hartmetz Copyright (C) 2010,2011 Rolf Eike Beer This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include #include #include #include // Advance *pos beyond spaces / tabs static void skipSpace(const char input[], int *pos, int end) { int idx = *pos; while (idx < end && (input[idx] == ' ' || input[idx] == '\t')) { idx++; } *pos = idx; return; } // Advance *pos to start of next line while being forgiving about line endings. // Return false if the end of the header has been reached, true otherwise. static bool nextLine(const char input[], int *pos, int end) { int idx = *pos; while (idx < end && input[idx] != '\r' && input[idx] != '\n') { idx++; } int rCount = 0; int nCount = 0; while (idx < end && qMax(rCount, nCount) < 2 && (input[idx] == '\r' || input[idx] == '\n')) { input[idx] == '\r' ? rCount++ : nCount++; idx++; } if (idx < end && qMax(rCount, nCount) == 2 && qMin(rCount, nCount) == 1) { // if just one of the others is missing eat it too. // this ensures that conforming headers using the proper // \r\n sequence (and also \n\r) will be parsed correctly. if ((rCount == 1 && input[idx] == '\r') || (nCount == 1 && input[idx] == '\n')) { idx++; } } *pos = idx; return idx < end && rCount < 2 && nCount < 2; } // QByteArray::fromPercentEncoding() does not notify us about encoding errors so we need // to check here if this is valid at all. static bool isValidPercentEncoding(const QByteArray &data) { int i = 0; const int last = data.length() - 1; const char *d = data.constData(); while ( (i = data.indexOf('%', i)) != -1) { if ( i >= last - 2 ) return false; if ( ! isxdigit(d[i + 1]) ) return false; if ( ! isxdigit(d[i + 2]) ) return false; i++; } return true; } QByteArray TokenIterator::next() { QPair token = m_tokens[m_currentToken++]; //fromRawData brings some speed advantage but also the requirement to keep the text buffer //around. this together with implicit sharing (you don't know where copies end up) //is dangerous! //return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first); return QByteArray(&m_buffer[token.first], token.second - token.first); } QByteArray TokenIterator::current() const { QPair token = m_tokens[m_currentToken - 1]; //return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first); return QByteArray(&m_buffer[token.first], token.second - token.first); } QList TokenIterator::all() const { QList ret; for (int i = 0; i < m_tokens.count(); i++) { QPair token = m_tokens[i]; ret.append(QByteArray(&m_buffer[token.first], token.second - token.first)); } return ret; } HeaderTokenizer::HeaderTokenizer(char *buffer) : m_buffer(buffer) { // add information about available headers and whether they have one or multiple, // comma-separated values. //The following response header fields are from RFC 2616 unless otherwise specified. //Hint: search the web for e.g. 'http "accept-ranges header"' to find information about //a header field. static const HeaderFieldTemplate headerFieldTemplates[] = { {"accept-ranges", false}, {"age", false}, {"cache-control", true}, {"connection", true}, {"content-disposition", false}, //is multi-valued in a way, but with ";" separator! {"content-encoding", true}, {"content-language", true}, {"content-length", false}, {"content-location", false}, {"content-md5", false}, {"content-type", false}, {"date", false}, {"dav", true}, //RFC 2518 {"etag", false}, {"expires", false}, {"keep-alive", true}, //RFC 2068 {"last-modified", false}, {"link", false}, //RFC 2068, multi-valued with ";" separator {"location", false}, {"p3p", true}, // http://www.w3.org/TR/P3P/ {"pragma", true}, {"proxy-authenticate", false}, //complicated multi-valuedness: quoted commas don't separate //multiple values. we handle this at a higher level. {"proxy-connection", true}, //inofficial but well-known; to avoid misunderstandings //when using "connection" when talking to a proxy. {"refresh", false}, //not sure, only found some mailing list posts mentioning it {"set-cookie", false}, //RFC 2109; the multi-valuedness seems to be usually achieved //by sending several instances of this field as opposed to //usually comma-separated lists with maybe multiple instances. {"transfer-encoding", true}, {"upgrade", true}, {"warning", true}, {"www-authenticate", false} //see proxy-authenticate }; for (uint i = 0; i < sizeof(headerFieldTemplates) / sizeof(HeaderFieldTemplate); i++) { const HeaderFieldTemplate &ft = headerFieldTemplates[i]; insert(QByteArray(ft.name), HeaderField(ft.isMultiValued)); } } int HeaderTokenizer::tokenize(int begin, int end) { char *buf = m_buffer; //keep line length in check :/ int idx = begin; int startIdx = begin; //multi-purpose start of current token bool multiValuedEndedWithComma = false; //did the last multi-valued line end with a comma? QByteArray headerKey; do { if (buf[idx] == ' ' || buf [idx] == '\t') { // line continuation; preserve startIdx except (see below) if (headerKey.isEmpty()) { continue; } // turn CR/LF into spaces for later parsing convenience int backIdx = idx - 1; while (backIdx >= begin && (buf[backIdx] == '\r' || buf[backIdx] == '\n')) { buf[backIdx--] = ' '; } // multiple values, comma-separated: add new value or continue previous? if (operator[](headerKey).isMultiValued) { if (multiValuedEndedWithComma) { // start new value; this is almost like no line continuation skipSpace(buf, &idx, end); startIdx = idx; } else { // continue previous value; this is tricky. unit tests to the rescue! if (operator[](headerKey).beginEnd.last().first == startIdx) { // remove entry, it will be re-added because already idx != startIdx operator[](headerKey).beginEnd.removeLast(); } else { // no comma, no entry: the prev line was whitespace only - start new value skipSpace(buf, &idx, end); startIdx = idx; } } } } else { // new field startIdx = idx; // also make sure that there is at least one char after the colon while (idx < (end - 1) && buf[idx] != ':' && buf[idx] != '\r' && buf[idx] != '\n') { buf[idx] = tolower(buf[idx]); idx++; } if (buf[idx] != ':') { //malformed line: no colon headerKey.clear(); continue; } headerKey = QByteArray(&buf[startIdx], idx - startIdx); if (!contains(headerKey)) { //we don't recognize this header line headerKey.clear(); continue; } // skip colon & leading whitespace idx++; skipSpace(buf, &idx, end); startIdx = idx; } // we have the name/key of the field, now parse the value if (!operator[](headerKey).isMultiValued) { // scan to end of line while (idx < end && buf[idx] != '\r' && buf[idx] != '\n') { idx++; } if (!operator[](headerKey).beginEnd.isEmpty()) { // there already is an entry; are we just in a line continuation? if (operator[](headerKey).beginEnd.last().first == startIdx) { // line continuation: delete previous entry and later insert a new, longer one. operator[](headerKey).beginEnd.removeLast(); } } operator[](headerKey).beginEnd.append(QPair(startIdx, idx)); } else { // comma-separated list while (true) { //skip one value while (idx < end && buf[idx] != '\r' && buf[idx] != '\n' && buf[idx] != ',') { idx++; } if (idx != startIdx) { operator[](headerKey).beginEnd.append(QPair(startIdx, idx)); } multiValuedEndedWithComma = buf[idx] == ','; //skip comma(s) and leading whitespace, if any respectively while (idx < end && buf[idx] == ',') { idx++; } skipSpace(buf, &idx, end); //next value or end-of-line / end of header? if (buf[idx] >= end || buf[idx] == '\r' || buf[idx] == '\n') { break; } //next value startIdx = idx; } } } while (nextLine(buf, &idx, end)); return idx; } TokenIterator HeaderTokenizer::iterator(const char *key) const { QByteArray keyBa = QByteArray::fromRawData(key, strlen(key)); if (contains(keyBa)) { return TokenIterator(value(keyBa).beginEnd, m_buffer); } else { return TokenIterator(m_nullTokens, m_buffer); } } static void skipLWS(const QString &str, int &pos) { while (pos < str.length() && (str[pos] == QLatin1Char(' ') || str[pos] == QLatin1Char('\t'))) { ++pos; } } // keep the common ending, this allows the compiler to join them static const char typeSpecials[] = "{}*'%()<>@,;:\\\"/[]?="; static const char attrSpecials[] = "'%()<>@,;:\\\"/[]?="; static const char valueSpecials[] = "()<>@,;:\\\"/[]?="; static bool specialChar(const QChar &ch, const char *specials) { // WORKAROUND: According to RFC 2616, any character other than ascii // characters should NOT be allowed in unquoted content-disposition file // names. However, since none of the major browsers follow this rule, we do // the same thing here and allow all printable unicode characters. See // https://bugs.kde.org/show_bug.cgi?id=261223 for the detials. if (!ch.isPrint()) { return true; } for (int i = qstrlen(specials) - 1; i >= 0; i--) { if (ch == QLatin1Char(specials[i])) { return true; } } return false; } /** * read and parse the input until the given terminator * @param str input string to parse * @param term terminator * @param pos position marker in the input string * @param specials characters forbidden in this section * @return the next section or an empty string if it was invalid * * Extracts token-like input until terminator char or EOL. * Also skips over the terminator. * * pos is correctly incremented even if this functions returns * an empty string so this can be used to skip over invalid * parts and continue. */ static QString extractUntil(const QString &str, QChar term, int &pos, const char *specials) { QString out; skipLWS(str, pos); bool valid = true; while (pos < str.length() && (str[pos] != term)) { out += str[pos]; valid = (valid && !specialChar(str[pos], specials)); ++pos; } if (pos < str.length()) { // Stopped due to finding term ++pos; } if (!valid) { return QString(); } // Remove trailing linear whitespace... while (out.endsWith(QLatin1Char(' ')) || out.endsWith(QLatin1Char('\t'))) { out.chop(1); } if (out.contains(QLatin1Char(' '))) { out.clear(); } return out; } // As above, but also handles quotes.. // pos is set to -1 on parse error static QString extractMaybeQuotedUntil(const QString &str, int &pos) { const QChar term = QLatin1Char(';'); skipLWS(str, pos); // Are we quoted? if (pos < str.length() && str[pos] == QLatin1Char('"')) { QString out; // Skip the quote... ++pos; // when quoted we also need an end-quote bool endquote = false; // Parse until trailing quote... while (pos < str.length()) { if (str[pos] == QLatin1Char('\\') && pos + 1 < str.length()) { // quoted-pair = "\" CHAR out += str[pos + 1]; pos += 2; // Skip both... } else if (str[pos] == QLatin1Char('"')) { ++pos; endquote = true; break; } else if (!str[pos].isPrint()) { // Don't allow CTL's RFC 2616 sec 2.2 break; } else { out += str[pos]; ++pos; } } if (!endquote) { pos = -1; return QString(); } // Skip until term.. while (pos < str.length() && (str[pos] != term)) { if ((str[pos] != QLatin1Char(' ')) && (str[pos] != QLatin1Char('\t'))) { pos = -1; return QString(); } ++pos; } if (pos < str.length()) { // Stopped due to finding term ++pos; } return out; } else { return extractUntil(str, term, pos, valueSpecials); } } static QMap contentDispositionParserInternal(const QString &disposition) { kDebug(7113) << "disposition: " << disposition; int pos = 0; const QString strDisposition = extractUntil(disposition, QLatin1Char(';'), pos, typeSpecials).toLower(); QMap parameters; QMap contparams; // all parameters that contain continuations QMap encparams; // all parameters that have character encoding // the type is invalid, the complete header is junk if (strDisposition.isEmpty()) { return parameters; } parameters.insert(QLatin1String("type"), strDisposition); while (pos < disposition.length()) { QString key = extractUntil(disposition, QLatin1Char('='), pos, attrSpecials).toLower(); if (key.isEmpty()) { // parse error in this key: do not parse more, but add up // everything we already got kDebug(7113) << "parse error in key, abort parsing"; break; } QString val; if (key.endsWith(QLatin1Char('*'))) { val = extractUntil(disposition, QLatin1Char(';'), pos, valueSpecials); } else { val = extractMaybeQuotedUntil(disposition, pos); } if (val.isEmpty()) { if (pos == -1) { kDebug(7113) << "parse error in value, abort parsing"; break; } continue; } const int spos = key.indexOf(QLatin1Char('*')); if (spos == key.length() - 1) { key.chop(1); encparams.insert(key, val); } else if (spos >= 0) { contparams.insert(key, val); } else if (parameters.contains(key)) { kDebug(7113) << "duplicate key" << key << "found, ignoring everything more"; parameters.remove(key); return parameters; } else { parameters.insert(key, val); } } QMap::iterator i = contparams.begin(); while (i != contparams.end()) { QString key = i.key(); int spos = key.indexOf(QLatin1Char('*')); bool hasencoding = false; if (key.at(spos + 1) != QLatin1Char('0')) { ++i; continue; } // no leading zeros allowed, so delete the junk int klen = key.length(); if (klen > spos + 2) { // nothing but continuations and encodings may insert * into parameter name if ((klen > spos + 3) || ((klen == spos + 3) && (key.at(spos + 2) != QLatin1Char('*')))) { kDebug(7113) << "removing invalid key " << key << "with val" << i.value() << key.at(spos + 2); i = contparams.erase(i); continue; } hasencoding = true; } int seqnum = 1; QMap::iterator partsi; // we do not need to care about encoding specifications: only the first // part is allowed to have one QString val = i.value(); key.chop(hasencoding ? 2 : 1); while ((partsi = contparams.find(key + QString::number(seqnum))) != contparams.end()) { val += partsi.value(); contparams.erase(partsi); } i = contparams.erase(i); key.chop(1); if (hasencoding) { encparams.insert(key, val); } else { if (parameters.contains(key)) { kDebug(7113) << "duplicate key" << key << "found, ignoring everything more"; parameters.remove(key); return parameters; } parameters.insert(key, val); } } for (QMap::iterator i = encparams.begin(); i != encparams.end(); ++i) { QString val = i.value(); // RfC 2231 encoded character set in filename int spos = val.indexOf(QLatin1Char('\'')); if (spos == -1) { continue; } int npos = val.indexOf(QLatin1Char('\''), spos + 1); if (npos == -1) { continue; } const QString charset = val.left(spos); const QString lang = val.mid(spos + 1, npos - spos - 1); const QByteArray encodedVal = val.mid(npos + 1).toLatin1(); if ( ! isValidPercentEncoding(encodedVal) ) continue; const QByteArray rawval = QByteArray::fromPercentEncoding(encodedVal); if (charset.isEmpty() || (charset == QLatin1String("us-ascii"))) { bool valid = true; for (int j = rawval.length() - 1; (j >= 0) && valid; j--) { valid = (rawval.at(j) >= 32); } if (!valid) continue; val = QString::fromLatin1(rawval.constData()); } else { QTextCodec *codec = QTextCodec::codecForName(charset.toLatin1()); if (!codec) continue; val = codec->toUnicode(rawval); } parameters.insert(i.key(), val); } return parameters; } static QMap contentDispositionParser(const QString &disposition) { QMap parameters = contentDispositionParserInternal(disposition); const QLatin1String fn("filename"); if (parameters.contains(fn)) { // Content-Disposition is not allowed to dictate directory // path, thus we extract the filename only. const QString val = QDir::toNativeSeparators(parameters[fn]); int slpos = val.lastIndexOf(QDir::separator()); if (slpos > -1) { parameters.insert(fn, val.mid(slpos + 1)); } } return parameters; }