mirror of
https://bitbucket.org/smil3y/kdelibs.git
synced 2025-02-25 11:22:50 +00:00
455 lines
11 KiB
C++
455 lines
11 KiB
C++
/*
|
|
* tokenizer.cc - Copyright 2005 Maksim Orlovich <maksim@kde.org>
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
#include "tokenizer.h"
|
|
|
|
#include "xml/dom_stringimpl.h"
|
|
#include "xml/dom3_xpathimpl.h"
|
|
#include "dom/dom3_xpath.h"
|
|
|
|
#include <cstdio>
|
|
|
|
using namespace std;
|
|
|
|
using namespace DOM;
|
|
using namespace DOM::XPath;
|
|
using namespace khtml;
|
|
using namespace khtml::XPath;
|
|
|
|
namespace khtml {
|
|
namespace XPath {
|
|
|
|
struct AxisNameMapping
|
|
{
|
|
const char *name;
|
|
Step::AxisType type;
|
|
};
|
|
|
|
static AxisNameMapping axisNames[] = {
|
|
{ "ancestor", Step::AncestorAxis },
|
|
{ "ancestor-or-self", Step::AncestorOrSelfAxis },
|
|
{ "attribute", Step::AttributeAxis },
|
|
{ "child", Step::ChildAxis },
|
|
{ "descendant", Step::DescendantAxis },
|
|
{ "descendant-or-self", Step::DescendantOrSelfAxis },
|
|
{ "following", Step::FollowingAxis },
|
|
{ "following-sibling", Step::FollowingSiblingAxis },
|
|
{ "namespace", Step::NamespaceAxis },
|
|
{ "parent", Step::ParentAxis },
|
|
{ "preceding", Step::PrecedingAxis },
|
|
{ "preceding-sibling", Step::PrecedingSiblingAxis },
|
|
{ "self", Step::SelfAxis }
|
|
};
|
|
static unsigned int axisNamesCount = sizeof(axisNames) / sizeof(axisNames[0]);
|
|
|
|
/* Names that isNodeTypeName() accepts. The list is terminated by a null
 * entry, which is what the loading loop in isNodeTypeName() stops on.
 */
static const char* const nodeTypeNames[] = {
    "comment",
    "text",
    "processing-instruction",
    "node",
    0   //end-of-list sentinel
};
|
|
|
|
QHash<QString, Step::AxisType>* Tokenizer::s_axisNamesDict = 0;
|
|
QSet<QString>* Tokenizer::s_nodeTypeNamesDict = 0;
|
|
|
|
/* Gives access to the single Tokenizer object used by khtmlxpathyylex()
 * and initTokenizer(). The object is a function-local static, so it is
 * constructed the first time this function is called.
 */
Tokenizer &Tokenizer::self()
{
    static Tokenizer theTokenizer;
    return theTokenizer;
}
|
|
|
|
/* Classifies a character for name lexing:
 *  - NameStart:     may begin a name (and may also continue one),
 *  - NameCont:      may only appear after the first character,
 *  - NotPartOfName: terminates a name.
 */
Tokenizer::XMLCat Tokenizer::charCat(QChar aChar)
{
    //### might need to add some special cases from the XML spec.

    //Punctuation that is handled explicitly rather than by Unicode category
    switch (aChar.unicode()) {
    case '_':
        return NameStart;
    case '.':
    case '-':
        return NameCont;
    default:
        break;
    }

    //Everything else is decided by the Unicode general category
    switch (aChar.category()) {
    case QChar::Letter_Uppercase:       //Lu
    case QChar::Letter_Lowercase:       //Ll
    case QChar::Letter_Titlecase:       //Lt
    case QChar::Letter_Other:           //Lo
    case QChar::Number_Letter:          //Nl
        return NameStart;

    case QChar::Letter_Modifier:        //Lm
    case QChar::Mark_NonSpacing:        //Mn
    case QChar::Mark_SpacingCombining:  //Mc
    case QChar::Mark_Enclosing:         //Me
    case QChar::Number_DecimalDigit:    //Nd
        return NameCont;

    default:
        return NotPartOfName;
    }
}
|
|
|
|
/* Looks up name among the known axis spellings.
 *
 * Returns true and stores the matching axis in *type when name is an
 * axis name; returns false and leaves *type untouched otherwise.
 */
bool Tokenizer::isAxisName(QString name, Step::AxisType *type)
{
    //Populate the shared dictionary from the static table on first use
    if (!s_axisNamesDict) {
        s_axisNamesDict = new QHash<QString, Step::AxisType>;
        for (unsigned int idx = 0; idx != axisNamesCount; ++idx) {
            const AxisNameMapping &entry = axisNames[idx];
            s_axisNamesDict->insert(QLatin1String(entry.name), entry.type);
        }
    }

    const QHash<QString, Step::AxisType>::ConstIterator match = s_axisNamesDict->constFind(name);
    if (match == s_axisNamesDict->constEnd())
        return false;

    *type = match.value();
    return true;
}
|
|
|
|
/* Returns true when name is one of the entries of nodeTypeNames. */
bool Tokenizer::isNodeTypeName(QString name)
{
    //Populate the shared set on first use; the table ends at its null entry
    if (!s_nodeTypeNamesDict) {
        s_nodeTypeNamesDict = new QSet<QString>;
        for (const char* const* entry = nodeTypeNames; *entry; ++entry)
            s_nodeTypeNamesDict->insert(QLatin1String(*entry));
    }

    return s_nodeTypeNamesDict->contains(name);
}
|
|
|
|
/* Returns whether the last parsed token matches the [32] Operator rule
|
|
* (check http://www.w3.org/TR/xpath#exprlex). Necessary to disambiguate
|
|
* the tokens.
|
|
*/
|
|
bool Tokenizer::isOperatorContext()
|
|
{
|
|
if ( m_nextPos == 0 ) {
|
|
return false;
|
|
}
|
|
|
|
switch ( m_lastTokenType ) {
|
|
case AND: case OR: case MULOP:
|
|
case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
|
|
case EQOP: case RELOP:
|
|
case '@': case AXISNAME: case '(': case '[':
|
|
return false;
|
|
default:
|
|
return true;
|
|
}
|
|
}
|
|
|
|
void Tokenizer::skipWS()
|
|
{
|
|
while (m_nextPos < m_data.length() && m_data[m_nextPos].isSpace())
|
|
++m_nextPos;
|
|
}
|
|
|
|
/* Builds a value-less token of type code and moves the read position
 * forward by advance characters.
 */
Token Tokenizer::makeTokenAndAdvance(int code, int advance)
{
    Token result(code);
    m_nextPos += advance;
    return result;
}
|
|
|
|
/* Builds a token of type code carrying the integer val and moves the
 * read position forward by advance characters.
 */
Token Tokenizer::makeIntTokenAndAdvance(int code, int val, int advance)
{
    Token result(code, val);
    m_nextPos += advance;
    return result;
}
|
|
|
|
//Returns next char if it's there and interesting, 0 otherwise
|
|
char Tokenizer::peekAheadHelper()
|
|
{
|
|
if (m_nextPos + 1 >= m_data.length())
|
|
return 0;
|
|
QChar next = m_data[m_nextPos + 1];
|
|
if (next.row() != 0)
|
|
return 0;
|
|
else
|
|
return next.cell();
|
|
}
|
|
|
|
char Tokenizer::peekCurHelper()
|
|
{
|
|
if (m_nextPos >= m_data.length())
|
|
return 0;
|
|
QChar next = m_data[m_nextPos];
|
|
if (next.row() != 0)
|
|
return 0;
|
|
else
|
|
return next.cell();
|
|
}
|
|
|
|
/* Lexes a quoted string. The character at the current position is taken
 * as the delimiter; the LITERAL token's value is the text up to (but not
 * including) the next occurrence of that delimiter, which is consumed.
 * If the delimiter never recurs, the read position ends up at the end of
 * the input and an ERROR token is returned.
 */
Token Tokenizer::lexString()
{
    const QChar quote = m_data[m_nextPos];
    const int bodyStart = m_nextPos + 1;

    //Scan forward until the closing quote or the end of the input
    m_nextPos = bodyStart;
    while (m_nextPos < m_data.length() && !(m_data[m_nextPos] == quote))
        ++m_nextPos;

    //Ouch, went off the end -- report error
    if (m_nextPos >= m_data.length())
        return Token(ERROR);

    QString body = m_data.mid(bodyStart, m_nextPos - bodyStart);
    ++m_nextPos; //Step over the closing quote
    return Token(LITERAL, body);
}
|
|
|
|
/* Lexes a number starting at the current position: a run of the digits
 * '0'-'9' that may contain at most one '.'. The NUMBER token carries the
 * matched text; the read position is left on the first character that is
 * not part of the number.
 */
Token Tokenizer::lexNumber()
{
    const int begin = m_nextPos;
    bool dotConsumed = false;

    while (m_nextPos < m_data.length()) {
        const QChar ch = m_data[m_nextPos];

        //Anything with a non-zero row cannot be a digit or a dot
        if (ch.row() != 0)
            break;

        const unsigned char low = ch.cell();
        const bool isDigit = (low >= '0' && low <= '9');
        if (!isDigit) {
            //Only the first '.' belongs to the number
            if (low != '.' || dotConsumed)
                break;
            dotConsumed = true;
        }

        ++m_nextPos;
    }

    return Token(NUMBER, m_data.mid(begin, m_nextPos - begin));
}
|
|
|
|
/* Lexes a name without a colon at the current position. The first
 * character must be classified as NameStart by charCat(); the name then
 * extends until a NotPartOfName character or the end of the input.
 *
 * Returns a token holding the name, or the result of
 * makeTokenAndAdvance(ERROR) when no name starts here.
 */
Token Tokenizer::lexNCName()
{
    const int begin = m_nextPos;

    //Bail out early when there is no valid first character
    if (m_nextPos >= m_data.length() || charCat(m_data[m_nextPos]) != NameStart)
        return makeTokenAndAdvance(ERROR);

    //Keep going until we get a character that's not good for names.
    while (m_nextPos < m_data.length() && charCat(m_data[m_nextPos]) != NotPartOfName)
        ++m_nextPos;

    QString name = m_data.mid(begin, m_nextPos - begin);
    return Token(name);
}
|
|
|
|
/* Lexes a possibly prefixed name ("local" or "prefix:local") at the
 * current position.
 *
 * Returns a token holding the name (prefix, ':' and local part joined
 * when a prefix is present), or the ERROR token produced by lexNCName()
 * when either part is not a valid name.
 */
Token Tokenizer::lexQName()
{
    Token t1 = lexNCName();
    if (t1.type == ERROR) return t1;
    skipWS();

    //lexNCName() leaves the read position on the first character after the
    //name, so the ':' -- if any -- is the *current* character, not the one
    //after it. If it is there, what we just got is the prefix; if not,
    //it's the whole thing.
    if (peekCurHelper() != ':')
        return t1;

    //Consume the ':' so that the local part is lexed from its first character
    ++m_nextPos;

    Token t2 = lexNCName();
    if (t2.type == ERROR) return t2;

    return Token(t1.value + ":" + t2.value);
}
|
|
|
|
/* Produces the next token of the expression, starting at the current read
 * position. Leading whitespace is skipped first. At the end of the input
 * a token of type 0 is returned; unlexable input yields an ERROR token.
 *
 * This function does not update m_lastTokenType; nextToken() does that.
 */
Token Tokenizer::nextTokenInternal()
{
    skipWS();

    //End of input
    if (m_nextPos >= m_data.length())
        return Token(0);

    const char current = peekCurHelper();
    switch (current) {
    //Single-character tokens that are their own token type
    case '(': case ')': case '[': case ']':
    case '@': case ',': case '|':
        return makeTokenAndAdvance(current);

    case '\'':
    case '\"':
        return lexString();

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
        return lexNumber();

    case '.': {
        //Could be "..", the start of a number such as ".5", or a lone "."
        const char following = peekAheadHelper();
        if (following == '.')
            return makeTokenAndAdvance(DOTDOT, 2);
        if (following >= '0' && following <= '9')
            return lexNumber();
        return makeTokenAndAdvance('.');
    }

    case '/':
        if (peekAheadHelper() != '/')
            return makeTokenAndAdvance('/');
        return makeTokenAndAdvance(SLASHSLASH, 2);

    case '+':
        return makeTokenAndAdvance(PLUS);

    case '-':
        return makeTokenAndAdvance(MINUS);

    case '=':
        return makeIntTokenAndAdvance(EQOP, RelationOp::OP_EQ);

    case '!':
        //A '!' is only valid as part of "!="; note that the read position
        //is not moved on the error path
        if (peekAheadHelper() != '=')
            return Token(ERROR);
        return makeIntTokenAndAdvance(EQOP, RelationOp::OP_NE, 2);

    case '<':
        if (peekAheadHelper() != '=')
            return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LT);
        return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LE, 2);

    case '>':
        if (peekAheadHelper() != '=')
            return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GT);
        return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GE, 2);

    case '*':
        //Multiplication in operator context, a wildcard name test otherwise
        if (!isOperatorContext()) {
            ++m_nextPos;
            return Token(NAMETEST, "*");
        }
        return makeIntTokenAndAdvance(MULOP, NumericOp::OP_Mul);

    case '$': { //$ QName
        m_nextPos++;
        Token variable = lexQName();
        if (variable.type != ERROR)
            return Token(VARIABLEREFERENCE, variable.value);
        return variable;
    }

    default:
        break;
    }

    //Everything else has to start with a name
    Token name = lexNCName();
    if (name.type == ERROR)
        return name;

    skipWS();

    //In operator context the operator names take precedence over name tests
    if (isOperatorContext()) {
        if (name.value == QLatin1String("and")) //### hash?
            return Token(AND);
        if (name.value == QLatin1String("or"))
            return Token(OR);
        if (name.value == QLatin1String("mod"))
            return Token(MULOP, NumericOp::OP_Mod);
        if (name.value == QLatin1String("div"))
            return Token(MULOP, NumericOp::OP_Div);
    }

    //A ':' after the name starts either "::" (axis) or a prefixed name
    if (peekCurHelper() == ':') {
        m_nextPos++;

        if (peekCurHelper() == ':') {
            m_nextPos++;

            //"::" is only valid after an axis name; anything else is an error
            Step::AxisType axisType;
            if (!isAxisName(name.value, &axisType))
                return Token(ERROR);
            return Token(AXISNAME, axisType);
        }

        //Either the "prefix:*" form of a name test or a fully qualified name
        skipWS();
        if (peekCurHelper() == '*') {
            m_nextPos++;
            return Token(NAMETEST, name.value + ":*");
        }

        Token localPart = lexNCName();
        if (localPart.type == ERROR)
            return localPart;

        name.value = name.value + ':' + localPart.value;
    }

    skipWS();

    //Without a following '(' this can only be a name test
    if (peekCurHelper() != '(')
        return Token(NAMETEST, name.value);

    //note: we don't swallow the ( here!

    //Followed by '(' it is a function name unless it is one of the node types
    if (!isNodeTypeName(name.value))
        return Token(FUNCTIONNAME, name.value);

    if (name.value == "processing-instruction")
        return Token(PI, name.value);

    return Token(NODETYPE, name.value);
}
|
|
|
|
/* Returns the next token and records its type in m_lastTokenType, which
 * isOperatorContext() consults when the following token is lexed.
 */
Token Tokenizer::nextToken()
{
    Token produced = nextTokenInternal();
    m_lastTokenType = produced.type;
    return produced;
}
|
|
|
|
/* Creates a tokenizer with no input: all state is initialised by
 * resetting to a default-constructed string.
 */
Tokenizer::Tokenizer()
{
    reset(QString());
}
|
|
|
|
/* Releases the lazily created lookup dictionaries.
 *
 * The dictionaries are class-static, so the pointers are cleared after
 * deletion: a later isAxisName()/isNodeTypeName() call then rebuilds them
 * instead of touching freed memory, and destroying another Tokenizer
 * object does not free them a second time.
 */
Tokenizer::~Tokenizer()
{
    delete s_axisNamesDict;
    s_axisNamesDict = 0;

    delete s_nodeTypeNamesDict;
    s_nodeTypeNamesDict = 0;
}
|
|
|
|
/* Starts tokenizing data from its beginning: stores the new input,
 * rewinds the read position and forgets the previously produced token
 * type (0 is stored in m_lastTokenType).
 */
void Tokenizer::reset(QString data)
{
    m_data = data;
    m_nextPos = 0;
    m_lastTokenType = 0;
}
|
|
|
|
int khtmlxpathyylex()
|
|
{
|
|
Token tok = Tokenizer::self().nextToken();
|
|
if (tok.hasString) {
|
|
khtmlxpathyylval.str = new DOMString(tok.value);
|
|
} else if (tok.intValue) {
|
|
khtmlxpathyylval.num = tok.intValue;
|
|
}
|
|
return tok.type;
|
|
}
|
|
|
|
/* Points the shared Tokenizer at a new expression, so that subsequent
 * khtmlxpathyylex() calls read from the start of string.
 */
void initTokenizer(const DOM::DOMString& string)
{
    Tokenizer &tokenizer = Tokenizer::self();
    tokenizer.reset(string.string());
}
|
|
|
|
} // namespace XPath
|
|
} // namespace khtml
|
|
|
|
// kate: indent-width 4; replace-tabs off; tab-width 4; indent-spaces: off;
|