kdelibs/khtml/xpath/tokenizer.cpp

456 lines
11 KiB
C++
Raw Normal View History

2014-11-13 01:04:59 +02:00
/*
* tokenizer.cc - Copyright 2005 Maksim Orlovich <maksim@kde.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tokenizer.h"
#include "xml/dom_stringimpl.h"
#include "xml/dom3_xpathimpl.h"
#include "dom/dom3_xpath.h"
#include <cstdio>
using namespace std;
using namespace DOM;
using namespace DOM::XPath;
using namespace khtml;
using namespace khtml::XPath;
namespace khtml {
namespace XPath {
// One row of the axis lookup table: the textual axis name as it appears in
// an XPath expression, paired with the corresponding Step::AxisType value.
struct AxisNameMapping
{
    const char *name;
    Step::AxisType type;
};

// Table of the axis names recognised by the tokenizer. It is copied into
// Tokenizer::s_axisNamesDict the first time isAxisName() is called.
static AxisNameMapping axisNames[] = {
    { "ancestor",           Step::AncestorAxis },
    { "ancestor-or-self",   Step::AncestorOrSelfAxis },
    { "attribute",          Step::AttributeAxis },
    { "child",              Step::ChildAxis },
    { "descendant",         Step::DescendantAxis },
    { "descendant-or-self", Step::DescendantOrSelfAxis },
    { "following",          Step::FollowingAxis },
    { "following-sibling",  Step::FollowingSiblingAxis },
    { "namespace",          Step::NamespaceAxis },
    { "parent",             Step::ParentAxis },
    { "preceding",          Step::PrecedingAxis },
    { "preceding-sibling",  Step::PrecedingSiblingAxis },
    { "self",               Step::SelfAxis }
};

// Number of rows in axisNames, derived from the array size.
static unsigned int axisNamesCount = sizeof axisNames / sizeof axisNames[0];
// Names that denote a node type test when followed by '('. The list is
// terminated by a null pointer, which isNodeTypeName() uses as its loop
// sentinel when filling the lookup set.
static const char* const nodeTypeNames[] = {
    "comment",
    "text",
    "processing-instruction",
    "node",
    0 // sentinel
};
// Lookup tables shared by all Tokenizer instances. Both start out null and
// are allocated on first use by isAxisName() / isNodeTypeName(); they are
// released in ~Tokenizer().
QHash<QString, Step::AxisType>* Tokenizer::s_axisNamesDict = 0;
QSet<QString>* Tokenizer::s_nodeTypeNamesDict = 0;
// Returns the single tokenizer object used by the lexer entry points
// (khtmlxpathyylex() and initTokenizer()). The object is a function-local
// static, so it is created on the first call.
Tokenizer &Tokenizer::self()
{
    static Tokenizer theTokenizer;
    return theTokenizer;
}
// Classifies a character for name lexing:
//   NameStart     - may begin a name (and may also continue one),
//   NameCont      - may only appear after the first character,
//   NotPartOfName - terminates a name.
// '_' , '.' and '-' are handled explicitly; everything else is decided by
// the character's Unicode category.
Tokenizer::XMLCat Tokenizer::charCat(QChar aChar)
{
    //### might need to add some special cases from the XML spec.
    const unsigned short code = aChar.unicode();
    switch (code) {
    case '_':
        return NameStart;
    case '.':
    case '-':
        return NameCont;
    default:
        break;
    }

    switch (aChar.category()) {
    // Combining marks, modifier letters and decimal digits: continuation only.
    case QChar::Number_DecimalDigit:   //Nd
    case QChar::Letter_Modifier:       //Lm
    case QChar::Mark_NonSpacing:       //Mn
    case QChar::Mark_Enclosing:        //Me
    case QChar::Mark_SpacingCombining: //Mc
        return NameCont;
    // Ordinary letters and letter-like numbers: may start a name.
    case QChar::Number_Letter:         //Nl
    case QChar::Letter_Titlecase:      //Lt
    case QChar::Letter_Other:          //Lo
    case QChar::Letter_Uppercase:      //Lu
    case QChar::Letter_Lowercase:      //Ll
        return NameStart;
    default:
        return NotPartOfName;
    }
}
// Looks up 'name' among the known axis names.
// Returns true and stores the matching axis in *type when found; returns
// false and leaves *type untouched otherwise. The lookup dictionary is
// built from the axisNames table on the first call.
bool Tokenizer::isAxisName(QString name, Step::AxisType *type)
{
    typedef QHash<QString, Step::AxisType> AxisDict;

    if (!s_axisNamesDict) {
        s_axisNamesDict = new AxisDict;
        for (unsigned int row = 0; row < axisNamesCount; ++row) {
            const AxisNameMapping &entry = axisNames[row];
            s_axisNamesDict->insert(QLatin1String(entry.name), entry.type);
        }
    }

    AxisDict::ConstIterator found = s_axisNamesDict->constFind(name);
    if (found == s_axisNamesDict->constEnd())
        return false;

    *type = *found;
    return true;
}
// Returns true if 'name' is one of the entries of nodeTypeNames.
// The lookup set is filled from that null-terminated table on the first call.
bool Tokenizer::isNodeTypeName(QString name)
{
    if (!s_nodeTypeNamesDict) {
        s_nodeTypeNamesDict = new QSet<QString>;
        for (const char* const* entry = nodeTypeNames; *entry; ++entry)
            s_nodeTypeNamesDict->insert(QLatin1String(*entry));
    }
    const bool known = s_nodeTypeNamesDict->contains(name);
    return known;
}
/* Returns whether the lexer is in an "operator context", i.e. whether a
 * '*' must be read as the multiply operator and an NCName such as
 * and/or/mod/div as an operator name. This implements the disambiguation
 * rule of XPath 1.0 section 3.7 (http://www.w3.org/TR/xpath#exprlex):
 * the rule applies only if there is a preceding token and that token is
 * not one of '@', '::', '(', '[', ',' or an Operator.
 *
 * Returns false when there is no preceding token or the preceding token
 * is in the list above; true otherwise.
 */
bool Tokenizer::isOperatorContext()
{
    // "No preceding token": reset() sets m_lastTokenType to 0. Checking
    // m_nextPos alone is not enough, because the callers have already
    // skipped leading whitespace or consumed a name by the time they ask,
    // so m_nextPos is non-zero even for the very first token.
    if (m_nextPos == 0 || m_lastTokenType == 0) {
        return false;
    }
    switch (m_lastTokenType) {
    // Operators
    case AND: case OR: case MULOP:
    case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
    case EQOP: case RELOP:
    // '@', '::' (reported as AXISNAME), '(', '[' and ','
    case '@': case AXISNAME: case '(': case '[': case ',':
        return false;
    default:
        return true;
    }
}
// Advances m_nextPos past any run of whitespace characters (as reported by
// QChar::isSpace()), stopping at the first other character or at the end of
// the input.
void Tokenizer::skipWS()
{
    for (;;) {
        if (!(m_nextPos < m_data.length()))
            return;
        if (!m_data[m_nextPos].isSpace())
            return;
        ++m_nextPos;
    }
}
// Builds a value-less token of type 'code' and moves the read position
// forward by 'advance' characters.
Token Tokenizer::makeTokenAndAdvance(int code, int advance)
{
    Token result(code);
    m_nextPos += advance;
    return result;
}
// Builds a token of type 'code' carrying the integer payload 'val' and moves
// the read position forward by 'advance' characters.
Token Tokenizer::makeIntTokenAndAdvance(int code, int val, int advance)
{
    Token result(code, val);
    m_nextPos += advance;
    return result;
}
//Returns next char if it's there and interesting, 0 otherwise
char Tokenizer::peekAheadHelper()
{
if (m_nextPos + 1 >= m_data.length())
return 0;
QChar next = m_data[m_nextPos + 1];
if (next.row() != 0)
return 0;
else
return next.cell();
}
char Tokenizer::peekCurHelper()
{
if (m_nextPos >= m_data.length())
return 0;
QChar next = m_data[m_nextPos];
if (next.row() != 0)
return 0;
else
return next.cell();
}
// Lexes a quoted string literal. On entry m_nextPos is on the opening quote;
// whichever character is found there is used as the closing delimiter.
// Returns a LITERAL token holding the text between the quotes and leaves
// m_nextPos just past the closing quote. If the input ends before a closing
// quote is found, returns an ERROR token with m_nextPos at the end of input.
Token Tokenizer::lexString()
{
    QChar quote = m_data[m_nextPos];
    int bodyStart = m_nextPos + 1;

    m_nextPos = bodyStart;
    while (m_nextPos < m_data.length()) {
        if (m_data[m_nextPos] == quote) {
            QString body = m_data.mid(bodyStart, m_nextPos - bodyStart);
            ++m_nextPos; // step over the closing quote
            return Token(LITERAL, body);
        }
        ++m_nextPos;
    }

    // Unterminated literal.
    return Token(ERROR);
}
// Lexes a number starting at m_nextPos: a run of ASCII digits that may
// contain at most one '.'. Scanning stops at the first character that does
// not fit (including a second '.') or at the end of input. Returns a NUMBER
// token whose value is the scanned text.
Token Tokenizer::lexNumber()
{
    int first = m_nextPos;
    bool dotConsumed = false;

    while (m_nextPos < m_data.length()) {
        QChar ch = m_data[m_nextPos];
        if (ch.row() != 0)
            break;

        const unsigned char low = ch.cell();
        const bool isDigit = !(low < '0' || low > '9');
        if (!isDigit) {
            // Only a single decimal point may be part of the number.
            if (low != '.' || dotConsumed)
                break;
            dotConsumed = true;
        }
        ++m_nextPos;
    }

    QString text = m_data.mid(first, m_nextPos - first);
    return Token(NUMBER, text);
}
// Lexes a name without a colon starting at m_nextPos. The first character
// must be classified as NameStart by charCat(); the name then extends up to
// the first character classified as NotPartOfName (or the end of input).
// Returns a string token with the name, or, when no name starts here, the
// result of makeTokenAndAdvance(ERROR).
Token Tokenizer::lexNCName()
{
    int first = m_nextPos;

    const bool startsName = m_nextPos < m_data.length()
                            && charCat(m_data[m_nextPos]) == NameStart;
    if (!startsName)
        return makeTokenAndAdvance(ERROR);

    // Consume characters for as long as they may be part of a name.
    while (m_nextPos < m_data.length()
           && charCat(m_data[m_nextPos]) != NotPartOfName)
        ++m_nextPos;

    QString text = m_data.mid(first, m_nextPos - first);
    return Token(text);
}
// Lexes a possibly prefixed name ("local" or "prefix:local") starting at
// m_nextPos.
// Returns a string token holding the name ("prefix:local" when a prefix is
// present), or the ERROR token produced by lexNCName() if either part is
// not a valid name.
Token Tokenizer::lexQName()
{
    Token t1 = lexNCName();
    if (t1.type == ERROR) return t1;
    skipWS();
    //If the current character is ':', what we just got is the prefix; if
    //not, it's the whole thing. lexNCName() leaves m_nextPos on the first
    //character after the name, so that is the position to inspect.
    if (peekCurHelper() != ':')
        return t1;
    //Consume the ':' so the local part is lexed from its first character.
    ++m_nextPos;
    Token t2 = lexNCName();
    if (t2.type == ERROR) return t2;
    return Token(t1.value + ":" + t2.value);
}
// Scans and returns the next token starting at m_nextPos.
// Leading whitespace is skipped first; at the end of input Token(0) is
// returned. Punctuation, operators, string literals, numbers and variable
// references are recognised from the current character. Anything else is
// lexed as a name and then classified, in this order: operator name (only in
// operator context), axis name (followed by "::"), prefixed name or
// "prefix:*" name test, node type / function name (followed by '('), and
// finally a plain name test.
// This function does not update m_lastTokenType; nextToken() does that.
Token Tokenizer::nextTokenInternal()
{
skipWS();
if (m_nextPos >= m_data.length()) {
return Token(0);
}
//peekCurHelper() yields 0 for characters with a non-zero high byte; those
//match no case below and fall through to the name handling.
char code = peekCurHelper();
switch (code) {
//Single-character tokens whose token type is the character itself
case '(': case ')': case '[': case ']':
case '@': case ',': case '|':
return makeTokenAndAdvance(code);
case '\'':
case '\"':
return lexString();
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return lexNumber();
//'.' may be "..", the start of a number like ".5", or a lone '.'
case '.': {
char next = peekAheadHelper();
if (next == '.')
return makeTokenAndAdvance(DOTDOT, 2);
else if (next >= '0' && next <= '9')
return lexNumber();
else
return makeTokenAndAdvance('.');
}
case '/':
if (peekAheadHelper() == '/')
return makeTokenAndAdvance(SLASHSLASH, 2);
else
return makeTokenAndAdvance('/');
case '+':
return makeTokenAndAdvance(PLUS);
case '-':
return makeTokenAndAdvance(MINUS);
case '=':
return makeIntTokenAndAdvance(EQOP, RelationOp::OP_EQ);
//'!' is only valid as part of "!="
case '!':
if (peekAheadHelper() == '=')
return makeIntTokenAndAdvance(EQOP, RelationOp::OP_NE, 2);
else {
return Token(ERROR);
}
case '<':
if (peekAheadHelper() == '=')
return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LE, 2);
else
return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LT);
case '>':
if (peekAheadHelper() == '=')
return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GE, 2);
else
return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GT);
//'*' is the multiply operator in operator context, a wildcard name test otherwise
case '*':
if (isOperatorContext())
return makeIntTokenAndAdvance(MULOP, NumericOp::OP_Mul);
else {
++m_nextPos;
return Token(NAMETEST, "*");
}
case '$': {//$ QName
m_nextPos++;
Token par = lexQName();
if (par.type == ERROR)
return par;
else
return Token(VARIABLEREFERENCE, par.value);
}
}
//Not a punctuation/operator/literal start: it has to be a name of some kind
Token t1 = lexNCName();
if (t1.type == ERROR) return t1;
skipWS();
//If we're in an operator context, check for any operator names
if (isOperatorContext()) {
if (t1.value == QLatin1String("and")) //### hash?
return Token(AND);
if (t1.value == QLatin1String("or"))
return Token(OR);
if (t1.value == QLatin1String("mod"))
return Token(MULOP, NumericOp::OP_Mod);
if (t1.value == QLatin1String("div"))
return Token(MULOP, NumericOp::OP_Div);
}
//See whether we are at a :
if (peekCurHelper() == ':') {
m_nextPos++;
//Any chance it's an axis name?
if (peekCurHelper() == ':') {
m_nextPos++;
//It might be an axis name.
Step::AxisType axisType;
if (isAxisName(t1.value, &axisType))
return Token(AXISNAME, axisType);
//Ugh, :: is only valid in axis names -> error
return Token(ERROR);
}
//Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
skipWS();
if (peekCurHelper() == '*') {
m_nextPos++;
return Token(NAMETEST, t1.value + ":*");
}
//Make a full qname..
Token t2 = lexNCName();
if (t2.type == ERROR) return t2;
t1.value = t1.value + ':' + t2.value;
}
skipWS();
if (peekCurHelper() == '(') {
//note: we don't swallow the ( here!
//either node type or function name
if (isNodeTypeName(t1.value)) {
//processing-instruction gets its own token type (PI); the other node types share NODETYPE
if (t1.value == "processing-instruction")
return Token(PI, t1.value);
else
return Token(NODETYPE, t1.value);
}
//must be a function name.
return Token(FUNCTIONNAME, t1.value);
}
//At this point, it must be NAMETEST
return Token(NAMETEST, t1.value);
}
// Returns the next token of the input and remembers its type in
// m_lastTokenType, which isOperatorContext() consults when disambiguating
// the token after it.
Token Tokenizer::nextToken()
{
    Token scanned = nextTokenInternal();
    m_lastTokenType = scanned.type;
    return scanned;
}
// Creates a tokenizer positioned at the start of an empty input.
Tokenizer::Tokenizer()
{
    QString noInput;
    reset(noInput);
}
// Releases the lazily built lookup dictionaries.
// The dictionaries are static members, so the pointers are cleared after
// deletion: a later isAxisName()/isNodeTypeName() call then rebuilds them
// instead of dereferencing freed memory, and running this destructor for
// another Tokenizer object does not delete them a second time.
Tokenizer::~Tokenizer()
{
    delete s_axisNamesDict;
    s_axisNamesDict = 0;
    delete s_nodeTypeNamesDict;
    s_nodeTypeNamesDict = 0;
}
// Starts tokenizing 'data' from its beginning: replaces the input string,
// rewinds the read position to 0 and clears the remembered type of the
// previous token.
void Tokenizer::reset(QString data)
{
    m_data = data;
    m_lastTokenType = 0;
    m_nextPos = 0;
}
int khtmlxpathyylex()
{
Token tok = Tokenizer::self().nextToken();
if (tok.hasString) {
khtmlxpathyylval.str = new DOMString(tok.value);
} else if (tok.intValue) {
khtmlxpathyylval.num = tok.intValue;
}
return tok.type;
}
// Points the shared tokenizer at a new expression: the DOMString is
// converted with string() and tokenizing restarts from its beginning.
void initTokenizer(const DOM::DOMString& string)
{
    Tokenizer &shared = Tokenizer::self();
    shared.reset(string.string());
}
} // namespace XPath
} // namespace khtml
// kate: indent-width 4; replace-tabs off; tab-width 4; indent-spaces: off;