kdelibs/khtml/xpath/tokenizer.cpp

/*
 * tokenizer.cc - Copyright 2005 Maksim Orlovich <maksim@kde.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "tokenizer.h"

#include "xml/dom_stringimpl.h"
#include "xml/dom3_xpathimpl.h"
#include "dom/dom3_xpath.h"

#include <cstdio>

using namespace std;

using namespace DOM;
using namespace DOM::XPath;
using namespace khtml;
using namespace khtml::XPath;

namespace khtml {
namespace XPath {

struct AxisNameMapping
{
	const char *name;
	Step::AxisType type;
};

static AxisNameMapping axisNames[] = {
	{ "ancestor", Step::AncestorAxis },
	{ "ancestor-or-self", Step::AncestorOrSelfAxis },
	{ "attribute", Step::AttributeAxis },
	{ "child", Step::ChildAxis },
	{ "descendant", Step::DescendantAxis },
	{ "descendant-or-self", Step::DescendantOrSelfAxis },
	{ "following", Step::FollowingAxis },
	{ "following-sibling", Step::FollowingSiblingAxis },
	{ "namespace", Step::NamespaceAxis },
	{ "parent", Step::ParentAxis },
	{ "preceding", Step::PrecedingAxis },
	{ "preceding-sibling", Step::PrecedingSiblingAxis },
	{ "self", Step::SelfAxis }
};
static unsigned int axisNamesCount = sizeof(axisNames) / sizeof(axisNames[0]);

static const char* const nodeTypeNames[] = {
	"comment",
	"text",
	"processing-instruction",
	"node",
	0
};

QHash<QString, Step::AxisType>* Tokenizer::s_axisNamesDict     = 0;
QSet<QString>* Tokenizer::s_nodeTypeNamesDict = 0;

Tokenizer &Tokenizer::self()
{
	static Tokenizer instance;
	return instance;
}

Tokenizer::XMLCat Tokenizer::charCat(QChar aChar)
{
	//### might need to add some special cases from the XML spec.

	if (aChar.unicode() == '_')
		return NameStart;

	if (aChar.unicode() == '.' || aChar.unicode() == '-')
		return NameCont;

	switch (aChar.category()) {
		case QChar::Letter_Lowercase: //Ll
		case QChar::Letter_Uppercase: //Lu
		case QChar::Letter_Other:     //Lo
		case QChar::Letter_Titlecase: //Lt
		case QChar::Number_Letter:    //Nl
			return NameStart;

		case QChar::Mark_SpacingCombining: //Mc
		case QChar::Mark_Enclosing:        //Me
		case QChar::Mark_NonSpacing:       //Mn
		case QChar::Letter_Modifier:       //Lm
		case QChar::Number_DecimalDigit:   //Nd
			return NameCont;

		default:
			return NotPartOfName;
	}
}

bool Tokenizer::isAxisName(QString name, Step::AxisType *type)
{
	if (!s_axisNamesDict) {
		s_axisNamesDict = new QHash<QString, Step::AxisType>;
		for (unsigned int p = 0; p < axisNamesCount; ++p)
			s_axisNamesDict->insert(QLatin1String(axisNames[p].name),
			                        axisNames[p].type);
	}

	QHash<QString, Step::AxisType>::ConstIterator it = s_axisNamesDict->constFind(name);
	if ( it != s_axisNamesDict->constEnd() ) {
		*type = *it;
	}
	return it != s_axisNamesDict->constEnd();
}

bool Tokenizer::isNodeTypeName(QString name)
{
	if (!s_nodeTypeNamesDict) {
		s_nodeTypeNamesDict = new QSet<QString>;
		for (int p = 0; nodeTypeNames[p]; ++p)
			s_nodeTypeNamesDict->insert(QLatin1String(nodeTypeNames[p]));
	}
	return s_nodeTypeNamesDict->contains(name);
}

/* Returns whether the last parsed token matches the [32] Operator rule
 * (check http://www.w3.org/TR/xpath#exprlex). Necessary to disambiguate
 * the tokens.
 */
bool Tokenizer::isOperatorContext()
{
	if ( m_nextPos == 0 ) {
		return false;
	}

	switch ( m_lastTokenType ) {
		case AND: case OR: case MULOP:
		case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
		case EQOP: case RELOP:
		case '@': case AXISNAME:   case '(': case '[':
			return false;
		default:
			return true;
	}
}

void Tokenizer::skipWS()
{
	while (m_nextPos < m_data.length() && m_data[m_nextPos].isSpace())
		++m_nextPos;
}

Token Tokenizer::makeTokenAndAdvance(int code, int advance)
{
	m_nextPos += advance;
	return Token(code);
}

Token Tokenizer::makeIntTokenAndAdvance(int code, int val, int advance)
{
	m_nextPos += advance;
	return Token(code, val);
}

//Returns next char if it's there and interesting, 0 otherwise
char Tokenizer::peekAheadHelper()
{
	if (m_nextPos + 1 >= m_data.length())
		return 0;
	QChar next = m_data[m_nextPos + 1];
	if (next.row() != 0)
		return 0;
	else
		return next.cell();
}

char Tokenizer::peekCurHelper()
{
	if (m_nextPos >= m_data.length())
		return 0;
	QChar next = m_data[m_nextPos];
	if (next.row() != 0)
		return 0;
	else
		return next.cell();
}

Token Tokenizer::lexString()
{
	QChar delimiter = m_data[m_nextPos];
	int   startPos  = m_nextPos + 1;

	for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
		if (m_data[m_nextPos] == delimiter) {
			QString value = m_data.mid(startPos, m_nextPos - startPos);
			++m_nextPos; //Consume the char;
			return Token(LITERAL, value);
		}
	}

	//Ouch, went off the end -- report error
	return Token(ERROR);
}

Token Tokenizer::lexNumber()
{
	int startPos = m_nextPos;
	bool seenDot = false;

	//Go until end or a non-digits character
	for (; m_nextPos < m_data.length(); ++m_nextPos) {
		QChar aChar = m_data[m_nextPos];
		if (aChar.row() != 0) break;

		if (aChar.cell() < '0' || aChar.cell() > '9') {
			if (aChar.cell() == '.' && !seenDot)
				seenDot = true;
			else
				break;
		}
	}

	QString value = m_data.mid(startPos, m_nextPos - startPos);
	return Token(NUMBER, value);
}

Token Tokenizer::lexNCName()
{
	int startPos = m_nextPos;
	if (m_nextPos < m_data.length() && charCat(m_data[m_nextPos]) == NameStart)
	{
		//Keep going until we get a character that's not good for names.
		for (; m_nextPos < m_data.length(); ++m_nextPos) {
			if (charCat(m_data[m_nextPos]) == NotPartOfName)
				break;
		}

		QString value = m_data.mid(startPos, m_nextPos - startPos);
		return Token(value);
	}
	else
		return makeTokenAndAdvance(ERROR);
}

Token Tokenizer::lexQName()
{
	Token t1 = lexNCName();
	if (t1.type == ERROR) return t1;
	skipWS();
	//If the next character is :, what we just got it the prefix, if not,
	//it's the whole thing
	if (peekAheadHelper() != ':')
		return t1;

	Token t2 = lexNCName();
	if (t2.type == ERROR) return t2;

	return Token(t1.value + ":" + t2.value);
}

Token Tokenizer::nextTokenInternal()
{
	skipWS();

	if (m_nextPos >= m_data.length()) {
		return Token(0);
	}

	char code = peekCurHelper();
	switch (code) {
		case '(': case ')': case '[': case ']':
		case '@': case ',': case '|':
			return makeTokenAndAdvance(code);
		case '\'':
		case '\"':
			return lexString();
		case '0': case '1': case '2': case '3': case '4':
		case '5': case '6': case '7': case '8': case '9':
			return lexNumber();
		case '.': {
			char next = peekAheadHelper();
			if (next == '.')
				return makeTokenAndAdvance(DOTDOT, 2);
			else if (next >= '0' && next <= '9')
				return lexNumber();
			else
				return makeTokenAndAdvance('.');
		}
		case '/':
			if (peekAheadHelper() == '/')
				return makeTokenAndAdvance(SLASHSLASH, 2);
			else
				return makeTokenAndAdvance('/');
		case '+':
			return makeTokenAndAdvance(PLUS);
		case '-':
			return makeTokenAndAdvance(MINUS);
		case '=':
			return makeIntTokenAndAdvance(EQOP, RelationOp::OP_EQ);
		case '!':
			if (peekAheadHelper() == '=')
				return makeIntTokenAndAdvance(EQOP, RelationOp::OP_NE, 2);
			else {
				return Token(ERROR);
			}
		case '<':
			if (peekAheadHelper() == '=')
				return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LE, 2);
			else
				return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LT);
		case '>':
			if (peekAheadHelper() == '=')
				return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GE, 2);
			else
				return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GT);
		case '*':
			if (isOperatorContext())
				return makeIntTokenAndAdvance(MULOP, NumericOp::OP_Mul);
			else {
				++m_nextPos;
				return Token(NAMETEST, "*");
			}
		case '$': {//$ QName
			m_nextPos++;
			Token par = lexQName();
			if (par.type == ERROR)
				return par;
			else
				return Token(VARIABLEREFERENCE, par.value);
		}
	}

	Token t1 = lexNCName();
	if (t1.type == ERROR) return t1;

	skipWS();

	//If we're in an operator context, check for any operator names
	if (isOperatorContext()) {
		if (t1.value == QLatin1String("and")) //### hash?
			return Token(AND);
		if (t1.value == QLatin1String("or"))
			return Token(OR);
		if (t1.value == QLatin1String("mod"))
			return Token(MULOP, NumericOp::OP_Mod);
		if (t1.value == QLatin1String("div"))
			return Token(MULOP, NumericOp::OP_Div);
	}

	//See whether we are at a :
	if (peekCurHelper() == ':') {
		m_nextPos++;
		//Any chance it's an axis name?
		if (peekCurHelper() == ':') {
			m_nextPos++;

			//It might be an axis name.
			Step::AxisType axisType;
			if (isAxisName(t1.value, &axisType))
				return Token(AXISNAME, axisType);
			//Ugh, :: is only valid in axis names -> error
			return Token(ERROR);
		}

		//Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
		skipWS();
		if (peekCurHelper() == '*') {
			m_nextPos++;
			return Token(NAMETEST, t1.value + ":*");
		}

		//Make a full qname..
		Token t2 = lexNCName();
		if (t2.type == ERROR) return t2;

		t1.value = t1.value + ':' + t2.value;
	}

	skipWS();
	if (peekCurHelper() == '(') {
		//note: we don't swallow the ( here!

		//either node type of function name
		if (isNodeTypeName(t1.value)) {
			if (t1.value == "processing-instruction")
				return Token(PI, t1.value);
			else
				return Token(NODETYPE, t1.value);
		}
		//must be a function name.
		return Token(FUNCTIONNAME, t1.value);
	}

	//At this point, it must be NAMETEST
	return Token(NAMETEST, t1.value);
}

Token Tokenizer::nextToken()
{
	Token toRet = nextTokenInternal();
	m_lastTokenType = toRet.type;
	return toRet;
}

Tokenizer::Tokenizer()
{
	reset(QString());
}

Tokenizer::~Tokenizer()
{
	delete s_axisNamesDict;
	delete s_nodeTypeNamesDict;
}

void Tokenizer::reset(QString data)
{
	m_nextPos = 0;
	m_data = data;
	m_lastTokenType = 0;
}

int khtmlxpathyylex()
{
	Token tok = Tokenizer::self().nextToken();
	if (tok.hasString) {
		khtmlxpathyylval.str = new DOMString(tok.value);
	} else if (tok.intValue) {
		khtmlxpathyylval.num = tok.intValue;
	}
	return tok.type;
}

void initTokenizer(const DOM::DOMString& string)
{
	Tokenizer::self().reset(string.string());
}

} // namespace XPath
} // namespace khtml

// kate: indent-width 4; replace-tabs off; tab-width 4; indent-spaces: off;