// kdelibs/kjs/jsonlexer.cpp
/*
* This file is part of the KDE libraries
* Copyright (C) 2012 Bernd Buschinski (b.buschinski@googlemail.com)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public License
* along with this library; see the file COPYING.LIB. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
*
*/
#include "jsonlexer.h"
#include <stack>
#include "lexer.h"
#include "object.h"
#include "wtf/Assertions.h"
// #define JSONLEXER_DEBUG_VERBOSE
namespace KJS {
using namespace JSONParserState;
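// Unescaped control characters U+0000 through U+001F are not allowed inside
// JSON strings (ECMA-262 5.1, 15.12.1.1); lexString() rejects them.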
static const unsigned short InvalidJSONUnicode = 0x001F;
static inline bool isDecimalDigit(const UChar &c)
{
return (c.uc >= '0' && c.uc <= '9');
}
static inline bool isHexDigit(const UChar& c)
{
return (isDecimalDigit(c) ||
(c.uc >= 'a' && c.uc <= 'f') ||
(c.uc >= 'A' && c.uc <= 'F'));
}
static inline bool isJSONWhiteSpace(const UChar& c)
{
//ECMA Edition 5.1r6 - 15.12.1.1 - Syntax
switch (c.uc) {
case 0x0020: //SP
case 0x0009: //TAB
case 0x000A: //LF
case 0x000D: //CR
return true;
default:
return false;
}
}
#ifdef JSONLEXER_DEBUG_VERBOSE
static inline UString tokenToString(TokenType type)
{
switch (type) {
case TokLBracket: return UString("TokLBracket");
case TokRBracket: return UString("TokRBracket");
case TokLBrace: return UString("TokLBrace");
case TokRBrace: return UString("TokRBrace");
case TokString: return UString("TokString");
case TokIdentifier: return UString("TokIdentifier");
case TokNumber: return UString("TokNumber");
case TokColon: return UString("TokColon");
case TokLParen: return UString("TokLParen");
case TokRParen: return UString("TokRParen");
case TokComma: return UString("TokComma");
case TokTrue: return UString("TokTrue");
case TokFalse: return UString("TokFalse");
case TokNull: return UString("TokNull");
case TokEnd: return UString("TokEnd");
case TokError: return UString("TokError");
}
ASSERT_NOT_REACHED();
return UString("Default");
}
static inline UString parserStateToString(ParserState state)
{
switch (state) {
case JSONValue: return UString("JSONValue");
case JSONObject: return UString("JSONObject");
case JSONArray: return UString("JSONArray");
}
ASSERT_NOT_REACHED();
return UString("Default");
}
#endif
// ------------------------------ JSONParser --------------------------------
JSONParser::JSONParser(const UString& code)
: m_lexer(code)
{
#ifdef JSONLEXER_DEBUG_VERBOSE
fprintf(stderr, "=============== new JSONParser ===============\n%s\n===============\n", code.ascii());
#endif
}
JSValue* JSONParser::tryParse(ExecState* exec)
{
JSValue* ret = parse(exec);
// If the syntax is correct, parse() never consumes the EOF token; the last
// token it uses may be '}'. But input like "{} xyz" is also invalid, so we
// have to check that the next (and therefore last) token is EOF.
if (ret && nextParseIsEOF())
return ret;
return 0;
}
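// Illustrative usage sketch (not compiled; for documentation only): how a
// caller such as a JSON.parse implementation might drive tryParse(). The
// ExecState* is assumed to be supplied by the interpreter.
#if 0
static JSValue* tryParseExample(ExecState* exec)
{
    JSONParser good(UString("{\"a\": [1, 2]}"));
    JSValue* value = good.tryParse(exec); // non-zero: the whole input was consumed

    JSONParser bad(UString("{} xyz"));
    JSValue* trailing = bad.tryParse(exec); // 0: "xyz" follows the object

    (void)trailing;
    return value;
}
#endif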
// Helper function for appending a value to an array object.
// The arrayStack keeps every value added so far, so its size yields the next array index.
// This function returns false for a null value or when an exception is pending.
static inline bool addArrayItem(ExecState* exec, std::stack<JSValue*>* arrayStack, JSValue* value, JSObject* object)
{
if (exec->hadException())
return false;
if (!value)
return false;
arrayStack->push(value);
object->put(exec, arrayStack->size()-1, value);
return true;
}
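// For example, while parsing "[10, 20]" addArrayItem() is called twice: the
// stack grows to size 1 and then 2, so the values are stored at array
// indices 0 and 1 respectively.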
JSValue* JSONParser::parse(ExecState* exec, ParserState state)
{
if (exec->hadException())
return 0;
ParserState tState = state;
TokenType type = m_lexer.next();
JSObject* object = 0;
std::stack<JSValue*> arrayObjectStack;
UString propertyName;
// While parsing an Object: did we find a property name yet?
// NOTE: empty property names are allowed.
bool havePropertyName = false;
// While parsing an Object/Array: has a property/element actually been added
// since the last comma ','?
bool propAdded = false;
// While parsing an Array: remember whether the last token found was a comma.
bool lastFoundIsTokComma = false;
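// Illustrative examples of inputs these flags reject:
//   [1 2]     -> propAdded is still true when the second value arrives
//   [1,,2]    -> the second comma arrives while propAdded is false
//   [1,]      -> lastFoundIsTokComma is true when ']' arrives
//   {"a" "b"} -> havePropertyName is already true when "b" arrives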
while (type != TokEnd && type != TokError) {
#ifdef JSONLEXER_DEBUG_VERBOSE
fprintf(stderr, "TokenType: %s \t State: %s\n", tokenToString(type).ascii(), parserStateToString(tState).ascii());
#endif
switch (tState) {
case JSONValue:
switch (type) {
case TokLBracket:
object = static_cast<JSObject *>(exec->lexicalInterpreter()->builtinArray()->construct(exec, List::empty()));
tState = JSONArray;
break;
case TokLBrace:
object = static_cast<JSObject *>(exec->lexicalInterpreter()->builtinObject()->construct(exec, List::empty()));
tState = JSONObject;
break;
case TokString:
return jsString(m_lexer.currentString());
case TokNull:
return jsNull();
case TokTrue:
return jsBoolean(true);
case TokFalse:
return jsBoolean(false);
case TokNumber:
return jsNumber(m_lexer.currentNumber());
default:
// This can only happen on invalid syntax; by returning 0
// we tell the caller that we hit a syntax error.
// ASSERT_NOT_REACHED();
return 0;
}
break;
case JSONObject: {
// If we got called from the JSONArray TokLBrace case, no object has been created yet.
// In more detail, for the JSON string "[{}]":
// when parse() runs with state=JSONArray and type=TokLBrace it has just found
// the '{' inside the array and calls parse(exec, JSONObject); in that recursive
// call state=JSONObject, type=TokRBrace ("}") and our new, local object is 0 (!).
// We would finish the object and return it, but as object is null we would
// return 0, which is wrong, because empty objects are allowed; we would
// wrongly report invalid data.
// Worse, for a JSON string like "[{"a":1}]" we would end up calling
// putDirect on a null object and crash.
// In short: remove the line below and we will crash.
object = object ? object : static_cast<JSObject *>(exec->lexicalInterpreter()->builtinObject()->construct(exec, List::empty()));
switch (type) {
case TokString: // PropertyName
if (havePropertyName)
return 0;
propertyName = m_lexer.currentString();
havePropertyName = true;
break;
case TokColon: {
if (!havePropertyName)
return 0;
JSValue* val = parse(exec, JSONValue);
if (!val)
return 0;
// use putDirect to by-pass __proto__
object->putDirect(Identifier(propertyName), val);
propertyName = "";
havePropertyName = false;
propAdded = true;
break;
}
case TokRBrace: //Finish Object
if (havePropertyName)
return 0;
return object;
case TokComma: // Next Property
if (!propAdded)
return 0;
propAdded = false;
break;
default:
// This can only happen on invalid syntax; by returning 0
// we tell the caller that we hit a syntax error.
// ASSERT_NOT_REACHED();
return 0;
}
break;
}
case JSONArray: {
// If we got called from the JSONArray TokLBracket case (a nested array), no array object has been created yet.
object = object ? object : static_cast<JSObject *>(exec->lexicalInterpreter()->builtinArray()->construct(exec, List::empty()));
// Check for invalid Array syntax, like ["1" "2"]
switch (type) {
case TokNumber:
case TokString:
case TokNull:
case TokTrue:
case TokFalse:
case TokLBrace:
case TokLBracket:
if (propAdded)
return 0;
propAdded = true;
lastFoundIsTokComma = false;
break;
default:
break;
}
switch (type) {
case TokRBracket: // Finish array
// Check for invalid syntax like "[1,]"
if (lastFoundIsTokComma)
return 0;
return object;
case TokNumber:
if (!addArrayItem(exec, &arrayObjectStack, jsNumber(m_lexer.currentNumber()), object))
return 0;
break;
case TokString:
if (!addArrayItem(exec, &arrayObjectStack, jsString(m_lexer.currentString()), object))
return 0;
break;
case TokNull:
if (!addArrayItem(exec, &arrayObjectStack, jsNull(), object))
return 0;
break;
case TokTrue:
if (!addArrayItem(exec, &arrayObjectStack, jsBoolean(true), object))
return 0;
break;
case TokFalse:
if (!addArrayItem(exec, &arrayObjectStack, jsBoolean(false), object))
return 0;
break;
case TokLBrace:
if (!addArrayItem(exec, &arrayObjectStack, parse(exec, JSONObject), object))
return 0;
break;
case TokLBracket:
if (!addArrayItem(exec, &arrayObjectStack, parse(exec, JSONArray), object))
return 0;
break;
case TokComma: // Skip Comma and parse next Array Element
// A comma without a preceding array element is invalid syntax.
if (!propAdded)
return 0;
propAdded = false;
lastFoundIsTokComma = true;
break;
default:
// This can only happen on invalid syntax; by returning 0
// we tell the caller that we hit a syntax error.
// ASSERT_NOT_REACHED();
return 0;
}
break;
}
default:
ASSERT_NOT_REACHED();
return 0;
}
type = m_lexer.next();
}
if (type == TokError) {
#ifdef JSONLEXER_DEBUG_VERBOSE
fprintf(stderr, "WARNING: JSONParse ending with error!\n");
#endif
return 0;
}
if (type == TokEnd) {
#ifdef JSONLEXER_DEBUG_VERBOSE
fprintf(stderr, "WARNING: JSONParse ending with unexpected END!\n");
#endif
return 0;
}
ASSERT_NOT_REACHED();
return 0;
}
// ------------------------------ JSONLexer --------------------------------
JSONLexer::JSONLexer(const UString& code)
: m_code(code),
m_pos(0)
{
}
TokenType JSONLexer::current()
{
return m_type;
}
double JSONLexer::currentNumber() const
{
ASSERT(m_type == TokNumber);
return m_numberToken;
}
UString JSONLexer::currentString() const
{
ASSERT(m_type == TokString);
return m_stringToken;
}
TokenType JSONLexer::lexString()
{
UString string;
const int codeSize = m_code.size();
// skip the opening '"'
++m_pos;
if (m_pos >= codeSize) {
m_type = TokError;
return m_type;
}
// loop until the closing '"' is found
while (!(m_code[m_pos] == '"')) {
UChar cur = m_code[m_pos];
if (cur == UChar('\\')) {
++m_pos;
bool error = false;
string.append(parseEscapeChar(&error));
if (error) {
m_type = TokError;
return m_type;
}
} else {
if (cur.uc <= InvalidJSONUnicode) {
m_type = TokError;
return m_type;
}
string.append(cur);
++m_pos;
}
if (m_pos >= codeSize) {
m_type = TokError;
return m_type;
}
}
m_type = TokString;
m_stringToken = string;
++m_pos;
#ifdef JSONLEXER_DEBUG_VERBOSE
fprintf(stderr, "JSONLexer::lexString: Pos:%d stringlength:%d string:%s\n", m_pos, string.size(), string.ascii());
#endif
return m_type;
}
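// For example, the source sequence "a\u0041b" (as it appears in the JSON
// text) is lexed into the three-character token string "aAb", while an
// unescaped control character such as a raw tab between the quotes makes
// lexString() return TokError.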
TokenType JSONLexer::lexNumber()
{
const int start = m_pos;
const int codeSize = m_code.size();
// -?(0 | [1-9][0-9]*) ('.' [0-9]+)? ([eE][+-]? [0-9]+)?
// -?
if (m_pos < codeSize && m_code[m_pos] == '-')
++m_pos;
// (0 | [1-9][0-9]*)
if (m_pos < codeSize && m_code[m_pos] == '0') {
++m_pos;
} else if (m_pos < codeSize && isDecimalDigit(m_code[m_pos])) {
// at least one digit is required here; otherwise input such as "-e5" would be
// lexed as a (NaN) number instead of being rejected
while (m_pos < codeSize && isDecimalDigit(m_code[m_pos])) {
++m_pos;
}
} else {
m_type = TokError;
return m_type;
}
// ('.' [0-9]+)?
if (m_pos < codeSize && m_code[m_pos] == '.') {
++m_pos;
// [0-9]+
if (m_pos >= codeSize || !isDecimalDigit(m_code[m_pos])) {
m_type = TokError;
return m_type;
}
++m_pos;
while (m_pos < codeSize && isDecimalDigit(m_code[m_pos]))
++m_pos;
}
// ([eE][+-]? [0-9]+)?
if (m_pos < codeSize && (m_code[m_pos] == 'e' || m_code[m_pos] == 'E')) { // [eE]
++m_pos;
// [-+]?
if (m_pos < codeSize && (m_code[m_pos] == '-' || m_code[m_pos] == '+'))
++m_pos;
// [0-9]+
if (m_pos >= codeSize || !isDecimalDigit(m_code[m_pos])) {
m_type = TokError;
return m_type;
}
++m_pos;
while (m_pos < codeSize && isDecimalDigit(m_code[m_pos]))
++m_pos;
}
m_numberToken = m_code.substr(start, m_pos-start).toDouble(false, false);
m_type = TokNumber;
#ifdef JSONLEXER_DEBUG_VERBOSE
fprintf(stderr, "Number: %f\n", m_numberToken);
#endif
return m_type;
}
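// Illustrative examples of what this grammar accepts and rejects:
//   accepted: "0", "-12.5", "1e+3", "2E-7"
//   rejected: "-", "1.", "1e", "1e+" (TokError)
// Forms such as "+1" or ".5" never reach lexNumber() at all, because next()
// only dispatches here for '-' and the decimal digits.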
UChar JSONLexer::parseEscapeChar(bool* error)
{
UChar cur = m_code[m_pos];
switch (cur.uc) {
case '"':
case '\\':
case '/':
++m_pos;
return cur;
case 'b':
++m_pos;
return UChar('\b');
case 'f':
++m_pos;
return UChar('\f');
case 'n':
++m_pos;
return UChar('\n');
case 'r':
++m_pos;
return UChar('\r');
case 't':
++m_pos;
return UChar('\t');
case 'u':
{
if ((m_code.size() - (m_pos+1)) < 4) {
*error = true;
return UChar(' ');
}
if (!isHexDigit(m_code[m_pos+1]) || !isHexDigit(m_code[m_pos+2]) ||
!isHexDigit(m_code[m_pos+3]) || !isHexDigit(m_code[m_pos+4])) {
*error = true;
return UChar(' ');
}
UChar next = Lexer::convertUnicode(m_code[m_pos+1].uc, m_code[m_pos+2].uc, m_code[m_pos+3].uc, m_code[m_pos+4].uc);
*error = false;
m_pos += 5;
return next;
}
default:
*error = true;
return UChar(' ');
}
}
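// For example, when lexString() encounters the escape sequence \u20AC it calls
// parseEscapeChar() with m_pos on the 'u'; the four hex digits are converted
// via Lexer::convertUnicode(), m_pos advances by five characters and the
// single character U+20AC is returned.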
// Helper function: checks whether "word" occurs in "code" at position "pos".
static inline bool isStringSequence(int pos, const UString& code, const UString& word)
{
const int wordSize = word.size();
if (pos + wordSize > code.size())
return false;
// Skip the first character; the caller has already checked it.
for (int i = 1; i < wordSize; ++i) {
if (code[pos+i].uc != word[i].uc)
return false;
}
return true;
}
TokenType JSONLexer::next()
{
while(true) {
if (m_pos >= m_code.size()) {
m_type = TokEnd;
return m_type;
}
if (!isJSONWhiteSpace(m_code[m_pos])) {
break;
}
++m_pos;
}
m_type = TokError;
#ifdef JSONLEXER_DEBUG_VERBOSE
fprintf(stderr, "JSONLexer::next current: %c \t\t pos: %d/%d\n", char(m_code[m_pos].uc), m_pos, m_code.size());
#endif
switch (m_code[m_pos].uc) {
case '[':
m_type = TokLBracket;
++m_pos;
return m_type;
case ']':
m_type = TokRBracket;
++m_pos;
return m_type;
case '(':
m_type = TokLParen;
++m_pos;
return m_type;
case ')':
m_type = TokRParen;
++m_pos;
return m_type;
case '{':
m_type = TokLBrace;
++m_pos;
return m_type;
case '}':
m_type = TokRBrace;
++m_pos;
return m_type;
case ',':
m_type = TokComma;
++m_pos;
return m_type;
case ':':
m_type = TokColon;
++m_pos;
return m_type;
case '"':
return lexString();
case 't':
if (isStringSequence(m_pos, m_code, "true")) {
m_type = TokTrue;
m_pos += 4;
return m_type;
}
break;
case 'f':
if (isStringSequence(m_pos, m_code, "false")) {
m_type = TokFalse;
m_pos += 5;
return m_type;
}
break;
case 'n':
if (isStringSequence(m_pos, m_code, "null")) {
m_type = TokNull;
m_pos += 4;
return m_type;
}
break;
case '-':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return lexNumber();
}
return m_type;
}
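// For example, next() tokenizes the input {"a":[1,true]} as the sequence
// TokLBrace, TokString("a"), TokColon, TokLBracket, TokNumber(1), TokComma,
// TokTrue, TokRBracket, TokRBrace and finally TokEnd.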
bool JSONParser::nextParseIsEOF()
{
return m_lexer.next() == TokEnd;
}
} // namespace KJS