kdelibs/kjs/regexp.cpp

// -*- c-basic-offset: 2 -*-
/*
 *  This file is part of the KDE libraries
 *  Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org)
 *  Copyright (C) 2003,2004 Apple Computer, Inc.
 *  Copyright (C) 2006      Maksim Orlovich (maksim@kde.org)
 *  Copyright (C) 2007      Sune Vuorela (debian@pusling.com)
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

#include "regexp.h"
#include <config-kjs.h>
#include "lexer.h"

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wtf/Vector.h>

#if defined _WIN32 || defined _WIN64
#undef HAVE_SYS_TIME_H
#endif
#if HAVE(SYS_TIME_H)
#include <sys/time.h>
#include <sys/resource.h>

static const rlim_t sWantedStackSizeLimit = 32*1024*1024;

#endif

using WTF::Vector;

// GCC cstring uses these automatically, but not all implementations do.
using std::strlen;
using std::strcpy;
using std::strncpy;
using std::memset;
using std::memcpy;

namespace KJS {

RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;

static bool sanitizePatternExtensions(UString &p, WTF::Vector<int>* parenIdx = 0);

// JS regexps can contain Unicode escape sequences (\uxxxx) which
// are rather uncommon elsewhere. As our regexp libs don't understand
// them we do the unescaping ourselves internally.
// Also make sure to expand out any nulls as pcre_compile
// expects null termination..
static UString sanitizePattern(const UString &p)
{
  UString np;

  bool changed = false;
  const char* const nil = "\\x00";
  if (p.find("\\u") >= 0 || p.find(KJS::UChar('\0')) >= 0) {
    bool escape = false;
    changed = true;
    for (int i = 0; i < p.size(); ++i) {
      UChar c = p[i];
      if (escape) {
        escape = false;
        // we only care about \u
        if (c == 'u') {
	  // standard unicode escape sequence looks like \uxxxx but
	  // other browsers also accept less than 4 hex digits
	  unsigned short u = 0;
	  int j = 0;
	  for (j = 0; j < 4; ++j) {
	    if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
	      u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
	      ++i;
	    } else {
	      // sequence incomplete. restore index.
	      // TODO: cleaner way to propagate warning
	      fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
	      i -= j;
	      break;
	    }
	  }
	  if (j < 4) {
	    // sequence was incomplete. treat \u as u which IE always
	    // and FF sometimes does.
	     np.append(UString('u'));
	  } else {
            c = UChar(u);
            switch (u) {
            case 0:
	      // Make sure to encode 0, to avoid terminating the string
	       np += UString(nil);
	      break;
            case '^':
            case '$':
            case '\\':
            case '.':
            case '*':
            case '+':
            case '?':
            case '(': case ')':
            case '{': case '}':
            case '[': case ']':
            case '|':
	      // escape pattern characters have to remain escaped
	       np.append(UString('\\'));
	      // intentional fallthrough
            default:
	      np += UString(&c, 1);
	      break;
	    }
          }
          continue;
        }
        np += UString('\\');
        np += UString(&c, 1);
      } else {
        if (c == '\\')
          escape = true;
        else if (c == '\0')
          np += UString(nil);
        else
          np += UString(&c, 1);
      }
    }
  }
  // Rewrite very inefficient RE formulation:
  // (.|\s)+ is often used instead of the less intuitive, but vastly preferable [\w\W]+
  // The first wording needs to recurse at each character matched in libPCRE, leading to rapid exhaustion of stack space.
  if (p.find(".|\\s)")>=0) {
      if (np.isEmpty())
          np = p;
      bool didRewrite = false;
      WTF::Vector<int> parenIdx;
      sanitizePatternExtensions(np, &parenIdx);
      Vector<int>::const_iterator end = parenIdx.end();
      int previdx = 0;
      UString tmp;
      bool nonCapturing = false;
      for (Vector<int>::const_iterator it = parenIdx.begin(); it != end; ++it) {
          int idx = *it;
          if (np.size() < idx+6)
              break;
          if (np[idx+1] == '?' && np[idx+2] == ':') {
              nonCapturing = true;
              idx += 3;
          } else {
             ++idx;
          }
          if (!(np[idx] == '.' && np[idx+1] == '|' && np[idx+2] == '\\' && np[idx+3] == 's'))
              continue;
          if (np.size() >= idx+6 && (np[idx+5] == '+' || (np[idx+5] == '*')) &&
              // no need to do anything if the pattern is minimal e.g. (.|\s)+?
              !(np.size() > idx+6 && np[idx+6] == '?')) {
                  didRewrite = true;
                  if (nonCapturing) {               // trivial case: (?:.|\s)+ => [\w\W]+
                      tmp.append(np, previdx, idx-previdx-3);
                      tmp.append("[\\w\\W]");
                      tmp.append(np[idx+5]);
                  } else if (np[idx+5] == '*') {    // capture zero of one or more: (.|\s)* => (?:[\w\W]*([\w\W])|[\w\W]?)
                      tmp.append(np, previdx, idx-previdx-1);
                      tmp.append("(?:[\\w\\W]*([\\w\\W])|[\\w\\W]?)");
                  } else {                          // capture last of one or more: (.|\s)+ => [\w\W]*([\w\W])
                      assert(np[idx+5] == '+');
                      tmp.append(np, previdx, idx-previdx-1);
                      tmp.append("[\\w\\W]*([\\w\\W])");
                  }
          } else {
              tmp.append(np, previdx, idx-previdx+5);
          }
          previdx = idx+6;
      }
      if (didRewrite) {
          tmp.append(np, previdx);
          fprintf(stderr, "Pattern: %s ", np.ascii());
          fprintf(stderr, "was rewritten to: %s\n", tmp.ascii());
          np = tmp;
          changed = true;
      }
  }
  return (changed ? np : p);
}

// For now, the only 'extension' to standard we are willing to deal with is
// a non-escaped closing bracket, outside of a character class. e.g. /.*]/
static bool sanitizePatternExtensions(UString &p, WTF::Vector<int>* parenIdx)
{
  UString newPattern;

  static const int StateNominal = 0, StateOpenBracket = 1;
  WTF::Vector<int> v;
  bool escape = false;

  int state = StateNominal;
  int escapedSinceLastParen = 0;
  for (int i = 0; i < p.size(); ++i) {
      UChar c = p[i];
      if (escape) {
        escape = false;
      } else {
        if (c == '\\') {
          escape = true;
        } else if (c == ']') {
            if (state == StateOpenBracket) {
                state = StateNominal;
            } else if (state == StateNominal) {
                v.append(i);
                ++escapedSinceLastParen;
            }
        } else if (c == '[') {
            if (state == StateOpenBracket) {
                v.append(i);
                ++escapedSinceLastParen;
            } else if (state == StateNominal) {
                state = StateOpenBracket;
            }
        } else if (c == '(') {
            if (parenIdx && state == StateNominal) {
                parenIdx->append(i+escapedSinceLastParen);
                escapedSinceLastParen = 0;
            }
        }
    }
  }
  if (state == StateOpenBracket) {
      // this is not recoverable.
      return false;
  }
  if (v.size()) {
      int pos=0;
      Vector<int>::const_iterator end = v.end();
      for (Vector<int>::const_iterator it = v.begin(); it != end; ++it) {
          newPattern += p.substr(pos, *it-pos);
          pos = *it;
          newPattern += UString('\\');
      }
      newPattern += p.substr(pos);
      p = newPattern;
      return true;
  } else {
      return false;
  }
}

bool RegExp::tryGrowingMaxStackSize = true;
bool RegExp::didIncreaseMaxStackSize = false;

#if HAVE(SYS_TIME_H)
rlim_t RegExp::availableStackSize = 8*1024*1024;
#else
int RegExp::availableStackSize = 8*1024*1024;
#endif

RegExp::RegExp(const UString &p, char flags)
  : _pat(p), _flags(flags), _valid(true), _numSubPatterns(0)
{
#ifdef HAVE_PCREPOSIX
  // Determine whether libpcre has unicode support if need be..
  if (utf8Support == Unknown) {
    int supported;
    pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
    utf8Support = supported ? Supported : Unsupported;
  }
#endif

  UString intern = sanitizePattern(p);

#ifdef HAVE_PCREPOSIX
  int options = 0;

  // we are close but not 100% the same as Perl
#ifdef PCRE_JAVASCRIPT_COMPAT // introduced in PCRE 7.7
  options |= PCRE_JAVASCRIPT_COMPAT;
#endif

  // Note: the Global flag is already handled by RegExpProtoFunc::execute.
  // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute.
  if (flags & IgnoreCase)
    options |= PCRE_CASELESS;
  if (flags & Multiline)
    options |= PCRE_MULTILINE;

  if (utf8Support == Supported)
    options |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);

  const char *errorMessage;
  int errorOffset;
  bool secondTry = false;

  while (1) {
    RegExpStringContext converted(intern);

    _regex = pcre_compile(converted.buffer(), options, &errorMessage, &errorOffset, NULL);

    if (!_regex) {
#ifdef PCRE_JAVASCRIPT_COMPAT
      // The compilation failed. It is likely the pattern contains non-standard extensions.
      // We may try to tolerate some of those extensions.
      bool doRecompile = !secondTry && sanitizePatternExtensions(intern);
      if (doRecompile) {
        secondTry = true;
#ifndef NDEBUG
        fprintf(stderr, "KJS: pcre_compile() failed with '%s' - non-standard extensions detected in pattern, trying second compile after correction.\n", errorMessage);
#endif
        continue;
      }
#endif
#ifndef NDEBUG
      fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
#endif
      _valid = false;
      return;
    }
    break;
  }

#ifdef PCRE_INFO_CAPTURECOUNT
  // Get number of subpatterns that will be returned.
  pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
#endif

#else /* HAVE_PCREPOSIX */

  int regflags = 0;
#ifdef REG_EXTENDED
  regflags |= REG_EXTENDED;
#endif
#ifdef REG_ICASE
  if ( flags & IgnoreCase )
    regflags |= REG_ICASE;
#endif

  //NOTE: Multiline is not feasible with POSIX regex.
  //if ( f & Multiline )
  //    ;
  // Note: the Global flag is already handled by RegExpProtoFunc::execute

  int errorCode = regcomp(&_regex, intern.ascii(), regflags);
  if (errorCode != 0) {
#ifndef NDEBUG
    char errorMessage[80];
    regerror(errorCode, &_regex, errorMessage, sizeof errorMessage);
    fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
#endif
    _valid = false;
  }
#endif
}

RegExp::~RegExp()
{
#ifdef HAVE_PCREPOSIX
  pcre_free(_regex);
#else
  /* TODO: is this really okay after an error ? */
  regfree(&_regex);
#endif
}

void RegExpStringContext::prepareUtf8(const UString& s)
{
  // Allocate a buffer big enough to hold all the characters plus \0
  const int length = s.size();
  _buffer = new char[length * 3 + 1];

  // Also create buffer for positions. We need one extra character in there,
  // even past the \0 since the non-empty handling may jump one past the end
  _originalPos = new int[length * 3 + 2];

  // Convert to runs of 8-bit characters, and generate indices
  // Note that we do NOT combine surrogate pairs here, as
  // regexps operate on them as separate characters
  char *p      = _buffer;
  int  *posOut = _originalPos;
  const UChar *d = s.data();
  for (int i = 0; i != length; ++i) {
    unsigned short c = d[i].unicode();

    int sequenceLen;
    if (c < 0x80) {
      *p++ = (char)c;
      sequenceLen = 1;
    } else if (c < 0x800) {
      *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
      sequenceLen = 2;
    } else {
      *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
      *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
      sequenceLen = 3;
    }

    while (sequenceLen > 0) {
      *posOut = i;
      ++posOut;
      --sequenceLen;
    }
  }

  _bufferSize = p - _buffer;

  *p++ = '\0';

  // Record positions for \0, and the fictional character after that.
  *posOut     = length;
  *(posOut+1) = length+1;
}

void RegExpStringContext::prepareASCII (const UString& s)
{
  _originalPos = 0;

  // Best-effort attempt to get something done
  // when we don't have utf 8 available -- use
  // truncated version, and pray for the best
  CString truncated = s.cstring();
  _buffer = new char[truncated.size() + 1];
  memcpy(_buffer, truncated.c_str(), truncated.size());
  _buffer[truncated.size()] = '\0'; // For _compile use
  _bufferSize = truncated.size();
}

RegExpStringContext::RegExpStringContext(const UString &s)
{
#ifndef NDEBUG
  _originalS = s;
#endif

  if (RegExp::utf8Support == RegExp::Supported)
    prepareUtf8(s);
  else
    prepareASCII(s);
}

RegExpStringContext::~RegExpStringContext()
{
  delete[] _originalPos; _originalPos = 0;
  delete[] _buffer;      _buffer      = 0;
}

UString RegExp::match(const RegExpStringContext& ctx, const UString &s, bool *error, int i, int *pos, int **ovector)
{
#ifndef NDEBUG
  assert(s.data() == ctx._originalS.data()); // Make sure the context is right..
#endif

  if (i < 0)
    i = 0;
  int dummyPos;
  if (!pos)
    pos = &dummyPos;
  *pos = -1;
  if (ovector)
    *ovector = 0;

  if (i > s.size() || s.isNull())
    return UString::null();

#ifdef HAVE_PCREPOSIX

  if (!_regex)
    return UString::null();

  // Set up the offset vector for the result.
  // First 2/3 used for result, the last third used by PCRE.
  int *offsetVector;
  int offsetVectorSize;
  int fixedSizeOffsetVector[3];
  if (!ovector) {
    offsetVectorSize = 3;
    offsetVector = fixedSizeOffsetVector;
  } else {
    offsetVectorSize = (_numSubPatterns + 1) * 3;
    offsetVector = new int [offsetVectorSize];
  }

  int startPos;
  if (utf8Support == Supported) {
    startPos = i;
    while (ctx.originalPos(startPos) < i)
      ++startPos;
  } else {
    startPos = i;
  }

  int baseFlags = utf8Support == Supported ? PCRE_NO_UTF8_CHECK : 0;

  // See if we have to limit stack space...
  *error = false;
  int stackGlutton = 0;
  pcre_config(PCRE_CONFIG_STACKRECURSE, (void*)&stackGlutton);
  pcre_extra limits;
  if (stackGlutton) {
#if HAVE(SYS_TIME_H)
    if (tryGrowingMaxStackSize) {
      rlimit l;
      getrlimit(RLIMIT_STACK, &l);
      availableStackSize = l.rlim_cur;
      if (l.rlim_cur < sWantedStackSizeLimit &&
          (l.rlim_max > l.rlim_cur || l.rlim_max == RLIM_INFINITY)) {
        l.rlim_cur = (l.rlim_max == RLIM_INFINITY) ?
                     sWantedStackSizeLimit : std::min(l.rlim_max, sWantedStackSizeLimit);
        if ((didIncreaseMaxStackSize = !setrlimit( RLIMIT_STACK, &l)))
          availableStackSize = l.rlim_cur;
      }
      tryGrowingMaxStackSize = false;
    }
#endif

    limits.flags = PCRE_EXTRA_MATCH_LIMIT_RECURSION;
    // libPCRE docs claim that it munches about 500 bytes per recursion.
    // The crash in #160792 actually showed pcre 7.4 using about 1300 bytes
    // (and I've measured 800 in an another instance)
    // We go somewhat conservative, and use about 3/4ths of that,
    // especially since we're not exactly light on the stack, either
    limits.match_limit_recursion = (availableStackSize/1300)*3/4;
  }

  const int numMatches = pcre_exec(_regex, stackGlutton ? &limits : 0, ctx.buffer(),
                                   ctx.bufferSize(), startPos, baseFlags, offsetVector, offsetVectorSize);

  //Now go through and patch up the offsetVector
  if (utf8Support == Supported)
    for (int c = 0; c < 2 * numMatches; ++c)
      if (offsetVector[c] != -1)
        offsetVector[c] = ctx.originalPos(offsetVector[c]);

  if (numMatches < 0) {
#ifndef NDEBUG
    if (numMatches != PCRE_ERROR_NOMATCH)
      fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
#endif
    if (offsetVector != fixedSizeOffsetVector)
      delete [] offsetVector;
    if (numMatches == PCRE_ERROR_MATCHLIMIT || numMatches == PCRE_ERROR_RECURSIONLIMIT)
      *error = true;
    return UString::null();
  }

  *pos = offsetVector[0];
  if (ovector)
    *ovector = offsetVector;
  return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);

#else

  if (!_valid)
    return UString::null();

  const unsigned maxMatch = 10;
  regmatch_t rmatch[maxMatch];

  char *str = strdup(s.ascii()); // TODO: why ???
  if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) {
    free(str);
    return UString::null();
  }
  free(str);

  if (!ovector) {
    *pos = rmatch[0].rm_so + i;
    return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
  }

  // map rmatch array to ovector used in PCRE case
  _numSubPatterns = 0;
  for(unsigned j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
      _numSubPatterns++;
  int ovecsize = (_numSubPatterns+1)*3; // see above
  *ovector = new int[ovecsize];
  for (unsigned j = 0; j < _numSubPatterns + 1; j++) {
    if (j>maxMatch)
      break;
    (*ovector)[2*j] = rmatch[j].rm_so + i;
    (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
  }

  *pos = (*ovector)[0];
  return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);

#endif
}

} // namespace KJS