// -*- c-basic-offset: 2 -*- /* * This file is part of the KDE libraries * Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org) * Copyright (C) 2003,2004 Apple Computer, Inc. * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org) * Copyright (C) 2007 Sune Vuorela (debian@pusling.com) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "regexp.h" #include #include "lexer.h" #include #include #include #include #include #if defined _WIN32 || defined _WIN64 #undef HAVE_SYS_TIME_H #endif #if HAVE(SYS_TIME_H) #include #include static const rlim_t sWantedStackSizeLimit = 32*1024*1024; #endif using WTF::Vector; // GCC cstring uses these automatically, but not all implementations do. using std::strlen; using std::strcpy; using std::strncpy; using std::memset; using std::memcpy; namespace KJS { RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown; static bool sanitizePatternExtensions(UString &p, WTF::Vector* parenIdx = 0); // JS regexps can contain Unicode escape sequences (\uxxxx) which // are rather uncommon elsewhere. As our regexp libs don't understand // them we do the unescaping ourselves internally. // Also make sure to expand out any nulls as pcre_compile // expects null termination.. static UString sanitizePattern(const UString &p) { UString np; bool changed = false; const char* const nil = "\\x00"; if (p.find("\\u") >= 0 || p.find(KJS::UChar('\0')) >= 0) { bool escape = false; changed = true; for (int i = 0; i < p.size(); ++i) { UChar c = p[i]; if (escape) { escape = false; // we only care about \u if (c == 'u') { // standard unicode escape sequence looks like \uxxxx but // other browsers also accept less than 4 hex digits unsigned short u = 0; int j = 0; for (j = 0; j < 4; ++j) { if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) { u = (u << 4) + Lexer::convertHex(p[i + 1].unicode()); ++i; } else { // sequence incomplete. restore index. // TODO: cleaner way to propagate warning fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j); i -= j; break; } } if (j < 4) { // sequence was incomplete. treat \u as u which IE always // and FF sometimes does. np.append(UString('u')); } else { c = UChar(u); switch (u) { case 0: // Make sure to encode 0, to avoid terminating the string np += UString(nil); break; case '^': case '$': case '\\': case '.': case '*': case '+': case '?': case '(': case ')': case '{': case '}': case '[': case ']': case '|': // escape pattern characters have to remain escaped np.append(UString('\\')); // intentional fallthrough default: np += UString(&c, 1); break; } } continue; } np += UString('\\'); np += UString(&c, 1); } else { if (c == '\\') escape = true; else if (c == '\0') np += UString(nil); else np += UString(&c, 1); } } } // Rewrite very inefficient RE formulation: // (.|\s)+ is often used instead of the less intuitive, but vastly preferable [\w\W]+ // The first wording needs to recurse at each character matched in libPCRE, leading to rapid exhaustion of stack space. if (p.find(".|\\s)")>=0) { if (np.isEmpty()) np = p; bool didRewrite = false; WTF::Vector parenIdx; sanitizePatternExtensions(np, &parenIdx); Vector::const_iterator end = parenIdx.end(); int previdx = 0; UString tmp; bool nonCapturing = false; for (Vector::const_iterator it = parenIdx.begin(); it != end; ++it) { int idx = *it; if (np.size() < idx+6) break; if (np[idx+1] == '?' && np[idx+2] == ':') { nonCapturing = true; idx += 3; } else { ++idx; } if (!(np[idx] == '.' && np[idx+1] == '|' && np[idx+2] == '\\' && np[idx+3] == 's')) continue; if (np.size() >= idx+6 && (np[idx+5] == '+' || (np[idx+5] == '*')) && // no need to do anything if the pattern is minimal e.g. (.|\s)+? !(np.size() > idx+6 && np[idx+6] == '?')) { didRewrite = true; if (nonCapturing) { // trivial case: (?:.|\s)+ => [\w\W]+ tmp.append(np, previdx, idx-previdx-3); tmp.append("[\\w\\W]"); tmp.append(np[idx+5]); } else if (np[idx+5] == '*') { // capture zero of one or more: (.|\s)* => (?:[\w\W]*([\w\W])|[\w\W]?) tmp.append(np, previdx, idx-previdx-1); tmp.append("(?:[\\w\\W]*([\\w\\W])|[\\w\\W]?)"); } else { // capture last of one or more: (.|\s)+ => [\w\W]*([\w\W]) assert(np[idx+5] == '+'); tmp.append(np, previdx, idx-previdx-1); tmp.append("[\\w\\W]*([\\w\\W])"); } } else { tmp.append(np, previdx, idx-previdx+5); } previdx = idx+6; } if (didRewrite) { tmp.append(np, previdx); fprintf(stderr, "Pattern: %s ", np.ascii()); fprintf(stderr, "was rewritten to: %s\n", tmp.ascii()); np = tmp; changed = true; } } return (changed ? np : p); } // For now, the only 'extension' to standard we are willing to deal with is // a non-escaped closing bracket, outside of a character class. e.g. /.*]/ static bool sanitizePatternExtensions(UString &p, WTF::Vector* parenIdx) { UString newPattern; static const int StateNominal = 0, StateOpenBracket = 1; WTF::Vector v; bool escape = false; int state = StateNominal; int escapedSinceLastParen = 0; for (int i = 0; i < p.size(); ++i) { UChar c = p[i]; if (escape) { escape = false; } else { if (c == '\\') { escape = true; } else if (c == ']') { if (state == StateOpenBracket) { state = StateNominal; } else if (state == StateNominal) { v.append(i); ++escapedSinceLastParen; } } else if (c == '[') { if (state == StateOpenBracket) { v.append(i); ++escapedSinceLastParen; } else if (state == StateNominal) { state = StateOpenBracket; } } else if (c == '(') { if (parenIdx && state == StateNominal) { parenIdx->append(i+escapedSinceLastParen); escapedSinceLastParen = 0; } } } } if (state == StateOpenBracket) { // this is not recoverable. return false; } if (v.size()) { int pos=0; Vector::const_iterator end = v.end(); for (Vector::const_iterator it = v.begin(); it != end; ++it) { newPattern += p.substr(pos, *it-pos); pos = *it; newPattern += UString('\\'); } newPattern += p.substr(pos); p = newPattern; return true; } else { return false; } } bool RegExp::tryGrowingMaxStackSize = true; bool RegExp::didIncreaseMaxStackSize = false; #if HAVE(SYS_TIME_H) rlim_t RegExp::availableStackSize = 8*1024*1024; #else int RegExp::availableStackSize = 8*1024*1024; #endif RegExp::RegExp(const UString &p, char flags) : _pat(p), _flags(flags), _valid(true), _numSubPatterns(0) { #ifdef HAVE_PCREPOSIX // Determine whether libpcre has unicode support if need be.. if (utf8Support == Unknown) { int supported; pcre_config(PCRE_CONFIG_UTF8, (void*)&supported); utf8Support = supported ? Supported : Unsupported; } #endif UString intern = sanitizePattern(p); #ifdef HAVE_PCREPOSIX int options = 0; // we are close but not 100% the same as Perl #ifdef PCRE_JAVASCRIPT_COMPAT // introduced in PCRE 7.7 options |= PCRE_JAVASCRIPT_COMPAT; #endif // Note: the Global flag is already handled by RegExpProtoFunc::execute. // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute. if (flags & IgnoreCase) options |= PCRE_CASELESS; if (flags & Multiline) options |= PCRE_MULTILINE; if (utf8Support == Supported) options |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK); const char *errorMessage; int errorOffset; bool secondTry = false; while (1) { RegExpStringContext converted(intern); _regex = pcre_compile(converted.buffer(), options, &errorMessage, &errorOffset, NULL); if (!_regex) { #ifdef PCRE_JAVASCRIPT_COMPAT // The compilation failed. It is likely the pattern contains non-standard extensions. // We may try to tolerate some of those extensions. bool doRecompile = !secondTry && sanitizePatternExtensions(intern); if (doRecompile) { secondTry = true; #ifndef NDEBUG fprintf(stderr, "KJS: pcre_compile() failed with '%s' - non-standard extensions detected in pattern, trying second compile after correction.\n", errorMessage); #endif continue; } #endif #ifndef NDEBUG fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage); #endif _valid = false; return; } break; } #ifdef PCRE_INFO_CAPTURECOUNT // Get number of subpatterns that will be returned. pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns); #endif #else /* HAVE_PCREPOSIX */ int regflags = 0; #ifdef REG_EXTENDED regflags |= REG_EXTENDED; #endif #ifdef REG_ICASE if ( flags & IgnoreCase ) regflags |= REG_ICASE; #endif //NOTE: Multiline is not feasible with POSIX regex. //if ( f & Multiline ) // ; // Note: the Global flag is already handled by RegExpProtoFunc::execute int errorCode = regcomp(&_regex, intern.ascii(), regflags); if (errorCode != 0) { #ifndef NDEBUG char errorMessage[80]; regerror(errorCode, &_regex, errorMessage, sizeof errorMessage); fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage); #endif _valid = false; } #endif } RegExp::~RegExp() { #ifdef HAVE_PCREPOSIX pcre_free(_regex); #else /* TODO: is this really okay after an error ? */ regfree(&_regex); #endif } void RegExpStringContext::prepareUtf8(const UString& s) { // Allocate a buffer big enough to hold all the characters plus \0 const int length = s.size(); _buffer = new char[length * 3 + 1]; // Also create buffer for positions. We need one extra character in there, // even past the \0 since the non-empty handling may jump one past the end _originalPos = new int[length * 3 + 2]; // Convert to runs of 8-bit characters, and generate indices // Note that we do NOT combine surrogate pairs here, as // regexps operate on them as separate characters char *p = _buffer; int *posOut = _originalPos; const UChar *d = s.data(); for (int i = 0; i != length; ++i) { unsigned short c = d[i].unicode(); int sequenceLen; if (c < 0x80) { *p++ = (char)c; sequenceLen = 1; } else if (c < 0x800) { *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set sequenceLen = 2; } else { *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set sequenceLen = 3; } while (sequenceLen > 0) { *posOut = i; ++posOut; --sequenceLen; } } _bufferSize = p - _buffer; *p++ = '\0'; // Record positions for \0, and the fictional character after that. *posOut = length; *(posOut+1) = length+1; } void RegExpStringContext::prepareASCII (const UString& s) { _originalPos = 0; // Best-effort attempt to get something done // when we don't have utf 8 available -- use // truncated version, and pray for the best CString truncated = s.cstring(); _buffer = new char[truncated.size() + 1]; memcpy(_buffer, truncated.c_str(), truncated.size()); _buffer[truncated.size()] = '\0'; // For _compile use _bufferSize = truncated.size(); } RegExpStringContext::RegExpStringContext(const UString &s) { #ifndef NDEBUG _originalS = s; #endif if (RegExp::utf8Support == RegExp::Supported) prepareUtf8(s); else prepareASCII(s); } RegExpStringContext::~RegExpStringContext() { delete[] _originalPos; _originalPos = 0; delete[] _buffer; _buffer = 0; } UString RegExp::match(const RegExpStringContext& ctx, const UString &s, bool *error, int i, int *pos, int **ovector) { #ifndef NDEBUG assert(s.data() == ctx._originalS.data()); // Make sure the context is right.. #endif if (i < 0) i = 0; int dummyPos; if (!pos) pos = &dummyPos; *pos = -1; if (ovector) *ovector = 0; if (i > s.size() || s.isNull()) return UString::null(); #ifdef HAVE_PCREPOSIX if (!_regex) return UString::null(); // Set up the offset vector for the result. // First 2/3 used for result, the last third used by PCRE. int *offsetVector; int offsetVectorSize; int fixedSizeOffsetVector[3]; if (!ovector) { offsetVectorSize = 3; offsetVector = fixedSizeOffsetVector; } else { offsetVectorSize = (_numSubPatterns + 1) * 3; offsetVector = new int [offsetVectorSize]; } int startPos; if (utf8Support == Supported) { startPos = i; while (ctx.originalPos(startPos) < i) ++startPos; } else { startPos = i; } int baseFlags = utf8Support == Supported ? PCRE_NO_UTF8_CHECK : 0; // See if we have to limit stack space... *error = false; int stackGlutton = 0; pcre_config(PCRE_CONFIG_STACKRECURSE, (void*)&stackGlutton); pcre_extra limits; if (stackGlutton) { #if HAVE(SYS_TIME_H) if (tryGrowingMaxStackSize) { rlimit l; getrlimit(RLIMIT_STACK, &l); availableStackSize = l.rlim_cur; if (l.rlim_cur < sWantedStackSizeLimit && (l.rlim_max > l.rlim_cur || l.rlim_max == RLIM_INFINITY)) { l.rlim_cur = (l.rlim_max == RLIM_INFINITY) ? sWantedStackSizeLimit : std::min(l.rlim_max, sWantedStackSizeLimit); if ((didIncreaseMaxStackSize = !setrlimit( RLIMIT_STACK, &l))) availableStackSize = l.rlim_cur; } tryGrowingMaxStackSize = false; } #endif limits.flags = PCRE_EXTRA_MATCH_LIMIT_RECURSION; // libPCRE docs claim that it munches about 500 bytes per recursion. // The crash in #160792 actually showed pcre 7.4 using about 1300 bytes // (and I've measured 800 in an another instance) // We go somewhat conservative, and use about 3/4ths of that, // especially since we're not exactly light on the stack, either limits.match_limit_recursion = (availableStackSize/1300)*3/4; } const int numMatches = pcre_exec(_regex, stackGlutton ? &limits : 0, ctx.buffer(), ctx.bufferSize(), startPos, baseFlags, offsetVector, offsetVectorSize); //Now go through and patch up the offsetVector if (utf8Support == Supported) for (int c = 0; c < 2 * numMatches; ++c) if (offsetVector[c] != -1) offsetVector[c] = ctx.originalPos(offsetVector[c]); if (numMatches < 0) { #ifndef NDEBUG if (numMatches != PCRE_ERROR_NOMATCH) fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches); #endif if (offsetVector != fixedSizeOffsetVector) delete [] offsetVector; if (numMatches == PCRE_ERROR_MATCHLIMIT || numMatches == PCRE_ERROR_RECURSIONLIMIT) *error = true; return UString::null(); } *pos = offsetVector[0]; if (ovector) *ovector = offsetVector; return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]); #else if (!_valid) return UString::null(); const unsigned maxMatch = 10; regmatch_t rmatch[maxMatch]; char *str = strdup(s.ascii()); // TODO: why ??? if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) { free(str); return UString::null(); } free(str); if (!ovector) { *pos = rmatch[0].rm_so + i; return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so); } // map rmatch array to ovector used in PCRE case _numSubPatterns = 0; for(unsigned j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) _numSubPatterns++; int ovecsize = (_numSubPatterns+1)*3; // see above *ovector = new int[ovecsize]; for (unsigned j = 0; j < _numSubPatterns + 1; j++) { if (j>maxMatch) break; (*ovector)[2*j] = rmatch[j].rm_so + i; (*ovector)[2*j+1] = rmatch[j].rm_eo + i; } *pos = (*ovector)[0]; return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]); #endif } } // namespace KJS