diff options
author | Simon Hausmann <simon.hausmann@nokia.com> | 2012-01-06 14:44:00 +0100 |
---|---|---|
committer | Simon Hausmann <simon.hausmann@nokia.com> | 2012-01-06 14:44:00 +0100 |
commit | 40736c5763bf61337c8c14e16d8587db021a87d4 (patch) | |
tree | b17a9c00042ad89cb1308e2484491799aa14e9f8 /Source/JavaScriptCore/yarr | |
download | qtwebkit-40736c5763bf61337c8c14e16d8587db021a87d4.tar.gz |
Imported WebKit commit 2ea9d364d0f6efa8fa64acf19f451504c59be0e4 (http://svn.webkit.org/repository/webkit/trunk@104285)
Diffstat (limited to 'Source/JavaScriptCore/yarr')
-rw-r--r-- | Source/JavaScriptCore/yarr/Yarr.h | 71 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrInterpreter.cpp | 1974 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrInterpreter.h | 380 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrJIT.cpp | 2546 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrJIT.h | 108 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrParser.h | 874 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrPattern.cpp | 821 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrPattern.h | 421 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp | 59 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrSyntaxChecker.h | 38 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/yarr.pri | 13 |
11 files changed, 7305 insertions, 0 deletions
diff --git a/Source/JavaScriptCore/yarr/Yarr.h b/Source/JavaScriptCore/yarr/Yarr.h new file mode 100644 index 000000000..501afac5f --- /dev/null +++ b/Source/JavaScriptCore/yarr/Yarr.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef Yarr_h +#define Yarr_h + +#include "YarrInterpreter.h" +#include "YarrPattern.h" + +namespace JSC { namespace Yarr { + +#define YarrStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoBackReference 2 +#define YarrStackSpaceForBackTrackInfoAlternative 1 // One per alternative. +#define YarrStackSpaceForBackTrackInfoParentheticalAssertion 1 +#define YarrStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoParenthesesTerminal 1 +#define YarrStackSpaceForBackTrackInfoParentheses 2 + +static const unsigned quantifyInfinite = UINT_MAX; + +// The below limit restricts the number of "recursive" match calls in order to +// avoid spending exponential time on complex regular expressions. +static const unsigned matchLimit = 1000000; + +enum JSRegExpResult { + JSRegExpMatch = 1, + JSRegExpNoMatch = 0, + JSRegExpErrorNoMatch = -1, + JSRegExpErrorHitLimit = -2, + JSRegExpErrorNoMemory = -3, + JSRegExpErrorInternal = -4 +}; + +enum YarrCharSize { + Char8, + Char16 +}; + +PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*); +int interpret(BytecodePattern*, const UString& input, unsigned start, unsigned length, int* output); + +} } // namespace JSC::Yarr + +#endif // Yarr_h + diff --git a/Source/JavaScriptCore/yarr/YarrInterpreter.cpp b/Source/JavaScriptCore/yarr/YarrInterpreter.cpp new file mode 100644 index 000000000..b6bfb2c98 --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrInterpreter.cpp @@ -0,0 +1,1974 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrInterpreter.h" + +#include "UString.h" +#include "Yarr.h" +#include <wtf/BumpPointerAllocator.h> +#include <wtf/text/CString.h> + +#ifndef NDEBUG +#include <stdio.h> +#endif + +using namespace WTF; + +namespace JSC { namespace Yarr { + +class Interpreter { +public: + struct ParenthesesDisjunctionContext; + + struct BackTrackInfoPatternCharacter { + uintptr_t matchAmount; + }; + struct BackTrackInfoCharacterClass { + uintptr_t matchAmount; + }; + struct BackTrackInfoBackReference { + uintptr_t begin; // Not really needed for greedy quantifiers. + uintptr_t matchAmount; // Not really needed for fixed quantifiers. + }; + struct BackTrackInfoAlternative { + uintptr_t offset; + }; + struct BackTrackInfoParentheticalAssertion { + uintptr_t begin; + }; + struct BackTrackInfoParenthesesOnce { + uintptr_t begin; + }; + struct BackTrackInfoParenthesesTerminal { + uintptr_t begin; + }; + struct BackTrackInfoParentheses { + uintptr_t matchAmount; + ParenthesesDisjunctionContext* lastContext; + }; + + static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) + { + context->next = backTrack->lastContext; + backTrack->lastContext = context; + ++backTrack->matchAmount; + } + + static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) + { + ASSERT(backTrack->matchAmount); + ASSERT(backTrack->lastContext); + backTrack->lastContext = backTrack->lastContext->next; + --backTrack->matchAmount; + } + + struct DisjunctionContext + { + DisjunctionContext() + : term(0) + { + } + + void* operator new(size_t, void* where) + { + return where; + } + + int term; + unsigned matchBegin; + unsigned matchEnd; + uintptr_t frame[1]; + }; + + DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) + { + size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + allocatorPool = allocatorPool->ensureCapacity(size); + if (!allocatorPool) + CRASH(); + return new (allocatorPool->alloc(size)) DisjunctionContext(); + } + + void freeDisjunctionContext(DisjunctionContext* context) + { + allocatorPool = allocatorPool->dealloc(context); + } + + struct ParenthesesDisjunctionContext + { + ParenthesesDisjunctionContext(int* output, ByteTerm& term) + : next(0) + { + unsigned firstSubpatternId = term.atom.subpatternId; + unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; + + for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { + subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; + output[(firstSubpatternId << 1) + i] = -1; + } + + new (getDisjunctionContext(term)) DisjunctionContext(); + } + + void* operator new(size_t, void* where) + { + return where; + } + + void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) + { + for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) + output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; + } + + DisjunctionContext* getDisjunctionContext(ByteTerm& term) + { + return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); + } + + ParenthesesDisjunctionContext* next; + int subpatternBackup[1]; + }; + + ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) + { + size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + allocatorPool = allocatorPool->ensureCapacity(size); + if (!allocatorPool) + CRASH(); + return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); + } + + void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) + { + allocatorPool = allocatorPool->dealloc(context); + } + + // This class is a placeholder for future character iterator, current + // proposed name StringConstCharacterIterator. + class CharAccess { + public: + CharAccess(const UString& s) + { + if (s.is8Bit()) { + m_charSize = Char8; + m_ptr.ptr8 = s.characters8(); + } else { + m_charSize = Char16; + m_ptr.ptr16 = s.characters16(); + } + } + + CharAccess(const LChar* ptr) + : m_charSize(Char8) + { + m_ptr.ptr8 = ptr; + } + + CharAccess(const UChar* ptr) + : m_charSize(Char16) + { + m_ptr.ptr16 = ptr; + } + + ~CharAccess() + { + } + + inline UChar operator[](unsigned index) + { + if (m_charSize == Char8) + return m_ptr.ptr8[index]; + return m_ptr.ptr16[index]; + } + + private: + union { + const LChar* ptr8; + const UChar* ptr16; + } m_ptr; + YarrCharSize m_charSize; + }; + + class InputStream { + public: + InputStream(const UString& input, unsigned start, unsigned length) + : input(input) + , pos(start) + , length(length) + { + } + + void next() + { + ++pos; + } + + void rewind(unsigned amount) + { + ASSERT(pos >= amount); + pos -= amount; + } + + int read() + { + ASSERT(pos < length); + if (pos < length) + return input[pos]; + return -1; + } + + int readPair() + { + ASSERT(pos + 1 < length); + return input[pos] | input[pos + 1] << 16; + } + + int readChecked(int position) + { + ASSERT(position < 0); + ASSERT(static_cast<unsigned>(-position) <= pos); + unsigned p = pos + position; + ASSERT(p < length); + return input[p]; + } + + int reread(unsigned from) + { + ASSERT(from < length); + return input[from]; + } + + int prev() + { + ASSERT(!(pos > length)); + if (pos && length) + return input[pos - 1]; + return -1; + } + + unsigned getPos() + { + return pos; + } + + void setPos(unsigned p) + { + pos = p; + } + + bool atStart() + { + return pos == 0; + } + + bool atEnd() + { + return pos == length; + } + + unsigned end() + { + return length; + } + + bool checkInput(int count) + { + if ((pos + count) <= length) { + pos += count; + return true; + } + return false; + } + + void uncheckInput(int count) + { + pos -= count; + } + + bool atStart(int position) + { + return (pos + position) == 0; + } + + bool atEnd(int position) + { + return (pos + position) == length; + } + + bool isNotAvailableInput(int position) + { + return (pos + position) > length; + } + + private: + CharAccess input; + unsigned pos; + unsigned length; + }; + + bool testCharacterClass(CharacterClass* characterClass, int ch) + { + if (ch & 0xFF80) { + for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) + if (ch == characterClass->m_matchesUnicode[i]) + return true; + for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) + if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) + return true; + } else { + for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) + if (ch == characterClass->m_matches[i]) + return true; + for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) + if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) + return true; + } + + return false; + } + + bool checkCharacter(int testChar, int inputPosition) + { + return testChar == input.readChecked(inputPosition); + } + + bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) + { + int ch = input.readChecked(inputPosition); + return (loChar == ch) || (hiChar == ch); + } + + bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) + { + bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); + return invert ? !match : match; + } + + bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) + { + int matchSize = matchEnd - matchBegin; + + if (!input.checkInput(matchSize)) + return false; + + if (pattern->m_ignoreCase) { + for (int i = 0; i < matchSize; ++i) { + int ch = input.reread(matchBegin + i); + + int lo = Unicode::toLower(ch); + int hi = Unicode::toUpper(ch); + + if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) { + input.uncheckInput(matchSize); + return false; + } + } + } else { + for (int i = 0; i < matchSize; ++i) { + if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { + input.uncheckInput(matchSize); + return false; + } + } + } + + return true; + } + + bool matchAssertionBOL(ByteTerm& term) + { + return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); + } + + bool matchAssertionEOL(ByteTerm& term) + { + if (term.inputPosition) + return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); + + return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); + } + + bool matchAssertionWordBoundary(ByteTerm& term) + { + bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); + bool readIsWordchar; + if (term.inputPosition) + readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); + else + readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); + + bool wordBoundary = prevIsWordchar != readIsWordchar; + return term.invert() ? !wordBoundary : wordBoundary; + } + + bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) + { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) + { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeCharacterClass); + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) + return false; + } + return true; + } + + case QuantifierGreedy: { + unsigned matchAmount = 0; + while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + return true; + } + + case QuantifierNonGreedy: + backTrack->matchAmount = 0; + return true; + } + + ASSERT_NOT_REACHED(); + return false; + } + + bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeCharacterClass); + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool matchBackReference(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeBackReference); + BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); + + int matchBegin = output[(term.atom.subpatternId << 1)]; + int matchEnd = output[(term.atom.subpatternId << 1) + 1]; + + // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that. + // In this case the result of match is empty string like when it references to a parentheses with zero-width match. + // Eg.: /(a\1)/ + if (matchEnd == -1) + return true; + + ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); + + if (matchBegin == matchEnd) + return true; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + backTrack->begin = input.getPos(); + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { + input.setPos(backTrack->begin); + return false; + } + } + return true; + } + + case QuantifierGreedy: { + unsigned matchAmount = 0; + while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) + ++matchAmount; + backTrack->matchAmount = matchAmount; + return true; + } + + case QuantifierNonGreedy: + backTrack->begin = input.getPos(); + backTrack->matchAmount = 0; + return true; + } + + ASSERT_NOT_REACHED(); + return false; + } + + bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeBackReference); + BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); + + int matchBegin = output[(term.atom.subpatternId << 1)]; + int matchEnd = output[(term.atom.subpatternId << 1) + 1]; + ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); + + if (matchBegin == matchEnd) + return false; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + // for quantityCount == 1, could rewind. + input.setPos(backTrack->begin); + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.rewind(matchEnd - matchBegin); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { + ++backTrack->matchAmount; + return true; + } + input.setPos(backTrack->begin); + break; + } + + return false; + } + + void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) + { + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; + output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; + } + } + void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) + { + unsigned firstSubpatternId = term.atom.subpatternId; + unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; + context->restoreOutput(output, firstSubpatternId, count); + } + JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) + { + while (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + + JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) + return JSRegExpMatch; + + resetMatches(term, context); + popParenthesesDisjunctionContext(backTrack); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + return JSRegExpNoMatch; + } + + bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierGreedy: { + // set this speculatively; if we get to the parens end this will be true. + backTrack->begin = input.getPos(); + break; + } + case QuantifierNonGreedy: { + backTrack->begin = notFound; + context->term += term.atom.parenthesesWidth; + return true; + } + case QuantifierFixedCount: + break; + } + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = input.getPos() + term.inputPosition; + } + + return true; + } + + bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); + ASSERT(term.atom.quantityCount == 1); + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; + } + + if (term.atom.quantityType == QuantifierFixedCount) + return true; + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + return backTrack->begin != input.getPos(); + } + + bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = -1; + output[(subpatternId << 1) + 1] = -1; + } + + switch (term.atom.quantityType) { + case QuantifierGreedy: + // if we backtrack to this point, there is another chance - try matching nothing. + ASSERT(backTrack->begin != notFound); + backTrack->begin = notFound; + context->term += term.atom.parenthesesWidth; + return true; + case QuantifierNonGreedy: + ASSERT(backTrack->begin != notFound); + case QuantifierFixedCount: + break; + } + + return false; + } + + bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierGreedy: + if (backTrack->begin == notFound) { + context->term -= term.atom.parenthesesWidth; + return false; + } + case QuantifierNonGreedy: + if (backTrack->begin == notFound) { + backTrack->begin = input.getPos(); + if (term.capture()) { + // Technically this access to inputPosition should be accessing the begin term's + // inputPosition, but for repeats other than fixed these values should be + // the same anyway! (We don't pre-check for greedy or non-greedy matches.) + ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); + unsigned subpatternId = term.atom.subpatternId; + output[subpatternId << 1] = input.getPos() + term.inputPosition; + } + context->term -= term.atom.parenthesesWidth; + return true; + } + case QuantifierFixedCount: + break; + } + + return false; + } + + bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + ASSERT(term.atom.quantityType == QuantifierGreedy); + ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(!term.capture()); + + BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); + backTrack->begin = input.getPos(); + return true; + } + + bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd); + + BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); + // Empty match is a failed match. + if (backTrack->begin == input.getPos()) + return false; + + // Successful match! Okay, what's next? - loop around and try to match moar! + context->term -= (term.atom.parenthesesWidth + 1); + return true; + } + + bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + ASSERT(term.atom.quantityType == QuantifierGreedy); + ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(!term.capture()); + + // If we backtrack to this point, we have failed to match this iteration of the parens. + // Since this is greedy / zero minimum a failed is also accepted as a match! + context->term += term.atom.parenthesesWidth; + return true; + } + + bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) + { + // 'Terminal' parentheses are at the end of the regex, and as such a match past end + // should always be returned as a successful match - we should never backtrack to here. + ASSERT_NOT_REACHED(); + return false; + } + + bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); + + backTrack->begin = input.getPos(); + return true; + } + + bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); + + input.setPos(backTrack->begin); + + // We've reached the end of the parens; if they are inverted, this is failure. + if (term.invert()) { + context->term -= term.atom.parenthesesWidth; + return false; + } + + return true; + } + + bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); + ASSERT(term.atom.quantityCount == 1); + + // We've failed to match parens; if they are inverted, this is win! + if (term.invert()) { + context->term += term.atom.parenthesesWidth; + return true; + } + + return false; + } + + bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); + + input.setPos(backTrack->begin); + + context->term -= term.atom.parenthesesWidth; + return false; + } + + JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); + + BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); + ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; + + backTrack->matchAmount = 0; + backTrack->lastContext = 0; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + // While we haven't yet reached our fixed limit, + while (backTrack->matchAmount < term.atom.quantityCount) { + // Try to do a match, and it it succeeds, add it to the list. + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + // The match failed; try to find an alternate point to carry on from. + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; + } + } + + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + return JSRegExpMatch; + } + + case QuantifierGreedy: { + while (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + + break; + } + } + + if (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return JSRegExpMatch; + } + + case QuantifierNonGreedy: + return JSRegExpMatch; + } + + ASSERT_NOT_REACHED(); + return JSRegExpErrorNoMatch; + } + + // Rules for backtracking differ depending on whether this is greedy or non-greedy. + // + // Greedy matches never should try just adding more - you should already have done + // the 'more' cases. Always backtrack, at least a leetle bit. However cases where + // you backtrack an item off the list needs checking, since we'll never have matched + // the one less case. Tracking forwards, still add as much as possible. + // + // Non-greedy, we've already done the one less case, so don't match on popping. + // We haven't done the one more case, so always try to add that. + // + JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); + + BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); + ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + + ParenthesesDisjunctionContext* context = 0; + JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); + + if (result != JSRegExpMatch) + return result; + + // While we haven't yet reached our fixed limit, + while (backTrack->matchAmount < term.atom.quantityCount) { + // Try to do a match, and it it succeeds, add it to the list. + context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + + if (result == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + // The match failed; try to find an alternate point to carry on from. + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; + } + } + + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + context = backTrack->lastContext; + recordParenthesesMatch(term, context); + return JSRegExpMatch; + } + + case QuantifierGreedy: { + if (!backTrack->matchAmount) + return JSRegExpNoMatch; + + ParenthesesDisjunctionContext* context = backTrack->lastContext; + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) { + while (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (parenthesesResult == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (parenthesesResult != JSRegExpNoMatch) + return parenthesesResult; + + break; + } + } + } else { + resetMatches(term, context); + popParenthesesDisjunctionContext(backTrack); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + if (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return JSRegExpMatch; + } + + case QuantifierNonGreedy: { + // If we've not reached the limit, try to add one more match. + if (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) { + appendParenthesesDisjunctionContext(backTrack, context); + recordParenthesesMatch(term, context); + return JSRegExpMatch; + } + + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + // Nope - okay backtrack looking for an alternative. + while (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) { + // successful backtrack! we're back in the game! + if (backTrack->matchAmount) { + context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return JSRegExpMatch; + } + + // pop a match off the stack + resetMatches(term, context); + popParenthesesDisjunctionContext(backTrack); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + return JSRegExpNoMatch; + } + } + + ASSERT_NOT_REACHED(); + return JSRegExpErrorNoMatch; + } + + bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context) + { + UNUSED_PARAM(term); + unsigned matchBegin = context->matchBegin; + + if (matchBegin) { + for (matchBegin--; true; matchBegin--) { + if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) { + ++matchBegin; + break; + } + + if (!matchBegin) + break; + } + } + + unsigned matchEnd = input.getPos(); + + for (; (matchEnd != input.end()) + && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { } + + if (((matchBegin && term.anchors.m_bol) + || ((matchEnd != input.end()) && term.anchors.m_eol)) + && !pattern->m_multiline) + return false; + + context->matchBegin = matchBegin; + context->matchEnd = matchEnd; + return true; + } + +#define MATCH_NEXT() { ++context->term; goto matchAgain; } +#define BACKTRACK() { --context->term; goto backtrack; } +#define currentTerm() (disjunction->terms[context->term]) + JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) + { + if (!--remainingMatchCount) + return JSRegExpErrorHitLimit; + + if (btrack) + BACKTRACK(); + + context->matchBegin = input.getPos(); + context->term = 0; + + matchAgain: + ASSERT(context->term < static_cast<int>(disjunction->terms.size())); + + switch (currentTerm().type) { + case ByteTerm::TypeSubpatternBegin: + MATCH_NEXT(); + case ByteTerm::TypeSubpatternEnd: + context->matchEnd = input.getPos(); + return JSRegExpMatch; + + case ByteTerm::TypeBodyAlternativeBegin: + MATCH_NEXT(); + case ByteTerm::TypeBodyAlternativeDisjunction: + case ByteTerm::TypeBodyAlternativeEnd: + context->matchEnd = input.getPos(); + return JSRegExpMatch; + + case ByteTerm::TypeAlternativeBegin: + MATCH_NEXT(); + case ByteTerm::TypeAlternativeDisjunction: + case ByteTerm::TypeAlternativeEnd: { + int offset = currentTerm().alternative.end; + BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); + backTrack->offset = offset; + context->term += offset; + MATCH_NEXT(); + } + + case ByteTerm::TypeAssertionBOL: + if (matchAssertionBOL(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeAssertionEOL: + if (matchAssertionEOL(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeAssertionWordBoundary: + if (matchAssertionWordBoundary(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypePatternCharacterOnce: + case ByteTerm::TypePatternCharacterFixed: { + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) + BACKTRACK(); + } + MATCH_NEXT(); + } + case ByteTerm::TypePatternCharacterGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + unsigned matchAmount = 0; + while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + MATCH_NEXT(); + } + case ByteTerm::TypePatternCharacterNonGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + backTrack->matchAmount = 0; + MATCH_NEXT(); + } + + case ByteTerm::TypePatternCasedCharacterOnce: + case ByteTerm::TypePatternCasedCharacterFixed: { + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) + BACKTRACK(); + } + MATCH_NEXT(); + } + case ByteTerm::TypePatternCasedCharacterGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + unsigned matchAmount = 0; + while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + MATCH_NEXT(); + } + case ByteTerm::TypePatternCasedCharacterNonGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + backTrack->matchAmount = 0; + MATCH_NEXT(); + } + + case ByteTerm::TypeCharacterClass: + if (matchCharacterClass(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeBackReference: + if (matchBackReference(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpattern: { + JSRegExpResult result = matchParentheses(currentTerm(), context); + + if (result == JSRegExpMatch) { + MATCH_NEXT(); + } else if (result != JSRegExpNoMatch) + return result; + + BACKTRACK(); + } + case ByteTerm::TypeParenthesesSubpatternOnceBegin: + if (matchParenthesesOnceBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternOnceEnd: + if (matchParenthesesOnceEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + if (matchParenthesesTerminalBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + if (matchParenthesesTerminalEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionBegin: + if (matchParentheticalAssertionBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionEnd: + if (matchParentheticalAssertionEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypeCheckInput: + if (input.checkInput(currentTerm().checkInputCount)) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypeUncheckInput: + input.uncheckInput(currentTerm().checkInputCount); + MATCH_NEXT(); + + case ByteTerm::TypeDotStarEnclosure: + if (matchDotStarEnclosure(currentTerm(), context)) + return JSRegExpMatch; + BACKTRACK(); + } + + // We should never fall-through to here. + ASSERT_NOT_REACHED(); + + backtrack: + ASSERT(context->term < static_cast<int>(disjunction->terms.size())); + + switch (currentTerm().type) { + case ByteTerm::TypeSubpatternBegin: + return JSRegExpNoMatch; + case ByteTerm::TypeSubpatternEnd: + ASSERT_NOT_REACHED(); + + case ByteTerm::TypeBodyAlternativeBegin: + case ByteTerm::TypeBodyAlternativeDisjunction: { + int offset = currentTerm().alternative.next; + context->term += offset; + if (offset > 0) + MATCH_NEXT(); + + if (input.atEnd()) + return JSRegExpNoMatch; + + input.next(); + + context->matchBegin = input.getPos(); + + if (currentTerm().alternative.onceThrough) + context->term += currentTerm().alternative.next; + + MATCH_NEXT(); + } + case ByteTerm::TypeBodyAlternativeEnd: + ASSERT_NOT_REACHED(); + + case ByteTerm::TypeAlternativeBegin: + case ByteTerm::TypeAlternativeDisjunction: { + int offset = currentTerm().alternative.next; + context->term += offset; + if (offset > 0) + MATCH_NEXT(); + BACKTRACK(); + } + case ByteTerm::TypeAlternativeEnd: { + // We should never backtrack back into an alternative of the main body of the regex. + BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); + unsigned offset = backTrack->offset; + context->term -= offset; + BACKTRACK(); + } + + case ByteTerm::TypeAssertionBOL: + case ByteTerm::TypeAssertionEOL: + case ByteTerm::TypeAssertionWordBoundary: + BACKTRACK(); + + case ByteTerm::TypePatternCharacterOnce: + case ByteTerm::TypePatternCharacterFixed: + case ByteTerm::TypePatternCharacterGreedy: + case ByteTerm::TypePatternCharacterNonGreedy: + if (backtrackPatternCharacter(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypePatternCasedCharacterOnce: + case ByteTerm::TypePatternCasedCharacterFixed: + case ByteTerm::TypePatternCasedCharacterGreedy: + case ByteTerm::TypePatternCasedCharacterNonGreedy: + if (backtrackPatternCasedCharacter(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeCharacterClass: + if (backtrackCharacterClass(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeBackReference: + if (backtrackBackReference(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpattern: { + JSRegExpResult result = backtrackParentheses(currentTerm(), context); + + if (result == JSRegExpMatch) { + MATCH_NEXT(); + } else if (result != JSRegExpNoMatch) + return result; + + BACKTRACK(); + } + case ByteTerm::TypeParenthesesSubpatternOnceBegin: + if (backtrackParenthesesOnceBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternOnceEnd: + if (backtrackParenthesesOnceEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + if (backtrackParenthesesTerminalBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + if (backtrackParenthesesTerminalEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionBegin: + if (backtrackParentheticalAssertionBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionEnd: + if (backtrackParentheticalAssertionEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypeCheckInput: + input.uncheckInput(currentTerm().checkInputCount); + BACKTRACK(); + + case ByteTerm::TypeUncheckInput: + input.checkInput(currentTerm().checkInputCount); + BACKTRACK(); + + case ByteTerm::TypeDotStarEnclosure: + ASSERT_NOT_REACHED(); + } + + ASSERT_NOT_REACHED(); + return JSRegExpErrorNoMatch; + } + + JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) + { + JSRegExpResult result = matchDisjunction(disjunction, context, btrack); + + if (result == JSRegExpMatch) { + while (context->matchBegin == context->matchEnd) { + result = matchDisjunction(disjunction, context, true); + if (result != JSRegExpMatch) + return result; + } + return JSRegExpMatch; + } + + return result; + } + + int interpret() + { + allocatorPool = pattern->m_allocator->startAllocator(); + if (!allocatorPool) + CRASH(); + + for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) + output[i] = -1; + + DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); + + JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false); + if (result == JSRegExpMatch) { + output[0] = context->matchBegin; + output[1] = context->matchEnd; + } + + freeDisjunctionContext(context); + + pattern->m_allocator->stopAllocator(); + + // RegExp.cpp currently expects all error to be converted to -1. + ASSERT((result == JSRegExpMatch) == (output[0] != -1)); + return output[0]; + } + + Interpreter(BytecodePattern* pattern, int* output, const UString input, unsigned start, unsigned length) + : pattern(pattern) + , output(output) + , input(input, start, length) + , allocatorPool(0) + , remainingMatchCount(matchLimit) + { + } + +private: + BytecodePattern* pattern; + int* output; + InputStream input; + BumpPointerPool* allocatorPool; + unsigned remainingMatchCount; +}; + + + +class ByteCompiler { + struct ParenthesesStackEntry { + unsigned beginTerm; + unsigned savedAlternativeIndex; + ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) + : beginTerm(beginTerm) + , savedAlternativeIndex(savedAlternativeIndex) + { + } + }; + +public: + ByteCompiler(YarrPattern& pattern) + : m_pattern(pattern) + { + m_currentAlternativeIndex = 0; + } + + PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator) + { + regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); + emitDisjunction(m_pattern.m_body); + regexEnd(); + + return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator)); + } + + void checkInput(unsigned count) + { + m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); + } + + void uncheckInput(unsigned count) + { + m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count)); + } + + void assertionBOL(int inputPosition) + { + m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); + } + + void assertionEOL(int inputPosition) + { + m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); + } + + void assertionWordBoundary(bool invert, int inputPosition) + { + m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); + } + + void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + { + if (m_pattern.m_ignoreCase) { + UChar lo = Unicode::toLower(ch); + UChar hi = Unicode::toUpper(ch); + + if (lo != hi) { + m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); + return; + } + } + + m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); + } + + void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + { + m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); + + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + } + + void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + { + ASSERT(subpatternId); + + m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); + + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + } + + void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin, + // then fix this up at the end! - simplifying this should make it much clearer. + // https://bugs.webkit.org/show_bug.cgi?id=50136 + + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin); + + bool invert = m_bodyDisjunction->terms[beginTerm].invert(); + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + + void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored) + { + m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored)); + } + + unsigned popParenthesesStack() + { + ASSERT(m_parenthesesStack.size()); + int stackEnd = m_parenthesesStack.size() - 1; + unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; + m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; + m_parenthesesStack.shrink(stackEnd); + + ASSERT(beginTerm < m_bodyDisjunction->terms.size()); + ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); + + return beginTerm; + } + +#ifndef NDEBUG + void dumpDisjunction(ByteDisjunction* disjunction) + { + printf("ByteDisjunction(%p):\n\t", disjunction); + for (unsigned i = 0; i < disjunction->terms.size(); ++i) + printf("{ %d } ", disjunction->terms[i].type); + printf("\n"); + } +#endif + + void closeAlternative(int beginTerm) + { + int origBeginTerm = beginTerm; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); + int endIndex = m_bodyDisjunction->terms.size(); + + unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; + + if (!m_bodyDisjunction->terms[beginTerm].alternative.next) + m_bodyDisjunction->terms.remove(beginTerm); + else { + while (m_bodyDisjunction->terms[beginTerm].alternative.next) { + beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); + m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; + + m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); + m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; + } + } + + void closeBodyAlternative() + { + int beginTerm = 0; + int origBeginTerm = 0; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); + int endIndex = m_bodyDisjunction->terms.size(); + + unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; + + while (m_bodyDisjunction->terms[beginTerm].alternative.next) { + beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); + m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; + + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); + m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; + } + + void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + + ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; + + bool capture = parenthesesBegin.capture(); + unsigned subpatternId = parenthesesBegin.atom.subpatternId; + + unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; + ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); + + parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); + for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) + parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); + parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); + + m_bodyDisjunction->terms.shrink(beginTerm); + + m_allParenthesesInfo.append(parenthesesDisjunction); + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition)); + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + + bool capture = m_bodyDisjunction->terms[beginTerm].capture(); + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + + void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + + bool capture = m_bodyDisjunction->terms[beginTerm].capture(); + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + + void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) + { + m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); + m_bodyDisjunction->terms[0].frameLocation = 0; + m_currentAlternativeIndex = 0; + } + + void regexEnd() + { + closeBodyAlternative(); + } + + void alternativeBodyDisjunction(bool onceThrough) + { + int newAlternativeIndex = m_bodyDisjunction->terms.size(); + m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough)); + + m_currentAlternativeIndex = newAlternativeIndex; + } + + void alternativeDisjunction() + { + int newAlternativeIndex = m_bodyDisjunction->terms.size(); + m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); + + m_currentAlternativeIndex = newAlternativeIndex; + } + + void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) + { + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; + + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + + if (alt) { + if (disjunction == m_pattern.m_body) + alternativeBodyDisjunction(alternative->onceThrough()); + else + alternativeDisjunction(); + } + + unsigned minimumSize = alternative->m_minimumSize; + ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); + unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; + + if (countToCheck) { + checkInput(countToCheck); + currentCountAlreadyChecked += countToCheck; + } + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm& term = alternative->m_terms[i]; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + assertionBOL(term.inputPosition - currentCountAlreadyChecked); + break; + + case PatternTerm::TypeAssertionEOL: + assertionEOL(term.inputPosition - currentCountAlreadyChecked); + break; + + case PatternTerm::TypeAssertionWordBoundary: + assertionWordBoundary(term.invert(), term.inputPosition - currentCountAlreadyChecked); + break; + + case PatternTerm::TypePatternCharacter: + atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeCharacterClass: + atomCharacterClass(term.characterClass, term.invert(), term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeBackReference: + atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypeParenthesesSubpattern: { + unsigned disjunctionAlreadyCheckedCount = 0; + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + unsigned alternativeFrameLocation = term.frameLocation; + // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. + if (term.quantityType == QuantifierFixedCount) + disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; + else + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); + atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + } else if (term.parentheses.isTerminal) { + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); + atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + } else { + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); + atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); + } + break; + } + + case PatternTerm::TypeParentheticalAssertion: { + unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion; + + ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition)); + unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition); + unsigned uncheckAmount = 0; + if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) { + uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize; + uncheckInput(uncheckAmount); + currentCountAlreadyChecked -= uncheckAmount; + } + + atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount); + atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType); + if (uncheckAmount) { + checkInput(uncheckAmount); + currentCountAlreadyChecked += uncheckAmount; + } + break; + } + + case PatternTerm::TypeDotStarEnclosure: + assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor); + break; + } + } + } + } + +private: + YarrPattern& m_pattern; + OwnPtr<ByteDisjunction> m_bodyDisjunction; + unsigned m_currentAlternativeIndex; + Vector<ParenthesesStackEntry> m_parenthesesStack; + Vector<ByteDisjunction*> m_allParenthesesInfo; +}; + +PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator) +{ + return ByteCompiler(pattern).compile(allocator); +} + +int interpret(BytecodePattern* bytecode, const UString& input, unsigned start, unsigned length, int* output) +{ + return Interpreter(bytecode, output, input, start, length).interpret(); +} + +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses); + + +} } diff --git a/Source/JavaScriptCore/yarr/YarrInterpreter.h b/Source/JavaScriptCore/yarr/YarrInterpreter.h new file mode 100644 index 000000000..eb5fdc6dc --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrInterpreter.h @@ -0,0 +1,380 @@ +/* + * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrInterpreter_h +#define YarrInterpreter_h + +#include "YarrPattern.h" +#include <wtf/PassOwnPtr.h> +#include <wtf/unicode/Unicode.h> + +namespace WTF { +class BumpPointerAllocator; +} +using WTF::BumpPointerAllocator; + +namespace JSC { namespace Yarr { + +class ByteDisjunction; + +struct ByteTerm { + enum Type { + TypeBodyAlternativeBegin, + TypeBodyAlternativeDisjunction, + TypeBodyAlternativeEnd, + TypeAlternativeBegin, + TypeAlternativeDisjunction, + TypeAlternativeEnd, + TypeSubpatternBegin, + TypeSubpatternEnd, + TypeAssertionBOL, + TypeAssertionEOL, + TypeAssertionWordBoundary, + TypePatternCharacterOnce, + TypePatternCharacterFixed, + TypePatternCharacterGreedy, + TypePatternCharacterNonGreedy, + TypePatternCasedCharacterOnce, + TypePatternCasedCharacterFixed, + TypePatternCasedCharacterGreedy, + TypePatternCasedCharacterNonGreedy, + TypeCharacterClass, + TypeBackReference, + TypeParenthesesSubpattern, + TypeParenthesesSubpatternOnceBegin, + TypeParenthesesSubpatternOnceEnd, + TypeParenthesesSubpatternTerminalBegin, + TypeParenthesesSubpatternTerminalEnd, + TypeParentheticalAssertionBegin, + TypeParentheticalAssertionEnd, + TypeCheckInput, + TypeUncheckInput, + TypeDotStarEnclosure, + } type; + union { + struct { + union { + UChar patternCharacter; + struct { + UChar lo; + UChar hi; + } casedCharacter; + CharacterClass* characterClass; + unsigned subpatternId; + }; + union { + ByteDisjunction* parenthesesDisjunction; + unsigned parenthesesWidth; + }; + QuantifierType quantityType; + unsigned quantityCount; + } atom; + struct { + int next; + int end; + bool onceThrough; + } alternative; + struct { + bool m_bol : 1; + bool m_eol : 1; + } anchors; + unsigned checkInputCount; + }; + unsigned frameLocation; + bool m_capture : 1; + bool m_invert : 1; + int inputPosition; + + ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + : frameLocation(frameLocation) + , m_capture(false) + , m_invert(false) + { + switch (quantityType) { + case QuantifierFixedCount: + type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; + break; + case QuantifierGreedy: + type = ByteTerm::TypePatternCharacterGreedy; + break; + case QuantifierNonGreedy: + type = ByteTerm::TypePatternCharacterNonGreedy; + break; + } + + atom.patternCharacter = ch; + atom.quantityType = quantityType; + atom.quantityCount = quantityCount.unsafeGet(); + inputPosition = inputPos; + } + + ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + : frameLocation(frameLocation) + , m_capture(false) + , m_invert(false) + { + switch (quantityType) { + case QuantifierFixedCount: + type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; + break; + case QuantifierGreedy: + type = ByteTerm::TypePatternCasedCharacterGreedy; + break; + case QuantifierNonGreedy: + type = ByteTerm::TypePatternCasedCharacterNonGreedy; + break; + } + + atom.casedCharacter.lo = lo; + atom.casedCharacter.hi = hi; + atom.quantityType = quantityType; + atom.quantityCount = quantityCount.unsafeGet(); + inputPosition = inputPos; + } + + ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) + : type(ByteTerm::TypeCharacterClass) + , m_capture(false) + , m_invert(invert) + { + atom.characterClass = characterClass; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos) + : type(type) + , m_capture(capture) + , m_invert(false) + { + atom.subpatternId = subpatternId; + atom.parenthesesDisjunction = parenthesesInfo; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + ByteTerm(Type type, bool invert = false) + : type(type) + , m_capture(false) + , m_invert(invert) + { + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + } + + ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos) + : type(type) + , m_capture(capture) + , m_invert(invert) + { + atom.subpatternId = subpatternId; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + static ByteTerm BOL(int inputPos) + { + ByteTerm term(TypeAssertionBOL); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm CheckInput(Checked<unsigned> count) + { + ByteTerm term(TypeCheckInput); + term.checkInputCount = count.unsafeGet(); + return term; + } + + static ByteTerm UncheckInput(Checked<unsigned> count) + { + ByteTerm term(TypeUncheckInput); + term.checkInputCount = count.unsafeGet(); + return term; + } + + static ByteTerm EOL(int inputPos) + { + ByteTerm term(TypeAssertionEOL); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm WordBoundary(bool invert, int inputPos) + { + ByteTerm term(TypeAssertionWordBoundary, invert); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm BackReference(unsigned subpatternId, int inputPos) + { + return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); + } + + static ByteTerm BodyAlternativeBegin(bool onceThrough) + { + ByteTerm term(TypeBodyAlternativeBegin); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = onceThrough; + return term; + } + + static ByteTerm BodyAlternativeDisjunction(bool onceThrough) + { + ByteTerm term(TypeBodyAlternativeDisjunction); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = onceThrough; + return term; + } + + static ByteTerm BodyAlternativeEnd() + { + ByteTerm term(TypeBodyAlternativeEnd); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm AlternativeBegin() + { + ByteTerm term(TypeAlternativeBegin); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm AlternativeDisjunction() + { + ByteTerm term(TypeAlternativeDisjunction); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm AlternativeEnd() + { + ByteTerm term(TypeAlternativeEnd); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm SubpatternBegin() + { + return ByteTerm(TypeSubpatternBegin); + } + + static ByteTerm SubpatternEnd() + { + return ByteTerm(TypeSubpatternEnd); + } + + static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor) + { + ByteTerm term(TypeDotStarEnclosure); + term.anchors.m_bol = bolAnchor; + term.anchors.m_eol = eolAnchor; + return term; + } + + bool invert() + { + return m_invert; + } + + bool capture() + { + return m_capture; + } +}; + +class ByteDisjunction { + WTF_MAKE_FAST_ALLOCATED; +public: + ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) + : m_numSubpatterns(numSubpatterns) + , m_frameSize(frameSize) + { + } + + Vector<ByteTerm> terms; + unsigned m_numSubpatterns; + unsigned m_frameSize; +}; + +struct BytecodePattern { + WTF_MAKE_FAST_ALLOCATED; +public: + BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> allParenthesesInfo, YarrPattern& pattern, BumpPointerAllocator* allocator) + : m_body(body) + , m_ignoreCase(pattern.m_ignoreCase) + , m_multiline(pattern.m_multiline) + , m_allocator(allocator) + { + newlineCharacterClass = pattern.newlineCharacterClass(); + wordcharCharacterClass = pattern.wordcharCharacterClass(); + + m_allParenthesesInfo.append(allParenthesesInfo); + m_userCharacterClasses.append(pattern.m_userCharacterClasses); + // 'Steal' the YarrPattern's CharacterClasses! We clear its + // array, so that it won't delete them on destruction. We'll + // take responsibility for that. + pattern.m_userCharacterClasses.clear(); + } + + ~BytecodePattern() + { + deleteAllValues(m_allParenthesesInfo); + deleteAllValues(m_userCharacterClasses); + } + + OwnPtr<ByteDisjunction> m_body; + bool m_ignoreCase; + bool m_multiline; + // Each BytecodePattern is associated with a RegExp, each RegExp is associated + // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regExpAllocator. + BumpPointerAllocator* m_allocator; + + CharacterClass* newlineCharacterClass; + CharacterClass* wordcharCharacterClass; + +private: + Vector<ByteDisjunction*> m_allParenthesesInfo; + Vector<CharacterClass*> m_userCharacterClasses; +}; + +} } // namespace JSC::Yarr + +#endif // YarrInterpreter_h diff --git a/Source/JavaScriptCore/yarr/YarrJIT.cpp b/Source/JavaScriptCore/yarr/YarrJIT.cpp new file mode 100644 index 000000000..a3f467dc1 --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrJIT.cpp @@ -0,0 +1,2546 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrJIT.h" + +#include "ASCIICType.h" +#include "LinkBuffer.h" +#include "Yarr.h" + +#if ENABLE(YARR_JIT) + +using namespace WTF; + +namespace JSC { namespace Yarr { + +class YarrGenerator : private MacroAssembler { + friend void jitCompile(JSGlobalData*, YarrCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline); + +#if CPU(ARM) + static const RegisterID input = ARMRegisters::r0; + static const RegisterID index = ARMRegisters::r1; + static const RegisterID length = ARMRegisters::r2; + static const RegisterID output = ARMRegisters::r4; + + static const RegisterID regT0 = ARMRegisters::r5; + static const RegisterID regT1 = ARMRegisters::r6; + + static const RegisterID returnRegister = ARMRegisters::r0; +#elif CPU(MIPS) + static const RegisterID input = MIPSRegisters::a0; + static const RegisterID index = MIPSRegisters::a1; + static const RegisterID length = MIPSRegisters::a2; + static const RegisterID output = MIPSRegisters::a3; + + static const RegisterID regT0 = MIPSRegisters::t4; + static const RegisterID regT1 = MIPSRegisters::t5; + + static const RegisterID returnRegister = MIPSRegisters::v0; +#elif CPU(SH4) + static const RegisterID input = SH4Registers::r4; + static const RegisterID index = SH4Registers::r5; + static const RegisterID length = SH4Registers::r6; + static const RegisterID output = SH4Registers::r7; + + static const RegisterID regT0 = SH4Registers::r0; + static const RegisterID regT1 = SH4Registers::r1; + + static const RegisterID returnRegister = SH4Registers::r0; +#elif CPU(X86) + static const RegisterID input = X86Registers::eax; + static const RegisterID index = X86Registers::edx; + static const RegisterID length = X86Registers::ecx; + static const RegisterID output = X86Registers::edi; + + static const RegisterID regT0 = X86Registers::ebx; + static const RegisterID regT1 = X86Registers::esi; + + static const RegisterID returnRegister = X86Registers::eax; +#elif CPU(X86_64) + static const RegisterID input = X86Registers::edi; + static const RegisterID index = X86Registers::esi; + static const RegisterID length = X86Registers::edx; + static const RegisterID output = X86Registers::ecx; + + static const RegisterID regT0 = X86Registers::eax; + static const RegisterID regT1 = X86Registers::ebx; + + static const RegisterID returnRegister = X86Registers::eax; +#endif + + void optimizeAlternative(PatternAlternative* alternative) + { + if (!alternative->m_terms.size()) + return; + + for (unsigned i = 0; i < alternative->m_terms.size() - 1; ++i) { + PatternTerm& term = alternative->m_terms[i]; + PatternTerm& nextTerm = alternative->m_terms[i + 1]; + + if ((term.type == PatternTerm::TypeCharacterClass) + && (term.quantityType == QuantifierFixedCount) + && (nextTerm.type == PatternTerm::TypePatternCharacter) + && (nextTerm.quantityType == QuantifierFixedCount)) { + PatternTerm termCopy = term; + alternative->m_terms[i] = nextTerm; + alternative->m_terms[i + 1] = termCopy; + } + } + } + + void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount) + { + do { + // pick which range we're going to generate + int which = count >> 1; + char lo = ranges[which].begin; + char hi = ranges[which].end; + + // check if there are any ranges or matches below lo. If not, just jl to failure - + // if there is anything else to check, check that first, if it falls through jmp to failure. + if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { + Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); + + // generate code for all ranges before this one + if (which) + matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); + + while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { + matchDest.append(branch32(Equal, character, Imm32((unsigned short)matches[*matchIndex]))); + ++*matchIndex; + } + failures.append(jump()); + + loOrAbove.link(this); + } else if (which) { + Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); + + matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); + failures.append(jump()); + + loOrAbove.link(this); + } else + failures.append(branch32(LessThan, character, Imm32((unsigned short)lo))); + + while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi)) + ++*matchIndex; + + matchDest.append(branch32(LessThanOrEqual, character, Imm32((unsigned short)hi))); + // fall through to here, the value is above hi. + + // shuffle along & loop around if there are any more matches to handle. + unsigned next = which + 1; + ranges += next; + count -= next; + } while (count); + } + + void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass) + { + if (charClass->m_table) { + ExtendedAddress tableEntry(character, reinterpret_cast<intptr_t>(charClass->m_table->m_table)); + matchDest.append(branchTest8(charClass->m_table->m_inverted ? Zero : NonZero, tableEntry)); + return; + } + Jump unicodeFail; + if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) { + Jump isAscii = branch32(LessThanOrEqual, character, TrustedImm32(0x7f)); + + if (charClass->m_matchesUnicode.size()) { + for (unsigned i = 0; i < charClass->m_matchesUnicode.size(); ++i) { + UChar ch = charClass->m_matchesUnicode[i]; + matchDest.append(branch32(Equal, character, Imm32(ch))); + } + } + + if (charClass->m_rangesUnicode.size()) { + for (unsigned i = 0; i < charClass->m_rangesUnicode.size(); ++i) { + UChar lo = charClass->m_rangesUnicode[i].begin; + UChar hi = charClass->m_rangesUnicode[i].end; + + Jump below = branch32(LessThan, character, Imm32(lo)); + matchDest.append(branch32(LessThanOrEqual, character, Imm32(hi))); + below.link(this); + } + } + + unicodeFail = jump(); + isAscii.link(this); + } + + if (charClass->m_ranges.size()) { + unsigned matchIndex = 0; + JumpList failures; + matchCharacterClassRange(character, failures, matchDest, charClass->m_ranges.begin(), charClass->m_ranges.size(), &matchIndex, charClass->m_matches.begin(), charClass->m_matches.size()); + while (matchIndex < charClass->m_matches.size()) + matchDest.append(branch32(Equal, character, Imm32((unsigned short)charClass->m_matches[matchIndex++]))); + + failures.link(this); + } else if (charClass->m_matches.size()) { + // optimization: gather 'a','A' etc back together, can mask & test once. + Vector<char> matchesAZaz; + + for (unsigned i = 0; i < charClass->m_matches.size(); ++i) { + char ch = charClass->m_matches[i]; + if (m_pattern.m_ignoreCase) { + if (isASCIILower(ch)) { + matchesAZaz.append(ch); + continue; + } + if (isASCIIUpper(ch)) + continue; + } + matchDest.append(branch32(Equal, character, Imm32((unsigned short)ch))); + } + + if (unsigned countAZaz = matchesAZaz.size()) { + or32(TrustedImm32(32), character); + for (unsigned i = 0; i < countAZaz; ++i) + matchDest.append(branch32(Equal, character, TrustedImm32(matchesAZaz[i]))); + } + } + + if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) + unicodeFail.link(this); + } + + // Jumps if input not available; will have (incorrectly) incremented already! + Jump jumpIfNoAvailableInput(unsigned countToCheck = 0) + { + if (countToCheck) + add32(Imm32(countToCheck), index); + return branch32(Above, index, length); + } + + Jump jumpIfAvailableInput(unsigned countToCheck) + { + add32(Imm32(countToCheck), index); + return branch32(BelowOrEqual, index, length); + } + + Jump checkInput() + { + return branch32(BelowOrEqual, index, length); + } + + Jump atEndOfInput() + { + return branch32(Equal, index, length); + } + + Jump notAtEndOfInput() + { + return branch32(NotEqual, index, length); + } + + Jump jumpIfCharNotEquals(UChar ch, int inputPosition, RegisterID character) + { + readCharacter(inputPosition, character); + + // For case-insesitive compares, non-ascii characters that have different + // upper & lower case representations are converted to a character class. + ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + or32(TrustedImm32(32), character); + ch = Unicode::toLower(ch); + } + + return branch32(NotEqual, character, Imm32(ch)); + } + + void readCharacter(int inputPosition, RegisterID reg) + { + if (m_charSize == Char8) + load8(BaseIndex(input, index, TimesOne, inputPosition * sizeof(char)), reg); + else + load16(BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), reg); + } + + void storeToFrame(RegisterID reg, unsigned frameLocation) + { + poke(reg, frameLocation); + } + + void storeToFrame(TrustedImm32 imm, unsigned frameLocation) + { + poke(imm, frameLocation); + } + + DataLabelPtr storeToFrameWithPatch(unsigned frameLocation) + { + return storePtrWithPatch(TrustedImmPtr(0), Address(stackPointerRegister, frameLocation * sizeof(void*))); + } + + void loadFromFrame(unsigned frameLocation, RegisterID reg) + { + peek(reg, frameLocation); + } + + void loadFromFrameAndJump(unsigned frameLocation) + { + jump(Address(stackPointerRegister, frameLocation * sizeof(void*))); + } + + enum YarrOpCode { + // These nodes wrap body alternatives - those in the main disjunction, + // rather than subpatterns or assertions. These are chained together in + // a doubly linked list, with a 'begin' node for the first alternative, + // a 'next' node for each subsequent alternative, and an 'end' node at + // the end. In the case of repeating alternatives, the 'end' node also + // has a reference back to 'begin'. + OpBodyAlternativeBegin, + OpBodyAlternativeNext, + OpBodyAlternativeEnd, + // Similar to the body alternatives, but used for subpatterns with two + // or more alternatives. + OpNestedAlternativeBegin, + OpNestedAlternativeNext, + OpNestedAlternativeEnd, + // Used for alternatives in subpatterns where there is only a single + // alternative (backtrackingis easier in these cases), or for alternatives + // which never need to be backtracked (those in parenthetical assertions, + // terminal subpatterns). + OpSimpleNestedAlternativeBegin, + OpSimpleNestedAlternativeNext, + OpSimpleNestedAlternativeEnd, + // Used to wrap 'Once' subpattern matches (quantityCount == 1). + OpParenthesesSubpatternOnceBegin, + OpParenthesesSubpatternOnceEnd, + // Used to wrap 'Terminal' subpattern matches (at the end of the regexp). + OpParenthesesSubpatternTerminalBegin, + OpParenthesesSubpatternTerminalEnd, + // Used to wrap parenthetical assertions. + OpParentheticalAssertionBegin, + OpParentheticalAssertionEnd, + // Wraps all simple terms (pattern characters, character classes). + OpTerm, + // Where an expression contains only 'once through' body alternatives + // and no repeating ones, this op is used to return match failure. + OpMatchFailed + }; + + // This structure is used to hold the compiled opcode information, + // including reference back to the original PatternTerm/PatternAlternatives, + // and JIT compilation data structures. + struct YarrOp { + explicit YarrOp(PatternTerm* term) + : m_op(OpTerm) + , m_term(term) + , m_isDeadCode(false) + { + } + + explicit YarrOp(YarrOpCode op) + : m_op(op) + , m_isDeadCode(false) + { + } + + // The operation, as a YarrOpCode, and also a reference to the PatternTerm. + YarrOpCode m_op; + PatternTerm* m_term; + + // For alternatives, this holds the PatternAlternative and doubly linked + // references to this alternative's siblings. In the case of the + // OpBodyAlternativeEnd node at the end of a section of repeating nodes, + // m_nextOp will reference the OpBodyAlternativeBegin node of the first + // repeating alternative. + PatternAlternative* m_alternative; + size_t m_previousOp; + size_t m_nextOp; + + // Used to record a set of Jumps out of the generated code, typically + // used for jumps out to backtracking code, and a single reentry back + // into the code for a node (likely where a backtrack will trigger + // rematching). + Label m_reentry; + JumpList m_jumps; + + // This flag is used to null out the second pattern character, when + // two are fused to match a pair together. + bool m_isDeadCode; + + // Currently used in the case of some of the more complex management of + // 'm_checked', to cache the offset used in this alternative, to avoid + // recalculating it. + int m_checkAdjust; + + // Used by OpNestedAlternativeNext/End to hold the pointer to the + // value that will be pushed into the pattern's frame to return to, + // upon backtracking back into the disjunction. + DataLabelPtr m_returnAddress; + }; + + // BacktrackingState + // This class encapsulates information about the state of code generation + // whilst generating the code for backtracking, when a term fails to match. + // Upon entry to code generation of the backtracking code for a given node, + // the Backtracking state will hold references to all control flow sources + // that are outputs in need of further backtracking from the prior node + // generated (which is the subsequent operation in the regular expression, + // and in the m_ops Vector, since we generated backtracking backwards). + // These references to control flow take the form of: + // - A jump list of jumps, to be linked to code that will backtrack them + // further. + // - A set of DataLabelPtr values, to be populated with values to be + // treated effectively as return addresses backtracking into complex + // subpatterns. + // - A flag indicating that the current sequence of generated code up to + // this point requires backtracking. + class BacktrackingState { + public: + BacktrackingState() + : m_pendingFallthrough(false) + { + } + + // Add a jump or jumps, a return address, or set the flag indicating + // that the current 'fallthrough' control flow requires backtracking. + void append(const Jump& jump) + { + m_laterFailures.append(jump); + } + void append(JumpList& jumpList) + { + m_laterFailures.append(jumpList); + } + void append(const DataLabelPtr& returnAddress) + { + m_pendingReturns.append(returnAddress); + } + void fallthrough() + { + ASSERT(!m_pendingFallthrough); + m_pendingFallthrough = true; + } + + // These methods clear the backtracking state, either linking to the + // current location, a provided label, or copying the backtracking out + // to a JumpList. All actions may require code generation to take place, + // and as such are passed a pointer to the assembler. + void link(MacroAssembler* assembler) + { + if (m_pendingReturns.size()) { + Label here(assembler); + for (unsigned i = 0; i < m_pendingReturns.size(); ++i) + m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], here)); + m_pendingReturns.clear(); + } + m_laterFailures.link(assembler); + m_laterFailures.clear(); + m_pendingFallthrough = false; + } + void linkTo(Label label, MacroAssembler* assembler) + { + if (m_pendingReturns.size()) { + for (unsigned i = 0; i < m_pendingReturns.size(); ++i) + m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], label)); + m_pendingReturns.clear(); + } + if (m_pendingFallthrough) + assembler->jump(label); + m_laterFailures.linkTo(label, assembler); + m_laterFailures.clear(); + m_pendingFallthrough = false; + } + void takeBacktracksToJumpList(JumpList& jumpList, MacroAssembler* assembler) + { + if (m_pendingReturns.size()) { + Label here(assembler); + for (unsigned i = 0; i < m_pendingReturns.size(); ++i) + m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], here)); + m_pendingReturns.clear(); + m_pendingFallthrough = true; + } + if (m_pendingFallthrough) + jumpList.append(assembler->jump()); + jumpList.append(m_laterFailures); + m_laterFailures.clear(); + m_pendingFallthrough = false; + } + + bool isEmpty() + { + return m_laterFailures.empty() && m_pendingReturns.isEmpty() && !m_pendingFallthrough; + } + + // Called at the end of code generation to link all return addresses. + void linkDataLabels(LinkBuffer& linkBuffer) + { + ASSERT(isEmpty()); + for (unsigned i = 0; i < m_backtrackRecords.size(); ++i) + linkBuffer.patch(m_backtrackRecords[i].m_dataLabel, linkBuffer.locationOf(m_backtrackRecords[i].m_backtrackLocation)); + } + + private: + struct ReturnAddressRecord { + ReturnAddressRecord(DataLabelPtr dataLabel, Label backtrackLocation) + : m_dataLabel(dataLabel) + , m_backtrackLocation(backtrackLocation) + { + } + + DataLabelPtr m_dataLabel; + Label m_backtrackLocation; + }; + + JumpList m_laterFailures; + bool m_pendingFallthrough; + Vector<DataLabelPtr, 4> m_pendingReturns; + Vector<ReturnAddressRecord, 4> m_backtrackRecords; + }; + + // Generation methods: + // =================== + + // This method provides a default implementation of backtracking common + // to many terms; terms commonly jump out of the forwards matching path + // on any failed conditions, and add these jumps to the m_jumps list. If + // no special handling is required we can often just backtrack to m_jumps. + void backtrackTermDefault(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + m_backtrackingState.append(op.m_jumps); + } + + void generateAssertionBOL(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + if (m_pattern.m_multiline) { + const RegisterID character = regT0; + + JumpList matchDest; + if (!term->inputPosition) + matchDest.append(branch32(Equal, index, Imm32(m_checked))); + + readCharacter((term->inputPosition - m_checked) - 1, character); + matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); + op.m_jumps.append(jump()); + + matchDest.link(this); + } else { + // Erk, really should poison out these alternatives early. :-/ + if (term->inputPosition) + op.m_jumps.append(jump()); + else + op.m_jumps.append(branch32(NotEqual, index, Imm32(m_checked))); + } + } + void backtrackAssertionBOL(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + void generateAssertionEOL(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + if (m_pattern.m_multiline) { + const RegisterID character = regT0; + + JumpList matchDest; + if (term->inputPosition == m_checked) + matchDest.append(atEndOfInput()); + + readCharacter(term->inputPosition - m_checked, character); + matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); + op.m_jumps.append(jump()); + + matchDest.link(this); + } else { + if (term->inputPosition == m_checked) + op.m_jumps.append(notAtEndOfInput()); + // Erk, really should poison out these alternatives early. :-/ + else + op.m_jumps.append(jump()); + } + } + void backtrackAssertionEOL(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + // Also falls though on nextIsNotWordChar. + void matchAssertionWordchar(size_t opIndex, JumpList& nextIsWordChar, JumpList& nextIsNotWordChar) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + + if (term->inputPosition == m_checked) + nextIsNotWordChar.append(atEndOfInput()); + + readCharacter((term->inputPosition - m_checked), character); + matchCharacterClass(character, nextIsWordChar, m_pattern.wordcharCharacterClass()); + } + + void generateAssertionWordBoundary(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + + Jump atBegin; + JumpList matchDest; + if (!term->inputPosition) + atBegin = branch32(Equal, index, Imm32(m_checked)); + readCharacter((term->inputPosition - m_checked) - 1, character); + matchCharacterClass(character, matchDest, m_pattern.wordcharCharacterClass()); + if (!term->inputPosition) + atBegin.link(this); + + // We fall through to here if the last character was not a wordchar. + JumpList nonWordCharThenWordChar; + JumpList nonWordCharThenNonWordChar; + if (term->invert()) { + matchAssertionWordchar(opIndex, nonWordCharThenNonWordChar, nonWordCharThenWordChar); + nonWordCharThenWordChar.append(jump()); + } else { + matchAssertionWordchar(opIndex, nonWordCharThenWordChar, nonWordCharThenNonWordChar); + nonWordCharThenNonWordChar.append(jump()); + } + op.m_jumps.append(nonWordCharThenNonWordChar); + + // We jump here if the last character was a wordchar. + matchDest.link(this); + JumpList wordCharThenWordChar; + JumpList wordCharThenNonWordChar; + if (term->invert()) { + matchAssertionWordchar(opIndex, wordCharThenNonWordChar, wordCharThenWordChar); + wordCharThenWordChar.append(jump()); + } else { + matchAssertionWordchar(opIndex, wordCharThenWordChar, wordCharThenNonWordChar); + // This can fall-though! + } + + op.m_jumps.append(wordCharThenWordChar); + + nonWordCharThenWordChar.link(this); + wordCharThenNonWordChar.link(this); + } + void backtrackAssertionWordBoundary(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + void generatePatternCharacterOnce(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + + if (op.m_isDeadCode) + return; + + // m_ops always ends with a OpBodyAlternativeEnd or OpMatchFailed + // node, so there must always be at least one more node. + ASSERT(opIndex + 1 < m_ops.size()); + YarrOp* nextOp = &m_ops[opIndex + 1]; + + PatternTerm* term = op.m_term; + UChar ch = term->patternCharacter; + + if ((ch > 0xff) && (m_charSize == Char8)) { + // Have a 16 bit pattern character and an 8 bit string - short circuit + op.m_jumps.append(jump()); + return; + } + + const RegisterID character = regT0; + int maxCharactersAtOnce = m_charSize == Char8 ? 4 : 2; + unsigned ignoreCaseMask = 0; + int allCharacters = ch; + int numberCharacters; + int startTermPosition = term->inputPosition; + + // For case-insesitive compares, non-ascii characters that have different + // upper & lower case representations are converted to a character class. + ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + + if ((m_pattern.m_ignoreCase) && (isASCIIAlpha(ch))) + ignoreCaseMask |= 32; + + for (numberCharacters = 1; numberCharacters < maxCharactersAtOnce && nextOp->m_op == OpTerm; ++numberCharacters, nextOp = &m_ops[opIndex + numberCharacters]) { + PatternTerm* nextTerm = nextOp->m_term; + + if (nextTerm->type != PatternTerm::TypePatternCharacter + || nextTerm->quantityType != QuantifierFixedCount + || nextTerm->quantityCount != 1 + || nextTerm->inputPosition != (startTermPosition + numberCharacters)) + break; + + nextOp->m_isDeadCode = true; + + int shiftAmount = (m_charSize == Char8 ? 8 : 16) * numberCharacters; + + UChar currentCharacter = nextTerm->patternCharacter; + + if ((currentCharacter > 0xff) && (m_charSize == Char8)) { + // Have a 16 bit pattern character and an 8 bit string - short circuit + op.m_jumps.append(jump()); + return; + } + + // For case-insesitive compares, non-ascii characters that have different + // upper & lower case representations are converted to a character class. + ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(currentCharacter) || (Unicode::toLower(currentCharacter) == Unicode::toUpper(currentCharacter))); + + allCharacters |= (currentCharacter << shiftAmount); + + if ((m_pattern.m_ignoreCase) && (isASCIIAlpha(currentCharacter))) + ignoreCaseMask |= 32 << shiftAmount; + } + + if (m_charSize == Char8) { + switch (numberCharacters) { + case 1: + op.m_jumps.append(jumpIfCharNotEquals(ch, startTermPosition - m_checked, character)); + return; + case 2: { + BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); + load16(address, character); + break; + } + case 3: { + BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); + load32WithUnalignedHalfWords(address, character); + and32(Imm32(0xffffff), character); + break; + } + case 4: { + BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); + load32WithUnalignedHalfWords(address, character); + break; + } + } + } else { + switch (numberCharacters) { + case 1: + op.m_jumps.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character)); + return; + case 2: + BaseIndex address(input, index, TimesTwo, (term->inputPosition - m_checked) * sizeof(UChar)); + load32WithUnalignedHalfWords(address, character); + break; + } + } + + if (ignoreCaseMask) + or32(Imm32(ignoreCaseMask), character); + op.m_jumps.append(branch32(NotEqual, character, Imm32(allCharacters | ignoreCaseMask))); + return; + } + void backtrackPatternCharacterOnce(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + void generatePatternCharacterFixed(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + UChar ch = term->patternCharacter; + + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + + move(index, countRegister); + sub32(Imm32(term->quantityCount.unsafeGet()), countRegister); + + Label loop(this); + BaseIndex address(input, countRegister, m_charScale, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(m_charSize == Char8 ? sizeof(char) : sizeof(UChar))).unsafeGet()); + + if (m_charSize == Char8) + load8(address, character); + else + load16(address, character); + + // For case-insesitive compares, non-ascii characters that have different + // upper & lower case representations are converted to a character class. + ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + or32(TrustedImm32(32), character); + ch = Unicode::toLower(ch); + } + + op.m_jumps.append(branch32(NotEqual, character, Imm32(ch))); + add32(TrustedImm32(1), countRegister); + branch32(NotEqual, countRegister, index).linkTo(loop, this); + } + void backtrackPatternCharacterFixed(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + void generatePatternCharacterGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + UChar ch = term->patternCharacter; + + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + + move(TrustedImm32(0), countRegister); + + if ((ch > 0xff) && (m_charSize == Char8)) { + // Have a 16 bit pattern character and an 8 bit string - short circuit + op.m_jumps.append(jump()); + } else { + JumpList failures; + Label loop(this); + failures.append(atEndOfInput()); + failures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character)); + + add32(TrustedImm32(1), countRegister); + add32(TrustedImm32(1), index); + if (term->quantityCount == quantifyInfinite) + jump(loop); + else + branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this); + + failures.link(this); + } + op.m_reentry = label(); + + storeToFrame(countRegister, term->frameLocation); + + } + void backtrackPatternCharacterGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID countRegister = regT1; + + m_backtrackingState.link(this); + + loadFromFrame(term->frameLocation, countRegister); + m_backtrackingState.append(branchTest32(Zero, countRegister)); + sub32(TrustedImm32(1), countRegister); + sub32(TrustedImm32(1), index); + jump(op.m_reentry); + } + + void generatePatternCharacterNonGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID countRegister = regT1; + + move(TrustedImm32(0), countRegister); + op.m_reentry = label(); + storeToFrame(countRegister, term->frameLocation); + } + void backtrackPatternCharacterNonGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + UChar ch = term->patternCharacter; + + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + + JumpList nonGreedyFailures; + + m_backtrackingState.link(this); + + loadFromFrame(term->frameLocation, countRegister); + + if ((ch > 0xff) && (m_charSize == Char8)) { + // Have a 16 bit pattern character and an 8 bit string - short circuit + nonGreedyFailures.append(jump()); + } else { + nonGreedyFailures.append(atEndOfInput()); + if (term->quantityCount != quantifyInfinite) + nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet()))); + nonGreedyFailures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character)); + + add32(TrustedImm32(1), countRegister); + add32(TrustedImm32(1), index); + + jump(op.m_reentry); + } + nonGreedyFailures.link(this); + + sub32(countRegister, index); + m_backtrackingState.fallthrough(); + } + + void generateCharacterClassOnce(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + + JumpList matchDest; + readCharacter(term->inputPosition - m_checked, character); + matchCharacterClass(character, matchDest, term->characterClass); + + if (term->invert()) + op.m_jumps.append(matchDest); + else { + op.m_jumps.append(jump()); + matchDest.link(this); + } + } + void backtrackCharacterClassOnce(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + void generateCharacterClassFixed(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + + move(index, countRegister); + sub32(Imm32(term->quantityCount.unsafeGet()), countRegister); + + Label loop(this); + JumpList matchDest; + if (m_charSize == Char8) + load8(BaseIndex(input, countRegister, TimesOne, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(char))).unsafeGet()), character); + else + load16(BaseIndex(input, countRegister, TimesTwo, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(UChar))).unsafeGet()), character); + matchCharacterClass(character, matchDest, term->characterClass); + + if (term->invert()) + op.m_jumps.append(matchDest); + else { + op.m_jumps.append(jump()); + matchDest.link(this); + } + + add32(TrustedImm32(1), countRegister); + branch32(NotEqual, countRegister, index).linkTo(loop, this); + } + void backtrackCharacterClassFixed(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + void generateCharacterClassGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + + move(TrustedImm32(0), countRegister); + + JumpList failures; + Label loop(this); + failures.append(atEndOfInput()); + + if (term->invert()) { + readCharacter(term->inputPosition - m_checked, character); + matchCharacterClass(character, failures, term->characterClass); + } else { + JumpList matchDest; + readCharacter(term->inputPosition - m_checked, character); + matchCharacterClass(character, matchDest, term->characterClass); + failures.append(jump()); + matchDest.link(this); + } + + add32(TrustedImm32(1), countRegister); + add32(TrustedImm32(1), index); + if (term->quantityCount != quantifyInfinite) { + branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this); + failures.append(jump()); + } else + jump(loop); + + failures.link(this); + op.m_reentry = label(); + + storeToFrame(countRegister, term->frameLocation); + } + void backtrackCharacterClassGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID countRegister = regT1; + + m_backtrackingState.link(this); + + loadFromFrame(term->frameLocation, countRegister); + m_backtrackingState.append(branchTest32(Zero, countRegister)); + sub32(TrustedImm32(1), countRegister); + sub32(TrustedImm32(1), index); + jump(op.m_reentry); + } + + void generateCharacterClassNonGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID countRegister = regT1; + + move(TrustedImm32(0), countRegister); + op.m_reentry = label(); + storeToFrame(countRegister, term->frameLocation); + } + void backtrackCharacterClassNonGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + + JumpList nonGreedyFailures; + + m_backtrackingState.link(this); + + Label backtrackBegin(this); + loadFromFrame(term->frameLocation, countRegister); + + nonGreedyFailures.append(atEndOfInput()); + nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet()))); + + JumpList matchDest; + readCharacter(term->inputPosition - m_checked, character); + matchCharacterClass(character, matchDest, term->characterClass); + + if (term->invert()) + nonGreedyFailures.append(matchDest); + else { + nonGreedyFailures.append(jump()); + matchDest.link(this); + } + + add32(TrustedImm32(1), countRegister); + add32(TrustedImm32(1), index); + + jump(op.m_reentry); + + nonGreedyFailures.link(this); + sub32(countRegister, index); + m_backtrackingState.fallthrough(); + } + + void generateDotStarEnclosure(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + const RegisterID matchPos = regT1; + + JumpList foundBeginningNewLine; + JumpList saveStartIndex; + JumpList foundEndingNewLine; + + if (m_pattern.m_body->m_hasFixedSize) { + move(index, matchPos); + sub32(Imm32(m_checked), matchPos); + } else + load32(Address(output), matchPos); + + saveStartIndex.append(branchTest32(Zero, matchPos)); + Label findBOLLoop(this); + sub32(TrustedImm32(1), matchPos); + if (m_charSize == Char8) + load8(BaseIndex(input, matchPos, TimesOne, 0), character); + else + load16(BaseIndex(input, matchPos, TimesTwo, 0), character); + matchCharacterClass(character, foundBeginningNewLine, m_pattern.newlineCharacterClass()); + branchTest32(NonZero, matchPos).linkTo(findBOLLoop, this); + saveStartIndex.append(jump()); + + foundBeginningNewLine.link(this); + add32(TrustedImm32(1), matchPos); // Advance past newline + saveStartIndex.link(this); + + if (!m_pattern.m_multiline && term->anchors.bolAnchor) + op.m_jumps.append(branchTest32(NonZero, matchPos)); + + store32(matchPos, Address(output)); + + move(index, matchPos); + + Label findEOLLoop(this); + foundEndingNewLine.append(branch32(Equal, matchPos, length)); + if (m_charSize == Char8) + load8(BaseIndex(input, matchPos, TimesOne, 0), character); + else + load16(BaseIndex(input, matchPos, TimesTwo, 0), character); + matchCharacterClass(character, foundEndingNewLine, m_pattern.newlineCharacterClass()); + add32(TrustedImm32(1), matchPos); + jump(findEOLLoop); + + foundEndingNewLine.link(this); + + if (!m_pattern.m_multiline && term->anchors.eolAnchor) + op.m_jumps.append(branch32(NotEqual, matchPos, length)); + + move(matchPos, index); + } + + void backtrackDotStarEnclosure(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + // Code generation/backtracking for simple terms + // (pattern characters, character classes, and assertions). + // These methods farm out work to the set of functions above. + void generateTerm(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + switch (term->type) { + case PatternTerm::TypePatternCharacter: + switch (term->quantityType) { + case QuantifierFixedCount: + if (term->quantityCount == 1) + generatePatternCharacterOnce(opIndex); + else + generatePatternCharacterFixed(opIndex); + break; + case QuantifierGreedy: + generatePatternCharacterGreedy(opIndex); + break; + case QuantifierNonGreedy: + generatePatternCharacterNonGreedy(opIndex); + break; + } + break; + + case PatternTerm::TypeCharacterClass: + switch (term->quantityType) { + case QuantifierFixedCount: + if (term->quantityCount == 1) + generateCharacterClassOnce(opIndex); + else + generateCharacterClassFixed(opIndex); + break; + case QuantifierGreedy: + generateCharacterClassGreedy(opIndex); + break; + case QuantifierNonGreedy: + generateCharacterClassNonGreedy(opIndex); + break; + } + break; + + case PatternTerm::TypeAssertionBOL: + generateAssertionBOL(opIndex); + break; + + case PatternTerm::TypeAssertionEOL: + generateAssertionEOL(opIndex); + break; + + case PatternTerm::TypeAssertionWordBoundary: + generateAssertionWordBoundary(opIndex); + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypeParenthesesSubpattern: + case PatternTerm::TypeParentheticalAssertion: + ASSERT_NOT_REACHED(); + case PatternTerm::TypeBackReference: + m_shouldFallBack = true; + break; + case PatternTerm::TypeDotStarEnclosure: + generateDotStarEnclosure(opIndex); + break; + } + } + void backtrackTerm(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + switch (term->type) { + case PatternTerm::TypePatternCharacter: + switch (term->quantityType) { + case QuantifierFixedCount: + if (term->quantityCount == 1) + backtrackPatternCharacterOnce(opIndex); + else + backtrackPatternCharacterFixed(opIndex); + break; + case QuantifierGreedy: + backtrackPatternCharacterGreedy(opIndex); + break; + case QuantifierNonGreedy: + backtrackPatternCharacterNonGreedy(opIndex); + break; + } + break; + + case PatternTerm::TypeCharacterClass: + switch (term->quantityType) { + case QuantifierFixedCount: + if (term->quantityCount == 1) + backtrackCharacterClassOnce(opIndex); + else + backtrackCharacterClassFixed(opIndex); + break; + case QuantifierGreedy: + backtrackCharacterClassGreedy(opIndex); + break; + case QuantifierNonGreedy: + backtrackCharacterClassNonGreedy(opIndex); + break; + } + break; + + case PatternTerm::TypeAssertionBOL: + backtrackAssertionBOL(opIndex); + break; + + case PatternTerm::TypeAssertionEOL: + backtrackAssertionEOL(opIndex); + break; + + case PatternTerm::TypeAssertionWordBoundary: + backtrackAssertionWordBoundary(opIndex); + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypeParenthesesSubpattern: + case PatternTerm::TypeParentheticalAssertion: + ASSERT_NOT_REACHED(); + + case PatternTerm::TypeDotStarEnclosure: + backtrackDotStarEnclosure(opIndex); + break; + + case PatternTerm::TypeBackReference: + m_shouldFallBack = true; + break; + } + } + + void generate() + { + // Forwards generate the matching code. + ASSERT(m_ops.size()); + size_t opIndex = 0; + + do { + YarrOp& op = m_ops[opIndex]; + switch (op.m_op) { + + case OpTerm: + generateTerm(opIndex); + break; + + // OpBodyAlternativeBegin/Next/End + // + // These nodes wrap the set of alternatives in the body of the regular expression. + // There may be either one or two chains of OpBodyAlternative nodes, one representing + // the 'once through' sequence of alternatives (if any exist), and one representing + // the repeating alternatives (again, if any exist). + // + // Upon normal entry to the Begin alternative, we will check that input is available. + // Reentry to the Begin alternative will take place after the check has taken place, + // and will assume that the input position has already been progressed as appropriate. + // + // Entry to subsequent Next/End alternatives occurs when the prior alternative has + // successfully completed a match - return a success state from JIT code. + // + // Next alternatives allow for reentry optimized to suit backtracking from its + // preceding alternative. It expects the input position to still be set to a position + // appropriate to its predecessor, and it will only perform an input check if the + // predecessor had a minimum size less than its own. + // + // In the case 'once through' expressions, the End node will also have a reentry + // point to jump to when the last alternative fails. Again, this expects the input + // position to still reflect that expected by the prior alternative. + case OpBodyAlternativeBegin: { + PatternAlternative* alternative = op.m_alternative; + + // Upon entry at the head of the set of alternatives, check if input is available + // to run the first alternative. (This progresses the input position). + op.m_jumps.append(jumpIfNoAvailableInput(alternative->m_minimumSize)); + // We will reenter after the check, and assume the input position to have been + // set as appropriate to this alternative. + op.m_reentry = label(); + + m_checked += alternative->m_minimumSize; + break; + } + case OpBodyAlternativeNext: + case OpBodyAlternativeEnd: { + PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; + PatternAlternative* alternative = op.m_alternative; + + // If we get here, the prior alternative matched - return success. + + // Adjust the stack pointer to remove the pattern's frame. + if (m_pattern.m_body->m_callFrameSize) + addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + + // Load appropriate values into the return register and the first output + // slot, and return. In the case of pattern with a fixed size, we will + // not have yet set the value in the first + ASSERT(index != returnRegister); + if (m_pattern.m_body->m_hasFixedSize) { + move(index, returnRegister); + if (priorAlternative->m_minimumSize) + sub32(Imm32(priorAlternative->m_minimumSize), returnRegister); + store32(returnRegister, output); + } else + load32(Address(output), returnRegister); + store32(index, Address(output, 4)); + generateReturn(); + + // This is the divide between the tail of the prior alternative, above, and + // the head of the subsequent alternative, below. + + if (op.m_op == OpBodyAlternativeNext) { + // This is the reentry point for the Next alternative. We expect any code + // that jumps here to do so with the input position matching that of the + // PRIOR alteranative, and we will only check input availability if we + // need to progress it forwards. + op.m_reentry = label(); + if (alternative->m_minimumSize > priorAlternative->m_minimumSize) { + add32(Imm32(alternative->m_minimumSize - priorAlternative->m_minimumSize), index); + op.m_jumps.append(jumpIfNoAvailableInput()); + } else if (priorAlternative->m_minimumSize > alternative->m_minimumSize) + sub32(Imm32(priorAlternative->m_minimumSize - alternative->m_minimumSize), index); + } else if (op.m_nextOp == notFound) { + // This is the reentry point for the End of 'once through' alternatives, + // jumped to when the last alternative fails to match. + op.m_reentry = label(); + sub32(Imm32(priorAlternative->m_minimumSize), index); + } + + if (op.m_op == OpBodyAlternativeNext) + m_checked += alternative->m_minimumSize; + m_checked -= priorAlternative->m_minimumSize; + break; + } + + // OpSimpleNestedAlternativeBegin/Next/End + // OpNestedAlternativeBegin/Next/End + // + // These nodes are used to handle sets of alternatives that are nested within + // subpatterns and parenthetical assertions. The 'simple' forms are used where + // we do not need to be able to backtrack back into any alternative other than + // the last, the normal forms allow backtracking into any alternative. + // + // Each Begin/Next node is responsible for planting an input check to ensure + // sufficient input is available on entry. Next nodes additionally need to + // jump to the end - Next nodes use the End node's m_jumps list to hold this + // set of jumps. + // + // In the non-simple forms, successful alternative matches must store a + // 'return address' using a DataLabelPtr, used to store the address to jump + // to when backtracking, to get to the code for the appropriate alternative. + case OpSimpleNestedAlternativeBegin: + case OpNestedAlternativeBegin: { + PatternTerm* term = op.m_term; + PatternAlternative* alternative = op.m_alternative; + PatternDisjunction* disjunction = term->parentheses.disjunction; + + // Calculate how much input we need to check for, and if non-zero check. + op.m_checkAdjust = alternative->m_minimumSize; + if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion)) + op.m_checkAdjust -= disjunction->m_minimumSize; + if (op.m_checkAdjust) + op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust)); + + m_checked += op.m_checkAdjust; + break; + } + case OpSimpleNestedAlternativeNext: + case OpNestedAlternativeNext: { + PatternTerm* term = op.m_term; + PatternAlternative* alternative = op.m_alternative; + PatternDisjunction* disjunction = term->parentheses.disjunction; + + // In the non-simple case, store a 'return address' so we can backtrack correctly. + if (op.m_op == OpNestedAlternativeNext) { + unsigned parenthesesFrameLocation = term->frameLocation; + unsigned alternativeFrameLocation = parenthesesFrameLocation; + if (term->quantityType != QuantifierFixedCount) + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; + op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation); + } + + // If we reach here then the last alternative has matched - jump to the + // End node, to skip over any further alternatives. + // + // FIXME: this is logically O(N^2) (though N can be expected to be very + // small). We could avoid this either by adding an extra jump to the JIT + // data structures, or by making backtracking code that jumps to Next + // alternatives are responsible for checking that input is available (if + // we didn't need to plant the input checks, then m_jumps would be free). + YarrOp* endOp = &m_ops[op.m_nextOp]; + while (endOp->m_nextOp != notFound) { + ASSERT(endOp->m_op == OpSimpleNestedAlternativeNext || endOp->m_op == OpNestedAlternativeNext); + endOp = &m_ops[endOp->m_nextOp]; + } + ASSERT(endOp->m_op == OpSimpleNestedAlternativeEnd || endOp->m_op == OpNestedAlternativeEnd); + endOp->m_jumps.append(jump()); + + // This is the entry point for the next alternative. + op.m_reentry = label(); + + // Calculate how much input we need to check for, and if non-zero check. + op.m_checkAdjust = alternative->m_minimumSize; + if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion)) + op.m_checkAdjust -= disjunction->m_minimumSize; + if (op.m_checkAdjust) + op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust)); + + YarrOp& lastOp = m_ops[op.m_previousOp]; + m_checked -= lastOp.m_checkAdjust; + m_checked += op.m_checkAdjust; + break; + } + case OpSimpleNestedAlternativeEnd: + case OpNestedAlternativeEnd: { + PatternTerm* term = op.m_term; + + // In the non-simple case, store a 'return address' so we can backtrack correctly. + if (op.m_op == OpNestedAlternativeEnd) { + unsigned parenthesesFrameLocation = term->frameLocation; + unsigned alternativeFrameLocation = parenthesesFrameLocation; + if (term->quantityType != QuantifierFixedCount) + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; + op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation); + } + + // If this set of alternatives contains more than one alternative, + // then the Next nodes will have planted jumps to the End, and added + // them to this node's m_jumps list. + op.m_jumps.link(this); + op.m_jumps.clear(); + + YarrOp& lastOp = m_ops[op.m_previousOp]; + m_checked -= lastOp.m_checkAdjust; + break; + } + + // OpParenthesesSubpatternOnceBegin/End + // + // These nodes support (optionally) capturing subpatterns, that have a + // quantity count of 1 (this covers fixed once, and ?/?? quantifiers). + case OpParenthesesSubpatternOnceBegin: { + PatternTerm* term = op.m_term; + unsigned parenthesesFrameLocation = term->frameLocation; + const RegisterID indexTemporary = regT0; + ASSERT(term->quantityCount == 1); + + // Upon entry to a Greedy quantified set of parenthese store the index. + // We'll use this for two purposes: + // - To indicate which iteration we are on of mathing the remainder of + // the expression after the parentheses - the first, including the + // match within the parentheses, or the second having skipped over them. + // - To check for empty matches, which must be rejected. + // + // At the head of a NonGreedy set of parentheses we'll immediately set the + // value on the stack to -1 (indicating a match skipping the subpattern), + // and plant a jump to the end. We'll also plant a label to backtrack to + // to reenter the subpattern later, with a store to set up index on the + // second iteration. + // + // FIXME: for capturing parens, could use the index in the capture array? + if (term->quantityType == QuantifierGreedy) + storeToFrame(index, parenthesesFrameLocation); + else if (term->quantityType == QuantifierNonGreedy) { + storeToFrame(TrustedImm32(-1), parenthesesFrameLocation); + op.m_jumps.append(jump()); + op.m_reentry = label(); + storeToFrame(index, parenthesesFrameLocation); + } + + // If the parenthese are capturing, store the starting index value to the + // captures array, offsetting as necessary. + // + // FIXME: could avoid offsetting this value in JIT code, apply + // offsets only afterwards, at the point the results array is + // being accessed. + if (term->capture()) { + int offsetId = term->parentheses.subpatternId << 1; + int inputOffset = term->inputPosition - m_checked; + if (term->quantityType == QuantifierFixedCount) + inputOffset -= term->parentheses.disjunction->m_minimumSize; + if (inputOffset) { + move(index, indexTemporary); + add32(Imm32(inputOffset), indexTemporary); + store32(indexTemporary, Address(output, offsetId * sizeof(int))); + } else + store32(index, Address(output, offsetId * sizeof(int))); + } + break; + } + case OpParenthesesSubpatternOnceEnd: { + PatternTerm* term = op.m_term; + unsigned parenthesesFrameLocation = term->frameLocation; + const RegisterID indexTemporary = regT0; + ASSERT(term->quantityCount == 1); + + // For Greedy/NonGreedy quantified parentheses, we must reject zero length + // matches. If the minimum size is know to be non-zero we need not check. + if (term->quantityType != QuantifierFixedCount && !term->parentheses.disjunction->m_minimumSize) + op.m_jumps.append(branch32(Equal, index, Address(stackPointerRegister, parenthesesFrameLocation * sizeof(void*)))); + + // If the parenthese are capturing, store the ending index value to the + // captures array, offsetting as necessary. + // + // FIXME: could avoid offsetting this value in JIT code, apply + // offsets only afterwards, at the point the results array is + // being accessed. + if (term->capture()) { + int offsetId = (term->parentheses.subpatternId << 1) + 1; + int inputOffset = term->inputPosition - m_checked; + if (inputOffset) { + move(index, indexTemporary); + add32(Imm32(inputOffset), indexTemporary); + store32(indexTemporary, Address(output, offsetId * sizeof(int))); + } else + store32(index, Address(output, offsetId * sizeof(int))); + } + + // If the parentheses are quantified Greedy then add a label to jump back + // to if get a failed match from after the parentheses. For NonGreedy + // parentheses, link the jump from before the subpattern to here. + if (term->quantityType == QuantifierGreedy) + op.m_reentry = label(); + else if (term->quantityType == QuantifierNonGreedy) { + YarrOp& beginOp = m_ops[op.m_previousOp]; + beginOp.m_jumps.link(this); + } + break; + } + + // OpParenthesesSubpatternTerminalBegin/End + case OpParenthesesSubpatternTerminalBegin: { + PatternTerm* term = op.m_term; + ASSERT(term->quantityType == QuantifierGreedy); + ASSERT(term->quantityCount == quantifyInfinite); + ASSERT(!term->capture()); + + // Upon entry set a label to loop back to. + op.m_reentry = label(); + + // Store the start index of the current match; we need to reject zero + // length matches. + storeToFrame(index, term->frameLocation); + break; + } + case OpParenthesesSubpatternTerminalEnd: { + PatternTerm* term = op.m_term; + + // Check for zero length matches - if the match is non-zero, then we + // can accept it & loop back up to the head of the subpattern. + YarrOp& beginOp = m_ops[op.m_previousOp]; + branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)), beginOp.m_reentry); + + // Reject the match - backtrack back into the subpattern. + op.m_jumps.append(jump()); + + // This is the entry point to jump to when we stop matching - we will + // do so once the subpattern cannot match any more. + op.m_reentry = label(); + break; + } + + // OpParentheticalAssertionBegin/End + case OpParentheticalAssertionBegin: { + PatternTerm* term = op.m_term; + + // Store the current index - assertions should not update index, so + // we will need to restore it upon a successful match. + unsigned parenthesesFrameLocation = term->frameLocation; + storeToFrame(index, parenthesesFrameLocation); + + // Check + op.m_checkAdjust = m_checked - term->inputPosition; + if (op.m_checkAdjust) + sub32(Imm32(op.m_checkAdjust), index); + + m_checked -= op.m_checkAdjust; + break; + } + case OpParentheticalAssertionEnd: { + PatternTerm* term = op.m_term; + + // Restore the input index value. + unsigned parenthesesFrameLocation = term->frameLocation; + loadFromFrame(parenthesesFrameLocation, index); + + // If inverted, a successful match of the assertion must be treated + // as a failure, so jump to backtracking. + if (term->invert()) { + op.m_jumps.append(jump()); + op.m_reentry = label(); + } + + YarrOp& lastOp = m_ops[op.m_previousOp]; + m_checked += lastOp.m_checkAdjust; + break; + } + + case OpMatchFailed: + if (m_pattern.m_body->m_callFrameSize) + addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + move(TrustedImm32(-1), returnRegister); + generateReturn(); + break; + } + + ++opIndex; + } while (opIndex < m_ops.size()); + } + + void backtrack() + { + // Backwards generate the backtracking code. + size_t opIndex = m_ops.size(); + ASSERT(opIndex); + + do { + --opIndex; + YarrOp& op = m_ops[opIndex]; + switch (op.m_op) { + + case OpTerm: + backtrackTerm(opIndex); + break; + + // OpBodyAlternativeBegin/Next/End + // + // For each Begin/Next node representing an alternative, we need to decide what to do + // in two circumstances: + // - If we backtrack back into this node, from within the alternative. + // - If the input check at the head of the alternative fails (if this exists). + // + // We treat these two cases differently since in the former case we have slightly + // more information - since we are backtracking out of a prior alternative we know + // that at least enough input was available to run it. For example, given the regular + // expression /a|b/, if we backtrack out of the first alternative (a failed pattern + // character match of 'a'), then we need not perform an additional input availability + // check before running the second alternative. + // + // Backtracking required differs for the last alternative, which in the case of the + // repeating set of alternatives must loop. The code generated for the last alternative + // will also be used to handle all input check failures from any prior alternatives - + // these require similar functionality, in seeking the next available alternative for + // which there is sufficient input. + // + // Since backtracking of all other alternatives simply requires us to link backtracks + // to the reentry point for the subsequent alternative, we will only be generating any + // code when backtracking the last alternative. + case OpBodyAlternativeBegin: + case OpBodyAlternativeNext: { + PatternAlternative* alternative = op.m_alternative; + + if (op.m_op == OpBodyAlternativeNext) { + PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; + m_checked += priorAlternative->m_minimumSize; + } + m_checked -= alternative->m_minimumSize; + + // Is this the last alternative? If not, then if we backtrack to this point we just + // need to jump to try to match the next alternative. + if (m_ops[op.m_nextOp].m_op != OpBodyAlternativeEnd) { + m_backtrackingState.linkTo(m_ops[op.m_nextOp].m_reentry, this); + break; + } + YarrOp& endOp = m_ops[op.m_nextOp]; + + YarrOp* beginOp = &op; + while (beginOp->m_op != OpBodyAlternativeBegin) { + ASSERT(beginOp->m_op == OpBodyAlternativeNext); + beginOp = &m_ops[beginOp->m_previousOp]; + } + + bool onceThrough = endOp.m_nextOp == notFound; + + // First, generate code to handle cases where we backtrack out of an attempted match + // of the last alternative. If this is a 'once through' set of alternatives then we + // have nothing to do - link this straight through to the End. + if (onceThrough) + m_backtrackingState.linkTo(endOp.m_reentry, this); + else { + // If we don't need to move the input poistion, and the pattern has a fixed size + // (in which case we omit the store of the start index until the pattern has matched) + // then we can just link the backtrack out of the last alternative straight to the + // head of the first alternative. + if (m_pattern.m_body->m_hasFixedSize + && (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) + && (alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize == 1)) + m_backtrackingState.linkTo(beginOp->m_reentry, this); + else { + // We need to generate a trampoline of code to execute before looping back + // around to the first alternative. + m_backtrackingState.link(this); + + // If the pattern size is not fixed, then store the start index, for use if we match. + if (!m_pattern.m_body->m_hasFixedSize) { + if (alternative->m_minimumSize == 1) + store32(index, Address(output)); + else { + move(index, regT0); + if (alternative->m_minimumSize) + sub32(Imm32(alternative->m_minimumSize - 1), regT0); + else + add32(Imm32(1), regT0); + store32(regT0, Address(output)); + } + } + + // Generate code to loop. Check whether the last alternative is longer than the + // first (e.g. /a|xy/ or /a|xyz/). + if (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) { + // We want to loop, and increment input position. If the delta is 1, it is + // already correctly incremented, if more than one then decrement as appropriate. + unsigned delta = alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize; + ASSERT(delta); + if (delta != 1) + sub32(Imm32(delta - 1), index); + jump(beginOp->m_reentry); + } else { + // If the first alternative has minimum size 0xFFFFFFFFu, then there cannot + // be sufficent input available to handle this, so just fall through. + unsigned delta = beginOp->m_alternative->m_minimumSize - alternative->m_minimumSize; + if (delta != 0xFFFFFFFFu) { + // We need to check input because we are incrementing the input. + add32(Imm32(delta + 1), index); + checkInput().linkTo(beginOp->m_reentry, this); + } + } + } + } + + // We can reach this point in the code in two ways: + // - Fallthrough from the code above (a repeating alternative backtracked out of its + // last alternative, and did not have sufficent input to run the first). + // - We will loop back up to the following label when a releating alternative loops, + // following a failed input check. + // + // Either way, we have just failed the input check for the first alternative. + Label firstInputCheckFailed(this); + + // Generate code to handle input check failures from alternatives except the last. + // prevOp is the alternative we're handling a bail out from (initially Begin), and + // nextOp is the alternative we will be attempting to reenter into. + // + // We will link input check failures from the forwards matching path back to the code + // that can handle them. + YarrOp* prevOp = beginOp; + YarrOp* nextOp = &m_ops[beginOp->m_nextOp]; + while (nextOp->m_op != OpBodyAlternativeEnd) { + prevOp->m_jumps.link(this); + + // We only get here if an input check fails, it is only worth checking again + // if the next alternative has a minimum size less than the last. + if (prevOp->m_alternative->m_minimumSize > nextOp->m_alternative->m_minimumSize) { + // FIXME: if we added an extra label to YarrOp, we could avoid needing to + // subtract delta back out, and reduce this code. Should performance test + // the benefit of this. + unsigned delta = prevOp->m_alternative->m_minimumSize - nextOp->m_alternative->m_minimumSize; + sub32(Imm32(delta), index); + Jump fail = jumpIfNoAvailableInput(); + add32(Imm32(delta), index); + jump(nextOp->m_reentry); + fail.link(this); + } else if (prevOp->m_alternative->m_minimumSize < nextOp->m_alternative->m_minimumSize) + add32(Imm32(nextOp->m_alternative->m_minimumSize - prevOp->m_alternative->m_minimumSize), index); + prevOp = nextOp; + nextOp = &m_ops[nextOp->m_nextOp]; + } + + // We fall through to here if there is insufficient input to run the last alternative. + + // If there is insufficient input to run the last alternative, then for 'once through' + // alternatives we are done - just jump back up into the forwards matching path at the End. + if (onceThrough) { + op.m_jumps.linkTo(endOp.m_reentry, this); + jump(endOp.m_reentry); + break; + } + + // For repeating alternatives, link any input check failure from the last alternative to + // this point. + op.m_jumps.link(this); + + bool needsToUpdateMatchStart = !m_pattern.m_body->m_hasFixedSize; + + // Check for cases where input position is already incremented by 1 for the last + // alternative (this is particularly useful where the minimum size of the body + // disjunction is 0, e.g. /a*|b/). + if (needsToUpdateMatchStart && alternative->m_minimumSize == 1) { + // index is already incremented by 1, so just store it now! + store32(index, Address(output)); + needsToUpdateMatchStart = false; + } + + // Check whether there is sufficient input to loop. Increment the input position by + // one, and check. Also add in the minimum disjunction size before checking - there + // is no point in looping if we're just going to fail all the input checks around + // the next iteration. + ASSERT(alternative->m_minimumSize >= m_pattern.m_body->m_minimumSize); + if (alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) { + // If the last alternative had the same minimum size as the disjunction, + // just simply increment input pos by 1, no adjustment based on minimum size. + add32(Imm32(1), index); + } else { + // If the minumum for the last alternative was one greater than than that + // for the disjunction, we're already progressed by 1, nothing to do! + unsigned delta = (alternative->m_minimumSize - m_pattern.m_body->m_minimumSize) - 1; + if (delta) + sub32(Imm32(delta), index); + } + Jump matchFailed = jumpIfNoAvailableInput(); + + if (needsToUpdateMatchStart) { + if (!m_pattern.m_body->m_minimumSize) + store32(index, Address(output)); + else { + move(index, regT0); + sub32(Imm32(m_pattern.m_body->m_minimumSize), regT0); + store32(regT0, Address(output)); + } + } + + // Calculate how much more input the first alternative requires than the minimum + // for the body as a whole. If no more is needed then we dont need an additional + // input check here - jump straight back up to the start of the first alternative. + if (beginOp->m_alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) + jump(beginOp->m_reentry); + else { + if (beginOp->m_alternative->m_minimumSize > m_pattern.m_body->m_minimumSize) + add32(Imm32(beginOp->m_alternative->m_minimumSize - m_pattern.m_body->m_minimumSize), index); + else + sub32(Imm32(m_pattern.m_body->m_minimumSize - beginOp->m_alternative->m_minimumSize), index); + checkInput().linkTo(beginOp->m_reentry, this); + jump(firstInputCheckFailed); + } + + // We jump to here if we iterate to the point that there is insufficient input to + // run any matches, and need to return a failure state from JIT code. + matchFailed.link(this); + + if (m_pattern.m_body->m_callFrameSize) + addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + move(TrustedImm32(-1), returnRegister); + generateReturn(); + break; + } + case OpBodyAlternativeEnd: { + // We should never backtrack back into a body disjunction. + ASSERT(m_backtrackingState.isEmpty()); + + PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; + m_checked += priorAlternative->m_minimumSize; + break; + } + + // OpSimpleNestedAlternativeBegin/Next/End + // OpNestedAlternativeBegin/Next/End + // + // Generate code for when we backtrack back out of an alternative into + // a Begin or Next node, or when the entry input count check fails. If + // there are more alternatives we need to jump to the next alternative, + // if not we backtrack back out of the current set of parentheses. + // + // In the case of non-simple nested assertions we need to also link the + // 'return address' appropriately to backtrack back out into the correct + // alternative. + case OpSimpleNestedAlternativeBegin: + case OpSimpleNestedAlternativeNext: + case OpNestedAlternativeBegin: + case OpNestedAlternativeNext: { + YarrOp& nextOp = m_ops[op.m_nextOp]; + bool isBegin = op.m_previousOp == notFound; + bool isLastAlternative = nextOp.m_nextOp == notFound; + ASSERT(isBegin == (op.m_op == OpSimpleNestedAlternativeBegin || op.m_op == OpNestedAlternativeBegin)); + ASSERT(isLastAlternative == (nextOp.m_op == OpSimpleNestedAlternativeEnd || nextOp.m_op == OpNestedAlternativeEnd)); + + // Treat an input check failure the same as a failed match. + m_backtrackingState.append(op.m_jumps); + + // Set the backtracks to jump to the appropriate place. We may need + // to link the backtracks in one of three different way depending on + // the type of alternative we are dealing with: + // - A single alternative, with no simplings. + // - The last alternative of a set of two or more. + // - An alternative other than the last of a set of two or more. + // + // In the case of a single alternative on its own, we don't need to + // jump anywhere - if the alternative fails to match we can just + // continue to backtrack out of the parentheses without jumping. + // + // In the case of the last alternative in a set of more than one, we + // need to jump to return back out to the beginning. We'll do so by + // adding a jump to the End node's m_jumps list, and linking this + // when we come to generate the Begin node. For alternatives other + // than the last, we need to jump to the next alternative. + // + // If the alternative had adjusted the input position we must link + // backtracking to here, correct, and then jump on. If not we can + // link the backtracks directly to their destination. + if (op.m_checkAdjust) { + // Handle the cases where we need to link the backtracks here. + m_backtrackingState.link(this); + sub32(Imm32(op.m_checkAdjust), index); + if (!isLastAlternative) { + // An alternative that is not the last should jump to its successor. + jump(nextOp.m_reentry); + } else if (!isBegin) { + // The last of more than one alternatives must jump back to the begnning. + nextOp.m_jumps.append(jump()); + } else { + // A single alternative on its own can fall through. + m_backtrackingState.fallthrough(); + } + } else { + // Handle the cases where we can link the backtracks directly to their destinations. + if (!isLastAlternative) { + // An alternative that is not the last should jump to its successor. + m_backtrackingState.linkTo(nextOp.m_reentry, this); + } else if (!isBegin) { + // The last of more than one alternatives must jump back to the begnning. + m_backtrackingState.takeBacktracksToJumpList(nextOp.m_jumps, this); + } + // In the case of a single alternative on its own do nothing - it can fall through. + } + + // At this point we've handled the backtracking back into this node. + // Now link any backtracks that need to jump to here. + + // For non-simple alternatives, link the alternative's 'return address' + // so that we backtrack back out into the previous alternative. + if (op.m_op == OpNestedAlternativeNext) + m_backtrackingState.append(op.m_returnAddress); + + // If there is more than one alternative, then the last alternative will + // have planted a jump to be linked to the end. This jump was added to the + // End node's m_jumps list. If we are back at the beginning, link it here. + if (isBegin) { + YarrOp* endOp = &m_ops[op.m_nextOp]; + while (endOp->m_nextOp != notFound) { + ASSERT(endOp->m_op == OpSimpleNestedAlternativeNext || endOp->m_op == OpNestedAlternativeNext); + endOp = &m_ops[endOp->m_nextOp]; + } + ASSERT(endOp->m_op == OpSimpleNestedAlternativeEnd || endOp->m_op == OpNestedAlternativeEnd); + m_backtrackingState.append(endOp->m_jumps); + } + + if (!isBegin) { + YarrOp& lastOp = m_ops[op.m_previousOp]; + m_checked += lastOp.m_checkAdjust; + } + m_checked -= op.m_checkAdjust; + break; + } + case OpSimpleNestedAlternativeEnd: + case OpNestedAlternativeEnd: { + PatternTerm* term = op.m_term; + + // If we backtrack into the end of a simple subpattern do nothing; + // just continue through into the last alternative. If we backtrack + // into the end of a non-simple set of alterntives we need to jump + // to the backtracking return address set up during generation. + if (op.m_op == OpNestedAlternativeEnd) { + m_backtrackingState.link(this); + + // Plant a jump to the return address. + unsigned parenthesesFrameLocation = term->frameLocation; + unsigned alternativeFrameLocation = parenthesesFrameLocation; + if (term->quantityType != QuantifierFixedCount) + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; + loadFromFrameAndJump(alternativeFrameLocation); + + // Link the DataLabelPtr associated with the end of the last + // alternative to this point. + m_backtrackingState.append(op.m_returnAddress); + } + + YarrOp& lastOp = m_ops[op.m_previousOp]; + m_checked += lastOp.m_checkAdjust; + break; + } + + // OpParenthesesSubpatternOnceBegin/End + // + // When we are backtracking back out of a capturing subpattern we need + // to clear the start index in the matches output array, to record that + // this subpattern has not been captured. + // + // When backtracking back out of a Greedy quantified subpattern we need + // to catch this, and try running the remainder of the alternative after + // the subpattern again, skipping the parentheses. + // + // Upon backtracking back into a quantified set of parentheses we need to + // check whether we were currently skipping the subpattern. If not, we + // can backtrack into them, if we were we need to either backtrack back + // out of the start of the parentheses, or jump back to the forwards + // matching start, depending of whether the match is Greedy or NonGreedy. + case OpParenthesesSubpatternOnceBegin: { + PatternTerm* term = op.m_term; + ASSERT(term->quantityCount == 1); + + // We only need to backtrack to thispoint if capturing or greedy. + if (term->capture() || term->quantityType == QuantifierGreedy) { + m_backtrackingState.link(this); + + // If capturing, clear the capture (we only need to reset start). + if (term->capture()) + store32(TrustedImm32(-1), Address(output, (term->parentheses.subpatternId << 1) * sizeof(int))); + + // If Greedy, jump to the end. + if (term->quantityType == QuantifierGreedy) { + // Clear the flag in the stackframe indicating we ran through the subpattern. + unsigned parenthesesFrameLocation = term->frameLocation; + storeToFrame(TrustedImm32(-1), parenthesesFrameLocation); + // Jump to after the parentheses, skipping the subpattern. + jump(m_ops[op.m_nextOp].m_reentry); + // A backtrack from after the parentheses, when skipping the subpattern, + // will jump back to here. + op.m_jumps.link(this); + } + + m_backtrackingState.fallthrough(); + } + break; + } + case OpParenthesesSubpatternOnceEnd: { + PatternTerm* term = op.m_term; + + if (term->quantityType != QuantifierFixedCount) { + m_backtrackingState.link(this); + + // Check whether we should backtrack back into the parentheses, or if we + // are currently in a state where we had skipped over the subpattern + // (in which case the flag value on the stack will be -1). + unsigned parenthesesFrameLocation = term->frameLocation; + Jump hadSkipped = branch32(Equal, Address(stackPointerRegister, parenthesesFrameLocation * sizeof(void*)), TrustedImm32(-1)); + + if (term->quantityType == QuantifierGreedy) { + // For Greedy parentheses, we skip after having already tried going + // through the subpattern, so if we get here we're done. + YarrOp& beginOp = m_ops[op.m_previousOp]; + beginOp.m_jumps.append(hadSkipped); + } else { + // For NonGreedy parentheses, we try skipping the subpattern first, + // so if we get here we need to try running through the subpattern + // next. Jump back to the start of the parentheses in the forwards + // matching path. + ASSERT(term->quantityType == QuantifierNonGreedy); + YarrOp& beginOp = m_ops[op.m_previousOp]; + hadSkipped.linkTo(beginOp.m_reentry, this); + } + + m_backtrackingState.fallthrough(); + } + + m_backtrackingState.append(op.m_jumps); + break; + } + + // OpParenthesesSubpatternTerminalBegin/End + // + // Terminal subpatterns will always match - there is nothing after them to + // force a backtrack, and they have a minimum count of 0, and as such will + // always produce an acceptable result. + case OpParenthesesSubpatternTerminalBegin: { + // We will backtrack to this point once the subpattern cannot match any + // more. Since no match is accepted as a successful match (we are Greedy + // quantified with a minimum of zero) jump back to the forwards matching + // path at the end. + YarrOp& endOp = m_ops[op.m_nextOp]; + m_backtrackingState.linkTo(endOp.m_reentry, this); + break; + } + case OpParenthesesSubpatternTerminalEnd: + // We should never be backtracking to here (hence the 'terminal' in the name). + ASSERT(m_backtrackingState.isEmpty()); + m_backtrackingState.append(op.m_jumps); + break; + + // OpParentheticalAssertionBegin/End + case OpParentheticalAssertionBegin: { + PatternTerm* term = op.m_term; + YarrOp& endOp = m_ops[op.m_nextOp]; + + // We need to handle the backtracks upon backtracking back out + // of a parenthetical assertion if either we need to correct + // the input index, or the assertion was inverted. + if (op.m_checkAdjust || term->invert()) { + m_backtrackingState.link(this); + + if (op.m_checkAdjust) + add32(Imm32(op.m_checkAdjust), index); + + // In an inverted assertion failure to match the subpattern + // is treated as a successful match - jump to the end of the + // subpattern. We already have adjusted the input position + // back to that before the assertion, which is correct. + if (term->invert()) + jump(endOp.m_reentry); + + m_backtrackingState.fallthrough(); + } + + // The End node's jump list will contain any backtracks into + // the end of the assertion. Also, if inverted, we will have + // added the failure caused by a successful match to this. + m_backtrackingState.append(endOp.m_jumps); + + m_checked += op.m_checkAdjust; + break; + } + case OpParentheticalAssertionEnd: { + // FIXME: We should really be clearing any nested subpattern + // matches on bailing out from after the pattern. Firefox has + // this bug too (presumably because they use YARR!) + + // Never backtrack into an assertion; later failures bail to before the begin. + m_backtrackingState.takeBacktracksToJumpList(op.m_jumps, this); + + YarrOp& lastOp = m_ops[op.m_previousOp]; + m_checked -= lastOp.m_checkAdjust; + break; + } + + case OpMatchFailed: + break; + } + + } while (opIndex); + } + + // Compilation methods: + // ==================== + + // opCompileParenthesesSubpattern + // Emits ops for a subpattern (set of parentheses). These consist + // of a set of alternatives wrapped in an outer set of nodes for + // the parentheses. + // Supported types of parentheses are 'Once' (quantityCount == 1) + // and 'Terminal' (non-capturing parentheses quantified as greedy + // and infinite). + // Alternatives will use the 'Simple' set of ops if either the + // subpattern is terminal (in which case we will never need to + // backtrack), or if the subpattern only contains one alternative. + void opCompileParenthesesSubpattern(PatternTerm* term) + { + YarrOpCode parenthesesBeginOpCode; + YarrOpCode parenthesesEndOpCode; + YarrOpCode alternativeBeginOpCode = OpSimpleNestedAlternativeBegin; + YarrOpCode alternativeNextOpCode = OpSimpleNestedAlternativeNext; + YarrOpCode alternativeEndOpCode = OpSimpleNestedAlternativeEnd; + + // We can currently only compile quantity 1 subpatterns that are + // not copies. We generate a copy in the case of a range quantifier, + // e.g. /(?:x){3,9}/, or /(?:x)+/ (These are effectively expanded to + // /(?:x){3,3}(?:x){0,6}/ and /(?:x)(?:x)*/ repectively). The problem + // comes where the subpattern is capturing, in which case we would + // need to restore the capture from the first subpattern upon a + // failure in the second. + if (term->quantityCount == 1 && !term->parentheses.isCopy) { + // Select the 'Once' nodes. + parenthesesBeginOpCode = OpParenthesesSubpatternOnceBegin; + parenthesesEndOpCode = OpParenthesesSubpatternOnceEnd; + + // If there is more than one alternative we cannot use the 'simple' nodes. + if (term->parentheses.disjunction->m_alternatives.size() != 1) { + alternativeBeginOpCode = OpNestedAlternativeBegin; + alternativeNextOpCode = OpNestedAlternativeNext; + alternativeEndOpCode = OpNestedAlternativeEnd; + } + } else if (term->parentheses.isTerminal) { + // Select the 'Terminal' nodes. + parenthesesBeginOpCode = OpParenthesesSubpatternTerminalBegin; + parenthesesEndOpCode = OpParenthesesSubpatternTerminalEnd; + } else { + // This subpattern is not supported by the JIT. + m_shouldFallBack = true; + return; + } + + size_t parenBegin = m_ops.size(); + m_ops.append(parenthesesBeginOpCode); + + m_ops.append(alternativeBeginOpCode); + m_ops.last().m_previousOp = notFound; + m_ops.last().m_term = term; + Vector<PatternAlternative*>& alternatives = term->parentheses.disjunction->m_alternatives; + for (unsigned i = 0; i < alternatives.size(); ++i) { + size_t lastOpIndex = m_ops.size() - 1; + + PatternAlternative* nestedAlternative = alternatives[i]; + opCompileAlternative(nestedAlternative); + + size_t thisOpIndex = m_ops.size(); + m_ops.append(YarrOp(alternativeNextOpCode)); + + YarrOp& lastOp = m_ops[lastOpIndex]; + YarrOp& thisOp = m_ops[thisOpIndex]; + + lastOp.m_alternative = nestedAlternative; + lastOp.m_nextOp = thisOpIndex; + thisOp.m_previousOp = lastOpIndex; + thisOp.m_term = term; + } + YarrOp& lastOp = m_ops.last(); + ASSERT(lastOp.m_op == alternativeNextOpCode); + lastOp.m_op = alternativeEndOpCode; + lastOp.m_alternative = 0; + lastOp.m_nextOp = notFound; + + size_t parenEnd = m_ops.size(); + m_ops.append(parenthesesEndOpCode); + + m_ops[parenBegin].m_term = term; + m_ops[parenBegin].m_previousOp = notFound; + m_ops[parenBegin].m_nextOp = parenEnd; + m_ops[parenEnd].m_term = term; + m_ops[parenEnd].m_previousOp = parenBegin; + m_ops[parenEnd].m_nextOp = notFound; + } + + // opCompileParentheticalAssertion + // Emits ops for a parenthetical assertion. These consist of an + // OpSimpleNestedAlternativeBegin/Next/End set of nodes wrapping + // the alternatives, with these wrapped by an outer pair of + // OpParentheticalAssertionBegin/End nodes. + // We can always use the OpSimpleNestedAlternative nodes in the + // case of parenthetical assertions since these only ever match + // once, and will never backtrack back into the assertion. + void opCompileParentheticalAssertion(PatternTerm* term) + { + size_t parenBegin = m_ops.size(); + m_ops.append(OpParentheticalAssertionBegin); + + m_ops.append(OpSimpleNestedAlternativeBegin); + m_ops.last().m_previousOp = notFound; + m_ops.last().m_term = term; + Vector<PatternAlternative*>& alternatives = term->parentheses.disjunction->m_alternatives; + for (unsigned i = 0; i < alternatives.size(); ++i) { + size_t lastOpIndex = m_ops.size() - 1; + + PatternAlternative* nestedAlternative = alternatives[i]; + opCompileAlternative(nestedAlternative); + + size_t thisOpIndex = m_ops.size(); + m_ops.append(YarrOp(OpSimpleNestedAlternativeNext)); + + YarrOp& lastOp = m_ops[lastOpIndex]; + YarrOp& thisOp = m_ops[thisOpIndex]; + + lastOp.m_alternative = nestedAlternative; + lastOp.m_nextOp = thisOpIndex; + thisOp.m_previousOp = lastOpIndex; + thisOp.m_term = term; + } + YarrOp& lastOp = m_ops.last(); + ASSERT(lastOp.m_op == OpSimpleNestedAlternativeNext); + lastOp.m_op = OpSimpleNestedAlternativeEnd; + lastOp.m_alternative = 0; + lastOp.m_nextOp = notFound; + + size_t parenEnd = m_ops.size(); + m_ops.append(OpParentheticalAssertionEnd); + + m_ops[parenBegin].m_term = term; + m_ops[parenBegin].m_previousOp = notFound; + m_ops[parenBegin].m_nextOp = parenEnd; + m_ops[parenEnd].m_term = term; + m_ops[parenEnd].m_previousOp = parenBegin; + m_ops[parenEnd].m_nextOp = notFound; + } + + // opCompileAlternative + // Called to emit nodes for all terms in an alternative. + void opCompileAlternative(PatternAlternative* alternative) + { + optimizeAlternative(alternative); + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm* term = &alternative->m_terms[i]; + + switch (term->type) { + case PatternTerm::TypeParenthesesSubpattern: + opCompileParenthesesSubpattern(term); + break; + + case PatternTerm::TypeParentheticalAssertion: + opCompileParentheticalAssertion(term); + break; + + default: + m_ops.append(term); + } + } + } + + // opCompileBody + // This method compiles the body disjunction of the regular expression. + // The body consists of two sets of alternatives - zero or more 'once + // through' (BOL anchored) alternatives, followed by zero or more + // repeated alternatives. + // For each of these two sets of alteratives, if not empty they will be + // wrapped in a set of OpBodyAlternativeBegin/Next/End nodes (with the + // 'begin' node referencing the first alternative, and 'next' nodes + // referencing any further alternatives. The begin/next/end nodes are + // linked together in a doubly linked list. In the case of repeating + // alternatives, the end node is also linked back to the beginning. + // If no repeating alternatives exist, then a OpMatchFailed node exists + // to return the failing result. + void opCompileBody(PatternDisjunction* disjunction) + { + Vector<PatternAlternative*>& alternatives = disjunction->m_alternatives; + size_t currentAlternativeIndex = 0; + + // Emit the 'once through' alternatives. + if (alternatives.size() && alternatives[0]->onceThrough()) { + m_ops.append(YarrOp(OpBodyAlternativeBegin)); + m_ops.last().m_previousOp = notFound; + + do { + size_t lastOpIndex = m_ops.size() - 1; + PatternAlternative* alternative = alternatives[currentAlternativeIndex]; + opCompileAlternative(alternative); + + size_t thisOpIndex = m_ops.size(); + m_ops.append(YarrOp(OpBodyAlternativeNext)); + + YarrOp& lastOp = m_ops[lastOpIndex]; + YarrOp& thisOp = m_ops[thisOpIndex]; + + lastOp.m_alternative = alternative; + lastOp.m_nextOp = thisOpIndex; + thisOp.m_previousOp = lastOpIndex; + + ++currentAlternativeIndex; + } while (currentAlternativeIndex < alternatives.size() && alternatives[currentAlternativeIndex]->onceThrough()); + + YarrOp& lastOp = m_ops.last(); + + ASSERT(lastOp.m_op == OpBodyAlternativeNext); + lastOp.m_op = OpBodyAlternativeEnd; + lastOp.m_alternative = 0; + lastOp.m_nextOp = notFound; + } + + if (currentAlternativeIndex == alternatives.size()) { + m_ops.append(YarrOp(OpMatchFailed)); + return; + } + + // Emit the repeated alternatives. + size_t repeatLoop = m_ops.size(); + m_ops.append(YarrOp(OpBodyAlternativeBegin)); + m_ops.last().m_previousOp = notFound; + do { + size_t lastOpIndex = m_ops.size() - 1; + PatternAlternative* alternative = alternatives[currentAlternativeIndex]; + ASSERT(!alternative->onceThrough()); + opCompileAlternative(alternative); + + size_t thisOpIndex = m_ops.size(); + m_ops.append(YarrOp(OpBodyAlternativeNext)); + + YarrOp& lastOp = m_ops[lastOpIndex]; + YarrOp& thisOp = m_ops[thisOpIndex]; + + lastOp.m_alternative = alternative; + lastOp.m_nextOp = thisOpIndex; + thisOp.m_previousOp = lastOpIndex; + + ++currentAlternativeIndex; + } while (currentAlternativeIndex < alternatives.size()); + YarrOp& lastOp = m_ops.last(); + ASSERT(lastOp.m_op == OpBodyAlternativeNext); + lastOp.m_op = OpBodyAlternativeEnd; + lastOp.m_alternative = 0; + lastOp.m_nextOp = repeatLoop; + } + + void generateEnter() + { +#if CPU(X86_64) + push(X86Registers::ebp); + move(stackPointerRegister, X86Registers::ebp); + push(X86Registers::ebx); +#elif CPU(X86) + push(X86Registers::ebp); + move(stackPointerRegister, X86Registers::ebp); + // TODO: do we need spill registers to fill the output pointer if there are no sub captures? + push(X86Registers::ebx); + push(X86Registers::edi); + push(X86Registers::esi); + // load output into edi (2 = saved ebp + return address). + #if COMPILER(MSVC) + loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), input); + loadPtr(Address(X86Registers::ebp, 3 * sizeof(void*)), index); + loadPtr(Address(X86Registers::ebp, 4 * sizeof(void*)), length); + loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output); + #else + loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output); + #endif +#elif CPU(ARM) + push(ARMRegisters::r4); + push(ARMRegisters::r5); + push(ARMRegisters::r6); +#if CPU(ARM_TRADITIONAL) + push(ARMRegisters::r8); // scratch register +#endif + move(ARMRegisters::r3, output); +#elif CPU(SH4) + push(SH4Registers::r11); + push(SH4Registers::r13); +#elif CPU(MIPS) + // Do nothing. +#endif + } + + void generateReturn() + { +#if CPU(X86_64) + pop(X86Registers::ebx); + pop(X86Registers::ebp); +#elif CPU(X86) + pop(X86Registers::esi); + pop(X86Registers::edi); + pop(X86Registers::ebx); + pop(X86Registers::ebp); +#elif CPU(ARM) +#if CPU(ARM_TRADITIONAL) + pop(ARMRegisters::r8); // scratch register +#endif + pop(ARMRegisters::r6); + pop(ARMRegisters::r5); + pop(ARMRegisters::r4); +#elif CPU(SH4) + pop(SH4Registers::r13); + pop(SH4Registers::r11); +#elif CPU(MIPS) + // Do nothing +#endif + ret(); + } + +public: + YarrGenerator(YarrPattern& pattern, YarrCharSize charSize) + : m_pattern(pattern) + , m_charSize(charSize) + , m_charScale(m_charSize == Char8 ? TimesOne: TimesTwo) + , m_shouldFallBack(false) + , m_checked(0) + { + } + + void compile(JSGlobalData* globalData, YarrCodeBlock& jitObject) + { + generateEnter(); + + if (!m_pattern.m_body->m_hasFixedSize) + store32(index, Address(output)); + + if (m_pattern.m_body->m_callFrameSize) + subPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + + // Compile the pattern to the internal 'YarrOp' representation. + opCompileBody(m_pattern.m_body); + + // If we encountered anything we can't handle in the JIT code + // (e.g. backreferences) then return early. + if (m_shouldFallBack) { + jitObject.setFallBack(true); + return; + } + + generate(); + backtrack(); + + // Link & finalize the code. + LinkBuffer linkBuffer(*globalData, this); + m_backtrackingState.linkDataLabels(linkBuffer); + if (m_charSize == Char8) + jitObject.set8BitCode(linkBuffer.finalizeCode()); + else + jitObject.set16BitCode(linkBuffer.finalizeCode()); + jitObject.setFallBack(m_shouldFallBack); + } + +private: + YarrPattern& m_pattern; + + YarrCharSize m_charSize; + + Scale m_charScale; + + // Used to detect regular expression constructs that are not currently + // supported in the JIT; fall back to the interpreter when this is detected. + bool m_shouldFallBack; + + // The regular expression expressed as a linear sequence of operations. + Vector<YarrOp, 128> m_ops; + + // This records the current input offset being applied due to the current + // set of alternatives we are nested within. E.g. when matching the + // character 'b' within the regular expression /abc/, we will know that + // the minimum size for the alternative is 3, checked upon entry to the + // alternative, and that 'b' is at offset 1 from the start, and as such + // when matching 'b' we need to apply an offset of -2 to the load. + // + // FIXME: This should go away. Rather than tracking this value throughout + // code generation, we should gather this information up front & store it + // on the YarrOp structure. + int m_checked; + + // This class records state whilst generating the backtracking path of code. + BacktrackingState m_backtrackingState; +}; + +void jitCompile(YarrPattern& pattern, YarrCharSize charSize, JSGlobalData* globalData, YarrCodeBlock& jitObject) +{ + YarrGenerator(pattern, charSize).compile(globalData, jitObject); +} + +}} + +#endif diff --git a/Source/JavaScriptCore/yarr/YarrJIT.h b/Source/JavaScriptCore/yarr/YarrJIT.h new file mode 100644 index 000000000..38ae76cc4 --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrJIT.h @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrJIT_h +#define YarrJIT_h + +#if ENABLE(YARR_JIT) + +#include "JSGlobalData.h" +#include "MacroAssembler.h" +#include "UString.h" +#include "Yarr.h" +#include "YarrPattern.h" + +#if CPU(X86) && !COMPILER(MSVC) +#define YARR_CALL __attribute__ ((regparm (3))) +#else +#define YARR_CALL +#endif + +namespace JSC { + +class JSGlobalData; +class ExecutablePool; + +namespace Yarr { + +class YarrCodeBlock { + typedef int (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + typedef int (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + +public: + YarrCodeBlock() + : m_needFallBack(false) + { + } + + ~YarrCodeBlock() + { + } + + void setFallBack(bool fallback) { m_needFallBack = fallback; } + bool isFallBack() { return m_needFallBack; } + bool has8BitCode() { return m_ref8.size(); } + bool has16BitCode() { return m_ref16.size(); } + void set8BitCode(MacroAssembler::CodeRef ref) { m_ref8 = ref; } + void set16BitCode(MacroAssembler::CodeRef ref) { m_ref16 = ref; } + + int execute(const LChar* input, unsigned start, unsigned length, int* output) + { + ASSERT(has8BitCode()); + return reinterpret_cast<YarrJITCode8>(m_ref8.code().executableAddress())(input, start, length, output); + } + + int execute(const UChar* input, unsigned start, unsigned length, int* output) + { + ASSERT(has16BitCode()); + return reinterpret_cast<YarrJITCode16>(m_ref16.code().executableAddress())(input, start, length, output); + } +#if ENABLE(REGEXP_TRACING) + void *getAddr() { return m_ref.code().executableAddress(); } +#endif + +private: + MacroAssembler::CodeRef m_ref8; + MacroAssembler::CodeRef m_ref16; + bool m_needFallBack; +}; + +void jitCompile(YarrPattern&, YarrCharSize, JSGlobalData*, YarrCodeBlock& jitObject); + +inline int execute(YarrCodeBlock& jitObject, const LChar* input, unsigned start, unsigned length, int* output) +{ + return jitObject.execute(input, start, length, output); +} + +inline int execute(YarrCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output) +{ + return jitObject.execute(input, start, length, output); +} + +} } // namespace JSC::Yarr + +#endif + +#endif // YarrJIT_h diff --git a/Source/JavaScriptCore/yarr/YarrParser.h b/Source/JavaScriptCore/yarr/YarrParser.h new file mode 100644 index 000000000..e6694560a --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrParser.h @@ -0,0 +1,874 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrParser_h +#define YarrParser_h + +#include <runtime/UString.h> +#include "Yarr.h" +#include <wtf/ASCIICType.h> +#include <wtf/unicode/Unicode.h> + +namespace JSC { namespace Yarr { + +#define REGEXP_ERROR_PREFIX "Invalid regular expression: " + +enum BuiltInCharacterClassID { + DigitClassID, + SpaceClassID, + WordClassID, + NewlineClassID, +}; + +// The Parser class should not be used directly - only via the Yarr::parse() method. +template<class Delegate, typename CharType> +class Parser { +private: + template<class FriendDelegate> + friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); + + enum ErrorCode { + NoError, + PatternTooLarge, + QuantifierOutOfOrder, + QuantifierWithoutAtom, + MissingParentheses, + ParenthesesUnmatched, + ParenthesesTypeInvalid, + CharacterClassUnmatched, + CharacterClassOutOfOrder, + EscapeUnterminated, + NumberOfErrorCodes + }; + + /* + * CharacterClassParserDelegate: + * + * The class CharacterClassParserDelegate is used in the parsing of character + * classes. This class handles detection of character ranges. This class + * implements enough of the delegate interface such that it can be passed to + * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused + * to perform the parsing of escape characters in character sets. + */ + class CharacterClassParserDelegate { + public: + CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) + : m_delegate(delegate) + , m_err(err) + , m_state(Empty) + , m_character(0) + { + } + + /* + * begin(): + * + * Called at beginning of construction. + */ + void begin(bool invert) + { + m_delegate.atomCharacterClassBegin(invert); + } + + /* + * atomPatternCharacter(): + * + * This method is called either from parseCharacterClass() (for an unescaped + * character in a character class), or from parseEscape(). In the former case + * the value true will be passed for the argument 'hyphenIsRange', and in this + * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ + * is different to /[a\-z]/). + */ + void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) + { + switch (m_state) { + case AfterCharacterClass: + // Following a builtin character class we need look out for a hyphen. + // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. + // If we see a hyphen following a charater class then unlike usual + // we'll report it to the delegate immediately, and put ourself into + // a poisoned state. Any following calls to add another character or + // character class will result in an error. (A hypen following a + // character-class is itself valid, but only at the end of a regex). + if (hyphenIsRange && ch == '-') { + m_delegate.atomCharacterClassAtom('-'); + m_state = AfterCharacterClassHyphen; + return; + } + // Otherwise just fall through - cached character so treat this as Empty. + + case Empty: + m_character = ch; + m_state = CachedCharacter; + return; + + case CachedCharacter: + if (hyphenIsRange && ch == '-') + m_state = CachedCharacterHyphen; + else { + m_delegate.atomCharacterClassAtom(m_character); + m_character = ch; + } + return; + + case CachedCharacterHyphen: + if (ch < m_character) { + m_err = CharacterClassOutOfOrder; + return; + } + m_delegate.atomCharacterClassRange(m_character, ch); + m_state = Empty; + return; + + // See coment in atomBuiltInCharacterClass below. + // This too is technically an error, per ECMA-262, and again we + // we chose to allow this. Note a subtlely here that while we + // diverge from the spec's definition of CharacterRange we do + // remain in compliance with the grammar. For example, consider + // the expression /[\d-a-z]/. We comply with the grammar in + // this case by not allowing a-z to be matched as a range. + case AfterCharacterClassHyphen: + m_delegate.atomCharacterClassAtom(ch); + m_state = Empty; + return; + } + } + + /* + * atomBuiltInCharacterClass(): + * + * Adds a built-in character class, called by parseEscape(). + */ + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) + { + switch (m_state) { + case CachedCharacter: + // Flush the currently cached character, then fall through. + m_delegate.atomCharacterClassAtom(m_character); + + case Empty: + case AfterCharacterClass: + m_state = AfterCharacterClass; + m_delegate.atomCharacterClassBuiltIn(classID, invert); + return; + + // If we hit either of these cases, we have an invalid range that + // looks something like /[x-\d]/ or /[\d-\d]/. + // According to ECMA-262 this should be a syntax error, but + // empirical testing shows this to break teh webz. Instead we + // comply with to the ECMA-262 grammar, and assume the grammar to + // have matched the range correctly, but tweak our interpretation + // of CharacterRange. Effectively we implicitly handle the hyphen + // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. + case CachedCharacterHyphen: + m_delegate.atomCharacterClassAtom(m_character); + m_delegate.atomCharacterClassAtom('-'); + // fall through + case AfterCharacterClassHyphen: + m_delegate.atomCharacterClassBuiltIn(classID, invert); + m_state = Empty; + return; + } + } + + /* + * end(): + * + * Called at end of construction. + */ + void end() + { + if (m_state == CachedCharacter) + m_delegate.atomCharacterClassAtom(m_character); + else if (m_state == CachedCharacterHyphen) { + m_delegate.atomCharacterClassAtom(m_character); + m_delegate.atomCharacterClassAtom('-'); + } + m_delegate.atomCharacterClassEnd(); + } + + // parseEscape() should never call these delegate methods when + // invoked with inCharacterClass set. + NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } + NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } + + private: + Delegate& m_delegate; + ErrorCode& m_err; + enum CharacterClassConstructionState { + Empty, + CachedCharacter, + CachedCharacterHyphen, + AfterCharacterClass, + AfterCharacterClassHyphen, + } m_state; + UChar m_character; + }; + + Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) + : m_delegate(delegate) + , m_backReferenceLimit(backReferenceLimit) + , m_err(NoError) + , m_data(pattern.getCharacters<CharType>()) + , m_size(pattern.length()) + , m_index(0) + , m_parenthesesNestingDepth(0) + { + } + + /* + * parseEscape(): + * + * Helper for parseTokens() AND parseCharacterClass(). + * Unlike the other parser methods, this function does not report tokens + * directly to the member delegate (m_delegate), instead tokens are + * emitted to the delegate provided as an argument. In the case of atom + * escapes, parseTokens() will call parseEscape() passing m_delegate as + * an argument, and as such the escape will be reported to the delegate. + * + * However this method may also be used by parseCharacterClass(), in which + * case a CharacterClassParserDelegate will be passed as the delegate that + * tokens should be added to. A boolean flag is also provided to indicate + * whether that an escape in a CharacterClass is being parsed (some parsing + * rules change in this context). + * + * The boolean value returned by this method indicates whether the token + * parsed was an atom (outside of a characted class \b and \B will be + * interpreted as assertions). + */ + template<bool inCharacterClass, class EscapeDelegate> + bool parseEscape(EscapeDelegate& delegate) + { + ASSERT(!m_err); + ASSERT(peek() == '\\'); + consume(); + + if (atEndOfPattern()) { + m_err = EscapeUnterminated; + return false; + } + + switch (peek()) { + // Assertions + case 'b': + consume(); + if (inCharacterClass) + delegate.atomPatternCharacter('\b'); + else { + delegate.assertionWordBoundary(false); + return false; + } + break; + case 'B': + consume(); + if (inCharacterClass) + delegate.atomPatternCharacter('B'); + else { + delegate.assertionWordBoundary(true); + return false; + } + break; + + // CharacterClassEscape + case 'd': + consume(); + delegate.atomBuiltInCharacterClass(DigitClassID, false); + break; + case 's': + consume(); + delegate.atomBuiltInCharacterClass(SpaceClassID, false); + break; + case 'w': + consume(); + delegate.atomBuiltInCharacterClass(WordClassID, false); + break; + case 'D': + consume(); + delegate.atomBuiltInCharacterClass(DigitClassID, true); + break; + case 'S': + consume(); + delegate.atomBuiltInCharacterClass(SpaceClassID, true); + break; + case 'W': + consume(); + delegate.atomBuiltInCharacterClass(WordClassID, true); + break; + + // DecimalEscape + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. + // First, try to parse this as backreference. + if (!inCharacterClass) { + ParseState state = saveState(); + + unsigned backReference = consumeNumber(); + if (backReference <= m_backReferenceLimit) { + delegate.atomBackReference(backReference); + break; + } + + restoreState(state); + } + + // Not a backreference, and not octal. + if (peek() >= '8') { + delegate.atomPatternCharacter('\\'); + break; + } + + // Fall-through to handle this as an octal escape. + } + + // Octal escape + case '0': + delegate.atomPatternCharacter(consumeOctal()); + break; + + // ControlEscape + case 'f': + consume(); + delegate.atomPatternCharacter('\f'); + break; + case 'n': + consume(); + delegate.atomPatternCharacter('\n'); + break; + case 'r': + consume(); + delegate.atomPatternCharacter('\r'); + break; + case 't': + consume(); + delegate.atomPatternCharacter('\t'); + break; + case 'v': + consume(); + delegate.atomPatternCharacter('\v'); + break; + + // ControlLetter + case 'c': { + ParseState state = saveState(); + consume(); + if (!atEndOfPattern()) { + int control = consume(); + + // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. + if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { + delegate.atomPatternCharacter(control & 0x1f); + break; + } + } + restoreState(state); + delegate.atomPatternCharacter('\\'); + break; + } + + // HexEscape + case 'x': { + consume(); + int x = tryConsumeHex(2); + if (x == -1) + delegate.atomPatternCharacter('x'); + else + delegate.atomPatternCharacter(x); + break; + } + + // UnicodeEscape + case 'u': { + consume(); + int u = tryConsumeHex(4); + if (u == -1) + delegate.atomPatternCharacter('u'); + else + delegate.atomPatternCharacter(u); + break; + } + + // IdentityEscape + default: + delegate.atomPatternCharacter(consume()); + } + + return true; + } + + /* + * parseAtomEscape(), parseCharacterClassEscape(): + * + * These methods alias to parseEscape(). + */ + bool parseAtomEscape() + { + return parseEscape<false>(m_delegate); + } + void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) + { + parseEscape<true>(delegate); + } + + /* + * parseCharacterClass(): + * + * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) + * to an instance of CharacterClassParserDelegate, to describe the character class to the + * delegate. + */ + void parseCharacterClass() + { + ASSERT(!m_err); + ASSERT(peek() == '['); + consume(); + + CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); + + characterClassConstructor.begin(tryConsume('^')); + + while (!atEndOfPattern()) { + switch (peek()) { + case ']': + consume(); + characterClassConstructor.end(); + return; + + case '\\': + parseCharacterClassEscape(characterClassConstructor); + break; + + default: + characterClassConstructor.atomPatternCharacter(consume(), true); + } + + if (m_err) + return; + } + + m_err = CharacterClassUnmatched; + } + + /* + * parseParenthesesBegin(): + * + * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. + */ + void parseParenthesesBegin() + { + ASSERT(!m_err); + ASSERT(peek() == '('); + consume(); + + if (tryConsume('?')) { + if (atEndOfPattern()) { + m_err = ParenthesesTypeInvalid; + return; + } + + switch (consume()) { + case ':': + m_delegate.atomParenthesesSubpatternBegin(false); + break; + + case '=': + m_delegate.atomParentheticalAssertionBegin(); + break; + + case '!': + m_delegate.atomParentheticalAssertionBegin(true); + break; + + default: + m_err = ParenthesesTypeInvalid; + } + } else + m_delegate.atomParenthesesSubpatternBegin(); + + ++m_parenthesesNestingDepth; + } + + /* + * parseParenthesesEnd(): + * + * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). + */ + void parseParenthesesEnd() + { + ASSERT(!m_err); + ASSERT(peek() == ')'); + consume(); + + if (m_parenthesesNestingDepth > 0) + m_delegate.atomParenthesesEnd(); + else + m_err = ParenthesesUnmatched; + + --m_parenthesesNestingDepth; + } + + /* + * parseQuantifier(): + * + * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. + */ + void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) + { + ASSERT(!m_err); + ASSERT(min <= max); + + if (lastTokenWasAnAtom) + m_delegate.quantifyAtom(min, max, !tryConsume('?')); + else + m_err = QuantifierWithoutAtom; + } + + /* + * parseTokens(): + * + * This method loops over the input pattern reporting tokens to the delegate. + * The method returns when a parse error is detected, or the end of the pattern + * is reached. One piece of state is tracked around the loop, which is whether + * the last token passed to the delegate was an atom (this is necessary to detect + * a parse error when a quantifier provided without an atom to quantify). + */ + void parseTokens() + { + bool lastTokenWasAnAtom = false; + + while (!atEndOfPattern()) { + switch (peek()) { + case '|': + consume(); + m_delegate.disjunction(); + lastTokenWasAnAtom = false; + break; + + case '(': + parseParenthesesBegin(); + lastTokenWasAnAtom = false; + break; + + case ')': + parseParenthesesEnd(); + lastTokenWasAnAtom = true; + break; + + case '^': + consume(); + m_delegate.assertionBOL(); + lastTokenWasAnAtom = false; + break; + + case '$': + consume(); + m_delegate.assertionEOL(); + lastTokenWasAnAtom = false; + break; + + case '.': + consume(); + m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); + lastTokenWasAnAtom = true; + break; + + case '[': + parseCharacterClass(); + lastTokenWasAnAtom = true; + break; + + case '\\': + lastTokenWasAnAtom = parseAtomEscape(); + break; + + case '*': + consume(); + parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); + lastTokenWasAnAtom = false; + break; + + case '+': + consume(); + parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); + lastTokenWasAnAtom = false; + break; + + case '?': + consume(); + parseQuantifier(lastTokenWasAnAtom, 0, 1); + lastTokenWasAnAtom = false; + break; + + case '{': { + ParseState state = saveState(); + + consume(); + if (peekIsDigit()) { + unsigned min = consumeNumber(); + unsigned max = min; + + if (tryConsume(',')) + max = peekIsDigit() ? consumeNumber() : quantifyInfinite; + + if (tryConsume('}')) { + if (min <= max) + parseQuantifier(lastTokenWasAnAtom, min, max); + else + m_err = QuantifierOutOfOrder; + lastTokenWasAnAtom = false; + break; + } + } + + restoreState(state); + } // if we did not find a complete quantifer, fall through to the default case. + + default: + m_delegate.atomPatternCharacter(consume()); + lastTokenWasAnAtom = true; + } + + if (m_err) + return; + } + + if (m_parenthesesNestingDepth > 0) + m_err = MissingParentheses; + } + + /* + * parse(): + * + * This method calls parseTokens() to parse over the input and converts any + * error code to a const char* for a result. + */ + const char* parse() + { + if (m_size > MAX_PATTERN_SIZE) + m_err = PatternTooLarge; + else + parseTokens(); + ASSERT(atEndOfPattern() || m_err); + + // The order of this array must match the ErrorCode enum. + static const char* errorMessages[NumberOfErrorCodes] = { + 0, // NoError + REGEXP_ERROR_PREFIX "regular expression too large", + REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", + REGEXP_ERROR_PREFIX "nothing to repeat", + REGEXP_ERROR_PREFIX "missing )", + REGEXP_ERROR_PREFIX "unmatched parentheses", + REGEXP_ERROR_PREFIX "unrecognized character after (?", + REGEXP_ERROR_PREFIX "missing terminating ] for character class", + REGEXP_ERROR_PREFIX "range out of order in character class", + REGEXP_ERROR_PREFIX "\\ at end of pattern" + }; + + return errorMessages[m_err]; + } + + + // Misc helper functions: + + typedef unsigned ParseState; + + ParseState saveState() + { + return m_index; + } + + void restoreState(ParseState state) + { + m_index = state; + } + + bool atEndOfPattern() + { + ASSERT(m_index <= m_size); + return m_index == m_size; + } + + int peek() + { + ASSERT(m_index < m_size); + return m_data[m_index]; + } + + bool peekIsDigit() + { + return !atEndOfPattern() && WTF::isASCIIDigit(peek()); + } + + unsigned peekDigit() + { + ASSERT(peekIsDigit()); + return peek() - '0'; + } + + int consume() + { + ASSERT(m_index < m_size); + return m_data[m_index++]; + } + + unsigned consumeDigit() + { + ASSERT(peekIsDigit()); + return consume() - '0'; + } + + unsigned consumeNumber() + { + unsigned n = consumeDigit(); + // check for overflow. + for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { + n = newValue; + consume(); + } + return n; + } + + unsigned consumeOctal() + { + ASSERT(WTF::isASCIIOctalDigit(peek())); + + unsigned n = consumeDigit(); + while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) + n = n * 8 + consumeDigit(); + return n; + } + + bool tryConsume(UChar ch) + { + if (atEndOfPattern() || (m_data[m_index] != ch)) + return false; + ++m_index; + return true; + } + + int tryConsumeHex(int count) + { + ParseState state = saveState(); + + int n = 0; + while (count--) { + if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { + restoreState(state); + return -1; + } + n = (n << 4) | WTF::toASCIIHexValue(consume()); + } + return n; + } + + Delegate& m_delegate; + unsigned m_backReferenceLimit; + ErrorCode m_err; + const CharType* m_data; + unsigned m_size; + unsigned m_index; + unsigned m_parenthesesNestingDepth; + + // Derived by empirical testing of compile time in PCRE and WREC. + static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; +}; + +/* + * Yarr::parse(): + * + * The parse method is passed a pattern to be parsed and a delegate upon which + * callbacks will be made to record the parsed tokens forming the regex. + * Yarr::parse() returns null on success, or a const C string providing an error + * message where a parse error occurs. + * + * The Delegate must implement the following interface: + * + * void assertionBOL(); + * void assertionEOL(); + * void assertionWordBoundary(bool invert); + * + * void atomPatternCharacter(UChar ch); + * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); + * void atomCharacterClassBegin(bool invert) + * void atomCharacterClassAtom(UChar ch) + * void atomCharacterClassRange(UChar begin, UChar end) + * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) + * void atomCharacterClassEnd() + * void atomParenthesesSubpatternBegin(bool capture = true); + * void atomParentheticalAssertionBegin(bool invert = false); + * void atomParenthesesEnd(); + * void atomBackReference(unsigned subpatternId); + * + * void quantifyAtom(unsigned min, unsigned max, bool greedy); + * + * void disjunction(); + * + * The regular expression is described by a sequence of assertion*() and atom*() + * callbacks to the delegate, describing the terms in the regular expression. + * Following an atom a quantifyAtom() call may occur to indicate that the previous + * atom should be quantified. In the case of atoms described across multiple + * calls (parentheses and character classes) the call to quantifyAtom() will come + * after the call to the atom*End() method, never after atom*Begin(). + * + * Character classes may either be described by a single call to + * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. + * In the latter case, ...Begin() will be called, followed by a sequence of + * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). + * + * Sequences of atoms and assertions are broken into alternatives via calls to + * disjunction(). Assertions, atoms, and disjunctions emitted between calls to + * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. + * atomParenthesesBegin() is passed a subpatternId. In the case of a regular + * capturing subpattern, this will be the subpatternId associated with these + * parentheses, and will also by definition be the lowest subpatternId of these + * parentheses and of any nested paretheses. The atomParenthesesEnd() method + * is passed the subpatternId of the last capturing subexpression nested within + * these paretheses. In the case of a capturing subpattern with no nested + * capturing subpatterns, the same subpatternId will be passed to the begin and + * end functions. In the case of non-capturing subpatterns the subpatternId + * passed to the begin method is also the first possible subpatternId that might + * be nested within these paretheses. If a set of non-capturing parentheses does + * not contain any capturing subpatterns, then the subpatternId passed to begin + * will be greater than the subpatternId passed to end. + */ + +template<class Delegate> +const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite) +{ + if (pattern.is8Bit()) + return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); + return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); +} + +} } // namespace JSC::Yarr + +#endif // YarrParser_h diff --git a/Source/JavaScriptCore/yarr/YarrPattern.cpp b/Source/JavaScriptCore/yarr/YarrPattern.cpp new file mode 100644 index 000000000..1043e405d --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrPattern.cpp @@ -0,0 +1,821 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrPattern.h" + +#include "Yarr.h" +#include "YarrParser.h" +#include <wtf/Vector.h> + +using namespace WTF; + +namespace JSC { namespace Yarr { + +#include "RegExpJitTables.h" + +class CharacterClassConstructor { +public: + CharacterClassConstructor(bool isCaseInsensitive = false) + : m_isCaseInsensitive(isCaseInsensitive) + { + } + + void reset() + { + m_matches.clear(); + m_ranges.clear(); + m_matchesUnicode.clear(); + m_rangesUnicode.clear(); + } + + void append(const CharacterClass* other) + { + for (size_t i = 0; i < other->m_matches.size(); ++i) + addSorted(m_matches, other->m_matches[i]); + for (size_t i = 0; i < other->m_ranges.size(); ++i) + addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); + for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) + addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); + for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) + addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); + } + + void putChar(UChar ch) + { + if (ch <= 0x7f) { + if (m_isCaseInsensitive && isASCIIAlpha(ch)) { + addSorted(m_matches, toASCIIUpper(ch)); + addSorted(m_matches, toASCIILower(ch)); + } else + addSorted(m_matches, ch); + } else { + UChar upper, lower; + if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { + addSorted(m_matchesUnicode, upper); + addSorted(m_matchesUnicode, lower); + } else + addSorted(m_matchesUnicode, ch); + } + } + + // returns true if this character has another case, and 'ch' is the upper case form. + static inline bool isUnicodeUpper(UChar ch) + { + return ch != Unicode::toLower(ch); + } + + // returns true if this character has another case, and 'ch' is the lower case form. + static inline bool isUnicodeLower(UChar ch) + { + return ch != Unicode::toUpper(ch); + } + + void putRange(UChar lo, UChar hi) + { + if (lo <= 0x7f) { + char asciiLo = lo; + char asciiHi = std::min(hi, (UChar)0x7f); + addSortedRange(m_ranges, lo, asciiHi); + + if (m_isCaseInsensitive) { + if ((asciiLo <= 'Z') && (asciiHi >= 'A')) + addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); + if ((asciiLo <= 'z') && (asciiHi >= 'a')) + addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); + } + } + if (hi >= 0x80) { + uint32_t unicodeCurr = std::max(lo, (UChar)0x80); + addSortedRange(m_rangesUnicode, unicodeCurr, hi); + + if (m_isCaseInsensitive) { + while (unicodeCurr <= hi) { + // If the upper bound of the range (hi) is 0xffff, the increments to + // unicodeCurr in this loop may take it to 0x10000. This is fine + // (if so we won't re-enter the loop, since the loop condition above + // will definitely fail) - but this does mean we cannot use a UChar + // to represent unicodeCurr, we must use a 32-bit value instead. + ASSERT(unicodeCurr <= 0xffff); + + if (isUnicodeUpper(unicodeCurr)) { + UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); + UChar lowerCaseRangeEnd = lowerCaseRangeBegin; + while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) + lowerCaseRangeEnd++; + addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); + } else if (isUnicodeLower(unicodeCurr)) { + UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); + UChar upperCaseRangeEnd = upperCaseRangeBegin; + while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) + upperCaseRangeEnd++; + addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); + } else + ++unicodeCurr; + } + } + } + } + + CharacterClass* charClass() + { + CharacterClass* characterClass = new CharacterClass(0); + + characterClass->m_matches.swap(m_matches); + characterClass->m_ranges.swap(m_ranges); + characterClass->m_matchesUnicode.swap(m_matchesUnicode); + characterClass->m_rangesUnicode.swap(m_rangesUnicode); + + return characterClass; + } + +private: + void addSorted(Vector<UChar>& matches, UChar ch) + { + unsigned pos = 0; + unsigned range = matches.size(); + + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; + + int val = matches[pos+index] - ch; + if (!val) + return; + else if (val > 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } + + if (pos == matches.size()) + matches.append(ch); + else + matches.insert(pos, ch); + } + + void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) + { + unsigned end = ranges.size(); + + // Simple linear scan - I doubt there are that many ranges anyway... + // feel free to fix this with something faster (eg binary chop). + for (unsigned i = 0; i < end; ++i) { + // does the new range fall before the current position in the array + if (hi < ranges[i].begin) { + // optional optimization: concatenate appending ranges? - may not be worthwhile. + if (hi == (ranges[i].begin - 1)) { + ranges[i].begin = lo; + return; + } + ranges.insert(i, CharacterRange(lo, hi)); + return; + } + // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining + // If the new range start at or before the end of the last range, then the overlap (if it starts one after the + // end of the last range they concatenate, which is just as good. + if (lo <= (ranges[i].end + 1)) { + // found an intersect! we'll replace this entry in the array. + ranges[i].begin = std::min(ranges[i].begin, lo); + ranges[i].end = std::max(ranges[i].end, hi); + + // now check if the new range can subsume any subsequent ranges. + unsigned next = i+1; + // each iteration of the loop we will either remove something from the list, or break the loop. + while (next < ranges.size()) { + if (ranges[next].begin <= (ranges[i].end + 1)) { + // the next entry now overlaps / concatenates this one. + ranges[i].end = std::max(ranges[i].end, ranges[next].end); + ranges.remove(next); + } else + break; + } + + return; + } + } + + // CharacterRange comes after all existing ranges. + ranges.append(CharacterRange(lo, hi)); + } + + bool m_isCaseInsensitive; + + Vector<UChar> m_matches; + Vector<CharacterRange> m_ranges; + Vector<UChar> m_matchesUnicode; + Vector<CharacterRange> m_rangesUnicode; +}; + +class YarrPatternConstructor { +public: + YarrPatternConstructor(YarrPattern& pattern) + : m_pattern(pattern) + , m_characterClassConstructor(pattern.m_ignoreCase) + , m_invertParentheticalAssertion(false) + { + m_pattern.m_body = new PatternDisjunction(); + m_alternative = m_pattern.m_body->addNewAlternative(); + m_pattern.m_disjunctions.append(m_pattern.m_body); + } + + ~YarrPatternConstructor() + { + } + + void reset() + { + m_pattern.reset(); + m_characterClassConstructor.reset(); + + m_pattern.m_body = new PatternDisjunction(); + m_alternative = m_pattern.m_body->addNewAlternative(); + m_pattern.m_disjunctions.append(m_pattern.m_body); + } + + void assertionBOL() + { + if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { + m_alternative->m_startsWithBOL = true; + m_alternative->m_containsBOL = true; + m_pattern.m_containsBOL = true; + } + m_alternative->m_terms.append(PatternTerm::BOL()); + } + void assertionEOL() + { + m_alternative->m_terms.append(PatternTerm::EOL()); + } + void assertionWordBoundary(bool invert) + { + m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); + } + + void atomPatternCharacter(UChar ch) + { + // We handle case-insensitive checking of unicode characters which do have both + // cases by handling them as if they were defined using a CharacterClass. + if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { + atomCharacterClassBegin(); + atomCharacterClassAtom(ch); + atomCharacterClassEnd(); + } else + m_alternative->m_terms.append(PatternTerm(ch)); + } + + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) + { + switch (classID) { + case DigitClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); + break; + case SpaceClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); + break; + case WordClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); + break; + case NewlineClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); + break; + } + } + + void atomCharacterClassBegin(bool invert = false) + { + m_invertCharacterClass = invert; + } + + void atomCharacterClassAtom(UChar ch) + { + m_characterClassConstructor.putChar(ch); + } + + void atomCharacterClassRange(UChar begin, UChar end) + { + m_characterClassConstructor.putRange(begin, end); + } + + void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) + { + ASSERT(classID != NewlineClassID); + + switch (classID) { + case DigitClassID: + m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); + break; + + case SpaceClassID: + m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); + break; + + case WordClassID: + m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); + break; + + default: + ASSERT_NOT_REACHED(); + } + } + + void atomCharacterClassEnd() + { + CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); + m_pattern.m_userCharacterClasses.append(newCharacterClass); + m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); + } + + void atomParenthesesSubpatternBegin(bool capture = true) + { + unsigned subpatternId = m_pattern.m_numSubpatterns + 1; + if (capture) + m_pattern.m_numSubpatterns++; + + PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); + m_pattern.m_disjunctions.append(parenthesesDisjunction); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false)); + m_alternative = parenthesesDisjunction->addNewAlternative(); + } + + void atomParentheticalAssertionBegin(bool invert = false) + { + PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); + m_pattern.m_disjunctions.append(parenthesesDisjunction); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert)); + m_alternative = parenthesesDisjunction->addNewAlternative(); + m_invertParentheticalAssertion = invert; + } + + void atomParenthesesEnd() + { + ASSERT(m_alternative->m_parent); + ASSERT(m_alternative->m_parent->m_parent); + + PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent; + m_alternative = m_alternative->m_parent->m_parent; + + PatternTerm& lastTerm = m_alternative->lastTerm(); + + unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); + unsigned numBOLAnchoredAlts = 0; + + for (unsigned i = 0; i < numParenAlternatives; i++) { + // Bubble up BOL flags + if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL) + numBOLAnchoredAlts++; + } + + if (numBOLAnchoredAlts) { + m_alternative->m_containsBOL = true; + // If all the alternatives in parens start with BOL, then so does this one + if (numBOLAnchoredAlts == numParenAlternatives) + m_alternative->m_startsWithBOL = true; + } + + lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; + m_invertParentheticalAssertion = false; + } + + void atomBackReference(unsigned subpatternId) + { + ASSERT(subpatternId); + m_pattern.m_containsBackreferences = true; + m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); + + if (subpatternId > m_pattern.m_numSubpatterns) { + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + return; + } + + PatternAlternative* currentAlternative = m_alternative; + ASSERT(currentAlternative); + + // Note to self: if we waited until the AST was baked, we could also remove forwards refs + while ((currentAlternative = currentAlternative->m_parent->m_parent)) { + PatternTerm& term = currentAlternative->lastTerm(); + ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); + + if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) { + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + return; + } + } + + m_alternative->m_terms.append(PatternTerm(subpatternId)); + } + + // deep copy the argument disjunction. If filterStartsWithBOL is true, + // skip alternatives with m_startsWithBOL set true. + PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) + { + PatternDisjunction* newDisjunction = 0; + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { + if (!newDisjunction) { + newDisjunction = new PatternDisjunction(); + newDisjunction->m_parent = disjunction->m_parent; + } + PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) + newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); + } + } + + if (newDisjunction) + m_pattern.m_disjunctions.append(newDisjunction); + return newDisjunction; + } + + PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) + { + if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) + return PatternTerm(term); + + PatternTerm termCopy = term; + termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); + return termCopy; + } + + void quantifyAtom(unsigned min, unsigned max, bool greedy) + { + ASSERT(min <= max); + ASSERT(m_alternative->m_terms.size()); + + if (!max) { + m_alternative->removeLastTerm(); + return; + } + + PatternTerm& term = m_alternative->lastTerm(); + ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); + ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); + + // For any assertion with a zero minimum, not matching is valid and has no effect, + // remove it. Otherwise, we need to match as least once, but there is no point + // matching more than once, so remove the quantifier. It is not entirely clear + // from the spec whether or not this behavior is correct, but I believe this + // matches Firefox. :-/ + if (term.type == PatternTerm::TypeParentheticalAssertion) { + if (!min) + m_alternative->removeLastTerm(); + return; + } + + if (min == 0) + term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); + else if (min == max) + term.quantify(min, QuantifierFixedCount); + else { + term.quantify(min, QuantifierFixedCount); + m_alternative->m_terms.append(copyTerm(term)); + // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... + m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); + if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) + m_alternative->lastTerm().parentheses.isCopy = true; + } + } + + void disjunction() + { + m_alternative = m_alternative->m_parent->addNewAlternative(); + } + + unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) + { + alternative->m_hasFixedSize = true; + Checked<unsigned> currentInputPosition = initialInputPosition; + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm& term = alternative->m_terms[i]; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + case PatternTerm::TypeAssertionEOL: + case PatternTerm::TypeAssertionWordBoundary: + term.inputPosition = currentInputPosition.unsafeGet(); + break; + + case PatternTerm::TypeBackReference: + term.inputPosition = currentInputPosition.unsafeGet(); + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference; + alternative->m_hasFixedSize = false; + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypePatternCharacter: + term.inputPosition = currentInputPosition.unsafeGet(); + if (term.quantityType != QuantifierFixedCount) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; + alternative->m_hasFixedSize = false; + } else + currentInputPosition += term.quantityCount; + break; + + case PatternTerm::TypeCharacterClass: + term.inputPosition = currentInputPosition.unsafeGet(); + if (term.quantityType != QuantifierFixedCount) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; + alternative->m_hasFixedSize = false; + } else + currentInputPosition += term.quantityCount; + break; + + case PatternTerm::TypeParenthesesSubpattern: + // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. + term.frameLocation = currentCallFrameSize; + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + if (term.quantityType != QuantifierFixedCount) + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); + // If quantity is fixed, then pre-check its minimum size. + if (term.quantityType == QuantifierFixedCount) + currentInputPosition += term.parentheses.disjunction->m_minimumSize; + term.inputPosition = currentInputPosition.unsafeGet(); + } else if (term.parentheses.isTerminal) { + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); + term.inputPosition = currentInputPosition.unsafeGet(); + } else { + term.inputPosition = currentInputPosition.unsafeGet(); + setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet()); + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; + } + // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. + alternative->m_hasFixedSize = false; + break; + + case PatternTerm::TypeParentheticalAssertion: + term.inputPosition = currentInputPosition.unsafeGet(); + term.frameLocation = currentCallFrameSize; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet()); + break; + + case PatternTerm::TypeDotStarEnclosure: + alternative->m_hasFixedSize = false; + term.inputPosition = initialInputPosition; + break; + } + } + + alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet(); + return currentCallFrameSize; + } + + unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) + { + if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) + initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; + + unsigned minimumInputSize = UINT_MAX; + unsigned maximumCallFrameSize = 0; + bool hasFixedSize = true; + + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); + minimumInputSize = min(minimumInputSize, alternative->m_minimumSize); + maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize); + hasFixedSize &= alternative->m_hasFixedSize; + } + + ASSERT(minimumInputSize != UINT_MAX); + ASSERT(maximumCallFrameSize >= initialCallFrameSize); + + disjunction->m_hasFixedSize = hasFixedSize; + disjunction->m_minimumSize = minimumInputSize; + disjunction->m_callFrameSize = maximumCallFrameSize; + return maximumCallFrameSize; + } + + void setupOffsets() + { + setupDisjunctionOffsets(m_pattern.m_body, 0, 0); + } + + // This optimization identifies sets of parentheses that we will never need to backtrack. + // In these cases we do not need to store state from prior iterations. + // We can presently avoid backtracking for: + // * where the parens are at the end of the regular expression (last term in any of the + // alternatives of the main body disjunction). + // * where the parens are non-capturing, and quantified unbounded greedy (*). + // * where the parens do not contain any capturing subpatterns. + void checkForTerminalParentheses() + { + // This check is much too crude; should be just checking whether the candidate + // node contains nested capturing subpatterns, not the whole expression! + if (m_pattern.m_numSubpatterns) + return; + + Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; + for (size_t i = 0; i < alternatives.size(); ++i) { + Vector<PatternTerm>& terms = alternatives[i]->m_terms; + if (terms.size()) { + PatternTerm& term = terms.last(); + if (term.type == PatternTerm::TypeParenthesesSubpattern + && term.quantityType == QuantifierGreedy + && term.quantityCount == quantifyInfinite + && !term.capture()) + term.parentheses.isTerminal = true; + } + } + } + + void optimizeBOL() + { + // Look for expressions containing beginning of line (^) anchoring and unroll them. + // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops + // This code relies on the parsing code tagging alternatives with m_containsBOL and + // m_startsWithBOL and rolling those up to containing alternatives. + // At this point, this is only valid for non-multiline expressions. + PatternDisjunction* disjunction = m_pattern.m_body; + + if (!m_pattern.m_containsBOL || m_pattern.m_multiline) + return; + + PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); + + // Set alternatives in disjunction to "onceThrough" + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) + disjunction->m_alternatives[alt]->setOnceThrough(); + + if (loopDisjunction) { + // Move alternatives from loopDisjunction to disjunction + for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) + disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]); + + loopDisjunction->m_alternatives.clear(); + } + } + + bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex) + { + Vector<PatternTerm>& terms = alternative->m_terms; + + for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) { + PatternTerm& term = terms[termIndex]; + + if (term.m_capture) + return true; + + if (term.type == PatternTerm::TypeParenthesesSubpattern) { + PatternDisjunction* nestedDisjunction = term.parentheses.disjunction; + for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) { + if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt], 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1)) + return true; + } + } + } + + return false; + } + + // This optimization identifies alternatives in the form of + // [^].*[?]<expression>.*[$] for expressions that don't have any + // capturing terms. The alternative is changed to <expression> + // followed by processing of the dot stars to find and adjust the + // beginning and the end of the match. + void optimizeDotStarWrappedExpressions() + { + Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; + if (alternatives.size() != 1) + return; + + PatternAlternative* alternative = alternatives[0]; + Vector<PatternTerm>& terms = alternative->m_terms; + if (terms.size() >= 3) { + bool startsWithBOL = false; + bool endsWithEOL = false; + size_t termIndex, firstExpressionTerm, lastExpressionTerm; + + termIndex = 0; + if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) { + startsWithBOL = true; + ++termIndex; + } + + PatternTerm& firstNonAnchorTerm = terms[termIndex]; + if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy))) + return; + + firstExpressionTerm = termIndex + 1; + + termIndex = terms.size() - 1; + if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) { + endsWithEOL = true; + --termIndex; + } + + PatternTerm& lastNonAnchorTerm = terms[termIndex]; + if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy)) + return; + + lastExpressionTerm = termIndex - 1; + + if (firstExpressionTerm > lastExpressionTerm) + return; + + if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) { + for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex) + terms.remove(termIndex); + + for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex) + terms.remove(termIndex - 1); + + terms.append(PatternTerm(startsWithBOL, endsWithEOL)); + + m_pattern.m_containsBOL = false; + } + } + } + +private: + YarrPattern& m_pattern; + PatternAlternative* m_alternative; + CharacterClassConstructor m_characterClassConstructor; + bool m_invertCharacterClass; + bool m_invertParentheticalAssertion; +}; + +const char* YarrPattern::compile(const UString& patternString) +{ + YarrPatternConstructor constructor(*this); + + if (const char* error = parse(constructor, patternString)) + return error; + + // If the pattern contains illegal backreferences reset & reparse. + // Quoting Netscape's "What's new in JavaScript 1.2", + // "Note: if the number of left parentheses is less than the number specified + // in \#, the \# is taken as an octal escape as described in the next row." + if (containsIllegalBackReference()) { + unsigned numSubpatterns = m_numSubpatterns; + + constructor.reset(); +#if !ASSERT_DISABLED + const char* error = +#endif + parse(constructor, patternString, numSubpatterns); + + ASSERT(!error); + ASSERT(numSubpatterns == m_numSubpatterns); + } + + constructor.checkForTerminalParentheses(); + constructor.optimizeDotStarWrappedExpressions(); + constructor.optimizeBOL(); + + constructor.setupOffsets(); + + return 0; +} + +YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error) + : m_ignoreCase(ignoreCase) + , m_multiline(multiline) + , m_containsBackreferences(false) + , m_containsBOL(false) + , m_numSubpatterns(0) + , m_maxBackReference(0) + , newlineCached(0) + , digitsCached(0) + , spacesCached(0) + , wordcharCached(0) + , nondigitsCached(0) + , nonspacesCached(0) + , nonwordcharCached(0) +{ + *error = compile(pattern); +} + +} } diff --git a/Source/JavaScriptCore/yarr/YarrPattern.h b/Source/JavaScriptCore/yarr/YarrPattern.h new file mode 100644 index 000000000..2cbb79586 --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrPattern.h @@ -0,0 +1,421 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrPattern_h +#define YarrPattern_h + +#include <runtime/UString.h> +#include <wtf/CheckedArithmetic.h> +#include <wtf/RefCounted.h> +#include <wtf/Vector.h> +#include <wtf/unicode/Unicode.h> + +namespace JSC { namespace Yarr { + +struct PatternDisjunction; + +struct CharacterRange { + UChar begin; + UChar end; + + CharacterRange(UChar begin, UChar end) + : begin(begin) + , end(end) + { + } +}; + +struct CharacterClassTable : RefCounted<CharacterClassTable> { + const char* m_table; + bool m_inverted; + static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted) + { + return adoptRef(new CharacterClassTable(table, inverted)); + } + +private: + CharacterClassTable(const char* table, bool inverted) + : m_table(table) + , m_inverted(inverted) + { + } +}; + +struct CharacterClass { + WTF_MAKE_FAST_ALLOCATED; +public: + // All CharacterClass instances have to have the full set of matches and ranges, + // they may have an optional table for faster lookups (which must match the + // specified matches and ranges) + CharacterClass(PassRefPtr<CharacterClassTable> table) + : m_table(table) + { + } + Vector<UChar> m_matches; + Vector<CharacterRange> m_ranges; + Vector<UChar> m_matchesUnicode; + Vector<CharacterRange> m_rangesUnicode; + RefPtr<CharacterClassTable> m_table; +}; + +enum QuantifierType { + QuantifierFixedCount, + QuantifierGreedy, + QuantifierNonGreedy, +}; + +struct PatternTerm { + enum Type { + TypeAssertionBOL, + TypeAssertionEOL, + TypeAssertionWordBoundary, + TypePatternCharacter, + TypeCharacterClass, + TypeBackReference, + TypeForwardReference, + TypeParenthesesSubpattern, + TypeParentheticalAssertion, + TypeDotStarEnclosure, + } type; + bool m_capture :1; + bool m_invert :1; + union { + UChar patternCharacter; + CharacterClass* characterClass; + unsigned backReferenceSubpatternId; + struct { + PatternDisjunction* disjunction; + unsigned subpatternId; + unsigned lastSubpatternId; + bool isCopy; + bool isTerminal; + } parentheses; + struct { + bool bolAnchor : 1; + bool eolAnchor : 1; + } anchors; + }; + QuantifierType quantityType; + Checked<unsigned> quantityCount; + int inputPosition; + unsigned frameLocation; + + PatternTerm(UChar ch) + : type(PatternTerm::TypePatternCharacter) + , m_capture(false) + , m_invert(false) + { + patternCharacter = ch; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(CharacterClass* charClass, bool invert) + : type(PatternTerm::TypeCharacterClass) + , m_capture(false) + , m_invert(invert) + { + characterClass = charClass; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) + : type(type) + , m_capture(capture) + , m_invert(invert) + { + parentheses.disjunction = disjunction; + parentheses.subpatternId = subpatternId; + parentheses.isCopy = false; + parentheses.isTerminal = false; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(Type type, bool invert = false) + : type(type) + , m_capture(false) + , m_invert(invert) + { + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(unsigned spatternId) + : type(TypeBackReference) + , m_capture(false) + , m_invert(false) + { + backReferenceSubpatternId = spatternId; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(bool bolAnchor, bool eolAnchor) + : type(TypeDotStarEnclosure) + , m_capture(false) + , m_invert(false) + { + anchors.bolAnchor = bolAnchor; + anchors.eolAnchor = eolAnchor; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + static PatternTerm ForwardReference() + { + return PatternTerm(TypeForwardReference); + } + + static PatternTerm BOL() + { + return PatternTerm(TypeAssertionBOL); + } + + static PatternTerm EOL() + { + return PatternTerm(TypeAssertionEOL); + } + + static PatternTerm WordBoundary(bool invert) + { + return PatternTerm(TypeAssertionWordBoundary, invert); + } + + bool invert() + { + return m_invert; + } + + bool capture() + { + return m_capture; + } + + void quantify(unsigned count, QuantifierType type) + { + quantityCount = count; + quantityType = type; + } +}; + +struct PatternAlternative { + WTF_MAKE_FAST_ALLOCATED; +public: + PatternAlternative(PatternDisjunction* disjunction) + : m_parent(disjunction) + , m_onceThrough(false) + , m_hasFixedSize(false) + , m_startsWithBOL(false) + , m_containsBOL(false) + { + } + + PatternTerm& lastTerm() + { + ASSERT(m_terms.size()); + return m_terms[m_terms.size() - 1]; + } + + void removeLastTerm() + { + ASSERT(m_terms.size()); + m_terms.shrink(m_terms.size() - 1); + } + + void setOnceThrough() + { + m_onceThrough = true; + } + + bool onceThrough() + { + return m_onceThrough; + } + + Vector<PatternTerm> m_terms; + PatternDisjunction* m_parent; + unsigned m_minimumSize; + bool m_onceThrough : 1; + bool m_hasFixedSize : 1; + bool m_startsWithBOL : 1; + bool m_containsBOL : 1; +}; + +struct PatternDisjunction { + WTF_MAKE_FAST_ALLOCATED; +public: + PatternDisjunction(PatternAlternative* parent = 0) + : m_parent(parent) + , m_hasFixedSize(false) + { + } + + ~PatternDisjunction() + { + deleteAllValues(m_alternatives); + } + + PatternAlternative* addNewAlternative() + { + PatternAlternative* alternative = new PatternAlternative(this); + m_alternatives.append(alternative); + return alternative; + } + + Vector<PatternAlternative*> m_alternatives; + PatternAlternative* m_parent; + unsigned m_minimumSize; + unsigned m_callFrameSize; + bool m_hasFixedSize; +}; + +// You probably don't want to be calling these functions directly +// (please to be calling newlineCharacterClass() et al on your +// friendly neighborhood YarrPattern instance to get nicely +// cached copies). +CharacterClass* newlineCreate(); +CharacterClass* digitsCreate(); +CharacterClass* spacesCreate(); +CharacterClass* wordcharCreate(); +CharacterClass* nondigitsCreate(); +CharacterClass* nonspacesCreate(); +CharacterClass* nonwordcharCreate(); + +struct TermChain { + TermChain(PatternTerm term) + : term(term) + {} + + PatternTerm term; + Vector<TermChain> hotTerms; +}; + +struct YarrPattern { + YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error); + + ~YarrPattern() + { + deleteAllValues(m_disjunctions); + deleteAllValues(m_userCharacterClasses); + } + + void reset() + { + m_numSubpatterns = 0; + m_maxBackReference = 0; + + m_containsBackreferences = false; + m_containsBOL = false; + + newlineCached = 0; + digitsCached = 0; + spacesCached = 0; + wordcharCached = 0; + nondigitsCached = 0; + nonspacesCached = 0; + nonwordcharCached = 0; + + deleteAllValues(m_disjunctions); + m_disjunctions.clear(); + deleteAllValues(m_userCharacterClasses); + m_userCharacterClasses.clear(); + } + + bool containsIllegalBackReference() + { + return m_maxBackReference > m_numSubpatterns; + } + + CharacterClass* newlineCharacterClass() + { + if (!newlineCached) + m_userCharacterClasses.append(newlineCached = newlineCreate()); + return newlineCached; + } + CharacterClass* digitsCharacterClass() + { + if (!digitsCached) + m_userCharacterClasses.append(digitsCached = digitsCreate()); + return digitsCached; + } + CharacterClass* spacesCharacterClass() + { + if (!spacesCached) + m_userCharacterClasses.append(spacesCached = spacesCreate()); + return spacesCached; + } + CharacterClass* wordcharCharacterClass() + { + if (!wordcharCached) + m_userCharacterClasses.append(wordcharCached = wordcharCreate()); + return wordcharCached; + } + CharacterClass* nondigitsCharacterClass() + { + if (!nondigitsCached) + m_userCharacterClasses.append(nondigitsCached = nondigitsCreate()); + return nondigitsCached; + } + CharacterClass* nonspacesCharacterClass() + { + if (!nonspacesCached) + m_userCharacterClasses.append(nonspacesCached = nonspacesCreate()); + return nonspacesCached; + } + CharacterClass* nonwordcharCharacterClass() + { + if (!nonwordcharCached) + m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate()); + return nonwordcharCached; + } + + bool m_ignoreCase : 1; + bool m_multiline : 1; + bool m_containsBackreferences : 1; + bool m_containsBOL : 1; + unsigned m_numSubpatterns; + unsigned m_maxBackReference; + PatternDisjunction* m_body; + Vector<PatternDisjunction*, 4> m_disjunctions; + Vector<CharacterClass*> m_userCharacterClasses; + +private: + const char* compile(const UString& patternString); + + CharacterClass* newlineCached; + CharacterClass* digitsCached; + CharacterClass* spacesCached; + CharacterClass* wordcharCached; + CharacterClass* nondigitsCached; + CharacterClass* nonspacesCached; + CharacterClass* nonwordcharCached; +}; + +} } // namespace JSC::Yarr + +#endif // YarrPattern_h diff --git a/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp b/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp new file mode 100644 index 000000000..51fda94d8 --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2011 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrSyntaxChecker.h" + +#include "YarrParser.h" + +namespace JSC { namespace Yarr { + +class SyntaxChecker { +public: + void assertionBOL() {} + void assertionEOL() {} + void assertionWordBoundary(bool) {} + void atomPatternCharacter(UChar) {} + void atomBuiltInCharacterClass(BuiltInCharacterClassID, bool) {} + void atomCharacterClassBegin(bool = false) {} + void atomCharacterClassAtom(UChar) {} + void atomCharacterClassRange(UChar, UChar) {} + void atomCharacterClassBuiltIn(BuiltInCharacterClassID, bool) {} + void atomCharacterClassEnd() {} + void atomParenthesesSubpatternBegin(bool = true) {} + void atomParentheticalAssertionBegin(bool = false) {} + void atomParenthesesEnd() {} + void atomBackReference(unsigned) {} + void quantifyAtom(unsigned, unsigned, bool) {} + void disjunction() {} +}; + +const char* checkSyntax(const UString& pattern) +{ + SyntaxChecker syntaxChecker; + return parse(syntaxChecker, pattern); +} + +}} // JSC::YARR diff --git a/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h b/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h new file mode 100644 index 000000000..2013671eb --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2011 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrSyntaxChecker_h +#define YarrSyntaxChecker_h + +#include <runtime/UString.h> + +namespace JSC { namespace Yarr { + +const char* checkSyntax(const UString& pattern); + +}} // JSC::YARR + +#endif // YarrSyntaxChecker_h + diff --git a/Source/JavaScriptCore/yarr/yarr.pri b/Source/JavaScriptCore/yarr/yarr.pri new file mode 100644 index 000000000..c2634864f --- /dev/null +++ b/Source/JavaScriptCore/yarr/yarr.pri @@ -0,0 +1,13 @@ +# ------------------------------------------------------------------- +# Project file for YARR +# +# See 'Tools/qmake/README' for an overview of the build system +# ------------------------------------------------------------------- + +SOURCES += \ + $$PWD/YarrInterpreter.cpp \ + $$PWD/YarrPattern.cpp \ + $$PWD/YarrSyntaxChecker.cpp + +# For UString.h +v8: INCLUDEPATH += $$PWD/../runtime |