diff options
Diffstat (limited to 'Source/JavaScriptCore/yarr/YarrPattern.cpp')
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrPattern.cpp | 142 |
1 files changed, 93 insertions, 49 deletions
diff --git a/Source/JavaScriptCore/yarr/YarrPattern.cpp b/Source/JavaScriptCore/yarr/YarrPattern.cpp index f0d10e624..bbda9c526 100644 --- a/Source/JavaScriptCore/yarr/YarrPattern.cpp +++ b/Source/JavaScriptCore/yarr/YarrPattern.cpp @@ -28,6 +28,7 @@ #include "YarrPattern.h" #include "Yarr.h" +#include "YarrCanonicalizeUCS2.h" #include "YarrParser.h" #include <wtf/Vector.h> @@ -66,32 +67,43 @@ public: void putChar(UChar ch) { + // Handle ascii cases. if (ch <= 0x7f) { if (m_isCaseInsensitive && isASCIIAlpha(ch)) { addSorted(m_matches, toASCIIUpper(ch)); addSorted(m_matches, toASCIILower(ch)); } else addSorted(m_matches, ch); - } else { - UChar upper, lower; - if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { - addSorted(m_matchesUnicode, upper); - addSorted(m_matchesUnicode, lower); - } else - addSorted(m_matchesUnicode, ch); + return; } - } - // returns true if this character has another case, and 'ch' is the upper case form. - static inline bool isUnicodeUpper(UChar ch) - { - return ch != Unicode::toLower(ch); + // Simple case, not a case-insensitive match. + if (!m_isCaseInsensitive) { + addSorted(m_matchesUnicode, ch); + return; + } + + // Add multiple matches, if necessary. + UCS2CanonicalizationRange* info = rangeInfoFor(ch); + if (info->type == CanonicalizeUnique) + addSorted(m_matchesUnicode, ch); + else + putUnicodeIgnoreCase(ch, info); } - // returns true if this character has another case, and 'ch' is the lower case form. - static inline bool isUnicodeLower(UChar ch) + void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info) { - return ch != Unicode::toUpper(ch); + ASSERT(m_isCaseInsensitive); + ASSERT(ch > 0x7f); + ASSERT(ch >= info->begin && ch <= info->end); + ASSERT(info->type != CanonicalizeUnique); + if (info->type == CanonicalizeSet) { + for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) + addSorted(m_matchesUnicode, ch); + } else { + addSorted(m_matchesUnicode, ch); + addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); + } } void putRange(UChar lo, UChar hi) @@ -108,36 +120,59 @@ public: addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); } } - if (hi >= 0x80) { - uint32_t unicodeCurr = std::max(lo, (UChar)0x80); - addSortedRange(m_rangesUnicode, unicodeCurr, hi); - - if (m_isCaseInsensitive) { - while (unicodeCurr <= hi) { - // If the upper bound of the range (hi) is 0xffff, the increments to - // unicodeCurr in this loop may take it to 0x10000. This is fine - // (if so we won't re-enter the loop, since the loop condition above - // will definitely fail) - but this does mean we cannot use a UChar - // to represent unicodeCurr, we must use a 32-bit value instead. - ASSERT(unicodeCurr <= 0xffff); - - if (isUnicodeUpper(unicodeCurr)) { - UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); - UChar lowerCaseRangeEnd = lowerCaseRangeBegin; - while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) - lowerCaseRangeEnd++; - addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); - } else if (isUnicodeLower(unicodeCurr)) { - UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); - UChar upperCaseRangeEnd = upperCaseRangeBegin; - while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) - upperCaseRangeEnd++; - addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); - } else - ++unicodeCurr; - } + if (hi <= 0x7f) + return; + + lo = std::max(lo, (UChar)0x80); + addSortedRange(m_rangesUnicode, lo, hi); + + if (!m_isCaseInsensitive) + return; + + UCS2CanonicalizationRange* info = rangeInfoFor(lo); + while (true) { + // Handle the range [lo .. end] + UChar end = std::min<UChar>(info->end, hi); + + switch (info->type) { + case CanonicalizeUnique: + // Nothing to do - no canonical equivalents. + break; + case CanonicalizeSet: { + UChar ch; + for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) + addSorted(m_matchesUnicode, ch); + break; } - } + case CanonicalizeRangeLo: + addSortedRange(m_rangesUnicode, lo + info->value, end + info->value); + break; + case CanonicalizeRangeHi: + addSortedRange(m_rangesUnicode, lo - info->value, end - info->value); + break; + case CanonicalizeAlternatingAligned: + // Use addSortedRange since there is likely an abutting range to combine with. + if (lo & 1) + addSortedRange(m_rangesUnicode, lo - 1, lo - 1); + if (!(end & 1)) + addSortedRange(m_rangesUnicode, end + 1, end + 1); + break; + case CanonicalizeAlternatingUnaligned: + // Use addSortedRange since there is likely an abutting range to combine with. + if (!(lo & 1)) + addSortedRange(m_rangesUnicode, lo - 1, lo - 1); + if (end & 1) + addSortedRange(m_rangesUnicode, end + 1, end + 1); + break; + } + + if (hi == end) + return; + + ++info; + lo = info->begin; + }; + } CharacterClass* charClass() @@ -280,12 +315,21 @@ public: { // We handle case-insensitive checking of unicode characters which do have both // cases by handling them as if they were defined using a CharacterClass. - if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { - atomCharacterClassBegin(); - atomCharacterClassAtom(ch); - atomCharacterClassEnd(); - } else + if (!m_pattern.m_ignoreCase || isASCII(ch)) { + m_alternative->m_terms.append(PatternTerm(ch)); + return; + } + + UCS2CanonicalizationRange* info = rangeInfoFor(ch); + if (info->type == CanonicalizeUnique) { m_alternative->m_terms.append(PatternTerm(ch)); + return; + } + + m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); + CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); + m_pattern.m_userCharacterClasses.append(newCharacterClass); + m_alternative->m_terms.append(PatternTerm(newCharacterClass, false)); } void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) |