summary | refs | log | tree | commit | diff
path: root/Source/JavaScriptCore/yarr/YarrPattern.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'Source/JavaScriptCore/yarr/YarrPattern.cpp')
-rw-r--r-- Source/JavaScriptCore/yarr/YarrPattern.cpp 142
1 files changed, 93 insertions, 49 deletions
diff --git a/Source/JavaScriptCore/yarr/YarrPattern.cpp b/Source/JavaScriptCore/yarr/YarrPattern.cpp
index f0d10e624..bbda9c526 100644
--- a/Source/JavaScriptCore/yarr/YarrPattern.cpp
+++ b/Source/JavaScriptCore/yarr/YarrPattern.cpp
@@ -28,6 +28,7 @@
#include "YarrPattern.h"
#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
#include "YarrParser.h"
#include <wtf/Vector.h>
@@ -66,32 +67,43 @@ public:
void putChar(UChar ch)
{
+ // ASCII (ch <= 0x7f) goes into m_matches; when matching case-insensitively, a letter is added in both its upper and lower case forms.
if (ch <= 0x7f) {
if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
addSorted(m_matches, toASCIIUpper(ch));
addSorted(m_matches, toASCIILower(ch));
} else
addSorted(m_matches, ch);
- } else {
- UChar upper, lower;
- if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
- addSorted(m_matchesUnicode, upper);
- addSorted(m_matchesUnicode, lower);
- } else
- addSorted(m_matchesUnicode, ch);
+ return;
}
- }
- // returns true if this character has another case, and 'ch' is the upper case form.
- static inline bool isUnicodeUpper(UChar ch)
- {
- return ch != Unicode::toLower(ch);
+ // Non-ASCII, case-sensitive match: only the character itself is added to m_matchesUnicode.
+ if (!m_isCaseInsensitive) {
+ addSorted(m_matchesUnicode, ch);
+ return;
+ }
+
+ // Non-ASCII, case-insensitive match: a CanonicalizeUnique range entry adds just ch; any other type is handed to putUnicodeIgnoreCase().
+ UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ if (info->type == CanonicalizeUnique)
+ addSorted(m_matchesUnicode, ch);
+ else
+ putUnicodeIgnoreCase(ch, info);
}
- // returns true if this character has another case, and 'ch' is the lower case form.
- static inline bool isUnicodeLower(UChar ch)
+ void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info) // Adds the case-insensitive matches for a non-ASCII ch to m_matchesUnicode.
{
- return ch != Unicode::toUpper(ch);
+ ASSERT(m_isCaseInsensitive);
+ ASSERT(ch > 0x7f); // ASCII characters never reach this function; putChar() files them under m_matches instead.
+ ASSERT(ch >= info->begin && ch <= info->end); // info must be the range entry that contains ch.
+ ASSERT(info->type != CanonicalizeUnique); // putChar() adds characters from unique ranges itself.
+ if (info->type == CanonicalizeSet) { // For this type, info->value is used as an index into characterSetInfo.
+ for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) // The list is zero-terminated; the ch parameter is reused as the cursor.
+ addSorted(m_matchesUnicode, ch);
+ } else {
+ addSorted(m_matchesUnicode, ch);
+ addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); // NOTE(review): presumably the single other-case counterpart of ch - confirm against YarrCanonicalizeUCS2.h.
+ }
}
void putRange(UChar lo, UChar hi)
@@ -108,36 +120,59 @@ public:
addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
}
}
- if (hi >= 0x80) {
- uint32_t unicodeCurr = std::max(lo, (UChar)0x80);
- addSortedRange(m_rangesUnicode, unicodeCurr, hi);
-
- if (m_isCaseInsensitive) {
- while (unicodeCurr <= hi) {
- // If the upper bound of the range (hi) is 0xffff, the increments to
- // unicodeCurr in this loop may take it to 0x10000. This is fine
- // (if so we won't re-enter the loop, since the loop condition above
- // will definitely fail) - but this does mean we cannot use a UChar
- // to represent unicodeCurr, we must use a 32-bit value instead.
- ASSERT(unicodeCurr <= 0xffff);
-
- if (isUnicodeUpper(unicodeCurr)) {
- UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
- UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
- while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
- lowerCaseRangeEnd++;
- addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
- } else if (isUnicodeLower(unicodeCurr)) {
- UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
- UChar upperCaseRangeEnd = upperCaseRangeBegin;
- while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
- upperCaseRangeEnd++;
- addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
- } else
- ++unicodeCurr;
- }
+ if (hi <= 0x7f)
+ return;
+
+ lo = std::max(lo, (UChar)0x80);
+ addSortedRange(m_rangesUnicode, lo, hi);
+
+ if (!m_isCaseInsensitive)
+ return;
+
+ UCS2CanonicalizationRange* info = rangeInfoFor(lo);
+ while (true) {
+ // Handle the range [lo .. end]
+ UChar end = std::min<UChar>(info->end, hi);
+
+ switch (info->type) {
+ case CanonicalizeUnique:
+ // Nothing to do - no canonical equivalents.
+ break;
+ case CanonicalizeSet: {
+ UChar ch;
+ for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+ addSorted(m_matchesUnicode, ch);
+ break;
}
- }
+ case CanonicalizeRangeLo:
+ addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
+ break;
+ case CanonicalizeRangeHi:
+ addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
+ break;
+ case CanonicalizeAlternatingAligned:
+ // Use addSortedRange since there is likely an abutting range to combine with.
+ if (lo & 1)
+ addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
+ if (!(end & 1))
+ addSortedRange(m_rangesUnicode, end + 1, end + 1);
+ break;
+ case CanonicalizeAlternatingUnaligned:
+ // Use addSortedRange since there is likely an abutting range to combine with.
+ if (!(lo & 1))
+ addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
+ if (end & 1)
+ addSortedRange(m_rangesUnicode, end + 1, end + 1);
+ break;
+ }
+
+ if (hi == end)
+ return;
+
+ ++info;
+ lo = info->begin;
+ };
+
}
CharacterClass* charClass()
@@ -280,12 +315,21 @@ public:
{
// We handle case-insensitive checking of unicode characters which do have both
// cases by handling them as if they were defined using a CharacterClass.
- if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
- atomCharacterClassBegin();
- atomCharacterClassAtom(ch);
- atomCharacterClassEnd();
- } else
+ if (!m_pattern.m_ignoreCase || isASCII(ch)) {
+ m_alternative->m_terms.append(PatternTerm(ch));
+ return;
+ }
+
+ UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ if (info->type == CanonicalizeUnique) {
m_alternative->m_terms.append(PatternTerm(ch));
+ return;
+ }
+
+ m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
+ CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
+ m_pattern.m_userCharacterClasses.append(newCharacterClass);
+ m_alternative->m_terms.append(PatternTerm(newCharacterClass, false));
}
void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)