summaryrefslogtreecommitdiff
path: root/Source/JavaScriptCore/yarr
diff options
context:
space:
mode:
authorSimon Hausmann <simon.hausmann@nokia.com>2012-05-07 11:21:11 +0200
committerSimon Hausmann <simon.hausmann@nokia.com>2012-05-07 11:21:11 +0200
commit2cf6c8816a73e0132bd8fa3b509d62d7c51b6e47 (patch)
tree988e8c5b116dd0466244ae2fe5af8ee9be926d76 /Source/JavaScriptCore/yarr
parentdd91e772430dc294e3bf478c119ef8d43c0a3358 (diff)
downloadqtwebkit-2cf6c8816a73e0132bd8fa3b509d62d7c51b6e47.tar.gz
Imported WebKit commit 7e538425aa020340619e927792f3d895061fb54b (http://svn.webkit.org/repository/webkit/trunk@116286)
Diffstat (limited to 'Source/JavaScriptCore/yarr')
-rw-r--r--Source/JavaScriptCore/yarr/Yarr.h3
-rw-r--r--Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp463
-rw-r--r--Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.h138
-rw-r--r--Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.js219
-rw-r--r--Source/JavaScriptCore/yarr/YarrInterpreter.cpp106
-rw-r--r--Source/JavaScriptCore/yarr/YarrInterpreter.h5
-rw-r--r--Source/JavaScriptCore/yarr/YarrJIT.cpp200
-rw-r--r--Source/JavaScriptCore/yarr/YarrJIT.h78
-rw-r--r--Source/JavaScriptCore/yarr/YarrPattern.cpp142
-rw-r--r--Source/JavaScriptCore/yarr/yarr.pri9
10 files changed, 1162 insertions, 201 deletions
diff --git a/Source/JavaScriptCore/yarr/Yarr.h b/Source/JavaScriptCore/yarr/Yarr.h
index 57a3846c0..d393e9fa9 100644
--- a/Source/JavaScriptCore/yarr/Yarr.h
+++ b/Source/JavaScriptCore/yarr/Yarr.h
@@ -63,9 +63,6 @@ enum YarrCharSize {
Char16
};
-JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*);
-JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const UString& input, unsigned start, unsigned length, unsigned* output);
-
} } // namespace JSC::Yarr
#endif // Yarr_h
diff --git a/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp
new file mode 100644
index 000000000..7bb3d08eb
--- /dev/null
+++ b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp
@@ -0,0 +1,463 @@
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js
+
+#include "config.h"
+#include "YarrCanonicalizeUCS2.h"
+
+namespace JSC { namespace Yarr {
+
+#include <stdint.h>
+
+uint16_t ucs2CharacterSet0[] = { 0x01c4u, 0x01c5u, 0x01c6u, 0 };
+uint16_t ucs2CharacterSet1[] = { 0x01c7u, 0x01c8u, 0x01c9u, 0 };
+uint16_t ucs2CharacterSet2[] = { 0x01cau, 0x01cbu, 0x01ccu, 0 };
+uint16_t ucs2CharacterSet3[] = { 0x01f1u, 0x01f2u, 0x01f3u, 0 };
+uint16_t ucs2CharacterSet4[] = { 0x0392u, 0x03b2u, 0x03d0u, 0 };
+uint16_t ucs2CharacterSet5[] = { 0x0395u, 0x03b5u, 0x03f5u, 0 };
+uint16_t ucs2CharacterSet6[] = { 0x0398u, 0x03b8u, 0x03d1u, 0 };
+uint16_t ucs2CharacterSet7[] = { 0x0345u, 0x0399u, 0x03b9u, 0x1fbeu, 0 };
+uint16_t ucs2CharacterSet8[] = { 0x039au, 0x03bau, 0x03f0u, 0 };
+uint16_t ucs2CharacterSet9[] = { 0x00b5u, 0x039cu, 0x03bcu, 0 };
+uint16_t ucs2CharacterSet10[] = { 0x03a0u, 0x03c0u, 0x03d6u, 0 };
+uint16_t ucs2CharacterSet11[] = { 0x03a1u, 0x03c1u, 0x03f1u, 0 };
+uint16_t ucs2CharacterSet12[] = { 0x03a3u, 0x03c2u, 0x03c3u, 0 };
+uint16_t ucs2CharacterSet13[] = { 0x03a6u, 0x03c6u, 0x03d5u, 0 };
+uint16_t ucs2CharacterSet14[] = { 0x1e60u, 0x1e61u, 0x1e9bu, 0 };
+
+static const size_t UCS2_CANONICALIZATION_SETS = 15;
+uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {
+ ucs2CharacterSet0,
+ ucs2CharacterSet1,
+ ucs2CharacterSet2,
+ ucs2CharacterSet3,
+ ucs2CharacterSet4,
+ ucs2CharacterSet5,
+ ucs2CharacterSet6,
+ ucs2CharacterSet7,
+ ucs2CharacterSet8,
+ ucs2CharacterSet9,
+ ucs2CharacterSet10,
+ ucs2CharacterSet11,
+ ucs2CharacterSet12,
+ ucs2CharacterSet13,
+ ucs2CharacterSet14,
+};
+
+const size_t UCS2_CANONICALIZATION_RANGES = 364;
+UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {
+ { 0x0000u, 0x0040u, 0x0000u, CanonicalizeUnique },
+ { 0x0041u, 0x005au, 0x0020u, CanonicalizeRangeLo },
+ { 0x005bu, 0x0060u, 0x0000u, CanonicalizeUnique },
+ { 0x0061u, 0x007au, 0x0020u, CanonicalizeRangeHi },
+ { 0x007bu, 0x00b4u, 0x0000u, CanonicalizeUnique },
+ { 0x00b5u, 0x00b5u, 0x0009u, CanonicalizeSet },
+ { 0x00b6u, 0x00bfu, 0x0000u, CanonicalizeUnique },
+ { 0x00c0u, 0x00d6u, 0x0020u, CanonicalizeRangeLo },
+ { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeUnique },
+ { 0x00d8u, 0x00deu, 0x0020u, CanonicalizeRangeLo },
+ { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeUnique },
+ { 0x00e0u, 0x00f6u, 0x0020u, CanonicalizeRangeHi },
+ { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeUnique },
+ { 0x00f8u, 0x00feu, 0x0020u, CanonicalizeRangeHi },
+ { 0x00ffu, 0x00ffu, 0x0079u, CanonicalizeRangeLo },
+ { 0x0100u, 0x012fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0130u, 0x0131u, 0x0000u, CanonicalizeUnique },
+ { 0x0132u, 0x0137u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0138u, 0x0138u, 0x0000u, CanonicalizeUnique },
+ { 0x0139u, 0x0148u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x0149u, 0x0149u, 0x0000u, CanonicalizeUnique },
+ { 0x014au, 0x0177u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0178u, 0x0178u, 0x0079u, CanonicalizeRangeHi },
+ { 0x0179u, 0x017eu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x017fu, 0x017fu, 0x0000u, CanonicalizeUnique },
+ { 0x0180u, 0x0180u, 0x00c3u, CanonicalizeRangeLo },
+ { 0x0181u, 0x0181u, 0x00d2u, CanonicalizeRangeLo },
+ { 0x0182u, 0x0185u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0186u, 0x0186u, 0x00ceu, CanonicalizeRangeLo },
+ { 0x0187u, 0x0188u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x0189u, 0x018au, 0x00cdu, CanonicalizeRangeLo },
+ { 0x018bu, 0x018cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x018du, 0x018du, 0x0000u, CanonicalizeUnique },
+ { 0x018eu, 0x018eu, 0x004fu, CanonicalizeRangeLo },
+ { 0x018fu, 0x018fu, 0x00cau, CanonicalizeRangeLo },
+ { 0x0190u, 0x0190u, 0x00cbu, CanonicalizeRangeLo },
+ { 0x0191u, 0x0192u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x0193u, 0x0193u, 0x00cdu, CanonicalizeRangeLo },
+ { 0x0194u, 0x0194u, 0x00cfu, CanonicalizeRangeLo },
+ { 0x0195u, 0x0195u, 0x0061u, CanonicalizeRangeLo },
+ { 0x0196u, 0x0196u, 0x00d3u, CanonicalizeRangeLo },
+ { 0x0197u, 0x0197u, 0x00d1u, CanonicalizeRangeLo },
+ { 0x0198u, 0x0199u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x019au, 0x019au, 0x00a3u, CanonicalizeRangeLo },
+ { 0x019bu, 0x019bu, 0x0000u, CanonicalizeUnique },
+ { 0x019cu, 0x019cu, 0x00d3u, CanonicalizeRangeLo },
+ { 0x019du, 0x019du, 0x00d5u, CanonicalizeRangeLo },
+ { 0x019eu, 0x019eu, 0x0082u, CanonicalizeRangeLo },
+ { 0x019fu, 0x019fu, 0x00d6u, CanonicalizeRangeLo },
+ { 0x01a0u, 0x01a5u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01a6u, 0x01a6u, 0x00dau, CanonicalizeRangeLo },
+ { 0x01a7u, 0x01a8u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x01a9u, 0x01a9u, 0x00dau, CanonicalizeRangeLo },
+ { 0x01aau, 0x01abu, 0x0000u, CanonicalizeUnique },
+ { 0x01acu, 0x01adu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01aeu, 0x01aeu, 0x00dau, CanonicalizeRangeLo },
+ { 0x01afu, 0x01b0u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x01b1u, 0x01b2u, 0x00d9u, CanonicalizeRangeLo },
+ { 0x01b3u, 0x01b6u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x01b7u, 0x01b7u, 0x00dbu, CanonicalizeRangeLo },
+ { 0x01b8u, 0x01b9u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01bau, 0x01bbu, 0x0000u, CanonicalizeUnique },
+ { 0x01bcu, 0x01bdu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01beu, 0x01beu, 0x0000u, CanonicalizeUnique },
+ { 0x01bfu, 0x01bfu, 0x0038u, CanonicalizeRangeLo },
+ { 0x01c0u, 0x01c3u, 0x0000u, CanonicalizeUnique },
+ { 0x01c4u, 0x01c6u, 0x0000u, CanonicalizeSet },
+ { 0x01c7u, 0x01c9u, 0x0001u, CanonicalizeSet },
+ { 0x01cau, 0x01ccu, 0x0002u, CanonicalizeSet },
+ { 0x01cdu, 0x01dcu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x01ddu, 0x01ddu, 0x004fu, CanonicalizeRangeHi },
+ { 0x01deu, 0x01efu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01f0u, 0x01f0u, 0x0000u, CanonicalizeUnique },
+ { 0x01f1u, 0x01f3u, 0x0003u, CanonicalizeSet },
+ { 0x01f4u, 0x01f5u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01f6u, 0x01f6u, 0x0061u, CanonicalizeRangeHi },
+ { 0x01f7u, 0x01f7u, 0x0038u, CanonicalizeRangeHi },
+ { 0x01f8u, 0x021fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0220u, 0x0220u, 0x0082u, CanonicalizeRangeHi },
+ { 0x0221u, 0x0221u, 0x0000u, CanonicalizeUnique },
+ { 0x0222u, 0x0233u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0234u, 0x0239u, 0x0000u, CanonicalizeUnique },
+ { 0x023au, 0x023au, 0x2a2bu, CanonicalizeRangeLo },
+ { 0x023bu, 0x023cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x023du, 0x023du, 0x00a3u, CanonicalizeRangeHi },
+ { 0x023eu, 0x023eu, 0x2a28u, CanonicalizeRangeLo },
+ { 0x023fu, 0x0240u, 0x2a3fu, CanonicalizeRangeLo },
+ { 0x0241u, 0x0242u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x0243u, 0x0243u, 0x00c3u, CanonicalizeRangeHi },
+ { 0x0244u, 0x0244u, 0x0045u, CanonicalizeRangeLo },
+ { 0x0245u, 0x0245u, 0x0047u, CanonicalizeRangeLo },
+ { 0x0246u, 0x024fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0250u, 0x0250u, 0x2a1fu, CanonicalizeRangeLo },
+ { 0x0251u, 0x0251u, 0x2a1cu, CanonicalizeRangeLo },
+ { 0x0252u, 0x0252u, 0x2a1eu, CanonicalizeRangeLo },
+ { 0x0253u, 0x0253u, 0x00d2u, CanonicalizeRangeHi },
+ { 0x0254u, 0x0254u, 0x00ceu, CanonicalizeRangeHi },
+ { 0x0255u, 0x0255u, 0x0000u, CanonicalizeUnique },
+ { 0x0256u, 0x0257u, 0x00cdu, CanonicalizeRangeHi },
+ { 0x0258u, 0x0258u, 0x0000u, CanonicalizeUnique },
+ { 0x0259u, 0x0259u, 0x00cau, CanonicalizeRangeHi },
+ { 0x025au, 0x025au, 0x0000u, CanonicalizeUnique },
+ { 0x025bu, 0x025bu, 0x00cbu, CanonicalizeRangeHi },
+ { 0x025cu, 0x025fu, 0x0000u, CanonicalizeUnique },
+ { 0x0260u, 0x0260u, 0x00cdu, CanonicalizeRangeHi },
+ { 0x0261u, 0x0262u, 0x0000u, CanonicalizeUnique },
+ { 0x0263u, 0x0263u, 0x00cfu, CanonicalizeRangeHi },
+ { 0x0264u, 0x0264u, 0x0000u, CanonicalizeUnique },
+ { 0x0265u, 0x0265u, 0xa528u, CanonicalizeRangeLo },
+ { 0x0266u, 0x0267u, 0x0000u, CanonicalizeUnique },
+ { 0x0268u, 0x0268u, 0x00d1u, CanonicalizeRangeHi },
+ { 0x0269u, 0x0269u, 0x00d3u, CanonicalizeRangeHi },
+ { 0x026au, 0x026au, 0x0000u, CanonicalizeUnique },
+ { 0x026bu, 0x026bu, 0x29f7u, CanonicalizeRangeLo },
+ { 0x026cu, 0x026eu, 0x0000u, CanonicalizeUnique },
+ { 0x026fu, 0x026fu, 0x00d3u, CanonicalizeRangeHi },
+ { 0x0270u, 0x0270u, 0x0000u, CanonicalizeUnique },
+ { 0x0271u, 0x0271u, 0x29fdu, CanonicalizeRangeLo },
+ { 0x0272u, 0x0272u, 0x00d5u, CanonicalizeRangeHi },
+ { 0x0273u, 0x0274u, 0x0000u, CanonicalizeUnique },
+ { 0x0275u, 0x0275u, 0x00d6u, CanonicalizeRangeHi },
+ { 0x0276u, 0x027cu, 0x0000u, CanonicalizeUnique },
+ { 0x027du, 0x027du, 0x29e7u, CanonicalizeRangeLo },
+ { 0x027eu, 0x027fu, 0x0000u, CanonicalizeUnique },
+ { 0x0280u, 0x0280u, 0x00dau, CanonicalizeRangeHi },
+ { 0x0281u, 0x0282u, 0x0000u, CanonicalizeUnique },
+ { 0x0283u, 0x0283u, 0x00dau, CanonicalizeRangeHi },
+ { 0x0284u, 0x0287u, 0x0000u, CanonicalizeUnique },
+ { 0x0288u, 0x0288u, 0x00dau, CanonicalizeRangeHi },
+ { 0x0289u, 0x0289u, 0x0045u, CanonicalizeRangeHi },
+ { 0x028au, 0x028bu, 0x00d9u, CanonicalizeRangeHi },
+ { 0x028cu, 0x028cu, 0x0047u, CanonicalizeRangeHi },
+ { 0x028du, 0x0291u, 0x0000u, CanonicalizeUnique },
+ { 0x0292u, 0x0292u, 0x00dbu, CanonicalizeRangeHi },
+ { 0x0293u, 0x0344u, 0x0000u, CanonicalizeUnique },
+ { 0x0345u, 0x0345u, 0x0007u, CanonicalizeSet },
+ { 0x0346u, 0x036fu, 0x0000u, CanonicalizeUnique },
+ { 0x0370u, 0x0373u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0374u, 0x0375u, 0x0000u, CanonicalizeUnique },
+ { 0x0376u, 0x0377u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0378u, 0x037au, 0x0000u, CanonicalizeUnique },
+ { 0x037bu, 0x037du, 0x0082u, CanonicalizeRangeLo },
+ { 0x037eu, 0x0385u, 0x0000u, CanonicalizeUnique },
+ { 0x0386u, 0x0386u, 0x0026u, CanonicalizeRangeLo },
+ { 0x0387u, 0x0387u, 0x0000u, CanonicalizeUnique },
+ { 0x0388u, 0x038au, 0x0025u, CanonicalizeRangeLo },
+ { 0x038bu, 0x038bu, 0x0000u, CanonicalizeUnique },
+ { 0x038cu, 0x038cu, 0x0040u, CanonicalizeRangeLo },
+ { 0x038du, 0x038du, 0x0000u, CanonicalizeUnique },
+ { 0x038eu, 0x038fu, 0x003fu, CanonicalizeRangeLo },
+ { 0x0390u, 0x0390u, 0x0000u, CanonicalizeUnique },
+ { 0x0391u, 0x0391u, 0x0020u, CanonicalizeRangeLo },
+ { 0x0392u, 0x0392u, 0x0004u, CanonicalizeSet },
+ { 0x0393u, 0x0394u, 0x0020u, CanonicalizeRangeLo },
+ { 0x0395u, 0x0395u, 0x0005u, CanonicalizeSet },
+ { 0x0396u, 0x0397u, 0x0020u, CanonicalizeRangeLo },
+ { 0x0398u, 0x0398u, 0x0006u, CanonicalizeSet },
+ { 0x0399u, 0x0399u, 0x0007u, CanonicalizeSet },
+ { 0x039au, 0x039au, 0x0008u, CanonicalizeSet },
+ { 0x039bu, 0x039bu, 0x0020u, CanonicalizeRangeLo },
+ { 0x039cu, 0x039cu, 0x0009u, CanonicalizeSet },
+ { 0x039du, 0x039fu, 0x0020u, CanonicalizeRangeLo },
+ { 0x03a0u, 0x03a0u, 0x000au, CanonicalizeSet },
+ { 0x03a1u, 0x03a1u, 0x000bu, CanonicalizeSet },
+ { 0x03a2u, 0x03a2u, 0x0000u, CanonicalizeUnique },
+ { 0x03a3u, 0x03a3u, 0x000cu, CanonicalizeSet },
+ { 0x03a4u, 0x03a5u, 0x0020u, CanonicalizeRangeLo },
+ { 0x03a6u, 0x03a6u, 0x000du, CanonicalizeSet },
+ { 0x03a7u, 0x03abu, 0x0020u, CanonicalizeRangeLo },
+ { 0x03acu, 0x03acu, 0x0026u, CanonicalizeRangeHi },
+ { 0x03adu, 0x03afu, 0x0025u, CanonicalizeRangeHi },
+ { 0x03b0u, 0x03b0u, 0x0000u, CanonicalizeUnique },
+ { 0x03b1u, 0x03b1u, 0x0020u, CanonicalizeRangeHi },
+ { 0x03b2u, 0x03b2u, 0x0004u, CanonicalizeSet },
+ { 0x03b3u, 0x03b4u, 0x0020u, CanonicalizeRangeHi },
+ { 0x03b5u, 0x03b5u, 0x0005u, CanonicalizeSet },
+ { 0x03b6u, 0x03b7u, 0x0020u, CanonicalizeRangeHi },
+ { 0x03b8u, 0x03b8u, 0x0006u, CanonicalizeSet },
+ { 0x03b9u, 0x03b9u, 0x0007u, CanonicalizeSet },
+ { 0x03bau, 0x03bau, 0x0008u, CanonicalizeSet },
+ { 0x03bbu, 0x03bbu, 0x0020u, CanonicalizeRangeHi },
+ { 0x03bcu, 0x03bcu, 0x0009u, CanonicalizeSet },
+ { 0x03bdu, 0x03bfu, 0x0020u, CanonicalizeRangeHi },
+ { 0x03c0u, 0x03c0u, 0x000au, CanonicalizeSet },
+ { 0x03c1u, 0x03c1u, 0x000bu, CanonicalizeSet },
+ { 0x03c2u, 0x03c3u, 0x000cu, CanonicalizeSet },
+ { 0x03c4u, 0x03c5u, 0x0020u, CanonicalizeRangeHi },
+ { 0x03c6u, 0x03c6u, 0x000du, CanonicalizeSet },
+ { 0x03c7u, 0x03cbu, 0x0020u, CanonicalizeRangeHi },
+ { 0x03ccu, 0x03ccu, 0x0040u, CanonicalizeRangeHi },
+ { 0x03cdu, 0x03ceu, 0x003fu, CanonicalizeRangeHi },
+ { 0x03cfu, 0x03cfu, 0x0008u, CanonicalizeRangeLo },
+ { 0x03d0u, 0x03d0u, 0x0004u, CanonicalizeSet },
+ { 0x03d1u, 0x03d1u, 0x0006u, CanonicalizeSet },
+ { 0x03d2u, 0x03d4u, 0x0000u, CanonicalizeUnique },
+ { 0x03d5u, 0x03d5u, 0x000du, CanonicalizeSet },
+ { 0x03d6u, 0x03d6u, 0x000au, CanonicalizeSet },
+ { 0x03d7u, 0x03d7u, 0x0008u, CanonicalizeRangeHi },
+ { 0x03d8u, 0x03efu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x03f0u, 0x03f0u, 0x0008u, CanonicalizeSet },
+ { 0x03f1u, 0x03f1u, 0x000bu, CanonicalizeSet },
+ { 0x03f2u, 0x03f2u, 0x0007u, CanonicalizeRangeLo },
+ { 0x03f3u, 0x03f4u, 0x0000u, CanonicalizeUnique },
+ { 0x03f5u, 0x03f5u, 0x0005u, CanonicalizeSet },
+ { 0x03f6u, 0x03f6u, 0x0000u, CanonicalizeUnique },
+ { 0x03f7u, 0x03f8u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x03f9u, 0x03f9u, 0x0007u, CanonicalizeRangeHi },
+ { 0x03fau, 0x03fbu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x03fcu, 0x03fcu, 0x0000u, CanonicalizeUnique },
+ { 0x03fdu, 0x03ffu, 0x0082u, CanonicalizeRangeHi },
+ { 0x0400u, 0x040fu, 0x0050u, CanonicalizeRangeLo },
+ { 0x0410u, 0x042fu, 0x0020u, CanonicalizeRangeLo },
+ { 0x0430u, 0x044fu, 0x0020u, CanonicalizeRangeHi },
+ { 0x0450u, 0x045fu, 0x0050u, CanonicalizeRangeHi },
+ { 0x0460u, 0x0481u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0482u, 0x0489u, 0x0000u, CanonicalizeUnique },
+ { 0x048au, 0x04bfu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x04c0u, 0x04c0u, 0x000fu, CanonicalizeRangeLo },
+ { 0x04c1u, 0x04ceu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x04cfu, 0x04cfu, 0x000fu, CanonicalizeRangeHi },
+ { 0x04d0u, 0x0527u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0528u, 0x0530u, 0x0000u, CanonicalizeUnique },
+ { 0x0531u, 0x0556u, 0x0030u, CanonicalizeRangeLo },
+ { 0x0557u, 0x0560u, 0x0000u, CanonicalizeUnique },
+ { 0x0561u, 0x0586u, 0x0030u, CanonicalizeRangeHi },
+ { 0x0587u, 0x109fu, 0x0000u, CanonicalizeUnique },
+ { 0x10a0u, 0x10c5u, 0x1c60u, CanonicalizeRangeLo },
+ { 0x10c6u, 0x1d78u, 0x0000u, CanonicalizeUnique },
+ { 0x1d79u, 0x1d79u, 0x8a04u, CanonicalizeRangeLo },
+ { 0x1d7au, 0x1d7cu, 0x0000u, CanonicalizeUnique },
+ { 0x1d7du, 0x1d7du, 0x0ee6u, CanonicalizeRangeLo },
+ { 0x1d7eu, 0x1dffu, 0x0000u, CanonicalizeUnique },
+ { 0x1e00u, 0x1e5fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x1e60u, 0x1e61u, 0x000eu, CanonicalizeSet },
+ { 0x1e62u, 0x1e95u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x1e96u, 0x1e9au, 0x0000u, CanonicalizeUnique },
+ { 0x1e9bu, 0x1e9bu, 0x000eu, CanonicalizeSet },
+ { 0x1e9cu, 0x1e9fu, 0x0000u, CanonicalizeUnique },
+ { 0x1ea0u, 0x1effu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x1f00u, 0x1f07u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f08u, 0x1f0fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f10u, 0x1f15u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f16u, 0x1f17u, 0x0000u, CanonicalizeUnique },
+ { 0x1f18u, 0x1f1du, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f1eu, 0x1f1fu, 0x0000u, CanonicalizeUnique },
+ { 0x1f20u, 0x1f27u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f28u, 0x1f2fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f30u, 0x1f37u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f38u, 0x1f3fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f40u, 0x1f45u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f46u, 0x1f47u, 0x0000u, CanonicalizeUnique },
+ { 0x1f48u, 0x1f4du, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f4eu, 0x1f50u, 0x0000u, CanonicalizeUnique },
+ { 0x1f51u, 0x1f51u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f52u, 0x1f52u, 0x0000u, CanonicalizeUnique },
+ { 0x1f53u, 0x1f53u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f54u, 0x1f54u, 0x0000u, CanonicalizeUnique },
+ { 0x1f55u, 0x1f55u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f56u, 0x1f56u, 0x0000u, CanonicalizeUnique },
+ { 0x1f57u, 0x1f57u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f58u, 0x1f58u, 0x0000u, CanonicalizeUnique },
+ { 0x1f59u, 0x1f59u, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f5au, 0x1f5au, 0x0000u, CanonicalizeUnique },
+ { 0x1f5bu, 0x1f5bu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f5cu, 0x1f5cu, 0x0000u, CanonicalizeUnique },
+ { 0x1f5du, 0x1f5du, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f5eu, 0x1f5eu, 0x0000u, CanonicalizeUnique },
+ { 0x1f5fu, 0x1f5fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f60u, 0x1f67u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f68u, 0x1f6fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f70u, 0x1f71u, 0x004au, CanonicalizeRangeLo },
+ { 0x1f72u, 0x1f75u, 0x0056u, CanonicalizeRangeLo },
+ { 0x1f76u, 0x1f77u, 0x0064u, CanonicalizeRangeLo },
+ { 0x1f78u, 0x1f79u, 0x0080u, CanonicalizeRangeLo },
+ { 0x1f7au, 0x1f7bu, 0x0070u, CanonicalizeRangeLo },
+ { 0x1f7cu, 0x1f7du, 0x007eu, CanonicalizeRangeLo },
+ { 0x1f7eu, 0x1fafu, 0x0000u, CanonicalizeUnique },
+ { 0x1fb0u, 0x1fb1u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1fb2u, 0x1fb7u, 0x0000u, CanonicalizeUnique },
+ { 0x1fb8u, 0x1fb9u, 0x0008u, CanonicalizeRangeHi },
+ { 0x1fbau, 0x1fbbu, 0x004au, CanonicalizeRangeHi },
+ { 0x1fbcu, 0x1fbdu, 0x0000u, CanonicalizeUnique },
+ { 0x1fbeu, 0x1fbeu, 0x0007u, CanonicalizeSet },
+ { 0x1fbfu, 0x1fc7u, 0x0000u, CanonicalizeUnique },
+ { 0x1fc8u, 0x1fcbu, 0x0056u, CanonicalizeRangeHi },
+ { 0x1fccu, 0x1fcfu, 0x0000u, CanonicalizeUnique },
+ { 0x1fd0u, 0x1fd1u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1fd2u, 0x1fd7u, 0x0000u, CanonicalizeUnique },
+ { 0x1fd8u, 0x1fd9u, 0x0008u, CanonicalizeRangeHi },
+ { 0x1fdau, 0x1fdbu, 0x0064u, CanonicalizeRangeHi },
+ { 0x1fdcu, 0x1fdfu, 0x0000u, CanonicalizeUnique },
+ { 0x1fe0u, 0x1fe1u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1fe2u, 0x1fe4u, 0x0000u, CanonicalizeUnique },
+ { 0x1fe5u, 0x1fe5u, 0x0007u, CanonicalizeRangeLo },
+ { 0x1fe6u, 0x1fe7u, 0x0000u, CanonicalizeUnique },
+ { 0x1fe8u, 0x1fe9u, 0x0008u, CanonicalizeRangeHi },
+ { 0x1feau, 0x1febu, 0x0070u, CanonicalizeRangeHi },
+ { 0x1fecu, 0x1fecu, 0x0007u, CanonicalizeRangeHi },
+ { 0x1fedu, 0x1ff7u, 0x0000u, CanonicalizeUnique },
+ { 0x1ff8u, 0x1ff9u, 0x0080u, CanonicalizeRangeHi },
+ { 0x1ffau, 0x1ffbu, 0x007eu, CanonicalizeRangeHi },
+ { 0x1ffcu, 0x2131u, 0x0000u, CanonicalizeUnique },
+ { 0x2132u, 0x2132u, 0x001cu, CanonicalizeRangeLo },
+ { 0x2133u, 0x214du, 0x0000u, CanonicalizeUnique },
+ { 0x214eu, 0x214eu, 0x001cu, CanonicalizeRangeHi },
+ { 0x214fu, 0x215fu, 0x0000u, CanonicalizeUnique },
+ { 0x2160u, 0x216fu, 0x0010u, CanonicalizeRangeLo },
+ { 0x2170u, 0x217fu, 0x0010u, CanonicalizeRangeHi },
+ { 0x2180u, 0x2182u, 0x0000u, CanonicalizeUnique },
+ { 0x2183u, 0x2184u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x2185u, 0x24b5u, 0x0000u, CanonicalizeUnique },
+ { 0x24b6u, 0x24cfu, 0x001au, CanonicalizeRangeLo },
+ { 0x24d0u, 0x24e9u, 0x001au, CanonicalizeRangeHi },
+ { 0x24eau, 0x2bffu, 0x0000u, CanonicalizeUnique },
+ { 0x2c00u, 0x2c2eu, 0x0030u, CanonicalizeRangeLo },
+ { 0x2c2fu, 0x2c2fu, 0x0000u, CanonicalizeUnique },
+ { 0x2c30u, 0x2c5eu, 0x0030u, CanonicalizeRangeHi },
+ { 0x2c5fu, 0x2c5fu, 0x0000u, CanonicalizeUnique },
+ { 0x2c60u, 0x2c61u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x2c62u, 0x2c62u, 0x29f7u, CanonicalizeRangeHi },
+ { 0x2c63u, 0x2c63u, 0x0ee6u, CanonicalizeRangeHi },
+ { 0x2c64u, 0x2c64u, 0x29e7u, CanonicalizeRangeHi },
+ { 0x2c65u, 0x2c65u, 0x2a2bu, CanonicalizeRangeHi },
+ { 0x2c66u, 0x2c66u, 0x2a28u, CanonicalizeRangeHi },
+ { 0x2c67u, 0x2c6cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x2c6du, 0x2c6du, 0x2a1cu, CanonicalizeRangeHi },
+ { 0x2c6eu, 0x2c6eu, 0x29fdu, CanonicalizeRangeHi },
+ { 0x2c6fu, 0x2c6fu, 0x2a1fu, CanonicalizeRangeHi },
+ { 0x2c70u, 0x2c70u, 0x2a1eu, CanonicalizeRangeHi },
+ { 0x2c71u, 0x2c71u, 0x0000u, CanonicalizeUnique },
+ { 0x2c72u, 0x2c73u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x2c74u, 0x2c74u, 0x0000u, CanonicalizeUnique },
+ { 0x2c75u, 0x2c76u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x2c77u, 0x2c7du, 0x0000u, CanonicalizeUnique },
+ { 0x2c7eu, 0x2c7fu, 0x2a3fu, CanonicalizeRangeHi },
+ { 0x2c80u, 0x2ce3u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x2ce4u, 0x2ceau, 0x0000u, CanonicalizeUnique },
+ { 0x2cebu, 0x2ceeu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x2cefu, 0x2cffu, 0x0000u, CanonicalizeUnique },
+ { 0x2d00u, 0x2d25u, 0x1c60u, CanonicalizeRangeHi },
+ { 0x2d26u, 0xa63fu, 0x0000u, CanonicalizeUnique },
+ { 0xa640u, 0xa66du, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa66eu, 0xa67fu, 0x0000u, CanonicalizeUnique },
+ { 0xa680u, 0xa697u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa698u, 0xa721u, 0x0000u, CanonicalizeUnique },
+ { 0xa722u, 0xa72fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa730u, 0xa731u, 0x0000u, CanonicalizeUnique },
+ { 0xa732u, 0xa76fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa770u, 0xa778u, 0x0000u, CanonicalizeUnique },
+ { 0xa779u, 0xa77cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0xa77du, 0xa77du, 0x8a04u, CanonicalizeRangeHi },
+ { 0xa77eu, 0xa787u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa788u, 0xa78au, 0x0000u, CanonicalizeUnique },
+ { 0xa78bu, 0xa78cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0xa78du, 0xa78du, 0xa528u, CanonicalizeRangeHi },
+ { 0xa78eu, 0xa78fu, 0x0000u, CanonicalizeUnique },
+ { 0xa790u, 0xa791u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa792u, 0xa79fu, 0x0000u, CanonicalizeUnique },
+ { 0xa7a0u, 0xa7a9u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa7aau, 0xff20u, 0x0000u, CanonicalizeUnique },
+ { 0xff21u, 0xff3au, 0x0020u, CanonicalizeRangeLo },
+ { 0xff3bu, 0xff40u, 0x0000u, CanonicalizeUnique },
+ { 0xff41u, 0xff5au, 0x0020u, CanonicalizeRangeHi },
+ { 0xff5bu, 0xffffu, 0x0000u, CanonicalizeUnique },
+};
+
+const size_t LATIN_CANONICALIZATION_RANGES = 20;
+LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {
+ { 0x0000u, 0x0040u, 0x0000u, CanonicalizeLatinSelf },
+ { 0x0041u, 0x005au, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x005bu, 0x0060u, 0x0000u, CanonicalizeLatinSelf },
+ { 0x0061u, 0x007au, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x007bu, 0x00bfu, 0x0000u, CanonicalizeLatinSelf },
+ { 0x00c0u, 0x00d6u, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeLatinSelf },
+ { 0x00d8u, 0x00deu, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeLatinSelf },
+ { 0x00e0u, 0x00f6u, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeLatinSelf },
+ { 0x00f8u, 0x00feu, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x00ffu, 0x00ffu, 0x0000u, CanonicalizeLatinSelf },
+ { 0x0100u, 0x0177u, 0x0000u, CanonicalizeLatinInvalid },
+ { 0x0178u, 0x0178u, 0x00ffu, CanonicalizeLatinOther },
+ { 0x0179u, 0x039bu, 0x0000u, CanonicalizeLatinInvalid },
+ { 0x039cu, 0x039cu, 0x00b5u, CanonicalizeLatinOther },
+ { 0x039du, 0x03bbu, 0x0000u, CanonicalizeLatinInvalid },
+ { 0x03bcu, 0x03bcu, 0x00b5u, CanonicalizeLatinOther },
+ { 0x03bdu, 0xffffu, 0x0000u, CanonicalizeLatinInvalid },
+};
+
+} } // JSC::Yarr
+
diff --git a/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.h b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.h
new file mode 100644
index 000000000..be0ead43d
--- /dev/null
+++ b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrCanonicalizeUCS2_H
+#define YarrCanonicalizeUCS2_H
+
+#include <stdint.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace JSC { namespace Yarr {
+
+// This set of data (autogenerated using YarrCanonicalizeUCS2.js into YarrCanonicalizeUCS2.cpp)
+// provides information for each UCS2 code point as to the set of code points that it should
+// match under the ES5.1 case insensitive RegExp matching rules, specified in 15.10.2.8.
+enum UCS2CanonicalizationType {
+ CanonicalizeUnique, // No other code point is canonically equal to this one, e.g. 0x0.
+ CanonicalizeSet, // Value is an index into characterSetInfo, selecting a zero-terminated set of equivalent code points.
+ CanonicalizeRangeLo, // Lower half of a pair; value is the positive delta to add to reach the partner, e.g. 0x41 has value 0x20, -> 0x61.
+ CanonicalizeRangeHi, // Upper half of a pair; value is the positive delta to subtract to reach the partner, e.g. 0x61 has value 0x20, -> 0x41.
+ CanonicalizeAlternatingAligned, // Aligned consecutive pair (starts on an even code point, partner is ch ^ 1), e.g. 0x1f4,0x1f5.
+ CanonicalizeAlternatingUnaligned, // Unaligned consecutive pair (starts on an odd code point), e.g. 0x241,0x242.
+};
+struct UCS2CanonicalizationRange { uint16_t begin, end, value, type; }; // Inclusive [begin, end] range; type is a UCS2CanonicalizationType that says how to interpret value.
+extern const size_t UCS2_CANONICALIZATION_RANGES; // Number of entries in rangeInfo.
+extern uint16_t* characterSetInfo[]; // Zero-terminated sets of equivalent code points, indexed by the value of a CanonicalizeSet range.
+extern UCS2CanonicalizationRange rangeInfo[]; // Bisected by rangeInfoFor, so entries must be sorted by begin and must not overlap.
+
+// This table is similar to the full rangeInfo table, but it maps each UCS2 code point to
+// the set of Latin1 code points that could match it.
+enum LatinCanonicalizationType {
+ CanonicalizeLatinSelf, // This character is in the Latin1 range, but has no canonical equivalent in the range.
+ CanonicalizeLatinMask0x20, // One of a pair of Latin1 characters that are equal under the mask 0x20, e.g. 0x41 and 0x61.
+ CanonicalizeLatinOther, // This character is not in the Latin1 range, but canonicalizes to another that is (in latinRangeInfo, value holds that character, e.g. 0x0178 -> 0x00ff).
+ CanonicalizeLatinInvalid, // Cannot match against Latin1 input.
+};
+struct LatinCanonicalizationRange { uint16_t begin, end, value, type; }; // Same field layout as UCS2CanonicalizationRange; type is a LatinCanonicalizationType.
+extern const size_t LATIN_CANONICALIZATION_RANGES; // Number of entries in latinRangeInfo.
+extern LatinCanonicalizationRange latinRangeInfo[]; // Defined in the autogenerated YarrCanonicalizeUCS2.cpp.
+
+// Returns the rangeInfo entry whose inclusive [begin, end] range contains ch. This searches in log2 time over ~364 entries, so should typically result in 8 compares.
+inline UCS2CanonicalizationRange* rangeInfoFor(UChar ch)
+{
+ UCS2CanonicalizationRange* info = rangeInfo; // First entry of the window still being searched.
+ size_t entries = UCS2_CANONICALIZATION_RANGES; // Number of entries in that window.
+
+ while (true) { // No not-found exit: this relies on the table containing a range for every possible ch.
+ size_t candidate = entries >> 1; // Probe the middle of the window.
+ UCS2CanonicalizationRange* candidateInfo = info + candidate;
+ if (ch < candidateInfo->begin)
+ entries = candidate; // ch is below the probe: keep only the entries before it.
+ else if (ch <= candidateInfo->end)
+ return candidateInfo;
+ else {
+ info = candidateInfo + 1; // ch is above the probe: continue with the entries after it.
+ entries -= (candidate + 1);
+ }
+ }
+}
+
+// Returns the single other character that ch matches; info must be the range containing ch. Should only be called for characters that have one canonically matching value.
+inline UChar getCanonicalPair(UCS2CanonicalizationRange* info, UChar ch)
+{
+ ASSERT(ch >= info->begin && ch <= info->end);
+ switch (info->type) {
+ case CanonicalizeRangeLo:
+ return ch + info->value; // Partner lies value above ch.
+ case CanonicalizeRangeHi:
+ return ch - info->value; // Partner lies value below ch.
+ case CanonicalizeAlternatingAligned:
+ return ch ^ 1; // Pair starts on an even code point: flip the low bit.
+ case CanonicalizeAlternatingUnaligned:
+ return ((ch - 1) ^ 1) + 1; // Pair starts on an odd code point: shift down by one, flip the low bit, shift back.
+ default:
+ ASSERT_NOT_REACHED(); // CanonicalizeUnique and CanonicalizeSet ranges have no single partner.
+ }
+ ASSERT_NOT_REACHED();
+ return 0; // Only reachable through the default case above.
+}
+
+// Returns true if no other UCS2 codepoint can match this value, i.e. ch lies in a CanonicalizeUnique range.
+inline bool isCanonicallyUnique(UChar ch)
+{
+ return rangeInfoFor(ch)->type == CanonicalizeUnique;
+}
+
+// Returns true if values are equal, under the canonicalization rules, i.e. b is a itself or one of the characters a's range says it matches.
+inline bool areCanonicallyEquivalent(UChar a, UChar b)
+{
+ UCS2CanonicalizationRange* info = rangeInfoFor(a); // Only a's range is looked up; b is tested against what a can match.
+ switch (info->type) {
+ case CanonicalizeUnique:
+ return a == b;
+ case CanonicalizeSet: {
+ for (uint16_t* set = characterSetInfo[info->value]; (a = *set); ++set) { // Walks the zero-terminated set, reusing a as the current member.
+ if (a == b)
+ return true;
+ }
+ return false;
+ }
+ case CanonicalizeRangeLo:
+ return (a == b) || (a + info->value == b);
+ case CanonicalizeRangeHi:
+ return (a == b) || (a - info->value == b);
+ case CanonicalizeAlternatingAligned:
+ return (a | 1) == (b | 1); // Equal once the low bit is ignored, i.e. both fall in the same even/odd pair.
+ case CanonicalizeAlternatingUnaligned:
+ return ((a - 1) | 1) == ((b - 1) | 1); // Same test after shifting down by one, for pairs that start on an odd code point.
+ }
+
+ ASSERT_NOT_REACHED();
+ return false;
+}
+
+} } // JSC::Yarr
+
+#endif
diff --git a/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.js b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.js
new file mode 100644
index 000000000..00361dd46
--- /dev/null
+++ b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.js
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// See ES 5.1, 15.10.2.8. Maps the code unit 'ch' to the code unit used when comparing case-insensitively: the upper-cased code unit, or 'ch' itself in the two cases flagged below.
+function canonicalize(ch)
+{
+ var u = String.fromCharCode(ch).toUpperCase();
+ if (u.length > 1) // Upper-casing produced more than one code unit: keep ch unchanged.
+ return ch;
+ var cu = u.charCodeAt(0);
+ if (ch >= 128 && cu < 128) // Never map a non-ASCII character onto an ASCII one.
+ return ch;
+ return cu;
+}
+
+var MAX_UCS2 = 0xFFFF; // Highest code unit visited by the passes below.
+var MAX_LATIN = 0xFF; // Highest code unit treated as "latin" when building latinTypeInfo.
+
+var groupedCanonically = [];
+// Pass 1: populate groupedCanonically - this is a mapping from canonicalized
+// values back to the set of character codes that canonicalize to them.
+for (var i = 0; i <= MAX_UCS2; ++i) {
+ var ch = canonicalize(i);
+ if (!groupedCanonically[ch]) // First character seen for this canonical value: start its list.
+ groupedCanonically[ch] = [];
+ groupedCanonically[ch].push(i);
+}
+
+var typeInfo = [];
+var latinTypeInfo = [];
+var characterSetInfo = [];
+// Pass 2: populate typeInfo, latinTypeInfo & characterSetInfo. For every character calculate
+// a typeInfo value of the form "Type:value" (split on ':' again when the tables are printed below).
+for (cu in groupedCanonically) {
+ // The set of characters that canonicalize to cu
+ var characters = groupedCanonically[cu];
+
+ // If there is only one, it is unique.
+ if (characters.length == 1) {
+ typeInfo[characters[0]] = "CanonicalizeUnique:0";
+ latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0"; // Self when the character is itself <= MAX_LATIN, otherwise Invalid.
+ continue;
+ }
+
+ // Sort the array.
+ characters.sort(function(x,y){return x-y;});
+
+ // If there are more than two characters, create an entry in characterSetInfo.
+ if (characters.length > 2) {
+ for (i in characters)
+ typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length; // Payload is the index this set gets in characterSetInfo.
+ characterSetInfo.push(characters);
+
+ if (characters[1] <= MAX_LATIN) // characters is sorted, so this detects two or more latin members.
+ throw new Error("sets with more than one latin character not supported!");
+ if (characters[0] <= MAX_LATIN) { // Exactly one latin member (the lowest): every member records it as its LatinOther value...
+ for (i in characters)
+ latinTypeInfo[characters[i]] = "CanonicalizeLatinOther:" + characters[0];
+ latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0"; // ...and the latin member itself is then overwritten with LatinSelf.
+ } else {
+ for (i in characters)
+ latinTypeInfo[characters[i]] = "CanonicalizeLatinInvalid:0";
+ }
+
+ continue;
+ }
+
+ // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner.
+ var lo = characters[0];
+ var hi = characters[1];
+ var delta = hi - lo;
+ if (delta == 1) {
+ var type = lo & 1 ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0"; // Odd lo => Unaligned, even lo => Aligned.
+ typeInfo[lo] = type;
+ typeInfo[hi] = type;
+ } else {
+ typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
+ typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
+ }
+
+ if (lo > MAX_LATIN) { // Neither member is latin (hi > lo after sorting).
+ latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0";
+ latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
+ } else if (hi > MAX_LATIN) { // Only lo is latin.
+ latinTypeInfo[lo] = "CanonicalizeLatinSelf:0";
+ latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
+ } else {
+ if (delta != 0x20 || lo & 0x20) // Both latin: only pairs that differ by exactly bit 0x20 (clear in lo) are handled.
+ throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
+ latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
+ latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
+ }
+}
+
+var rangeInfo = [];
+// Pass 3: coalesce types into ranges - each run of consecutive characters with the same typeInfo string becomes one {begin, end, type} entry, with 'end' inclusive.
+for (var end = 0; end <= MAX_UCS2; ++end) {
+ var begin = end;
+ var type = typeInfo[end];
+ while (end < MAX_UCS2 && typeInfo[end + 1] == type) // Extend the run while the next character has the same type string.
+ ++end;
+ rangeInfo.push({begin:begin, end:end, type:type});
+}
+
+var latinRangeInfo = [];
+// Pass 4: coalesce latin-1 types into ranges - same run-merging as pass 3, applied to latinTypeInfo.
+for (var end = 0; end <= MAX_UCS2; ++end) {
+ var begin = end;
+ var type = latinTypeInfo[end];
+ while (end < MAX_UCS2 && latinTypeInfo[end + 1] == type)
+ ++end;
+ latinRangeInfo.push({begin:begin, end:end, type:type});
+}
+
+
+// Helper function to convert a number to a fixed width hex representation of a C uint16_t: "0x", at least four lower-case hex digits, then a "u" suffix. Numeric strings are accepted too (converted with Number()).
+function hex(x)
+{
+ var s = Number(x).toString(16);
+ while (s.length < 4)
+ s = 0 + s; // number + string concatenates, so this left-pads with "0".
+ return "0x" + s + "u";
+}
+
+var copyright = ( // License comment text, printed first in the generated output; lines are joined with explicit "\n".
+ "/*" + "\n" +
+ " * Copyright (C) 2012 Apple Inc. All rights reserved." + "\n" +
+ " *" + "\n" +
+ " * Redistribution and use in source and binary forms, with or without" + "\n" +
+ " * modification, are permitted provided that the following conditions" + "\n" +
+ " * are met:" + "\n" +
+ " * 1. Redistributions of source code must retain the above copyright" + "\n" +
+ " * notice, this list of conditions and the following disclaimer." + "\n" +
+ " * 2. Redistributions in binary form must reproduce the above copyright" + "\n" +
+ " * notice, this list of conditions and the following disclaimer in the" + "\n" +
+ " * documentation and/or other materials provided with the distribution." + "\n" +
+ " *" + "\n" +
+ " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY" + "\n" +
+ " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE" + "\n" +
+ " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR" + "\n" +
+ " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR" + "\n" +
+ " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL," + "\n" +
+ " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO," + "\n" +
+ " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR" + "\n" +
+ " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY" + "\n" +
+ " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT" + "\n" +
+ " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE" + "\n" +
+ " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " + "\n" +
+ " */");
+
+print(copyright); // From here on the script prints the generated C++ source, one line per print() call.
+print();
+print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
+print();
+print('#include "config.h"');
+print('#include "YarrCanonicalizeUCS2.h"');
+print();
+print("namespace JSC { namespace Yarr {");
+print();
+print("#include <stdint.h>"); // NOTE(review): printed after the namespace is opened, so the generated file includes <stdint.h> inside JSC::Yarr - confirm this is intended.
+print();
+
+for (i in characterSetInfo) { // One array per canonicalization set: its members in hex, followed by a 0 terminator.
+ var characters = ""
+ var set = characterSetInfo[i];
+ for (var j in set)
+ characters += hex(set[j]) + ", ";
+ print("uint16_t ucs2CharacterSet" + i + "[] = { " + characters + "0 };");
+}
+print();
+print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
+print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {"); // Table of pointers to the per-set arrays, in characterSetInfo index order.
+for (i in characterSetInfo)
+print(" ucs2CharacterSet" + i + ",");
+print("};");
+print();
+print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";");
+print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {");
+for (i in rangeInfo) {
+ var info = rangeInfo[i];
+ var typeAndValue = info.type.split(':'); // "Type:value" => [type name, value payload].
+ print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },"); // Emitted order: begin, end, value, type.
+}
+print("};");
+print();
+print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";");
+print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {");
+for (i in latinRangeInfo) {
+ var info = latinRangeInfo[i];
+ var typeAndValue = info.type.split(':'); // Same "Type:value" encoding as the UCS2 table above.
+ print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
+}
+print("};");
+print();
+print("} } // JSC::Yarr");
+print();
+
diff --git a/Source/JavaScriptCore/yarr/YarrInterpreter.cpp b/Source/JavaScriptCore/yarr/YarrInterpreter.cpp
index 743f16048..ba10171bf 100644
--- a/Source/JavaScriptCore/yarr/YarrInterpreter.cpp
+++ b/Source/JavaScriptCore/yarr/YarrInterpreter.cpp
@@ -29,6 +29,7 @@
#include "UString.h"
#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
#include <wtf/BumpPointerAllocator.h>
#include <wtf/DataLog.h>
#include <wtf/text/CString.h>
@@ -41,6 +42,7 @@ using namespace WTF;
namespace JSC { namespace Yarr {
+template<typename CharType>
class Interpreter {
public:
struct ParenthesesDisjunctionContext;
@@ -169,55 +171,9 @@ public:
allocatorPool = allocatorPool->dealloc(context);
}
- // This class is a placeholder for future character iterator, current
- // proposed name StringConstCharacterIterator.
- class CharAccess {
- public:
- CharAccess(const UString& s)
- {
- if (s.is8Bit()) {
- m_charSize = Char8;
- m_ptr.ptr8 = s.characters8();
- } else {
- m_charSize = Char16;
- m_ptr.ptr16 = s.characters16();
- }
- }
-
- CharAccess(const LChar* ptr)
- : m_charSize(Char8)
- {
- m_ptr.ptr8 = ptr;
- }
-
- CharAccess(const UChar* ptr)
- : m_charSize(Char16)
- {
- m_ptr.ptr16 = ptr;
- }
-
- ~CharAccess()
- {
- }
-
- inline UChar operator[](unsigned index)
- {
- if (m_charSize == Char8)
- return m_ptr.ptr8[index];
- return m_ptr.ptr16[index];
- }
-
- private:
- union {
- const LChar* ptr8;
- const UChar* ptr16;
- } m_ptr;
- YarrCharSize m_charSize;
- };
-
class InputStream {
public:
- InputStream(const UString& input, unsigned start, unsigned length)
+ InputStream(const CharType* input, unsigned start, unsigned length)
: input(input)
, pos(start)
, length(length)
@@ -331,7 +287,7 @@ public:
}
private:
- CharAccess input;
+ const CharType* input;
unsigned pos;
unsigned length;
};
@@ -383,15 +339,22 @@ public:
if (pattern->m_ignoreCase) {
for (unsigned i = 0; i < matchSize; ++i) {
- int ch = input.reread(matchBegin + i);
+ int oldCh = input.reread(matchBegin + i);
+ int ch = input.readChecked(negativeInputOffset + matchSize - i);
- int lo = Unicode::toLower(ch);
- int hi = Unicode::toUpper(ch);
+ if (oldCh == ch)
+ continue;
- if ((lo != hi) ? (!checkCasedCharacter(lo, hi, negativeInputOffset + matchSize - i)) : (!checkCharacter(ch, negativeInputOffset + matchSize - i))) {
- input.uncheckInput(matchSize);
- return false;
- }
+ // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that
+ // unicode values are never allowed to match against ascii ones.
+ if (isASCII(oldCh) || isASCII(ch)) {
+ if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
+ continue;
+ } else if (areCanonicallyEquivalent(oldCh, ch))
+ continue;
+
+ input.uncheckInput(matchSize);
+ return false;
}
} else {
for (unsigned i = 0; i < matchSize; ++i) {
@@ -1481,7 +1444,7 @@ public:
return output[0];
}
- Interpreter(BytecodePattern* pattern, unsigned* output, const UString input, unsigned start, unsigned length)
+ Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
: pattern(pattern)
, output(output)
, input(input, start, length)
@@ -1971,18 +1934,31 @@ PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocat
return ByteCompiler(pattern).compile(allocator);
}
-unsigned interpret(BytecodePattern* bytecode, const UString& input, unsigned start, unsigned length, unsigned* output)
+unsigned interpret(BytecodePattern* bytecode, const UString& input, unsigned start, unsigned* output)
+{
+ if (input.is8Bit()) // 8-bit strings run the Interpreter<LChar> instantiation; all others run Interpreter<UChar>.
+ return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
+ return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret(); // The string's own length is passed as the input length.
+}
+
+unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
+{
+ return Interpreter<LChar>(bytecode, output, input, length, start).interpret(); // Overload for a raw LChar buffer; the caller supplies the length explicitly.
+}
+
+unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
{
- return Interpreter(bytecode, output, input, start, length).interpret();
+ return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
}
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
+// These should be the same for both UChar & LChar.
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
} }
diff --git a/Source/JavaScriptCore/yarr/YarrInterpreter.h b/Source/JavaScriptCore/yarr/YarrInterpreter.h
index 4bb1efc50..4ecd69eca 100644
--- a/Source/JavaScriptCore/yarr/YarrInterpreter.h
+++ b/Source/JavaScriptCore/yarr/YarrInterpreter.h
@@ -375,6 +375,11 @@ private:
Vector<CharacterClass*> m_userCharacterClasses;
};
+JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*);
+JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const UString& input, unsigned start, unsigned* output);
+unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output);
+unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output);
+
} } // namespace JSC::Yarr
#endif // YarrInterpreter_h
diff --git a/Source/JavaScriptCore/yarr/YarrJIT.cpp b/Source/JavaScriptCore/yarr/YarrJIT.cpp
index 2269792ec..60519ebd8 100644
--- a/Source/JavaScriptCore/yarr/YarrJIT.cpp
+++ b/Source/JavaScriptCore/yarr/YarrJIT.cpp
@@ -29,6 +29,7 @@
#include <wtf/ASCIICType.h>
#include "LinkBuffer.h"
#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
#if ENABLE(YARR_JIT)
@@ -36,6 +37,7 @@ using namespace WTF;
namespace JSC { namespace Yarr {
+template<YarrJITCompileMode compileMode>
class YarrGenerator : private MacroAssembler {
friend void jitCompile(JSGlobalData*, YarrCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline);
@@ -49,6 +51,7 @@ class YarrGenerator : private MacroAssembler {
static const RegisterID regT1 = ARMRegisters::r6;
static const RegisterID returnRegister = ARMRegisters::r0;
+ static const RegisterID returnRegister2 = ARMRegisters::r1;
#elif CPU(MIPS)
static const RegisterID input = MIPSRegisters::a0;
static const RegisterID index = MIPSRegisters::a1;
@@ -59,6 +62,7 @@ class YarrGenerator : private MacroAssembler {
static const RegisterID regT1 = MIPSRegisters::t5;
static const RegisterID returnRegister = MIPSRegisters::v0;
+ static const RegisterID returnRegister2 = MIPSRegisters::v1;
#elif CPU(SH4)
static const RegisterID input = SH4Registers::r4;
static const RegisterID index = SH4Registers::r5;
@@ -69,6 +73,7 @@ class YarrGenerator : private MacroAssembler {
static const RegisterID regT1 = SH4Registers::r1;
static const RegisterID returnRegister = SH4Registers::r0;
+ static const RegisterID returnRegister2 = SH4Registers::r1;
#elif CPU(X86)
static const RegisterID input = X86Registers::eax;
static const RegisterID index = X86Registers::edx;
@@ -79,6 +84,7 @@ class YarrGenerator : private MacroAssembler {
static const RegisterID regT1 = X86Registers::esi;
static const RegisterID returnRegister = X86Registers::eax;
+ static const RegisterID returnRegister2 = X86Registers::edx;
#elif CPU(X86_64)
static const RegisterID input = X86Registers::edi;
static const RegisterID index = X86Registers::esi;
@@ -89,6 +95,7 @@ class YarrGenerator : private MacroAssembler {
static const RegisterID regT1 = X86Registers::ebx;
static const RegisterID returnRegister = X86Registers::eax;
+ static const RegisterID returnRegister2 = X86Registers::edx;
#endif
void optimizeAlternative(PatternAlternative* alternative)
@@ -262,10 +269,10 @@ class YarrGenerator : private MacroAssembler {
// For case-insesitive compares, non-ascii characters that have different
// upper & lower case representations are converted to a character class.
- ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || (Unicode::toLower(ch) == Unicode::toUpper(ch)));
+ ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch));
if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) {
- or32(TrustedImm32(32), character);
- ch = Unicode::toLower(ch);
+ or32(TrustedImm32(0x20), character);
+ ch |= 0x20;
}
return branch32(NotEqual, character, Imm32(ch));
@@ -304,6 +311,65 @@ class YarrGenerator : private MacroAssembler {
jump(Address(stackPointerRegister, frameLocation * sizeof(void*)));
}
+ void initCallFrame()
+ {
+ unsigned callFrameSize = m_pattern.m_body->m_callFrameSize;
+ if (callFrameSize) // Emit nothing when the pattern body needs no frame slots.
+ subPtr(Imm32(callFrameSize * sizeof(void*)), stackPointerRegister); // Reserve callFrameSize pointer-sized slots by subtracting from the stack pointer.
+ }
+ void removeCallFrame()
+ {
+ unsigned callFrameSize = m_pattern.m_body->m_callFrameSize;
+ if (callFrameSize) // Emit nothing when the pattern body needs no frame slots.
+ addPtr(Imm32(callFrameSize * sizeof(void*)), stackPointerRegister); // Add back the same number of bytes that initCallFrame() subtracts.
+ }
+
+ // Used to record subpatterns, should only be called if compileMode is IncludeSubpatterns.
+ void setSubpatternStart(RegisterID reg, unsigned subpattern)
+ {
+ ASSERT(subpattern); // Index 0 is rejected here.
+ // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-(
+ store32(reg, Address(output, (subpattern << 1) * sizeof(int))); // The start of subpattern N is stored in int slot 2N of the output vector.
+ }
+ void setSubpatternEnd(RegisterID reg, unsigned subpattern)
+ {
+ ASSERT(subpattern); // Index 0 is rejected here.
+ // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-(
+ store32(reg, Address(output, ((subpattern << 1) + 1) * sizeof(int))); // The end of subpattern N is stored in int slot 2N + 1 of the output vector.
+ }
+ void clearSubpatternStart(unsigned subpattern)
+ {
+ ASSERT(subpattern); // Index 0 is rejected here.
+ // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-(
+ store32(TrustedImm32(-1), Address(output, (subpattern << 1) * sizeof(int))); // Overwrite the start slot (2N) with -1; the end slot is left untouched.
+ }
+
+ // We use one of three different strategies to track the start of the current match,
+ // while matching.
+ // 1) If the pattern has a fixed size, do nothing! - we calculate the value lazily
+ // at the end of matching. This is irrespective of compileMode, and in this case
+ // these methods should never be called.
+ // 2) If we're compiling IncludeSubpatterns, 'output' contains a pointer to an output
+ // vector, store the match start in the output vector.
+ // 3) If we're compiling MatchOnly, 'output' is unused, store the match start directly
+ // in this register.
+ void setMatchStart(RegisterID reg)
+ {
+ ASSERT(!m_pattern.m_body->m_hasFixedSize); // Strategy 1 patterns must never reach this helper.
+ if (compileMode == IncludeSubpatterns)
+ store32(reg, output); // Strategy 2: write through the 'output' pointer (first slot of the output vector).
+ else
+ move(reg, output); // Strategy 3: keep the value in the 'output' register itself.
+ }
+ void getMatchStart(RegisterID reg)
+ {
+ ASSERT(!m_pattern.m_body->m_hasFixedSize); // Fixed-size patterns never record a match start, so there is nothing to read back.
+ if (compileMode == IncludeSubpatterns)
+ load32(output, reg); // Read the match start back through the 'output' pointer.
+ else
+ move(output, reg); // The match start is held in the 'output' register itself.
+ }
+
enum YarrOpCode {
// These nodes wrap body alternatives - those in the main disjunction,
// rather than subpatterns or assertions. These are chained together in
@@ -685,9 +751,9 @@ class YarrGenerator : private MacroAssembler {
// For case-insesitive compares, non-ascii characters that have different
// upper & lower case representations are converted to a character class.
- ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || (Unicode::toLower(ch) == Unicode::toUpper(ch)));
+ ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch));
- if ((m_pattern.m_ignoreCase) && (isASCIIAlpha(ch)))
+ if (m_pattern.m_ignoreCase && isASCIIAlpha(ch))
ignoreCaseMask |= 32;
for (numberCharacters = 1; numberCharacters < maxCharactersAtOnce && nextOp->m_op == OpTerm; ++numberCharacters, nextOp = &m_ops[opIndex + numberCharacters]) {
@@ -713,7 +779,7 @@ class YarrGenerator : private MacroAssembler {
// For case-insesitive compares, non-ascii characters that have different
// upper & lower case representations are converted to a character class.
- ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(currentCharacter) || (Unicode::toLower(currentCharacter) == Unicode::toUpper(currentCharacter)));
+ ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(currentCharacter) || isCanonicallyUnique(currentCharacter));
allCharacters |= (currentCharacter << shiftAmount);
@@ -728,12 +794,12 @@ class YarrGenerator : private MacroAssembler {
return;
case 2: {
BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar));
- load16(address, character);
+ load16Unaligned(address, character);
break;
}
case 3: {
BaseIndex highAddress(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar));
- load16(highAddress, character);
+ load16Unaligned(highAddress, character);
if (ignoreCaseMask)
or32(Imm32(ignoreCaseMask), character);
op.m_jumps.append(branch32(NotEqual, character, Imm32((allCharacters & 0xffff) | ignoreCaseMask)));
@@ -790,10 +856,10 @@ class YarrGenerator : private MacroAssembler {
// For case-insesitive compares, non-ascii characters that have different
// upper & lower case representations are converted to a character class.
- ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || (Unicode::toLower(ch) == Unicode::toUpper(ch)));
+ ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch));
if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) {
- or32(TrustedImm32(32), character);
- ch = Unicode::toLower(ch);
+ or32(TrustedImm32(0x20), character);
+ ch |= 0x20;
}
op.m_jumps.append(branch32(NotEqual, character, Imm32(ch)));
@@ -1029,7 +1095,6 @@ class YarrGenerator : private MacroAssembler {
m_backtrackingState.link(this);
- Label backtrackBegin(this);
loadFromFrame(term->frameLocation, countRegister);
nonGreedyFailures.append(atEndOfInput());
@@ -1068,11 +1133,8 @@ class YarrGenerator : private MacroAssembler {
JumpList saveStartIndex;
JumpList foundEndingNewLine;
- if (m_pattern.m_body->m_hasFixedSize) {
- move(index, matchPos);
- sub32(Imm32(m_checked), matchPos);
- } else
- load32(Address(output), matchPos);
+ ASSERT(!m_pattern.m_body->m_hasFixedSize);
+ getMatchStart(matchPos);
saveStartIndex.append(branchTest32(Zero, matchPos));
Label findBOLLoop(this);
@@ -1092,7 +1154,8 @@ class YarrGenerator : private MacroAssembler {
if (!m_pattern.m_multiline && term->anchors.bolAnchor)
op.m_jumps.append(branchTest32(NonZero, matchPos));
- store32(matchPos, Address(output));
+ ASSERT(!m_pattern.m_body->m_hasFixedSize);
+ setMatchStart(matchPos);
move(index, matchPos);
@@ -1314,8 +1377,7 @@ class YarrGenerator : private MacroAssembler {
// If we get here, the prior alternative matched - return success.
// Adjust the stack pointer to remove the pattern's frame.
- if (m_pattern.m_body->m_callFrameSize)
- addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister);
+ removeCallFrame();
// Load appropriate values into the return register and the first output
// slot, and return. In the case of pattern with a fixed size, we will
@@ -1325,10 +1387,14 @@ class YarrGenerator : private MacroAssembler {
move(index, returnRegister);
if (priorAlternative->m_minimumSize)
sub32(Imm32(priorAlternative->m_minimumSize), returnRegister);
- store32(returnRegister, output);
+ if (compileMode == IncludeSubpatterns)
+ store32(returnRegister, output);
} else
- load32(Address(output), returnRegister);
- store32(index, Address(output, 4));
+ getMatchStart(returnRegister);
+ if (compileMode == IncludeSubpatterns)
+ store32(index, Address(output, 4));
+ move(index, returnRegister2);
+
generateReturn();
// This is the divide between the tail of the prior alternative, above, and
@@ -1511,17 +1577,16 @@ class YarrGenerator : private MacroAssembler {
// FIXME: could avoid offsetting this value in JIT code, apply
// offsets only afterwards, at the point the results array is
// being accessed.
- if (term->capture()) {
- int offsetId = term->parentheses.subpatternId << 1;
+ if (term->capture() && compileMode == IncludeSubpatterns) {
int inputOffset = term->inputPosition - m_checked;
if (term->quantityType == QuantifierFixedCount)
inputOffset -= term->parentheses.disjunction->m_minimumSize;
if (inputOffset) {
move(index, indexTemporary);
add32(Imm32(inputOffset), indexTemporary);
- store32(indexTemporary, Address(output, offsetId * sizeof(int)));
+ setSubpatternStart(indexTemporary, term->parentheses.subpatternId);
} else
- store32(index, Address(output, offsetId * sizeof(int)));
+ setSubpatternStart(index, term->parentheses.subpatternId);
}
break;
}
@@ -1547,15 +1612,14 @@ class YarrGenerator : private MacroAssembler {
// FIXME: could avoid offsetting this value in JIT code, apply
// offsets only afterwards, at the point the results array is
// being accessed.
- if (term->capture()) {
- int offsetId = (term->parentheses.subpatternId << 1) + 1;
+ if (term->capture() && compileMode == IncludeSubpatterns) {
int inputOffset = term->inputPosition - m_checked;
if (inputOffset) {
move(index, indexTemporary);
add32(Imm32(inputOffset), indexTemporary);
- store32(indexTemporary, Address(output, offsetId * sizeof(int)));
+ setSubpatternEnd(indexTemporary, term->parentheses.subpatternId);
} else
- store32(index, Address(output, offsetId * sizeof(int)));
+ setSubpatternEnd(index, term->parentheses.subpatternId);
}
// If the parentheses are quantified Greedy then add a label to jump back
@@ -1645,9 +1709,9 @@ class YarrGenerator : private MacroAssembler {
}
case OpMatchFailed:
- if (m_pattern.m_body->m_callFrameSize)
- addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister);
- move(TrustedImm32(-1), returnRegister);
+ removeCallFrame();
+ move(TrustedImmPtr((void*)WTF::notFound), returnRegister);
+ move(TrustedImm32(0), returnRegister2);
generateReturn();
break;
}
@@ -1742,14 +1806,14 @@ class YarrGenerator : private MacroAssembler {
// If the pattern size is not fixed, then store the start index, for use if we match.
if (!m_pattern.m_body->m_hasFixedSize) {
if (alternative->m_minimumSize == 1)
- store32(index, Address(output));
+ setMatchStart(index);
else {
move(index, regT0);
if (alternative->m_minimumSize)
sub32(Imm32(alternative->m_minimumSize - 1), regT0);
else
add32(TrustedImm32(1), regT0);
- store32(regT0, Address(output));
+ setMatchStart(regT0);
}
}
@@ -1835,7 +1899,7 @@ class YarrGenerator : private MacroAssembler {
// disjunction is 0, e.g. /a*|b/).
if (needsToUpdateMatchStart && alternative->m_minimumSize == 1) {
// index is already incremented by 1, so just store it now!
- store32(index, Address(output));
+ setMatchStart(index);
needsToUpdateMatchStart = false;
}
@@ -1859,11 +1923,11 @@ class YarrGenerator : private MacroAssembler {
if (needsToUpdateMatchStart) {
if (!m_pattern.m_body->m_minimumSize)
- store32(index, Address(output));
+ setMatchStart(index);
else {
move(index, regT0);
sub32(Imm32(m_pattern.m_body->m_minimumSize), regT0);
- store32(regT0, Address(output));
+ setMatchStart(regT0);
}
}
@@ -1885,9 +1949,9 @@ class YarrGenerator : private MacroAssembler {
// run any matches, and need to return a failure state from JIT code.
matchFailed.link(this);
- if (m_pattern.m_body->m_callFrameSize)
- addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister);
- move(TrustedImm32(-1), returnRegister);
+ removeCallFrame();
+ move(TrustedImmPtr((void*)WTF::notFound), returnRegister);
+ move(TrustedImm32(0), returnRegister2);
generateReturn();
break;
}
@@ -2054,12 +2118,12 @@ class YarrGenerator : private MacroAssembler {
ASSERT(term->quantityCount == 1);
// We only need to backtrack to thispoint if capturing or greedy.
- if (term->capture() || term->quantityType == QuantifierGreedy) {
+ if ((term->capture() && compileMode == IncludeSubpatterns) || term->quantityType == QuantifierGreedy) {
m_backtrackingState.link(this);
// If capturing, clear the capture (we only need to reset start).
- if (term->capture())
- store32(TrustedImm32(-1), Address(output, (term->parentheses.subpatternId << 1) * sizeof(int)));
+ if (term->capture() && compileMode == IncludeSubpatterns)
+ clearSubpatternStart(term->parentheses.subpatternId);
// If Greedy, jump to the end.
if (term->quantityType == QuantifierGreedy) {
@@ -2449,9 +2513,11 @@ class YarrGenerator : private MacroAssembler {
loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), input);
loadPtr(Address(X86Registers::ebp, 3 * sizeof(void*)), index);
loadPtr(Address(X86Registers::ebp, 4 * sizeof(void*)), length);
- loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output);
+ if (compileMode == IncludeSubpatterns)
+ loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output);
#else
- loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output);
+ if (compileMode == IncludeSubpatterns)
+ loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output);
#endif
#elif CPU(ARM)
push(ARMRegisters::r4);
@@ -2460,7 +2526,8 @@ class YarrGenerator : private MacroAssembler {
#if CPU(ARM_TRADITIONAL)
push(ARMRegisters::r8); // scratch register
#endif
- move(ARMRegisters::r3, output);
+ if (compileMode == IncludeSubpatterns)
+ move(ARMRegisters::r3, output);
#elif CPU(SH4)
push(SH4Registers::r11);
push(SH4Registers::r13);
@@ -2510,18 +2577,20 @@ public:
generateEnter();
Jump hasInput = checkInput();
- move(TrustedImm32(-1), returnRegister);
+ move(TrustedImmPtr((void*)WTF::notFound), returnRegister);
+ move(TrustedImm32(0), returnRegister2);
generateReturn();
hasInput.link(this);
- for (unsigned i = 0; i < m_pattern.m_numSubpatterns + 1; ++i)
- store32(TrustedImm32(-1), Address(output, (i << 1) * sizeof(int)));
+ if (compileMode == IncludeSubpatterns) {
+ for (unsigned i = 0; i < m_pattern.m_numSubpatterns + 1; ++i)
+ store32(TrustedImm32(-1), Address(output, (i << 1) * sizeof(int)));
+ }
if (!m_pattern.m_body->m_hasFixedSize)
- store32(index, Address(output));
+ setMatchStart(index);
- if (m_pattern.m_body->m_callFrameSize)
- subPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister);
+ initCallFrame();
// Compile the pattern to the internal 'YarrOp' representation.
opCompileBody(m_pattern.m_body);
@@ -2539,10 +2608,18 @@ public:
// Link & finalize the code.
LinkBuffer linkBuffer(*globalData, this, REGEXP_CODE_ID);
m_backtrackingState.linkDataLabels(linkBuffer);
- if (m_charSize == Char8)
- jitObject.set8BitCode(linkBuffer.finalizeCode());
- else
- jitObject.set16BitCode(linkBuffer.finalizeCode());
+
+ if (compileMode == MatchOnly) {
+ if (m_charSize == Char8)
+ jitObject.set8BitCodeMatchOnly(linkBuffer.finalizeCode());
+ else
+ jitObject.set16BitCodeMatchOnly(linkBuffer.finalizeCode());
+ } else {
+ if (m_charSize == Char8)
+ jitObject.set8BitCode(linkBuffer.finalizeCode());
+ else
+ jitObject.set16BitCode(linkBuffer.finalizeCode());
+ }
jitObject.setFallBack(m_shouldFallBack);
}
@@ -2576,9 +2653,12 @@ private:
BacktrackingState m_backtrackingState;
};
-void jitCompile(YarrPattern& pattern, YarrCharSize charSize, JSGlobalData* globalData, YarrCodeBlock& jitObject)
+void jitCompile(YarrPattern& pattern, YarrCharSize charSize, JSGlobalData* globalData, YarrCodeBlock& jitObject, YarrJITCompileMode mode)
{
- YarrGenerator(pattern, charSize).compile(globalData, jitObject);
+ if (mode == MatchOnly)
+ YarrGenerator<MatchOnly>(pattern, charSize).compile(globalData, jitObject);
+ else
+ YarrGenerator<IncludeSubpatterns>(pattern, charSize).compile(globalData, jitObject);
}
}}
diff --git a/Source/JavaScriptCore/yarr/YarrJIT.h b/Source/JavaScriptCore/yarr/YarrJIT.h
index 38ae76cc4..71928e73c 100644
--- a/Source/JavaScriptCore/yarr/YarrJIT.h
+++ b/Source/JavaScriptCore/yarr/YarrJIT.h
@@ -29,7 +29,8 @@
#if ENABLE(YARR_JIT)
#include "JSGlobalData.h"
-#include "MacroAssembler.h"
+#include "MacroAssemblerCodeRef.h"
+#include "MatchResult.h"
#include "UString.h"
#include "Yarr.h"
#include "YarrPattern.h"
@@ -48,8 +49,17 @@ class ExecutablePool;
namespace Yarr {
class YarrCodeBlock {
- typedef int (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
- typedef int (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+#if CPU(X86_64)
+ typedef MatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+ typedef MatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+ typedef MatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL;
+ typedef MatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL;
+#else
+ typedef EncodedMatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+ typedef EncodedMatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+ typedef EncodedMatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL;
+ typedef EncodedMatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL;
+#endif
public:
YarrCodeBlock()
@@ -63,43 +73,67 @@ public:
void setFallBack(bool fallback) { m_needFallBack = fallback; }
bool isFallBack() { return m_needFallBack; }
+
bool has8BitCode() { return m_ref8.size(); }
bool has16BitCode() { return m_ref16.size(); }
- void set8BitCode(MacroAssembler::CodeRef ref) { m_ref8 = ref; }
- void set16BitCode(MacroAssembler::CodeRef ref) { m_ref16 = ref; }
+ void set8BitCode(MacroAssemblerCodeRef ref) { m_ref8 = ref; }
+ void set16BitCode(MacroAssemblerCodeRef ref) { m_ref16 = ref; }
- int execute(const LChar* input, unsigned start, unsigned length, int* output)
+ bool has8BitCodeMatchOnly() { return m_matchOnly8.size(); }
+ bool has16BitCodeMatchOnly() { return m_matchOnly16.size(); }
+ void set8BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly8 = matchOnly; }
+ void set16BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly16 = matchOnly; }
+
+ MatchResult execute(const LChar* input, unsigned start, unsigned length, int* output)
{
ASSERT(has8BitCode());
- return reinterpret_cast<YarrJITCode8>(m_ref8.code().executableAddress())(input, start, length, output);
+ return MatchResult(reinterpret_cast<YarrJITCode8>(m_ref8.code().executableAddress())(input, start, length, output));
}
- int execute(const UChar* input, unsigned start, unsigned length, int* output)
+ MatchResult execute(const UChar* input, unsigned start, unsigned length, int* output)
{
ASSERT(has16BitCode());
- return reinterpret_cast<YarrJITCode16>(m_ref16.code().executableAddress())(input, start, length, output);
+ return MatchResult(reinterpret_cast<YarrJITCode16>(m_ref16.code().executableAddress())(input, start, length, output));
}
+
+ MatchResult execute(const LChar* input, unsigned start, unsigned length)
+ {
+ ASSERT(has8BitCodeMatchOnly());
+ return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly8>(m_matchOnly8.code().executableAddress())(input, start, length));
+ }
+
+ MatchResult execute(const UChar* input, unsigned start, unsigned length)
+ {
+ ASSERT(has16BitCodeMatchOnly());
+ return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly16>(m_matchOnly16.code().executableAddress())(input, start, length));
+ }
+
#if ENABLE(REGEXP_TRACING)
void *getAddr() { return m_ref.code().executableAddress(); }
#endif
+ void clear()
+ {
+ m_ref8 = MacroAssemblerCodeRef();
+ m_ref16 = MacroAssemblerCodeRef();
+ m_matchOnly8 = MacroAssemblerCodeRef();
+ m_matchOnly16 = MacroAssemblerCodeRef();
+ m_needFallBack = false;
+ }
+
private:
- MacroAssembler::CodeRef m_ref8;
- MacroAssembler::CodeRef m_ref16;
+ MacroAssemblerCodeRef m_ref8;
+ MacroAssemblerCodeRef m_ref16;
+ MacroAssemblerCodeRef m_matchOnly8;
+ MacroAssemblerCodeRef m_matchOnly16;
bool m_needFallBack;
};
-void jitCompile(YarrPattern&, YarrCharSize, JSGlobalData*, YarrCodeBlock& jitObject);
-
-inline int execute(YarrCodeBlock& jitObject, const LChar* input, unsigned start, unsigned length, int* output)
-{
- return jitObject.execute(input, start, length, output);
-}
-
-inline int execute(YarrCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output)
-{
- return jitObject.execute(input, start, length, output);
-}
+enum YarrJITCompileMode {
+ MatchOnly,
+ IncludeSubpatterns
+};
+void jitCompile(YarrPattern&, YarrCharSize, JSGlobalData*, YarrCodeBlock& jitObject, YarrJITCompileMode = IncludeSubpatterns);
} } // namespace JSC::Yarr
diff --git a/Source/JavaScriptCore/yarr/YarrPattern.cpp b/Source/JavaScriptCore/yarr/YarrPattern.cpp
index f0d10e624..bbda9c526 100644
--- a/Source/JavaScriptCore/yarr/YarrPattern.cpp
+++ b/Source/JavaScriptCore/yarr/YarrPattern.cpp
@@ -28,6 +28,7 @@
#include "YarrPattern.h"
#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
#include "YarrParser.h"
#include <wtf/Vector.h>
@@ -66,32 +67,43 @@ public:
void putChar(UChar ch)
{
+ // Handle ascii cases.
if (ch <= 0x7f) {
if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
addSorted(m_matches, toASCIIUpper(ch));
addSorted(m_matches, toASCIILower(ch));
} else
addSorted(m_matches, ch);
- } else {
- UChar upper, lower;
- if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
- addSorted(m_matchesUnicode, upper);
- addSorted(m_matchesUnicode, lower);
- } else
- addSorted(m_matchesUnicode, ch);
+ return;
}
- }
- // returns true if this character has another case, and 'ch' is the upper case form.
- static inline bool isUnicodeUpper(UChar ch)
- {
- return ch != Unicode::toLower(ch);
+ // Simple case, not a case-insensitive match.
+ if (!m_isCaseInsensitive) {
+ addSorted(m_matchesUnicode, ch);
+ return;
+ }
+
+ // Add multiple matches, if necessary.
+ UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ if (info->type == CanonicalizeUnique)
+ addSorted(m_matchesUnicode, ch);
+ else
+ putUnicodeIgnoreCase(ch, info);
}
- // returns true if this character has another case, and 'ch' is the lower case form.
- static inline bool isUnicodeLower(UChar ch)
+ void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info)
{
- return ch != Unicode::toUpper(ch);
+ ASSERT(m_isCaseInsensitive);
+ ASSERT(ch > 0x7f);
+ ASSERT(ch >= info->begin && ch <= info->end);
+ ASSERT(info->type != CanonicalizeUnique);
+ if (info->type == CanonicalizeSet) {
+ for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+ addSorted(m_matchesUnicode, ch);
+ } else {
+ addSorted(m_matchesUnicode, ch);
+ addSorted(m_matchesUnicode, getCanonicalPair(info, ch));
+ }
}
void putRange(UChar lo, UChar hi)
@@ -108,36 +120,59 @@ public:
addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
}
}
- if (hi >= 0x80) {
- uint32_t unicodeCurr = std::max(lo, (UChar)0x80);
- addSortedRange(m_rangesUnicode, unicodeCurr, hi);
-
- if (m_isCaseInsensitive) {
- while (unicodeCurr <= hi) {
- // If the upper bound of the range (hi) is 0xffff, the increments to
- // unicodeCurr in this loop may take it to 0x10000. This is fine
- // (if so we won't re-enter the loop, since the loop condition above
- // will definitely fail) - but this does mean we cannot use a UChar
- // to represent unicodeCurr, we must use a 32-bit value instead.
- ASSERT(unicodeCurr <= 0xffff);
-
- if (isUnicodeUpper(unicodeCurr)) {
- UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
- UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
- while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
- lowerCaseRangeEnd++;
- addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
- } else if (isUnicodeLower(unicodeCurr)) {
- UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
- UChar upperCaseRangeEnd = upperCaseRangeBegin;
- while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
- upperCaseRangeEnd++;
- addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
- } else
- ++unicodeCurr;
- }
+ if (hi <= 0x7f)
+ return;
+
+ lo = std::max(lo, (UChar)0x80);
+ addSortedRange(m_rangesUnicode, lo, hi);
+
+ if (!m_isCaseInsensitive)
+ return;
+
+ UCS2CanonicalizationRange* info = rangeInfoFor(lo);
+ while (true) {
+ // Handle the range [lo .. end]
+ UChar end = std::min<UChar>(info->end, hi);
+
+ switch (info->type) {
+ case CanonicalizeUnique:
+ // Nothing to do - no canonical equivalents.
+ break;
+ case CanonicalizeSet: {
+ UChar ch;
+ for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+ addSorted(m_matchesUnicode, ch);
+ break;
}
- }
+ case CanonicalizeRangeLo:
+ addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
+ break;
+ case CanonicalizeRangeHi:
+ addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
+ break;
+ case CanonicalizeAlternatingAligned:
+ // Use addSortedRange since there is likely an abutting range to combine with.
+ if (lo & 1)
+ addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
+ if (!(end & 1))
+ addSortedRange(m_rangesUnicode, end + 1, end + 1);
+ break;
+ case CanonicalizeAlternatingUnaligned:
+ // Use addSortedRange since there is likely an abutting range to combine with.
+ if (!(lo & 1))
+ addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
+ if (end & 1)
+ addSortedRange(m_rangesUnicode, end + 1, end + 1);
+ break;
+ }
+
+ if (hi == end)
+ return;
+
+ ++info;
+ lo = info->begin;
+ };
+
}
CharacterClass* charClass()
@@ -280,12 +315,21 @@ public:
{
// We handle case-insensitive checking of unicode characters which do have both
// cases by handling them as if they were defined using a CharacterClass.
- if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
- atomCharacterClassBegin();
- atomCharacterClassAtom(ch);
- atomCharacterClassEnd();
- } else
+ if (!m_pattern.m_ignoreCase || isASCII(ch)) {
+ m_alternative->m_terms.append(PatternTerm(ch));
+ return;
+ }
+
+ UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ if (info->type == CanonicalizeUnique) {
m_alternative->m_terms.append(PatternTerm(ch));
+ return;
+ }
+
+ m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
+ CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
+ m_pattern.m_userCharacterClasses.append(newCharacterClass);
+ m_alternative->m_terms.append(PatternTerm(newCharacterClass, false));
}
void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
diff --git a/Source/JavaScriptCore/yarr/yarr.pri b/Source/JavaScriptCore/yarr/yarr.pri
index c2634864f..623098fd3 100644
--- a/Source/JavaScriptCore/yarr/yarr.pri
+++ b/Source/JavaScriptCore/yarr/yarr.pri
@@ -7,7 +7,12 @@
SOURCES += \
$$PWD/YarrInterpreter.cpp \
$$PWD/YarrPattern.cpp \
- $$PWD/YarrSyntaxChecker.cpp
+ $$PWD/YarrSyntaxChecker.cpp \
+ $$PWD/YarrCanonicalizeUCS2.cpp
# For UString.h
-v8: INCLUDEPATH += $$PWD/../runtime
+v8 {
+ INCLUDEPATH += \
+ $$PWD/.. \
+ $$PWD/../runtime
+}