diff options
Diffstat (limited to 'Source/JavaScriptCore/yarr')
-rw-r--r-- | Source/JavaScriptCore/yarr/Yarr.h | 3 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp | 463 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.h | 138 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.js | 219 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrInterpreter.cpp | 106 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrInterpreter.h | 5 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrJIT.cpp | 200 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrJIT.h | 78 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/YarrPattern.cpp | 142 | ||||
-rw-r--r-- | Source/JavaScriptCore/yarr/yarr.pri | 9 |
10 files changed, 1162 insertions, 201 deletions
diff --git a/Source/JavaScriptCore/yarr/Yarr.h b/Source/JavaScriptCore/yarr/Yarr.h index 57a3846c0..d393e9fa9 100644 --- a/Source/JavaScriptCore/yarr/Yarr.h +++ b/Source/JavaScriptCore/yarr/Yarr.h @@ -63,9 +63,6 @@ enum YarrCharSize { Char16 }; -JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*); -JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const UString& input, unsigned start, unsigned length, unsigned* output); - } } // namespace JSC::Yarr #endif // Yarr_h diff --git a/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp new file mode 100644 index 000000000..7bb3d08eb --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp @@ -0,0 +1,463 @@ +/* + * Copyright (C) 2012 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js + +#include "config.h" +#include "YarrCanonicalizeUCS2.h" + +namespace JSC { namespace Yarr { + +#include <stdint.h> + +uint16_t ucs2CharacterSet0[] = { 0x01c4u, 0x01c5u, 0x01c6u, 0 }; +uint16_t ucs2CharacterSet1[] = { 0x01c7u, 0x01c8u, 0x01c9u, 0 }; +uint16_t ucs2CharacterSet2[] = { 0x01cau, 0x01cbu, 0x01ccu, 0 }; +uint16_t ucs2CharacterSet3[] = { 0x01f1u, 0x01f2u, 0x01f3u, 0 }; +uint16_t ucs2CharacterSet4[] = { 0x0392u, 0x03b2u, 0x03d0u, 0 }; +uint16_t ucs2CharacterSet5[] = { 0x0395u, 0x03b5u, 0x03f5u, 0 }; +uint16_t ucs2CharacterSet6[] = { 0x0398u, 0x03b8u, 0x03d1u, 0 }; +uint16_t ucs2CharacterSet7[] = { 0x0345u, 0x0399u, 0x03b9u, 0x1fbeu, 0 }; +uint16_t ucs2CharacterSet8[] = { 0x039au, 0x03bau, 0x03f0u, 0 }; +uint16_t ucs2CharacterSet9[] = { 0x00b5u, 0x039cu, 0x03bcu, 0 }; +uint16_t ucs2CharacterSet10[] = { 0x03a0u, 0x03c0u, 0x03d6u, 0 }; +uint16_t ucs2CharacterSet11[] = { 0x03a1u, 0x03c1u, 0x03f1u, 0 }; +uint16_t ucs2CharacterSet12[] = { 0x03a3u, 0x03c2u, 0x03c3u, 0 }; +uint16_t ucs2CharacterSet13[] = { 0x03a6u, 0x03c6u, 0x03d5u, 0 }; +uint16_t ucs2CharacterSet14[] = { 0x1e60u, 0x1e61u, 0x1e9bu, 0 }; + +static const size_t UCS2_CANONICALIZATION_SETS = 15; +uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = { + ucs2CharacterSet0, + ucs2CharacterSet1, + ucs2CharacterSet2, + ucs2CharacterSet3, + ucs2CharacterSet4, + ucs2CharacterSet5, + ucs2CharacterSet6, + ucs2CharacterSet7, + 
ucs2CharacterSet8, + ucs2CharacterSet9, + ucs2CharacterSet10, + ucs2CharacterSet11, + ucs2CharacterSet12, + ucs2CharacterSet13, + ucs2CharacterSet14, +}; + +const size_t UCS2_CANONICALIZATION_RANGES = 364; +UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = { + { 0x0000u, 0x0040u, 0x0000u, CanonicalizeUnique }, + { 0x0041u, 0x005au, 0x0020u, CanonicalizeRangeLo }, + { 0x005bu, 0x0060u, 0x0000u, CanonicalizeUnique }, + { 0x0061u, 0x007au, 0x0020u, CanonicalizeRangeHi }, + { 0x007bu, 0x00b4u, 0x0000u, CanonicalizeUnique }, + { 0x00b5u, 0x00b5u, 0x0009u, CanonicalizeSet }, + { 0x00b6u, 0x00bfu, 0x0000u, CanonicalizeUnique }, + { 0x00c0u, 0x00d6u, 0x0020u, CanonicalizeRangeLo }, + { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeUnique }, + { 0x00d8u, 0x00deu, 0x0020u, CanonicalizeRangeLo }, + { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeUnique }, + { 0x00e0u, 0x00f6u, 0x0020u, CanonicalizeRangeHi }, + { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeUnique }, + { 0x00f8u, 0x00feu, 0x0020u, CanonicalizeRangeHi }, + { 0x00ffu, 0x00ffu, 0x0079u, CanonicalizeRangeLo }, + { 0x0100u, 0x012fu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0130u, 0x0131u, 0x0000u, CanonicalizeUnique }, + { 0x0132u, 0x0137u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0138u, 0x0138u, 0x0000u, CanonicalizeUnique }, + { 0x0139u, 0x0148u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x0149u, 0x0149u, 0x0000u, CanonicalizeUnique }, + { 0x014au, 0x0177u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0178u, 0x0178u, 0x0079u, CanonicalizeRangeHi }, + { 0x0179u, 0x017eu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x017fu, 0x017fu, 0x0000u, CanonicalizeUnique }, + { 0x0180u, 0x0180u, 0x00c3u, CanonicalizeRangeLo }, + { 0x0181u, 0x0181u, 0x00d2u, CanonicalizeRangeLo }, + { 0x0182u, 0x0185u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0186u, 0x0186u, 0x00ceu, CanonicalizeRangeLo }, + { 0x0187u, 0x0188u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x0189u, 0x018au, 0x00cdu, 
CanonicalizeRangeLo }, + { 0x018bu, 0x018cu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x018du, 0x018du, 0x0000u, CanonicalizeUnique }, + { 0x018eu, 0x018eu, 0x004fu, CanonicalizeRangeLo }, + { 0x018fu, 0x018fu, 0x00cau, CanonicalizeRangeLo }, + { 0x0190u, 0x0190u, 0x00cbu, CanonicalizeRangeLo }, + { 0x0191u, 0x0192u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x0193u, 0x0193u, 0x00cdu, CanonicalizeRangeLo }, + { 0x0194u, 0x0194u, 0x00cfu, CanonicalizeRangeLo }, + { 0x0195u, 0x0195u, 0x0061u, CanonicalizeRangeLo }, + { 0x0196u, 0x0196u, 0x00d3u, CanonicalizeRangeLo }, + { 0x0197u, 0x0197u, 0x00d1u, CanonicalizeRangeLo }, + { 0x0198u, 0x0199u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x019au, 0x019au, 0x00a3u, CanonicalizeRangeLo }, + { 0x019bu, 0x019bu, 0x0000u, CanonicalizeUnique }, + { 0x019cu, 0x019cu, 0x00d3u, CanonicalizeRangeLo }, + { 0x019du, 0x019du, 0x00d5u, CanonicalizeRangeLo }, + { 0x019eu, 0x019eu, 0x0082u, CanonicalizeRangeLo }, + { 0x019fu, 0x019fu, 0x00d6u, CanonicalizeRangeLo }, + { 0x01a0u, 0x01a5u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x01a6u, 0x01a6u, 0x00dau, CanonicalizeRangeLo }, + { 0x01a7u, 0x01a8u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x01a9u, 0x01a9u, 0x00dau, CanonicalizeRangeLo }, + { 0x01aau, 0x01abu, 0x0000u, CanonicalizeUnique }, + { 0x01acu, 0x01adu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x01aeu, 0x01aeu, 0x00dau, CanonicalizeRangeLo }, + { 0x01afu, 0x01b0u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x01b1u, 0x01b2u, 0x00d9u, CanonicalizeRangeLo }, + { 0x01b3u, 0x01b6u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x01b7u, 0x01b7u, 0x00dbu, CanonicalizeRangeLo }, + { 0x01b8u, 0x01b9u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x01bau, 0x01bbu, 0x0000u, CanonicalizeUnique }, + { 0x01bcu, 0x01bdu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x01beu, 0x01beu, 0x0000u, CanonicalizeUnique }, + { 0x01bfu, 0x01bfu, 0x0038u, CanonicalizeRangeLo }, + { 0x01c0u, 0x01c3u, 
0x0000u, CanonicalizeUnique }, + { 0x01c4u, 0x01c6u, 0x0000u, CanonicalizeSet }, + { 0x01c7u, 0x01c9u, 0x0001u, CanonicalizeSet }, + { 0x01cau, 0x01ccu, 0x0002u, CanonicalizeSet }, + { 0x01cdu, 0x01dcu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x01ddu, 0x01ddu, 0x004fu, CanonicalizeRangeHi }, + { 0x01deu, 0x01efu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x01f0u, 0x01f0u, 0x0000u, CanonicalizeUnique }, + { 0x01f1u, 0x01f3u, 0x0003u, CanonicalizeSet }, + { 0x01f4u, 0x01f5u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x01f6u, 0x01f6u, 0x0061u, CanonicalizeRangeHi }, + { 0x01f7u, 0x01f7u, 0x0038u, CanonicalizeRangeHi }, + { 0x01f8u, 0x021fu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0220u, 0x0220u, 0x0082u, CanonicalizeRangeHi }, + { 0x0221u, 0x0221u, 0x0000u, CanonicalizeUnique }, + { 0x0222u, 0x0233u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0234u, 0x0239u, 0x0000u, CanonicalizeUnique }, + { 0x023au, 0x023au, 0x2a2bu, CanonicalizeRangeLo }, + { 0x023bu, 0x023cu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x023du, 0x023du, 0x00a3u, CanonicalizeRangeHi }, + { 0x023eu, 0x023eu, 0x2a28u, CanonicalizeRangeLo }, + { 0x023fu, 0x0240u, 0x2a3fu, CanonicalizeRangeLo }, + { 0x0241u, 0x0242u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x0243u, 0x0243u, 0x00c3u, CanonicalizeRangeHi }, + { 0x0244u, 0x0244u, 0x0045u, CanonicalizeRangeLo }, + { 0x0245u, 0x0245u, 0x0047u, CanonicalizeRangeLo }, + { 0x0246u, 0x024fu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0250u, 0x0250u, 0x2a1fu, CanonicalizeRangeLo }, + { 0x0251u, 0x0251u, 0x2a1cu, CanonicalizeRangeLo }, + { 0x0252u, 0x0252u, 0x2a1eu, CanonicalizeRangeLo }, + { 0x0253u, 0x0253u, 0x00d2u, CanonicalizeRangeHi }, + { 0x0254u, 0x0254u, 0x00ceu, CanonicalizeRangeHi }, + { 0x0255u, 0x0255u, 0x0000u, CanonicalizeUnique }, + { 0x0256u, 0x0257u, 0x00cdu, CanonicalizeRangeHi }, + { 0x0258u, 0x0258u, 0x0000u, CanonicalizeUnique }, + { 0x0259u, 0x0259u, 0x00cau, CanonicalizeRangeHi }, + { 
0x025au, 0x025au, 0x0000u, CanonicalizeUnique }, + { 0x025bu, 0x025bu, 0x00cbu, CanonicalizeRangeHi }, + { 0x025cu, 0x025fu, 0x0000u, CanonicalizeUnique }, + { 0x0260u, 0x0260u, 0x00cdu, CanonicalizeRangeHi }, + { 0x0261u, 0x0262u, 0x0000u, CanonicalizeUnique }, + { 0x0263u, 0x0263u, 0x00cfu, CanonicalizeRangeHi }, + { 0x0264u, 0x0264u, 0x0000u, CanonicalizeUnique }, + { 0x0265u, 0x0265u, 0xa528u, CanonicalizeRangeLo }, + { 0x0266u, 0x0267u, 0x0000u, CanonicalizeUnique }, + { 0x0268u, 0x0268u, 0x00d1u, CanonicalizeRangeHi }, + { 0x0269u, 0x0269u, 0x00d3u, CanonicalizeRangeHi }, + { 0x026au, 0x026au, 0x0000u, CanonicalizeUnique }, + { 0x026bu, 0x026bu, 0x29f7u, CanonicalizeRangeLo }, + { 0x026cu, 0x026eu, 0x0000u, CanonicalizeUnique }, + { 0x026fu, 0x026fu, 0x00d3u, CanonicalizeRangeHi }, + { 0x0270u, 0x0270u, 0x0000u, CanonicalizeUnique }, + { 0x0271u, 0x0271u, 0x29fdu, CanonicalizeRangeLo }, + { 0x0272u, 0x0272u, 0x00d5u, CanonicalizeRangeHi }, + { 0x0273u, 0x0274u, 0x0000u, CanonicalizeUnique }, + { 0x0275u, 0x0275u, 0x00d6u, CanonicalizeRangeHi }, + { 0x0276u, 0x027cu, 0x0000u, CanonicalizeUnique }, + { 0x027du, 0x027du, 0x29e7u, CanonicalizeRangeLo }, + { 0x027eu, 0x027fu, 0x0000u, CanonicalizeUnique }, + { 0x0280u, 0x0280u, 0x00dau, CanonicalizeRangeHi }, + { 0x0281u, 0x0282u, 0x0000u, CanonicalizeUnique }, + { 0x0283u, 0x0283u, 0x00dau, CanonicalizeRangeHi }, + { 0x0284u, 0x0287u, 0x0000u, CanonicalizeUnique }, + { 0x0288u, 0x0288u, 0x00dau, CanonicalizeRangeHi }, + { 0x0289u, 0x0289u, 0x0045u, CanonicalizeRangeHi }, + { 0x028au, 0x028bu, 0x00d9u, CanonicalizeRangeHi }, + { 0x028cu, 0x028cu, 0x0047u, CanonicalizeRangeHi }, + { 0x028du, 0x0291u, 0x0000u, CanonicalizeUnique }, + { 0x0292u, 0x0292u, 0x00dbu, CanonicalizeRangeHi }, + { 0x0293u, 0x0344u, 0x0000u, CanonicalizeUnique }, + { 0x0345u, 0x0345u, 0x0007u, CanonicalizeSet }, + { 0x0346u, 0x036fu, 0x0000u, CanonicalizeUnique }, + { 0x0370u, 0x0373u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0374u, 
0x0375u, 0x0000u, CanonicalizeUnique }, + { 0x0376u, 0x0377u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0378u, 0x037au, 0x0000u, CanonicalizeUnique }, + { 0x037bu, 0x037du, 0x0082u, CanonicalizeRangeLo }, + { 0x037eu, 0x0385u, 0x0000u, CanonicalizeUnique }, + { 0x0386u, 0x0386u, 0x0026u, CanonicalizeRangeLo }, + { 0x0387u, 0x0387u, 0x0000u, CanonicalizeUnique }, + { 0x0388u, 0x038au, 0x0025u, CanonicalizeRangeLo }, + { 0x038bu, 0x038bu, 0x0000u, CanonicalizeUnique }, + { 0x038cu, 0x038cu, 0x0040u, CanonicalizeRangeLo }, + { 0x038du, 0x038du, 0x0000u, CanonicalizeUnique }, + { 0x038eu, 0x038fu, 0x003fu, CanonicalizeRangeLo }, + { 0x0390u, 0x0390u, 0x0000u, CanonicalizeUnique }, + { 0x0391u, 0x0391u, 0x0020u, CanonicalizeRangeLo }, + { 0x0392u, 0x0392u, 0x0004u, CanonicalizeSet }, + { 0x0393u, 0x0394u, 0x0020u, CanonicalizeRangeLo }, + { 0x0395u, 0x0395u, 0x0005u, CanonicalizeSet }, + { 0x0396u, 0x0397u, 0x0020u, CanonicalizeRangeLo }, + { 0x0398u, 0x0398u, 0x0006u, CanonicalizeSet }, + { 0x0399u, 0x0399u, 0x0007u, CanonicalizeSet }, + { 0x039au, 0x039au, 0x0008u, CanonicalizeSet }, + { 0x039bu, 0x039bu, 0x0020u, CanonicalizeRangeLo }, + { 0x039cu, 0x039cu, 0x0009u, CanonicalizeSet }, + { 0x039du, 0x039fu, 0x0020u, CanonicalizeRangeLo }, + { 0x03a0u, 0x03a0u, 0x000au, CanonicalizeSet }, + { 0x03a1u, 0x03a1u, 0x000bu, CanonicalizeSet }, + { 0x03a2u, 0x03a2u, 0x0000u, CanonicalizeUnique }, + { 0x03a3u, 0x03a3u, 0x000cu, CanonicalizeSet }, + { 0x03a4u, 0x03a5u, 0x0020u, CanonicalizeRangeLo }, + { 0x03a6u, 0x03a6u, 0x000du, CanonicalizeSet }, + { 0x03a7u, 0x03abu, 0x0020u, CanonicalizeRangeLo }, + { 0x03acu, 0x03acu, 0x0026u, CanonicalizeRangeHi }, + { 0x03adu, 0x03afu, 0x0025u, CanonicalizeRangeHi }, + { 0x03b0u, 0x03b0u, 0x0000u, CanonicalizeUnique }, + { 0x03b1u, 0x03b1u, 0x0020u, CanonicalizeRangeHi }, + { 0x03b2u, 0x03b2u, 0x0004u, CanonicalizeSet }, + { 0x03b3u, 0x03b4u, 0x0020u, CanonicalizeRangeHi }, + { 0x03b5u, 0x03b5u, 0x0005u, CanonicalizeSet }, + { 
0x03b6u, 0x03b7u, 0x0020u, CanonicalizeRangeHi }, + { 0x03b8u, 0x03b8u, 0x0006u, CanonicalizeSet }, + { 0x03b9u, 0x03b9u, 0x0007u, CanonicalizeSet }, + { 0x03bau, 0x03bau, 0x0008u, CanonicalizeSet }, + { 0x03bbu, 0x03bbu, 0x0020u, CanonicalizeRangeHi }, + { 0x03bcu, 0x03bcu, 0x0009u, CanonicalizeSet }, + { 0x03bdu, 0x03bfu, 0x0020u, CanonicalizeRangeHi }, + { 0x03c0u, 0x03c0u, 0x000au, CanonicalizeSet }, + { 0x03c1u, 0x03c1u, 0x000bu, CanonicalizeSet }, + { 0x03c2u, 0x03c3u, 0x000cu, CanonicalizeSet }, + { 0x03c4u, 0x03c5u, 0x0020u, CanonicalizeRangeHi }, + { 0x03c6u, 0x03c6u, 0x000du, CanonicalizeSet }, + { 0x03c7u, 0x03cbu, 0x0020u, CanonicalizeRangeHi }, + { 0x03ccu, 0x03ccu, 0x0040u, CanonicalizeRangeHi }, + { 0x03cdu, 0x03ceu, 0x003fu, CanonicalizeRangeHi }, + { 0x03cfu, 0x03cfu, 0x0008u, CanonicalizeRangeLo }, + { 0x03d0u, 0x03d0u, 0x0004u, CanonicalizeSet }, + { 0x03d1u, 0x03d1u, 0x0006u, CanonicalizeSet }, + { 0x03d2u, 0x03d4u, 0x0000u, CanonicalizeUnique }, + { 0x03d5u, 0x03d5u, 0x000du, CanonicalizeSet }, + { 0x03d6u, 0x03d6u, 0x000au, CanonicalizeSet }, + { 0x03d7u, 0x03d7u, 0x0008u, CanonicalizeRangeHi }, + { 0x03d8u, 0x03efu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x03f0u, 0x03f0u, 0x0008u, CanonicalizeSet }, + { 0x03f1u, 0x03f1u, 0x000bu, CanonicalizeSet }, + { 0x03f2u, 0x03f2u, 0x0007u, CanonicalizeRangeLo }, + { 0x03f3u, 0x03f4u, 0x0000u, CanonicalizeUnique }, + { 0x03f5u, 0x03f5u, 0x0005u, CanonicalizeSet }, + { 0x03f6u, 0x03f6u, 0x0000u, CanonicalizeUnique }, + { 0x03f7u, 0x03f8u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x03f9u, 0x03f9u, 0x0007u, CanonicalizeRangeHi }, + { 0x03fau, 0x03fbu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x03fcu, 0x03fcu, 0x0000u, CanonicalizeUnique }, + { 0x03fdu, 0x03ffu, 0x0082u, CanonicalizeRangeHi }, + { 0x0400u, 0x040fu, 0x0050u, CanonicalizeRangeLo }, + { 0x0410u, 0x042fu, 0x0020u, CanonicalizeRangeLo }, + { 0x0430u, 0x044fu, 0x0020u, CanonicalizeRangeHi }, + { 0x0450u, 0x045fu, 0x0050u, 
CanonicalizeRangeHi }, + { 0x0460u, 0x0481u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0482u, 0x0489u, 0x0000u, CanonicalizeUnique }, + { 0x048au, 0x04bfu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x04c0u, 0x04c0u, 0x000fu, CanonicalizeRangeLo }, + { 0x04c1u, 0x04ceu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x04cfu, 0x04cfu, 0x000fu, CanonicalizeRangeHi }, + { 0x04d0u, 0x0527u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0528u, 0x0530u, 0x0000u, CanonicalizeUnique }, + { 0x0531u, 0x0556u, 0x0030u, CanonicalizeRangeLo }, + { 0x0557u, 0x0560u, 0x0000u, CanonicalizeUnique }, + { 0x0561u, 0x0586u, 0x0030u, CanonicalizeRangeHi }, + { 0x0587u, 0x109fu, 0x0000u, CanonicalizeUnique }, + { 0x10a0u, 0x10c5u, 0x1c60u, CanonicalizeRangeLo }, + { 0x10c6u, 0x1d78u, 0x0000u, CanonicalizeUnique }, + { 0x1d79u, 0x1d79u, 0x8a04u, CanonicalizeRangeLo }, + { 0x1d7au, 0x1d7cu, 0x0000u, CanonicalizeUnique }, + { 0x1d7du, 0x1d7du, 0x0ee6u, CanonicalizeRangeLo }, + { 0x1d7eu, 0x1dffu, 0x0000u, CanonicalizeUnique }, + { 0x1e00u, 0x1e5fu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x1e60u, 0x1e61u, 0x000eu, CanonicalizeSet }, + { 0x1e62u, 0x1e95u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x1e96u, 0x1e9au, 0x0000u, CanonicalizeUnique }, + { 0x1e9bu, 0x1e9bu, 0x000eu, CanonicalizeSet }, + { 0x1e9cu, 0x1e9fu, 0x0000u, CanonicalizeUnique }, + { 0x1ea0u, 0x1effu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x1f00u, 0x1f07u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f08u, 0x1f0fu, 0x0008u, CanonicalizeRangeHi }, + { 0x1f10u, 0x1f15u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f16u, 0x1f17u, 0x0000u, CanonicalizeUnique }, + { 0x1f18u, 0x1f1du, 0x0008u, CanonicalizeRangeHi }, + { 0x1f1eu, 0x1f1fu, 0x0000u, CanonicalizeUnique }, + { 0x1f20u, 0x1f27u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f28u, 0x1f2fu, 0x0008u, CanonicalizeRangeHi }, + { 0x1f30u, 0x1f37u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f38u, 0x1f3fu, 0x0008u, CanonicalizeRangeHi }, + { 0x1f40u, 0x1f45u, 
0x0008u, CanonicalizeRangeLo }, + { 0x1f46u, 0x1f47u, 0x0000u, CanonicalizeUnique }, + { 0x1f48u, 0x1f4du, 0x0008u, CanonicalizeRangeHi }, + { 0x1f4eu, 0x1f50u, 0x0000u, CanonicalizeUnique }, + { 0x1f51u, 0x1f51u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f52u, 0x1f52u, 0x0000u, CanonicalizeUnique }, + { 0x1f53u, 0x1f53u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f54u, 0x1f54u, 0x0000u, CanonicalizeUnique }, + { 0x1f55u, 0x1f55u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f56u, 0x1f56u, 0x0000u, CanonicalizeUnique }, + { 0x1f57u, 0x1f57u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f58u, 0x1f58u, 0x0000u, CanonicalizeUnique }, + { 0x1f59u, 0x1f59u, 0x0008u, CanonicalizeRangeHi }, + { 0x1f5au, 0x1f5au, 0x0000u, CanonicalizeUnique }, + { 0x1f5bu, 0x1f5bu, 0x0008u, CanonicalizeRangeHi }, + { 0x1f5cu, 0x1f5cu, 0x0000u, CanonicalizeUnique }, + { 0x1f5du, 0x1f5du, 0x0008u, CanonicalizeRangeHi }, + { 0x1f5eu, 0x1f5eu, 0x0000u, CanonicalizeUnique }, + { 0x1f5fu, 0x1f5fu, 0x0008u, CanonicalizeRangeHi }, + { 0x1f60u, 0x1f67u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f68u, 0x1f6fu, 0x0008u, CanonicalizeRangeHi }, + { 0x1f70u, 0x1f71u, 0x004au, CanonicalizeRangeLo }, + { 0x1f72u, 0x1f75u, 0x0056u, CanonicalizeRangeLo }, + { 0x1f76u, 0x1f77u, 0x0064u, CanonicalizeRangeLo }, + { 0x1f78u, 0x1f79u, 0x0080u, CanonicalizeRangeLo }, + { 0x1f7au, 0x1f7bu, 0x0070u, CanonicalizeRangeLo }, + { 0x1f7cu, 0x1f7du, 0x007eu, CanonicalizeRangeLo }, + { 0x1f7eu, 0x1fafu, 0x0000u, CanonicalizeUnique }, + { 0x1fb0u, 0x1fb1u, 0x0008u, CanonicalizeRangeLo }, + { 0x1fb2u, 0x1fb7u, 0x0000u, CanonicalizeUnique }, + { 0x1fb8u, 0x1fb9u, 0x0008u, CanonicalizeRangeHi }, + { 0x1fbau, 0x1fbbu, 0x004au, CanonicalizeRangeHi }, + { 0x1fbcu, 0x1fbdu, 0x0000u, CanonicalizeUnique }, + { 0x1fbeu, 0x1fbeu, 0x0007u, CanonicalizeSet }, + { 0x1fbfu, 0x1fc7u, 0x0000u, CanonicalizeUnique }, + { 0x1fc8u, 0x1fcbu, 0x0056u, CanonicalizeRangeHi }, + { 0x1fccu, 0x1fcfu, 0x0000u, CanonicalizeUnique }, + { 0x1fd0u, 0x1fd1u, 0x0008u, 
CanonicalizeRangeLo }, + { 0x1fd2u, 0x1fd7u, 0x0000u, CanonicalizeUnique }, + { 0x1fd8u, 0x1fd9u, 0x0008u, CanonicalizeRangeHi }, + { 0x1fdau, 0x1fdbu, 0x0064u, CanonicalizeRangeHi }, + { 0x1fdcu, 0x1fdfu, 0x0000u, CanonicalizeUnique }, + { 0x1fe0u, 0x1fe1u, 0x0008u, CanonicalizeRangeLo }, + { 0x1fe2u, 0x1fe4u, 0x0000u, CanonicalizeUnique }, + { 0x1fe5u, 0x1fe5u, 0x0007u, CanonicalizeRangeLo }, + { 0x1fe6u, 0x1fe7u, 0x0000u, CanonicalizeUnique }, + { 0x1fe8u, 0x1fe9u, 0x0008u, CanonicalizeRangeHi }, + { 0x1feau, 0x1febu, 0x0070u, CanonicalizeRangeHi }, + { 0x1fecu, 0x1fecu, 0x0007u, CanonicalizeRangeHi }, + { 0x1fedu, 0x1ff7u, 0x0000u, CanonicalizeUnique }, + { 0x1ff8u, 0x1ff9u, 0x0080u, CanonicalizeRangeHi }, + { 0x1ffau, 0x1ffbu, 0x007eu, CanonicalizeRangeHi }, + { 0x1ffcu, 0x2131u, 0x0000u, CanonicalizeUnique }, + { 0x2132u, 0x2132u, 0x001cu, CanonicalizeRangeLo }, + { 0x2133u, 0x214du, 0x0000u, CanonicalizeUnique }, + { 0x214eu, 0x214eu, 0x001cu, CanonicalizeRangeHi }, + { 0x214fu, 0x215fu, 0x0000u, CanonicalizeUnique }, + { 0x2160u, 0x216fu, 0x0010u, CanonicalizeRangeLo }, + { 0x2170u, 0x217fu, 0x0010u, CanonicalizeRangeHi }, + { 0x2180u, 0x2182u, 0x0000u, CanonicalizeUnique }, + { 0x2183u, 0x2184u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x2185u, 0x24b5u, 0x0000u, CanonicalizeUnique }, + { 0x24b6u, 0x24cfu, 0x001au, CanonicalizeRangeLo }, + { 0x24d0u, 0x24e9u, 0x001au, CanonicalizeRangeHi }, + { 0x24eau, 0x2bffu, 0x0000u, CanonicalizeUnique }, + { 0x2c00u, 0x2c2eu, 0x0030u, CanonicalizeRangeLo }, + { 0x2c2fu, 0x2c2fu, 0x0000u, CanonicalizeUnique }, + { 0x2c30u, 0x2c5eu, 0x0030u, CanonicalizeRangeHi }, + { 0x2c5fu, 0x2c5fu, 0x0000u, CanonicalizeUnique }, + { 0x2c60u, 0x2c61u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x2c62u, 0x2c62u, 0x29f7u, CanonicalizeRangeHi }, + { 0x2c63u, 0x2c63u, 0x0ee6u, CanonicalizeRangeHi }, + { 0x2c64u, 0x2c64u, 0x29e7u, CanonicalizeRangeHi }, + { 0x2c65u, 0x2c65u, 0x2a2bu, CanonicalizeRangeHi }, + { 0x2c66u, 0x2c66u, 
0x2a28u, CanonicalizeRangeHi }, + { 0x2c67u, 0x2c6cu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x2c6du, 0x2c6du, 0x2a1cu, CanonicalizeRangeHi }, + { 0x2c6eu, 0x2c6eu, 0x29fdu, CanonicalizeRangeHi }, + { 0x2c6fu, 0x2c6fu, 0x2a1fu, CanonicalizeRangeHi }, + { 0x2c70u, 0x2c70u, 0x2a1eu, CanonicalizeRangeHi }, + { 0x2c71u, 0x2c71u, 0x0000u, CanonicalizeUnique }, + { 0x2c72u, 0x2c73u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x2c74u, 0x2c74u, 0x0000u, CanonicalizeUnique }, + { 0x2c75u, 0x2c76u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x2c77u, 0x2c7du, 0x0000u, CanonicalizeUnique }, + { 0x2c7eu, 0x2c7fu, 0x2a3fu, CanonicalizeRangeHi }, + { 0x2c80u, 0x2ce3u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x2ce4u, 0x2ceau, 0x0000u, CanonicalizeUnique }, + { 0x2cebu, 0x2ceeu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x2cefu, 0x2cffu, 0x0000u, CanonicalizeUnique }, + { 0x2d00u, 0x2d25u, 0x1c60u, CanonicalizeRangeHi }, + { 0x2d26u, 0xa63fu, 0x0000u, CanonicalizeUnique }, + { 0xa640u, 0xa66du, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa66eu, 0xa67fu, 0x0000u, CanonicalizeUnique }, + { 0xa680u, 0xa697u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa698u, 0xa721u, 0x0000u, CanonicalizeUnique }, + { 0xa722u, 0xa72fu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa730u, 0xa731u, 0x0000u, CanonicalizeUnique }, + { 0xa732u, 0xa76fu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa770u, 0xa778u, 0x0000u, CanonicalizeUnique }, + { 0xa779u, 0xa77cu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0xa77du, 0xa77du, 0x8a04u, CanonicalizeRangeHi }, + { 0xa77eu, 0xa787u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa788u, 0xa78au, 0x0000u, CanonicalizeUnique }, + { 0xa78bu, 0xa78cu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0xa78du, 0xa78du, 0xa528u, CanonicalizeRangeHi }, + { 0xa78eu, 0xa78fu, 0x0000u, CanonicalizeUnique }, + { 0xa790u, 0xa791u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa792u, 0xa79fu, 0x0000u, 
CanonicalizeUnique }, + { 0xa7a0u, 0xa7a9u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa7aau, 0xff20u, 0x0000u, CanonicalizeUnique }, + { 0xff21u, 0xff3au, 0x0020u, CanonicalizeRangeLo }, + { 0xff3bu, 0xff40u, 0x0000u, CanonicalizeUnique }, + { 0xff41u, 0xff5au, 0x0020u, CanonicalizeRangeHi }, + { 0xff5bu, 0xffffu, 0x0000u, CanonicalizeUnique }, +}; + +const size_t LATIN_CANONICALIZATION_RANGES = 20; +LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = { + { 0x0000u, 0x0040u, 0x0000u, CanonicalizeLatinSelf }, + { 0x0041u, 0x005au, 0x0000u, CanonicalizeLatinMask0x20 }, + { 0x005bu, 0x0060u, 0x0000u, CanonicalizeLatinSelf }, + { 0x0061u, 0x007au, 0x0000u, CanonicalizeLatinMask0x20 }, + { 0x007bu, 0x00bfu, 0x0000u, CanonicalizeLatinSelf }, + { 0x00c0u, 0x00d6u, 0x0000u, CanonicalizeLatinMask0x20 }, + { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeLatinSelf }, + { 0x00d8u, 0x00deu, 0x0000u, CanonicalizeLatinMask0x20 }, + { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeLatinSelf }, + { 0x00e0u, 0x00f6u, 0x0000u, CanonicalizeLatinMask0x20 }, + { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeLatinSelf }, + { 0x00f8u, 0x00feu, 0x0000u, CanonicalizeLatinMask0x20 }, + { 0x00ffu, 0x00ffu, 0x0000u, CanonicalizeLatinSelf }, + { 0x0100u, 0x0177u, 0x0000u, CanonicalizeLatinInvalid }, + { 0x0178u, 0x0178u, 0x00ffu, CanonicalizeLatinOther }, + { 0x0179u, 0x039bu, 0x0000u, CanonicalizeLatinInvalid }, + { 0x039cu, 0x039cu, 0x00b5u, CanonicalizeLatinOther }, + { 0x039du, 0x03bbu, 0x0000u, CanonicalizeLatinInvalid }, + { 0x03bcu, 0x03bcu, 0x00b5u, CanonicalizeLatinOther }, + { 0x03bdu, 0xffffu, 0x0000u, CanonicalizeLatinInvalid }, +}; + +} } // JSC::Yarr + diff --git a/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.h b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.h new file mode 100644 index 000000000..be0ead43d --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.h @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2012 Apple Inc. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrCanonicalizeUCS2_H +#define YarrCanonicalizeUCS2_H + +#include <stdint.h> +#include <wtf/unicode/Unicode.h> + +namespace JSC { namespace Yarr { + +// This set of data (autogenerated using YarrCanonicalizeUCS2.js into YarrCanonicalizeUCS2.cpp) +// provides information for each UCS2 code point as to the set of code points that it should +// match under the ES5.1 case insensitive RegExp matching rules, specified in 15.10.2.8. +enum UCS2CanonicalizationType { + CanonicalizeUnique, // No canonically equal values, e.g. 0x0. + CanonicalizeSet, // Value indicates a set in characterSetInfo. + CanonicalizeRangeLo, // Value is positive delta to pair, E.g. 0x41 has value 0x20, -> 0x61. 
+ CanonicalizeRangeHi, // Value is positive delta to pair, E.g. 0x61 has value 0x20, -> 0x41. + CanonicalizeAlternatingAligned, // Aligned consecutive pair, e.g. 0x1f4,0x1f5. + CanonicalizeAlternatingUnaligned, // Unaligned consecutive pair, e.g. 0x241,0x242. +}; +struct UCS2CanonicalizationRange { uint16_t begin, end, value, type; }; +extern const size_t UCS2_CANONICALIZATION_RANGES; +extern uint16_t* characterSetInfo[]; +extern UCS2CanonicalizationRange rangeInfo[]; + +// This table is similar to the full rangeInfo table, however this maps from UCS2 codepoints to +// the set of Latin1 codepoints that could match. +enum LatinCanonicalizationType { + CanonicalizeLatinSelf, // This character is in the Latin1 range, but has no canonical equivalent in the range. + CanonicalizeLatinMask0x20, // One of a pair of characters, under the mask 0x20. + CanonicalizeLatinOther, // This character is not in the Latin1 range, but canonicalizes to another that is. + CanonicalizeLatinInvalid, // Cannot match against Latin1 input. +}; +struct LatinCanonicalizationRange { uint16_t begin, end, value, type; }; +extern const size_t LATIN_CANONICALIZATION_RANGES; +extern LatinCanonicalizationRange latinRangeInfo[]; + +// This searches in log2 time over ~364 entries, so should typically result in 8 compares. +inline UCS2CanonicalizationRange* rangeInfoFor(UChar ch) +{ + UCS2CanonicalizationRange* info = rangeInfo; + size_t entries = UCS2_CANONICALIZATION_RANGES; + + while (true) { + size_t candidate = entries >> 1; + UCS2CanonicalizationRange* candidateInfo = info + candidate; + if (ch < candidateInfo->begin) + entries = candidate; + else if (ch <= candidateInfo->end) + return candidateInfo; + else { + info = candidateInfo + 1; + entries -= (candidate + 1); + } + } +} + +// Should only be called for characters that have one canonically matching value. 
+inline UChar getCanonicalPair(UCS2CanonicalizationRange* info, UChar ch) +{ + ASSERT(ch >= info->begin && ch <= info->end); + switch (info->type) { + case CanonicalizeRangeLo: + return ch + info->value; + case CanonicalizeRangeHi: + return ch - info->value; + case CanonicalizeAlternatingAligned: + return ch ^ 1; + case CanonicalizeAlternatingUnaligned: + return ((ch - 1) ^ 1) + 1; + default: + ASSERT_NOT_REACHED(); + } + ASSERT_NOT_REACHED(); + return 0; +} + +// Returns true if no other UCS2 codepoint can match this value. +inline bool isCanonicallyUnique(UChar ch) +{ + return rangeInfoFor(ch)->type == CanonicalizeUnique; +} + +// Returns true if values are equal, under the canonicalization rules. +inline bool areCanonicallyEquivalent(UChar a, UChar b) +{ + UCS2CanonicalizationRange* info = rangeInfoFor(a); + switch (info->type) { + case CanonicalizeUnique: + return a == b; + case CanonicalizeSet: { + for (uint16_t* set = characterSetInfo[info->value]; (a = *set); ++set) { + if (a == b) + return true; + } + return false; + } + case CanonicalizeRangeLo: + return (a == b) || (a + info->value == b); + case CanonicalizeRangeHi: + return (a == b) || (a - info->value == b); + case CanonicalizeAlternatingAligned: + return (a | 1) == (b | 1); + case CanonicalizeAlternatingUnaligned: + return ((a - 1) | 1) == ((b - 1) | 1); + } + + ASSERT_NOT_REACHED(); + return false; +} + +} } // JSC::Yarr + +#endif diff --git a/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.js b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.js new file mode 100644 index 000000000..00361dd46 --- /dev/null +++ b/Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.js @@ -0,0 +1,219 @@ +/* + * Copyright (C) 2012 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// See ES 5.1, 15.10.2.8 +function canonicalize(ch) +{ + var u = String.fromCharCode(ch).toUpperCase(); + if (u.length > 1) + return ch; + var cu = u.charCodeAt(0); + if (ch >= 128 && cu < 128) + return ch; + return cu; +} + +var MAX_UCS2 = 0xFFFF; +var MAX_LATIN = 0xFF; + +var groupedCanonically = []; +// Pass 1: populate groupedCanonically - this is mapping from canonicalized +// values back to the set of character code that canonicalize to them. +for (var i = 0; i <= MAX_UCS2; ++i) { + var ch = canonicalize(i); + if (!groupedCanonically[ch]) + groupedCanonically[ch] = []; + groupedCanonically[ch].push(i); +} + +var typeInfo = []; +var latinTypeInfo = []; +var characterSetInfo = []; +// Pass 2: populate typeInfo & characterSetInfo. For every character calculate +// a typeInfo value, described by the types above, and a value payload. 
+for (cu in groupedCanonically) { + // The set of characters that canonicalize to cu + var characters = groupedCanonically[cu]; + + // If there is only one, it is unique. + if (characters.length == 1) { + typeInfo[characters[0]] = "CanonicalizeUnique:0"; + latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0"; + continue; + } + + // Sort the array. + characters.sort(function(x,y){return x-y;}); + + // If there are more than two characters, create an entry in characterSetInfo. + if (characters.length > 2) { + for (i in characters) + typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length; + characterSetInfo.push(characters); + + if (characters[1] <= MAX_LATIN) + throw new Error("sets with more than one latin character not supported!"); + if (characters[0] <= MAX_LATIN) { + for (i in characters) + latinTypeInfo[characters[i]] = "CanonicalizeLatinOther:" + characters[0]; + latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0"; + } else { + for (i in characters) + latinTypeInfo[characters[i]] = "CanonicalizeLatinInvalid:0"; + } + + continue; + } + + // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner. + var lo = characters[0]; + var hi = characters[1]; + var delta = hi - lo; + if (delta == 1) { + var type = lo & 1 ? 
"CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0"; + typeInfo[lo] = type; + typeInfo[hi] = type; + } else { + typeInfo[lo] = "CanonicalizeRangeLo:" + delta; + typeInfo[hi] = "CanonicalizeRangeHi:" + delta; + } + + if (lo > MAX_LATIN) { + latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0"; + latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0"; + } else if (hi > MAX_LATIN) { + latinTypeInfo[lo] = "CanonicalizeLatinSelf:0"; + latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo; + } else { + if (delta != 0x20 || lo & 0x20) + throw new Error("pairs of latin characters that don't mask with 0x20 not supported!"); + latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0"; + latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0"; + } +} + +var rangeInfo = []; +// Pass 3: coallesce types into ranges. +for (var end = 0; end <= MAX_UCS2; ++end) { + var begin = end; + var type = typeInfo[end]; + while (end < MAX_UCS2 && typeInfo[end + 1] == type) + ++end; + rangeInfo.push({begin:begin, end:end, type:type}); +} + +var latinRangeInfo = []; +// Pass 4: coallesce latin-1 types into ranges. +for (var end = 0; end <= MAX_UCS2; ++end) { + var begin = end; + var type = latinTypeInfo[end]; + while (end < MAX_UCS2 && latinTypeInfo[end + 1] == type) + ++end; + latinRangeInfo.push({begin:begin, end:end, type:type}); +} + + +// Helper function to convert a number to a fixed width hex representation of a C uint16_t. +function hex(x) +{ + var s = Number(x).toString(16); + while (s.length < 4) + s = 0 + s; + return "0x" + s + "u"; +} + +var copyright = ( + "/*" + "\n" + + " * Copyright (C) 2012 Apple Inc. All rights reserved." + "\n" + + " *" + "\n" + + " * Redistribution and use in source and binary forms, with or without" + "\n" + + " * modification, are permitted provided that the following conditions" + "\n" + + " * are met:" + "\n" + + " * 1. Redistributions of source code must retain the above copyright" + "\n" + + " * notice, this list of conditions and the following disclaimer." 
+ "\n" + + " * 2. Redistributions in binary form must reproduce the above copyright" + "\n" + + " * notice, this list of conditions and the following disclaimer in the" + "\n" + + " * documentation and/or other materials provided with the distribution." + "\n" + + " *" + "\n" + + " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY" + "\n" + + " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE" + "\n" + + " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR" + "\n" + + " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR" + "\n" + + " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL," + "\n" + + " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO," + "\n" + + " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR" + "\n" + + " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY" + "\n" + + " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT" + "\n" + + " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE" + "\n" + + " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " + "\n" + + " */"); + +print(copyright); +print(); +print("// DO NOT EDIT! 
- this file autogenerated by YarrCanonicalizeUCS2.js"); +print(); +print('#include "config.h"'); +print('#include "YarrCanonicalizeUCS2.h"'); +print(); +print("namespace JSC { namespace Yarr {"); +print(); +print("#include <stdint.h>"); +print(); + +for (i in characterSetInfo) { + var characters = "" + var set = characterSetInfo[i]; + for (var j in set) + characters += hex(set[j]) + ", "; + print("uint16_t ucs2CharacterSet" + i + "[] = { " + characters + "0 };"); +} +print(); +print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";"); +print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {"); +for (i in characterSetInfo) +print(" ucs2CharacterSet" + i + ","); +print("};"); +print(); +print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";"); +print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {"); +for (i in rangeInfo) { + var info = rangeInfo[i]; + var typeAndValue = info.type.split(':'); + print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },"); +} +print("};"); +print(); +print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";"); +print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {"); +for (i in latinRangeInfo) { + var info = latinRangeInfo[i]; + var typeAndValue = info.type.split(':'); + print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },"); +} +print("};"); +print(); +print("} } // JSC::Yarr"); +print(); + diff --git a/Source/JavaScriptCore/yarr/YarrInterpreter.cpp b/Source/JavaScriptCore/yarr/YarrInterpreter.cpp index 743f16048..ba10171bf 100644 --- a/Source/JavaScriptCore/yarr/YarrInterpreter.cpp +++ b/Source/JavaScriptCore/yarr/YarrInterpreter.cpp @@ -29,6 +29,7 @@ #include "UString.h" #include "Yarr.h" +#include "YarrCanonicalizeUCS2.h" #include <wtf/BumpPointerAllocator.h> #include 
<wtf/DataLog.h> #include <wtf/text/CString.h> @@ -41,6 +42,7 @@ using namespace WTF; namespace JSC { namespace Yarr { +template<typename CharType> class Interpreter { public: struct ParenthesesDisjunctionContext; @@ -169,55 +171,9 @@ public: allocatorPool = allocatorPool->dealloc(context); } - // This class is a placeholder for future character iterator, current - // proposed name StringConstCharacterIterator. - class CharAccess { - public: - CharAccess(const UString& s) - { - if (s.is8Bit()) { - m_charSize = Char8; - m_ptr.ptr8 = s.characters8(); - } else { - m_charSize = Char16; - m_ptr.ptr16 = s.characters16(); - } - } - - CharAccess(const LChar* ptr) - : m_charSize(Char8) - { - m_ptr.ptr8 = ptr; - } - - CharAccess(const UChar* ptr) - : m_charSize(Char16) - { - m_ptr.ptr16 = ptr; - } - - ~CharAccess() - { - } - - inline UChar operator[](unsigned index) - { - if (m_charSize == Char8) - return m_ptr.ptr8[index]; - return m_ptr.ptr16[index]; - } - - private: - union { - const LChar* ptr8; - const UChar* ptr16; - } m_ptr; - YarrCharSize m_charSize; - }; - class InputStream { public: - InputStream(const UString& input, unsigned start, unsigned length) + InputStream(const CharType* input, unsigned start, unsigned length) : input(input) , pos(start) , length(length) @@ -331,7 +287,7 @@ public: } private: - CharAccess input; + const CharType* input; unsigned pos; unsigned length; }; @@ -383,15 +339,22 @@ public: if (pattern->m_ignoreCase) { for (unsigned i = 0; i < matchSize; ++i) { - int ch = input.reread(matchBegin + i); + int oldCh = input.reread(matchBegin + i); + int ch = input.readChecked(negativeInputOffset + matchSize - i); - int lo = Unicode::toLower(ch); - int hi = Unicode::toUpper(ch); + if (oldCh == ch) + continue; - if ((lo != hi) ? 
(!checkCasedCharacter(lo, hi, negativeInputOffset + matchSize - i)) : (!checkCharacter(ch, negativeInputOffset + matchSize - i))) { - input.uncheckInput(matchSize); - return false; - } + // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that + // unicode values are never allowed to match against ascii ones. + if (isASCII(oldCh) || isASCII(ch)) { + if (toASCIIUpper(oldCh) == toASCIIUpper(ch)) + continue; + } else if (areCanonicallyEquivalent(oldCh, ch)) + continue; + + input.uncheckInput(matchSize); + return false; } } else { for (unsigned i = 0; i < matchSize; ++i) { @@ -1481,7 +1444,7 @@ public: return output[0]; } - Interpreter(BytecodePattern* pattern, unsigned* output, const UString input, unsigned start, unsigned length) + Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start) : pattern(pattern) , output(output) , input(input, start, length) @@ -1971,18 +1934,31 @@ PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocat return ByteCompiler(pattern).compile(allocator); } -unsigned interpret(BytecodePattern* bytecode, const UString& input, unsigned start, unsigned length, unsigned* output) +unsigned interpret(BytecodePattern* bytecode, const UString& input, unsigned start, unsigned* output) +{ + if (input.is8Bit()) + return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret(); + return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret(); +} + +unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output) +{ + return Interpreter<LChar>(bytecode, output, input, length, start).interpret(); +} + +unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output) { - return Interpreter(bytecode, output, input, start, length).interpret(); + return Interpreter<UChar>(bytecode, output, 
input, length, start).interpret(); } -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses); +// These should be the same for both UChar & LChar. 
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses); } } diff --git a/Source/JavaScriptCore/yarr/YarrInterpreter.h b/Source/JavaScriptCore/yarr/YarrInterpreter.h index 4bb1efc50..4ecd69eca 100644 --- a/Source/JavaScriptCore/yarr/YarrInterpreter.h +++ b/Source/JavaScriptCore/yarr/YarrInterpreter.h @@ -375,6 +375,11 @@ private: Vector<CharacterClass*> m_userCharacterClasses; }; +JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*); +JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const UString& input, unsigned start, unsigned* output); +unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned 
start, unsigned* output); +unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); + } } // namespace JSC::Yarr #endif // YarrInterpreter_h diff --git a/Source/JavaScriptCore/yarr/YarrJIT.cpp b/Source/JavaScriptCore/yarr/YarrJIT.cpp index 2269792ec..60519ebd8 100644 --- a/Source/JavaScriptCore/yarr/YarrJIT.cpp +++ b/Source/JavaScriptCore/yarr/YarrJIT.cpp @@ -29,6 +29,7 @@ #include <wtf/ASCIICType.h> #include "LinkBuffer.h" #include "Yarr.h" +#include "YarrCanonicalizeUCS2.h" #if ENABLE(YARR_JIT) @@ -36,6 +37,7 @@ using namespace WTF; namespace JSC { namespace Yarr { +template<YarrJITCompileMode compileMode> class YarrGenerator : private MacroAssembler { friend void jitCompile(JSGlobalData*, YarrCodeBlock& jitObject, const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline); @@ -49,6 +51,7 @@ class YarrGenerator : private MacroAssembler { static const RegisterID regT1 = ARMRegisters::r6; static const RegisterID returnRegister = ARMRegisters::r0; + static const RegisterID returnRegister2 = ARMRegisters::r1; #elif CPU(MIPS) static const RegisterID input = MIPSRegisters::a0; static const RegisterID index = MIPSRegisters::a1; @@ -59,6 +62,7 @@ class YarrGenerator : private MacroAssembler { static const RegisterID regT1 = MIPSRegisters::t5; static const RegisterID returnRegister = MIPSRegisters::v0; + static const RegisterID returnRegister2 = MIPSRegisters::v1; #elif CPU(SH4) static const RegisterID input = SH4Registers::r4; static const RegisterID index = SH4Registers::r5; @@ -69,6 +73,7 @@ class YarrGenerator : private MacroAssembler { static const RegisterID regT1 = SH4Registers::r1; static const RegisterID returnRegister = SH4Registers::r0; + static const RegisterID returnRegister2 = SH4Registers::r1; #elif CPU(X86) static const RegisterID input = X86Registers::eax; static const RegisterID index = X86Registers::edx; @@ -79,6 +84,7 @@ class YarrGenerator : private 
MacroAssembler { static const RegisterID regT1 = X86Registers::esi; static const RegisterID returnRegister = X86Registers::eax; + static const RegisterID returnRegister2 = X86Registers::edx; #elif CPU(X86_64) static const RegisterID input = X86Registers::edi; static const RegisterID index = X86Registers::esi; @@ -89,6 +95,7 @@ class YarrGenerator : private MacroAssembler { static const RegisterID regT1 = X86Registers::ebx; static const RegisterID returnRegister = X86Registers::eax; + static const RegisterID returnRegister2 = X86Registers::edx; #endif void optimizeAlternative(PatternAlternative* alternative) @@ -262,10 +269,10 @@ class YarrGenerator : private MacroAssembler { // For case-insesitive compares, non-ascii characters that have different // upper & lower case representations are converted to a character class. - ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { - or32(TrustedImm32(32), character); - ch = Unicode::toLower(ch); + or32(TrustedImm32(0x20), character); + ch |= 0x20; } return branch32(NotEqual, character, Imm32(ch)); @@ -304,6 +311,65 @@ class YarrGenerator : private MacroAssembler { jump(Address(stackPointerRegister, frameLocation * sizeof(void*))); } + void initCallFrame() + { + unsigned callFrameSize = m_pattern.m_body->m_callFrameSize; + if (callFrameSize) + subPtr(Imm32(callFrameSize * sizeof(void*)), stackPointerRegister); + } + void removeCallFrame() + { + unsigned callFrameSize = m_pattern.m_body->m_callFrameSize; + if (callFrameSize) + addPtr(Imm32(callFrameSize * sizeof(void*)), stackPointerRegister); + } + + // Used to record subpatters, should only be called if compileMode is IncludeSubpatterns. 
+ void setSubpatternStart(RegisterID reg, unsigned subpattern) + { + ASSERT(subpattern); + // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-( + store32(reg, Address(output, (subpattern << 1) * sizeof(int))); + } + void setSubpatternEnd(RegisterID reg, unsigned subpattern) + { + ASSERT(subpattern); + // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-( + store32(reg, Address(output, ((subpattern << 1) + 1) * sizeof(int))); + } + void clearSubpatternStart(unsigned subpattern) + { + ASSERT(subpattern); + // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-( + store32(TrustedImm32(-1), Address(output, (subpattern << 1) * sizeof(int))); + } + + // We use one of three different strategies to track the start of the current match, + // while matching. + // 1) If the pattern has a fixed size, do nothing! - we calculate the value lazily + // at the end of matching. This is irrespective of compileMode, and in this case + // these methods should never be called. + // 2) If we're compiling IncludeSubpatterns, 'output' contains a pointer to an output + // vector, store the match start in the output vector. + // 3) If we're compiling MatchOnly, 'output' is unused, store the match start directly + // in this register. + void setMatchStart(RegisterID reg) + { + ASSERT(!m_pattern.m_body->m_hasFixedSize); + if (compileMode == IncludeSubpatterns) + store32(reg, output); + else + move(reg, output); + } + void getMatchStart(RegisterID reg) + { + ASSERT(!m_pattern.m_body->m_hasFixedSize); + if (compileMode == IncludeSubpatterns) + load32(output, reg); + else + move(output, reg); + } + enum YarrOpCode { // These nodes wrap body alternatives - those in the main disjunction, // rather than subpatterns or assertions. 
These are chained together in @@ -685,9 +751,9 @@ class YarrGenerator : private MacroAssembler { // For case-insesitive compares, non-ascii characters that have different // upper & lower case representations are converted to a character class. - ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); - if ((m_pattern.m_ignoreCase) && (isASCIIAlpha(ch))) + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) ignoreCaseMask |= 32; for (numberCharacters = 1; numberCharacters < maxCharactersAtOnce && nextOp->m_op == OpTerm; ++numberCharacters, nextOp = &m_ops[opIndex + numberCharacters]) { @@ -713,7 +779,7 @@ class YarrGenerator : private MacroAssembler { // For case-insesitive compares, non-ascii characters that have different // upper & lower case representations are converted to a character class. - ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(currentCharacter) || (Unicode::toLower(currentCharacter) == Unicode::toUpper(currentCharacter))); + ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(currentCharacter) || isCanonicallyUnique(currentCharacter)); allCharacters |= (currentCharacter << shiftAmount); @@ -728,12 +794,12 @@ class YarrGenerator : private MacroAssembler { return; case 2: { BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); - load16(address, character); + load16Unaligned(address, character); break; } case 3: { BaseIndex highAddress(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); - load16(highAddress, character); + load16Unaligned(highAddress, character); if (ignoreCaseMask) or32(Imm32(ignoreCaseMask), character); op.m_jumps.append(branch32(NotEqual, character, Imm32((allCharacters & 0xffff) | ignoreCaseMask))); @@ -790,10 +856,10 @@ class YarrGenerator : private MacroAssembler { // For case-insesitive compares, non-ascii characters that have different // upper & 
lower case representations are converted to a character class. - ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || (Unicode::toLower(ch) == Unicode::toUpper(ch))); + ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { - or32(TrustedImm32(32), character); - ch = Unicode::toLower(ch); + or32(TrustedImm32(0x20), character); + ch |= 0x20; } op.m_jumps.append(branch32(NotEqual, character, Imm32(ch))); @@ -1029,7 +1095,6 @@ class YarrGenerator : private MacroAssembler { m_backtrackingState.link(this); - Label backtrackBegin(this); loadFromFrame(term->frameLocation, countRegister); nonGreedyFailures.append(atEndOfInput()); @@ -1068,11 +1133,8 @@ class YarrGenerator : private MacroAssembler { JumpList saveStartIndex; JumpList foundEndingNewLine; - if (m_pattern.m_body->m_hasFixedSize) { - move(index, matchPos); - sub32(Imm32(m_checked), matchPos); - } else - load32(Address(output), matchPos); + ASSERT(!m_pattern.m_body->m_hasFixedSize); + getMatchStart(matchPos); saveStartIndex.append(branchTest32(Zero, matchPos)); Label findBOLLoop(this); @@ -1092,7 +1154,8 @@ class YarrGenerator : private MacroAssembler { if (!m_pattern.m_multiline && term->anchors.bolAnchor) op.m_jumps.append(branchTest32(NonZero, matchPos)); - store32(matchPos, Address(output)); + ASSERT(!m_pattern.m_body->m_hasFixedSize); + setMatchStart(matchPos); move(index, matchPos); @@ -1314,8 +1377,7 @@ class YarrGenerator : private MacroAssembler { // If we get here, the prior alternative matched - return success. // Adjust the stack pointer to remove the pattern's frame. - if (m_pattern.m_body->m_callFrameSize) - addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + removeCallFrame(); // Load appropriate values into the return register and the first output // slot, and return. 
In the case of pattern with a fixed size, we will @@ -1325,10 +1387,14 @@ class YarrGenerator : private MacroAssembler { move(index, returnRegister); if (priorAlternative->m_minimumSize) sub32(Imm32(priorAlternative->m_minimumSize), returnRegister); - store32(returnRegister, output); + if (compileMode == IncludeSubpatterns) + store32(returnRegister, output); } else - load32(Address(output), returnRegister); - store32(index, Address(output, 4)); + getMatchStart(returnRegister); + if (compileMode == IncludeSubpatterns) + store32(index, Address(output, 4)); + move(index, returnRegister2); + generateReturn(); // This is the divide between the tail of the prior alternative, above, and @@ -1511,17 +1577,16 @@ class YarrGenerator : private MacroAssembler { // FIXME: could avoid offsetting this value in JIT code, apply // offsets only afterwards, at the point the results array is // being accessed. - if (term->capture()) { - int offsetId = term->parentheses.subpatternId << 1; + if (term->capture() && compileMode == IncludeSubpatterns) { int inputOffset = term->inputPosition - m_checked; if (term->quantityType == QuantifierFixedCount) inputOffset -= term->parentheses.disjunction->m_minimumSize; if (inputOffset) { move(index, indexTemporary); add32(Imm32(inputOffset), indexTemporary); - store32(indexTemporary, Address(output, offsetId * sizeof(int))); + setSubpatternStart(indexTemporary, term->parentheses.subpatternId); } else - store32(index, Address(output, offsetId * sizeof(int))); + setSubpatternStart(index, term->parentheses.subpatternId); } break; } @@ -1547,15 +1612,14 @@ class YarrGenerator : private MacroAssembler { // FIXME: could avoid offsetting this value in JIT code, apply // offsets only afterwards, at the point the results array is // being accessed. 
- if (term->capture()) { - int offsetId = (term->parentheses.subpatternId << 1) + 1; + if (term->capture() && compileMode == IncludeSubpatterns) { int inputOffset = term->inputPosition - m_checked; if (inputOffset) { move(index, indexTemporary); add32(Imm32(inputOffset), indexTemporary); - store32(indexTemporary, Address(output, offsetId * sizeof(int))); + setSubpatternEnd(indexTemporary, term->parentheses.subpatternId); } else - store32(index, Address(output, offsetId * sizeof(int))); + setSubpatternEnd(index, term->parentheses.subpatternId); } // If the parentheses are quantified Greedy then add a label to jump back @@ -1645,9 +1709,9 @@ class YarrGenerator : private MacroAssembler { } case OpMatchFailed: - if (m_pattern.m_body->m_callFrameSize) - addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); - move(TrustedImm32(-1), returnRegister); + removeCallFrame(); + move(TrustedImmPtr((void*)WTF::notFound), returnRegister); + move(TrustedImm32(0), returnRegister2); generateReturn(); break; } @@ -1742,14 +1806,14 @@ class YarrGenerator : private MacroAssembler { // If the pattern size is not fixed, then store the start index, for use if we match. if (!m_pattern.m_body->m_hasFixedSize) { if (alternative->m_minimumSize == 1) - store32(index, Address(output)); + setMatchStart(index); else { move(index, regT0); if (alternative->m_minimumSize) sub32(Imm32(alternative->m_minimumSize - 1), regT0); else add32(TrustedImm32(1), regT0); - store32(regT0, Address(output)); + setMatchStart(regT0); } } @@ -1835,7 +1899,7 @@ class YarrGenerator : private MacroAssembler { // disjunction is 0, e.g. /a*|b/). if (needsToUpdateMatchStart && alternative->m_minimumSize == 1) { // index is already incremented by 1, so just store it now! 
- store32(index, Address(output)); + setMatchStart(index); needsToUpdateMatchStart = false; } @@ -1859,11 +1923,11 @@ class YarrGenerator : private MacroAssembler { if (needsToUpdateMatchStart) { if (!m_pattern.m_body->m_minimumSize) - store32(index, Address(output)); + setMatchStart(index); else { move(index, regT0); sub32(Imm32(m_pattern.m_body->m_minimumSize), regT0); - store32(regT0, Address(output)); + setMatchStart(regT0); } } @@ -1885,9 +1949,9 @@ class YarrGenerator : private MacroAssembler { // run any matches, and need to return a failure state from JIT code. matchFailed.link(this); - if (m_pattern.m_body->m_callFrameSize) - addPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); - move(TrustedImm32(-1), returnRegister); + removeCallFrame(); + move(TrustedImmPtr((void*)WTF::notFound), returnRegister); + move(TrustedImm32(0), returnRegister2); generateReturn(); break; } @@ -2054,12 +2118,12 @@ class YarrGenerator : private MacroAssembler { ASSERT(term->quantityCount == 1); // We only need to backtrack to thispoint if capturing or greedy. - if (term->capture() || term->quantityType == QuantifierGreedy) { + if ((term->capture() && compileMode == IncludeSubpatterns) || term->quantityType == QuantifierGreedy) { m_backtrackingState.link(this); // If capturing, clear the capture (we only need to reset start). - if (term->capture()) - store32(TrustedImm32(-1), Address(output, (term->parentheses.subpatternId << 1) * sizeof(int))); + if (term->capture() && compileMode == IncludeSubpatterns) + clearSubpatternStart(term->parentheses.subpatternId); // If Greedy, jump to the end. 
if (term->quantityType == QuantifierGreedy) { @@ -2449,9 +2513,11 @@ class YarrGenerator : private MacroAssembler { loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), input); loadPtr(Address(X86Registers::ebp, 3 * sizeof(void*)), index); loadPtr(Address(X86Registers::ebp, 4 * sizeof(void*)), length); - loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output); + if (compileMode == IncludeSubpatterns) + loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output); #else - loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output); + if (compileMode == IncludeSubpatterns) + loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output); #endif #elif CPU(ARM) push(ARMRegisters::r4); @@ -2460,7 +2526,8 @@ class YarrGenerator : private MacroAssembler { #if CPU(ARM_TRADITIONAL) push(ARMRegisters::r8); // scratch register #endif - move(ARMRegisters::r3, output); + if (compileMode == IncludeSubpatterns) + move(ARMRegisters::r3, output); #elif CPU(SH4) push(SH4Registers::r11); push(SH4Registers::r13); @@ -2510,18 +2577,20 @@ public: generateEnter(); Jump hasInput = checkInput(); - move(TrustedImm32(-1), returnRegister); + move(TrustedImmPtr((void*)WTF::notFound), returnRegister); + move(TrustedImm32(0), returnRegister2); generateReturn(); hasInput.link(this); - for (unsigned i = 0; i < m_pattern.m_numSubpatterns + 1; ++i) - store32(TrustedImm32(-1), Address(output, (i << 1) * sizeof(int))); + if (compileMode == IncludeSubpatterns) { + for (unsigned i = 0; i < m_pattern.m_numSubpatterns + 1; ++i) + store32(TrustedImm32(-1), Address(output, (i << 1) * sizeof(int))); + } if (!m_pattern.m_body->m_hasFixedSize) - store32(index, Address(output)); + setMatchStart(index); - if (m_pattern.m_body->m_callFrameSize) - subPtr(Imm32(m_pattern.m_body->m_callFrameSize * sizeof(void*)), stackPointerRegister); + initCallFrame(); // Compile the pattern to the internal 'YarrOp' representation. 
opCompileBody(m_pattern.m_body); @@ -2539,10 +2608,18 @@ public: // Link & finalize the code. LinkBuffer linkBuffer(*globalData, this, REGEXP_CODE_ID); m_backtrackingState.linkDataLabels(linkBuffer); - if (m_charSize == Char8) - jitObject.set8BitCode(linkBuffer.finalizeCode()); - else - jitObject.set16BitCode(linkBuffer.finalizeCode()); + + if (compileMode == MatchOnly) { + if (m_charSize == Char8) + jitObject.set8BitCodeMatchOnly(linkBuffer.finalizeCode()); + else + jitObject.set16BitCodeMatchOnly(linkBuffer.finalizeCode()); + } else { + if (m_charSize == Char8) + jitObject.set8BitCode(linkBuffer.finalizeCode()); + else + jitObject.set16BitCode(linkBuffer.finalizeCode()); + } jitObject.setFallBack(m_shouldFallBack); } @@ -2576,9 +2653,12 @@ private: BacktrackingState m_backtrackingState; }; -void jitCompile(YarrPattern& pattern, YarrCharSize charSize, JSGlobalData* globalData, YarrCodeBlock& jitObject) +void jitCompile(YarrPattern& pattern, YarrCharSize charSize, JSGlobalData* globalData, YarrCodeBlock& jitObject, YarrJITCompileMode mode) { - YarrGenerator(pattern, charSize).compile(globalData, jitObject); + if (mode == MatchOnly) + YarrGenerator<MatchOnly>(pattern, charSize).compile(globalData, jitObject); + else + YarrGenerator<IncludeSubpatterns>(pattern, charSize).compile(globalData, jitObject); } }} diff --git a/Source/JavaScriptCore/yarr/YarrJIT.h b/Source/JavaScriptCore/yarr/YarrJIT.h index 38ae76cc4..71928e73c 100644 --- a/Source/JavaScriptCore/yarr/YarrJIT.h +++ b/Source/JavaScriptCore/yarr/YarrJIT.h @@ -29,7 +29,8 @@ #if ENABLE(YARR_JIT) #include "JSGlobalData.h" -#include "MacroAssembler.h" +#include "MacroAssemblerCodeRef.h" +#include "MatchResult.h" #include "UString.h" #include "Yarr.h" #include "YarrPattern.h" @@ -48,8 +49,17 @@ class ExecutablePool; namespace Yarr { class YarrCodeBlock { - typedef int (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL; - typedef int (*YarrJITCode16)(const UChar* input, 
unsigned start, unsigned length, int* output) YARR_CALL; +#if CPU(X86_64) + typedef MatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + typedef MatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + typedef MatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL; + typedef MatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL; +#else + typedef EncodedMatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + typedef EncodedMatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + typedef EncodedMatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL; + typedef EncodedMatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL; +#endif public: YarrCodeBlock() @@ -63,43 +73,67 @@ public: void setFallBack(bool fallback) { m_needFallBack = fallback; } bool isFallBack() { return m_needFallBack; } + bool has8BitCode() { return m_ref8.size(); } bool has16BitCode() { return m_ref16.size(); } - void set8BitCode(MacroAssembler::CodeRef ref) { m_ref8 = ref; } - void set16BitCode(MacroAssembler::CodeRef ref) { m_ref16 = ref; } + void set8BitCode(MacroAssemblerCodeRef ref) { m_ref8 = ref; } + void set16BitCode(MacroAssemblerCodeRef ref) { m_ref16 = ref; } - int execute(const LChar* input, unsigned start, unsigned length, int* output) + bool has8BitCodeMatchOnly() { return m_matchOnly8.size(); } + bool has16BitCodeMatchOnly() { return m_matchOnly16.size(); } + void set8BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly8 = matchOnly; } + void set16BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly16 = matchOnly; } + + MatchResult execute(const LChar* input, unsigned start, 
unsigned length, int* output) { ASSERT(has8BitCode()); - return reinterpret_cast<YarrJITCode8>(m_ref8.code().executableAddress())(input, start, length, output); + return MatchResult(reinterpret_cast<YarrJITCode8>(m_ref8.code().executableAddress())(input, start, length, output)); } - int execute(const UChar* input, unsigned start, unsigned length, int* output) + MatchResult execute(const UChar* input, unsigned start, unsigned length, int* output) { ASSERT(has16BitCode()); - return reinterpret_cast<YarrJITCode16>(m_ref16.code().executableAddress())(input, start, length, output); + return MatchResult(reinterpret_cast<YarrJITCode16>(m_ref16.code().executableAddress())(input, start, length, output)); } + + MatchResult execute(const LChar* input, unsigned start, unsigned length) + { + ASSERT(has8BitCodeMatchOnly()); + return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly8>(m_matchOnly8.code().executableAddress())(input, start, length)); + } + + MatchResult execute(const UChar* input, unsigned start, unsigned length) + { + ASSERT(has16BitCodeMatchOnly()); + return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly16>(m_matchOnly16.code().executableAddress())(input, start, length)); + } + #if ENABLE(REGEXP_TRACING) void *getAddr() { return m_ref.code().executableAddress(); } #endif + void clear() + { + m_ref8 = MacroAssemblerCodeRef(); + m_ref16 = MacroAssemblerCodeRef(); + m_matchOnly8 = MacroAssemblerCodeRef(); + m_matchOnly16 = MacroAssemblerCodeRef(); + m_needFallBack = false; + } + private: - MacroAssembler::CodeRef m_ref8; - MacroAssembler::CodeRef m_ref16; + MacroAssemblerCodeRef m_ref8; + MacroAssemblerCodeRef m_ref16; + MacroAssemblerCodeRef m_matchOnly8; + MacroAssemblerCodeRef m_matchOnly16; bool m_needFallBack; }; -void jitCompile(YarrPattern&, YarrCharSize, JSGlobalData*, YarrCodeBlock& jitObject); - -inline int execute(YarrCodeBlock& jitObject, const LChar* input, unsigned start, unsigned length, int* output) -{ - return jitObject.execute(input, start, 
length, output); -} - -inline int execute(YarrCodeBlock& jitObject, const UChar* input, unsigned start, unsigned length, int* output) -{ - return jitObject.execute(input, start, length, output); -} +enum YarrJITCompileMode { + MatchOnly, + IncludeSubpatterns +}; +void jitCompile(YarrPattern&, YarrCharSize, JSGlobalData*, YarrCodeBlock& jitObject, YarrJITCompileMode = IncludeSubpatterns); } } // namespace JSC::Yarr diff --git a/Source/JavaScriptCore/yarr/YarrPattern.cpp b/Source/JavaScriptCore/yarr/YarrPattern.cpp index f0d10e624..bbda9c526 100644 --- a/Source/JavaScriptCore/yarr/YarrPattern.cpp +++ b/Source/JavaScriptCore/yarr/YarrPattern.cpp @@ -28,6 +28,7 @@ #include "YarrPattern.h" #include "Yarr.h" +#include "YarrCanonicalizeUCS2.h" #include "YarrParser.h" #include <wtf/Vector.h> @@ -66,32 +67,43 @@ public: void putChar(UChar ch) { + // Handle ascii cases. if (ch <= 0x7f) { if (m_isCaseInsensitive && isASCIIAlpha(ch)) { addSorted(m_matches, toASCIIUpper(ch)); addSorted(m_matches, toASCIILower(ch)); } else addSorted(m_matches, ch); - } else { - UChar upper, lower; - if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { - addSorted(m_matchesUnicode, upper); - addSorted(m_matchesUnicode, lower); - } else - addSorted(m_matchesUnicode, ch); + return; } - } - // returns true if this character has another case, and 'ch' is the upper case form. - static inline bool isUnicodeUpper(UChar ch) - { - return ch != Unicode::toLower(ch); + // Simple case, not a case-insensitive match. + if (!m_isCaseInsensitive) { + addSorted(m_matchesUnicode, ch); + return; + } + + // Add multiple matches, if necessary. + UCS2CanonicalizationRange* info = rangeInfoFor(ch); + if (info->type == CanonicalizeUnique) + addSorted(m_matchesUnicode, ch); + else + putUnicodeIgnoreCase(ch, info); } - // returns true if this character has another case, and 'ch' is the lower case form. 
- static inline bool isUnicodeLower(UChar ch) + void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info) { - return ch != Unicode::toUpper(ch); + ASSERT(m_isCaseInsensitive); + ASSERT(ch > 0x7f); + ASSERT(ch >= info->begin && ch <= info->end); + ASSERT(info->type != CanonicalizeUnique); + if (info->type == CanonicalizeSet) { + for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) + addSorted(m_matchesUnicode, ch); + } else { + addSorted(m_matchesUnicode, ch); + addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); + } } void putRange(UChar lo, UChar hi) @@ -108,36 +120,59 @@ public: addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); } } - if (hi >= 0x80) { - uint32_t unicodeCurr = std::max(lo, (UChar)0x80); - addSortedRange(m_rangesUnicode, unicodeCurr, hi); - - if (m_isCaseInsensitive) { - while (unicodeCurr <= hi) { - // If the upper bound of the range (hi) is 0xffff, the increments to - // unicodeCurr in this loop may take it to 0x10000. This is fine - // (if so we won't re-enter the loop, since the loop condition above - // will definitely fail) - but this does mean we cannot use a UChar - // to represent unicodeCurr, we must use a 32-bit value instead. 
- ASSERT(unicodeCurr <= 0xffff); - - if (isUnicodeUpper(unicodeCurr)) { - UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); - UChar lowerCaseRangeEnd = lowerCaseRangeBegin; - while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) - lowerCaseRangeEnd++; - addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); - } else if (isUnicodeLower(unicodeCurr)) { - UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); - UChar upperCaseRangeEnd = upperCaseRangeBegin; - while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) - upperCaseRangeEnd++; - addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); - } else - ++unicodeCurr; - } + if (hi <= 0x7f) + return; + + lo = std::max(lo, (UChar)0x80); + addSortedRange(m_rangesUnicode, lo, hi); + + if (!m_isCaseInsensitive) + return; + + UCS2CanonicalizationRange* info = rangeInfoFor(lo); + while (true) { + // Handle the range [lo .. end] + UChar end = std::min<UChar>(info->end, hi); + + switch (info->type) { + case CanonicalizeUnique: + // Nothing to do - no canonical equivalents. + break; + case CanonicalizeSet: { + UChar ch; + for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) + addSorted(m_matchesUnicode, ch); + break; } - } + case CanonicalizeRangeLo: + addSortedRange(m_rangesUnicode, lo + info->value, end + info->value); + break; + case CanonicalizeRangeHi: + addSortedRange(m_rangesUnicode, lo - info->value, end - info->value); + break; + case CanonicalizeAlternatingAligned: + // Use addSortedRange since there is likely an abutting range to combine with. + if (lo & 1) + addSortedRange(m_rangesUnicode, lo - 1, lo - 1); + if (!(end & 1)) + addSortedRange(m_rangesUnicode, end + 1, end + 1); + break; + case CanonicalizeAlternatingUnaligned: + // Use addSortedRange since there is likely an abutting range to combine with. 
+ if (!(lo & 1)) + addSortedRange(m_rangesUnicode, lo - 1, lo - 1); + if (end & 1) + addSortedRange(m_rangesUnicode, end + 1, end + 1); + break; + } + + if (hi == end) + return; + + ++info; + lo = info->begin; + }; + } CharacterClass* charClass() @@ -280,12 +315,21 @@ public: { // We handle case-insensitive checking of unicode characters which do have both // cases by handling them as if they were defined using a CharacterClass. - if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { - atomCharacterClassBegin(); - atomCharacterClassAtom(ch); - atomCharacterClassEnd(); - } else + if (!m_pattern.m_ignoreCase || isASCII(ch)) { + m_alternative->m_terms.append(PatternTerm(ch)); + return; + } + + UCS2CanonicalizationRange* info = rangeInfoFor(ch); + if (info->type == CanonicalizeUnique) { m_alternative->m_terms.append(PatternTerm(ch)); + return; + } + + m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); + CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); + m_pattern.m_userCharacterClasses.append(newCharacterClass); + m_alternative->m_terms.append(PatternTerm(newCharacterClass, false)); } void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) diff --git a/Source/JavaScriptCore/yarr/yarr.pri b/Source/JavaScriptCore/yarr/yarr.pri index c2634864f..623098fd3 100644 --- a/Source/JavaScriptCore/yarr/yarr.pri +++ b/Source/JavaScriptCore/yarr/yarr.pri @@ -7,7 +7,12 @@ SOURCES += \ $$PWD/YarrInterpreter.cpp \ $$PWD/YarrPattern.cpp \ - $$PWD/YarrSyntaxChecker.cpp + $$PWD/YarrSyntaxChecker.cpp \ + $$PWD/YarrCanonicalizeUCS2.cpp # For UString.h -v8: INCLUDEPATH += $$PWD/../runtime +v8 { + INCLUDEPATH += \ + $$PWD/.. \ + $$PWD/../runtime +} |