/* * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All rights reserved. * Copyright (C) 2005 Alexey Proskuryakov. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "platform/text/UnicodeUtilities.h" #include "wtf/text/StringBuffer.h" #include "wtf/unicode/CharacterNames.h" #include using namespace WTF::Unicode; namespace WebCore { enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark }; template static inline CharType foldQuoteMarkOrSoftHyphen(CharType c) { switch (static_cast(c)) { case hebrewPunctuationGershayim: case leftDoubleQuotationMark: case rightDoubleQuotationMark: return '"'; case hebrewPunctuationGeresh: case leftSingleQuotationMark: case rightSingleQuotationMark: return '\''; case softHyphen: // Replace soft hyphen with an ignorable character so that their presence or absence will // not affect string comparison. return 0; default: return c; } } void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length) { for (size_t i = 0; i < length; ++i) data[i] = foldQuoteMarkOrSoftHyphen(data[i]); } void foldQuoteMarksAndSoftHyphens(String& s) { s.replace(hebrewPunctuationGeresh, '\''); s.replace(hebrewPunctuationGershayim, '"'); s.replace(leftDoubleQuotationMark, '"'); s.replace(leftSingleQuotationMark, '\''); s.replace(rightDoubleQuotationMark, '"'); s.replace(rightSingleQuotationMark, '\''); // Replace soft hyphen with an ignorable character so that their presence or absence will // not affect string comparison. s.replace(softHyphen, 0); } static bool isNonLatin1Separator(UChar32 character) { ASSERT_ARG(character, character >= 256); return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK); } bool isSeparator(UChar32 character) { static const bool latin1SeparatorTable[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . / 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ? 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ` 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; if (character < 256) return latin1SeparatorTable[character]; return isNonLatin1Separator(character); } // ICU's search ignores the distinction between small kana letters and ones // that are not small, and also characters that differ only in the voicing // marks when considering only primary collation strength differences. // This is not helpful for end users, since these differences make words // distinct, so for our purposes we need these to be considered. // The Unicode folks do not think the collation algorithm should be // changed. To work around this, we would like to tailor the ICU searcher, // but we can't get that to work yet. So instead, we check for cases where // these differences occur, and skip those matches. // We refer to the above technique as the "kana workaround". The next few // functions are helper functinos for the kana workaround. bool isKanaLetter(UChar character) { // Hiragana letters. if (character >= 0x3041 && character <= 0x3096) return true; // Katakana letters. if (character >= 0x30A1 && character <= 0x30FA) return true; if (character >= 0x31F0 && character <= 0x31FF) return true; // Halfwidth katakana letters. if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70) return true; return false; } bool isSmallKanaLetter(UChar character) { ASSERT(isKanaLetter(character)); switch (character) { case 0x3041: // HIRAGANA LETTER SMALL A case 0x3043: // HIRAGANA LETTER SMALL I case 0x3045: // HIRAGANA LETTER SMALL U case 0x3047: // HIRAGANA LETTER SMALL E case 0x3049: // HIRAGANA LETTER SMALL O case 0x3063: // HIRAGANA LETTER SMALL TU case 0x3083: // HIRAGANA LETTER SMALL YA case 0x3085: // HIRAGANA LETTER SMALL YU case 0x3087: // HIRAGANA LETTER SMALL YO case 0x308E: // HIRAGANA LETTER SMALL WA case 0x3095: // HIRAGANA LETTER SMALL KA case 0x3096: // HIRAGANA LETTER SMALL KE case 0x30A1: // KATAKANA LETTER SMALL A case 0x30A3: // KATAKANA LETTER SMALL I case 0x30A5: // KATAKANA LETTER SMALL U case 0x30A7: // KATAKANA LETTER SMALL E case 0x30A9: // KATAKANA LETTER SMALL O case 0x30C3: // KATAKANA LETTER SMALL TU case 0x30E3: // KATAKANA LETTER SMALL YA case 0x30E5: // KATAKANA LETTER SMALL YU case 0x30E7: // KATAKANA LETTER SMALL YO case 0x30EE: // KATAKANA LETTER SMALL WA case 0x30F5: // KATAKANA LETTER SMALL KA case 0x30F6: // KATAKANA LETTER SMALL KE case 0x31F0: // KATAKANA LETTER SMALL KU case 0x31F1: // KATAKANA LETTER SMALL SI case 0x31F2: // KATAKANA LETTER SMALL SU case 0x31F3: // KATAKANA LETTER SMALL TO case 0x31F4: // KATAKANA LETTER SMALL NU case 0x31F5: // KATAKANA LETTER SMALL HA case 0x31F6: // KATAKANA LETTER SMALL HI case 0x31F7: // KATAKANA LETTER SMALL HU case 0x31F8: // KATAKANA LETTER SMALL HE case 0x31F9: // KATAKANA LETTER SMALL HO case 0x31FA: // KATAKANA LETTER SMALL MU case 0x31FB: // KATAKANA LETTER SMALL RA case 0x31FC: // KATAKANA LETTER SMALL RI case 0x31FD: // KATAKANA LETTER SMALL RU case 0x31FE: // KATAKANA LETTER SMALL RE case 0x31FF: // KATAKANA LETTER SMALL RO case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU return true; } return false; } static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character) { ASSERT(isKanaLetter(character)); switch (character) { case 0x304C: // HIRAGANA LETTER GA case 0x304E: // HIRAGANA LETTER GI case 0x3050: // HIRAGANA LETTER GU case 0x3052: // HIRAGANA LETTER GE case 0x3054: // HIRAGANA LETTER GO case 0x3056: // HIRAGANA LETTER ZA case 0x3058: // HIRAGANA LETTER ZI case 0x305A: // HIRAGANA LETTER ZU case 0x305C: // HIRAGANA LETTER ZE case 0x305E: // HIRAGANA LETTER ZO case 0x3060: // HIRAGANA LETTER DA case 0x3062: // HIRAGANA LETTER DI case 0x3065: // HIRAGANA LETTER DU case 0x3067: // HIRAGANA LETTER DE case 0x3069: // HIRAGANA LETTER DO case 0x3070: // HIRAGANA LETTER BA case 0x3073: // HIRAGANA LETTER BI case 0x3076: // HIRAGANA LETTER BU case 0x3079: // HIRAGANA LETTER BE case 0x307C: // HIRAGANA LETTER BO case 0x3094: // HIRAGANA LETTER VU case 0x30AC: // KATAKANA LETTER GA case 0x30AE: // KATAKANA LETTER GI case 0x30B0: // KATAKANA LETTER GU case 0x30B2: // KATAKANA LETTER GE case 0x30B4: // KATAKANA LETTER GO case 0x30B6: // KATAKANA LETTER ZA case 0x30B8: // KATAKANA LETTER ZI case 0x30BA: // KATAKANA LETTER ZU case 0x30BC: // KATAKANA LETTER ZE case 0x30BE: // KATAKANA LETTER ZO case 0x30C0: // KATAKANA LETTER DA case 0x30C2: // KATAKANA LETTER DI case 0x30C5: // KATAKANA LETTER DU case 0x30C7: // KATAKANA LETTER DE case 0x30C9: // KATAKANA LETTER DO case 0x30D0: // KATAKANA LETTER BA case 0x30D3: // KATAKANA LETTER BI case 0x30D6: // KATAKANA LETTER BU case 0x30D9: // KATAKANA LETTER BE case 0x30DC: // KATAKANA LETTER BO case 0x30F4: // KATAKANA LETTER VU case 0x30F7: // KATAKANA LETTER VA case 0x30F8: // KATAKANA LETTER VI case 0x30F9: // KATAKANA LETTER VE case 0x30FA: // KATAKANA LETTER VO return VoicedSoundMark; case 0x3071: // HIRAGANA LETTER PA case 0x3074: // HIRAGANA LETTER PI case 0x3077: // HIRAGANA LETTER PU case 0x307A: // HIRAGANA LETTER PE case 0x307D: // HIRAGANA LETTER PO case 0x30D1: // KATAKANA LETTER PA case 0x30D4: // KATAKANA LETTER PI case 0x30D7: // KATAKANA LETTER PU case 0x30DA: // KATAKANA LETTER PE case 0x30DD: // KATAKANA LETTER PO return SemiVoicedSoundMark; } return NoVoicedSoundMark; } static inline bool isCombiningVoicedSoundMark(UChar character) { switch (character) { case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK return true; } return false; } bool containsKanaLetters(const String& pattern) { const unsigned length = pattern.length(); for (unsigned i = 0; i < length; ++i) { if (isKanaLetter(pattern[i])) return true; } return false; } void normalizeCharactersIntoNFCForm(const UChar* characters, unsigned length, Vector& buffer) { ASSERT(length); buffer.resize(length); UErrorCode status = U_ZERO_ERROR; size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status); ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR); ASSERT(bufferSize); buffer.resize(bufferSize); if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) return; status = U_ZERO_ERROR; unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status); ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); } // This function returns kNotFound if |first| and |second| contain different Kana letters. // If |first| and |second| contain the same Kana letter // then function returns offset in characters from |first|. // Pointers to both strings increase simultaneously so so it is possible to use one offset value. static inline size_t compareKanaLetterAndComposedVoicedSoundMarks(const UChar* first, const UChar* firstEnd, const UChar* second, const UChar* secondEnd) { const UChar* start = first; // Check for differences in the kana letter character itself. if (isSmallKanaLetter(*first) != isSmallKanaLetter(*second)) return kNotFound; if (composedVoicedSoundMark(*first) != composedVoicedSoundMark(*second)) return kNotFound; ++first; ++second; // Check for differences in combining voiced sound marks found after the letter. while (true) { const bool secondIsNotSoundMark = second == secondEnd || !isCombiningVoicedSoundMark(*second); if (first == firstEnd || !isCombiningVoicedSoundMark(*first)) { return secondIsNotSoundMark ? first - start : kNotFound; } if (secondIsNotSoundMark) return kNotFound; if (*first != *second) return kNotFound; ++first; ++second; } } bool checkOnlyKanaLettersInStrings(const UChar* firstData, unsigned firstLength, const UChar* secondData, unsigned secondLength) { const UChar* a = firstData; const UChar* aEnd = firstData + firstLength; const UChar* b = secondData; const UChar* bEnd = secondData + secondLength; while (true) { // Skip runs of non-kana-letter characters. This is necessary so we can // correctly handle strings where the |firstData| and |secondData| have different-length // runs of characters that match, while still double checking the correctness // of matches of kana letters with other kana letters. while (a != aEnd && !isKanaLetter(*a)) ++a; while (b != bEnd && !isKanaLetter(*b)) ++b; // If we reached the end of either the target or the match, we should have // reached the end of both; both should have the same number of kana letters. if (a == aEnd || b == bEnd) { return a == aEnd && b == bEnd; } // Check that single Kana letters in |a| and |b| are the same. const size_t offset = compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd); if (offset == kNotFound) return false; // Update values of |a| and |b| after comparing. a += offset; b += offset; } } bool checkKanaStringsEqual(const UChar* firstData, unsigned firstLength, const UChar* secondData, unsigned secondLength) { const UChar* a = firstData; const UChar* aEnd = firstData + firstLength; const UChar* b = secondData; const UChar* bEnd = secondData + secondLength; while (true) { // Check for non-kana-letter characters. while (a != aEnd && !isKanaLetter(*a) && b != bEnd && !isKanaLetter(*b)) { if (*a++ != *b++) return false; } // If we reached the end of either the target or the match, we should have // reached the end of both; both should have the same number of kana letters. if (a == aEnd || b == bEnd) { return a == aEnd && b == bEnd; } if (isKanaLetter(*a) != isKanaLetter(*b)) return false; // Check that single Kana letters in |a| and |b| are the same. const size_t offset = compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd); if (offset == kNotFound) return false; // Update values of |a| and |b| after comparing. a += offset; b += offset; } } }