/* * Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved. * Copyright (c) 2012 Google, inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DecodeEscapeSequences_h #define DecodeEscapeSequences_h #include "wtf/ASCIICType.h" #include "wtf/Assertions.h" #include "wtf/text/StringBuilder.h" #include "wtf/text/TextEncoding.h" namespace WebCore { // See . struct Unicode16BitEscapeSequence { enum { sequenceSize = 6 }; // e.g. %u26C4 static size_t findInString(const String& string, size_t startPosition) { return string.find("%u", startPosition); } static size_t findEndOfRun(const String& string, size_t startPosition, size_t endPosition) { size_t runEnd = startPosition; while (endPosition - runEnd >= sequenceSize && string[runEnd] == '%' && string[runEnd + 1] == 'u' && isASCIIHexDigit(string[runEnd + 2]) && isASCIIHexDigit(string[runEnd + 3]) && isASCIIHexDigit(string[runEnd + 4]) && isASCIIHexDigit(string[runEnd + 5])) { runEnd += sequenceSize; } return runEnd; } template static String decodeRun(const CharType* run, size_t runLength, const WTF::TextEncoding&) { // Each %u-escape sequence represents a UTF-16 code unit. // See . // For 16-bit escape sequences, we know that findEndOfRun() has given us a contiguous run of sequences // without any intervening characters, so decode the run without additional checks. size_t numberOfSequences = runLength / sequenceSize; StringBuilder builder; builder.reserveCapacity(numberOfSequences); while (numberOfSequences--) { UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | (toASCIIHexValue(run[3]) << 8) | (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]); builder.append(codeUnit); run += sequenceSize; } return builder.toString(); } }; struct URLEscapeSequence { enum { sequenceSize = 3 }; // e.g. %41 static size_t findInString(const String& string, size_t startPosition) { return string.find('%', startPosition); } static size_t findEndOfRun(const String& string, size_t startPosition, size_t endPosition) { // Make the simplifying assumption that supported encodings may have up to two unescaped characters // in the range 0x40 - 0x7F as the trailing bytes of their sequences which need to be passed into the // decoder as part of the run. In other words, we end the run at the first value outside of the // 0x40 - 0x7F range, after two values in this range, or at a %-sign that does not introduce a valid // escape sequence. size_t runEnd = startPosition; int numberOfTrailingCharacters = 0; while (runEnd < endPosition) { if (string[runEnd] == '%') { if (endPosition - runEnd >= sequenceSize && isASCIIHexDigit(string[runEnd + 1]) && isASCIIHexDigit(string[runEnd + 2])) { runEnd += sequenceSize; numberOfTrailingCharacters = 0; } else break; } else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && numberOfTrailingCharacters < 2) { runEnd += 1; numberOfTrailingCharacters += 1; } else break; } return runEnd; } template static String decodeRun(const CharType* run, size_t runLength, const WTF::TextEncoding& encoding) { // For URL escape sequences, we know that findEndOfRun() has given us a run where every %-sign introduces // a valid escape sequence, but there may be characters between the sequences. Vector buffer; buffer.resize(runLength); // Unescaping hex sequences only makes the length smaller. char* p = buffer.data(); const CharType* runEnd = run + runLength; while (run < runEnd) { if (run[0] == '%') { *p++ = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]); run += sequenceSize; } else { *p++ = run[0]; run += 1; } } ASSERT(buffer.size() >= static_cast(p - buffer.data())); // Prove buffer not overrun. return (encoding.isValid() ? encoding : UTF8Encoding()).decode(buffer.data(), p - buffer.data()); } }; template String decodeEscapeSequences(const String& string, const WTF::TextEncoding& encoding) { StringBuilder result; size_t length = string.length(); size_t decodedPosition = 0; size_t searchPosition = 0; size_t encodedRunPosition; while ((encodedRunPosition = EscapeSequence::findInString(string, searchPosition)) != kNotFound) { size_t encodedRunEnd = EscapeSequence::findEndOfRun(string, encodedRunPosition, length); searchPosition = encodedRunEnd; if (encodedRunEnd == encodedRunPosition) { ++searchPosition; continue; } String decoded = string.is8Bit() ? EscapeSequence::decodeRun(string.characters8() + encodedRunPosition, encodedRunEnd - encodedRunPosition, encoding) : EscapeSequence::decodeRun(string.characters16() + encodedRunPosition, encodedRunEnd - encodedRunPosition, encoding); if (decoded.isEmpty()) continue; result.append(string, decodedPosition, encodedRunPosition - decodedPosition); result.append(decoded); decodedPosition = encodedRunEnd; } result.append(string, decodedPosition, length - decodedPosition); return result.toString(); } } // namespace WebCore #endif // DecodeEscapeSequences_h