/*
 * Copyright (C) 2011 Adam Barth. All Rights Reserved.
 * Copyright (C) 2011 Daniel Bates (dbates@intudata.com).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "XSSAuditor.h"

#include "ContentSecurityPolicy.h"
#include "DecodeEscapeSequences.h"
#include "Document.h"
#include "DocumentLoader.h"
#include "FormData.h"
#include "Frame.h"
#include "HTMLDocumentParser.h"
#include "HTMLNames.h"
#include "HTMLParamElement.h"
#include "HTMLParserIdioms.h"
#include "SVGNames.h"
#include "Settings.h"
#include "TextResourceDecoder.h"
#include "XLinkNames.h"
// NOTE(review): the targets of the next three directives are not visible in this copy of the
// file (presumably angle-bracketed system headers) -- restore them before building.
#include
#include
#include

namespace WebCore {

using namespace HTMLNames;

// Returns true for code units that are dropped when a string is canonicalized for comparison:
// backslash, the digit '0', NUL, forward slash, and every code unit >= 127.
static bool isNonCanonicalCharacter(UChar c)
{
    // Note, we don't strip backslashes the way PHP stripslashes() does (which, among other things,
    // turns "\\0" into the \0 character). Instead both backslashes and zeros are removed, since
    // removing only the backslash from "\\0" would leave "0". The downside is that legitimate
    // zeros disappear as well.
    //
    // Forward slashes are removed too, because some servers collapse successive path components,
    // e.g. a//b becomes a/b.
    //
    // For instance: new String("http://localhost:8000") => new String("http:localhost:8").
    if (c >= 127)
        return true;

    switch (c) {
    case '\\':
    case '0':
    case '\0':
    case '/':
        return true;
    default:
        return false;
    }
}

// Returns true for the quote and angle-bracket characters: ' " < >.
static bool isRequiredForInjection(UChar c)
{
    switch (c) {
    case '\'':
    case '"':
    case '<':
    case '>':
        return true;
    default:
        return false;
    }
}

// Returns true for the characters treated as terminators: & / " ' < > and comma.
static bool isTerminatingCharacter(UChar c)
{
    switch (c) {
    case '&':
    case '/':
    case '"':
    case '\'':
    case '<':
    case '>':
    case ',':
        return true;
    default:
        return false;
    }
}

// Returns true for a single or double quote.
static bool isHTMLQuote(UChar c)
{
    return c == '\'' || c == '"';
}

// Returns true for the line terminators of ecma-262 section 7.3:
// LF, CR, U+2028 (line separator) and U+2029 (paragraph separator).
static bool isJSNewline(UChar c)
{
    switch (c) {
    case '\n':
    case '\r':
    case 0x2028:
    case 0x2029:
        return true;
    default:
        return false;
    }
}

// Returns true when the four characters "<!--" begin at |start|.
// The length check requires index start + 3 to be inside the string.
static bool startsHTMLCommentAt(const String& string, size_t start)
{
    if (start + 3 >= string.length())
        return false;

    return string[start] == '<'
        && string[start + 1] == '!'
        && string[start + 2] == '-'
        && string[start + 3] == '-';
}

// Returns true when "//" begins at |start|.
static bool startsSingleLineCommentAt(const String& string, size_t start)
{
    if (start + 1 >= string.length())
        return false;

    return string[start] == '/' && string[start + 1] == '/';
}

// Returns true when "/*" begins at |start|.
static bool startsMultiLineCommentAt(const String& string, size_t start)
{
    if (start + 1 >= string.length())
        return false;

    return string[start] == '/' && string[start + 1] == '*';
}

// Returns true when '<' followed by "script" (letters compared after ASCII lower-casing)
// begins at |start|. Only these seven characters are examined.
static bool startsOpeningScriptTagAt(const String& string, size_t start)
{
    static const char tagName[] = "script";
    const size_t tagNameLength = 6;

    if (start + tagNameLength >= string.length())
        return false;
    if (string[start] != '<')
        return false;

    for (size_t offset = 0; offset < tagNameLength; ++offset) {
        if (WTF::toASCIILowerUnchecked(string[start + 1 + offset]) != tagName[offset])
            return false;
    }
    return true;
}

// If other files need this, we should move this to HTMLParserIdioms.h
// NOTE(review): the template parameter list and Vector's template arguments are not visible in
// this copy of the file; the declaration is kept exactly as found -- restore before building.
// Compares |vector| against the local name of |qname| via equalIgnoringNullity().
template bool threadSafeMatch(const Vector& vector, const QualifiedName& qname)
{
    return equalIgnoringNullity(vector, qname.localName().impl());
}

// Returns true when the token's name matches the local name of |name|.
static bool hasName(const HTMLToken& token, const QualifiedName& name)
{
    return threadSafeMatch(token.name(), name);
}

// Searches the token's attributes, in order, for one whose name matches |name|; names in the
// XLink namespace are matched with an "xlink:" prefix. On success stores the index of the first
// match in |indexOfMatchingAttribute| and returns true; otherwise returns false and leaves the
// out-parameter untouched.
static bool findAttributeWithName(const HTMLToken& token, const QualifiedName& name, size_t& indexOfMatchingAttribute)
{
    // Notice that we're careful not to ref the StringImpl here because we might be on a background thread.
    const String& attrName = name.namespaceURI() == XLinkNames::xlinkNamespaceURI ? "xlink:" + name.localName().string() : name.localName().string();

    for (size_t attributeIndex = 0; attributeIndex < token.attributes().size(); ++attributeIndex) {
        if (!equalIgnoringNullity(token.attributes().at(attributeIndex).name, attrName))
            continue;
        indexOfMatchingAttribute = attributeIndex;
        return true;
    }

    return false;
}

// Returns true when |name| is at least five code units long and begins with "on".
// NOTE(review): Vector's template arguments are not visible in this copy of the file; the
// parameter type is kept exactly as found.
static bool isNameOfInlineEventHandler(const Vector& name)
{
    const size_t lengthOfShortestInlineEventHandlerName = 5; // To wit: oncut.

    return name.size() >= lengthOfShortestInlineEventHandlerName
        && name[0] == 'o'
        && name[1] == 'n';
}

// Returns true when |value|, after stripWhiteSpace(), equals "refresh" or "set-cookie",
// ignoring ASCII case.
static bool isDangerousHTTPEquiv(const String& value)
{
    String trimmedValue = value.stripWhiteSpace();

    if (equalLettersIgnoringASCIICase(trimmedValue, "refresh"))
        return true;
    return equalLettersIgnoringASCIICase(trimmedValue, "set-cookie");
}

// NOTE(review): as it appears here, this helper and decodeStandardURLEscapeSequences() invoke
// decodeEscapeSequences() the same way; presumably a template argument selecting the
// escape-sequence flavor is missing from this copy -- confirm against the original.
static inline String decode16BitUnicodeEscapeSequences(const String& string)
{
    // Note, the encoding is ignored since each %u-escape sequence represents a UTF-16 code unit.
    return decodeEscapeSequences(string, UTF8Encoding());
}

static inline String decodeStandardURLEscapeSequences(const String& string, const TextEncoding& encoding)
{
    // We use decodeEscapeSequences() instead of decodeURLEscapeSequences() (declared in URL.h) to
    // avoid platform-specific URL decoding differences (e.g. URLGoogle).
    return decodeEscapeSequences(string, encoding);
}

// Applies the standard-URL decoding followed by the 16-bit-Unicode decoding repeatedly, for as
// long as a pass makes the string shorter, then replaces every '+' with a space.
static String fullyDecodeString(const String& string, const TextEncoding& encoding)
{
    String decoded = string;

    for (;;) {
        const size_t lengthBeforePass = decoded.length();
        decoded = decode16BitUnicodeEscapeSequences(decodeStandardURLEscapeSequences(decoded, encoding));
        // A pass that did not shrink the string means decoding has reached a fixed point.
        if (decoded.length() >= lengthBeforePass)
            break;
    }

    decoded.replace('+', ' ');
    return decoded;
}

static void truncateForSrcLikeAttribute(String& decodedSnippet)
{
    // In HTTP URLs, characters following the first ?, #, or third slash may come from
    // the page itself and can be merely ignored by an attacker's server when a remote
    // script or script-like resource is requested. In DATA URLS, the payload starts at
    // the first comma, and the the first /*, //, or .
if (startsHTMLCommentAt(string, startPosition) || startsSingleLineCommentAt(string, startPosition)) { while (startPosition < endPosition && !isJSNewline(string[startPosition])) startPosition++; } else if (startsMultiLineCommentAt(string, startPosition)) { if (startPosition + 2 < endPosition && (foundPosition = string.find("*/", startPosition + 2)) != notFound) startPosition = foundPosition + 2; else startPosition = endPosition; } else break; } String result; while (startPosition < endPosition && !result.length()) { // Stop at next comment (using the same rules as above for SVG/XML vs HTML), when we encounter a comma, // when we hit an opening