/* * Copyright (C) 2014-2015 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef Term_h #define Term_h #if ENABLE(CONTENT_EXTENSIONS) #include "ContentExtensionsDebugging.h" #include "NFA.h" #include #include #include #include #include #include namespace WebCore { namespace ContentExtensions { enum class AtomQuantifier : uint8_t { One, ZeroOrOne, ZeroOrMore, OneOrMore }; class Term { public: Term(); Term(char character, bool isCaseSensitive); enum UniversalTransitionTag { UniversalTransition }; explicit Term(UniversalTransitionTag); enum CharacterSetTermTag { CharacterSetTerm }; explicit Term(CharacterSetTermTag, bool isInverted); enum GroupTermTag { GroupTerm }; explicit Term(GroupTermTag); enum EndOfLineAssertionTermTag { EndOfLineAssertionTerm }; explicit Term(EndOfLineAssertionTermTag); Term(const Term& other); Term(Term&& other); enum EmptyValueTag { EmptyValue }; Term(EmptyValueTag); ~Term(); bool isValid() const; // CharacterSet terms only. void addCharacter(UChar character, bool isCaseSensitive); // Group terms only. void extendGroupSubpattern(const Term&); void quantify(const AtomQuantifier&); ImmutableCharNFANodeBuilder generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const; void generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const; bool isEndOfLineAssertion() const; bool matchesAtLeastOneCharacter() const; // Matches any string, the empty string included. // This is very conservative. Patterns matching any string can return false here. bool isKnownToMatchAnyString() const; // Return true if the term can only match input of a single fixed length. bool hasFixedLength() const; Term& operator=(const Term& other); Term& operator=(Term&& other); bool operator==(const Term& other) const; unsigned hash() const; bool isEmptyValue() const; #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING String toString() const; #endif #if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING size_t memoryUsed() const; #endif private: // This is exact for character sets but conservative for groups. // The return value can be false for a group equivalent to a universal transition. bool isUniversalTransition() const; void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const; void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const; void destroy(); enum class TermType : uint8_t { Empty, CharacterSet, Group }; TermType m_termType { TermType::Empty }; AtomQuantifier m_quantifier { AtomQuantifier::One }; class CharacterSet { public: void set(UChar character) { RELEASE_ASSERT(character < 128); m_characters[character / 64] |= (uint64_t(1) << (character % 64)); } bool get(UChar character) const { RELEASE_ASSERT(character < 128); return m_characters[character / 64] & (uint64_t(1) << (character % 64)); } void invert() { ASSERT(!m_inverted); m_inverted = true; } bool inverted() const { return m_inverted; } unsigned bitCount() const { return WTF::bitCount(m_characters[0]) + WTF::bitCount(m_characters[1]); } bool operator==(const CharacterSet& other) const { return other.m_inverted == m_inverted && other.m_characters[0] == m_characters[0] && other.m_characters[1] == m_characters[1]; } unsigned hash() const { return WTF::pairIntHash(WTF::pairIntHash(WTF::intHash(m_characters[0]), WTF::intHash(m_characters[1])), m_inverted); } private: bool m_inverted { false }; uint64_t m_characters[2] { 0, 0 }; }; struct Group { Vector terms; bool operator==(const Group& other) const { return other.terms == terms; } unsigned hash() const { unsigned hash = 6421749; for (const Term& term : terms) { unsigned termHash = term.hash(); hash = (hash << 16) ^ ((termHash << 11) ^ hash); hash += hash >> 11; } return hash; } }; union AtomData { AtomData() : invalidTerm(0) { } ~AtomData() { } char invalidTerm; CharacterSet characterSet; Group group; } m_atomData; }; #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING inline String quantifierToString(AtomQuantifier quantifier) { switch (quantifier) { case AtomQuantifier::One: return ""; case AtomQuantifier::ZeroOrOne: return "?"; case AtomQuantifier::ZeroOrMore: return "*"; case AtomQuantifier::OneOrMore: return "+"; } } inline String Term::toString() const { switch (m_termType) { case TermType::Empty: return "(Empty)"; case TermType::CharacterSet: { StringBuilder builder; builder.append('['); for (UChar c = 0; c < 128; c++) { if (m_atomData.characterSet.get(c)) { if (isASCIIPrintable(c) && !isASCIISpace(c)) builder.append(c); else { builder.append('\\'); builder.append('u'); builder.appendNumber(c); } } } builder.append(']'); builder.append(quantifierToString(m_quantifier)); return builder.toString(); } case TermType::Group: { StringBuilder builder; builder.append('('); for (const Term& term : m_atomData.group.terms) builder.append(term.toString()); builder.append(')'); builder.append(quantifierToString(m_quantifier)); return builder.toString(); } } } #endif inline Term::Term() { } inline Term::Term(char character, bool isCaseSensitive) : m_termType(TermType::CharacterSet) { new (NotNull, &m_atomData.characterSet) CharacterSet(); addCharacter(character, isCaseSensitive); } inline Term::Term(UniversalTransitionTag) : m_termType(TermType::CharacterSet) { new (NotNull, &m_atomData.characterSet) CharacterSet(); for (UChar i = 1; i < 128; ++i) m_atomData.characterSet.set(i); } inline Term::Term(CharacterSetTermTag, bool isInverted) : m_termType(TermType::CharacterSet) { new (NotNull, &m_atomData.characterSet) CharacterSet(); if (isInverted) m_atomData.characterSet.invert(); } inline Term::Term(GroupTermTag) : m_termType(TermType::Group) { new (NotNull, &m_atomData.group) Group(); } inline Term::Term(EndOfLineAssertionTermTag) : Term(CharacterSetTerm, false) { m_atomData.characterSet.set(0); } inline Term::Term(const Term& other) : m_termType(other.m_termType) , m_quantifier(other.m_quantifier) { switch (m_termType) { case TermType::Empty: break; case TermType::CharacterSet: new (NotNull, &m_atomData.characterSet) CharacterSet(other.m_atomData.characterSet); break; case TermType::Group: new (NotNull, &m_atomData.group) Group(other.m_atomData.group); break; } } inline Term::Term(Term&& other) : m_termType(WTFMove(other.m_termType)) , m_quantifier(WTFMove(other.m_quantifier)) { switch (m_termType) { case TermType::Empty: break; case TermType::CharacterSet: new (NotNull, &m_atomData.characterSet) CharacterSet(WTFMove(other.m_atomData.characterSet)); break; case TermType::Group: new (NotNull, &m_atomData.group) Group(WTFMove(other.m_atomData.group)); break; } other.destroy(); } inline Term::Term(EmptyValueTag) : m_termType(TermType::Empty) { } inline Term::~Term() { destroy(); } inline bool Term::isValid() const { return m_termType != TermType::Empty; } inline void Term::addCharacter(UChar character, bool isCaseSensitive) { ASSERT(isASCII(character)); ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::CharacterSet); if (m_termType != TermType::CharacterSet) return; if (isCaseSensitive || !isASCIIAlpha(character)) m_atomData.characterSet.set(character); else { m_atomData.characterSet.set(toASCIIUpper(character)); m_atomData.characterSet.set(toASCIILower(character)); } } inline void Term::extendGroupSubpattern(const Term& term) { ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::Group); if (m_termType != TermType::Group) return; m_atomData.group.terms.append(term); } inline void Term::quantify(const AtomQuantifier& quantifier) { ASSERT_WITH_MESSAGE(m_quantifier == AtomQuantifier::One, "Transition to quantified term should only happen once."); m_quantifier = quantifier; } inline ImmutableCharNFANodeBuilder Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const { ImmutableCharNFANodeBuilder newEnd(nfa); generateGraph(nfa, source, newEnd.nodeId()); newEnd.setActions(finalActions.begin(), finalActions.end()); return newEnd; } inline void Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const { ASSERT(isValid()); switch (m_quantifier) { case AtomQuantifier::One: { generateSubgraphForAtom(nfa, source, destination); break; } case AtomQuantifier::ZeroOrOne: { generateSubgraphForAtom(nfa, source, destination); source.addEpsilonTransition(destination); break; } case AtomQuantifier::ZeroOrMore: { ImmutableCharNFANodeBuilder repeatStart(nfa); source.addEpsilonTransition(repeatStart); ImmutableCharNFANodeBuilder repeatEnd(nfa); generateSubgraphForAtom(nfa, repeatStart, repeatEnd); repeatEnd.addEpsilonTransition(repeatStart); repeatEnd.addEpsilonTransition(destination); source.addEpsilonTransition(destination); break; } case AtomQuantifier::OneOrMore: { ImmutableCharNFANodeBuilder repeatStart(nfa); source.addEpsilonTransition(repeatStart); ImmutableCharNFANodeBuilder repeatEnd(nfa); generateSubgraphForAtom(nfa, repeatStart, repeatEnd); repeatEnd.addEpsilonTransition(repeatStart); repeatEnd.addEpsilonTransition(destination); break; } } } inline bool Term::isEndOfLineAssertion() const { return m_termType == TermType::CharacterSet && m_atomData.characterSet.bitCount() == 1 && m_atomData.characterSet.get(0); } inline bool Term::matchesAtLeastOneCharacter() const { ASSERT(isValid()); if (m_quantifier == AtomQuantifier::ZeroOrOne || m_quantifier == AtomQuantifier::ZeroOrMore) return false; if (isEndOfLineAssertion()) return false; if (m_termType == TermType::Group) { for (const Term& term : m_atomData.group.terms) { if (term.matchesAtLeastOneCharacter()) return true; } return false; } return true; } inline bool Term::isKnownToMatchAnyString() const { ASSERT(isValid()); switch (m_termType) { case TermType::Empty: ASSERT_NOT_REACHED(); break; case TermType::CharacterSet: // ".*" is the only simple term matching any string. return isUniversalTransition() && m_quantifier == AtomQuantifier::ZeroOrMore; break; case TermType::Group: { // There are infinitely many ways to match anything with groups, we just handle simple cases if (m_atomData.group.terms.size() != 1) return false; const Term& firstTermInGroup = m_atomData.group.terms.first(); // -(.*) with any quantifier. if (firstTermInGroup.isKnownToMatchAnyString()) return true; if (firstTermInGroup.isUniversalTransition()) { // -(.)*, (.+)*, (.?)* etc. if (m_quantifier == AtomQuantifier::ZeroOrMore) return true; // -(.+)?. if (m_quantifier == AtomQuantifier::ZeroOrOne && firstTermInGroup.m_quantifier == AtomQuantifier::OneOrMore) return true; // -(.?)+. if (m_quantifier == AtomQuantifier::OneOrMore && firstTermInGroup.m_quantifier == AtomQuantifier::ZeroOrOne) return true; } break; } } return false; } inline bool Term::hasFixedLength() const { ASSERT(isValid()); switch (m_termType) { case TermType::Empty: ASSERT_NOT_REACHED(); break; case TermType::CharacterSet: return m_quantifier == AtomQuantifier::One; case TermType::Group: { if (m_quantifier != AtomQuantifier::One) return false; for (const Term& term : m_atomData.group.terms) { if (!term.hasFixedLength()) return false; } return true; } } return false; } inline Term& Term::operator=(const Term& other) { destroy(); new (NotNull, this) Term(other); return *this; } inline Term& Term::operator=(Term&& other) { destroy(); new (NotNull, this) Term(WTFMove(other)); return *this; } inline bool Term::operator==(const Term& other) const { if (other.m_termType != m_termType || other.m_quantifier != m_quantifier) return false; switch (m_termType) { case TermType::Empty: return true; case TermType::CharacterSet: return m_atomData.characterSet == other.m_atomData.characterSet; case TermType::Group: return m_atomData.group == other.m_atomData.group; } ASSERT_NOT_REACHED(); return false; } inline unsigned Term::hash() const { unsigned primary = static_cast(m_termType) << 16 | static_cast(m_quantifier); unsigned secondary = 0; switch (m_termType) { case TermType::Empty: secondary = 52184393; break; case TermType::CharacterSet: secondary = m_atomData.characterSet.hash(); break; case TermType::Group: secondary = m_atomData.group.hash(); break; } return WTF::pairIntHash(primary, secondary); } inline bool Term::isEmptyValue() const { return m_termType == TermType::Empty; } inline bool Term::isUniversalTransition() const { ASSERT(isValid()); switch (m_termType) { case TermType::Empty: ASSERT_NOT_REACHED(); break; case TermType::CharacterSet: return (m_atomData.characterSet.inverted() && !m_atomData.characterSet.bitCount()) || (!m_atomData.characterSet.inverted() && m_atomData.characterSet.bitCount() == 127 && !m_atomData.characterSet.get(0)); case TermType::Group: return m_atomData.group.terms.size() == 1 && m_atomData.group.terms.first().isUniversalTransition(); } return false; } inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const { generateSubgraphForAtom(nfa, source, destination.nodeId()); } inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const { switch (m_termType) { case TermType::Empty: ASSERT_NOT_REACHED(); source.addEpsilonTransition(destination); break; case TermType::CharacterSet: { if (!m_atomData.characterSet.inverted()) { UChar i = 0; while (true) { while (i < 128 && !m_atomData.characterSet.get(i)) ++i; if (i == 128) break; UChar start = i; ++i; while (i < 128 && m_atomData.characterSet.get(i)) ++i; source.addTransition(start, i - 1, destination); } } else { UChar i = 1; while (true) { while (i < 128 && m_atomData.characterSet.get(i)) ++i; if (i == 128) break; UChar start = i; ++i; while (i < 128 && !m_atomData.characterSet.get(i)) ++i; source.addTransition(start, i - 1, destination); } } break; } case TermType::Group: { if (m_atomData.group.terms.isEmpty()) { // FIXME: any kind of empty term could be avoided in the parser. This case should turned into an assertion. source.addEpsilonTransition(destination); return; } if (m_atomData.group.terms.size() == 1) { m_atomData.group.terms.first().generateGraph(nfa, source, destination); return; } ImmutableCharNFANodeBuilder lastTarget = m_atomData.group.terms.first().generateGraph(nfa, source, ActionList()); for (unsigned i = 1; i < m_atomData.group.terms.size() - 1; ++i) { const Term& currentTerm = m_atomData.group.terms[i]; ImmutableCharNFANodeBuilder newNode = currentTerm.generateGraph(nfa, lastTarget, ActionList()); lastTarget = WTFMove(newNode); } const Term& lastTerm = m_atomData.group.terms.last(); lastTerm.generateGraph(nfa, lastTarget, destination); break; } } } inline void Term::destroy() { switch (m_termType) { case TermType::Empty: break; case TermType::CharacterSet: m_atomData.characterSet.~CharacterSet(); break; case TermType::Group: m_atomData.group.~Group(); break; } m_termType = TermType::Empty; } #if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING inline size_t Term::memoryUsed() const { size_t extraMemory = 0; if (m_termType == TermType::Group) { for (const Term& term : m_atomData.group.terms) extraMemory += term.memoryUsed(); } return sizeof(Term) + extraMemory; } #endif } // namespace ContentExtensions } // namespace WebCore #endif // ENABLE(CONTENT_EXTENSIONS) #endif // Term_h