module CodeRay
module Scanners
# HTML Scanner
#
# Alias: +xhtml+
#
# See also: Scanners::XML
class HTML < Scanner
register_for :html
# Token kinds produced by scan_tokens that are listed as "not LOC".
# NOTE(review): presumably read by the base Scanner when counting lines of
# code -- the consumer is not visible in this file, confirm there.
KINDS_NOT_LOC = [
:comment, :doctype, :preprocessor,
:tag, :attribute_name, :operator,
:attribute_value, :delimiter, :content,
:plain, :entity, :error,
] # :nodoc:
# An attribute name: word characters, dots, colons and hyphens. Also used by
# scan_tokens to recognise unquoted attribute values.
ATTR_NAME = /[\w.:-]+/ # :nodoc:
# The end of an opening tag: ">" or the self-closing "/>".
TAG_END = /\/?>/ # :nodoc:
# A single hexadecimal digit; building block for ENTITY.
HEX = /[0-9a-fA-F]/ # :nodoc:
# A complete character reference, including the terminating semicolon:
# named (&amp;), decimal (&#38;) or hexadecimal (&#x26;). The pattern is not
# case-insensitive, so the hexadecimal marker must be a lowercase "x".
ENTITY = /
&
(?:
\w+
|
\#
(?:
\d+
|
x#{HEX}+
)
)
;
/ox # :nodoc:
# Maps the quote character that opened an attribute value to the pattern for
# a run of plain text inside it: everything except "&", that quote character,
# ">" and a newline. Those four are handled separately by scan_tokens.
PLAIN_STRING_CONTENT = {
"'" => /[^&'>\n]+/,
'"' => /[^&">\n]+/,
} # :nodoc:
# Rewinds the scanner and returns its HTML state machine to the start.
#
# Clears both pieces of state that scan_tokens may have stored when it was
# called with :keep_state, i.e. the same two variables that #setup
# initializes, so no attribute-string pattern from a previous run lingers.
def reset # :nodoc:
  # FIXME: why not overwrite reset_instance?
  super
  @plain_string_content = nil
  # Assigned last so the method still evaluates to :initial, as before.
  @state = :initial
end
protected
# Puts the state machine into its starting configuration: no attribute-value
# string is open (so there is no content pattern) and the scanner is in the
# :initial state. Evaluates to nil.
def setup
  @plain_string_content = nil
  @state = :initial
  nil
end
# Tokenizes +code+ as JavaScript and appends the tokens to +encoder+.
#
# Does nothing (and returns nil) when +code+ is nil or empty. Otherwise the
# JavaScript scanner is created on first use, cached in
# @java_script_scanner, and the result of its #tokenize call is returned.
# This method does not open an :inline group itself; the call sites in
# scan_tokens wrap it in begin_group/end_group.
def scan_java_script encoder, code
  return if !code || code.empty?

  @java_script_scanner ||= Scanners::JavaScript.new('', :keep_tokens => true)
  @java_script_scanner.tokenize(code, :tokens => encoder)
end
# Tokenizes the rest of the input as HTML, sending every token to +encoder+.
#
# The scanner is a state machine over these states:
#
# :initial::                between tags (text, comments, doctype, tags)
# :attribute::              inside an opening tag, expecting an attribute
#                           name or the end of the tag
# :attribute_equal::        after an attribute name, expecting "="
# :attribute_value::        after "=", expecting a quoted or bare value
# :attribute_value_string:: inside a quoted attribute value
#
# Whitespace is emitted as :space in every state. The content of a
# <script> element and the value of an onclick attribute are handed to
# #scan_java_script inside an :inline group.
#
# encoder:: receives text_token / begin_group / end_group calls
# options:: when options[:keep_state] is truthy, the final state and string
#           content pattern are stored in instance variables so a later call
#           can continue; otherwise a still-open :string group is closed.
#
# Returns +encoder+.
def scan_tokens encoder, options
  state = @state
  plain_string_content = @plain_string_content
  in_tag = in_attribute = nil

  until eos?
    if match = scan(/\s+/m)
      encoder.text_token match, :space
    else
      case state
      when :initial
        case in_tag
        when 'script'
          # Script bodies may be wrapped in an HTML comment. Groups: 1 is the
          # comment opener, 2 and 3 are the code and the closer when the
          # comment is terminated, 4 is everything left when it is not.
          if scan(/(\s*<!--)(?:(.*?)(-->)|(.*))/m)
            code = self[2] || self[4]
            closing = self[3]
            encoder.text_token self[1], :comment
          else
            # Stop before the closing tag (and the newline/indent leading up
            # to it), or at the end of the input.
            code = scan_until(/(?=(?:\n\s*)?<\/script>)|\z/)
            closing = false
          end
          unless code.empty?
            encoder.begin_group :inline
            scan_java_script encoder, code
            encoder.end_group :inline
          end
          encoder.text_token closing, :comment if closing
        end
        next if eos?
        # Each "(?:.*?X|.*)" form accepts an unterminated construct by
        # swallowing the rest of the input.
        if match = scan(/<!--(?:.*?-->|.*)/m)
          encoder.text_token match, :comment
        elsif match = scan(/<!DOCTYPE(?:.*?>|.*)/m)
          encoder.text_token match, :doctype
        elsif match = scan(/<\?xml(?:.*?\?>|.*)/m)
          encoder.text_token match, :preprocessor
        elsif match = scan(/<\?(?:.*?\?>|.*)|<%(?:.*?%>|.*)/m)
          encoder.text_token match, :comment
        elsif match = scan(/<\/[-\w.:]*>?/m)
          encoder.text_token match, :tag
          in_tag = nil
        elsif match = scan(/<(?:(script)|[-\w.:]+)(>)?/m)
          encoder.text_token match, :tag
          # Group 1 is set only for <script>; group 2 only when the tag is
          # closed right away, in which case there are no attributes to scan.
          in_tag = self[1]
          state = :attribute unless self[2]
        elsif match = scan(/[^<>&]+/)
          encoder.text_token match, :plain
        elsif match = scan(/#{ENTITY}/ox)
          encoder.text_token match, :entity
        elsif match = scan(/[<>&]/)
          encoder.text_token match, :error
        else
          raise_inspect '[BUG] else-case reached with state %p' % [state], encoder
        end
      when :attribute
        if match = scan(/#{TAG_END}/o)
          encoder.text_token match, :tag
          in_attribute = nil
          state = :initial
        elsif match = scan(/#{ATTR_NAME}/o)
          if match.downcase == 'onclick'
            in_attribute = 'script'
          end
          encoder.text_token match, :attribute_name
          state = :attribute_equal
        else
          encoder.text_token getch, :error
        end
      when :attribute_equal
        if match = scan(/=/)
          encoder.text_token match, :operator
          state = :attribute_value
        elsif check(/#{ATTR_NAME}/o) || check(/#{TAG_END}/o)
          # Attribute without a value. Only look ahead here: consuming the
          # text would drop it from the output, because no token is emitted
          # before `next`. The :attribute state scans and emits it.
          state = :attribute
          next
        else
          encoder.text_token getch, :error
          state = :attribute
        end
      when :attribute_value
        if match = scan(/#{ATTR_NAME}/o)
          encoder.text_token match, :attribute_value
          state = :attribute
        elsif match = scan(/["']/)
          if in_attribute == 'script'
            encoder.begin_group :inline
            encoder.text_token match, :inline_delimiter
            if scan(/javascript:\s*/)
              encoder.text_token matched, :comment
            end
            # The code runs up to the matching quote or the end of the input.
            code = scan_until(match == '"' ? /(?="|\z)/ : /(?='|\z)/)
            scan_java_script encoder, code
            match = scan(/["']/)
            encoder.text_token match, :inline_delimiter if match
            encoder.end_group :inline
            state = :attribute
            in_attribute = nil
          else
            encoder.begin_group :string
            state = :attribute_value_string
            plain_string_content = PLAIN_STRING_CONTENT[match]
            encoder.text_token match, :delimiter
          end
        elsif match = scan(/#{TAG_END}/o)
          encoder.text_token match, :tag
          state = :initial
        else
          encoder.text_token getch, :error
        end
      when :attribute_value_string
        if match = scan(plain_string_content)
          encoder.text_token match, :content
        elsif match = scan(/['"]/)
          encoder.text_token match, :delimiter
          encoder.end_group :string
          state = :attribute
        elsif match = scan(/#{ENTITY}/ox)
          encoder.text_token match, :entity
        elsif match = scan(/&/)
          # A bare ampersand that does not start an entity is plain content.
          encoder.text_token match, :content
        elsif match = scan(/[\n>]/)
          # The string was never closed: end the group and flag the character.
          encoder.end_group :string
          state = :initial
          encoder.text_token match, :error
        end
      else
        raise_inspect 'Unknown state: %p' % [state], encoder
      end
    end
  end

  if options[:keep_state]
    @state = state
    @plain_string_content = plain_string_content
  else
    if state == :attribute_value_string
      encoder.end_group :string
    end
  end

  encoder
end
end
end
end