module CodeRay
module Scanners
# HTML Scanner
#
# Alias: +xhtml+
#
# See also: Scanners::XML
class HTML < Scanner
register_for :html
# Token kinds listed for this scanner under the name "not LOC".
# NOTE(review): presumably these are the kinds that are ignored when lines
# of code are counted -- confirm against the code that reads this constant.
# Frozen so that the shared list cannot be modified by accident.
KINDS_NOT_LOC = [
  :comment, :doctype, :preprocessor,
  :tag, :attribute_name, :operator,
  :attribute_value, :delimiter, :content,
  :plain, :entity, :error,
].freeze # :nodoc:

# A run of word characters, dots, colons and dashes. Used for attribute
# names and for unquoted attribute values.
ATTR_NAME = /[\w.:-]+/ # :nodoc:

# The end of a tag: ">" optionally preceded by "/".
TAG_END = /\/?>/ # :nodoc:

# A single hexadecimal digit.
HEX = /[0-9a-fA-F]/ # :nodoc:

# A character reference terminated by ";": named (&amp;), decimal (&#38;)
# or hexadecimal with a lowercase "x" (&#x26;).
# Whitespace inside the pattern is ignored because of the x flag.
ENTITY = /
  &
  (?:
    \w+
  |
    \#
    (?:
      \d+
    |
      x#{HEX}+
    )
  )
  ;
/ox # :nodoc:

# Maps an opening quote character to the pattern for a run of plain string
# content inside that kind of string. The run stops at "&", at the same
# quote character, at ">" and at a newline.
PLAIN_STRING_CONTENT = {
  "'" => /[^&'>\n]+/,
  '"' => /[^&">\n]+/,
}.freeze # :nodoc:
# Delegates to the inherited +reset+ and then puts the HTML state machine
# back into the +:initial+ state, so a state stored by an earlier scan with
# the +:keep_state+ option is not carried over.
# NOTE(review): @plain_string_content is not cleared here; it is only
# consulted in the :attribute_value_string state, so this looks harmless --
# confirm.
def reset # :nodoc:
# FIXME: why not overwrite reset_instance?
super
@state = :initial
end
protected
# Prepares the instance state used by scan_tokens: no string-content
# pattern is selected yet and the state machine starts in +:initial+.
# Returns nil.
def setup
  @plain_string_content = nil
  @state = :initial
  nil
end
# Runs the HTML state machine over the remaining input and reports every
# token to +encoder+.
#
# Whitespace is emitted as +:space+ in every state. Otherwise the current
# state decides what is recognized:
#
# [:initial]                markup outside of a tag: comments, doctype,
#                           processing instructions, tags, entities, text
# [:attribute]              inside a start tag, expecting an attribute name
#                           or the end of the tag
# [:attribute_equal]        after an attribute name, expecting "="
# [:attribute_value]        after "=", expecting a quoted or a bare value
# [:attribute_value_string] inside a quoted value (an open +:string+ group)
#
# encoder:: receives the +text_token+, +begin_group+ and +end_group+ calls.
# options:: only +:keep_state+ is read here. When it is truthy, the final
#           state is stored in the instance so that a later call continues
#           from it; otherwise a +:string+ group that is still open at the
#           end of the input is closed.
#
# Returns +encoder+.
def scan_tokens encoder, options
  state = @state
  plain_string_content = @plain_string_content

  until eos?
    if match = scan(/\s+/m)
      encoder.text_token match, :space
    else
      case state
      when :initial
        # The comment and doctype patterns must never be able to match the
        # empty string: an empty match is truthy and does not advance the
        # scanner, so the loop would not terminate.
        if match = scan(/<!--.*?-->/m)
          encoder.text_token match, :comment
        elsif match = scan(/<!DOCTYPE.*?>/m)
          encoder.text_token match, :doctype
        elsif match = scan(/<\?xml.*?\?>/m)
          encoder.text_token match, :preprocessor
        elsif match = scan(/<\?.*?\?>|<%.*?%>/m)
          encoder.text_token match, :comment
        elsif match = scan(/<\/[-\w.:]*>/m)
          encoder.text_token match, :tag
        elsif match = scan(/<[-\w.:]+>?/m)
          encoder.text_token match, :tag
          # "<name>" is already complete; "<name" is followed by attributes.
          state = :attribute unless match[-1] == ?>
        elsif match = scan(/[^<>&]+/)
          encoder.text_token match, :plain
        elsif match = scan(ENTITY)
          encoder.text_token match, :entity
        elsif match = scan(/[<>&]/)
          # A stray markup character that started none of the above.
          encoder.text_token match, :error
        else
          raise_inspect '[BUG] else-case reached with state %p' % [state], encoder
        end

      when :attribute
        if match = scan(TAG_END)
          encoder.text_token match, :tag
          state = :initial
        elsif match = scan(ATTR_NAME)
          encoder.text_token match, :attribute_name
          state = :attribute_equal
        else
          # Consume one character so that the scanner always advances.
          encoder.text_token getch, :error
        end

      when :attribute_equal
        if match = scan(/=/)
          encoder.text_token match, :operator
          state = :attribute_value
        elsif match = scan(ATTR_NAME)
          # The previous attribute had no value; this is the next name.
          encoder.text_token match, :attribute_name
        elsif match = scan(TAG_END)
          encoder.text_token match, :tag
          state = :initial
        else
          encoder.text_token getch, :error
          state = :attribute
        end

      when :attribute_value
        if match = scan(ATTR_NAME)
          # Unquoted value.
          encoder.text_token match, :attribute_value
          state = :attribute
        elsif match = scan(/["']/)
          encoder.begin_group :string
          state = :attribute_value_string
          # Remember which pattern fits the quote that opened the string.
          plain_string_content = PLAIN_STRING_CONTENT[match]
          encoder.text_token match, :delimiter
        elsif match = scan(TAG_END)
          # The tag ends right after "="; the matched text has to be
          # captured here, otherwise a nil token would be emitted.
          encoder.text_token match, :tag
          state = :initial
        else
          encoder.text_token getch, :error
        end

      when :attribute_value_string
        if match = scan(plain_string_content)
          encoder.text_token match, :content
        elsif match = scan(/['"]/)
          encoder.text_token match, :delimiter
          encoder.end_group :string
          state = :attribute
        elsif match = scan(ENTITY)
          encoder.text_token match, :entity
        elsif match = scan(/&/)
          # An ampersand that does not start a complete entity.
          encoder.text_token match, :content
        elsif match = scan(/[\n>]/)
          # The string was not terminated: close the group and leave the tag.
          encoder.end_group :string
          state = :initial
          encoder.text_token match, :error
        end

      else
        raise_inspect 'Unknown state: %p' % [state], encoder
      end
    end
  end

  if options[:keep_state]
    @state = state
    @plain_string_content = plain_string_content
  else
    # Do not leave a string group open at the end of the input.
    if state == :attribute_value_string
      encoder.end_group :string
    end
  end

  encoder
end
end
end
end