diff options
Diffstat (limited to 'lib/coderay/scanners/html.rb')
-rw-r--r-- | lib/coderay/scanners/html.rb | 341 |
1 files changed, 174 insertions, 167 deletions
diff --git a/lib/coderay/scanners/html.rb b/lib/coderay/scanners/html.rb index 7cdc07e..181e5d3 100644 --- a/lib/coderay/scanners/html.rb +++ b/lib/coderay/scanners/html.rb @@ -1,167 +1,174 @@ -module CodeRay
-module Scanners
-
- # HTML Scanner
- #
- # $Id$
- class HTML < Scanner
-
- include Streamable
- register_for :html
-
- ATTR_NAME = /[\w.:-]+/
- ATTR_VALUE_UNQUOTED = ATTR_NAME
- TAG_END = /\/?>/
- HEX = /[0-9a-fA-F]/
- ENTITY = /
- &
- (?:
- \w+
- |
- \#
- (?:
- \d+
- |
- x#{HEX}+
- )
- )
- ;
- /ox
-
- PLAIN_STRING_CONTENT = {
- "'" => /[^&'>\n]+/,
- '"' => /[^&">\n]+/,
- }
-
- private
- def setup
- @state = :initial
- @plain_string_content = nil
- end
-
- def scan_tokens tokens, options
-
- state = @state
- plain_string_content = @plain_string_content
-
- until eos?
-
- kind = :error
- match = nil
-
- if scan(/\s+/m)
- kind = :space
-
- else
-
- case state
-
- when :initial
- if scan(/<!--.*?-->/m)
- kind = :comment
- elsif scan(/<!DOCTYPE.*?>/m)
- kind = :preprocessor
- elsif scan(/<\?xml.*?\?>/m)
- kind = :preprocessor
- elsif scan(/<\?.*?\?>|<%.*?%>/m)
- kind = :comment
- elsif scan(/<\/[-\w_.:]*>/m)
- kind = :tag
- elsif match = scan(/<[-\w_.:]*>?/m)
- kind = :tag
- state = :attribute unless match[-1] == ?>
- elsif scan(/[^<>&]+/)
- kind = :plain
- elsif scan(/#{ENTITY}/ox)
- kind = :entity
- elsif scan(/[>&]/)
- kind = :error
- else
- raise_inspect '[BUG] else-case reached with state %p' % [state], tokens
- end
-
- when :attribute
- if scan(/#{TAG_END}/)
- kind = :tag
- state = :initial
- elsif scan(/#{ATTR_NAME}/o)
- kind = :attribute_name
- state = :attribute_equal
- else
- getch
- end
-
- when :attribute_equal
- if scan(/=/)
- kind = :operator
- state = :attribute_value
- elsif scan(/#{ATTR_NAME}/o)
- kind = :attribute_name
- elsif scan(/#{TAG_END}/o)
- kind = :tag
- state = :initial
- elsif scan(/./)
- state = :attribute
- end
-
- when :attribute_value
- if scan(/#{ATTR_VALUE_UNQUOTED}/o)
- kind = :attribute_value
- state = :attribute
- elsif match = scan(/["']/)
- tokens << [:open, :string]
- state = :attribute_value_string
- plain_string_content = PLAIN_STRING_CONTENT[match]
- kind = :delimiter
- elsif scan(/#{TAG_END}/o)
- kind = :tag
- state = :initial
- else
- getch
- end
-
- when :attribute_value_string
- if scan(plain_string_content)
- kind = :content
- elsif scan(/['"]/)
- tokens << [matched, :delimiter]
- tokens << [:close, :string]
- state = :attribute
- next
- elsif scan(/#{ENTITY}/ox)
- kind = :entity
- elsif scan(/[\n>]/)
- tokens << [:close, :string]
- kind = :error
- state = :initial
- end
-
- else
- raise_inspect 'Unknown state: %p' % [state], tokens
-
- end
-
- end
-
- match ||= matched
- if $DEBUG and (not kind or kind == :error)
- raise_inspect 'Error token %p in line %d' %
- [[match, kind], line], tokens
- end
- raise_inspect 'Empty token', tokens unless match
-
- tokens << [match, kind]
- end
-
- if options[:keep_state]
- @state = state
- @plain_string_content = plain_string_content
- end
-
- tokens
- end
-
- end
-
-end
-end
+module CodeRay +module Scanners + + # HTML Scanner + # + # $Id$ + class HTML < Scanner + + include Streamable + register_for :html + + ATTR_NAME = /[\w.:-]+/ + ATTR_VALUE_UNQUOTED = ATTR_NAME + TAG_END = /\/?>/ + HEX = /[0-9a-fA-F]/ + ENTITY = / + & + (?: + \w+ + | + \# + (?: + \d+ + | + x#{HEX}+ + ) + ) + ; + /ox + + PLAIN_STRING_CONTENT = { + "'" => /[^&'>\n]+/, + '"' => /[^&">\n]+/, + } + + def reset + super + @state = :initial + end + + private + def setup + @state = :initial + @plain_string_content = nil + end + + def scan_tokens tokens, options + + state = @state + plain_string_content = @plain_string_content + + until eos? + + kind = nil + match = nil + + if scan(/\s+/m) + kind = :space + + else + + case state + + when :initial + if scan(/<!--.*?-->/m) + kind = :comment + elsif scan(/<!DOCTYPE.*?>/m) + kind = :preprocessor + elsif scan(/<\?xml.*?\?>/m) + kind = :preprocessor + elsif scan(/<\?.*?\?>|<%.*?%>/m) + kind = :comment + elsif scan(/<\/[-\w_.:]*>/m) + kind = :tag + elsif match = scan(/<[-\w_.:]+>?/m) + kind = :tag + state = :attribute unless match[-1] == ?> + elsif scan(/[^<>&]+/) + kind = :plain + elsif scan(/#{ENTITY}/ox) + kind = :entity + elsif scan(/[<>&]/) + kind = :error + else + raise_inspect '[BUG] else-case reached with state %p' % [state], tokens + end + + when :attribute + if scan(/#{TAG_END}/) + kind = :tag + state = :initial + elsif scan(/#{ATTR_NAME}/o) + kind = :attribute_name + state = :attribute_equal + else + kind = :error + getch + end + + when :attribute_equal + if scan(/=/) + kind = :operator + state = :attribute_value + elsif scan(/#{ATTR_NAME}/o) + kind = :attribute_name + elsif scan(/#{TAG_END}/o) + kind = :tag + state = :initial + elsif scan(/./) + state = :attribute + end + + when :attribute_value + if scan(/#{ATTR_VALUE_UNQUOTED}/o) + kind = :attribute_value + state = :attribute + elsif match = scan(/["']/) + tokens << [:open, :string] + state = :attribute_value_string + plain_string_content = PLAIN_STRING_CONTENT[match] + kind = :delimiter + elsif scan(/#{TAG_END}/o) + kind = :tag + state = :initial + else + kind = :error + getch + end + + when :attribute_value_string + if scan(plain_string_content) + kind = :content + elsif scan(/['"]/) + tokens << [matched, :delimiter] + tokens << [:close, :string] + state = :attribute + next + elsif scan(/#{ENTITY}/ox) + kind = :entity + elsif scan(/[\n>]/) + tokens << [:close, :string] + kind = :error + state = :initial + end + + else + raise_inspect 'Unknown state: %p' % [state], tokens + + end + + end + + match ||= matched + if $DEBUG and not kind + raise_inspect 'Error token %p in line %d' % + [[match, kind], line], tokens, state + end + raise_inspect 'Empty token', tokens unless match + + tokens << [match, kind] + end + + if options[:keep_state] + @state = state + @plain_string_content = plain_string_content + end + + tokens + end + + end + +end +end |