1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
|
module CodeRay
  module Scanners

    # HTML Scanner
    #
    # Alias: +xhtml+
    #
    # See also: Scanners::XML
    #
    # The scanner is a small state machine. +scan_tokens+ moves between
    # these states:
    #
    # :initial::                outside of a tag (comments, doctype, text, entities)
    # :attribute::              inside a tag, expecting an attribute name or the tag end
    # :attribute_equal::        after an attribute name, expecting "="
    # :attribute_value::        after "=", expecting a quoted or unquoted value
    # :attribute_value_string:: inside a quoted attribute value
    class HTML < Scanner

      include Streamable
      register_for :html

      # Token kinds listed for this scanner under the name KINDS_NOT_LOC.
      KINDS_NOT_LOC = [
        :comment, :doctype, :preprocessor,
        :tag, :attribute_name, :operator,
        :attribute_value, :delimiter, :content,
        :plain, :entity, :error,
      ]  # :nodoc:

      ATTR_NAME = /[\w.:-]+/  # :nodoc:
      TAG_END = /\/?>/  # :nodoc:
      HEX = /[0-9a-fA-F]/  # :nodoc:
      # Named (&amp;), decimal (&#38;) or hexadecimal (&#x26;) character reference.
      ENTITY = /
        &
        (?:
          \w+
        |
          \#
          (?:
            \d+
          |
            x#{HEX}+
          )
        )
        ;
      /ox  # :nodoc:

      # Maps the opening quote character to the pattern that matches plain
      # content inside a string opened with that quote.
      PLAIN_STRING_CONTENT = {
        "'" => /[^&'>\n]+/,
        '"' => /[^&">\n]+/,
      }  # :nodoc:

      def reset  # :nodoc:
        # FIXME: why not overwrite reset_instance?
        super
        @state = :initial
        # Keep in sync with #setup: no attribute string is open after a reset.
        @plain_string_content = nil
      end

    protected

      # Puts the state machine into its start state.
      def setup
        @state = :initial
        @plain_string_content = nil
      end

      # Tokenizes the remaining input and sends the tokens to +encoder+.
      #
      # encoder:: receives +text_token+, +begin_group+ and +end_group+ calls.
      # options:: if <tt>options[:keep_state]</tt> is truthy, the final state is
      #           stored in instance variables so that the next call continues
      #           from it; otherwise a still-open :string group is closed.
      #
      # Returns +encoder+.
      def scan_tokens encoder, options
        state = @state
        plain_string_content = @plain_string_content

        until eos?

          # Whitespace is emitted as :space in every state, including inside
          # quoted attribute values, because it is tried before the state dispatch.
          if match = scan(/\s+/m)
            encoder.text_token match, :space

          else

            case state

            when :initial
              if match = scan(/<!--.*?-->/m)
                encoder.text_token match, :comment
              elsif match = scan(/<!DOCTYPE.*?>/m)
                encoder.text_token match, :doctype
              elsif match = scan(/<\?xml.*?\?>/m)
                encoder.text_token match, :preprocessor
              elsif match = scan(/<\?.*?\?>|<%.*?%>/m)
                encoder.text_token match, :comment
              elsif match = scan(/<\/[-\w.:]*>/m)
                encoder.text_token match, :tag
              elsif match = scan(/<[-\w.:]+>?/m)
                encoder.text_token match, :tag
                # No closing ">" yet: attributes (or the tag end) follow.
                state = :attribute unless match[-1] == ?>
              elsif match = scan(/[^<>&]+/)
                encoder.text_token match, :plain
              elsif match = scan(ENTITY)
                encoder.text_token match, :entity
              elsif match = scan(/[<>&]/)
                encoder.text_token match, :error
              else
                raise_inspect '[BUG] else-case reached with state %p' % [state], encoder
              end

            when :attribute
              if match = scan(TAG_END)
                encoder.text_token match, :tag
                state = :initial
              elsif match = scan(ATTR_NAME)
                encoder.text_token match, :attribute_name
                state = :attribute_equal
              else
                encoder.text_token getch, :error
              end

            when :attribute_equal
              if match = scan(/=/)
                encoder.text_token match, :operator
                state = :attribute_value
              elsif match = scan(ATTR_NAME)
                # Previous attribute had no value; this is the next attribute name.
                encoder.text_token match, :attribute_name
              elsif match = scan(TAG_END)
                encoder.text_token match, :tag
                state = :initial
              else
                encoder.text_token getch, :error
                state = :attribute
              end

            when :attribute_value
              if match = scan(ATTR_NAME)
                # Unquoted value.
                encoder.text_token match, :attribute_value
                state = :attribute
              elsif match = scan(/["']/)
                encoder.begin_group :string
                state = :attribute_value_string
                plain_string_content = PLAIN_STRING_CONTENT[match]
                encoder.text_token match, :delimiter
              elsif match = scan(TAG_END)
                # The scan result must be assigned here; otherwise +match+ is
                # still nil from the failed quote scan above and a nil token
                # would be emitted instead of the tag end.
                encoder.text_token match, :tag
                state = :initial
              else
                encoder.text_token getch, :error
              end

            when :attribute_value_string
              if match = scan(plain_string_content)
                encoder.text_token match, :content
              elsif match = scan(/['"]/)
                encoder.text_token match, :delimiter
                encoder.end_group :string
                state = :attribute
              elsif match = scan(ENTITY)
                encoder.text_token match, :entity
              elsif match = scan(/&/)
                # A lone "&" that does not start an entity is plain content.
                encoder.text_token match, :content
              elsif match = scan(/[\n>]/)
                # Unterminated string: close the group and flag the character.
                encoder.end_group :string
                state = :initial
                encoder.text_token match, :error
              else
                # Defensive: without this, an unmatched character would make
                # the surrounding loop spin forever without consuming input.
                raise_inspect '[BUG] else-case reached with state %p' % [state], encoder
              end

            else
              raise_inspect 'Unknown state: %p' % [state], encoder

            end

          end

        end

        if options[:keep_state]
          @state = state
          @plain_string_content = plain_string_content
        else
          # Do not leave a :string group open at the end of the input.
          if state == :attribute_value_string
            encoder.end_group :string
          end
        end

        encoder
      end

    end

  end
end
|