summaryrefslogtreecommitdiff
path: root/lib/coderay/scanners/html.rb
blob: a1efa9e239a417774531ff5cb895e3d3a87e110b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
module CodeRay module Scanners

	# HTML Scanner
	#
	# $Id$
	#
	# Splits HTML source into [text, kind] tokens with a small state
	# machine. Runs of whitespace are emitted as :space tokens in every
	# state; everything else depends on the current state:
	#
	# :initial::                outside a tag: comments, doctype, processing
	#                           instructions, tags, entities and plain text
	# :attribute::              inside a tag, expecting an attribute name or
	#                           the end of the tag
	# :attribute_equal::        after an attribute name, expecting "="
	# :attribute_value::        after "=", expecting a quoted or unquoted value
	# :attribute_value_string:: inside a quoted attribute value
	class HTML < Scanner

		include Streamable
		register_for :html

		# Characters allowed in an attribute name.
		ATTR_NAME = /[\w.:-]+/
		# Unquoted attribute values use the same character set as names.
		ATTR_VALUE_UNQUOTED = ATTR_NAME
		# End of a tag, optionally self-closing ("/>").
		TAG_END = /\/?>/
		HEX = /[0-9a-fA-F]/
		# Named (&amp;), decimal (&#38;) or hexadecimal (&#x26;) entity.
		ENTITY = /
			&
			(?:
				\w+
			|
				\#
				(?:
					\d+
				|
					x#{HEX}+
				)
			)
			;
		/ox

		# Literal content of a quoted attribute value, keyed by the opening
		# quote character. Content stops at the matching quote, at "&"
		# (possible entity), at ">" and at a newline.
		PLAIN_STRING_CONTENT = {
			"'" => /[^&'>\n]+/,
			'"' => /[^&">\n]+/,
		}

	private
		# Resets the state that is carried between scan_tokens calls.
		def setup
			@state = :initial
			@plain_string_content = nil
		end
		
		# Scans the rest of the input and appends tokens to +tokens+.
		#
		# tokens::  collection receiving [text, kind] pairs and the
		#           [:open, :string] / [:close, :string] markers via <<
		# options:: when options[:keep_state] is true, the final state is
		#           stored so that a later call continues where this one
		#           stopped
		#
		# Returns +tokens+.
		def scan_tokens tokens, options

			state = @state
			plain_string_content = @plain_string_content
			
			until eos?
				
				# Default for every branch that does not set a kind explicitly.
				kind = :error
				match = nil

				if scan(/\s+/m)
					kind = :space
					
				else
					
					case state
						
					when :initial
						if scan(/<!--.*?-->/m)
							kind = :comment
						elsif scan(/<!DOCTYPE.*?>/m)
							kind = :preprocessor
						elsif scan(/<\?xml.*?\?>/m)
							kind = :preprocessor
						elsif scan(/<\?.*?\?>|<%.*?%>/m)
							kind = :comment
						elsif scan(/<\/[-\w_.:]*>/m)
							kind = :tag
						elsif match = scan(/<[-\w_.:]*>?/m)
							kind = :tag
							# Without a closing ">" the tag continues with attributes.
							state = :attribute unless match[-1] == ?>
						elsif scan(/[^<>&]+/)
							kind = :plain
						elsif scan(ENTITY)
							kind = :entity
						elsif scan(/[>&]/)
							# A stray ">" or an "&" that does not start a valid entity.
							kind = :error
						else
							raise_inspect '[BUG] else-case reached with state %p' % [state], tokens
						end
						
					when :attribute
						if scan(TAG_END)
							kind = :tag
							state = :initial
						elsif scan(ATTR_NAME)
							kind = :attribute_name
							state = :attribute_equal
						else
							# Unexpected character: consume it as an :error token.
							getch
						end

					when :attribute_equal
						if scan(/=/)
							kind = :operator
							state = :attribute_value
						elsif scan(ATTR_NAME)
							# Previous attribute had no value; this is the next name.
							kind = :attribute_name
						elsif scan(TAG_END)
							kind = :tag
							state = :initial
						elsif scan(/./)
							# Unexpected character: :error token, then look for a name again.
							state = :attribute
						end
						
					when :attribute_value
						if scan(ATTR_VALUE_UNQUOTED)
							kind = :attribute_value
							state = :attribute
						elsif match = scan(/["']/)
							tokens << [:open, :string]
							state = :attribute_value_string
							# Remember which quote opened the string.
							plain_string_content = PLAIN_STRING_CONTENT[match]
							kind = :delimiter
						elsif scan(TAG_END)
							kind = :tag
							state = :initial
						else
							# Unexpected character: consume it as an :error token.
							getch
						end

					when :attribute_value_string
						if scan(plain_string_content)
							kind = :content
						elsif scan(/['"]/)
							# The other quote character is part of the plain content,
							# so only the matching closing quote can end up here.
							tokens << [matched, :delimiter]
							tokens << [:close, :string]
							state = :attribute
							next
						elsif scan(ENTITY)
							kind = :entity
						elsif scan(/&/)
							# "&" that does not start a valid entity is ordinary content.
							kind = :content
						elsif scan(/[\n>]/)
							# Unterminated string: close it and flag the offending character.
							tokens << [:close, :string]
							kind = :error
							state = :initial
						end

					else
						raise_inspect 'Unknown state: %p' % [state], tokens

					end

				end

				# Branches that did not capture the text rely on the last match.
				match ||= matched
				if $DEBUG and (not kind or kind == :error)
					raise_inspect 'Error token %p in line %d' %
					[[match, kind], line], tokens
				end
				raise_inspect 'Empty token', tokens unless match
				
				tokens << [match, kind]
			end

			if options[:keep_state]
				@state = state
				@plain_string_content = plain_string_content
			end

			tokens
		end

	end

end end