summaryrefslogtreecommitdiff
path: root/lib/coderay/scanners/html.rb
blob: 62da13bb4cf3765e74016b20131b39abac262b57 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#require 'coderay/common_patterns'

module CodeRay module Scanners

	# HTML Scanner
	#
	# Splits HTML source into tokens. Every token appended to the token
	# collection is a two-element array [text, kind]. A quoted attribute
	# value is additionally bracketed by [:open, :string] and
	# [:close, :string] markers.
	#
	# The scanner is a small state machine:
	#
	#   :initial                 outside of any tag
	#   :attribute               inside a tag, expecting an attribute name or the tag end
	#   :attribute_equal         after an attribute name, expecting "="
	#   :attribute_value         after "=", expecting a value
	#   :attribute_value_string  inside a quoted attribute value
	class HTML < Scanner

		include Streamable
		register_for :html

		ATTR_NAME = /[\w.:-]+/
		ATTR_VALUE_UNQUOTED = ATTR_NAME
		TAG_END = /\/?>/
		HEX = /[0-9a-fA-F]/
		ENTITY = /
			&
			(?:
				\w+
			|
				\#
				(?:
					\d+
				|
					x#{HEX}+
				)
			)
			;
		/ox

		# Content of a quoted attribute value, keyed by the quote character
		# that opened it.
		ATTR_STRING_CONTENT = {
			'"' => /[^"&\n]+/,
			"'" => /[^'&\n]+/
		}

	private
		# Tokenizes the remaining input and appends the tokens to +tokens+.
		#
		# tokens::  collection that receives the tokens via <<
		# options:: accepted for interface compatibility; not read here
		#
		# Returns +tokens+.
		def scan_tokens tokens, options
			
			state = :initial
			# Quote character of the attribute value currently being scanned.
			string_delimiter = nil
			
			until eos?
				
				kind = :error
				match = nil

				# Whitespace is emitted as :space in every state, including inside
				# quoted attribute values, so the state rules below never start on
				# a whitespace character.
				if scan(/\s+/m)
					kind = :space
					
				else
					
					case state
						
					when :initial
						if scan(/<!--.*?-->/m)
							kind = :comment
						elsif scan(/<!DOCTYPE.*?>/m)
							kind = :preprocessor
						elsif scan(/<\?xml.*?\?>/m)
							kind = :preprocessor
						elsif scan(/<\?.*?\?>|<%.*?%>/m)
							kind = :comment
						elsif scan(/<\/[-\w_.:]*>/m)
							kind = :tag
						elsif match = scan(/<[-\w_.:]*/m)
							kind = :tag
							if match?(/>/)
								# Tag without attributes: emit "<name>" as a single token.
								match << getch
							else
								state = :attribute
							end
						elsif scan(/[^<>&]+/)
							kind = :plain
						elsif scan(ENTITY)
							kind = :char
						elsif scan(/[>&]/)
							# Stray ">" or an "&" that does not start an entity.
							kind = :error
						else
							# Not expected to be reachable: the rules above cover every character.
							raise_inspect '[BUG] else-case reached with state %p' % [state], tokens
						end
						
					when :attribute
						if scan(TAG_END)
							kind = :tag
							state = :initial
						elsif scan(ATTR_NAME)
							kind = :attribute_name
							state = :attribute_equal
						else
							# Unexpected character inside a tag: consume it as an error
							# token so that scanning always makes progress.
							match = getch
						end

					when :attribute_equal
						if scan(/=/)
							kind = :operator
							state = :attribute_value
						elsif scan(ATTR_NAME)
							kind = :attribute_name
						elsif scan(TAG_END)
							kind = :tag
							state = :initial
						elsif scan(/./)
							# Anything else is emitted as an error token (kind is still :error).
							state = :attribute
						end
						
					when :attribute_value
						if scan(ATTR_VALUE_UNQUOTED)
							kind = :attribute_value
							state = :attribute
						elsif match = scan(/["']/)
							string_delimiter = match
							tokens << [:open, :string]
							state = :attribute_value_string
							kind = :delimiter
						elsif scan(TAG_END)
							kind = :tag
							state = :initial
						else
							# Unexpected character where a value should be: consume it as
							# an error token so that scanning always makes progress.
							match = getch
						end

					when :attribute_value_string
						if scan(ATTR_STRING_CONTENT[string_delimiter])
							kind = :content
						elsif match = scan(/["']/)
							# The other quote character is matched as content above, so
							# this can only be the delimiter that opened the string.
							tokens << [match, :delimiter]
							tokens << [:close, :string]
							state = :attribute
							next
						elsif scan(ENTITY)
							kind = :char
						elsif scan(/&/)
							# Ampersand that does not start an entity, e.g. in a URL query.
							kind = :content
						end
						# A newline never reaches this state: the whitespace rule at the
						# top of the loop consumes it first, so values may span lines.

					else
						raise_inspect 'Unknown state: %p' % [state], tokens

					end

				end

				match ||= matched
				if $DEBUG and (not kind or kind == :error)
					raise_inspect 'Error token %p in line %d' %
					[[match, kind], line], tokens
				end
				# Guard against a rule that matched nothing; without it the loop
				# could not advance.
				raise_inspect 'Empty token', tokens unless match

				tokens << [match, kind]
			end

			# Input ended inside a quoted value: balance the [:open, :string] marker.
			tokens << [:close, :string] if state == :attribute_value_string

			tokens
		end

	end

end end