| 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
 | module CodeRay module Scanners
	
	# Scanner for C source code.
	#
	# Splits the input into [text, kind] tokens appended to the token list.
	# String literals are wrapped in [:open, :string] / [:close, :string]
	# marker tokens.
	#
	# NOTE(review): scan, scan_until, check, getch, peek, matched, eos? and
	# self[n] come from Scanner; they are used here as in StringScanner --
	# confirm against the Scanner base class.
	class C < Scanner
		register_for :c
		
		# Keywords, reported as :reserved.
		RESERVED_WORDS = [
			'asm', 'break', 'case', 'continue', 'default', 'do', 'else',
			'for', 'goto', 'if', 'return', 'switch', 'while',
			'struct', 'union', 'enum', 'typedef',
			'static', 'register', 'auto', 'extern',
			'sizeof',
			'volatile', 'const',  # C89
			'inline', 'restrict', # C99
		]
		
		# Built-in type names, reported as :pre_type.
		PREDEFINED_TYPES = [
			'int', 'long', 'short', 'char', 'void',
			'signed', 'unsigned', 'float', 'double',
			'bool', 'complex',  # C99
		]
		
		# Well-known constants, reported as :pre_constant.
		PREDEFINED_CONSTANTS = [
			'EOF', 'NULL',
			'true', 'false',  # C99
		]
		
		# Maps an identifier to its token kind; the word list is created with
		# :ident, which is the kind compared against for plain identifiers.
		IDENT_KIND = WordList.new(:ident).
			add(RESERVED_WORDS, :reserved).
			add(PREDEFINED_TYPES, :pre_type).
			add(PREDEFINED_CONSTANTS, :pre_constant)
		
		# What may follow a backslash in a char or string literal: a simple
		# escape character, a newline (line continuation), a hex escape or an
		# octal escape.
		ESCAPE = / [abfnrtv\n\\'"?] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x
		
		# Universal character names (after the backslash): uXXXX and UXXXXXXXX.
		UNICODE_ESCAPE =  / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x
		
		# Tokenizes the remaining input and appends the tokens to +tokens+.
		#
		# tokens::  collection receiving [text, kind] pairs via <<
		# options:: accepted for interface compatibility; not read here
		#
		# Returns +tokens+.
		#
		# Unrecognized input is emitted as :error tokens so that scanning can
		# continue. When $DEBUG is set, the first error token raises a
		# RuntimeError instead.
		def scan_tokens(tokens, options)
			state = :initial
			
			until eos?
				kind = :error
				match = nil
				
				case state
				when :initial
					if scan(/ \s+ | \\\n /x)
						kind = :space
						
					elsif scan(%r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx)
						kind = :comment
						
					elsif match = scan(/ \# \s* if \s* 0 /x)
						# Treat a disabled "#if 0" section as one comment, up to and
						# including the next #elif/#else/#endif line (or end of input).
						match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /xm) unless eos?
						kind = :comment
						
					elsif scan(/ [-+*\/=<>?:;,!&^|()\[\]{}~%]+ | \.(?!\d) /x)
						kind = :operator
						
					elsif match = scan(/ [A-Za-z_][A-Za-z_0-9]* /x)
						kind = IDENT_KIND[match]
						# A plain identifier directly followed by a single colon is
						# reported as a label, colon included.
						if kind == :ident && check(/:(?!:)/)
							match << scan(/:/)
							kind = :label
						end
						
					elsif match = scan(/L?"/)
						tokens << [:open, :string]
						if match[0] == ?L
							# Wide string: the L prefix becomes its own token.
							tokens << ['L', :modifier]
							match = '"'
						end
						state = :string
						kind = :delimiter
						
					elsif scan(/#\s*(\w*)/)
						kind = :preprocessor  # FIXME multiline preprocs
						state = :include_expected if self[1] == 'include'
						
					elsif scan(/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? '? /ox)
						kind = :char
						
					elsif scan(/0[xX][0-9A-Fa-f]+/)
						kind = :hex
						
					# Floats are tried before integers, and each alternative takes
					# the whole digit run, so "12.5" is one token rather than a
					# backtracked "1" followed by "2" and ".5".
					elsif scan(/ \d+ \. \d* (?:[eE][+-]?\d+)? [fF]? | \. \d+ (?:[eE][+-]?\d+)? [fF]? | \d+ [eE][+-]?\d+ [fF]? | \d+ [fF] /x)
						kind = :float
						
					# (?!\d) rejects the match when any digit follows, so "0778"
					# is not split into an octal "07" plus "78".
					elsif scan(/0[0-7]+(?!\d)/)
						kind = :oct
						
					elsif scan(/\d+/)
						kind = :integer
						
					else
						# Unknown character: consume it as an :error token.
						getch
						
					end
					
				when :string
					# Content stops at a newline so that an unterminated literal
					# cannot swallow the following lines.
					if scan(/[^\\\n"]+/)
						kind = :content
						
					elsif scan(/"/)
						tokens << ['"', :delimiter]
						tokens << [:close, :string]
						state = :initial
						next
						
					elsif scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox)
						kind = :char
						
					elsif scan(/ \\ | $ /x)
						# A backslash that starts no known escape, or the end of the
						# line: the literal is broken, so close it and resume normal
						# scanning. The end-of-line case matches nothing and yields
						# no token of its own.
						tokens << [:close, :string]
						state = :initial
						next if matched.empty?
						kind = :error
						
					else
						raise "else case \" reached; %p not handled." % [peek(1)]
						
					end
					
				when :include_expected
					if scan(/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/)
						kind = :include
						state = :initial
						
					elsif match = scan(/\s+/)
						kind = :space
						state = :initial if match.index("\n")
						
					else
						# Anything else (e.g. "#include MACRO") is ordinary code.
						# Nothing is consumed here; the :initial state always
						# consumes at least one character, so the loop advances.
						state = :initial
						next
						
					end
					
				else
					raise 'else-case reached: unknown state %p' % [state]
					
				end
				
				match ||= matched
				raise 'Error token %p' % [[match, kind]] if $DEBUG && kind == :error
				tokens << [match, kind]
				
			end
			
			# Input ended inside a string literal: balance the :open marker.
			tokens << [:close, :string] if state == :string
			
			tokens
		end
	end
end end
 |