summaryrefslogtreecommitdiff
path: root/lib/coderay/scanners
diff options
context:
space:
mode:
Diffstat (limited to 'lib/coderay/scanners')
-rw-r--r--lib/coderay/scanners/c.rb147
-rw-r--r--lib/coderay/scanners/delphi.rb123
-rw-r--r--lib/coderay/scanners/helpers/ruby_helper.rb212
-rw-r--r--lib/coderay/scanners/mush.rb102
-rw-r--r--lib/coderay/scanners/plaintext.rb13
-rw-r--r--lib/coderay/scanners/ruby.rb333
-rw-r--r--lib/coderay/scanners/rubyfast.rb287
-rw-r--r--lib/coderay/scanners/rubylex.rb102
8 files changed, 1319 insertions, 0 deletions
diff --git a/lib/coderay/scanners/c.rb b/lib/coderay/scanners/c.rb
new file mode 100644
index 0000000..3420822
--- /dev/null
+++ b/lib/coderay/scanners/c.rb
@@ -0,0 +1,147 @@
+module CodeRay module Scanners
+
+ class C < Scanner
+
+ register_for :c
+
+ RESERVED_WORDS = [
+ 'asm', 'break', 'case', 'continue', 'default', 'do', 'else',
+ 'for', 'goto', 'if', 'return', 'switch', 'while',
+ 'struct', 'union', 'enum', 'typedef',
+ 'static', 'register', 'auto', 'extern',
+ 'sizeof',
+ 'volatile', 'const', # C89
+ 'inline', 'restrict', # C99
+ ]
+
+ PREDEFINED_TYPES = [
+ 'int', 'long', 'short', 'char', 'void',
+ 'signed', 'unsigned', 'float', 'double',
+ 'bool', 'complex', # C99
+ ]
+
+ PREDEFINED_CONSTANTS = [
+ 'EOF', 'NULL',
+ 'true', 'false', # C99
+ ]
+
+ IDENT_KIND = Scanner::WordList.new(:ident).
+ add(RESERVED_WORDS, :reserved).
+ add(PREDEFINED_TYPES, :pre_type).
+ add(PREDEFINED_CONSTANTS, :pre_constant)
+
+ ESCAPE = / [rbfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x
+ UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x
+
+ def scan_tokens tokens, options
+
+ state = :initial
+
+ until eos?
+
+ kind = :error
+ match = nil
+
+ if state == :initial
+
+ if scan(/ \s+ | \\\n /x)
+ kind = :space
+
+ elsif scan(%r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx)
+ kind = :comment
+
+ elsif match = scan(/ \# \s* if \s* 0 /x)
+ match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /xm) unless eos?
+ kind = :comment
+
+ elsif scan(/ [-+*\/=<>?:;,!&^|()\[\]{}~%]+ | \.(?!\d) /x)
+ kind = :operator
+
+ elsif match = scan(/ [A-Za-z_][A-Za-z_0-9]* /x)
+ kind = IDENT_KIND[match]
+ if kind == :ident and check(/:(?!:)/)
+ match << scan(/:/)
+ kind = :label
+ end
+
+ elsif match = scan(/L?"/)
+ tokens << [:open, :string]
+ if match[0] == ?L
+ tokens << ['L', :modifier]
+ match = '"'
+ end
+ state = :string
+ kind = :delimiter
+
+ elsif scan(/#\s*(\w*)/)
+ kind = :preprocessor # FIXME multiline preprocs
+ state = :include_expected if self[1] == 'include'
+
+ elsif scan(/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? '? /ox)
+ kind = :char
+
+ elsif scan(/0[xX][0-9A-Fa-f]+/)
+ kind = :hex
+
+ elsif scan(/(?:0[0-7]+)(?![89.eEfF])/)
+ kind = :oct
+
+ elsif scan(/(?:\d+)(?![.eEfF])/)
+ kind = :integer
+
+ elsif scan(/\d[fF]?|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/)
+ kind = :float
+
+ else
+ getch
+ end
+
+ elsif state == :string
+ if scan(/[^\\"]+/)
+ kind = :content
+ elsif scan(/"/)
+ tokens << ['"', :delimiter]
+ tokens << [:close, :string]
+ state = :initial
+ next
+ elsif scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox)
+ kind = :char
+ elsif scan(/ \\ | $ /x)
+ kind = :error
+ state = :initial
+ else
+ raise "else case \" reached; %p not handled." % peek(1), tokens
+ end
+
+ elsif state == :include_expected
+ if scan(/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/)
+ kind = :include
+ state = :initial
+
+ elsif match = scan(/\s+/)
+ kind = :space
+ state = :initial if match.index ?\n
+
+ else
+ getch
+
+ end
+
+ else
+ raise 'else-case reached', tokens
+
+ end
+
+ match ||= matched
+ raise [match, kind], tokens if kind == :error
+
+ tokens << [match, kind]
+
+ end
+
+ tokens
+ end
+
+ end
+
+end end
diff --git a/lib/coderay/scanners/delphi.rb b/lib/coderay/scanners/delphi.rb
new file mode 100644
index 0000000..4c03147
--- /dev/null
+++ b/lib/coderay/scanners/delphi.rb
@@ -0,0 +1,123 @@
+module CodeRay module Scanners
+
+ # Scanner for Delphi (Object Pascal) source code, registered under the
+ # language key :delphi.
+ #
+ # scan_tokens uses two states, :initial and :string, and appends
+ # [text, kind] pairs plus [:open, kind] / [:close, kind] markers to the
+ # token list.
+ class Delphi < Scanner
+
+ register_for :delphi
+
+ # Words reported with kind :reserved.
+ # NOTE(review): 'at' is listed twice, and 'at' and 'on' also appear in
+ # DIRECTIVES below - which kind wins depends on WordList#add; confirm.
+ RESERVED_WORDS = [
+ 'and', 'array', 'as', 'at', 'asm', 'at', 'begin', 'case', 'class',
+ 'const', 'constructor', 'destructor', 'dispinterface', 'div', 'do',
+ 'downto', 'else', 'end', 'except', 'exports', 'file', 'finalization',
+ 'finally', 'for', 'function', 'goto', 'if', 'implementation', 'in',
+ 'inherited', 'initialization', 'inline', 'interface', 'is', 'label',
+ 'library', 'mod', 'nil', 'not', 'object', 'of', 'or', 'out', 'packed',
+ 'procedure', 'program', 'property', 'raise', 'record', 'repeat',
+ 'resourcestring', 'set', 'shl', 'shr', 'string', 'then', 'threadvar',
+ 'to', 'try', 'type', 'unit', 'until', 'uses', 'var', 'while', 'with',
+ 'xor', 'on'
+ ]
+
+ # Words reported with kind :directive.
+ DIRECTIVES = [
+ 'absolute', 'abstract', 'assembler', 'at', 'automated', 'cdecl',
+ 'contains', 'deprecated', 'dispid', 'dynamic', 'export',
+ 'external', 'far', 'forward', 'implements', 'local',
+ 'near', 'nodefault', 'on', 'overload', 'override',
+ 'package', 'pascal', 'platform', 'private', 'protected', 'public',
+ 'published', 'read', 'readonly', 'register', 'reintroduce',
+ 'requires', 'resident', 'safecall', 'stdcall', 'stored', 'varargs',
+ 'virtual', 'write', 'writeonly'
+ ]
+
+ # Lookup table from identifier text to token kind, built with the
+ # :case_ignore flag; :ident is the value passed as the default.
+ IDENT_KIND = Scanner::WordList.new(:ident, :case_ignore).
+ add(RESERVED_WORDS, :reserved).
+ add(DIRECTIVES, :directive)
+
+ # Tokenizes the remaining input and appends the result to +tokens+.
+ #
+ # tokens:: list that receives [text, kind] pairs and open/close
+ # markers via <<
+ # options:: accepted but not read by this scanner
+ #
+ # Returns +tokens+. Whenever a step ends with kind still :error, the
+ # raise near the end of the loop is triggered.
+ def scan_tokens tokens, options
+
+ state = :initial
+
+ until eos?
+
+ # Every branch below has to overwrite kind; :error is the fallback.
+ kind = :error
+ match = nil
+
+ if state == :initial
+
+ if scan(/ \s+ /x)
+ kind = :space
+
+ # Compiler directives: {$...} or (*$...*). Must be tried before
+ # the comment rule, which would match the same openers.
+ elsif scan(%r! \{ \$ [^}]* \}? | \(\* \$ (?: .*? \*\) | .* ) !mx)
+ kind = :preprocessor
+
+ # Comments: // to end of line, {...} or (*...*); unterminated
+ # forms are accepted.
+ elsif scan(%r! // [^\n]* | \{ [^}]* \}? | \(\* (?: .*? \*\) | .* ) !mx)
+ kind = :comment
+
+ elsif scan(/ [-+*\/=<>:;,.@\^|\(\)\[\]]+ /x)
+ kind = :operator
+
+ elsif match = scan(/ [A-Za-z_][A-Za-z_0-9]* /x)
+ kind = IDENT_KIND[match]
+
+ # Quoted single character (or the doubled quote ''), closed by a
+ # quote or the end of the line; emitted as a complete :char group.
+ elsif match = scan(/ ' ( [^\n']|'' ) (?:'|$) /x)
+ tokens << [:open, :char]
+ tokens << ["'", :delimiter]
+ tokens << [self[1], :content]
+ tokens << ["'", :delimiter]
+ tokens << [:close, :char]
+ next
+
+ # Any other opening quote starts a string.
+ elsif match = scan(/ ' /x)
+ tokens << [:open, :string]
+ state = :string
+ kind = :delimiter
+
+ # Character code: #123 or #$7F.
+ elsif scan(/ \# (?: \d+ | \$[0-9A-Fa-f]+ ) /x)
+ kind = :char
+
+ elsif scan(/ \$ [0-9A-Fa-f]+ /x)
+ kind = :hex
+
+ # Digits not followed by an exponent or by a dot plus a non-dot
+ # character (a following ".." is allowed).
+ elsif scan(/ (?: \d+ ) (?![eE]|\.[^.]) /x)
+ kind = :integer
+
+ elsif scan(/ \d+ (?: \.\d+ (?: [eE][+-]? \d+ )? | [eE][+-]? \d+ ) /x)
+ kind = :float
+
+ else
+ # Unknown character: consume it; kind stays :error.
+ getch
+ end
+
+ elsif state == :string
+ if scan(/[^\n']+/)
+ kind = :content
+ # A doubled quote inside a string is an escaped quote.
+ elsif scan(/''/)
+ kind = :char
+ elsif scan(/'/)
+ # Closing quote: tokens are pushed here, so skip the generic
+ # append at the end of the loop.
+ tokens << ["'", :delimiter]
+ tokens << [:close, :string]
+ state = :initial
+ next
+ # A newline ends the string state; kind is left at :error here.
+ elsif scan(/\n/)
+ state = :initial
+ else
+ raise "else case \' reached; %p not handled." % peek(1), tokens
+ end
+
+ else
+ raise 'else-case reached', tokens
+
+ end
+
+ # Branches that did not capture the text themselves use the text of
+ # the last successful scan.
+ match ||= matched
+ raise [match, kind], tokens if kind == :error
+
+ tokens << [match, kind]
+
+ end
+
+ tokens
+ end
+
+ end
+
+end end
diff --git a/lib/coderay/scanners/helpers/ruby_helper.rb b/lib/coderay/scanners/helpers/ruby_helper.rb
new file mode 100644
index 0000000..241b392
--- /dev/null
+++ b/lib/coderay/scanners/helpers/ruby_helper.rb
@@ -0,0 +1,212 @@
+module CodeRay module Scanners
+
+ # Constants and the StringState helper used by the Ruby scanner:
+ # word lists, regular expressions for Ruby's lexical elements, and the
+ # patterns used while scanning inside string-like literals.
+ class Ruby
+
+ # Words reported with kind :reserved.
+ RESERVED_WORDS = %w[
+ and def end in or unless begin
+ defined? ensure module redo super until
+ BEGIN break do next rescue then
+ when END case else for retry
+ while alias class elsif if not return
+ undef yield
+ ]
+
+ # Maps a keyword to the scanner state that follows it; :initial is the
+ # value passed as the default.
+ DEF_KEYWORDS = %w[ def ]
+ MODULE_KEYWORDS = %w[class module]
+ DEF_NEW_STATE = WordList.new(:initial).
+ add(DEF_KEYWORDS, :def_expected).
+ add(MODULE_KEYWORDS, :module_expected)
+
+ # Identifiers after which a regexp literal may start (value :set);
+ # false is the value passed as the default.
+ IDENTS_ALLOWING_REGEXP = %w[
+ and or not while until unless if then elsif when sub sub! gsub gsub! scan slice slice! split
+ ]
+ REGEXP_ALLOWED = WordList.new(false).
+ add(IDENTS_ALLOWING_REGEXP, :set)
+
+ # Words reported with kind :pre_constant.
+ PREDEFINED_CONSTANTS = %w[
+ nil true false self
+ DATA ARGV ARGF __FILE__ __LINE__
+ ]
+
+ # Lookup table from identifier text to token kind.
+ IDENT_KIND = WordList.new(:ident).
+ add(RESERVED_WORDS, :reserved).
+ add(PREDEFINED_CONSTANTS, :pre_constant)
+
+# IDENT = /[a-zA-Z_][a-zA-Z_0-9]*/
+ IDENT = /[a-z_][\w_]*/i
+
+ # Method names as used in calls (METHOD_NAME) and in definitions and
+ # symbols (METHOD_NAME_EX, which also covers operator methods).
+ METHOD_NAME = / #{IDENT} [?!]? /ox
+ METHOD_NAME_EX = /
+ #{IDENT}[?!=]? # common methods: split, foo=, empty?, gsub!
+ | \*\*? # multiplication and power
+ | [-+]@? # plus, minus
+ | [\/%&|^`~] # division, modulo or format strings, &and, |or, ^xor, `system`, tilde
+ | \[\]=? # array getter and setter
+ | << | >> # append or shift left, shift right
+ | <=?>? | >=? # comparison, rocket operator
+ | ===? # simple equality and case equality
+ /ox
+ # Variable forms: @ivar, @@cvar, either of the two, $global, and the
+ # combinations used by the symbol and def rules.
+ INSTANCE_VARIABLE = / @ #{IDENT} /ox
+ CLASS_VARIABLE = / @@ #{IDENT} /ox
+ OBJECT_VARIABLE = / @@? #{IDENT} /ox
+ GLOBAL_VARIABLE = / \$ (?: #{IDENT} | [1-9] | 0[a-zA-Z_0-9]* | [~&+`'=\/,;_.<>!@$?*":\\] | -[a-zA-Z_0-9] ) /ox
+ PREFIX_VARIABLE = / #{GLOBAL_VARIABLE} |#{OBJECT_VARIABLE} /ox
+ VARIABLE = / @?@? #{IDENT} | #{GLOBAL_VARIABLE} /ox
+
+ # Token type for a quote character; any quote not listed maps to
+ # :string through the hash default.
+ QUOTE_TO_TYPE = {
+ '`' => :shell,
+ '/'=> :regexp,
+ }
+ QUOTE_TO_TYPE.default = :string
+
+ # Modifier letters accepted after a closing regexp delimiter, and the
+ # characters treated as regexp syntax.
+ REGEXP_MODIFIERS = /[mixounse]*/
+ REGEXP_SYMBOLS = /
+ [|?*+?(){}\[\].^$]
+ /x
+
+ # Numeric literals; underscores are allowed between digit groups.
+ DECIMAL = /\d+(?:_\d+)*/ # doesn't recognize 09 as octal error
+ OCTAL = /0_?[0-7]+(?:_[0-7]+)*/
+ HEXADECIMAL = /0x[0-9A-Fa-f]+(?:_[0-9A-Fa-f]+)*/
+ BINARY = /0b[01]+(?:_[01]+)*/
+
+ # FLOAT requires an exponent or a fraction; FLOAT_OR_INT makes that
+ # part optional.
+ EXPONENT = / [eE] [+-]? #{DECIMAL} /ox
+ FLOAT_OR_INT = / #{DECIMAL} (?: #{EXPONENT} | \. #{DECIMAL} #{EXPONENT}? )? /ox
+ FLOAT = / #{DECIMAL} (?: #{EXPONENT} | \. #{DECIMAL} #{EXPONENT}? ) /ox
+ NUMERIC = / #{OCTAL} | #{HEXADECIMAL} | #{BINARY} | #{FLOAT_OR_INT} /ox
+
+ # A colon followed by a method name, a prefixed variable, or an
+ # opening quote (for quoted symbols).
+ SYMBOL = /
+ :
+ (?:
+ #{METHOD_NAME_EX}
+ | #{PREFIX_VARIABLE}
+ | ['"]
+ )
+ /ox
+
+ # TODO investigate \M, \c and \C escape sequences
+ # (?: M-\\C-|C-\\M-|M-\\c|c\\M-|c|C-|M-)? (?: \\ (?: [0-7]{3} | x[0-9A-Fa-f]{2} | . ) )
+ # assert_equal(225, ?\M-a)
+ # assert_equal(129, ?\M-\C-a)
+ # What may follow a backslash inside an interpreted string.
+ ESCAPE = /
+ [abefnrstv]
+ | M-\\C-|C-\\M-|M-\\c|c\\M-|c|C-|M-
+ | [0-7]{1,3}
+ | x[0-9A-Fa-f]{1,2}
+ | .
+ /mx
+
+ # Character literal: a question mark followed by a non-space character
+ # or a backslash escape.
+ CHARACTER = /
+ \?
+ (?:
+ [^\s\\]
+ | \\ #{ESCAPE}
+ )
+ /mx
+
+ # NOTE: This is not completely correct, but
+ # nobody needs heredoc delimiters ending with \n.
+ # Captures: 1 = optional "-", 2 = bare delimiter, 3 = quote character,
+ # 4 = quoted delimiter.
+ HEREDOC_OPEN = /
+ << (-)? # $1 = float
+ (?:
+ ( [A-Za-z_0-9]+ ) # $2 = delim
+ |
+ ( ["'`] ) # $3 = quote, type
+ ( [^\n]*? ) \3 # $4 = delim
+ )
+ /mx
+
+ # An =begin ... =end documentation block; runs to the end of input if
+ # no =end line is found.
+ RDOC = /
+ =begin (?!\S)
+ .*?
+ (?: \Z | ^=end (?!\S) [^\n]* )
+ /mx
+
+ # Everything from __END__ to the end of input, or up to a line that
+ # starts with #CODE.
+ DATA = /
+ __END__$
+ .*?
+ (?: \Z | (?=^\#CODE) )
+ /mx
+
+ RDOC_DATA_START = / ^=begin (?!\S) | ^__END__$ /x
+
+ # Start of a %-literal: capture 1 is the optional type letter (empty
+ # for a bare %), capture 2 the delimiter character.
+ FANCY_START = / % ( [qQwWxsr] | (?![\w\s=]) ) (.) /mox
+
+ # Type letter of a %-literal => [token type, interpreted flag].
+ # The 'w' and 'W' entries are overwritten right after the literal, and
+ # '' (bare %) is added with the same value as 'Q'.
+ FancyStringType = {
+ 'q' => [:string, false],
+ 'Q' => [:string, true],
+ 'r' => [:regexp, true],
+ 's' => [:symbol, false],
+ 'x' => [:shell, true],
+ 'w' => [:string, :word],
+ 'W' => [:string, :word],
+ }
+ FancyStringType['w'] = FancyStringType['q']
+ FancyStringType['W'] = FancyStringType[''] = FancyStringType['Q']
+
+ # Scanner state while inside a string-like literal.
+ #
+ # Members: type (token type), interpreted (flag controlling escape and
+ # interpolation handling), delim (closing delimiter, nil for heredocs),
+ # heredoc (false, :indented or :linestart), paren / paren_depth (opening
+ # bracket and nesting level for bracket-delimited literals) and pattern
+ # (lookahead regexp for the next position of interest).
+ # The definition is skipped if StringState is already defined.
+ class StringState < Struct.new :type, :interpreted, :delim, :heredoc,
+ :paren, :paren_depth, :pattern
+
+ # Opening bracket => closing bracket.
+ CLOSING_PAREN = Hash[ *%w[
+ ( )
+ [ ]
+ < >
+ { }
+ ] ]
+
+ CLOSING_PAREN.values.each { |o| o.freeze } # debug, if I try to change it with <<
+ OPENING_PAREN = CLOSING_PAREN.invert
+
+ # Cache keyed by [delim, interpreted]; the block builds and stores a
+ # lookahead pattern on first access. The pattern matches just before
+ # the delimiter (and, for a closing bracket, its opening counterpart),
+ # a backslash and, for interpreted literals whose delimiter is not '#',
+ # a '#' followed by '{', '$' or '@'.
+ STRING_PATTERN = Hash.new { |h, k|
+ delim, interpreted = *k
+ delim_pattern = Regexp.escape(delim.dup)
+ if starter = OPENING_PAREN[delim]
+ delim_pattern << Regexp.escape(starter)
+ end
+
+
+ # Extra alternative for the :regexp_symbols / :words modes; nil for
+ # any other value of interpreted.
+ special_escapes =
+ case interpreted
+ when :regexp_symbols
+ '| ' + REGEXP_SYMBOLS.source
+ when :words
+ '| \s'
+ end
+
+ h[k] =
+ if interpreted and not delim == '#'
+ / (?= [#{delim_pattern}\\] | \# [{$@] #{special_escapes} ) /mx
+ else
+ / (?= [#{delim_pattern}\\] #{special_escapes} ) /mx
+ end
+ }
+
+ # Cache keyed by [delim, interpreted, indented]; the pattern matches
+ # just before a newline followed by the terminator line (optionally
+ # preceded by spaces or tabs when indented), a backslash and, for
+ # interpreted heredocs, a '#' followed by '{', '$' or '@'. The empty
+ # group after the terminator is captured only when the terminator
+ # alternative matched.
+ HEREDOC_PATTERN = Hash.new { |h, k|
+ delim, interpreted, indented = *k
+ delim_pattern = Regexp.escape(delim.dup)
+ delim_pattern = / \n #{ '(?>[\ \t]*)' if indented } #{ Regexp.new delim_pattern } $ /x
+ h[k] =
+ if interpreted
+ / (?= #{delim_pattern}() | \\ | \# [{$@] ) /mx
+ else
+ / (?= #{delim_pattern}() | \\ ) /mx
+ end
+ }
+
+ # kind:: token type of the literal (stored as +type+)
+ # interpreted:: interpreted flag, also part of the pattern cache key
+ # delim:: opening delimiter, or the heredoc terminator
+ # heredoc:: false for ordinary literals, otherwise :indented or
+ # another true value
+ def initialize kind, interpreted, delim, heredoc = false
+ # For an opening bracket, the closing bracket becomes the delimiter
+ # and nesting is tracked, starting at depth 1.
+ if paren = CLOSING_PAREN[delim]
+ delim, paren = paren, delim
+ paren_depth = 1
+ end
+ if heredoc
+ pattern = HEREDOC_PATTERN[ [delim, interpreted, heredoc == :indented] ]
+ delim = nil
+ else
+ pattern = STRING_PATTERN[ [delim, interpreted] ]
+ end
+ super kind, interpreted, delim, heredoc, paren, paren_depth, pattern
+ end
+ end unless defined? StringState
+
+ end
+
+end end
diff --git a/lib/coderay/scanners/mush.rb b/lib/coderay/scanners/mush.rb
new file mode 100644
index 0000000..5217ed9
--- /dev/null
+++ b/lib/coderay/scanners/mush.rb
@@ -0,0 +1,102 @@
+module CodeRay module Scanners
+
+ class Mush < Scanner
+
+ register_for :mush
+
+ RESERVED_WORDS = [
+ ]
+
+ IDENT_KIND = Scanner::WordList.new(:ident, :case_ignore).
+ add(RESERVED_WORDS, :reserved).
+ add(DIRECTIVES, :directive)
+
+ def scan_tokens tokens, options
+
+ state = :initial
+
+ until eos?
+
+ kind = :error
+ match = nil
+
+ if state == :initial
+
+ if scan(/ \s+ /x)
+ kind = :space
+
+ elsif scan(%r! \{ \$ [^}]* \}? | \(\* \$ (?: .*? \*\) | .* ) !mx)
+ kind = :preprocessor
+
+ elsif scan(%r! // [^\n]* | \{ [^}]* \}? | \(\* (?: .*? \*\) | .* ) !mx)
+ kind = :comment
+
+ elsif scan(/ [-+*\/=<>:;,.@\^|\(\)\[\]]+ /x)
+ kind = :operator
+
+ elsif match = scan(/ [A-Za-z_][A-Za-z_0-9]* /x)
+ kind = IDENT_KIND[match]
+
+ elsif match = scan(/ ' ( [^\n']|'' ) (?:'|$) /x)
+ tokens << [:open, :char]
+ tokens << ["'", :delimiter]
+ tokens << [self[1], :content]
+ tokens << ["'", :delimiter]
+ tokens << [:close, :char]
+ next
+
+ elsif match = scan(/ ' /x)
+ tokens << [:open, :string]
+ state = :string
+ kind = :delimiter
+
+ elsif scan(/ \# (?: \d+ | \$[0-9A-Fa-f]+ ) /x)
+ kind = :char
+
+ elsif scan(/ \$ [0-9A-Fa-f]+ /x)
+ kind = :hex
+
+ elsif scan(/ (?: \d+ ) (?![eE]|\.[^.]) /x)
+ kind = :integer
+
+ elsif scan(/ \d+ (?: \.\d+ (?: [eE][+-]? \d+ )? | [eE][+-]? \d+ ) /x)
+ kind = :float
+
+ else
+ getch
+ end
+
+ elsif state == :string
+ if scan(/[^\n']+/)
+ kind = :content
+ elsif scan(/''/)
+ kind = :char
+ elsif scan(/'/)
+ tokens << ["'", :delimiter]
+ tokens << [:close, :string]
+ state = :initial
+ next
+ elsif scan(/\n/)
+ state = :initial
+ else
+ raise "else case \' reached; %p not handled." % peek(1), tokens
+ end
+
+ else
+ raise 'else-case reached', tokens
+
+ end
+
+ match ||= matched
+ raise [match, kind], tokens if kind == :error
+
+ tokens << [match, kind]
+
+ end
+
+ tokens
+ end
+
+ end
+
+end end
diff --git a/lib/coderay/scanners/plaintext.rb b/lib/coderay/scanners/plaintext.rb
new file mode 100644
index 0000000..0aebf35
--- /dev/null
+++ b/lib/coderay/scanners/plaintext.rb
@@ -0,0 +1,13 @@
+module CodeRay module Scanners
+
+ class Plaintext < Scanner
+
+ register_for :plaintext, :plain
+
+ def scan_tokens tokens, options
+ tokens << [scan_until(/\z/), :plain]
+ end
+
+ end
+
+end end
diff --git a/lib/coderay/scanners/ruby.rb b/lib/coderay/scanners/ruby.rb
new file mode 100644
index 0000000..433726b
--- /dev/null
+++ b/lib/coderay/scanners/ruby.rb
@@ -0,0 +1,333 @@
+module CodeRay module Scanners
+
+ # This scanner is really complex, since Ruby _is_ a complex language!
+ #
+ # It tries to highlight 100% of all common code,
+ # and 90% of strange codes.
+ #
+ # It is optimized for HTML highlighting, and is not very useful for
+ # parsing or pretty printing.
+ #
+ # For now, I think it's better than the scanners in VIM or Syntax, or
+ # any highlighter I was able to find, except Caleb's RubyLexer.
+ #
+ # I hope it's also better than the rdoc/irb lexer.
+ class Ruby < Scanner
+
+ include Streamable
+
+ register_for :ruby
+
+ # Loads the constants (word lists, regexps) and the StringState class
+ # referenced below.
+ require 'coderay/scanners/helpers/ruby_helper'
+
+ DEFAULT_OPTIONS = {
+ :parse_regexps => true,
+ }
+
+ private
+ # Tokenizes the remaining input and appends the result to +tokens+.
+ #
+ # tokens:: list that receives [text, type] pairs and [:open, type] /
+ # [:close, type] markers via <<
+ # options:: accepted; the :parse_regexps option is currently not read
+ # (parse_regexp is hard-wired to false)
+ #
+ # Returns +tokens+.
+ #
+ # The scanner is either inside a string-like literal (state is a
+ # StringState) or in one of the code states :initial, :def_expected and
+ # :module_expected. regexp_allowed, fancy_allowed and last_token_dot are
+ # "delayed" flags: a rule sets them to :set, and after each code token
+ # they are reduced to true (if :set) or false, so they only affect the
+ # token that follows.
+ def scan_tokens tokens, options
+ parse_regexp = false # options[:parse_regexps]
+ first_bake = saved_tokens = nil
+ last_token_dot = false
+ fancy_allowed = regexp_allowed = true
+ # Heredocs opened on the current line, started at the next newline.
+ heredocs = nil
+ # String state to return to after scanning one token of #$var / #@var.
+ last_state = nil
+ state = :initial
+ # Brace depth inside a #{...} interpolation; states is the stack of
+ # [state, depth, heredocs] saved when an interpolation opens.
+ depth = nil
+ states = []
+
+ until eos?
+ type = :error
+ match = nil
+ kind = nil
+
+ if state.instance_of? StringState
+# {{{
+
+ # Emit everything up to the next interesting position as content.
+ match = scan_until(state.pattern) || scan_until(/\z/)
+ tokens << [match, :content] unless match.empty?
+ break if eos?
+
+ # Capture 1 of a heredoc pattern is set when the terminator line
+ # follows: emit it and leave the string state.
+ if state.heredoc and self[1]
+ match = getch + scan_until(/$/)
+ tokens << [match, :delimiter]
+ tokens << [:close, state.type]
+ state = :initial
+ next
+ end
+
+ case match = getch
+
+ when state.delim
+ # A closing bracket only ends the literal at nesting depth 0.
+ if state.paren
+ state.paren_depth -= 1
+ if state.paren_depth > 0
+ tokens << [match, :nesting_delimiter]
+ next
+ end
+ end
+ tokens << [match, :delimiter]
+ if state.type == :regexp and not eos?
+ modifiers = scan(/#{REGEXP_MODIFIERS}/ox)
+ tokens << [modifiers, :modifier] unless modifiers.empty?
+ if parse_regexp
+ # Post-process the tokens collected for the regexp body and
+ # append them to the saved outer token list.
+ extended = modifiers.index ?x
+ tokens, regexp = saved_tokens, tokens
+ for text, type in regexp
+ if text.is_a? String
+ case type
+ when :content
+ text.scan(/([^#]+)|(#.*)/) do |plain, comment|
+ if plain
+ tokens << [plain, :content]
+ else
+ tokens << [comment, :comment]
+ end
+ end
+ when :character
+ if text[/\\(?:[swdSWDAzZbB]|\d+)/]
+ tokens << [text, :modifier]
+ else
+ tokens << [text, type]
+ end
+ else
+ tokens << [text, type]
+ end
+ else
+ tokens << [text, type]
+ end
+ end
+ first_bake = saved_tokens = nil
+ end
+ end
+ tokens << [:close, state.type]
+ fancy_allowed = regexp_allowed = false
+ state = :initial
+
+ when '\\'
+ if state.interpreted
+ if esc = scan(/ #{ESCAPE} /ox)
+ tokens << [match + esc, :char]
+ else
+ tokens << [match, :error]
+ end
+ else
+ # In non-interpreted literals only the escaped delimiter and
+ # the escaped backslash count as escapes.
+ case m = getch
+ when state.delim, '\\'
+ tokens << [match + m, :char]
+ else
+ tokens << [match + m, :content]
+ end
+ end
+
+ when '#'
+ case peek(1)[0]
+ when ?{
+ # #{...}: save the string state and scan code until the
+ # matching closing brace.
+ states.push [state, depth, heredocs]
+ fancy_allowed = regexp_allowed = true
+ state, depth = :initial, 1
+ tokens << [match + getch, :escape]
+ when ?$, ?@
+ tokens << [match, :escape]
+ last_state = state # scan one token as normal code, then return here
+ state = :initial
+ else
+ raise "else-case # reached; #%p not handled" % peek(1), tokens
+ end
+
+ when state.paren
+ state.paren_depth += 1
+ tokens << [match, :nesting_delimiter]
+
+ when REGEXP_SYMBOLS
+ tokens << [match, :function]
+
+ else
+ raise "else-case \" reached; %p not handled, state = %p" % [match, state], tokens
+
+ end
+ next
+# }}}
+ else
+# {{{
+ # Whitespace, newlines, comments and - at the beginning of a line -
+ # __END__ data and =begin/=end blocks are handled in every code state.
+ if match = scan(/ [ \t\f]+ | \\? \n | \# .* /x) or
+ ( bol? and match = scan(/ #{DATA} | #{RDOC} /ox) )
+ fancy_allowed = true
+ case m = match[0]
+ when ?\s, ?\t, ?\f
+ match << scan(/\s*/) unless eos? or heredocs
+ type = :space
+ when ?\n, ?\\
+ type = :space
+ regexp_allowed = m == ?\n
+ if heredocs
+ unscan # heredoc scanning needs \n at start
+ state = heredocs.shift
+ tokens << [:open, state.type]
+ heredocs = nil if heredocs.empty?
+ next
+ else
+ match << scan(/\s*/) unless eos?
+ end
+ when ?#, ?=, ?_
+ type = :comment
+ regexp_allowed = true
+ else
+ raise "else-case _ reached, because case %p was not handled" % [matched[0].chr], tokens
+ end
+ tokens << [match, type]
+ next
+
+ elsif state == :initial
+ if match = scan(/ \.\.?\.? | [-+*=>;,|&!\(\)\[\]~^]+ | [\{\}] | :: /x)
+ # After most operators a regexp or fancy literal may follow, but
+ # not after a single dot or a closing bracket.
+ if match !~ / [.\)\]\}] \z/x or match =~ /\.\.\.?/
+ regexp_allowed = fancy_allowed = :set
+ end
+ last_token_dot = :set if match == '.' or match == '::'
+ type = :operator
+ # Inside a #{...} interpolation: track braces and return to the
+ # saved string state when the interpolation closes.
+ unless states.empty?
+ case match
+ when '{'
+ depth += 1
+ when '}'
+ depth -= 1
+ if depth == 0
+ state, depth, heredocs = *states.pop
+ type = :escape
+ end
+ end
+ end
+
+ elsif match = scan(/#{METHOD_NAME}/o)
+ # After a dot or ::, keywords are treated as ordinary names.
+ if last_token_dot
+ type = if match[/^[A-Z]/] then :constant else :ident end
+ else
+ type = IDENT_KIND[match]
+ if type == :ident and match[/^[A-Z]/]
+ type = :constant
+ elsif type == :reserved
+ state = DEF_NEW_STATE[match]
+ end
+ end
+ fancy_allowed = regexp_allowed = REGEXP_ALLOWED[match]
+
+ elsif match = scan(/ ['"] /mx)
+ tokens << [:open, :string]
+ type = :delimiter
+ state = StringState.new :string, match != '\'', match.dup # important for streaming
+
+ elsif match = scan(/#{INSTANCE_VARIABLE}/o)
+ type = :instance_variable
+
+ elsif regexp_allowed and match = scan(/ \/ /mx)
+ tokens << [:open, :regexp]
+ type = :delimiter
+ interpreted = true
+ state = StringState.new :regexp, interpreted, match.dup
+ if parse_regexp
+ tokens, saved_tokens = [], tokens
+ end
+
+ elsif match = scan(/#{NUMERIC}/o)
+ type = if match[/#{FLOAT}/o] then :float else :integer end
+
+ elsif fancy_allowed and match = scan(/#{SYMBOL}/o)
+ # :"..." and :'...' open a string state of type :symbol.
+ case match[1]
+ when ?', ?"
+ tokens << [:open, :symbol]
+ state = StringState.new :symbol, match[1] == ?", match[1,1]
+ end
+ type = :symbol
+
+ elsif fancy_allowed and match = scan(/#{HEREDOC_OPEN}/o)
+ # The opener is emitted as a complete open/delimiter/close group;
+ # the body is queued and scanned after the next newline.
+ indented, quote = self[1] == '-', self[3]
+ delim = self[quote ? 4 : 2]
+ type = QUOTE_TO_TYPE[quote]
+ tokens << [:open, type]
+ tokens << [match, :delimiter]
+ match = :close
+ heredoc = StringState.new type, quote != '\'', delim, (indented ? :indented : :linestart )
+ heredocs ||= [] # create heredocs if empty
+ heredocs << heredoc
+
+ elsif fancy_allowed and match = scan(/#{FANCY_START}/o)
+ # fetch passes the missing key to the block, so the error message
+ # can name the unknown type letter.
+ type, interpreted = *FancyStringType.fetch(self[1]) do |k|
+ raise 'Unknown fancy string: %%%p' % k, tokens
+ end
+ tokens << [:open, type]
+ state = StringState.new type, interpreted, self[2]
+ type = :delimiter
+
+ elsif fancy_allowed and match = scan(/#{CHARACTER}/o)
+ type = :integer
+
+ elsif match = scan(/ [\/%<?:] /x)
+ regexp_allowed = fancy_allowed = :set
+ type = :operator
+
+ elsif match = scan(/`/)
+ # After a dot the backtick is a method name, not a shell string.
+ if last_token_dot
+ type = :operator
+ else
+ tokens << [:open, :shell]
+ type = :delimiter
+ state = StringState.new :shell, true, '`'
+ end
+
+ elsif match = scan(/#{GLOBAL_VARIABLE}/o)
+ type = :global_variable
+
+ elsif match = scan(/#{CLASS_VARIABLE}/o)
+ type = :class_variable
+
+ else
+ # Unknown character: consume it; type stays :error.
+ match = getch
+
+ end
+
+ elsif state == :def_expected
+ # Method name after "def", optionally with a receiver prefix.
+ if match = scan(/ (?: #{VARIABLE} (?: ::#{IDENT} )* \. )? #{METHOD_NAME_EX} /ox)
+ type = :method
+ else
+ match = getch
+ end
+ state = :initial
+
+ elsif state == :module_expected
+ # "class <<" or a (possibly namespaced) class or module name.
+ if match = scan(/<</)
+ type = :operator
+ else
+ if match = scan(/ (?:#{IDENT}::)* #{IDENT} /ox)
+ type = :class
+ else
+ match = getch
+ end
+ end
+ state = :initial
+
+ end
+
+ # Delayed flags: only values set to :set by this token stay true.
+ regexp_allowed = regexp_allowed == :set
+ fancy_allowed = fancy_allowed == :set
+ last_token_dot = last_token_dot == :set
+
+ if $DEBUG
+ raise_inspect 'error token %p in line %d' % [tokens.last, line], tokens if not type or type == :error
+ end
+
+ tokens << [match, type]
+
+ # Return to the string that contained a #$var / #@var interpolation.
+ if last_state
+ state = last_state
+ last_state = nil
+ end
+# }}}
+ end
+ end
+
+ tokens
+ end
+ end
+
+end end
+# vim:fdm=marker
diff --git a/lib/coderay/scanners/rubyfast.rb b/lib/coderay/scanners/rubyfast.rb
new file mode 100644
index 0000000..baff382
--- /dev/null
+++ b/lib/coderay/scanners/rubyfast.rb
@@ -0,0 +1,287 @@
+module CodeRay module Scanners
+
+ class Ruby < Scanner
+
+ register_for :rubyfast
+
+ RESERVED_WORDS = [
+ 'and', 'def', 'end', 'in', 'or', 'unless', 'begin',
+ 'defined?', 'ensure', 'module', 'redo', 'super', 'until',
+ 'BEGIN', 'break', 'do', 'next', 'rescue', 'then',
+ 'when', 'END', 'case', 'else', 'for', 'retry',
+ 'while', 'alias', 'class', 'elsif', 'if', 'not', 'return',
+ 'undef', 'yield',
+ ]
+
+ DEF_KEYWORDS = ['def']
+ MODULE_KEYWORDS = ['class', 'module']
+ DEF_NEW_STATE = WordList.new(:initial).
+ add(DEF_KEYWORDS, :def_expected).
+ add(MODULE_KEYWORDS, :module_expected)
+
+ WORDS_ALLOWING_REGEXP = [
+ 'and', 'or', 'not', 'while', 'until', 'unless', 'if', 'elsif', 'when'
+ ]
+ REGEXP_ALLOWED = WordList.new(false).
+ add(WORDS_ALLOWING_REGEXP, :set)
+
+ PREDEFINED_CONSTANTS = [
+ 'nil', 'true', 'false', 'self',
+ 'DATA', 'ARGV', 'ARGF', '__FILE__', '__LINE__',
+ ]
+
+ IDENT_KIND = WordList.new(:ident).
+ add(RESERVED_WORDS, :reserved).
+ add(PREDEFINED_CONSTANTS, :pre_constant)
+
+ IDENT = /[a-zA-Z_][a-zA-Z_0-9]*/
+
+ METHOD_NAME = / #{IDENT} [?!]? /xo
+ METHOD_NAME_EX = /
+ #{IDENT}[?!=]? # common methods: split, foo=, empty?, gsub!
+ | \*\*? # multiplication and power
+ | [-+~]@? # plus, minus
+ | [\/%&|^`] # division, modulo or format strings, &and, |or, ^xor, `system`
+ | \[\]=? # array getter and setter
+ | <=?>? | >=? # comparison, rocket operator
+ | << | >> # append or shift left, shift right
+ | ===? # simple equality and case equality
+ /ox
+ GLOBAL_VARIABLE = / \$ (?: #{IDENT} | [1-9] | 0[a-zA-Z_0-9]* | [~&+`'=\/,;_.<>!@$?*":\\] | -[a-zA-Z_0-9] ) /ox
+
+ DOUBLEQ = / " [^"\#\\]* (?: (?: \#\{.*?\} | \#(?:$")? | \\. ) [^"\#\\]* )* "? /mox
+ SINGLEQ = / ' [^'\\]* (?: \\. [^'\\]* )* '? /mox
+ STRING = / #{SINGLEQ} | #{DOUBLEQ} /ox
+
+ SHELL = / ` [^`\#\\]* (?: (?: \#\{.*?\} | \#(?:$`)? | \\. ) [^`\#\\]* )* `? /mox
+ REGEXP =%r! / [^/\#\\]* (?: (?: \#\{.*?\} | \#(?:$/)? | \\. ) [^/\#\\]* )* /? !mox
+
+ DECIMAL = /\d+(?:_\d+)*/ # doesn't recognize 09 as octal error
+ OCTAL = /0_?[0-7]+(?:_[0-7]+)*/
+ HEXADECIMAL = /0x[0-9A-Fa-f]+(?:_[0-9A-Fa-f]+)*/
+ BINARY = /0b[01]+(?:_[01]+)*/
+
+ EXPONENT = / [eE] [+-]? #{DECIMAL} /ox
+ FLOAT = / #{DECIMAL} (?: #{EXPONENT} | \. #{DECIMAL} #{EXPONENT}? ) /
+ INTEGER = /#{OCTAL}|#{HEXADECIMAL}|#{BINARY}|#{DECIMAL}/
+
+ ESCAPE_STRING = /
+ % (?!\s)
+ (?:
+ [qsw]
+ (?:
+ \( [^\)\\]* (?: \\. [^\)\\]* )* \)?
+ |
+ \[ [^\]\\]* (?: \\. [^\]\\]* )* \]?
+ |
+ \{ [^\}\\]* (?: \\. [^\}\\]* )* \}?
+ |
+ \< [^\>\\]* (?: \\. [^\>\\]* )* \>?
+ |
+ \\ [^\\ ]* \\?
+ |
+ ( [^a-zA-Z0-9] ) # $1
+ (?:(?!\1)[^\\])* (?: \\. (?:(?!\1)[^\#\\])* )* \1?
+ )
+ |
+ [QrxWr]?
+ (?:
+ \( [^\)\#\\]* (?: (?:\#\{.*?\}|\#|\\.) [^\)\#\\]* )* \)?
+ |
+ \[ [^\]\#\\]* (?: (?:\#\{.*?\}|\#|\\.) [^\]\#\\]* )* \]?
+ |
+ \{ [^\}\#\\]* (?: (?:\#\{.*?\}|\#|\\.) [^\}\#\\]* )* \}?
+ |
+ \< [^\>\#\\]* (?: (?:\#\{.*?\}|\#|\\.) [^\>\#\\]* )* \>?
+ |
+ \# [^\# \\]* (?: \\. [^\# \\]* )* \#?
+ |
+ \\ [^\\\# ]* (?: (?:\#\{.*?\}|\# ) [^\\\# ]* )* \\?
+ |
+ ( [^a-zA-Z0-9] ) # $2
+ (?:(?!\2)[^\#\\])* (?: (?:\#\{.*?\}|\#|\\.) (?:(?!\2)[^\#\\])* )* \2?
+ )
+ )
+ /mox
+
+ SYMBOL = /
+ :
+ (?:
+ #{GLOBAL_VARIABLE}
+ | @@?#{IDENT}
+ | #{METHOD_NAME_EX}
+ | #{STRING}
+ )/ox
+
+ HEREDOC = /
+ << (?! [\dc] )
+ (?: [^\n]*? << )?
+ (?:
+ ([a-zA-Z_0-9]+)
+ (?: .*? ^\1$ | .* )
+ |
+ -([a-zA-Z_0-9]+)
+ (?: .*? ^\s*\2$ | .* )
+ |
+ (["\'`]) (.*?) \3
+ (?: .*? ^\4$ | .* )
+ |
+ - (["\'`]) (.*?) \5
+ (?: .*? ^\s*\6$ | .* )
+ )
+ /mx
+
+ RDOC = /
+ =begin (?!\S) [^\n]* \n?
+ (?:
+ (?! =end (?!\S) )
+ [^\n]* \n?
+ )*
+ (?:
+ =end (?!\S) [^\n]*
+ )?
+ /mx
+
+ DATA = /
+ __END__\n
+ (?:
+ (?=\#CODE)
+ |
+ .*
+ )
+ /
+
+ private
+ def scan_tokens tokens, options
+
+ state = :initial
+ regexp_allowed = true
+ last_token_dot = false
+
+ until eos?
+ match = nil
+ kind = :error
+
+ if scan(/\s+/) # in every state
+ kind = :space
+ regexp_allowed = :set if regexp_allowed or matched.index(?\n) # delayed flag setting
+
+ elsif scan(/ \#[^\n]* /x) # in every state
+ kind = :comment
+ regexp_allowed = :set if regexp_allowed
+
+ elsif state == :initial
+ # IDENTIFIERS, KEYWORDS
+ if scan(GLOBAL_VARIABLE)
+ kind = :global_variable
+ elsif scan(/ @@ #{IDENT} /ox)
+ kind = :class_variable
+ elsif scan(/ @ #{IDENT} /ox)
+ kind = :instance_variable
+ elsif scan(/ #{DATA} | #{RDOC} /ox)
+ kind = :comment
+ elsif scan(METHOD_NAME)
+ match = matched
+ if last_token_dot
+ kind =
+ if match[/^[A-Z]/]
+ :constant
+ else
+ :ident
+ end
+ else
+ kind = IDENT_KIND[match]
+ if kind == :ident and match[/^[A-Z]/]
+ kind = :constant
+ elsif kind == :reserved
+ state = DEF_NEW_STATE[match]
+ regexp_allowed = REGEXP_ALLOWED[match]
+ end
+ end
+
+ elsif scan(STRING)
+ kind = :string
+ elsif scan(SHELL)
+ kind = :shell
+ elsif scan(HEREDOC)
+ kind = :string
+ elsif check(/\//) and regexp_allowed
+ scan(REGEXP)
+ kind = :regexp
+ elsif scan(ESCAPE_STRING)
+ match = matched
+ kind =
+ case match[0]
+ when ?s
+ :symbol
+ when ?r
+ :regexp
+ when ?x
+ :shell
+ else
+ :string
+ end
+
+ elsif scan(/:(?:#{GLOBAL_VARIABLE}|#{METHOD_NAME_EX}|#{STRING})/ox)
+ kind = :symbol
+ elsif scan(/
+ \? (?:
+ [^\s\\]
+ |
+ \\ (?:M-\\C-|C-\\M-|M-\\c|c\\M-|c|C-|M-))? (?: \\ (?: . | [0-7]{3} | x[0-9A-Fa-f][0-9A-Fa-f] )
+ )
+ /mx)
+ kind = :integer
+
+ elsif scan(/ [-+*\/%=<>;,|&!()\[\]{}~?] | \.\.?\.? | ::? /x)
+ kind = :operator
+ match = matched
+ regexp_allowed = :set if match[-1,1] =~ /[~=!<>|&^,\(\[+\-\/\*%]\z/
+ last_token_dot = :set if match == '.' or match == '::'
+ elsif scan(FLOAT)
+ kind = :float
+ elsif scan(INTEGER)
+ kind = :integer
+ else
+ getch
+ end
+
+ elsif state == :def_expected
+ if scan(/ (?:#{IDENT}::)* (?:#{IDENT}\.)? #{METHOD_NAME_EX} /ox)
+ kind = :method
+ else
+ getch
+ end
+ state = :initial
+
+ elsif state == :module_expected
+ if scan(/<</)
+ kind = :operator
+ else
+ if scan(/ (?:#{IDENT}::)* #{IDENT} /ox)
+ kind = :method
+ else
+ getch
+ end
+ state = :initial
+ end
+
+ end
+
+ text = match || matched
+
+ if kind == :regexp and not eos?
+ text << scan(/[eimnosux]*/)
+ end
+
+ regexp_allowed = (regexp_allowed == :set) # delayed flag setting
+ last_token_dot = last_token_dot == :set
+
+ tokens << [text, kind]
+ end
+
+ tokens
+ end
+ end
+
+end end
diff --git a/lib/coderay/scanners/rubylex.rb b/lib/coderay/scanners/rubylex.rb
new file mode 100644
index 0000000..2e69d39
--- /dev/null
+++ b/lib/coderay/scanners/rubylex.rb
@@ -0,0 +1,102 @@
+require 'rubygems'
+require_gem 'rubylexer'
+require 'rubylexer.rb'
+
+module CodeRay module Scanners
+
+ # Scanner that delegates tokenizing to the external RubyLexer library
+ # and translates its tokens; registered under the language key :rubylex.
+ class RubyLex < Scanner
+
+ register_for :rubylex
+
+ # A String with a read position and a few IO-like reading methods.
+ # Any other unknown method raises NoMethodError naming the method and
+ # its arguments.
+ class FakeFile < String
+
+ def initialize(*)
+ super
+ @pos = 0
+ end
+
+ attr_accessor :pos
+
+ # Returns the next +x+ characters and advances the position by +x+,
+ # even if that moves it past the end of the string.
+ def read x
+ pos = @pos
+ @pos += x
+ self[pos ... @pos]
+ end
+
+ # Returns the element at the current position (or -1 if there is
+ # none) and advances the position by one.
+ def getc
+ pos = @pos
+ @pos += 1
+ self[pos]||-1
+ end
+
+ # True once the position has reached or passed the end. read can
+ # advance the position beyond the size, so this compares with >=.
+ def eof?
+ @pos >= size
+ end
+
+ # Yields the result of getc until eof? is true.
+ def each_byte
+ until eof?
+ yield getc
+ end
+ end
+
+ def method_missing meth, *args
+ raise NoMethodError, '%s%s' % [meth, args]
+ end
+
+ end
+
+ private
+ # RubyLexer token kind => kind used in the token list. Kinds not
+ # listed here are passed through unchanged.
+ Translate = {
+ :ignore => :comment,
+ :varname => :ident,
+ :number => :integer,
+ :ws => :space,
+ :escnl => :space,
+ :keyword => :reserved,
+ :methname => :method,
+ :renderexactlystring => :regexp,
+ :string => :string,
+ }
+
+ # Writes the code to a temporary file, runs RubyLexer over it and
+ # appends one [text, kind] pair per lexer token to +tokens+.
+ #
+ # tokens:: list that receives the pairs via <<
+ # options:: accepted but not read by this scanner
+ #
+ # Returns +tokens+. If the lexer raises, an :error token holding the
+ # exception and its backtrace is appended and the exception is
+ # re-raised.
+ def scan_tokens tokens, options
+ require 'tempfile'
+ Tempfile.open('~coderay_tempfile') do |file|
+ file.binmode
+ file.write code
+ file.rewind
+ lexer = RubyLexer.new 'code', file
+ loop do
+ begin
+ tok = lexer.get1token
+ rescue => kaboom
+ err = <<-EOE
+ ERROR!!!
+#{kaboom.inspect}
+#{kaboom.backtrace.join("\n")}
+ EOE
+ tokens << [err, :error]
+ Kernel.raise
+ end
+ break if tok.is_a? EoiToken
+ next if tok.is_a? FileAndLineToken
+ # The kind is the token's class name without the "Token" suffix,
+ # lowercased, then mapped through Translate.
+ kind = tok.class.name[/(.*?)Token$/,1].downcase.to_sym
+ kind = Translate.fetch kind, kind
+ text = tok.ident
+ case kind
+ when :hereplaceholder
+ text = tok.ender
+ kind = :string
+ when :herebody, :outlinedherebody
+ text = tok.ident.ident
+ kind = :string
+ end
+ text = text.inspect unless text.is_a? String
+ # Debug output for error tokens. The local variable is named tok;
+ # the previous reference to "token" was undefined.
+ p tok if kind == :error
+ tokens << [text.dup, kind]
+ end
+ end
+ tokens
+ end
+ end
+
+end end