summary refs log tree commit diff
diff options
context:
space:
mode:
author murphy <murphy@rubychan.de> 2010-06-28 15:49:04 +0000
committer murphy <murphy@rubychan.de> 2010-06-28 15:49:04 +0000
commit ebb546f46873d5a60cb08b1010fe5e8d9ec8a817 (patch)
tree 5f46fa4450bd33f5fe415c6aff6f689562bb8f1f
parent b5dd1ba683fb9eb0a74ab0b1f54fd9977e0ed3be (diff)
download coderay-ebb546f46873d5a60cb08b1010fe5e8d9ec8a817.tar.gz
A prototype implementation of "inside Regexp" detection for the Ruby scanner. Would be nice for diffs.
-rw-r--r-- etc/todo/scanners/ruby-inside-regexp-detection.rb 455
1 file changed, 455 insertions, 0 deletions
diff --git a/etc/todo/scanners/ruby-inside-regexp-detection.rb b/etc/todo/scanners/ruby-inside-regexp-detection.rb
new file mode 100644
index 0000000..c68611a
--- /dev/null
+++ b/etc/todo/scanners/ruby-inside-regexp-detection.rb
@@ -0,0 +1,455 @@
+module CodeRay
+module Scanners
+
+ # This scanner is really complex, since Ruby _is_ a complex language!
+ #
+ # It tries to highlight 100% of all common code,
+ # and 90% of strange code.
+ #
+ # It is optimized for HTML highlighting, and is not very useful for
+ # parsing or pretty printing.
+ #
+ # For now, I think it's better than the scanners in VIM or Syntax, or
+ # any highlighter I was able to find, except Caleb's RubyLexer.
+ #
+ # I hope it's also better than the rdoc/irb lexer.
+ #
+ # Alias: +irb+
+ class Ruby < Scanner
+
+ register_for :ruby
+ file_extension 'rb'
+
+ helper :patterns
+
+ unless defined? EncodingError
+ EncodingError = Class.new Exception # :nodoc:
+ end
+
+ protected
+
+ def setup # Puts the scanner into its starting state.
+ @state = :initial # scan_tokens reads @state as the state it begins scanning in
+ end
+
+ def scan_tokens encoder, options
+
+ patterns = Patterns # avoid constant lookup
+
+ state = @state
+ if state.instance_of? patterns::StringState
+ encoder.begin_group state.type
+ end
+
+ last_state = nil
+
+ method_call_expected = false
+ value_expected = true
+
+ heredocs = nil
+ inline_block_stack = nil
+ inline_block_curly_depth = 0
+
+ # def_object_stack = nil
+ # def_object_paren_depth = 0
+
+ unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8'
+
+ until eos?
+
+ if state.instance_of? patterns::StringState
+
+ match = scan_until(state.pattern) || scan_until(/\z/)
+ encoder.text_token match, :content unless match.empty?
+ break if eos?
+
+ if state.heredoc and self[1] # end of heredoc
+ match = getch.to_s
+ match << scan_until(/$/) unless eos?
+ encoder.text_token match, :delimiter
+ encoder.end_group state.type
+ state = state.next_state
+ next
+ end
+
+ case match = getch
+
+ when state.delim
+ if state.paren_depth
+ state.paren_depth -= 1
+ if state.paren_depth > 0
+ encoder.text_token match, :nesting_delimiter
+ next
+ end
+ end
+ encoder.text_token match, :delimiter
+ if state.type == :regexp and not eos?
+ modifiers = scan(/#{patterns::REGEXP_MODIFIERS}/ox)
+ encoder.text_token modifiers, :modifier unless modifiers.empty?
+ end
+ encoder.end_group state.type
+ value_expected = false
+ state = state.next_state
+
+ when '\\'
+ if state.interpreted
+ if esc = scan(/ #{patterns::ESCAPE} /ox)
+ encoder.text_token match + esc, :char
+ else
+ encoder.text_token match, :error
+ end
+ else
+ case m = getch
+ when state.delim, '\\'
+ encoder.text_token match + m, :char
+ when nil
+ encoder.text_token match, :content
+ else
+ encoder.text_token match + m, :content
+ end
+ end
+
+ when '#'
+ case peek(1)
+ when '{'
+ inline_block_stack ||= []
+ inline_block_stack << [state, inline_block_curly_depth, heredocs]
+ value_expected = true
+ state = :initial
+ inline_block_curly_depth = 1
+ encoder.begin_group :inline
+ encoder.text_token match + getch, :inline_delimiter
+ when '$', '@'
+ encoder.text_token match, :escape
+ last_state = state
+ state = :initial
+ else
+ raise_inspect 'else-case # reached; #%p not handled' %
+ [peek(1)], encoder
+ end
+
+ when state.opening_paren
+ state.paren_depth += 1
+ encoder.text_token match, :nesting_delimiter
+
+ when /#{patterns::REGEXP_SYMBOLS}/ox
+ encoder.text_token match, :function
+
+ else
+ raise_inspect 'else-case " reached; %p not handled, state = %p' %
+ [match, state], encoder
+
+ end
+
+ else
+
+ if match = scan(/[ \t\f]+/)
+ match << scan(/\s*/) unless eos? || heredocs
+ value_expected = true if match.index(?\n)
+ encoder.text_token match, :space
+
+ elsif match = scan(/\\?\n/)
+ if match == "\n"
+ value_expected = true
+ state = :initial if state == :undef_comma_expected
+ end
+ if heredocs
+ unscan # heredoc scanning needs \n at start
+ state = heredocs.shift
+ encoder.begin_group state.type
+ heredocs = nil if heredocs.empty?
+ next
+ else
+ match << scan(/\s*/) unless eos?
+ end
+ encoder.text_token match, :space
+
+ elsif bol? && match = scan(/\#!.*/)
+ encoder.text_token match, :doctype
+
+ elsif match = scan(/\#.*/) or
+ (bol? and match = scan(/#{patterns::RUBYDOC_OR_DATA}/o))
+ encoder.text_token match, :comment
+
+ elsif state == :initial
+
+ # IDENTS #
+ if !method_call_expected and
+ match = scan(unicode ? /#{patterns::METHOD_NAME}/uo :
+ /#{patterns::METHOD_NAME}/o)
+ value_expected = false
+ kind = patterns::IDENT_KIND[match]
+ if kind == :ident
+ if match[/^[A-Z]/] && !match[/[!?]$/] && !match?(/\(/)
+ kind = :constant
+ end
+ elsif kind == :reserved
+ state = patterns::KEYWORD_NEW_STATE[match]
+ value_expected = true if patterns::KEYWORDS_EXPECTING_VALUE[match]
+ end
+ value_expected = true if !value_expected && check(/#{patterns::VALUE_FOLLOWS}/o)
+ encoder.text_token match, kind
+
+ elsif method_call_expected and
+ match = scan(unicode ? /#{patterns::METHOD_AFTER_DOT}/uo :
+ /#{patterns::METHOD_AFTER_DOT}/o)
+ if method_call_expected == '::' && match[/^[A-Z]/] && !match?(/\(/)
+ encoder.text_token match, :constant
+ else
+ encoder.text_token match, :ident
+ end
+ method_call_expected = false
+ value_expected = check(/#{patterns::VALUE_FOLLOWS}/o)
+
+ # OPERATORS #
+ elsif not method_call_expected and match = scan(/ \.\.\.? | (\.|::) | [,\(\)\[\]\{\}] | ==?=? /x)
+ value_expected = match !~ / [.\)\]\}] /x || match =~ /\A\.\./
+ method_call_expected = self[1]
+ if inline_block_stack
+ case match
+ when '{'
+ inline_block_curly_depth += 1
+ when '}'
+ inline_block_curly_depth -= 1
+ if inline_block_curly_depth == 0 # closing brace of inline block reached
+ state, inline_block_curly_depth, heredocs = inline_block_stack.pop
+ inline_block_stack = nil if inline_block_stack.empty?
+ heredocs = nil if heredocs && heredocs.empty?
+ encoder.text_token match, :inline_delimiter
+ encoder.end_group :inline
+ next
+ end
+ end
+ end
+ encoder.text_token match, :operator
+
+ elsif match = scan(/ ['"] /mx)
+ encoder.begin_group :string
+ encoder.text_token match, :delimiter
+ state = patterns::StringState.new :string, match == '"', match # important for streaming
+
+ elsif match = scan(unicode ? /#{patterns::INSTANCE_VARIABLE}/uo :
+ /#{patterns::INSTANCE_VARIABLE}/o)
+ value_expected = false
+ encoder.text_token match, :instance_variable
+
+ elsif value_expected and match?(/\//)
+ encoder.begin_group :regexp
+ if match?(/\/#{patterns::REGEXP_MODIFIERS}x#{patterns::REGEXP_MODIFIERS}[ \t]*(?:\n|#|\z|[,\)\]])/o)
+ # most likely a false positive, the end of an extended regexp
+ # so ignore this one and pretend we're inside the regexp
+ else
+ encoder.text_token getch, :delimiter
+ end
+ interpreted = true
+ state = patterns::StringState.new :regexp, interpreted, '/'
+
+ elsif match = scan(value_expected ? /[-+]?#{patterns::NUMERIC}/o : /#{patterns::NUMERIC}/o)
+ if method_call_expected
+ encoder.text_token match, :error
+ method_call_expected = false
+ else
+ encoder.text_token match, self[1] ? :float : :integer
+ end
+ value_expected = false
+
+ elsif match = scan(unicode ? /#{patterns::SYMBOL}/uo :
+ /#{patterns::SYMBOL}/o)
+ case delim = match[1]
+ when ?', ?"
+ encoder.begin_group :symbol
+ encoder.text_token ':', :symbol
+ match = delim.chr
+ encoder.text_token match, :delimiter
+ state = patterns::StringState.new :symbol, delim == ?", match
+ else
+ encoder.text_token match, :symbol
+ value_expected = false
+ end
+
+ elsif match = scan(/ [-+!~^]=? | [*|&]{1,2}=? | >>? /x)
+ value_expected = true
+ encoder.text_token match, :operator
+
+ elsif value_expected and match = scan(/#{patterns::HEREDOC_OPEN}/o)
+ indented = self[1] == '-'
+ quote = self[3]
+ delim = self[quote ? 4 : 2]
+ kind = patterns::QUOTE_TO_TYPE[quote]
+ encoder.begin_group kind
+ encoder.text_token match, :delimiter
+ encoder.end_group kind
+ heredoc = patterns::StringState.new kind, quote != '\'',
+ delim, (indented ? :indented : :linestart )
+ heredocs ||= [] # create heredocs if empty
+ heredocs << heredoc
+ value_expected = false
+
+ elsif value_expected and match = scan(/#{patterns::FANCY_START}/o)
+ kind, interpreted = *patterns::FancyStringType.fetch(self[1]) do
+ raise_inspect 'Unknown fancy string: %%%p' % k, encoder
+ end
+ encoder.begin_group kind
+ state = patterns::StringState.new kind, interpreted, self[2]
+ encoder.text_token match, :delimiter
+
+ elsif value_expected and match = scan(/#{patterns::CHARACTER}/o)
+ value_expected = false
+ encoder.text_token match, :integer
+
+ elsif match = scan(/ [\/%]=? | <(?:<|=>?)? | [?:;] /x)
+ value_expected = true
+ encoder.text_token match, :operator
+
+ elsif match = scan(/`/)
+ if method_call_expected
+ encoder.text_token match, :operator
+ value_expected = true
+ else
+ encoder.begin_group :shell
+ encoder.text_token match, :delimiter
+ state = patterns::StringState.new :shell, true, match
+ end
+
+ elsif match = scan(unicode ? /#{patterns::GLOBAL_VARIABLE}/uo :
+ /#{patterns::GLOBAL_VARIABLE}/o)
+ encoder.text_token match, :global_variable
+ value_expected = false
+
+ elsif match = scan(unicode ? /#{patterns::CLASS_VARIABLE}/uo :
+ /#{patterns::CLASS_VARIABLE}/o)
+ encoder.text_token match, :class_variable
+ value_expected = false
+
+ elsif match = scan(/\\\z/)
+ encoder.text_token match, :space
+
+ else
+ if method_call_expected
+ method_call_expected = false
+ next
+ end
+ if !unicode
+ # check for unicode
+ debug, $DEBUG = $DEBUG, false
+ begin
+ if check(/./mu).size > 1
+ # seems like we should try again with unicode
+ unicode = true
+ end
+ rescue
+ # bad unicode char; use getch
+ ensure
+ $DEBUG = debug
+ end
+ next if unicode
+ end
+
+ encoder.text_token getch, :error
+
+ end
+
+ if last_state
+ state = last_state
+ last_state = nil
+ end
+
+ elsif state == :def_expected
+ if match = scan(unicode ? /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/uo :
+ /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o)
+ encoder.text_token match, :method
+ state = :initial
+ else
+ last_state = :dot_expected
+ state = :initial
+ end
+
+ elsif state == :dot_expected
+ if match = scan(/\.|::/)
+ # invalid definition
+ state = :def_expected
+ encoder.text_token match, :operator
+ else
+ state = :initial
+ end
+
+ elsif state == :module_expected
+ if match = scan(/<</)
+ encoder.text_token match, :operator
+ else
+ state = :initial
+ if match = scan(unicode ? / (?:#{patterns::IDENT}::)* #{patterns::IDENT} /oux :
+ / (?:#{patterns::IDENT}::)* #{patterns::IDENT} /ox)
+ encoder.text_token match, :class
+ end
+ end
+
+ elsif state == :undef_expected
+ state = :undef_comma_expected
+ if match = scan(unicode ? /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/uo :
+ /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o)
+ encoder.text_token match, :method
+ elsif match = scan(/#{patterns::SYMBOL}/o)
+ case delim = match[1]
+ when ?', ?"
+ encoder.begin_group :symbol
+ encoder.text_token ':', :symbol
+ match = delim.chr
+ encoder.text_token match, :delimiter
+ state = patterns::StringState.new :symbol, delim == ?", match
+ state.next_state = :undef_comma_expected
+ else
+ encoder.text_token match, :symbol
+ end
+ else
+ state = :initial
+ end
+
+ elsif state == :undef_comma_expected
+ if match = scan(/,/)
+ encoder.text_token match, :operator
+ state = :undef_expected
+ else
+ state = :initial
+ end
+
+ elsif state == :alias_expected
+ match = scan(unicode ? /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/uo :
+ /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/o)
+
+ if match
+ encoder.text_token self[1], (self[1][0] == ?: ? :symbol : :method)
+ encoder.text_token self[2], :space
+ encoder.text_token self[3], (self[3][0] == ?: ? :symbol : :method)
+ end
+ state = :initial
+
+ else
+ raise_inspect 'Unknown state: %p' % [state], encoder
+ end
+
+ end
+ end
+
+ # cleaning up
+ if options[:keep_state]
+ @state = state
+ end
+ if state.is_a? patterns::StringState
+ encoder.end_group state.type
+ end
+ if inline_block_stack
+ until inline_block_stack.empty?
+ state, *more = inline_block_stack.pop
+ encoder.end_group :inline if more
+ encoder.end_group state.type
+ end
+ end
+
+ encoder
+ end
+
+ end
+
+end
+end