diff options
Diffstat (limited to 'lib/coderay/scanners')
-rw-r--r-- | lib/coderay/scanners/java.rb | 2 | ||||
-rw-r--r-- | lib/coderay/scanners/scanner.rb | 337 |
2 files changed, 338 insertions, 1 deletions
diff --git a/lib/coderay/scanners/java.rb b/lib/coderay/scanners/java.rb index b282864..962154e 100644 --- a/lib/coderay/scanners/java.rb +++ b/lib/coderay/scanners/java.rb @@ -36,7 +36,7 @@ module Scanners add(BuiltinTypes::List, :predefined_type). add(BuiltinTypes::List.select { |builtin| builtin[/(Error|Exception)$/] }, :exception). add(DIRECTIVES, :directive) # :nodoc: - + ESCAPE = / [bfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: STRING_CONTENT_PATTERN = { diff --git a/lib/coderay/scanners/scanner.rb b/lib/coderay/scanners/scanner.rb new file mode 100644 index 0000000..efa710d --- /dev/null +++ b/lib/coderay/scanners/scanner.rb @@ -0,0 +1,337 @@ +# encoding: utf-8 + +module CodeRay + module Scanners + + # = Scanner + # + # The base class for all Scanners. + # + # It is a subclass of Ruby's great +StringScanner+, which + # makes it easy to access the scanning methods inside. + # + # It is also +Enumerable+, so you can use it like an Array of + # Tokens: + # + # require 'coderay' + # + # c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;" + # + # for text, kind in c_scanner + # puts text if kind == :operator + # end + # + # # prints: (*==)++; + # + # OK, this is a very simple example :) + # You can also use +map+, +any?+, +find+ and even +sort_by+, + # if you want. + class Scanner < StringScanner + + extend Plugin + plugin_host Scanners + + # Raised if a Scanner fails while scanning + ScanError = Class.new StandardError + + # The default options for all scanner classes. + # + # Define @default_options for subclasses. + DEFAULT_OPTIONS = { } + + KINDS_NOT_LOC = [:comment, :doctype, :docstring] + + attr_accessor :state + + class << self + + # Normalizes the given code into a string with UNIX newlines, in the + # scanner's internal encoding, with invalid and undefined charachters + # replaced by placeholders. Always returns a new object. + def normalize code + # original = code + code = code.to_s unless code.is_a? ::String + return code if code.empty? + + if code.respond_to? :encoding + code = encode_with_encoding code, self.encoding + else + code = to_unix code + end + # code = code.dup if code.eql? original + code + end + + # The typical filename suffix for this scanner's language. + def file_extension extension = lang + @file_extension ||= extension.to_s + end + + # The encoding used internally by this scanner. + def encoding name = 'UTF-8' + @encoding ||= defined?(Encoding.find) && Encoding.find(name) + end + + # The lang of this Scanner class, which is equal to its Plugin ID. + def lang + @plugin_id + end + + protected + + def encode_with_encoding code, target_encoding + if code.encoding == target_encoding + if code.valid_encoding? + return to_unix(code) + else + source_encoding = guess_encoding code + end + else + source_encoding = code.encoding + end + # print "encode_with_encoding from #{source_encoding} to #{target_encoding}" + code.encode target_encoding, source_encoding, :universal_newline => true, :undef => :replace, :invalid => :replace + end + + def to_unix code + code.index(?\r) ? code.gsub(/\r\n?/, "\n") : code + end + + def guess_encoding s + #:nocov: + IO.popen("file -b --mime -", "w+") do |file| + file.write s[0, 1024] + file.close_write + begin + Encoding.find file.gets[/charset=([-\w]+)/, 1] + rescue ArgumentError + Encoding::BINARY + end + end + #:nocov: + end + + end + + # Create a new Scanner. + # + # * +code+ is the input String and is handled by the superclass + # StringScanner. + # * +options+ is a Hash with Symbols as keys. + # It is merged with the default options of the class (you can + # overwrite default options here.) + # + # Else, a Tokens object is used. + def initialize code = '', options = {} + if self.class == Scanner + raise NotImplementedError, "I am only the basic Scanner class. I can't scan anything. :( Use my subclasses." + end + + @options = self.class::DEFAULT_OPTIONS.merge options + + super self.class.normalize(code) + + @tokens = options[:tokens] || Tokens.new + @tokens.scanner = self if @tokens.respond_to? :scanner= + + setup + end + + # Sets back the scanner. Subclasses should redefine the reset_instance + # method instead of this one. + def reset + super + reset_instance + end + + # Set a new string to be scanned. + def string= code + code = self.class.normalize(code) + super code + reset_instance + end + + # the Plugin ID for this scanner + def lang + self.class.lang + end + + # the default file extension for this scanner + def file_extension + self.class.file_extension + end + + # Scan the code and returns all tokens in a Tokens object. + def tokenize source = nil, options = {} + options = @options.merge(options) + + set_tokens_from_options options + set_string_from_source source + + begin + scan_tokens @tokens, options + rescue => e + message = "Error in %s#scan_tokens, initial state was: %p" % [self.class, defined?(state) && state] + raise_inspect e.message, @tokens, message, 30, e.backtrace + end + + @cached_tokens = @tokens + if source.is_a? Array + @tokens.split_into_parts(*source.map { |part| part.size }) + else + @tokens + end + end + + # Cache the result of tokenize. + def tokens + @cached_tokens ||= tokenize + end + + # Traverse the tokens. + def each &block + tokens.each(&block) + end + include Enumerable + + # The current line position of the scanner, starting with 1. + # See also: #column. + # + # Beware, this is implemented inefficiently. It should be used + # for debugging only. + def line pos = self.pos + return 1 if pos <= 0 + binary_string[0...pos].count("\n") + 1 + end + + # The current column position of the scanner, starting with 1. + # See also: #line. + def column pos = self.pos + return 1 if pos <= 0 + pos - (binary_string.rindex(?\n, pos - 1) || -1) + end + + # The string in binary encoding. + # + # To be used with #pos, which is the index of the byte the scanner + # will scan next. + def binary_string + @binary_string ||= + if string.respond_to?(:bytesize) && string.bytesize != string.size + #:nocov: + string.dup.force_encoding('binary') + #:nocov: + else + string + end + end + + protected + + # Can be implemented by subclasses to do some initialization + # that has to be done once per instance. + # + # Use reset for initialization that has to be done once per + # scan. + def setup # :doc: + end + + def set_string_from_source source + case source + when Array + self.string = self.class.normalize(source.join) + when nil + reset + else + self.string = self.class.normalize(source) + end + end + + def set_tokens_from_options options + @tokens = options[:tokens] || @tokens || Tokens.new + @tokens.scanner = self if @tokens.respond_to? :scanner= + end + + # This is the central method, and commonly the only one a + # subclass implements. + # + # Subclasses must implement this method; it must return +tokens+ + # and must only use Tokens#<< for storing scanned tokens! + def scan_tokens tokens, options # :doc: + raise NotImplementedError, "#{self.class}#scan_tokens not implemented." + end + + # Resets the scanner. + def reset_instance + @tokens.clear if @tokens.respond_to?(:clear) && !@options[:keep_tokens] + @cached_tokens = nil + @binary_string = nil if defined? @binary_string + end + + SCAN_ERROR_MESSAGE = <<-MESSAGE + + +***ERROR in %s: %s (after %s tokens) + +tokens: +%s + +%s + +surrounding code: +%p ~~ %p + + +***ERROR*** + + MESSAGE + + def raise_inspect_arguments message, tokens, state, ambit + return File.basename(caller[0]), + message, + tokens_size(tokens), + tokens_last(tokens, 10).map(&:inspect).join("\n"), + scanner_state_info(state), + binary_string[pos - ambit, ambit], + binary_string[pos, ambit] + end + + SCANNER_STATE_INFO = <<-INFO +current line: %d column: %d pos: %d +matched: %p state: %p +bol?: %p, eos?: %p + INFO + + def scanner_state_info state + SCANNER_STATE_INFO % [ + line, column, pos, + matched, state || 'No state given!', + bol?, eos?, + ] + end + + # Scanner error with additional status information + def raise_inspect message, tokens, state = self.state, ambit = 30, backtrace = caller + raise ScanError, SCAN_ERROR_MESSAGE % raise_inspect_arguments(message, tokens, state, ambit), backtrace + end + + def tokens_size tokens + tokens.size if tokens.respond_to?(:size) + end + + def tokens_last tokens, n + tokens.respond_to?(:last) ? tokens.last(n) : [] + end + + # Shorthand for scan_until(/\z/). + # This method also avoids a JRuby 1.9 mode bug. + def scan_rest + rest = self.rest + terminate + rest + end + + end + + end +end |