From 84b8431608174e74a4c0d2394eb330a6621bc74b Mon Sep 17 00:00:00 2001 From: no author Date: Mon, 26 Sep 2005 02:58:54 +0000 Subject: New Repository, initial import --- lib/coderay/encoder.rb | 210 ++++++++++++++++++ lib/coderay/encoders/count.rb | 20 ++ lib/coderay/encoders/div.rb | 16 ++ lib/coderay/encoders/helpers/html_css.rb | 168 ++++++++++++++ lib/coderay/encoders/helpers/html_helper.rb | 68 ++++++ lib/coderay/encoders/helpers/html_output.rb | 240 ++++++++++++++++++++ lib/coderay/encoders/html.rb | 167 ++++++++++++++ lib/coderay/encoders/null.rb | 20 ++ lib/coderay/encoders/span.rb | 17 ++ lib/coderay/encoders/statistic.rb | 74 +++++++ lib/coderay/encoders/text.rb | 33 +++ lib/coderay/encoders/tokens.rb | 44 ++++ lib/coderay/encoders/yaml.rb | 19 ++ lib/coderay/helpers/filetype.rb | 145 ++++++++++++ lib/coderay/helpers/gzip_simple.rb | 123 ++++++++++ lib/coderay/helpers/scanner_helper.rb | 63 ++++++ lib/coderay/scanner.rb | 298 +++++++++++++++++++++++++ lib/coderay/scanners/c.rb | 147 ++++++++++++ lib/coderay/scanners/delphi.rb | 123 ++++++++++ lib/coderay/scanners/helpers/ruby_helper.rb | 212 ++++++++++++++++++ lib/coderay/scanners/mush.rb | 102 +++++++++ lib/coderay/scanners/plaintext.rb | 13 ++ lib/coderay/scanners/ruby.rb | 333 ++++++++++++++++++++++++++++ lib/coderay/scanners/rubyfast.rb | 287 ++++++++++++++++++++++++ lib/coderay/scanners/rubylex.rb | 102 +++++++++ lib/coderay/tokens.rb | 302 +++++++++++++++++++++++++ 26 files changed, 3346 insertions(+) create mode 100644 lib/coderay/encoder.rb create mode 100644 lib/coderay/encoders/count.rb create mode 100644 lib/coderay/encoders/div.rb create mode 100644 lib/coderay/encoders/helpers/html_css.rb create mode 100644 lib/coderay/encoders/helpers/html_helper.rb create mode 100644 lib/coderay/encoders/helpers/html_output.rb create mode 100644 lib/coderay/encoders/html.rb create mode 100644 lib/coderay/encoders/null.rb create mode 100644 lib/coderay/encoders/span.rb create mode 100644 
module CodeRay

  # This module holds class Encoder and its subclasses.
  # For example, the HTML encoder is named CodeRay::Encoders::HTML and
  # can be found in coderay/encoders/html.
  #
  # Encoders also provides methods and constants for the register mechanism
  # and the [] method that returns the Encoder class belonging to the
  # given format.
  module Encoders

    # Raised if Encoders[] fails because:
    # * a file could not be found
    # * the requested Encoder is not registered
    #
    # Derived from StandardError so that a plain +rescue+ catches it.
    EncoderNotFound = Class.new StandardError

    # Loaded Encoders are saved here.
    #
    # Looking up an unknown format requires coderay/encoders/<format>.rb and
    # expects that file to register an Encoder for the format.
    ENCODERS = Hash.new do |h, lang|
      path = Encoders.path_to lang
      lang = lang.to_sym
      begin
        require path
      rescue LoadError
        raise EncoderNotFound, "#{path} not found."
      end
      # The Encoder should have registered by now. Use fetch rather than
      # h[lang]: a plain lookup would re-enter this default block and
      # recurse without end when the file did not register anything.
      h.fetch lang do
        raise EncoderNotFound, "No Encoder for #{lang} found in #{path}."
      end
    end

    class << self

      # Every Encoder class must register itself for one or more +formats+
      # by calling register_for, which calls this method.
      #
      # See CodeRay::Encoder.register_for.
      def register encoder_class, *formats
        formats.each do |format|
          ENCODERS[format.to_sym] = encoder_class
        end
      end

      # Returns the Encoder for +lang+.
      #
      # Raises EncoderNotFound if no Encoder can be loaded for +lang+.
      #
      # Example:
      #  require 'coderay'
      #  yaml_encoder = CodeRay::Encoders[:yaml]
      def [](lang)
        ENCODERS[lang]
      end

      # Alias for +[]+.
      alias load []

      # Returns the path to the encoder for format.
      def path_to plugin
        File.join 'coderay', 'encoders', "#{plugin}.rb"
      end

    end


    # The Encoder base class. Together with CodeRay::Scanner and
    # CodeRay::Tokens, it forms the highlighting triad.
    #
    # Encoder instances take a Tokens object and do something with it.
    #
    # The most common Encoder is surely the HTML encoder
    # (CodeRay::Encoders::HTML). It highlights the code in a colorful
    # html page.
    # If you want the highlighted code in a div or a span instead,
    # use its subclasses Div and Span.
    class Encoder

      attr_reader :token_stream

      class << self

        # Register this class for the given langs.
        #
        # Example:
        #   class MyEncoder < CodeRay::Encoders::Encoder
        #     register_for :myenc
        #     ...
        #   end
        #
        # See Encoders.register.
        def register_for *args
          Encoders.register self, *args
        end

        # Returns whether the Encoder can be used in streaming mode, that
        # is, whether the class includes Streamable.
        def streamable?
          # self is the class here, so is_a? would test the Class object
          # itself and never be true; include? checks the mixed-in modules.
          include? Streamable
        end

        # If FILE_EXTENSION isn't defined, this method returns the downcase
        # class name instead.
        def const_missing sym
          if sym == :FILE_EXTENSION
            # name is nil for anonymous classes, hence the to_s calls.
            name.to_s.split('::').last.to_s.downcase
          else
            super
          end
        end

      end

      # Subclasses are to store their default options in this constant.
      DEFAULT_OPTIONS = { :stream => false }

      # The options you gave the Encoder at creating.
      attr_accessor :options

      # Creates a new Encoder.
      # +options+ is saved and used for all encode operations, as long as you
      # don't overwrite it there by passing additional options.
      #
      # Encoder objects provide three encode methods:
      # - encode simply takes a +code+ string and a +lang+
      # - encode_tokens expects a +tokens+ object instead
      # - encode_stream is like encode, but uses streaming mode.
      #
      # Each method has an optional +options+ parameter. These are added to
      # the options you passed at creation.
      #
      # Raises a RuntimeError when called on the Encoder base class itself.
      def initialize options = {}
        @options = self.class::DEFAULT_OPTIONS.merge options
        raise "I am only the basic Encoder class. I can't encode anything. :(\n" +
          "Use my subclasses." if self.class == Encoder
      end

      # Encode a Tokens object.
      #
      # Returns whatever finish returns, typically the output string.
      def encode_tokens tokens, options = {}
        options = @options.merge options
        setup options
        compile tokens, options
        finish options
      end

      # Encode the given +code+ after tokenizing it using the Scanner for
      # +lang+.
      def encode code, lang, options = {}
        options = @options.merge options
        scanner_options = options.fetch(:scanner_options, {})
        tokens = CodeRay.scan code, lang, scanner_options
        encode_tokens tokens, options
      end

      # You can use highlight instead of encode, if that seems
      # more clear to you.
      alias highlight encode

      # Encode the given +code+ using the Scanner for +lang+ in streaming
      # mode.
      #
      # Raises NotStreamableError unless the Encoder includes Streamable.
      def encode_stream code, lang, options = {}
        raise NotStreamableError, self unless kind_of? Streamable
        options = @options.merge options
        setup options
        scanner_options = options.fetch :scanner_options, {}
        @token_stream = CodeRay.scan_stream code, lang, scanner_options, &self
        finish options
      end

      # Behave like a proc. The token method is converted to a proc.
      def to_proc
        method(:token).to_proc
      end

    protected

      # Called with merged options before encoding starts.
      # Sets @out to an empty string.
      #
      # See the HTML Encoder for an example of option caching.
      def setup options
        @out = ''
      end

      # Called with +text+ and +kind+ of the currently scanned token.
      # For simple encoders, it's enough to implement this method.
      #
      # Raises a NotImplementedError exception if it is not overwritten in
      # subclass.
      def token text, kind
        raise NotImplementedError, "#{self.class}#token not implemented."
      end

      # Called with merged options after encoding ends.
      # The return value is the result of encoding, typically @out.
      def finish options
        @out
      end

      # Do the encoding.
      #
      # The already created +tokens+ object must be used; it can be a
      # TokenStream or a Tokens object.
      def compile tokens, options
        tokens.each(&self)
      end

    end

  end
end
style = false + 1.upto(cl.size + 1) do |offset| + break if style = cl[styles[offset .. -1]] + end + return style + end + + private + + CSS_CLASS = / + ( (?: # $1 = classes + \s* \. [-\w]+ + )+ ) + \s* \{ + ( [^\}]* ) # $2 = style + \} \s* + | + ( . ) # $3 = error + /mx + def parse stylesheet + stylesheet.scan CSS_CLASS do |classes, style, error| + raise "CSS parse error: '#{error}' not recognized" if error + styles = classes.scan(/[-\w]+/) + cl = styles.pop + @classes[cl] ||= Hash.new + @classes[cl][styles] = style.strip + end + end + + MAIN = <<-'MAIN' +.code { + background-color: #FAFAFA; + border: 1px solid #D1D7DC; + font-family: 'Courier New', 'Terminal', monospace; + font-size: 10pt; + color: black; + vertical-align: top; + text-align: left; + padding: 0px; +} +span.code { white-space: pre; } +.code tt { font-weight: bold; } +.code pre { + font-size: 10pt; + margin: 0px 5px; +} +.code .code_table { + margin: 0px; +} +.code .line_numbers { + margin: 0px; + background-color:#DEF; color: #777; + vertical-align: top; + text-align: right; +} +.code .code_cell { + width: 100%; + background-color:#FAFAFA; + color: black; + vertical-align: top; + text-align: left; +} +.code .no { + background-color:#DEF; + color: #777; + padding: 0px 5px; + font-weight: normal; + font-style: normal; +} + +.code tt { display: hidden; } + + MAIN + + TOKENS = <<-'TOKENS' +.af { color:#00C; } +.an { color:#007; } +.av { color:#700; } +.aw { color:#C00; } +.bi { color:#509; font-weight:bold; } +.c { color:#888; } + +.ch { color:#04D; /* background-color:#f0f0ff; */ } +.ch .k { color:#04D; } +.ch .dl { color:#039; } + +.cl { color:#B06; font-weight:bold; } +.co { color:#036; font-weight:bold; } +.cr { color:#0A0; } +.cv { color:#369; } +.df { color:#099; font-weight:bold; } +.di { color:#088; font-weight:bold; } +.dl { color:black; } +.do { color:#970; } +.ds { color:#D42; font-weight:bold; } +.e { color:#666; font-weight:bold; } +.er { color:#F00; background-color:#FAA; } +.ex { 
module CodeRay module Encoders

  class HTML

    # Maps each token kind to the short CSS class name used for it.
    # Kinds mapped to :NO_HIGHLIGHT carry that marker instead of a class
    # name. Unknown kinds fall back to the class of :error.
    ClassOfKind = {
      attribute_name: 'an',
      attribute_name_fat: 'af',
      attribute_value: 'av',
      attribute_value_fat: 'aw',
      bin: 'bi',
      char: 'ch',
      class: 'cl',
      class_variable: 'cv',
      color: 'cr',
      comment: 'c',
      constant: 'co',
      content: 'k',
      definition: 'df',
      delimiter: 'dl',
      directive: 'di',
      doc: 'do',
      doc_string: 'ds',
      error: 'er',
      escape: 'e',
      exception: 'ex',
      float: 'fl',
      function: 'fu',
      global_variable: 'gv',
      hex: 'hx',
      include: 'ic',
      instance_variable: 'iv',
      integer: 'i',
      interpreted: 'in',
      label: 'la',
      local_variable: 'lv',
      modifier: 'mod',
      oct: 'oc',
      operator_name: 'on',
      pre_constant: 'pc',
      pre_type: 'pt',
      predefined: 'pd',
      preprocessor: 'pp',
      regexp: 'rx',
      reserved: 'r',
      shell: 'sh',
      string: 's',
      symbol: 'sy',
      tag: 'ta',
      tag_fat: 'tf',
      tag_special: 'ts',
      type: 'ty',
      variable: 'v',
      xml_text: 'xt',

      ident: :NO_HIGHLIGHT,     # 'id'
      operator: :NO_HIGHLIGHT,  # 'op'
      space: :NO_HIGHLIGHT,     # 'sp'
      plain: :NO_HIGHLIGHT,
    }

    # Kinds that share the class of another kind. :escape is listed last on
    # purpose: it overrides the 'e' entry from the literal above.
    [
      [:function,  [:method, :procedure]],
      [:delimiter, [:close, :open, :nesting_delimiter, :escape]],
    ].each do |source_kind, alias_kinds|
      alias_kinds.each do |alias_kind|
        ClassOfKind[alias_kind] = ClassOfKind[source_kind]
      end
    end

    # Anything not listed is rendered with the :error class.
    fallback_class = ClassOfKind[:error]
    ClassOfKind.default = fallback_class
    raise 'no class found for :error!' unless fallback_class

  end

end end
:page + def new string, element = nil + output = string.clone.extend self + output.wrapped_in = element + output + end + + # Raises an exception if an object that doesn't respond to to_str is extended by Output, + # to prevent users from misuse. Use Module#remove_method to disable. + def extended o + warn "The Output module is intended to extend instances of String, not #{o.class}." unless o.respond_to? :to_str + end + + def page_template_for_css css = :default + css = CSS::DEFAULT_STYLESHEET if css == :default + PAGE.apply 'CSS', css + end + + # Define a new wrapper. This is meta programming. + def wrapper *wrappers + wrappers.each do |wrapper| + define_method wrapper do |*args| + wrap wrapper, *args + end + define_method(:"#{wrapper}!") do |*args| + wrap! wrapper, *args + end + end + end + end + + wrapper :div, :span, :page + + def wrapped_in + @wrapped_in || nil + end + attr_writer :wrapped_in + + def wrapped_in? element + wrapped_in == element + end + + def wrap_in template + clone.wrap_in! template + end + + def wrap_in! template + Template.wrap! self, template, 'CONTENT' + self + end + + def wrap! element, *args + return self if not element or element == wrapped_in + case element + when :div + raise "Can't wrap %p in %p" % [wrapped_in, element] unless wrapped_in? nil + wrap_in! DIV + when :span + raise "Can't wrap %p in %p" % [wrapped_in, element] unless wrapped_in? nil + wrap_in! SPAN + when :page + wrap! :div if wrapped_in? nil + raise "Can't wrap %p in %p" % [wrapped_in, element] unless wrapped_in? :div + wrap_in! Output.page_template_for_css + when nil + return self + else + raise "Unknown value %p for :wrap" % element + end + @wrapped_in = element + self + end + + def wrap *args + clone.wrap!(*args) + end + + def numerize! mode = :table, options = {} + return self unless mode + + offset = options.fetch :line_numbers_offset, DEFAULT_OPTIONS[:line_numbers_offset] + unless offset.is_a? 
Integer + raise ArgumentError, "Invalid value %p for :offset; Integer expected." % offset + end + + unless NUMERIZABLE_WRAPPINGS.include? options[:wrap] + raise ArgumentError, "Can't numerize, :wrap must be in %p, but is %p" % [NUMERIZABLE_WRAPPINGS, options[:wrap]] + end + + bold_every = options.fetch :bold_every, DEFAULT_OPTIONS[:bold_every] + bolding = + if bold_every == :no_bolding or bold_every == 0 + proc { |line| line.to_s } + elsif bold_every.is_a? Integer + proc do |line| + if line % bold_every == 0 + "#{line}" # every bold_every-th number in bold + else + line.to_s + end + end + else + raise ArgumentError, "Invalid value %p for :bolding; :no_bolding or Integer expected." % bolding + end + + line_count = count("\n") + line_count += 1 if self[-1] != ?\n + + case mode + when :inline + max_width = line_count.to_s.size + line = offset - 1 + gsub!(/^/) do + line += 1 + line_number = bolding.call line + "#{ line_number.rjust(max_width) } " + end + wrap! :div + + when :table + # This is really ugly. + # Because even monospace fonts seem to have different heights when bold, + # I make the newline bold, both in the code and the line numbers. + # FIXME Still not working perfect for Mr. Internet Exploder + line_numbers = (offset ... offset + line_count).to_a.map(&bolding).join("\n") + line_numbers << "\n" # also for Mr. MS Internet Exploder :-/ + line_numbers.gsub!(/\n/) { "\n" } + + line_numbers_tpl = DIV_TABLE.apply('LINE_NUMBERS', line_numbers) + gsub!(/\n/) { "\n" } + wrap_in! line_numbers_tpl + @wrapped_in = :div + + else + raise ArgumentError, "Unknown value %p for mode: :inline or :table expected" % mode + end + + self + end + + def numerize *args + clone.numerize!(*args) + end + + class Template < String + + def self.wrap! 
str, template, target + target = Regexp.new(Regexp.escape("<%#{target}%>")) + if template =~ target + str[0,0] = $` + str << $' + else + raise "Template target <%%%p%%> not found" % target + end + end + + def apply target, replacement + target = Regexp.new(Regexp.escape("<%#{target}%>")) + if self =~ target + Template.new($` + replacement + $') + else + raise "Template target <%%%p%%> not found" % target + end + end + + module Simple + def ` str #` + Template.new str + end + end + end + + extend Template::Simple + +#-- don't include the templates in docu + + SPAN = `<%CONTENT%>` + + DIV, DIV_TABLE, PAGE = + <<-`DIV`, <<-`DIV_TABLE`, <<-`PAGE` + +
+
<%CONTENT%>
+
+ DIV + +
+ + + + + +
<%LINE_NUMBERS%>
<%CONTENT%>
+
+ DIV_TABLE + + + + + + CodeRay HTML Encoder Example + + + +<%CONTENT%> + + + PAGE + + end + + end + +end +end diff --git a/lib/coderay/encoders/html.rb b/lib/coderay/encoders/html.rb new file mode 100644 index 0000000..69b6e22 --- /dev/null +++ b/lib/coderay/encoders/html.rb @@ -0,0 +1,167 @@ +module CodeRay +module Encoders + + class HTML < Encoder + + include Streamable + register_for :html + + FILE_EXTENSION = 'html' + + DEFAULT_OPTIONS = { + :tab_width => 8, + + :level => :xhtml, + :css => :class, + + :wrap => :page, + :line_numbers => :table, + :line_numbers_offset => 1, + :bold_every => 10, + } + NUMERIZABLE_WRAPPINGS = [:div, :page] + + require 'coderay/encoders/helpers/html_helper' + require 'coderay/encoders/helpers/html_output' + require 'coderay/encoders/helpers/html_css' + + def initialize(*) + super + @last_options = nil + end + + protected + + HTML_ESCAPE = { #:nodoc: + '&' => '&', + '"' => '"', + '>' => '>', + '<' => '<', + } + + # This is to prevent illegal HTML. + # Strange chars should still be avoided in codes. + evil_chars = Array(0x00...0x20) - [?n, ?t] + evil_chars.each { |i| HTML_ESCAPE[i.chr] = ' ' } + ansi_chars = Array(0x7f..0xff) + ansi_chars.each { |i| HTML_ESCAPE[i.chr] = '&#%d;' % i } + # \x9 (\t) and \xA (\n) not included + HTML_ESCAPE_PATTERN = /[&"><\0-\x8\xB-\x1f\x7f-\xff]/ + + def setup options + if options[:line_numbers] and not NUMERIZABLE_WRAPPINGS.include? options[:wrap] + warn ':line_numbers wanted, but :wrap is %p' % options[:wrap] + end + super + return if options == @last_options + @last_options = options + + @HTML_ESCAPE = HTML_ESCAPE.dup + @HTML_ESCAPE["\t"] = ' ' * options[:tab_width] + + @opened = [nil] + @css = CSS.new + + case options[:css] + + when :class + @css_style = Hash.new do |h, k| + if k.is_a? 
Array + type = k.first + else + type = k + end + c = ClassOfKind[type] + if c == :NO_HIGHLIGHT + h[k] = false + else + if options[:debug] + debug_info = ' title="%p"' % [ k ] + else + debug_info = '' + end + h[k] = '' % [debug_info, c] + end + end + + when :style + @css_style = Hash.new do |h, k| + if k.is_a? Array + styles = k.dup + else + styles = [k] + end + styles.map! { |c| ClassOfKind[c] } + if styles.first == :NO_HIGHLIGHT + h[k] = false + else + if options[:debug] + debug_info = ' title="%s"' % [ styles.inspect.gsub(/#{HTML_ESCAPE_PATTERN}/o) { |m| @HTML_ESCAPE[m] } ] + else + debug_info = '' + end + style = @css[*styles] + h[k] = + if style + '' % [debug_info, style] + else + false + end + end + end + + else + raise "Unknown value %p for :css." % options[:css] + + end + end + + def finish options + not_needed = @opened.shift + @out << '' * @opened.size + + @out.extend Output + @out.numerize! options[:line_numbers], options # if options[:line_numbers] + @out.wrap! options[:wrap] # if options[:wrap] + + #require 'pp' + #pp @css_style, @css_style.size + + super + end + + def token text, type + if text.is_a? String + # be careful when streaming: text is changed! + text.gsub!(/#{HTML_ESCAPE_PATTERN}/o) { |m| @HTML_ESCAPE[m] } + @opened[0] = type + style = @css_style[@opened] + if style + @out << style << text << '' + else + @out << text + end + else + case text + when :open + @opened[0] = type + @out << @css_style[@opened] + @opened << type + when :close + unless @opened.empty? + raise 'Not Token to be closed.' 
unless @opened.size > 1 + @out << '' + @opened.pop + end + when nil + raise 'Token with nil as text was given: %p' % [[text, type]] + else + raise 'unknown token kind: %p' % text + end + end + end + + end + +end +end diff --git a/lib/coderay/encoders/null.rb b/lib/coderay/encoders/null.rb new file mode 100644 index 0000000..67c4987 --- /dev/null +++ b/lib/coderay/encoders/null.rb @@ -0,0 +1,20 @@ +module CodeRay + module Encoders + + class Null < Encoder + + include Streamable + register_for :null + + protected + + def token(*) + # do nothing + end + + end + + end +end + + diff --git a/lib/coderay/encoders/span.rb b/lib/coderay/encoders/span.rb new file mode 100644 index 0000000..a7715f4 --- /dev/null +++ b/lib/coderay/encoders/span.rb @@ -0,0 +1,17 @@ +module CodeRay module Encoders + + require 'coderay/encoders/html' + class Span < HTML + + FILE_EXTENSION = 'span.html' + + register_for :span + + DEFAULT_OPTIONS = HTML::DEFAULT_OPTIONS.merge({ + :css => :style, + :wrap => :span, + :line_numbers => nil, + }) + end + +end end diff --git a/lib/coderay/encoders/statistic.rb b/lib/coderay/encoders/statistic.rb new file mode 100644 index 0000000..0685c03 --- /dev/null +++ b/lib/coderay/encoders/statistic.rb @@ -0,0 +1,74 @@ +module CodeRay module Encoders + + # Makes a statistic for the given tokens. + class Statistic < Encoder + + include Streamable + register_for :stats, :statistic + + attr_reader :type_stats, :real_token_count + + protected + + TypeStats = Struct.new :count, :size + + def setup options + @type_stats = Hash.new { |h, k| h[k] = TypeStats.new 0, 0 } + @real_token_count = 0 + end + + def generate tokens, options + @tokens = tokens + super + end + + def token text, type + @type_stats['TOTAL'].count += 1 + if text.is_a? 
module CodeRay
  module Encoders

    # Encodes the tokens as plain text: the text of every token is written
    # to the output, followed by the :separator option (empty by default).
    # Tokens whose text is not string-like (such as :open and :close
    # markers) are skipped.
    class Text < Encoder

      include Streamable
      register_for :text

      FILE_EXTENSION = 'txt'

      DEFAULT_OPTIONS = {
        :separator => ''
      }

    protected

      # Remembers the separator for the current encode run.
      def setup options
        super
        @sep = options[:separator]
      end

      # Appends +text+ and the separator to the output.
      def token text, kind
        # respond_to? (with the question mark) is the predicate; the
        # previous call to respond_to raised NoMethodError for every token.
        return unless text.respond_to? :to_str
        @out << text << @sep
      end

      # Returns the output without the separator that follows the last
      # token.
      def finish options
        # String#chomp('') would strip trailing newlines that belong to the
        # code, so an empty separator must leave the output untouched.
        return @out if @sep.empty? || !@out.end_with?(@sep)
        @out[0, @out.size - @sep.size]
      end

    end

  end
end
# =FileType
#
# A simple filetype recognizer
#
# Author: murphy (mail to murphy cYcnus de)
#
# Version: 0.1 (2005.september.1)
#
# ==Documentation
#
# FileType[filename] looks the file up by extension and by name;
# FileType.fetch does the same, but works like Hash#fetch for unknown
# files.
#
module FileType

  # Raised by FileType.fetch if the type cannot be determined and neither a
  # default nor a block was given.
  #
  # Derived from StandardError so that a plain +rescue+ catches it.
  UnknownFileType = Class.new StandardError

  class << self

    # Returns the type of +filename+ as a Symbol, or nil if it is unknown.
    #
    # The extension is tried first (exact, then downcased), then the base
    # name (exact, then downcased). If +read_shebang+ is true and nothing
    # matched, the first line of the file is examined, see shebang.
    def [] filename, read_shebang = false
      name = File.basename filename
      ext = File.extname(name).sub(/\A\./, '')  # delete the leading dot

      type =
        TypeFromExt[ext] ||
        TypeFromExt[ext.downcase] ||
        TypeFromName[name] ||
        TypeFromName[name.downcase]
      type ||= shebang(filename) if read_shebang

      type
    end

    # Reads the first line of +filename+ and returns the interpreter named
    # in its shebang (#!) line as a Symbol, like the table lookups do.
    #
    # Returns nil if the file cannot be read, is empty, does not start with
    # a shebang, or names no interpreter matched by TypeFromShebang.
    def shebang filename
      # Binary mode: only the ASCII prefix matters, and arbitrary bytes
      # must not cause encoding errors while matching.
      first_line = File.open(filename, 'rb') { |f| f.gets }
      return nil unless first_line && first_line.start_with?('#!')
      interpreter = first_line[TypeFromShebang]
      interpreter && interpreter.to_sym
    rescue SystemCallError, IOError
      # Missing files, directories and permission problems raise Errno::*
      # (SystemCallError), which IOError alone does not cover.
      nil
    end

    # This works like Hash#fetch.
    #
    # Returns the type of +filename+ if it is known. Otherwise returns the
    # result of the block if one is given, else +default+ if it is truthy,
    # else raises UnknownFileType.
    def fetch filename, default = nil, read_shebang = false
      if default and block_given?
        warn 'block supersedes default value argument'
      end

      type = self[filename, read_shebang]
      return type if type
      return yield if block_given?
      return default if default
      raise UnknownFileType, 'Could not determine type of %p.' % [filename]
    end

  end

  TypeFromExt = {
    'rb' => :ruby,
    'rbw' => :ruby,
    'cpp' => :cpp,
    'c' => :c,
    'h' => :c,
    'xml' => :xml,
    'htm' => :html,
    'html' => :html,
  }

  TypeFromShebang = /\b(?:ruby|perl|python|sh)\b/

  TypeFromName = {
    'Rakefile' => :ruby,
    'Rantfile' => :ruby,
  }

end
File.directory? dir + Dir.chdir dir do + assert_equal :c, FileType['test.c'] + end + end + end + +end diff --git a/lib/coderay/helpers/gzip_simple.rb b/lib/coderay/helpers/gzip_simple.rb new file mode 100644 index 0000000..02d1ffd --- /dev/null +++ b/lib/coderay/helpers/gzip_simple.rb @@ -0,0 +1,123 @@ +# =GZip Simple +# +# A simplified interface to the gzip library +zlib+ (from the Ruby Standard Library.) +# +# Author: murphy (mail to murphy cYcnus de) +# +# Version: 0.2 (2005.may.28) +# +# ==Documentation +# +# See +GZip+ module and the +String+ extensions. +# +module GZip + + require 'zlib' + + # The default zipping level. 7 zips good and fast. + DEFAULT_GZIP_LEVEL = 7 + + # Unzips the given string +s+. + # + # Example: + # require 'gzip_simple' + # print GZip.gunzip(File.read('adresses.gz')) + # + def GZip.gunzip s + Zlib::Inflate.inflate s + end + + # Zips the given string +s+. + # + # Example: + # require 'gzip_simple' + # File.open('adresses.gz', 'w') do |file + # file.write GZip.gzip('Mum: 0123 456 789', 9) + # end + # + # If you provide a +level+, you can control how strong + # the string is compressed: + # - 0: no compression, only convert to gzip format + # - 1: compress fast + # - 7: compress more, but still fast (default) + # - 8: compress more, slower + # - 9: compress best, very slow + def GZip.gzip s, level = DEFAULT_GZIP_LEVEL + Zlib::Deflate.new(level).deflate s, Zlib::FINISH + end +end + +# String extensions to use the GZip module. +# +# The methods gzip and gunzip provide an even more simple +# interface to the ZLib: +# +# # create a big string +# x = 'a' * 1000 +# +# # zip it +# x_gz = x.gzip +# +# # test the result +# puts 'Zipped %d bytes to %d bytes.' % [x.size, x_gz.size] +# #-> Zipped 1000 bytes to 19 bytes. +# +# # unzipping works +# p x_gz.gunzip == x #-> true +class String + # Returns the string, unzipped. + # See GZip.gunzip + def gunzip + GZip.gunzip self + end + # Replaces the string with its unzipped value. 
# A WordList is a Hash with some additional features.
# It is intended to be used for keyword recognition.
#
# Words are stored as keys, their kind as value. Looking up a word that was
# never added returns the +default+ given to the constructor.
class WordList < Hash

  # Creates a new, empty WordList.
  #
  # * +default+ is the value returned for unknown words.
  # * +case_mode+ is either :case_match (lookups are case sensitive) or
  #   :case_ignore (words are stored downcased and lookups ignore case).
  #
  # Raises ArgumentError for any other +case_mode+.
  def initialize default = false, case_mode = :case_match
    @case_ignore =
      case case_mode
      when :case_match then false
      when :case_ignore then true
      else
        raise ArgumentError,
          "#{self.class.name}.new: second argument must be :case_ignore or :case_match, but #{case_mode} was given."
      end

    if @case_ignore
      # Missing keys are resolved through their downcased form. The result is
      # deliberately NOT stored back into the hash: storing it would turn
      # every word that was ever looked up into a key, which breaks
      # +include?+ and +words+.
      super() do |h, k|
        h.fetch k.downcase, default
      end
    else
      super default
    end
  end

  # Returns true if +word+ was added to the list (respecting the case mode).
  def include? word
    has_key?(@case_ignore ? word.downcase : word)
  end

  # Adds all +words+ with the given +kind+ and returns the list itself, so
  # calls can be chained.
  def add words, kind = true
    words.each do |word|
      self[mind_case(word)] = kind
    end
    self
  end

  # All words that were added to the list.
  alias words keys

  # Returns true if the list ignores the case of words.
  def case_ignore?
    @case_ignore
  end

private
  # Normalizes +word+ for storage: downcased in case-ignore mode, otherwise
  # a copy of the word, so the stored key is independent of the caller's
  # string.
  def mind_case word
    if @case_ignore
      word.downcase
    else
      word.dup
    end
  end

end
# Returns the scanner class for +lang+, loading its file on demand.
#
# +lang+ is first converted with +normalize+. If a scanner is already
# registered under that name, it is returned right away. Otherwise the file
# named by +path_to+ is required; the scanner is expected to register itself
# while being loaded.
#
# Raises ScannerNotFound if the file cannot be loaded, or if no scanner is
# registered for +lang+ after loading it.
def [] lang
  key = normalize lang
  return SCANNERS.fetch(key) if SCANNERS.has_key? key

  scanner_file = path_to key
  begin
    require scanner_file
  rescue LoadError
    raise ScannerNotFound, "File #{scanner_file} not found."
  end

  # The scanner should have registered itself by now.
  unless SCANNERS.has_key? key
    raise ScannerNotFound, <<-ERR
No scanner for #{key} found in #{scanner_file}.
Known scanners: #{SCANNERS}
    ERR
  end
  SCANNERS.fetch key
end
# Returns if the Scanner can be used in streaming mode.
#
# A scanner class is streamable when it (or one of its ancestors) includes
# the Streamable module. The check has to be made with Module#include?:
# +is_a?+ would test the class object itself, which is never a Streamable
# just because its instances are.
def streamable?
  include? Streamable
end
# The current line position of the scanner, starting with 1.
#
# Beware, this is implemented inefficiently. It should be used
# for debugging only.
def line
  # Exclusive range: the character at +pos+ has not been consumed yet, so a
  # newline sitting exactly at +pos+ still belongs to the current line.
  string[0...pos].count("\n") + 1
end
# Scanner error with additional status information.
#
# Raises a ScanError whose message contains +msg+, the last ten +tokens+,
# the scanner position and up to +ambit+ characters of code on each side of
# the current position.
def raise_inspect msg, tokens, ambit = 30
  # Clamp the start of the "code before" window to the beginning of the
  # input. A negative start index would be counted from the END of the
  # string and report unrelated code (or nil) near the start of the input.
  before_start = [pos - ambit, 0].max
  raise ScanError, <<-EOE % [


***ERROR in %s: %s

tokens:
%s

current line: %d pos = %d
matched: %p
bol? = %p, eos? = %p

surrounding code:
%p ~~ %p


***ERROR***

  EOE
    File.basename(caller[0]),
    msg,
    tokens.last(10).map { |t| t.inspect }.join("\n"),
    line, pos,
    matched, bol?, eos?,
    string[before_start...pos],
    string[pos, ambit],
  ]
end
+ kind = :comment + + elsif scan(/ [-+*\/=<>?:;,!&^|()\[\]{}~%]+ | \.(?!\d) /x) + kind = :operator + + elsif match = scan(/ [A-Za-z_][A-Za-z_0-9]* /x) + kind = IDENT_KIND[match] + if kind == :ident and check(/:(?!:)/) + match << scan(/:/) + kind = :label + end + + elsif match = scan(/L?"/) + tokens << [:open, :string] + if match[0] == ?L + tokens << ['L', :modifier] + match = '"' + end + state = :string + kind = :delimiter + + elsif scan(/#\s*(\w*)/) + kind = :preprocessor # FIXME multiline preprocs + state = :include_expected if self[1] == 'include' + + elsif scan(/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? '? /ox) + kind = :char + + elsif scan(/0[xX][0-9A-Fa-f]+/) + kind = :hex + + elsif scan(/(?:0[0-7]+)(?![89.eEfF])/) + kind = :oct + + elsif scan(/(?:\d+)(?![.eEfF])/) + kind = :integer + + elsif scan(/\d[fF]?|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/) + kind = :float + + else + getch + end + + elsif state == :string + if scan(/[^\\"]+/) + kind = :content + elsif scan(/"/) + tokens << ['"', :delimiter] + tokens << [:close, :string] + state = :initial + next + elsif scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox) + kind = :char + elsif scan(/ \\ | $ /x) + kind = :error + state = :initial + else + raise "else case \" reached; %p not handled." 
# Scanner for Delphi source code.
#
# Identifiers are classified through IDENT_KIND, which ignores case.
class Delphi < Scanner

  register_for :delphi

  RESERVED_WORDS = [
    'and', 'array', 'as', 'at', 'asm', 'begin', 'case', 'class',
    'const', 'constructor', 'destructor', 'dispinterface', 'div', 'do',
    'downto', 'else', 'end', 'except', 'exports', 'file', 'finalization',
    'finally', 'for', 'function', 'goto', 'if', 'implementation', 'in',
    'inherited', 'initialization', 'inline', 'interface', 'is', 'label',
    'library', 'mod', 'nil', 'not', 'object', 'of', 'or', 'out', 'packed',
    'procedure', 'program', 'property', 'raise', 'record', 'repeat',
    'resourcestring', 'set', 'shl', 'shr', 'string', 'then', 'threadvar',
    'to', 'try', 'type', 'unit', 'until', 'uses', 'var', 'while', 'with',
    'xor', 'on'
  ]

  DIRECTIVES = [
    'absolute', 'abstract', 'assembler', 'at', 'automated', 'cdecl',
    'contains', 'deprecated', 'dispid', 'dynamic', 'export',
    'external', 'far', 'forward', 'implements', 'local',
    'near', 'nodefault', 'on', 'overload', 'override',
    'package', 'pascal', 'platform', 'private', 'protected', 'public',
    'published', 'read', 'readonly', 'register', 'reintroduce',
    'requires', 'resident', 'safecall', 'stdcall', 'stored', 'varargs',
    'virtual', 'write', 'writeonly'
  ]

  # Maps an identifier to its token kind. The directives are added last, so
  # words that are in both lists ('at', 'on') end up as :directive.
  IDENT_KIND = Scanner::WordList.new(:ident, :case_ignore).
    add(RESERVED_WORDS, :reserved).
    add(DIRECTIVES, :directive)

  # Scans the whole input and appends [text, kind] pairs to +tokens+.
  # Strings and character literals are wrapped in [:open, kind] and
  # [:close, kind] markers.
  #
  # Returns +tokens+. Input that cannot be tokenized is reported through
  # raise_inspect, together with the scanner state.
  def scan_tokens tokens, options

    state = :initial

    until eos?

      kind = :error
      match = nil

      if state == :initial

        if scan(/ \s+ /x)
          kind = :space

        elsif scan(%r! \{ \$ [^}]* \}? | \(\* \$ (?: .*? \*\) | .* ) !mx)
          kind = :preprocessor

        elsif scan(%r! // [^\n]* | \{ [^}]* \}? | \(\* (?: .*? \*\) | .* ) !mx)
          kind = :comment

        elsif scan(/ [-+*\/=<>:;,.@\^|\(\)\[\]]+ /x)
          kind = :operator

        elsif match = scan(/ [A-Za-z_][A-Za-z_0-9]* /x)
          kind = IDENT_KIND[match]

        elsif match = scan(/ ' ( [^\n']|'' ) (?:'|$) /x)
          # A quoted single character (or '') is emitted as a complete
          # :char group at once.
          tokens << [:open, :char]
          tokens << ["'", :delimiter]
          tokens << [self[1], :content]
          tokens << ["'", :delimiter]
          tokens << [:close, :char]
          next

        elsif match = scan(/ ' /x)
          tokens << [:open, :string]
          state = :string
          kind = :delimiter

        elsif scan(/ \# (?: \d+ | \$[0-9A-Fa-f]+ ) /x)
          kind = :char

        elsif scan(/ \$ [0-9A-Fa-f]+ /x)
          kind = :hex

        elsif scan(/ (?: \d+ ) (?![eE]|\.[^.]) /x)
          kind = :integer

        elsif scan(/ \d+ (?: \.\d+ (?: [eE][+-]? \d+ )? | [eE][+-]? \d+ ) /x)
          kind = :float

        else
          # Unknown character: kind stays :error and is reported below.
          getch
        end

      elsif state == :string
        if scan(/[^\n']+/)
          kind = :content
        elsif scan(/''/)
          kind = :char
        elsif scan(/'/)
          tokens << ["'", :delimiter]
          tokens << [:close, :string]
          state = :initial
          next
        elsif scan(/\n/)
          # Unterminated string: kind stays :error and is reported below.
          state = :initial
        else
          # raise only accepts an exception class or object as first
          # argument; a String followed by +tokens+ would end in a TypeError.
          raise_inspect "else case \' reached; %p not handled." % peek(1), tokens
        end

      else
        raise_inspect 'else-case reached', tokens

      end

      match ||= matched
      if kind == :error
        raise_inspect 'Error token %p in line %d' % [[match, kind], line], tokens
      end

      tokens << [match, kind]

    end

    tokens
  end

end
ensure module redo super until + BEGIN break do next rescue then + when END case else for retry + while alias class elsif if not return + undef yield + ] + + DEF_KEYWORDS = %w[ def ] + MODULE_KEYWORDS = %w[class module] + DEF_NEW_STATE = WordList.new(:initial). + add(DEF_KEYWORDS, :def_expected). + add(MODULE_KEYWORDS, :module_expected) + + IDENTS_ALLOWING_REGEXP = %w[ + and or not while until unless if then elsif when sub sub! gsub gsub! scan slice slice! split + ] + REGEXP_ALLOWED = WordList.new(false). + add(IDENTS_ALLOWING_REGEXP, :set) + + PREDEFINED_CONSTANTS = %w[ + nil true false self + DATA ARGV ARGF __FILE__ __LINE__ + ] + + IDENT_KIND = WordList.new(:ident). + add(RESERVED_WORDS, :reserved). + add(PREDEFINED_CONSTANTS, :pre_constant) + +# IDENT = /[a-zA-Z_][a-zA-Z_0-9]*/ + IDENT = /[a-z_][\w_]*/i + + METHOD_NAME = / #{IDENT} [?!]? /ox + METHOD_NAME_EX = / + #{IDENT}[?!=]? # common methods: split, foo=, empty?, gsub! + | \*\*? # multiplication and power + | [-+]@? # plus, minus + | [\/%&|^`~] # division, modulo or format strings, &and, |or, ^xor, `system`, tilde + | \[\]=? # array getter and setter + | << | >> # append or shift left, shift right + | <=?>? | >=? # comparison, rocket operator + | ===? # simple equality and case equality + /ox + INSTANCE_VARIABLE = / @ #{IDENT} /ox + CLASS_VARIABLE = / @@ #{IDENT} /ox + OBJECT_VARIABLE = / @@? #{IDENT} /ox + GLOBAL_VARIABLE = / \$ (?: #{IDENT} | [1-9] | 0[a-zA-Z_0-9]* | [~&+`'=\/,;_.<>!@$?*":\\] | -[a-zA-Z_0-9] ) /ox + PREFIX_VARIABLE = / #{GLOBAL_VARIABLE} |#{OBJECT_VARIABLE} /ox + VARIABLE = / @?@? 
#{IDENT} | #{GLOBAL_VARIABLE} /ox + + QUOTE_TO_TYPE = { + '`' => :shell, + '/'=> :regexp, + } + QUOTE_TO_TYPE.default = :string + + REGEXP_MODIFIERS = /[mixounse]*/ + REGEXP_SYMBOLS = / + [|?*+?(){}\[\].^$] + /x + + DECIMAL = /\d+(?:_\d+)*/ # doesn't recognize 09 as octal error + OCTAL = /0_?[0-7]+(?:_[0-7]+)*/ + HEXADECIMAL = /0x[0-9A-Fa-f]+(?:_[0-9A-Fa-f]+)*/ + BINARY = /0b[01]+(?:_[01]+)*/ + + EXPONENT = / [eE] [+-]? #{DECIMAL} /ox + FLOAT_OR_INT = / #{DECIMAL} (?: #{EXPONENT} | \. #{DECIMAL} #{EXPONENT}? )? /ox + FLOAT = / #{DECIMAL} (?: #{EXPONENT} | \. #{DECIMAL} #{EXPONENT}? ) /ox + NUMERIC = / #{OCTAL} | #{HEXADECIMAL} | #{BINARY} | #{FLOAT_OR_INT} /ox + + SYMBOL = / + : + (?: + #{METHOD_NAME_EX} + | #{PREFIX_VARIABLE} + | ['"] + ) + /ox + + # TODO investigste \M, \c and \C escape sequences + # (?: M-\\C-|C-\\M-|M-\\c|c\\M-|c|C-|M-)? (?: \\ (?: [0-7]{3} | x[0-9A-Fa-f]{2} | . ) ) + # assert_equal(225, ?\M-a) + # assert_equal(129, ?\M-\C-a) + ESCAPE = / + [abefnrstv] + | M-\\C-|C-\\M-|M-\\c|c\\M-|c|C-|M- + | [0-7]{1,3} + | x[0-9A-Fa-f]{1,2} + | . + /mx + + CHARACTER = / + \? + (?: + [^\s\\] + | \\ #{ESCAPE} + ) + /mx + + # NOTE: This is not completel correct, but + # nobody needs heredoc delimiters ending with \n. + HEREDOC_OPEN = / + << (-)? # $1 = float + (?: + ( [A-Za-z_0-9]+ ) # $2 = delim + | + ( ["'`] ) # $3 = quote, type + ( [^\n]*? ) \3 # $4 = delim + ) + /mx + + RDOC = / + =begin (?!\S) + .*? + (?: \Z | ^=end (?!\S) [^\n]* ) + /mx + + DATA = / + __END__$ + .*? + (?: \Z | (?=^\#CODE) ) + /mx + + RDOC_DATA_START = / ^=begin (?!\S) | ^__END__$ /x + + FANCY_START = / % ( [qQwWxsr] | (?![\w\s=]) ) (.) 
/mox + + FancyStringType = { + 'q' => [:string, false], + 'Q' => [:string, true], + 'r' => [:regexp, true], + 's' => [:symbol, false], + 'x' => [:shell, true], + 'w' => [:string, :word], + 'W' => [:string, :word], + } + FancyStringType['w'] = FancyStringType['q'] + FancyStringType['W'] = FancyStringType[''] = FancyStringType['Q'] + + class StringState < Struct.new :type, :interpreted, :delim, :heredoc, + :paren, :paren_depth, :pattern + + CLOSING_PAREN = Hash[ *%w[ + ( ) + [ ] + < > + { } + ] ] + + CLOSING_PAREN.values.each { |o| o.freeze } # debug, if I try to change it with << + OPENING_PAREN = CLOSING_PAREN.invert + + STRING_PATTERN = Hash.new { |h, k| + delim, interpreted = *k + delim_pattern = Regexp.escape(delim.dup) + if starter = OPENING_PAREN[delim] + delim_pattern << Regexp.escape(starter) + end + + + special_escapes = + case interpreted + when :regexp_symbols + '| ' + REGEXP_SYMBOLS.source + when :words + '| \s' + end + + h[k] = + if interpreted and not delim == '#' + / (?= [#{delim_pattern}\\] | \# [{$@] #{special_escapes} ) /mx + else + / (?= [#{delim_pattern}\\] #{special_escapes} ) /mx + end + } + + HEREDOC_PATTERN = Hash.new { |h, k| + delim, interpreted, indented = *k + delim_pattern = Regexp.escape(delim.dup) + delim_pattern = / \n #{ '(?>[\ \t]*)' if indented } #{ Regexp.new delim_pattern } $ /x + h[k] = + if interpreted + / (?= #{delim_pattern}() | \\ | \# [{$@] ) /mx + else + / (?= #{delim_pattern}() | \\ ) /mx + end + } + + def initialize kind, interpreted, delim, heredoc = false + if paren = CLOSING_PAREN[delim] + delim, paren = paren, delim + paren_depth = 1 + end + if heredoc + pattern = HEREDOC_PATTERN[ [delim, interpreted, heredoc == :indented] ] + delim = nil + else + pattern = STRING_PATTERN[ [delim, interpreted] ] + end + super kind, interpreted, delim, heredoc, paren, paren_depth, pattern + end + end unless defined? 
# Scanner registered for :mush.
#
# No words are defined yet, so every identifier is scanned as :ident.
# NOTE(review): the lexical rules are the same as in the Delphi scanner and
# look like a starting template; confirm before relying on them.
class Mush < Scanner

  register_for :mush

  RESERVED_WORDS = [
  ]

  # Was referenced by IDENT_KIND without being defined, which made loading
  # this scanner fail with a NameError.
  DIRECTIVES = [
  ]

  IDENT_KIND = Scanner::WordList.new(:ident, :case_ignore).
    add(RESERVED_WORDS, :reserved).
    add(DIRECTIVES, :directive)

  # Scans the whole input and appends [text, kind] pairs to +tokens+.
  # Strings and character literals are wrapped in [:open, kind] and
  # [:close, kind] markers.
  #
  # Returns +tokens+. Input that cannot be tokenized is reported through
  # raise_inspect, together with the scanner state.
  def scan_tokens tokens, options

    state = :initial

    until eos?

      kind = :error
      match = nil

      if state == :initial

        if scan(/ \s+ /x)
          kind = :space

        elsif scan(%r! \{ \$ [^}]* \}? | \(\* \$ (?: .*? \*\) | .* ) !mx)
          kind = :preprocessor

        elsif scan(%r! // [^\n]* | \{ [^}]* \}? | \(\* (?: .*? \*\) | .* ) !mx)
          kind = :comment

        elsif scan(/ [-+*\/=<>:;,.@\^|\(\)\[\]]+ /x)
          kind = :operator

        elsif match = scan(/ [A-Za-z_][A-Za-z_0-9]* /x)
          kind = IDENT_KIND[match]

        elsif match = scan(/ ' ( [^\n']|'' ) (?:'|$) /x)
          # A quoted single character (or '') is emitted as a complete
          # :char group at once.
          tokens << [:open, :char]
          tokens << ["'", :delimiter]
          tokens << [self[1], :content]
          tokens << ["'", :delimiter]
          tokens << [:close, :char]
          next

        elsif match = scan(/ ' /x)
          tokens << [:open, :string]
          state = :string
          kind = :delimiter

        elsif scan(/ \# (?: \d+ | \$[0-9A-Fa-f]+ ) /x)
          kind = :char

        elsif scan(/ \$ [0-9A-Fa-f]+ /x)
          kind = :hex

        elsif scan(/ (?: \d+ ) (?![eE]|\.[^.]) /x)
          kind = :integer

        elsif scan(/ \d+ (?: \.\d+ (?: [eE][+-]? \d+ )? | [eE][+-]? \d+ ) /x)
          kind = :float

        else
          # Unknown character: kind stays :error and is reported below.
          getch
        end

      elsif state == :string
        if scan(/[^\n']+/)
          kind = :content
        elsif scan(/''/)
          kind = :char
        elsif scan(/'/)
          tokens << ["'", :delimiter]
          tokens << [:close, :string]
          state = :initial
          next
        elsif scan(/\n/)
          # Unterminated string: kind stays :error and is reported below.
          state = :initial
        else
          # raise only accepts an exception class or object as first
          # argument; a String followed by +tokens+ would end in a TypeError.
          raise_inspect "else case \' reached; %p not handled." % peek(1), tokens
        end

      else
        raise_inspect 'else-case reached', tokens

      end

      match ||= matched
      if kind == :error
        raise_inspect 'Error token %p in line %d' % [[match, kind], line], tokens
      end

      tokens << [match, kind]

    end

    tokens
  end

end
+ + if state.heredoc and self[1] + match = getch + scan_until(/$/) + tokens << [match, :delimiter] + tokens << [:close, state.type] + state = :initial + next + end + + case match = getch + + when state.delim + if state.paren + state.paren_depth -= 1 + if state.paren_depth > 0 + tokens << [match, :nesting_delimiter] + next + end + end + tokens << [match, :delimiter] + if state.type == :regexp and not eos? + modifiers = scan(/#{REGEXP_MODIFIERS}/ox) + tokens << [modifiers, :modifier] unless modifiers.empty? + if parse_regexp + extended = modifiers.index ?x + tokens, regexp = saved_tokens, tokens + for text, type in regexp + if text.is_a? String + case type + when :content + text.scan(/([^#]+)|(#.*)/) do |plain, comment| + if plain + tokens << [plain, :content] + else + tokens << [comment, :comment] + end + end + when :character + if text[/\\(?:[swdSWDAzZbB]|\d+)/] + tokens << [text, :modifier] + else + tokens << [text, type] + end + else + tokens << [text, type] + end + else + tokens << [text, type] + end + end + first_bake = saved_tokens = nil + end + end + tokens << [:close, state.type] + fancy_allowed = regexp_allowed = false + state = :initial + + when '\\' + if state.interpreted + if esc = scan(/ #{ESCAPE} /ox) + tokens << [match + esc, :char] + else + tokens << [match, :error] + end + else + case m = getch + when state.delim, '\\' + tokens << [match + m, :char] + else + tokens << [match + m, :content] + end + end + + when '#' + case peek(1)[0] + when ?{ + states.push [state, depth, heredocs] + fancy_allowed = regexp_allowed = true + state, depth = :initial, 1 + tokens << [match + getch, :escape] + when ?$, ?@ + tokens << [match, :escape] + last_state = state # scan one token as normal code, then return here + state = :initial + else + raise "else-case # reached; #%p not handled" % peek(1), tokens + end + + when state.paren + state.paren_depth += 1 + tokens << [match, :nesting_delimiter] + + when REGEXP_SYMBOLS + tokens << [match, :function] + + else + raise 
"else-case \" reached; %p not handled, state = %p" % [match, state], tokens + + end + next +# }}} + else +# {{{ + if match = scan(/ [ \t\f]+ | \\? \n | \# .* /x) or + ( bol? and match = scan(/ #{DATA} | #{RDOC} /ox) ) + fancy_allowed = true + case m = match[0] + when ?\s, ?\t, ?\f + match << scan(/\s*/) unless eos? or heredocs + type = :space + when ?\n, ?\\ + type = :space + regexp_allowed = m == ?\n + if heredocs + unscan # heredoc scanning needs \n at start + state = heredocs.shift + tokens << [:open, state.type] + heredocs = nil if heredocs.empty? + next + else + match << scan(/\s*/) unless eos? + end + when ?#, ?=, ?_ + type = :comment + regexp_allowed = true + else + raise "else-case _ reached, because case %p was not handled" % [matched[0].chr], tokens + end + tokens << [match, type] + next + + elsif state == :initial + if match = scan(/ \.\.?\.? | [-+*=>;,|&!\(\)\[\]~^]+ | [\{\}] | :: /x) + if match !~ / [.\)\]\}] \z/x or match =~ /\.\.\.?/ + regexp_allowed = fancy_allowed = :set + end + last_token_dot = :set if match == '.' or match == '::' + type = :operator + unless states.empty? 
+ case match + when '{' + depth += 1 + when '}' + depth -= 1 + if depth == 0 + state, depth, heredocs = *states.pop + type = :escape + end + end + end + + elsif match = scan(/#{METHOD_NAME}/o) + if last_token_dot + type = if match[/^[A-Z]/] then :constant else :ident end + else + type = IDENT_KIND[match] + if type == :ident and match[/^[A-Z]/] + type = :constant + elsif type == :reserved + state = DEF_NEW_STATE[match] + end + end + fancy_allowed = regexp_allowed = REGEXP_ALLOWED[match] + + elsif match = scan(/ ['"] /mx) + tokens << [:open, :string] + type = :delimiter + state = StringState.new :string, match != '\'', match.dup # important for streaming + + elsif match = scan(/#{INSTANCE_VARIABLE}/o) + type = :instance_variable + + elsif regexp_allowed and match = scan(/ \/ /mx) + tokens << [:open, :regexp] + type = :delimiter + interpreted = true + state = StringState.new :regexp, interpreted, match.dup + if parse_regexp + tokens, saved_tokens = [], tokens + end + + elsif match = scan(/#{NUMERIC}/o) + type = if match[/#{FLOAT}/o] then :float else :integer end + + elsif fancy_allowed and match = scan(/#{SYMBOL}/o) + case match[1] + when ?', ?" + tokens << [:open, :symbol] + state = StringState.new :symbol, match[1] == ?", match[1,1] + end + type = :symbol + + elsif fancy_allowed and match = scan(/#{HEREDOC_OPEN}/o) + indented, quote = self[1] == '-', self[3] + delim = self[quote ? 4 : 2] + type = QUOTE_TO_TYPE[quote] + tokens << [:open, type] + tokens << [match, :delimiter] + match = :close + heredoc = StringState.new type, quote != '\'', delim, (indented ? 
:indented : :linestart ) + heredocs ||= [] # create heredocs if empty + heredocs << heredoc + + elsif fancy_allowed and match = scan(/#{FANCY_START}/o) + type, interpreted = *FancyStringType.fetch(self[1]) do + raise 'Unknown fancy string: %%%p' % k, tokens + end + tokens << [:open, type] + state = StringState.new type, interpreted, self[2] + type = :delimiter + + elsif fancy_allowed and match = scan(/#{CHARACTER}/o) + type = :integer + + elsif match = scan(/ [\/%? | >=? # comparison, rocket operator + | << | >> # append or shift left, shift right + | ===? # simple equality and case equality + /ox + GLOBAL_VARIABLE = / \$ (?: #{IDENT} | [1-9] | 0[a-zA-Z_0-9]* | [~&+`'=\/,;_.<>!@$?*":\\] | -[a-zA-Z_0-9] ) /ox + + DOUBLEQ = / " [^"\#\\]* (?: (?: \#\{.*?\} | \#(?:$")? | \\. ) [^"\#\\]* )* "? /mox + SINGLEQ = / ' [^'\\]* (?: \\. [^'\\]* )* '? /mox + STRING = / #{SINGLEQ} | #{DOUBLEQ} /ox + + SHELL = / ` [^`\#\\]* (?: (?: \#\{.*?\} | \#(?:$`)? | \\. ) [^`\#\\]* )* `? /mox + REGEXP =%r! / [^/\#\\]* (?: (?: \#\{.*?\} | \#(?:$/)? | \\. ) [^/\#\\]* )* /? !mox + + DECIMAL = /\d+(?:_\d+)*/ # doesn't recognize 09 as octal error + OCTAL = /0_?[0-7]+(?:_[0-7]+)*/ + HEXADECIMAL = /0x[0-9A-Fa-f]+(?:_[0-9A-Fa-f]+)*/ + BINARY = /0b[01]+(?:_[01]+)*/ + + EXPONENT = / [eE] [+-]? #{DECIMAL} /ox + FLOAT = / #{DECIMAL} (?: #{EXPONENT} | \. #{DECIMAL} #{EXPONENT}? ) / + INTEGER = /#{OCTAL}|#{HEXADECIMAL}|#{BINARY}|#{DECIMAL}/ + + ESCAPE_STRING = / + % (?!\s) + (?: + [qsw] + (?: + \( [^\)\\]* (?: \\. [^\)\\]* )* \)? + | + \[ [^\]\\]* (?: \\. [^\]\\]* )* \]? + | + \{ [^\}\\]* (?: \\. [^\}\\]* )* \}? + | + \< [^\>\\]* (?: \\. [^\>\\]* )* \>? + | + \\ [^\\ ]* \\? + | + ( [^a-zA-Z0-9] ) # $1 + (?:(?!\1)[^\\])* (?: \\. (?:(?!\1)[^\#\\])* )* \1? + ) + | + [QrxWr]? + (?: + \( [^\)\#\\]* (?: (?:\#\{.*?\}|\#|\\.) [^\)\#\\]* )* \)? + | + \[ [^\]\#\\]* (?: (?:\#\{.*?\}|\#|\\.) [^\]\#\\]* )* \]? + | + \{ [^\}\#\\]* (?: (?:\#\{.*?\}|\#|\\.) [^\}\#\\]* )* \}? + | + \< [^\>\#\\]* (?: (?:\#\{.*?\}|\#|\\.) 
[^\>\#\\]* )* \>? + | + \# [^\# \\]* (?: \\. [^\# \\]* )* \#? + | + \\ [^\\\# ]* (?: (?:\#\{.*?\}|\# ) [^\\\# ]* )* \\? + | + ( [^a-zA-Z0-9] ) # $2 + (?:(?!\2)[^\#\\])* (?: (?:\#\{.*?\}|\#|\\.) (?:(?!\2)[^\#\\])* )* \2? + ) + ) + /mox + + SYMBOL = / + : + (?: + #{GLOBAL_VARIABLE} + | @@?#{IDENT} + | #{METHOD_NAME_EX} + | #{STRING} + )/ox + + HEREDOC = / + << (?! [\dc] ) + (?: [^\n]*? << )? + (?: + ([a-zA-Z_0-9]+) + (?: .*? ^\1$ | .* ) + | + -([a-zA-Z_0-9]+) + (?: .*? ^\s*\2$ | .* ) + | + (["\'`]) (.*?) \3 + (?: .*? ^\4$ | .* ) + | + - (["\'`]) (.*?) \5 + (?: .*? ^\s*\6$ | .* ) + ) + /mx + + RDOC = / + =begin (?!\S) [^\n]* \n? + (?: + (?! =end (?!\S) ) + [^\n]* \n? + )* + (?: + =end (?!\S) [^\n]* + )? + /mx + + DATA = / + __END__\n + (?: + (?=\#CODE) + | + .* + ) + / + + private + def scan_tokens tokens, options + + state = :initial + regexp_allowed = true + last_token_dot = false + + until eos? + match = nil + kind = :error + + if scan(/\s+/) # in every state + kind = :space + regexp_allowed = :set if regexp_allowed or matched.index(?\n) # delayed flag setting + + elsif scan(/ \#[^\n]* /x) # in every state + kind = :comment + regexp_allowed = :set if regexp_allowed + + elsif state == :initial + # IDENTIFIERS, KEYWORDS + if scan(GLOBAL_VARIABLE) + kind = :global_variable + elsif scan(/ @@ #{IDENT} /ox) + kind = :class_variable + elsif scan(/ @ #{IDENT} /ox) + kind = :instance_variable + elsif scan(/ #{DATA} | #{RDOC} /ox) + kind = :comment + elsif scan(METHOD_NAME) + match = matched + if last_token_dot + kind = + if match[/^[A-Z]/] + :constant + else + :ident + end + else + kind = IDENT_KIND[match] + if kind == :ident and match[/^[A-Z]/] + kind = :constant + elsif kind == :reserved + state = DEF_NEW_STATE[match] + regexp_allowed = REGEXP_ALLOWED[match] + end + end + + elsif scan(STRING) + kind = :string + elsif scan(SHELL) + kind = :shell + elsif scan(HEREDOC) + kind = :string + elsif check(/\//) and regexp_allowed + scan(REGEXP) + kind = :regexp + elsif 
scan(ESCAPE_STRING) + match = matched + kind = + case match[0] + when ?s + :symbol + when ?r + :regexp + when ?x + :shell + else + :string + end + + elsif scan(/:(?:#{GLOBAL_VARIABLE}|#{METHOD_NAME_EX}|#{STRING})/ox) + kind = :symbol + elsif scan(/ + \? (?: + [^\s\\] + | + \\ (?:M-\\C-|C-\\M-|M-\\c|c\\M-|c|C-|M-))? (?: \\ (?: . | [0-7]{3} | x[0-9A-Fa-f][0-9A-Fa-f] ) + ) + /mx) + kind = :integer + + elsif scan(/ [-+*\/%=<>;,|&!()\[\]{}~?] | \.\.?\.? | ::? /x) + kind = :operator + match = matched + regexp_allowed = :set if match[-1,1] =~ /[~=!<>|&^,\(\[+\-\/\*%]\z/ + last_token_dot = :set if match == '.' or match == '::' + elsif scan(FLOAT) + kind = :float + elsif scan(INTEGER) + kind = :integer + else + getch + end + + elsif state == :def_expected + if scan(/ (?:#{IDENT}::)* (?:#{IDENT}\.)? #{METHOD_NAME_EX} /ox) + kind = :method + else + getch + end + state = :initial + + elsif state == :module_expected + if scan(/< :comment, + :varname => :ident, + :number => :integer, + :ws => :space, + :escnl => :space, + :keyword => :reserved, + :methname => :method, + :renderexactlystring => :regexp, + :string => :string, + } + + def scan_tokens tokens, options + require 'tempfile' + Tempfile.open('~coderay_tempfile') do |file| + file.binmode + file.write code + file.rewind + lexer = RubyLexer.new 'code', file + loop do + begin + tok = lexer.get1token + rescue => kaboom + err = <<-EOE + ERROR!!! +#{kaboom.inspect} +#{kaboom.backtrace.join("\n")} + EOE + tokens << [err, :error] + Kernel.raise + end + break if tok.is_a? EoiToken + next if tok.is_a? FileAndLineToken + kind = tok.class.name[/(.*?)Token$/,1].downcase.to_sym + kind = Translate.fetch kind, kind + text = tok.ident + case kind + when :hereplaceholder + text = tok.ender + kind = :string + when :herebody, :outlinedherebody + text = tok.ident.ident + kind = :string + end + text = text.inspect unless text.is_a? 
# lib/coderay/tokens.rb
module CodeRay

  # The Tokens class represents a list of tokens returned from
  # a Scanner.
  #
  # A token is not a special object, just a two-element Array
  # consisting of
  # * the _token_ _text_ (the original source of the token in a String)
  # * the _token_ _kind_ (a Symbol representing the type of the token)
  #
  # A token looks like this:
  #
  #   ['# It looks like this', :comment]
  #   ['3.1415926', :float]
  #   ['äöü', :error]
  #
  # Some scanners also yield some kind of sub-tokens, represented by special
  # token texts, namely :open and :close .
  #
  # The Ruby scanner, for example, splits "a string" into:
  #
  #  [
  #   [:open, :string],
  #   ['"', :delimiter],
  #   ['a string', :content],
  #   ['"', :delimiter],
  #   [:close, :string]
  #  ]
  #
  # Tokens is also the interface between Scanners and Encoders:
  # The input is split and saved into a Tokens object. The Encoder
  # then builds the output from this object.
  #
  # Thus, the syntax below becomes clear:
  #
  #   CodeRay.scan('price = 2.59', :ruby).html
  #   # the Tokens object is here -------^
  #
  # See how small it is? ;)
  #
  # Tokens gives you the power to handle pre-scanned code very easily:
  # You can convert it to a webpage, a YAML file, or dump it into a gzip'ed string
  # that you put in your DB.
  #
  # Tokens' subclass TokenStream allows streaming to save memory.
  class Tokens < Array

    class << self

      # Convert the token to a string.
      #
      # Text tokens become "kind<TAB>escaped text<NL>"; special tokens
      # (whose text is not a String, e.g. :open) become
      # ":text<TAB>kind<TAB><NL>".
      #
      # This format is used by Encoders.Tokens.
      # It can be reverted using read_token.
      def write_token text, type
        if text.is_a? String
          "#{type}\t#{escape(text)}\n"
        else
          ":#{text}\t#{type}\t\n"
        end
      end

      # Read a token from the string.
      #
      # Inversion of write_token: returns the pair [text, kind], so that
      #   read_token(write_token(text, kind)) == [text, kind]
      #
      # Raises ArgumentError if +token+ contains no TAB separator.
      def read_token token
        # Only the single terminating newline is dropped; newlines that
        # belong to the text are preceded by a backslash and survive.
        line = token.sub(/\n\z/, '')
        head, rest = line.split("\t", 2)
        if head.nil? or rest.nil?
          raise ArgumentError, 'Malformed token: %p' % [token]
        end
        if head[0, 1] == ':'
          # Special token: ":text\tkind\t"
          [head[1..-1].to_sym, rest.sub(/\t\z/, '').to_sym]
        else
          [unescape(rest), head.to_sym]
        end
      end

      # Escapes a string for use in write_token:
      # newlines and backslashes get a backslash prepended.
      def escape text
        text.gsub(/[\n\\]/) { |char| "\\#{char}" }
      end

      # Unescapes a string created by escape.
      def unescape text
        text.gsub(/\\[\n\\]/) { |m| m[1, 1] }
      end

    end

    # Whether the object is a TokenStream.
    #
    # Returns false.
    def stream?
      false
    end

    alias :orig_each :each
    # Iterates over all tokens.
    #
    # If a filter is given, only tokens of that kind are yielded.
    def each kind_filter = nil, &block
      return orig_each(&block) unless kind_filter
      orig_each do |text, kind|
        yield text, kind if kind == kind_filter
      end
    end

    # Iterates over all text tokens.
    # Range tokens like [:open, :string] are left out.
    #
    # Example:
    #   tokens.each_text_token { |text, kind| text.replace html_escape(text) }
    def each_text_token
      orig_each do |text, kind|
        next unless text.respond_to? :to_str
        yield text, kind
      end
    end

    # Encode the tokens using encoder.
    #
    # encoder can be
    # * a symbol like :html or :statistic
    # * an Encoder class
    # * an Encoder object
    #
    # options are passed to the encoder.
    def encode encoder, options = {}
      unless encoder.is_a? Encoders::Encoder
        # A class is instantiated directly; anything else is looked up
        # in the Encoders registry.
        encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
        encoder = encoder_class.new options
      end
      encoder.encode_tokens self, options
    end

    # Redirects unknown methods to encoder calls.
    #
    # For example, if you call +tokens.html+, the HTML encoder
    # is used to highlight the tokens.
    #
    # NOTE(review): every unknown method name is handed to Encoders[],
    # so a misspelled method surfaces as whatever Encoders[] raises
    # rather than as a NoMethodError.
    def method_missing meth, options = {}
      Encoders[meth].new(options).encode_tokens self
    end

    # Returns the tokens compressed by joining consecutive
    # tokens of the same kind.
    #
    # This can not be undone, but should yield the same output
    # in most Encoders.  It basically makes the output smaller.
    #
    # The receiver and the Strings of its tokens are left untouched.
    #
    # Combined with dump, it saves database space.
    def optimize
      optimized = self.class.new
      run_text = run_kind = nil
      each do |text, kind|
        if text.is_a? String
          if run_text and kind == run_kind
            run_text << text
          else
            optimized << [run_text, run_kind] if run_text
            # Copy, so that appending never changes the source token.
            run_text = text.dup
            run_kind = kind
          end
        else
          # Special tokens (:open / :close) end the current run.
          optimized << [run_text, run_kind] if run_text
          run_text = run_kind = nil
          optimized << [text, kind]
        end
      end
      optimized << [run_text, run_kind] if run_text
      optimized
    end

    # Optimizes the object itself; see optimize.
    def optimize!
      replace optimize
    end

    # Dumps the object into a String that can be saved
    # in files or databases.
    #
    # The dump is created with Marshal.dump;
    # In addition, it is gzipped using GZip.gzip.
    #
    # The returned String object includes Undumping
    # so it has an #undump method. See Tokens.load.
    #
    # You can configure the level of compression,
    # but the default value 7 should be what you want
    # in most cases as it is a good compromise between
    # speed and compression rate.
    #
    # See GZip module.
    def dump gzip_level = 7
      require 'coderay/helpers/gzip_simple'
      data = Marshal.dump self
      data = data.gzip gzip_level
      data.extend Undumping
    end

    # The total size of the token texts;
    # Should be equal to the input size before
    # scanning.
    #
    # Special tokens like [:open, :string] carry no source text and
    # are not counted.
    def text_size
      total = 0
      each_text_token { |text, _kind| total += text.size }
      total
    end

    # Include this module to give an object an #undump
    # method.
    #
    # The string returned by Tokens.dump includes Undumping.
    module Undumping
      # Calls Tokens.load with itself.
      def undump
        Tokens.load self
      end
    end

    # Unzip the dump using GZip.gunzip, then restore the object
    # using Marshal.load.
    #
    # The result is commonly a Tokens object, but
    # this is not guaranteed.
    #
    # NOTE(review): Marshal.load can instantiate arbitrary objects;
    # only pass dumps that come from a trusted source.
    def Tokens.load dump
      require 'coderay/helpers/gzip_simple'
      Marshal.load dump.gunzip
    end

  end


  # The TokenStream class is a fake Array without elements.
  #
  # It redirects the method << to a block given at creation.
  #
  # This allows scanners and Encoders to use streaming (no
  # tokens are saved, the input is highlighted the same time it
  # is scanned) with the same code.
  #
  # See CodeRay.encode_stream and CodeRay.scan_stream
  class TokenStream < Tokens

    # Whether the object is a TokenStream.
    #
    # Returns true.
    def stream?
      true
    end

    # The Array is empty, but size counts the tokens given by <<.
    attr_reader :size

    # Creates a new TokenStream that calls +block+ whenever
    # its << method is called.
    #
    # Example:
    #
    #   require 'coderay'
    #
    #   token_stream = CodeRay::TokenStream.new do |text, kind|
    #     puts 'kind: %s, text size: %d.' % [kind, text.size]
    #   end
    #
    #   token_stream << ['/\d+/', :regexp]
    #   #-> kind: regexp, text size: 5.
    #
    def initialize &block
      raise ArgumentError, 'Block expected for streaming.' unless block
      @callback = block
      @size = 0
    end

    # Calls +block+ with +token+ and increments size.
    #
    # Returns the stream itself, like Array#<<, so calls can be chained.
    def << token
      @callback.call token
      @size += 1
      self
    end

    # This method is not implemented due to speed reasons. Use Tokens.
    def text_size
      raise NotImplementedError, 'This method is not implemented due to speed reasons.'
    end

    # A TokenStream cannot be dumped. Use Tokens.
    def dump
      raise NotImplementedError, 'A TokenStream cannot be dumped.'
    end

    # A TokenStream cannot be optimized (it holds no tokens). Use Tokens.
    def optimize
      raise NotImplementedError, 'A TokenStream cannot be optimized.'
    end

    # A TokenStream cannot be compacted. Use Tokens.
    def compact
      raise NotImplementedError, 'A TokenStream cannot be compacted.'
    end

  end

end

# vim:sw=2:ts=2:et:tw=78