From 132b75e58dba4c93278721d60f177cfbee7d0e46 Mon Sep 17 00:00:00 2001 From: murphy Date: Tue, 4 Apr 2006 13:23:02 +0000 Subject: Added HTML scanner! Added test/html/suite.rb and tolkien.in.html test. Benchmark produces inline line numbers now. Minor changes to Ruby and C scanners. Rakefile: unit tests now in -d mode. --- lib/coderay/scanners/_map.rb | 4 +- lib/coderay/scanners/c.rb | 10 ++- lib/coderay/scanners/html.rb | 148 ++++++++++++++++++++++++++++++++++ lib/coderay/scanners/ruby.rb | 4 +- lib/coderay/scanners/ruby/patterns.rb | 9 ++- 5 files changed, 164 insertions(+), 11 deletions(-) create mode 100644 lib/coderay/scanners/html.rb (limited to 'lib/coderay/scanners') diff --git a/lib/coderay/scanners/_map.rb b/lib/coderay/scanners/_map.rb index 811546b..fc53d91 100644 --- a/lib/coderay/scanners/_map.rb +++ b/lib/coderay/scanners/_map.rb @@ -4,7 +4,9 @@ module CodeRay map :cpp => :c, :plain => :plaintext, :pascal => :delphi, - :irb => :ruby + :irb => :ruby, + :xml => :html, + :xhtml => :html end end diff --git a/lib/coderay/scanners/c.rb b/lib/coderay/scanners/c.rb index 5764254..ae7ef83 100644 --- a/lib/coderay/scanners/c.rb +++ b/lib/coderay/scanners/c.rb @@ -42,7 +42,9 @@ module CodeRay module Scanners kind = :error match = nil - if state == :initial + case state + + when :initial if scan(/ \s+ | \\\n /x) kind = :space @@ -96,7 +98,7 @@ module CodeRay module Scanners getch end - elsif state == :string + when :string if scan(/[^\\"]+/) kind = :content elsif scan(/"/) @@ -113,7 +115,7 @@ module CodeRay module Scanners raise_inspect "else case \" reached; %p not handled." % peek(1), tokens end - elsif state == :include_expected + when :include_expected if scan(/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/) kind = :include state = :initial @@ -128,7 +130,7 @@ module CodeRay module Scanners end else - raise_inspect 'else-case reached', tokens + raise_inspect 'Unknown state', tokens end diff --git a/lib/coderay/scanners/html.rb b/lib/coderay/scanners/html.rb new file mode 100644 index 0000000..62da13b --- /dev/null +++ b/lib/coderay/scanners/html.rb @@ -0,0 +1,148 @@ +#require 'coderay/common_patterns' + +module CodeRay module Scanners + + # HTML Scanner + class HTML < Scanner + + include Streamable + register_for :html + + ATTR_NAME = /[\w.:-]+/ + ATTR_VALUE_UNQUOTED = ATTR_NAME + TAG_END = /\/?>/ + HEX = /[0-9a-fA-F]/ + ENTITY = / + & + (?: + \w+ + | + \# + (?: + \d+ + | + x#{HEX}+ + ) + ) + ; + /ox + + private + def scan_tokens tokens, options + + state = :initial + + until eos? + + kind = :error + match = nil + + if scan(/\s+/m) + kind = :space + + else + + case state + + when :initial + if scan(//m) + kind = :comment + elsif scan(//m) + kind = :preprocessor + elsif scan(/<\?xml.*?\?>/m) + kind = :preprocessor + elsif scan(/<\?.*?\?>|<%.*?%>/m) + kind = :comment + elsif scan(/<\/[-\w_.:]*>/m) + kind = :tag + elsif match = scan(/<[-\w_.:]*/m) + kind = :tag + if match?(/>/) + match << getch + else + state = :attribute + end + elsif scan(/[^<>&]+/) + kind = :plain + elsif scan(/#{ENTITY}/ox) + kind = :char + elsif scan(/>/) + kind = :error + else + raise_inspect '[BUG] else-case reached with state %p' % [state], tokens + end + + when :attribute + if scan(/#{TAG_END}/) + kind = :tag + state = :initial + elsif scan(/#{ATTR_NAME}/o) + kind = :attribute_name + state = :attribute_equal + end + + when :attribute_equal + if scan(/=/) + kind = :operator + state = :attribute_value + elsif scan(/#{ATTR_NAME}/o) + kind = :attribute_name + elsif scan(/#{TAG_END}/o) + kind = :tag + state = :initial + elsif scan(/./) + state = :attribute + end + + when :attribute_value + if scan(/#{ATTR_VALUE_UNQUOTED}/o) + kind = :attribute_value + state = :attribute + elsif scan(/"/) + tokens << [:open, :string] + state = :attribute_value_string + kind = :delimiter + elsif scan(/#{TAG_END}/o) + kind = :tag + state = :initial + end + + when :attribute_value_string + if scan(/[^"&\n]+/) + kind = :content + elsif scan(/"/) + tokens << ['"', :delimiter] + tokens << [:close, :string] + state = :attribute + next + elsif scan(/#{ENTITY}/ox) + kind = :char + elsif match(/\n/) + tokens << [:close, :string] + state = :attribute + next + end + + else + raise_inspect 'Unknown state: %p' % [state], tokens + + end + + end + + match ||= matched + if $DEBUG and (not kind or kind == :error) + raise_inspect 'Error token %p in line %d' % + [[match, kind], line], tokens + end + raise_inspect 'Empty token', tokens unless match + + tokens << [match, kind] + end + + tokens + end + + end + +end end diff --git a/lib/coderay/scanners/ruby.rb b/lib/coderay/scanners/ruby.rb index dd92caf..810e1fd 100644 --- a/lib/coderay/scanners/ruby.rb +++ b/lib/coderay/scanners/ruby.rb @@ -272,7 +272,7 @@ module CodeRay module Scanners heredocs ||= [] # create heredocs if empty heredocs << heredoc - elsif fancy_allowed and match = scan(/#{FANCY_START}/o) + elsif fancy_allowed and match = scan(/#{FANCY_START_SAVE}/o) type, interpreted = *FancyStringType.fetch(self[1]) do raise_inspect 'Unknown fancy string: %%%p' % k, tokens end @@ -358,6 +358,7 @@ module CodeRay module Scanners end end +# }}} regexp_allowed = regexp_allowed == :set fancy_allowed = fancy_allowed == :set @@ -373,7 +374,6 @@ module CodeRay module Scanners state = last_state last_state = nil end -# }}} end end diff --git a/lib/coderay/scanners/ruby/patterns.rb b/lib/coderay/scanners/ruby/patterns.rb index d75a17a..c007d8c 100644 --- a/lib/coderay/scanners/ruby/patterns.rb +++ b/lib/coderay/scanners/ruby/patterns.rb @@ -130,16 +130,17 @@ module CodeRay module Scanners RDOC_DATA_START = / ^=begin (?!\S) | ^__END__$ /x - FANCY_START = / % ( [qQwWxsr] | (?![\w\s=]) ) (.) /mox + # FIXME: \s and = are only a workaround, they are still allowed + # as delimiters. + FANCY_START_SAVE = / % ( [qQwWxsr] | (?![\w\s=]) ) (.) /mx + FANCY_START_CORRECT = / % ( [qQwWxsr] | (?!\w) ) (.) /mx FancyStringType = { 'q' => [:string, false], 'Q' => [:string, true], 'r' => [:regexp, true], 's' => [:symbol, false], - 'x' => [:shell, true], - 'w' => [:string, :word], - 'W' => [:string, :word], + 'x' => [:shell, true] } FancyStringType['w'] = FancyStringType['q'] FancyStringType['W'] = FancyStringType[''] = FancyStringType['Q'] -- cgit v1.2.1