diff options
author | murphy <murphy@rubychan.de> | 2009-04-20 21:08:33 +0000 |
---|---|---|
committer | murphy <murphy@rubychan.de> | 2009-04-20 21:08:33 +0000 |
commit | 13c1a74215c2af9150f6e61adbfc6c3e48689770 (patch) | |
tree | d9906e0dccf6d9004185f571850048c35918f2bf /lib/coderay | |
parent | ebcf3ab3e0814e4a0187ab23f3209ed162576ef8 (diff) | |
download | coderay-13c1a74215c2af9150f6e61adbfc6c3e48689770.tar.gz |
Experimental Unicode support for Scanners in Ruby 1.9.
* Python scanner uses it for idents.
* Scanner#column method needed to be fixed for multibyte characters because
StringScanner#pos still works on bytes.
Diffstat (limited to 'lib/coderay')
-rw-r--r-- | lib/coderay/scanner.rb | 15 | ||||
-rw-r--r-- | lib/coderay/scanners/python.rb | 4 |
2 files changed, 17 insertions, 2 deletions
```diff
diff --git a/lib/coderay/scanner.rb b/lib/coderay/scanner.rb
index fd5625e..839b9fc 100644
--- a/lib/coderay/scanner.rb
+++ b/lib/coderay/scanner.rb
@@ -65,7 +65,14 @@ module CodeRay
 
         def normify code
           code = code.to_s
-          code.force_encoding 'binary' if code.respond_to? :force_encoding
+          if code.respond_to? :force_encoding
+            begin
+              code.force_encoding 'utf-8'
+              code[/\z/] # raises an ArgumentError when code contains a non-UTF-8 char
+            rescue ArgumentError
+              code.force_encoding 'binary'
+            end
+          end
           code.to_unix
         end
 
@@ -181,6 +188,11 @@ module CodeRay
 
       def column pos = self.pos
         return 0 if pos <= 0
+        string = string()
+        if string.respond_to?(:bytesize) && (defined?(@bin_string) || string.bytesize != string.size)
+          @bin_string ||= string.dup.force_encoding(:binary)
+          string = @bin_string
+        end
         pos - (string.rindex(?\n, pos) || 0)
       end
 
@@ -207,6 +219,7 @@ module CodeRay
       def reset_instance
         @tokens.clear unless @options[:keep_tokens]
         @cached_tokens = nil
+        @bin_string = nil if defined? @bin_string
       end
 
       # Scanner error with additional status information
diff --git a/lib/coderay/scanners/python.rb b/lib/coderay/scanners/python.rb
index 05fe8d6..685232b 100644
--- a/lib/coderay/scanners/python.rb
+++ b/lib/coderay/scanners/python.rb
@@ -75,6 +75,7 @@ module Scanners
       state = :initial
       string_delimiter = nil
       import_clause = class_name_follows = last_token_dot = false
+      unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8'
 
       until eos?
 
@@ -109,7 +110,8 @@ module Scanners
           state = :string
           kind = :delimiter
 
-        elsif match = scan(/[[:alpha:]_][[:alnum:]_]*/ux)
+        elsif match = (unicode && scan(/[[:alpha:]_]\w*/ux)) ||
+            scan(/[[:alpha:]_]\w*/x)
           kind = IDENT_KIND[match]
           # TODO: handle class, def, from, import
           # TODO: handle print, exec used as functions in Python 3 code
```