summary | refs | log | tree | commit | diff
path: root/lib/coderay
diff options
context:
space:
mode:
Diffstat (limited to 'lib/coderay')
-rw-r--r--  lib/coderay/scanner.rb  15
-rw-r--r--  lib/coderay/scanners/python.rb  4
2 files changed, 17 insertions, 2 deletions
diff --git a/lib/coderay/scanner.rb b/lib/coderay/scanner.rb
index fd5625e..839b9fc 100644
--- a/lib/coderay/scanner.rb
+++ b/lib/coderay/scanner.rb
@@ -65,7 +65,14 @@ module CodeRay
def normify code
code = code.to_s
- code.force_encoding 'binary' if code.respond_to? :force_encoding
+ if code.respond_to? :force_encoding
+ begin
+ code.force_encoding 'utf-8'
+ code[/\z/] # raises an ArgumentError when code contains a non-UTF-8 char
+ rescue ArgumentError
+ code.force_encoding 'binary'
+ end
+ end
code.to_unix
end
@@ -181,6 +188,11 @@ module CodeRay
def column pos = self.pos
return 0 if pos <= 0
+ string = string()
+ if string.respond_to?(:bytesize) && (defined?(@bin_string) || string.bytesize != string.size)
+ @bin_string ||= string.dup.force_encoding(:binary)
+ string = @bin_string
+ end
pos - (string.rindex(?\n, pos) || 0)
end
@@ -207,6 +219,7 @@ module CodeRay
def reset_instance
@tokens.clear unless @options[:keep_tokens]
@cached_tokens = nil
+ @bin_string = nil if defined? @bin_string
end
# Scanner error with additional status information
diff --git a/lib/coderay/scanners/python.rb b/lib/coderay/scanners/python.rb
index 05fe8d6..685232b 100644
--- a/lib/coderay/scanners/python.rb
+++ b/lib/coderay/scanners/python.rb
@@ -75,6 +75,7 @@ module Scanners
state = :initial
string_delimiter = nil
import_clause = class_name_follows = last_token_dot = false
+ unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8'
until eos?
@@ -109,7 +110,8 @@ module Scanners
state = :string
kind = :delimiter
- elsif match = scan(/[[:alpha:]_][[:alnum:]_]*/ux)
+ elsif match = (unicode && scan(/[[:alpha:]_]\w*/ux)) ||
+ scan(/[[:alpha:]_]\w*/x)
kind = IDENT_KIND[match]
# TODO: handle class, def, from, import
# TODO: handle print, exec used as functions in Python 3 code