diff options
author | murphy <murphy@rubychan.de> | 2009-04-20 23:41:03 +0000 |
---|---|---|
committer | murphy <murphy@rubychan.de> | 2009-04-20 23:41:03 +0000 |
commit | 99308a6da363c78e656aa908f164e45d2f55bf36 (patch) | |
tree | 7e13ea5a86c21aa41b9d4ee9efbf362f750d0453 /lib/coderay/scanners/php.rb | |
parent | d9e3116a9f3d6aa3da0acd8d540d113f2a51bea1 (diff) | |
download | coderay-99308a6da363c78e656aa908f164e45d2f55bf36.tar.gz |
Many improvements to PHP Scanner.
* added lots of built-in functions, constants, syntax rule fixes...
* automatic HTML/PHP document recognition; experimental!
* cYcnus style adjusted; inline_delimiter outside of token group has a style now.
* clean-up still not finished
Diffstat (limited to 'lib/coderay/scanners/php.rb')
-rw-r--r-- | lib/coderay/scanners/php.rb | 322 |
1 file changed, 214 insertions, 108 deletions
diff --git a/lib/coderay/scanners/php.rb b/lib/coderay/scanners/php.rb index 0912ea5..39e8eb4 100644 --- a/lib/coderay/scanners/php.rb +++ b/lib/coderay/scanners/php.rb @@ -27,55 +27,152 @@ module Scanners end module Words - ControlKeywords = %w! - if else elseif while do for switch case default declare foreach as - endif endwhile endfor endforeach endswitch enddeclare return break - continue exit die try catch throw - ! - OtherKeywords = %w! - function class extends implements instanceof parent self var const - private public protected static abstract final global new echo include - require include_once require_once eval print use unset isset empty - interface list array clone null true false - ! - - SpecialConstants = %w! __LINE__ __FILE__ __CLASS__ - __METHOD__ __FUNCTION__ - ! - IdentKinds = WordList.new(:ident). - add(ControlKeywords, :reserved). - add(OtherKeywords, :pre_type). - add(SpecialConstants, :pre_constant) + + # according to http://www.php.net/manual/en/reserved.keywords.php + KEYWORDS = %w[ + abstract and array as break case catch class clone + const continue declare default do else elseif + enddeclare endfor endforeach endif endswitch endwhile + extends final for foreach function global goto if implements + interface instanceof namespace new or + private protected public static switch throw try use var + while xor + cfunction old_function + null true false + ] + + LANGUAGE_CONSTRUCTS = %w[ + die echo empty exit eval include include_once isset list + require require_once return print unset + ] + + TYPES = %w[ int float ] + + # according to http://php.net/quickref.php on 2009-04-21; + # all functions with _ excluded (module functions) + BUILTIN_FUNCTIONS = %w[ + abs acos acosh addcslashes addslashes aggregate array arsort ascii2ebcdic asin asinh asort assert atan atan2 + atanh basename bcadd bccomp bcdiv bcmod bcmul bcpow bcpowmod bcscale bcsqrt bcsub bin2hex bindec + bindtextdomain bzclose bzcompress bzdecompress bzerrno bzerror bzerrstr bzflush 
bzopen bzread bzwrite + calculhmac ceil chdir checkdate checkdnsrr chgrp chmod chop chown chr chroot clearstatcache closedir closelog + compact constant copy cos cosh count crc32 crypt current date dcgettext dcngettext deaggregate decbin dechex + decoct define defined deg2rad delete dgettext die dirname diskfreespace dl dngettext doubleval each + ebcdic2ascii echo empty end ereg eregi escapeshellarg escapeshellcmd eval exec exit exp explode expm1 extract + fclose feof fflush fgetc fgetcsv fgets fgetss file fileatime filectime filegroup fileinode filemtime fileowner + fileperms filepro filesize filetype floatval flock floor flush fmod fnmatch fopen fpassthru fprintf fputcsv + fputs fread frenchtojd fscanf fseek fsockopen fstat ftell ftok ftruncate fwrite getallheaders getcwd getdate + getenv gethostbyaddr gethostbyname gethostbynamel getimagesize getlastmod getmxrr getmygid getmyinode getmypid + getmyuid getopt getprotobyname getprotobynumber getrandmax getrusage getservbyname getservbyport gettext + gettimeofday gettype glob gmdate gmmktime gmstrftime gregoriantojd gzclose gzcompress gzdecode gzdeflate + gzencode gzeof gzfile gzgetc gzgets gzgetss gzinflate gzopen gzpassthru gzputs gzread gzrewind gzseek gztell + gzuncompress gzwrite hash header hebrev hebrevc hexdec htmlentities htmlspecialchars hypot iconv idate + implode include intval ip2long iptcembed iptcparse isset + jddayofweek jdmonthname jdtofrench jdtogregorian jdtojewish jdtojulian jdtounix jewishtojd join jpeg2wbmp + juliantojd key krsort ksort lcfirst lchgrp lchown levenshtein link linkinfo list localeconv localtime log + log10 log1p long2ip lstat ltrim mail main max md5 metaphone mhash microtime min mkdir mktime msql natcasesort + natsort next ngettext nl2br nthmac octdec opendir openlog + ord overload pack passthru pathinfo pclose pfsockopen phpcredits phpinfo phpversion pi png2wbmp popen pos pow + prev print printf putenv quotemeta rad2deg rand range rawurldecode rawurlencode readdir readfile 
readgzfile + readline readlink realpath recode rename require reset rewind rewinddir rmdir round rsort rtrim scandir + serialize setcookie setlocale setrawcookie settype sha1 shuffle signeurlpaiement sin sinh sizeof sleep snmpget + snmpgetnext snmprealwalk snmpset snmpwalk snmpwalkoid sort soundex split spliti sprintf sqrt srand sscanf stat + strcasecmp strchr strcmp strcoll strcspn strftime stripcslashes stripos stripslashes stristr strlen + strnatcasecmp strnatcmp strncasecmp strncmp strpbrk strpos strptime strrchr strrev strripos strrpos strspn + strstr strtok strtolower strtotime strtoupper strtr strval substr symlink syslog system tan tanh tempnam + textdomain time tmpfile touch trim uasort ucfirst ucwords uksort umask uniqid unixtojd unlink unpack + unserialize unset urldecode urlencode usleep usort vfprintf virtual vprintf vsprintf wordwrap + ] + %w[ + assert_options base_convert base64_decode base64_encode + chunk_split class_exists class_implements class_parents + count_chars debug_backtrace debug_print_backtrace debug_zval_dump + error_get_last error_log error_reporting extension_loaded + file_exists file_get_contents file_put_contents load_file + func_get_arg func_get_args func_num_args function_exists + get_browser get_called_class get_cfg_var get_class get_class_methods get_class_vars + get_current_user get_declared_classes get_declared_interfaces get_defined_constants + get_defined_functions get_defined_vars get_extension_funcs get_headers get_html_translation_table + get_include_path get_included_files get_loaded_extensions get_magic_quotes_gpc get_magic_quotes_runtime + get_meta_tags get_object_vars get_parent_class get_required_filesget_resource_type + gc_collect_cycles gc_disable gc_enable gc_enabled + halt_compiler headers_list headers_sent highlight_file highlight_string + html_entity_decode htmlspecialchars_decode + in_array include_once inclued_get_data + locale_get_default locale_set_default + number_format override_function parse_str 
parse_url + php_check_syntax php_ini_loaded_file php_ini_scanned_files php_logo_guid php_sapi_name + php_strip_whitespace php_uname + preg_filter preg_grep preg_last_error preg_match preg_match_all preg_quote preg_replace + preg_replace_callback preg_split print_r + require_once register_shutdown_function register_tick_function + set_error_handler set_exception_handler set_file_buffer set_include_path + set_magic_quotes_runtime set_time_limit shell_exec + str_getcsv str_ireplace str_pad str_repeat str_replace str_rot13 str_shuffle str_split str_word_count + strip_tags substr_compare substr_count substr_replace + time_nanosleep time_sleep_until + token_get_all token_name trigger_error + unregister_tick_function use_soap_error_handler user_error + utf8_decode utf8_encode var_dump var_export + version_compare + zend_logo_guid zend_thread_id zend_version + ] + %w[ + array_change_key_case array_chunk array_combine array_count_values array_diff array_diff_assoc + array_diff_key array_diff_uassoc array_diff_ukey array_fill array_fill_keys array_filter array_flip + array_intersect array_intersect_assoc array_intersect_key array_intersect_uassoc array_intersect_ukey + array_key_exists array_keys array_map array_merge array_merge_recursive array_multisort array_pad + array_pop array_product array_push array_rand array_reduce array_reverse array_search array_shift + array_slice array_splice array_sum array_udiff array_udiff_assoc array_udiff_uassoc array_uintersect + array_uintersect_assoc array_uintersect_uassoc array_unique array_unshift array_values array_walk + array_walk_recursive + ] + %w[ + is_a is_array is_binary is_bool is_buffer is_callable is_dir is_double is_executable is_file is_finite + is_float is_infinite is_int is_integer is_link is_long is_nan is_null is_numeric is_object is_readable + is_real is_resource is_scalar is_soap_fault is_string is_subclass_of is_unicode is_uploaded_file + is_writable is_writeable + ] + + # TODO: more built-in PHP functions? 
+ # TODO: more predefined constants? + + SPECIAL_CONSTANTS = %w[ + __LINE__ __DIR__ __FILE__ __LINE__ + __CLASS__ __NAMESPACE__ __METHOD__ __FUNCTION__ + ] + + IdentKinds = CaseIgnoringWordList.new(:ident, true). + add(KEYWORDS, :reserved). + add(TYPES, :pre_type). + add(LANGUAGE_CONSTRUCTS, :predefined). + add(BUILTIN_FUNCTIONS, :predefined). + add(SPECIAL_CONSTANTS, :pre_constant) end + module RE def self.build_alternatives(array) Regexp.new(array.map { |s| Regexp.escape(s) }.join('|') , Regexp::IGNORECASE) end - - PHPStart = / - <script language="php"> | - <script language='php'> | - <\?php | - <\?(?!xml) | - <% + + PHP_START = / + <script\s+[^>]*?language\s*=\s*"php"[^>]*?> | + <script\s+[^>]*?language\s*=\s*'php'[^>]*?> | + <\?php\d? | + <\?(?!xml) /xi - - PHPEnd = %r! + + PHP_END = %r! </script> | - \?> | - %> + \?> !xi - + IChar = /[a-z0-9_\x80-\xFF]/i IStart = /[a-z_\x80-\xFF]/i Identifier = /#{IStart}#{IChar}*/ Variable = /\$#{Identifier}/ - + Typecasts = build_alternatives %w! float double real int integer bool boolean string array object null !.map{|s| "(#{s})"} - OneLineComment1 = %r!//.*?(?=#{PHPEnd}|$)! - OneLineComment2 = %r!#.*?(?=#{PHPEnd}|$)! + OneLineComment1 = %r!//.*?(?=#{PHP_END}|$)! + OneLineComment2 = %r!#.*?(?=#{PHP_END}|$)! OneLineComment = OneLineComment1 | OneLineComment2 HereDoc = /<<</ + Identifier @@ -97,51 +194,54 @@ module Scanners Operator = ComparisionOperator | IncDecOperator | LogicalOperator | ObjectOperator | AssignOperator | BinaryOperator | OtherOperator - - - S = /\s+/ - - Integer = /-?0x[0-9a-fA-F]/ | /-?\d+/ - Float = /-?(?:\d+\.\d*|\d*\.\d+)(?:e[+-]\d+)?/ - + + Integer = /0x[0-9a-fA-F]/ | /\d+/ + Float = /(?:\d+\.\d*|\d*\.\d+)(?:e[+-]\d+)?/ + end - + def scan_tokens tokens, options - states = [:php] - heredocdelim = nil - + states = [:initial] + if match?(RE::PHP_START) || # starts with <? + (match?(/\s*<(?i:\w|\?xml)/) && exist?(RE::PHP_START)) # starts with HTML tag and contains <? 
+ # start with HTML + else + states << :php + end + # heredocdelim = nil + until eos? match = nil kind = nil case states.last - when :html - if scan RE::PHPStart - kind = :delimiter - states.pop + + when :initial # HTML + if scan RE::PHP_START + kind = :inline_delimiter + states << :php else - match = scan_until(/(?=#{RE::PHPStart})/o) || scan_until(/\z/) - @html_scanner.tokenize match if not match.empty? - kind = :space - match = '' + match = scan_until(/(?=#{RE::PHP_START})/o) || scan_until(/\z/) + @html_scanner.tokenize match unless match.empty? + next end when :php - if scan RE::PHPEnd - kind = :delimiter - states.push :html - - elsif scan RE::S + if scan RE::PHP_END + kind = :inline_delimiter + states.pop + + elsif scan(/\s+/) kind = :space - + elsif scan(/\/\*/) kind = :comment states.push :mlcomment - - elsif scan RE::OneLineComment + + elsif scan RE::OneLineComment kind = :comment - + elsif match = scan(RE::Identifier) kind = Words::IdentKinds[match] if kind == :ident && check(/:(?!:)/) #&& tokens[-2][0] == 'case' @@ -150,33 +250,37 @@ module Scanners elsif kind == :ident and match =~ /^[A-Z]/ kind = :constant end - - elsif scan RE::Integer - kind = :integer - + elsif scan RE::Float kind = :float - + + elsif scan RE::Integer + kind = :integer + elsif scan(/'/) + tokens << [:open, :string] kind = :delimiter states.push :sqstring - + elsif scan(/"/) + tokens << [:open, :string] kind = :delimiter states.push :dqstring - - elsif match = scan(RE::HereDoc) - heredocdelim = match[RE::Identifier] - kind = :delimiter - # states.push :heredocstring - + + # TODO: Heredocs + # elsif match = scan(RE::HereDoc) + # tokens << [:open, :string] + # heredocdelim = match[RE::Identifier] + # kind = :delimiter + # states.push :heredocstring + elsif scan RE::Variable kind = :local_variable - + elsif scan(/\{/) kind = :operator states.push :php - + elsif scan(/\}/) if states.length == 1 kind = :error @@ -184,56 +288,56 @@ module Scanners kind = :operator states.pop end - + elsif 
scan RE::Operator kind = :operator - + else getch kind = :error - + end - + when :mlcomment if scan(/(?:[^\n\r\f*]|\*(?!\/))+/) kind = :comment - + elsif scan(/\*\//) kind = :comment states.pop - + elsif scan(/[\r\n\f]+/) kind = :space end - + when :sqstring - if scan(/[^\r\n\f'\\]+/) - kind = :string - elsif match = scan(/\\\\|\\'/) - kind = :char + if scan(/[^'\\]+/) + kind = :content + elsif scan(/\\./m) + kind = :content elsif scan(/\\/) - kind = :string - elsif scan(/[\r\n\f ]+/) - kind = :space + kind = :error elsif scan(/'/) - kind = :delimiter + tokens << [matched, :delimiter] + tokens << [:close, :string] states.pop + next end - + when :dqstring -#todo: $foo[bar] kind of stuff - if scan(/[^\r\n\f"${\\]+/) - kind = :string - elsif scan(/\\x[a-fA-F]{2}/) + # TODO: $foo[bar] kind of stuff + if scan(/[^"${\\]+/) + kind = :content + elsif scan(/\\x[0-9a-fA-F]{2}/) kind = :char elsif scan(/\\\d{3}/) kind = :char elsif scan(/\\["\\abcfnrtyv]/) kind = :char + elsif scan(/\\./m) + kind = :content elsif scan(/\\/) - kind = :string - elsif scan(/[\r\n\f]+/) - kind = :space + kind = :error elsif match = scan(/#{RE::Variable}/o) kind = :local_variable if check(/\[#{RE::Identifier}\]/o) @@ -246,7 +350,7 @@ module Scanners end elsif scan(/\{/) if check(/\$/) - kind = :operator + kind = :operator states.push :php else kind = :string @@ -255,30 +359,32 @@ module Scanners elsif scan(/\$\{#{RE::Identifier}\}/o) kind = :local_variable elsif scan(/\$/) - kind = :string + kind = :content elsif scan(/"/) - kind = :delimiter + tokens << [matched, :delimiter] + tokens << [:close, :string] states.pop + next end else raise_inspect 'Unknown state!', tokens, states end - + match ||= matched if $DEBUG and not kind raise_inspect 'Error token %p in line %d' % [[match, kind], line], tokens, states end raise_inspect 'Empty token', tokens, states unless match - + tokens << [match, kind] - + end + tokens - end - + end - + end end |