diff options
| author | Márcio Almada <marcio3w@gmail.com> | 2015-04-05 08:50:35 -0300 |
|---|---|---|
| committer | Márcio Almada <marcio3w@gmail.com> | 2015-04-30 03:03:29 -0300 |
| commit | 110759386e2f9b4d88bf68c669b6c54ad4b5c04f (patch) | |
| tree | c0dc58e312c77662a5f6e10941408560a4b440ac | |
| parent | 02a9eb4f8c736089808b51d862def0e648383e09 (diff) | |
| download | php-git-110759386e2f9b4d88bf68c669b6c54ad4b5c04f.tar.gz | |
ext tokenizer port + cleanup unused lexer states
We added a mechanism to store the token stream during parsing
and exposed the entire parser stack to the tokenizer extension through
an opt-in flag: token_get_all($src, TOKEN_PARSE).
This change allows easy future language enhancements regarding context-aware
parsing & scanning without further maintenance on the tokenizer
extension, while also solving known inconsistencies that the "parseless"
tokenizer extension has when it handles the presence of `__halt_compiler()`.
| -rw-r--r-- | Zend/zend_compile.c | 6 | ||||
| -rw-r--r-- | Zend/zend_globals.h | 9 | ||||
| -rw-r--r-- | Zend/zend_language_parser.y | 14 | ||||
| -rw-r--r-- | Zend/zend_language_scanner.h | 4 | ||||
| -rw-r--r-- | Zend/zend_language_scanner.l | 389 | ||||
| -rw-r--r-- | ext/tokenizer/tests/token_get_all_TOKEN_PARSE_000.phpt | 19 | ||||
| -rw-r--r-- | ext/tokenizer/tests/token_get_all_TOKEN_PARSE_001.phpt | 81 | ||||
| -rw-r--r-- | ext/tokenizer/tests/token_get_all_TOKEN_PARSE_002.phpt | 68 | ||||
| -rw-r--r-- | ext/tokenizer/tests/token_get_all_error.phpt | 8 | ||||
| -rw-r--r-- | ext/tokenizer/tokenizer.c | 132 |
10 files changed, 510 insertions, 220 deletions
diff --git a/Zend/zend_compile.c b/Zend/zend_compile.c index c92a25a705..210810379f 100644 --- a/Zend/zend_compile.c +++ b/Zend/zend_compile.c @@ -30,7 +30,6 @@ #include "zend_interfaces.h" #include "zend_virtual_cwd.h" #include "zend_multibyte.h" -#include "zend_language_scanner.h" #include "zend_inheritance.h" #define SET_NODE(target, src) do { \ @@ -568,7 +567,10 @@ static int zend_add_const_name_literal(zend_op_array *op_array, zend_string *nam op.constant = zend_add_literal(CG(active_op_array), &_c); \ } while (0) -void zend_stop_lexing(void) { +void zend_stop_lexing(void) +{ + if(LANG_SCNG(on_event)) LANG_SCNG(on_event)(ON_STOP, END, 0); + LANG_SCNG(yy_cursor) = LANG_SCNG(yy_limit); } diff --git a/Zend/zend_globals.h b/Zend/zend_globals.h index 326955a103..28487a2a4a 100644 --- a/Zend/zend_globals.h +++ b/Zend/zend_globals.h @@ -249,6 +249,12 @@ struct _zend_ini_scanner_globals { int scanner_mode; }; +typedef enum { + ON_TOKEN, + ON_FEEDBACK, + ON_STOP +} zend_php_scanner_event; + struct _zend_php_scanner_globals { zend_file_handle *yy_in; zend_file_handle *yy_out; @@ -278,6 +284,9 @@ struct _zend_php_scanner_globals { /* initial string length after scanning to first variable */ int scanned_string_len; + + /* hooks */ + void (* on_event)(zend_php_scanner_event event, int token, int line); }; #endif /* ZEND_GLOBALS_H */ diff --git a/Zend/zend_language_parser.y b/Zend/zend_language_parser.y index cefcd0cad9..f6318ec0c0 100644 --- a/Zend/zend_language_parser.y +++ b/Zend/zend_language_parser.y @@ -35,7 +35,7 @@ #include "zend_globals.h" #include "zend_API.h" #include "zend_constants.h" -#include "zend_language_scanner_defs.h" +#include "zend_language_scanner.h" #define YYSIZE_T size_t #define yytnamerr zend_yytnamerr @@ -49,12 +49,6 @@ static YYSIZE_T zend_yytnamerr(char*, const char*); #define YYFREE free #endif -#define REWIND { \ - zend_stack_push(&LANG_SCNG(state_stack), (void *) &LANG_SCNG(yy_state)); \ - LANG_SCNG(yy_state) = 
yycST_LOOKING_FOR_SEMI_RESERVED_NAME; \ - LANG_SCNG(yy_cursor) = (unsigned char*)LANG_SCNG(yy_text); \ - LANG_SCNG(yy_leng) = 0; } - %} %pure_parser @@ -290,7 +284,11 @@ semi_reserved: identifier: T_STRING { $$ = $1; } - | /* if */ semi_reserved { REWIND } /* and rematch as */ T_STRING { $$ = $3; } + | semi_reserved { + zval zv; + zend_lex_tstring(&zv); + $$ = zend_ast_create_zval(&zv); + } ; top_statement_list: diff --git a/Zend/zend_language_scanner.h b/Zend/zend_language_scanner.h index c82b3069c5..3b75ff8cc4 100644 --- a/Zend/zend_language_scanner.h +++ b/Zend/zend_language_scanner.h @@ -50,6 +50,9 @@ typedef struct _zend_lex_state { zend_encoding_filter output_filter; const zend_encoding *script_encoding; + /* hooks */ + void (* on_event)(zend_php_scanner_event event, int token, int line); + zend_ast *ast; zend_arena *ast_arena; } zend_lex_state; @@ -66,6 +69,7 @@ ZEND_API void zend_restore_lexical_state(zend_lex_state *lex_state); ZEND_API int zend_prepare_string_for_scanning(zval *str, char *filename); ZEND_API void zend_multibyte_yyinput_again(zend_encoding_filter old_input_filter, const zend_encoding *old_encoding); ZEND_API int zend_multibyte_set_filter(const zend_encoding *onetime_encoding); +ZEND_API void zend_lex_tstring(zval *zv); END_EXTERN_C() diff --git a/Zend/zend_language_scanner.l b/Zend/zend_language_scanner.l index 2481af605b..cde0621df0 100644 --- a/Zend/zend_language_scanner.l +++ b/Zend/zend_language_scanner.l @@ -193,6 +193,7 @@ void shutdown_scanner(void) zend_stack_destroy(&SCNG(state_stack)); zend_ptr_stack_clean(&SCNG(heredoc_label_stack), (void (*)(void *)) &heredoc_label_dtor, 1); zend_ptr_stack_destroy(&SCNG(heredoc_label_stack)); + SCNG(on_event) = NULL; } ZEND_API void zend_save_lexical_state(zend_lex_state *lex_state) @@ -223,6 +224,8 @@ ZEND_API void zend_save_lexical_state(zend_lex_state *lex_state) lex_state->output_filter = SCNG(output_filter); lex_state->script_encoding = SCNG(script_encoding); + lex_state->on_event = 
SCNG(on_event); + lex_state->ast = CG(ast); lex_state->ast_arena = CG(ast_arena); } @@ -260,6 +263,8 @@ ZEND_API void zend_restore_lexical_state(zend_lex_state *lex_state) SCNG(output_filter) = lex_state->output_filter; SCNG(script_encoding) = lex_state->script_encoding; + SCNG(on_event) = lex_state->on_event; + CG(ast) = lex_state->ast; CG(ast_arena) = lex_state->ast_arena; @@ -276,6 +281,13 @@ ZEND_API void zend_destroy_file_handle(zend_file_handle *file_handle) } } +ZEND_API void zend_lex_tstring(zval *zv) +{ + if (SCNG(on_event)) SCNG(on_event)(ON_FEEDBACK, T_STRING, 0); + + ZVAL_STRINGL(zv, (char*)SCNG(yy_text), SCNG(yy_leng)); +} + #define BOM_UTF32_BE "\x00\x00\xfe\xff" #define BOM_UTF32_LE "\xff\xfe\x00\x00" #define BOM_UTF16_BE "\xfe\xff" @@ -1083,9 +1095,20 @@ static int zend_scan_escape_string(zval *zendlval, char *str, int len, char quot return SUCCESS; } +static zend_always_inline int emit_token(int token, int token_line) +{ + if(SCNG(on_event)) SCNG(on_event)(ON_TOKEN, token, token_line); + + return token; +} + +#define RETURN_TOKEN(token) return emit_token(token, start_line); int lex_scan(zval *zendlval) { + +int start_line = CG(zend_lineno); + restart: SCNG(yy_text) = YYCURSOR; @@ -1107,183 +1130,183 @@ NEWLINE ("\r"|"\n"|"\r\n") <!*> := yyleng = YYCURSOR - SCNG(yy_text); <ST_IN_SCRIPTING>"exit" { - return T_EXIT; + RETURN_TOKEN(T_EXIT); } <ST_IN_SCRIPTING>"die" { - return T_EXIT; + RETURN_TOKEN(T_EXIT); } <ST_IN_SCRIPTING>"function" { - return T_FUNCTION; + RETURN_TOKEN(T_FUNCTION); } <ST_IN_SCRIPTING>"const" { - return T_CONST; + RETURN_TOKEN(T_CONST); } <ST_IN_SCRIPTING>"return" { - return T_RETURN; + RETURN_TOKEN(T_RETURN); } <ST_IN_SCRIPTING>"yield"{WHITESPACE}"from" { - return T_YIELD_FROM; + RETURN_TOKEN(T_YIELD_FROM); } <ST_IN_SCRIPTING>"yield" { - return T_YIELD; + RETURN_TOKEN(T_YIELD); } <ST_IN_SCRIPTING>"try" { - return T_TRY; + RETURN_TOKEN(T_TRY); } <ST_IN_SCRIPTING>"catch" { - return T_CATCH; + RETURN_TOKEN(T_CATCH); } 
<ST_IN_SCRIPTING>"finally" { - return T_FINALLY; + RETURN_TOKEN(T_FINALLY); } <ST_IN_SCRIPTING>"throw" { - return T_THROW; + RETURN_TOKEN(T_THROW); } <ST_IN_SCRIPTING>"if" { - return T_IF; + RETURN_TOKEN(T_IF); } <ST_IN_SCRIPTING>"elseif" { - return T_ELSEIF; + RETURN_TOKEN(T_ELSEIF); } <ST_IN_SCRIPTING>"endif" { - return T_ENDIF; + RETURN_TOKEN(T_ENDIF); } <ST_IN_SCRIPTING>"else" { - return T_ELSE; + RETURN_TOKEN(T_ELSE); } <ST_IN_SCRIPTING>"while" { - return T_WHILE; + RETURN_TOKEN(T_WHILE); } <ST_IN_SCRIPTING>"endwhile" { - return T_ENDWHILE; + RETURN_TOKEN(T_ENDWHILE); } <ST_IN_SCRIPTING>"do" { - return T_DO; + RETURN_TOKEN(T_DO); } <ST_IN_SCRIPTING>"for" { - return T_FOR; + RETURN_TOKEN(T_FOR); } <ST_IN_SCRIPTING>"endfor" { - return T_ENDFOR; + RETURN_TOKEN(T_ENDFOR); } <ST_IN_SCRIPTING>"foreach" { - return T_FOREACH; + RETURN_TOKEN(T_FOREACH); } <ST_IN_SCRIPTING>"endforeach" { - return T_ENDFOREACH; + RETURN_TOKEN(T_ENDFOREACH); } <ST_IN_SCRIPTING>"declare" { - return T_DECLARE; + RETURN_TOKEN(T_DECLARE); } <ST_IN_SCRIPTING>"enddeclare" { - return T_ENDDECLARE; + RETURN_TOKEN(T_ENDDECLARE); } <ST_IN_SCRIPTING>"instanceof" { - return T_INSTANCEOF; + RETURN_TOKEN(T_INSTANCEOF); } <ST_IN_SCRIPTING>"as" { - return T_AS; + RETURN_TOKEN(T_AS); } <ST_IN_SCRIPTING>"switch" { - return T_SWITCH; + RETURN_TOKEN(T_SWITCH); } <ST_IN_SCRIPTING>"endswitch" { - return T_ENDSWITCH; + RETURN_TOKEN(T_ENDSWITCH); } <ST_IN_SCRIPTING>"case" { - return T_CASE; + RETURN_TOKEN(T_CASE); } <ST_IN_SCRIPTING>"default" { - return T_DEFAULT; + RETURN_TOKEN(T_DEFAULT); } <ST_IN_SCRIPTING>"break" { - return T_BREAK; + RETURN_TOKEN(T_BREAK); } <ST_IN_SCRIPTING>"continue" { - return T_CONTINUE; + RETURN_TOKEN(T_CONTINUE); } <ST_IN_SCRIPTING>"goto" { - return T_GOTO; + RETURN_TOKEN(T_GOTO); } <ST_IN_SCRIPTING>"echo" { - return T_ECHO; + RETURN_TOKEN(T_ECHO); } <ST_IN_SCRIPTING>"print" { - return T_PRINT; + RETURN_TOKEN(T_PRINT); } <ST_IN_SCRIPTING>"class" { - return T_CLASS; + 
RETURN_TOKEN(T_CLASS); } <ST_IN_SCRIPTING>"interface" { - return T_INTERFACE; + RETURN_TOKEN(T_INTERFACE); } <ST_IN_SCRIPTING>"trait" { - return T_TRAIT; + RETURN_TOKEN(T_TRAIT); } <ST_IN_SCRIPTING>"extends" { - return T_EXTENDS; + RETURN_TOKEN(T_EXTENDS); } <ST_IN_SCRIPTING>"implements" { - return T_IMPLEMENTS; + RETURN_TOKEN(T_IMPLEMENTS); } <ST_IN_SCRIPTING>"->" { yy_push_state(ST_LOOKING_FOR_PROPERTY); - return T_OBJECT_OPERATOR; + RETURN_TOKEN(T_OBJECT_OPERATOR); } -<ST_IN_SCRIPTING,ST_LOOKING_FOR_PROPERTY,ST_LOOKING_FOR_SEMI_RESERVED_NAME>{WHITESPACE}+ { +<ST_IN_SCRIPTING,ST_LOOKING_FOR_PROPERTY>{WHITESPACE}+ { HANDLE_NEWLINES(yytext, yyleng); - return T_WHITESPACE; + RETURN_TOKEN(T_WHITESPACE); } <ST_LOOKING_FOR_PROPERTY>"->" { - return T_OBJECT_OPERATOR; + RETURN_TOKEN(T_OBJECT_OPERATOR); } <ST_LOOKING_FOR_PROPERTY>{LABEL} { yy_pop_state(); zend_copy_value(zendlval, yytext, yyleng); - return T_STRING; + RETURN_TOKEN(T_STRING); } <ST_LOOKING_FOR_PROPERTY>{ANY_CHAR} { @@ -1293,283 +1316,283 @@ NEWLINE ("\r"|"\n"|"\r\n") } <ST_IN_SCRIPTING>"::" { - return T_PAAMAYIM_NEKUDOTAYIM; + RETURN_TOKEN(T_PAAMAYIM_NEKUDOTAYIM); } <ST_IN_SCRIPTING>"\\" { - return T_NS_SEPARATOR; + RETURN_TOKEN(T_NS_SEPARATOR); } <ST_IN_SCRIPTING>"..." { - return T_ELLIPSIS; + RETURN_TOKEN(T_ELLIPSIS); } <ST_IN_SCRIPTING>"??" 
{ - return T_COALESCE; + RETURN_TOKEN(T_COALESCE); } <ST_IN_SCRIPTING>"new" { - return T_NEW; + RETURN_TOKEN(T_NEW); } <ST_IN_SCRIPTING>"clone" { - return T_CLONE; + RETURN_TOKEN(T_CLONE); } <ST_IN_SCRIPTING>"var" { - return T_VAR; + RETURN_TOKEN(T_VAR); } <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}("int"|"integer"){TABS_AND_SPACES}")" { - return T_INT_CAST; + RETURN_TOKEN(T_INT_CAST); } <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}("real"|"double"|"float"){TABS_AND_SPACES}")" { - return T_DOUBLE_CAST; + RETURN_TOKEN(T_DOUBLE_CAST); } <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}("string"|"binary"){TABS_AND_SPACES}")" { - return T_STRING_CAST; + RETURN_TOKEN(T_STRING_CAST); } <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}"array"{TABS_AND_SPACES}")" { - return T_ARRAY_CAST; + RETURN_TOKEN(T_ARRAY_CAST); } <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}"object"{TABS_AND_SPACES}")" { - return T_OBJECT_CAST; + RETURN_TOKEN(T_OBJECT_CAST); } <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}("bool"|"boolean"){TABS_AND_SPACES}")" { - return T_BOOL_CAST; + RETURN_TOKEN(T_BOOL_CAST); } <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}("unset"){TABS_AND_SPACES}")" { - return T_UNSET_CAST; + RETURN_TOKEN(T_UNSET_CAST); } <ST_IN_SCRIPTING>"eval" { - return T_EVAL; + RETURN_TOKEN(T_EVAL); } <ST_IN_SCRIPTING>"include" { - return T_INCLUDE; + RETURN_TOKEN(T_INCLUDE); } <ST_IN_SCRIPTING>"include_once" { - return T_INCLUDE_ONCE; + RETURN_TOKEN(T_INCLUDE_ONCE); } <ST_IN_SCRIPTING>"require" { - return T_REQUIRE; + RETURN_TOKEN(T_REQUIRE); } <ST_IN_SCRIPTING>"require_once" { - return T_REQUIRE_ONCE; + RETURN_TOKEN(T_REQUIRE_ONCE); } <ST_IN_SCRIPTING>"namespace" { - return T_NAMESPACE; + RETURN_TOKEN(T_NAMESPACE); } <ST_IN_SCRIPTING>"use" { - return T_USE; + RETURN_TOKEN(T_USE); } <ST_IN_SCRIPTING>"insteadof" { - return T_INSTEADOF; + RETURN_TOKEN(T_INSTEADOF); } <ST_IN_SCRIPTING>"global" { - return T_GLOBAL; + RETURN_TOKEN(T_GLOBAL); } <ST_IN_SCRIPTING>"isset" { - return T_ISSET; + RETURN_TOKEN(T_ISSET); } <ST_IN_SCRIPTING>"empty" { - return 
T_EMPTY; + RETURN_TOKEN(T_EMPTY); } <ST_IN_SCRIPTING>"__halt_compiler" { - return T_HALT_COMPILER; + RETURN_TOKEN(T_HALT_COMPILER); } <ST_IN_SCRIPTING>"static" { - return T_STATIC; + RETURN_TOKEN(T_STATIC); } <ST_IN_SCRIPTING>"abstract" { - return T_ABSTRACT; + RETURN_TOKEN(T_ABSTRACT); } <ST_IN_SCRIPTING>"final" { - return T_FINAL; + RETURN_TOKEN(T_FINAL); } <ST_IN_SCRIPTING>"private" { - return T_PRIVATE; + RETURN_TOKEN(T_PRIVATE); } <ST_IN_SCRIPTING>"protected" { - return T_PROTECTED; + RETURN_TOKEN(T_PROTECTED); } <ST_IN_SCRIPTING>"public" { - return T_PUBLIC; + RETURN_TOKEN(T_PUBLIC); } <ST_IN_SCRIPTING>"unset" { - return T_UNSET; + RETURN_TOKEN(T_UNSET); } <ST_IN_SCRIPTING>"=>" { - return T_DOUBLE_ARROW; + RETURN_TOKEN(T_DOUBLE_ARROW); } <ST_IN_SCRIPTING>"list" { - return T_LIST; + RETURN_TOKEN(T_LIST); } <ST_IN_SCRIPTING>"array" { - return T_ARRAY; + RETURN_TOKEN(T_ARRAY); } <ST_IN_SCRIPTING>"callable" { - return T_CALLABLE; + RETURN_TOKEN(T_CALLABLE); } <ST_IN_SCRIPTING>"++" { - return T_INC; + RETURN_TOKEN(T_INC); } <ST_IN_SCRIPTING>"--" { - return T_DEC; + RETURN_TOKEN(T_DEC); } <ST_IN_SCRIPTING>"===" { - return T_IS_IDENTICAL; + RETURN_TOKEN(T_IS_IDENTICAL); } <ST_IN_SCRIPTING>"!==" { - return T_IS_NOT_IDENTICAL; + RETURN_TOKEN(T_IS_NOT_IDENTICAL); } <ST_IN_SCRIPTING>"==" { - return T_IS_EQUAL; + RETURN_TOKEN(T_IS_EQUAL); } <ST_IN_SCRIPTING>"!="|"<>" { - return T_IS_NOT_EQUAL; + RETURN_TOKEN(T_IS_NOT_EQUAL); } <ST_IN_SCRIPTING>"<=>" { - return T_SPACESHIP; + RETURN_TOKEN(T_SPACESHIP); } <ST_IN_SCRIPTING>"<=" { - return T_IS_SMALLER_OR_EQUAL; + RETURN_TOKEN(T_IS_SMALLER_OR_EQUAL); } <ST_IN_SCRIPTING>">=" { - return T_IS_GREATER_OR_EQUAL; + RETURN_TOKEN(T_IS_GREATER_OR_EQUAL); } <ST_IN_SCRIPTING>"+=" { - return T_PLUS_EQUAL; + RETURN_TOKEN(T_PLUS_EQUAL); } <ST_IN_SCRIPTING>"-=" { - return T_MINUS_EQUAL; + RETURN_TOKEN(T_MINUS_EQUAL); } <ST_IN_SCRIPTING>"*=" { - return T_MUL_EQUAL; + RETURN_TOKEN(T_MUL_EQUAL); } <ST_IN_SCRIPTING>"*\*" { - return T_POW; + 
RETURN_TOKEN(T_POW); } <ST_IN_SCRIPTING>"*\*=" { - return T_POW_EQUAL; + RETURN_TOKEN(T_POW_EQUAL); } <ST_IN_SCRIPTING>"/=" { - return T_DIV_EQUAL; + RETURN_TOKEN(T_DIV_EQUAL); } <ST_IN_SCRIPTING>".=" { - return T_CONCAT_EQUAL; + RETURN_TOKEN(T_CONCAT_EQUAL); } <ST_IN_SCRIPTING>"%=" { - return T_MOD_EQUAL; + RETURN_TOKEN(T_MOD_EQUAL); } <ST_IN_SCRIPTING>"<<=" { - return T_SL_EQUAL; + RETURN_TOKEN(T_SL_EQUAL); } <ST_IN_SCRIPTING>">>=" { - return T_SR_EQUAL; + RETURN_TOKEN(T_SR_EQUAL); } <ST_IN_SCRIPTING>"&=" { - return T_AND_EQUAL; + RETURN_TOKEN(T_AND_EQUAL); } <ST_IN_SCRIPTING>"|=" { - return T_OR_EQUAL; + RETURN_TOKEN(T_OR_EQUAL); } <ST_IN_SCRIPTING>"^=" { - return T_XOR_EQUAL; + RETURN_TOKEN(T_XOR_EQUAL); } <ST_IN_SCRIPTING>"||" { - return T_BOOLEAN_OR; + RETURN_TOKEN(T_BOOLEAN_OR); } <ST_IN_SCRIPTING>"&&" { - return T_BOOLEAN_AND; + RETURN_TOKEN(T_BOOLEAN_AND); } <ST_IN_SCRIPTING>"OR" { - return T_LOGICAL_OR; + RETURN_TOKEN(T_LOGICAL_OR); } <ST_IN_SCRIPTING>"AND" { - return T_LOGICAL_AND; + RETURN_TOKEN(T_LOGICAL_AND); } <ST_IN_SCRIPTING>"XOR" { - return T_LOGICAL_XOR; + RETURN_TOKEN(T_LOGICAL_XOR); } <ST_IN_SCRIPTING>"<<" { - return T_SL; + RETURN_TOKEN(T_SL); } <ST_IN_SCRIPTING>">>" { - return T_SR; + RETURN_TOKEN(T_SR); } <ST_IN_SCRIPTING>{TOKENS} { - return yytext[0]; + RETURN_TOKEN(yytext[0]); } <ST_IN_SCRIPTING>"{" { yy_push_state(ST_IN_SCRIPTING); - return '{'; + RETURN_TOKEN('{'); } <ST_DOUBLE_QUOTES,ST_BACKQUOTE,ST_HEREDOC>"${" { yy_push_state(ST_LOOKING_FOR_VARNAME); - return T_DOLLAR_OPEN_CURLY_BRACES; + RETURN_TOKEN(T_DOLLAR_OPEN_CURLY_BRACES); } @@ -1578,7 +1601,7 @@ NEWLINE ("\r"|"\n"|"\r\n") if (!zend_stack_is_empty(&SCNG(state_stack))) { yy_pop_state(); } - return '}'; + RETURN_TOKEN('}'); } @@ -1587,7 +1610,7 @@ NEWLINE ("\r"|"\n"|"\r\n") zend_copy_value(zendlval, yytext, yyleng); yy_pop_state(); yy_push_state(ST_IN_SCRIPTING); - return T_STRING_VARNAME; + RETURN_TOKEN(T_STRING_VARNAME); } @@ -1617,12 +1640,12 @@ NEWLINE ("\r"|"\n"|"\r\n") 
ZVAL_LONG(zendlval, ZEND_STRTOL(bin, &end, 2)); ZEND_ASSERT(!errno && end == yytext + yyleng); } - return T_LNUMBER; + RETURN_TOKEN(T_LNUMBER); } else { ZVAL_DOUBLE(zendlval, zend_bin_strtod(bin, (const char **)&end)); /* errno isn't checked since we allow HUGE_VAL/INF overflow */ ZEND_ASSERT(end == yytext + yyleng); - return T_DNUMBER; + RETURN_TOKEN(T_DNUMBER); } } @@ -1636,7 +1659,7 @@ NEWLINE ("\r"|"\n"|"\r\n") */ if (end != yytext + yyleng) { zend_throw_exception(zend_get_parse_exception(), "Invalid numeric literal", E_PARSE); - return T_ERROR; + RETURN_TOKEN(T_ERROR); } } else { errno = 0; @@ -1653,19 +1676,19 @@ NEWLINE ("\r"|"\n"|"\r\n") if (end != yytext + yyleng) { zend_throw_exception(zend_get_parse_exception(), "Invalid numeric literal", E_PARSE); - return T_ERROR; + RETURN_TOKEN(T_ERROR); } ZEND_ASSERT(!errno); - return T_DNUMBER; + RETURN_TOKEN(T_DNUMBER); } /* Also not an assert for the same reason */ if (end != yytext + yyleng) { zend_throw_exception(zend_get_parse_exception(), "Invalid numeric literal", E_PARSE); - return T_ERROR; + RETURN_TOKEN(T_ERROR); } } ZEND_ASSERT(!errno); - return T_LNUMBER; + RETURN_TOKEN(T_LNUMBER); } <ST_IN_SCRIPTING>{HNUM} { @@ -1687,12 +1710,12 @@ NEWLINE ("\r"|"\n"|"\r\n") ZVAL_LONG(zendlval, ZEND_STRTOL(hex, &end, 16)); ZEND_ASSERT(!errno && end == hex + len); } - return T_LNUMBER; + RETURN_TOKEN(T_LNUMBER); } else { ZVAL_DOUBLE(zendlval, zend_hex_strtod(hex, (const char **)&end)); /* errno isn't checked since we allow HUGE_VAL/INF overflow */ ZEND_ASSERT(end == hex + len); - return T_DNUMBER; + RETURN_TOKEN(T_DNUMBER); } } @@ -1709,12 +1732,12 @@ NEWLINE ("\r"|"\n"|"\r\n") string: ZVAL_STRINGL(zendlval, yytext, yyleng); } - return T_NUM_STRING; + RETURN_TOKEN(T_NUM_STRING); } <ST_VAR_OFFSET>{LNUM}|{HNUM}|{BNUM} { /* Offset must be treated as a string */ ZVAL_STRINGL(zendlval, yytext, yyleng); - return T_NUM_STRING; + RETURN_TOKEN(T_NUM_STRING); } <ST_IN_SCRIPTING>{DNUM}|{EXPONENT_DNUM} { @@ -1723,59 +1746,59 @@ 
string: ZVAL_DOUBLE(zendlval, zend_strtod(yytext, &end)); /* errno isn't checked since we allow HUGE_VAL/INF overflow */ ZEND_ASSERT(end == yytext + yyleng); - return T_DNUMBER; + RETURN_TOKEN(T_DNUMBER); } <ST_IN_SCRIPTING>"__CLASS__" { - return T_CLASS_C; + RETURN_TOKEN(T_CLASS_C); } <ST_IN_SCRIPTING>"__TRAIT__" { - return T_TRAIT_C; + RETURN_TOKEN(T_TRAIT_C); } <ST_IN_SCRIPTING>"__FUNCTION__" { - return T_FUNC_C; + RETURN_TOKEN(T_FUNC_C); } <ST_IN_SCRIPTING>"__METHOD__" { - return T_METHOD_C; + RETURN_TOKEN(T_METHOD_C); } <ST_IN_SCRIPTING>"__LINE__" { - return T_LINE; + RETURN_TOKEN(T_LINE); } <ST_IN_SCRIPTING>"__FILE__" { - return T_FILE; + RETURN_TOKEN(T_FILE); } <ST_IN_SCRIPTING>"__DIR__" { - return T_DIR; + RETURN_TOKEN(T_DIR); } <ST_IN_SCRIPTING>"__NAMESPACE__" { - return T_NS_C; + RETURN_TOKEN(T_NS_C); } <INITIAL>"<?=" { BEGIN(ST_IN_SCRIPTING); - return T_OPEN_TAG_WITH_ECHO; + RETURN_TOKEN(T_OPEN_TAG_WITH_ECHO); } <INITIAL>"<?php"([ \t]|{NEWLINE}) { HANDLE_NEWLINE(yytext[yyleng-1]); BEGIN(ST_IN_SCRIPTING); - return T_OPEN_TAG; + RETURN_TOKEN(T_OPEN_TAG); } <INITIAL>"<?" 
{ if (CG(short_tags)) { BEGIN(ST_IN_SCRIPTING); - return T_OPEN_TAG; + RETURN_TOKEN(T_OPEN_TAG); } else { goto inline_char_handler; } @@ -1783,7 +1806,7 @@ string: <INITIAL>{ANY_CHAR} { if (YYCURSOR > YYLIMIT) { - return 0; + RETURN_TOKEN(END); } inline_char_handler: @@ -1823,7 +1846,7 @@ inline_char_handler: ZVAL_STRINGL(zendlval, yytext, yyleng); } HANDLE_NEWLINES(yytext, yyleng); - return T_INLINE_HTML; + RETURN_TOKEN(T_INLINE_HTML); } @@ -1834,7 +1857,7 @@ inline_char_handler: yyless(yyleng - 3); yy_push_state(ST_LOOKING_FOR_PROPERTY); zend_copy_value(zendlval, (yytext+1), (yyleng-1)); - return T_VARIABLE; + RETURN_TOKEN(T_VARIABLE); } /* A [ always designates a variable offset, regardless of what follows @@ -1843,22 +1866,22 @@ inline_char_handler: yyless(yyleng - 1); yy_push_state(ST_VAR_OFFSET); zend_copy_value(zendlval, (yytext+1), (yyleng-1)); - return T_VARIABLE; + RETURN_TOKEN(T_VARIABLE); } <ST_IN_SCRIPTING,ST_DOUBLE_QUOTES,ST_HEREDOC,ST_BACKQUOTE,ST_VAR_OFFSET>"$"{LABEL} { zend_copy_value(zendlval, (yytext+1), (yyleng-1)); - return T_VARIABLE; + RETURN_TOKEN(T_VARIABLE); } <ST_VAR_OFFSET>"]" { yy_pop_state(); - return ']'; + RETURN_TOKEN(']'); } <ST_VAR_OFFSET>{TOKENS}|[{}"`] { /* Only '[' can be valid, but returning other tokens will allow a more explicit parse error */ - return yytext[0]; + RETURN_TOKEN(yytext[0]); } <ST_VAR_OFFSET>[ \n\r\t\\'#] { @@ -1866,16 +1889,16 @@ inline_char_handler: yyless(0); yy_pop_state(); ZVAL_NULL(zendlval); - return T_ENCAPSED_AND_WHITESPACE; + RETURN_TOKEN(T_ENCAPSED_AND_WHITESPACE); } <ST_IN_SCRIPTING,ST_VAR_OFFSET>{LABEL} { zend_copy_value(zendlval, yytext, yyleng); - return T_STRING; + RETURN_TOKEN(T_STRING); } -<ST_IN_SCRIPTING,ST_LOOKING_FOR_SEMI_RESERVED_NAME>"#"|"//" { +<ST_IN_SCRIPTING>"#"|"//" { while (YYCURSOR < YYLIMIT) { switch (*YYCURSOR++) { case '\r': @@ -1901,10 +1924,10 @@ inline_char_handler: yyleng = YYCURSOR - SCNG(yy_text); - return T_COMMENT; + RETURN_TOKEN(T_COMMENT); } 
-<ST_IN_SCRIPTING,ST_LOOKING_FOR_SEMI_RESERVED_NAME>"/*"|"/**"{WHITESPACE} { +<ST_IN_SCRIPTING>"/*"|"/**"{WHITESPACE} { int doc_com; if (yyleng > 2) { @@ -1931,27 +1954,15 @@ inline_char_handler: if (doc_com) { CG(doc_comment) = zend_string_init(yytext, yyleng, 0); - return T_DOC_COMMENT; + RETURN_TOKEN(T_DOC_COMMENT); } - return T_COMMENT; -} - -<ST_LOOKING_FOR_SEMI_RESERVED_NAME>{LABEL} { - zend_copy_value(zendlval, yytext, yyleng); - yy_pop_state(); - return T_STRING; -} - -<ST_LOOKING_FOR_SEMI_RESERVED_NAME>{ANY_CHAR} { - yyless(0); - yy_pop_state(); - goto restart; + RETURN_TOKEN(T_COMMENT); } <ST_IN_SCRIPTING>"?>"{NEWLINE}? { BEGIN(INITIAL); - return T_CLOSE_TAG; /* implicit ';' at php-end tag */ + RETURN_TOKEN(T_CLOSE_TAG); /* implicit ';' at php-end tag */ } @@ -1977,7 +1988,7 @@ inline_char_handler: * for ' (unrecognized by parser), instead of old flex fallback to "Unexpected character..." * rule, which continued in ST_IN_SCRIPTING state after the quote */ ZVAL_NULL(zendlval); - return T_ENCAPSED_AND_WHITESPACE; + RETURN_TOKEN(T_ENCAPSED_AND_WHITESPACE); } } @@ -2020,7 +2031,7 @@ inline_char_handler: SCNG(output_filter)((unsigned char **)&str, &sz, (unsigned char *)s, (size_t)Z_STRLEN_P(zendlval)); ZVAL_STRINGL(zendlval, str, sz); } - return T_CONSTANT_ENCAPSED_STRING; + RETURN_TOKEN(T_CONSTANT_ENCAPSED_STRING); } @@ -2032,9 +2043,9 @@ inline_char_handler: case '"': yyleng = YYCURSOR - SCNG(yy_text); if (zend_scan_escape_string(zendlval, yytext+bprefix+1, yyleng-bprefix-2, '"') == FAILURE) { - return T_ERROR; + RETURN_TOKEN(T_ERROR); } - return T_CONSTANT_ENCAPSED_STRING; + RETURN_TOKEN(T_CONSTANT_ENCAPSED_STRING); case '$': if (IS_LABEL_START(*YYCURSOR) || *YYCURSOR == '{') { break; @@ -2064,7 +2075,7 @@ inline_char_handler: YYCURSOR = SCNG(yy_text) + yyleng; BEGIN(ST_DOUBLE_QUOTES); - return '"'; + RETURN_TOKEN('"'); } @@ -2112,13 +2123,13 @@ inline_char_handler: zend_ptr_stack_push(&SCNG(heredoc_label_stack), (void *) heredoc_label); - return 
T_START_HEREDOC; + RETURN_TOKEN(T_START_HEREDOC); } <ST_IN_SCRIPTING>[`] { BEGIN(ST_BACKQUOTE); - return '`'; + RETURN_TOKEN('`'); } @@ -2132,7 +2143,7 @@ inline_char_handler: efree(heredoc_label); BEGIN(ST_IN_SCRIPTING); - return T_END_HEREDOC; + RETURN_TOKEN(T_END_HEREDOC); } @@ -2140,18 +2151,18 @@ inline_char_handler: Z_LVAL_P(zendlval) = (zend_long) '{'; yy_push_state(ST_IN_SCRIPTING); yyless(1); - return T_CURLY_OPEN; + RETURN_TOKEN(T_CURLY_OPEN); } <ST_DOUBLE_QUOTES>["] { BEGIN(ST_IN_SCRIPTING); - return '"'; + RETURN_TOKEN('"'); } <ST_BACKQUOTE>[`] { BEGIN(ST_IN_SCRIPTING); - return '`'; + RETURN_TOKEN('`'); } @@ -2164,7 +2175,7 @@ inline_char_handler: } if (YYCURSOR > YYLIMIT) { - return 0; + RETURN_TOKEN(END); } if (yytext[0] == '\\' && YYCURSOR < YYLIMIT) { YYCURSOR++; @@ -2201,15 +2212,15 @@ double_quotes_scan_done: yyleng = YYCURSOR - SCNG(yy_text); if (zend_scan_escape_string(zendlval, yytext, yyleng, '"') == FAILURE) { - return T_ERROR; + RETURN_TOKEN(T_ERROR); } - return T_ENCAPSED_AND_WHITESPACE; + RETURN_TOKEN(T_ENCAPSED_AND_WHITESPACE); } <ST_BACKQUOTE>{ANY_CHAR} { if (YYCURSOR > YYLIMIT) { - return 0; + RETURN_TOKEN(END); } if (yytext[0] == '\\' && YYCURSOR < YYLIMIT) { YYCURSOR++; @@ -2245,9 +2256,9 @@ double_quotes_scan_done: yyleng = YYCURSOR - SCNG(yy_text); if (zend_scan_escape_string(zendlval, yytext, yyleng, '`') == FAILURE) { - return T_ERROR; + RETURN_TOKEN(T_ERROR); } - return T_ENCAPSED_AND_WHITESPACE; + RETURN_TOKEN(T_ENCAPSED_AND_WHITESPACE); } @@ -2257,7 +2268,7 @@ double_quotes_scan_done: zend_heredoc_label *heredoc_label = zend_ptr_stack_top(&SCNG(heredoc_label_stack)); if (YYCURSOR > YYLIMIT) { - return 0; + RETURN_TOKEN(END); } YYCURSOR--; @@ -2321,9 +2332,9 @@ heredoc_scan_done: yyleng = YYCURSOR - SCNG(yy_text); if (zend_scan_escape_string(zendlval, yytext, yyleng - newline, 0) == FAILURE) { - return T_ERROR; + RETURN_TOKEN(T_ERROR); } - return T_ENCAPSED_AND_WHITESPACE; + RETURN_TOKEN(T_ENCAPSED_AND_WHITESPACE); } @@ -2333,7 
+2344,7 @@ heredoc_scan_done: zend_heredoc_label *heredoc_label = zend_ptr_stack_top(&SCNG(heredoc_label_stack)); if (YYCURSOR > YYLIMIT) { - return 0; + RETURN_TOKEN(END); } YYCURSOR--; @@ -2380,13 +2391,13 @@ nowdoc_scan_done: zend_copy_value(zendlval, yytext, yyleng - newline); HANDLE_NEWLINES(yytext, yyleng - newline); - return T_ENCAPSED_AND_WHITESPACE; + RETURN_TOKEN(T_ENCAPSED_AND_WHITESPACE); } <ST_IN_SCRIPTING,ST_VAR_OFFSET>{ANY_CHAR} { if (YYCURSOR > YYLIMIT) { - return 0; + RETURN_TOKEN(END); } zend_error(E_COMPILE_WARNING,"Unexpected character in input: '%c' (ASCII=%d) state=%d", yytext[0], yytext[0], YYSTATE); diff --git a/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_000.phpt b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_000.phpt new file mode 100644 index 0000000000..03b991b1a5 --- /dev/null +++ b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_000.phpt @@ -0,0 +1,19 @@ +--TEST-- +Parse errors during token_get_all() with TOKEN_PARSE flag +--SKIPIF-- +<?php if (!extension_loaded("tokenizer")) print "skip"; ?> +--FILE-- +<?php + +try { + token_get_all('<?php invalid code;', TOKEN_PARSE); +} catch (ParseException $e) { + echo $e->getMessage(), PHP_EOL; +} + +echo "Done"; + +?> +--EXPECT-- +syntax error, unexpected 'code' (T_STRING) +Done diff --git a/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_001.phpt b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_001.phpt new file mode 100644 index 0000000000..ab334358ab --- /dev/null +++ b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_001.phpt @@ -0,0 +1,81 @@ +--TEST-- +Semi reserved words support: member access +--SKIPIF-- +<?php if (!extension_loaded("tokenizer")) print "skip"; ?> +--FILE-- +<?php +$tokens = token_get_all('<?php +X::continue; +X::$continue; +$x->$continue; +X::continue(); +$x->continue(); +X::class; + +class X { + const CONTINUE = 1; + public $x = self::CONTINUE + 1; +} +', TOKEN_PARSE); + +array_walk($tokens, function($tk) { + if(is_array($tk)) { + if(($t = token_name($tk[0])) == 
'T_WHITESPACE') return; + echo "L{$tk[2]}: ".$t." {$tk[1]}", PHP_EOL; + } + else echo $tk, PHP_EOL; +}); + +echo "Done"; + +?> +--EXPECTF-- +L1: T_OPEN_TAG <?php + +L2: T_STRING X +L2: T_DOUBLE_COLON :: +L2: T_STRING continue +; +L3: T_STRING X +L3: T_DOUBLE_COLON :: +L3: T_VARIABLE $continue +; +L4: T_VARIABLE $x +L4: T_OBJECT_OPERATOR -> +L4: T_VARIABLE $continue +; +L5: T_STRING X +L5: T_DOUBLE_COLON :: +L5: T_STRING continue +( +) +; +L6: T_VARIABLE $x +L6: T_OBJECT_OPERATOR -> +L6: T_STRING continue +( +) +; +L7: T_STRING X +L7: T_DOUBLE_COLON :: +L7: T_CLASS class +; +L9: T_CLASS class +L9: T_STRING X +{ +L10: T_CONST const +L10: T_STRING CONTINUE += +L10: T_LNUMBER 1 +; +L11: T_PUBLIC public +L11: T_VARIABLE $x += +L11: T_STRING self +L11: T_DOUBLE_COLON :: +L11: T_STRING CONTINUE ++ +L11: T_LNUMBER 1 +; +} +Done diff --git a/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_002.phpt b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_002.phpt new file mode 100644 index 0000000000..3dd8e14d84 --- /dev/null +++ b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_002.phpt @@ -0,0 +1,68 @@ +--TEST-- +Semi reserved words support: class const +--SKIPIF-- +<?php if (!extension_loaded("tokenizer")) print "skip"; ?> +--FILE-- +<?php +$tokens = token_get_all('<?php + class SomeClass { + const CONST = 1; + const CONTINUE = (self::CONST + 1); + const ARRAY = [1, self::CONTINUE => [3, 4], 5]; + } +', TOKEN_PARSE); + +array_walk($tokens, function($tk) { + if(is_array($tk)) { + if(($t = token_name($tk[0])) == 'T_WHITESPACE') return; + echo "L{$tk[2]}: ".$t." 
{$tk[1]}", PHP_EOL; + } + else echo $tk, PHP_EOL; +}); + +echo "Done"; + +?> +--EXPECTF-- +L1: T_OPEN_TAG <?php + +L2: T_CLASS class +L2: T_STRING SomeClass +{ +L3: T_CONST const +L3: T_STRING CONST += +L3: T_LNUMBER 1 +; +L4: T_CONST const +L4: T_STRING CONTINUE += +( +L4: T_STRING self +L4: T_DOUBLE_COLON :: +L4: T_STRING CONST ++ +L4: T_LNUMBER 1 +) +; +L5: T_CONST const +L5: T_STRING ARRAY += +[ +L5: T_LNUMBER 1 +, +L5: T_STRING self +L5: T_DOUBLE_COLON :: +L5: T_STRING CONTINUE +L5: T_DOUBLE_ARROW => +[ +L5: T_LNUMBER 3 +, +L5: T_LNUMBER 4 +] +, +L5: T_LNUMBER 5 +] +; +} +Done diff --git a/ext/tokenizer/tests/token_get_all_error.phpt b/ext/tokenizer/tests/token_get_all_error.phpt index 29e97c38c4..9ded0a1774 100644 --- a/ext/tokenizer/tests/token_get_all_error.phpt +++ b/ext/tokenizer/tests/token_get_all_error.phpt @@ -19,7 +19,7 @@ var_dump( token_get_all()); echo "-- Testing token_get_all() function with more than expected no. of arguments --\n"; $source = '<?php ?>'; $extra_arg = 10; -var_dump( token_get_all($source, $extra_arg)); +var_dump( token_get_all($source, true, $extra_arg)); echo "Done" ?> @@ -28,10 +28,10 @@ echo "Done" -- Testing token_get_all() function with zero arguments -- -Warning: token_get_all() expects exactly 1 parameter, 0 given in %s on line %d +Warning: token_get_all() expects at least 1 parameter, 0 given in %s on line 11 NULL -- Testing token_get_all() function with more than expected no. of arguments -- -Warning: token_get_all() expects exactly 1 parameter, 2 given in %s on line %d +Warning: token_get_all() expects at most 2 parameters, 3 given in %s on line 17 NULL -Done +Done
\ No newline at end of file diff --git a/ext/tokenizer/tokenizer.c b/ext/tokenizer/tokenizer.c index c4b9d14359..2a4fa90ca2 100644 --- a/ext/tokenizer/tokenizer.c +++ b/ext/tokenizer/tokenizer.c @@ -37,6 +37,12 @@ #define zendcursor LANG_SCNG(yy_cursor) #define zendlimit LANG_SCNG(yy_limit) +#define TOKEN_PARSE 1 + +void tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS) { + REGISTER_LONG_CONSTANT("TOKEN_PARSE", TOKEN_PARSE, CONST_CS|CONST_PERSISTENT); +} + /* {{{ arginfo */ ZEND_BEGIN_ARG_INFO_EX(arginfo_token_get_all, 0, 0, 1) ZEND_ARG_INFO(0, source) @@ -83,6 +89,7 @@ ZEND_GET_MODULE(tokenizer) PHP_MINIT_FUNCTION(tokenizer) { tokenizer_register_constants(INIT_FUNC_ARGS_PASSTHRU); + tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS_PASSTHRU); return SUCCESS; } /* }}} */ @@ -97,8 +104,10 @@ PHP_MINFO_FUNCTION(tokenizer) } /* }}} */ -static void tokenize(zval *return_value) +static zend_bool tokenize(zval *return_value, zend_string *source) { + zval source_zval; + zend_lex_state original_lex_state; zval token; zval keyword; int token_type; @@ -106,10 +115,22 @@ static void tokenize(zval *return_value) int token_line = 1; int need_tokens = -1; /* for __halt_compiler lexing. 
-1 = disabled */ + ZVAL_STR_COPY(&source_zval, source); + zend_save_lexical_state(&original_lex_state); + + if (zend_prepare_string_for_scanning(&source_zval, "") == FAILURE) { + zend_restore_lexical_state(&original_lex_state); + return 0; + } + + LANG_SCNG(yy_state) = yycINITIAL; array_init(return_value); ZVAL_NULL(&token); while ((token_type = lex_scan(&token))) { + + if(token_type == T_ERROR) break; + destroy = 1; switch (token_type) { case T_CLOSE_TAG: @@ -123,8 +144,6 @@ static void tokenize(zval *return_value) case T_DOC_COMMENT: destroy = 0; break; - case T_ERROR: - return; } if (token_type >= 256) { @@ -169,34 +188,113 @@ static void tokenize(zval *return_value) token_line = CG(zend_lineno); } + + zval_dtor(&source_zval); + zend_restore_lexical_state(&original_lex_state); + + return 1; } -/* {{{ proto array token_get_all(string source) - */ -PHP_FUNCTION(token_get_all) +zval token_stream; + +void on_event(zend_php_scanner_event event, int token, int line) { - zend_string *source; - zval source_zval; - zend_lex_state original_lex_state; + zval keyword; + HashTable *tokens_ht; + zval *token_zv; - if (zend_parse_parameters(ZEND_NUM_ARGS(), "S", &source) == FAILURE) { - return; + switch(event) { + case ON_TOKEN: + if (token == T_ERROR || token == END) break; + if (token >= 256) { + array_init(&keyword); + add_next_index_long(&keyword, token); + add_next_index_stringl(&keyword, (char *)LANG_SCNG(yy_text), LANG_SCNG(yy_leng)); + add_next_index_long(&keyword, line); + add_next_index_zval(&token_stream, &keyword); + } else { + add_next_index_stringl(&token_stream, (char *)LANG_SCNG(yy_text), LANG_SCNG(yy_leng)); + } + break; + case ON_FEEDBACK: + tokens_ht = Z_ARRVAL(token_stream); + token_zv = zend_hash_index_find(tokens_ht, zend_hash_num_elements(tokens_ht) - 1); + if (token_zv && Z_TYPE_P(token_zv) == IS_ARRAY) { + ZVAL_LONG(zend_hash_index_find(Z_ARRVAL_P(token_zv), 0), token); + } + break; + case ON_STOP: + if (LANG_SCNG(yy_cursor) != LANG_SCNG(yy_limit)) { + 
array_init(&keyword); + add_next_index_long(&keyword, T_INLINE_HTML); + add_next_index_stringl(&keyword, + (char *)LANG_SCNG(yy_cursor), LANG_SCNG(yy_limit) - LANG_SCNG(yy_cursor)); + add_next_index_long(&keyword, CG(zend_lineno)); + add_next_index_zval(&token_stream, &keyword); + } + break; } +} + +static zend_bool tokenize_parse(zval *return_value, zend_string *source) +{ + zval source_zval; + zend_lex_state original_lex_state; + zend_bool original_in_compilation; + zend_bool success; ZVAL_STR_COPY(&source_zval, source); + + original_in_compilation = CG(in_compilation); + CG(in_compilation) = 1; zend_save_lexical_state(&original_lex_state); - if (zend_prepare_string_for_scanning(&source_zval, "") == FAILURE) { - zend_restore_lexical_state(&original_lex_state); - RETURN_FALSE; - } + if ((success = (zend_prepare_string_for_scanning(&source_zval, "") == SUCCESS))) { + CG(ast) = NULL; + CG(ast_arena) = zend_arena_create(1024 * 32); + LANG_SCNG(yy_state) = yycINITIAL; + LANG_SCNG(on_event) = on_event; - LANG_SCNG(yy_state) = yycINITIAL; + array_init(&token_stream); + if((success = (zendparse() == SUCCESS))) { + ZVAL_ZVAL(return_value, &token_stream, 1, 0); + } + zval_dtor(&token_stream); - tokenize(return_value); + zend_ast_destroy(CG(ast)); + zend_arena_destroy(CG(ast_arena)); + } + /* restore compiler and scanner global states */ zend_restore_lexical_state(&original_lex_state); + CG(in_compilation) = original_in_compilation; + zval_dtor(&source_zval); + + return success; +} + +/* }}} */ + +/* {{{ proto array token_get_all(string source) + */ +PHP_FUNCTION(token_get_all) +{ + zend_string *source; + zend_long flags = 0; + zend_bool success; + + if (zend_parse_parameters(ZEND_NUM_ARGS(), "S|l", &source, &flags) == FAILURE) { + return; + } + + if (flags & TOKEN_PARSE) { + success = tokenize_parse(return_value, source); + } else { + success = tokenize(return_value, source); + } + + if (!success) RETURN_FALSE; } /* }}} */ |
