diff options
| author | Márcio Almada <marcio3w@gmail.com> | 2015-04-05 08:50:35 -0300 |
|---|---|---|
| committer | Márcio Almada <marcio3w@gmail.com> | 2015-04-30 03:03:29 -0300 |
| commit | 110759386e2f9b4d88bf68c669b6c54ad4b5c04f (patch) | |
| tree | c0dc58e312c77662a5f6e10941408560a4b440ac /ext/tokenizer | |
| parent | 02a9eb4f8c736089808b51d862def0e648383e09 (diff) | |
| download | php-git-110759386e2f9b4d88bf68c669b6c54ad4b5c04f.tar.gz | |
ext tokenizer port + cleanup unused lexer states
we basically added a mechanism to store the token stream during parsing
and exposed the entire parser stack on the tokenizer extension through
an opt in flag: token_get_all($src, TOKEN_PARSE).
this change allows easy future language enhancements regarding context
aware parsing & scanning without further maintance on the tokenizer
extension while solves known inconsistencies "parseless" tokenizer
extension has when it handles `__halt_compiler()` presence.
Diffstat (limited to 'ext/tokenizer')
| -rw-r--r-- | ext/tokenizer/tests/token_get_all_TOKEN_PARSE_000.phpt | 19 | ||||
| -rw-r--r-- | ext/tokenizer/tests/token_get_all_TOKEN_PARSE_001.phpt | 81 | ||||
| -rw-r--r-- | ext/tokenizer/tests/token_get_all_TOKEN_PARSE_002.phpt | 68 | ||||
| -rw-r--r-- | ext/tokenizer/tests/token_get_all_error.phpt | 8 | ||||
| -rw-r--r-- | ext/tokenizer/tokenizer.c | 132 |
5 files changed, 287 insertions, 21 deletions
diff --git a/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_000.phpt b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_000.phpt new file mode 100644 index 0000000000..03b991b1a5 --- /dev/null +++ b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_000.phpt @@ -0,0 +1,19 @@ +--TEST-- +Parse errors during token_get_all() with TOKEN_PARSE flag +--SKIPIF-- +<?php if (!extension_loaded("tokenizer")) print "skip"; ?> +--FILE-- +<?php + +try { + token_get_all('<?php invalid code;', TOKEN_PARSE); +} catch (ParseException $e) { + echo $e->getMessage(), PHP_EOL; +} + +echo "Done"; + +?> +--EXPECT-- +syntax error, unexpected 'code' (T_STRING) +Done diff --git a/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_001.phpt b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_001.phpt new file mode 100644 index 0000000000..ab334358ab --- /dev/null +++ b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_001.phpt @@ -0,0 +1,81 @@ +--TEST-- +Semi reserved words support: member access +--SKIPIF-- +<?php if (!extension_loaded("tokenizer")) print "skip"; ?> +--FILE-- +<?php +$tokens = token_get_all('<?php +X::continue; +X::$continue; +$x->$continue; +X::continue(); +$x->continue(); +X::class; + +class X { + const CONTINUE = 1; + public $x = self::CONTINUE + 1; +} +', TOKEN_PARSE); + +array_walk($tokens, function($tk) { + if(is_array($tk)) { + if(($t = token_name($tk[0])) == 'T_WHITESPACE') return; + echo "L{$tk[2]}: ".$t." {$tk[1]}", PHP_EOL; + } + else echo $tk, PHP_EOL; +}); + +echo "Done"; + +?> +--EXPECTF-- +L1: T_OPEN_TAG <?php + +L2: T_STRING X +L2: T_DOUBLE_COLON :: +L2: T_STRING continue +; +L3: T_STRING X +L3: T_DOUBLE_COLON :: +L3: T_VARIABLE $continue +; +L4: T_VARIABLE $x +L4: T_OBJECT_OPERATOR -> +L4: T_VARIABLE $continue +; +L5: T_STRING X +L5: T_DOUBLE_COLON :: +L5: T_STRING continue +( +) +; +L6: T_VARIABLE $x +L6: T_OBJECT_OPERATOR -> +L6: T_STRING continue +( +) +; +L7: T_STRING X +L7: T_DOUBLE_COLON :: +L7: T_CLASS class +; +L9: T_CLASS class +L9: T_STRING X +{ +L10: T_CONST const +L10: T_STRING CONTINUE += +L10: T_LNUMBER 1 +; +L11: T_PUBLIC public +L11: T_VARIABLE $x += +L11: T_STRING self +L11: T_DOUBLE_COLON :: +L11: T_STRING CONTINUE ++ +L11: T_LNUMBER 1 +; +} +Done diff --git a/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_002.phpt b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_002.phpt new file mode 100644 index 0000000000..3dd8e14d84 --- /dev/null +++ b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_002.phpt @@ -0,0 +1,68 @@ +--TEST-- +Semi reserved words support: class const +--SKIPIF-- +<?php if (!extension_loaded("tokenizer")) print "skip"; ?> +--FILE-- +<?php +$tokens = token_get_all('<?php + class SomeClass { + const CONST = 1; + const CONTINUE = (self::CONST + 1); + const ARRAY = [1, self::CONTINUE => [3, 4], 5]; + } +', TOKEN_PARSE); + +array_walk($tokens, function($tk) { + if(is_array($tk)) { + if(($t = token_name($tk[0])) == 'T_WHITESPACE') return; + echo "L{$tk[2]}: ".$t." {$tk[1]}", PHP_EOL; + } + else echo $tk, PHP_EOL; +}); + +echo "Done"; + +?> +--EXPECTF-- +L1: T_OPEN_TAG <?php + +L2: T_CLASS class +L2: T_STRING SomeClass +{ +L3: T_CONST const +L3: T_STRING CONST += +L3: T_LNUMBER 1 +; +L4: T_CONST const +L4: T_STRING CONTINUE += +( +L4: T_STRING self +L4: T_DOUBLE_COLON :: +L4: T_STRING CONST ++ +L4: T_LNUMBER 1 +) +; +L5: T_CONST const +L5: T_STRING ARRAY += +[ +L5: T_LNUMBER 1 +, +L5: T_STRING self +L5: T_DOUBLE_COLON :: +L5: T_STRING CONTINUE +L5: T_DOUBLE_ARROW => +[ +L5: T_LNUMBER 3 +, +L5: T_LNUMBER 4 +] +, +L5: T_LNUMBER 5 +] +; +} +Done diff --git a/ext/tokenizer/tests/token_get_all_error.phpt b/ext/tokenizer/tests/token_get_all_error.phpt index 29e97c38c4..9ded0a1774 100644 --- a/ext/tokenizer/tests/token_get_all_error.phpt +++ b/ext/tokenizer/tests/token_get_all_error.phpt @@ -19,7 +19,7 @@ var_dump( token_get_all()); echo "-- Testing token_get_all() function with more than expected no. of arguments --\n"; $source = '<?php ?>'; $extra_arg = 10; -var_dump( token_get_all($source, $extra_arg)); +var_dump( token_get_all($source, true, $extra_arg)); echo "Done" ?> @@ -28,10 +28,10 @@ echo "Done" -- Testing token_get_all() function with zero arguments -- -Warning: token_get_all() expects exactly 1 parameter, 0 given in %s on line %d +Warning: token_get_all() expects at least 1 parameter, 0 given in %s on line 11 NULL -- Testing token_get_all() function with more than expected no. of arguments -- -Warning: token_get_all() expects exactly 1 parameter, 2 given in %s on line %d +Warning: token_get_all() expects at most 2 parameters, 3 given in %s on line 17 NULL -Done +Done
\ No newline at end of file diff --git a/ext/tokenizer/tokenizer.c b/ext/tokenizer/tokenizer.c index c4b9d14359..2a4fa90ca2 100644 --- a/ext/tokenizer/tokenizer.c +++ b/ext/tokenizer/tokenizer.c @@ -37,6 +37,12 @@ #define zendcursor LANG_SCNG(yy_cursor) #define zendlimit LANG_SCNG(yy_limit) +#define TOKEN_PARSE 1 + +void tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS) { + REGISTER_LONG_CONSTANT("TOKEN_PARSE", TOKEN_PARSE, CONST_CS|CONST_PERSISTENT); +} + /* {{{ arginfo */ ZEND_BEGIN_ARG_INFO_EX(arginfo_token_get_all, 0, 0, 1) ZEND_ARG_INFO(0, source) @@ -83,6 +89,7 @@ ZEND_GET_MODULE(tokenizer) PHP_MINIT_FUNCTION(tokenizer) { tokenizer_register_constants(INIT_FUNC_ARGS_PASSTHRU); + tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS_PASSTHRU); return SUCCESS; } /* }}} */ @@ -97,8 +104,10 @@ PHP_MINFO_FUNCTION(tokenizer) } /* }}} */ -static void tokenize(zval *return_value) +static zend_bool tokenize(zval *return_value, zend_string *source) { + zval source_zval; + zend_lex_state original_lex_state; zval token; zval keyword; int token_type; @@ -106,10 +115,22 @@ static void tokenize(zval *return_value) int token_line = 1; int need_tokens = -1; /* for __halt_compiler lexing. -1 = disabled */ + ZVAL_STR_COPY(&source_zval, source); + zend_save_lexical_state(&original_lex_state); + + if (zend_prepare_string_for_scanning(&source_zval, "") == FAILURE) { + zend_restore_lexical_state(&original_lex_state); + return 0; + } + + LANG_SCNG(yy_state) = yycINITIAL; array_init(return_value); ZVAL_NULL(&token); while ((token_type = lex_scan(&token))) { + + if(token_type == T_ERROR) break; + destroy = 1; switch (token_type) { case T_CLOSE_TAG: @@ -123,8 +144,6 @@ static void tokenize(zval *return_value) case T_DOC_COMMENT: destroy = 0; break; - case T_ERROR: - return; } if (token_type >= 256) { @@ -169,34 +188,113 @@ static void tokenize(zval *return_value) token_line = CG(zend_lineno); } + + zval_dtor(&source_zval); + zend_restore_lexical_state(&original_lex_state); + + return 1; } -/* {{{ proto array token_get_all(string source) - */ -PHP_FUNCTION(token_get_all) +zval token_stream; + +void on_event(zend_php_scanner_event event, int token, int line) { - zend_string *source; - zval source_zval; - zend_lex_state original_lex_state; + zval keyword; + HashTable *tokens_ht; + zval *token_zv; - if (zend_parse_parameters(ZEND_NUM_ARGS(), "S", &source) == FAILURE) { - return; + switch(event) { + case ON_TOKEN: + if (token == T_ERROR || token == END) break; + if (token >= 256) { + array_init(&keyword); + add_next_index_long(&keyword, token); + add_next_index_stringl(&keyword, (char *)LANG_SCNG(yy_text), LANG_SCNG(yy_leng)); + add_next_index_long(&keyword, line); + add_next_index_zval(&token_stream, &keyword); + } else { + add_next_index_stringl(&token_stream, (char *)LANG_SCNG(yy_text), LANG_SCNG(yy_leng)); + } + break; + case ON_FEEDBACK: + tokens_ht = Z_ARRVAL(token_stream); + token_zv = zend_hash_index_find(tokens_ht, zend_hash_num_elements(tokens_ht) - 1); + if (token_zv && Z_TYPE_P(token_zv) == IS_ARRAY) { + ZVAL_LONG(zend_hash_index_find(Z_ARRVAL_P(token_zv), 0), token); + } + break; + case ON_STOP: + if (LANG_SCNG(yy_cursor) != LANG_SCNG(yy_limit)) { + array_init(&keyword); + add_next_index_long(&keyword, T_INLINE_HTML); + add_next_index_stringl(&keyword, + (char *)LANG_SCNG(yy_cursor), LANG_SCNG(yy_limit) - LANG_SCNG(yy_cursor)); + add_next_index_long(&keyword, CG(zend_lineno)); + add_next_index_zval(&token_stream, &keyword); + } + break; } +} + +static zend_bool tokenize_parse(zval *return_value, zend_string *source) +{ + zval source_zval; + zend_lex_state original_lex_state; + zend_bool original_in_compilation; + zend_bool success; ZVAL_STR_COPY(&source_zval, source); + + original_in_compilation = CG(in_compilation); + CG(in_compilation) = 1; zend_save_lexical_state(&original_lex_state); - if (zend_prepare_string_for_scanning(&source_zval, "") == FAILURE) { - zend_restore_lexical_state(&original_lex_state); - RETURN_FALSE; - } + if ((success = (zend_prepare_string_for_scanning(&source_zval, "") == SUCCESS))) { + CG(ast) = NULL; + CG(ast_arena) = zend_arena_create(1024 * 32); + LANG_SCNG(yy_state) = yycINITIAL; + LANG_SCNG(on_event) = on_event; - LANG_SCNG(yy_state) = yycINITIAL; + array_init(&token_stream); + if((success = (zendparse() == SUCCESS))) { + ZVAL_ZVAL(return_value, &token_stream, 1, 0); + } + zval_dtor(&token_stream); - tokenize(return_value); + zend_ast_destroy(CG(ast)); + zend_arena_destroy(CG(ast_arena)); + } + /* restore compiler and scanner global states */ zend_restore_lexical_state(&original_lex_state); + CG(in_compilation) = original_in_compilation; + zval_dtor(&source_zval); + + return success; +} + +/* }}} */ + +/* {{{ proto array token_get_all(string source) + */ +PHP_FUNCTION(token_get_all) +{ + zend_string *source; + zend_long flags = 0; + zend_bool success; + + if (zend_parse_parameters(ZEND_NUM_ARGS(), "S|l", &source, &flags) == FAILURE) { + return; + } + + if (flags & TOKEN_PARSE) { + success = tokenize_parse(return_value, source); + } else { + success = tokenize(return_value, source); + } + + if (!success) RETURN_FALSE; } /* }}} */ |
