diff options
Diffstat (limited to 'ext/tokenizer/tokenizer.c')
| -rw-r--r-- | ext/tokenizer/tokenizer.c | 217 | 
1 files changed, 150 insertions, 67 deletions
diff --git a/ext/tokenizer/tokenizer.c b/ext/tokenizer/tokenizer.c index 15a7601927..c0eb38a63f 100644 --- a/ext/tokenizer/tokenizer.c +++ b/ext/tokenizer/tokenizer.c @@ -1,6 +1,6 @@  /*     +----------------------------------------------------------------------+ -   | PHP Version 5                                                        | +   | PHP Version 7                                                        |     +----------------------------------------------------------------------+     | Copyright (c) 1997-2016 The PHP Group                                |     +----------------------------------------------------------------------+ @@ -28,6 +28,7 @@  #include "php_tokenizer.h"  #include "zend.h" +#include "zend_exceptions.h"  #include "zend_language_scanner.h"  #include "zend_language_scanner_defs.h"  #include <zend_language_parser.h> @@ -37,9 +38,16 @@  #define zendcursor LANG_SCNG(yy_cursor)  #define zendlimit  LANG_SCNG(yy_limit) +#define TOKEN_PARSE 				1 + +void tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS) { +	REGISTER_LONG_CONSTANT("TOKEN_PARSE", TOKEN_PARSE, CONST_CS|CONST_PERSISTENT); +} +  /* {{{ arginfo */  ZEND_BEGIN_ARG_INFO_EX(arginfo_token_get_all, 0, 0, 1)  	ZEND_ARG_INFO(0, source) +	ZEND_ARG_INFO(0, flags)  ZEND_END_ARG_INFO()  ZEND_BEGIN_ARG_INFO_EX(arginfo_token_name, 0, 0, 1) @@ -61,9 +69,7 @@ const zend_function_entry tokenizer_functions[] = {  /* {{{ tokenizer_module_entry   */  zend_module_entry tokenizer_module_entry = { -#if ZEND_MODULE_API_NO >= 20010901  	STANDARD_MODULE_HEADER, -#endif  	"tokenizer",  	tokenizer_functions,  	PHP_MINIT(tokenizer), @@ -71,9 +77,7 @@ zend_module_entry tokenizer_module_entry = {  	NULL,  	NULL,  	PHP_MINFO(tokenizer), -#if ZEND_MODULE_API_NO >= 20010901 -	"0.1", /* Replace with version number for your extension */ -#endif +	PHP_TOKENIZER_VERSION,  	STANDARD_MODULE_PROPERTIES  };  /* }}} */ @@ -87,6 +91,7 @@ ZEND_GET_MODULE(tokenizer)  PHP_MINIT_FUNCTION(tokenizer)  {  	tokenizer_register_constants(INIT_FUNC_ARGS_PASSTHRU); +	tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS_PASSTHRU);  	return SUCCESS;  }  /* }}} */ @@ -101,69 +106,67 @@ PHP_MINFO_FUNCTION(tokenizer)  }  /* }}} */ -static void tokenize(zval *return_value TSRMLS_DC) +static zend_bool tokenize(zval *return_value, zend_string *source)  { +	zval source_zval; +	zend_lex_state original_lex_state;  	zval token; -	zval *keyword; +	zval keyword;  	int token_type; -	zend_bool destroy;  	int token_line = 1; -	int need_tokens = -1; // for __halt_compiler lexing. -1 = disabled +	int need_tokens = -1; /* for __halt_compiler lexing. -1 = disabled */ +	ZVAL_STR_COPY(&source_zval, source); +	zend_save_lexical_state(&original_lex_state); + +	if (zend_prepare_string_for_scanning(&source_zval, "") == FAILURE) { +		zend_restore_lexical_state(&original_lex_state); +		return 0; +	} + +	LANG_SCNG(yy_state) = yycINITIAL;  	array_init(return_value); -	ZVAL_NULL(&token); -	while ((token_type = lex_scan(&token TSRMLS_CC))) { -		destroy = 1; -		switch (token_type) { -			case T_CLOSE_TAG: -				if (zendtext[zendleng - 1] != '>') { -					CG(zend_lineno)++; -				} -			case T_OPEN_TAG: -			case T_OPEN_TAG_WITH_ECHO: -			case T_WHITESPACE: -			case T_COMMENT: -			case T_DOC_COMMENT: -				destroy = 0; -				break; +	ZVAL_UNDEF(&token); +	while ((token_type = lex_scan(&token))) { +		if (token_type == T_CLOSE_TAG && zendtext[zendleng - 1] != '>') { +			CG(zend_lineno)++;  		}  		if (token_type >= 256) { -			MAKE_STD_ZVAL(keyword); -			array_init(keyword); -			add_next_index_long(keyword, token_type); +			array_init(&keyword); +			add_next_index_long(&keyword, token_type);  			if (token_type == T_END_HEREDOC) {  				if (CG(increment_lineno)) {  					token_line = ++CG(zend_lineno);  					CG(increment_lineno) = 0;  				}  			} -			add_next_index_stringl(keyword, (char *)zendtext, zendleng, 1); -			add_next_index_long(keyword, token_line); -			add_next_index_zval(return_value, keyword); +			add_next_index_stringl(&keyword, (char *)zendtext, zendleng); +			add_next_index_long(&keyword, token_line); +			add_next_index_zval(return_value, &keyword);  		} else { -			add_next_index_stringl(return_value, (char *)zendtext, zendleng, 1); +			add_next_index_stringl(return_value, (char *)zendtext, zendleng);  		} -		if (destroy && Z_TYPE(token) != IS_NULL) { + +		if (Z_TYPE(token) != IS_UNDEF) {  			zval_dtor(&token); +			ZVAL_UNDEF(&token);  		} -		ZVAL_NULL(&token); -		// after T_HALT_COMPILER collect the next three non-dropped tokens +		/* after T_HALT_COMPILER collect the next three non-dropped tokens */  		if (need_tokens != -1) {  			if (token_type != T_WHITESPACE && token_type != T_OPEN_TAG -			    && token_type != T_COMMENT && token_type != T_DOC_COMMENT -			    && --need_tokens == 0 +				&& token_type != T_COMMENT && token_type != T_DOC_COMMENT +				&& --need_tokens == 0  			) { -				// fetch the rest into a T_INLINE_HTML +				/* fetch the rest into a T_INLINE_HTML */  				if (zendcursor != zendlimit) { -					MAKE_STD_ZVAL(keyword); -					array_init(keyword); -					add_next_index_long(keyword, T_INLINE_HTML); -					add_next_index_stringl(keyword, (char *)zendcursor, zendlimit - zendcursor, 1); -					add_next_index_long(keyword, token_line); -					add_next_index_zval(return_value, keyword); +					array_init(&keyword); +					add_next_index_long(&keyword, T_INLINE_HTML); +					add_next_index_stringl(&keyword, (char *)zendcursor, zendlimit - zendcursor); +					add_next_index_long(&keyword, token_line); +					add_next_index_zval(return_value, &keyword);  				}  				break;  			} @@ -173,36 +176,116 @@ static void tokenize(zval *return_value TSRMLS_DC)  		token_line = CG(zend_lineno);  	} + +	zval_dtor(&source_zval); +	zend_restore_lexical_state(&original_lex_state); + +	return 1;  } -/* {{{ proto array token_get_all(string source) - */ -PHP_FUNCTION(token_get_all) +zval token_stream; + +void on_event(zend_php_scanner_event event, int token, int line) +{ +	zval keyword; +	HashTable *tokens_ht; +	zval *token_zv; + +	switch (event) { +		case ON_TOKEN: +			if (token == END) break; +			if (token >= 256) { +				array_init(&keyword); +				add_next_index_long(&keyword, token); +				add_next_index_stringl(&keyword, (char *)LANG_SCNG(yy_text), LANG_SCNG(yy_leng)); +				add_next_index_long(&keyword, line); +				add_next_index_zval(&token_stream, &keyword); +			} else { +				add_next_index_stringl(&token_stream, (char *)LANG_SCNG(yy_text), LANG_SCNG(yy_leng)); +			} +			break; +		case ON_FEEDBACK: +			tokens_ht = Z_ARRVAL(token_stream); +			token_zv = zend_hash_index_find(tokens_ht, zend_hash_num_elements(tokens_ht) - 1); +			if (token_zv && Z_TYPE_P(token_zv) == IS_ARRAY) { +				ZVAL_LONG(zend_hash_index_find(Z_ARRVAL_P(token_zv), 0), token); +			} +			break; +		case ON_STOP: +			if (LANG_SCNG(yy_cursor) != LANG_SCNG(yy_limit)) { +				array_init(&keyword); +				add_next_index_long(&keyword, T_INLINE_HTML); +				add_next_index_stringl(&keyword, +					(char *)LANG_SCNG(yy_cursor), LANG_SCNG(yy_limit) - LANG_SCNG(yy_cursor)); +				add_next_index_long(&keyword, CG(zend_lineno)); +				add_next_index_zval(&token_stream, &keyword); +			} +			break; +	} +} + +static zend_bool tokenize_parse(zval *return_value, zend_string *source)  { -	char *source = NULL; -	int argc = ZEND_NUM_ARGS(); -	int source_len; -	zval source_z; +	zval source_zval;  	zend_lex_state original_lex_state; +	zend_bool original_in_compilation; +	zend_bool success; -	if (zend_parse_parameters(argc TSRMLS_CC, "s", &source, &source_len) == FAILURE) { -		return; +	ZVAL_STR_COPY(&source_zval, source); + +	original_in_compilation = CG(in_compilation); +	CG(in_compilation) = 1; +	zend_save_lexical_state(&original_lex_state); + +	if ((success = (zend_prepare_string_for_scanning(&source_zval, "") == SUCCESS))) { +		CG(ast) = NULL; +		CG(ast_arena) = zend_arena_create(1024 * 32); +		LANG_SCNG(yy_state) = yycINITIAL; +		LANG_SCNG(on_event) = on_event; + +		array_init(&token_stream); +		if((success = (zendparse() == SUCCESS))) { +			ZVAL_COPY_VALUE(return_value, &token_stream); +		} else { +			zval_ptr_dtor(&token_stream); +		} + +		zend_ast_destroy(CG(ast)); +		zend_arena_destroy(CG(ast_arena));  	} -	ZVAL_STRINGL(&source_z, source, source_len, 1); -	zend_save_lexical_state(&original_lex_state TSRMLS_CC); +	/* restore compiler and scanner global states */ +	zend_restore_lexical_state(&original_lex_state); +	CG(in_compilation) = original_in_compilation; + +	zval_dtor(&source_zval); -	if (zend_prepare_string_for_scanning(&source_z, "" TSRMLS_CC) == FAILURE) { -		zend_restore_lexical_state(&original_lex_state TSRMLS_CC); -		RETURN_FALSE; +	return success; +} + +/* }}} */ + +/* {{{ proto array token_get_all(string source [, int flags]) + */ +PHP_FUNCTION(token_get_all) +{ +	zend_string *source; +	zend_long flags = 0; +	zend_bool success; + +	if (zend_parse_parameters(ZEND_NUM_ARGS(), "S|l", &source, &flags) == FAILURE) { +		return;  	} -	LANG_SCNG(yy_state) = yycINITIAL; +	if (flags & TOKEN_PARSE) { +		success = tokenize_parse(return_value, source); +	} else { +		success = tokenize(return_value, source); +		/* Normal token_get_all() should not throw. */ +		zend_clear_exception(); +	} -	tokenize(return_value TSRMLS_CC); -	 -	zend_restore_lexical_state(&original_lex_state TSRMLS_CC); -	zval_dtor(&source_z); +	if (!success) RETURN_FALSE;  }  /* }}} */ @@ -210,13 +293,13 @@ PHP_FUNCTION(token_get_all)   */  PHP_FUNCTION(token_name)  { -	int argc = ZEND_NUM_ARGS(); -	long type; +	zend_long type; -	if (zend_parse_parameters(argc TSRMLS_CC, "l", &type) == FAILURE) { +	if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &type) == FAILURE) {  		return;  	} -	RETVAL_STRING(get_token_type_name(type), 1); + +	RETVAL_STRING(get_token_type_name(type));  }  /* }}} */  | 
