diff options
Diffstat (limited to 'ext/mbstring/php_mbregex.c')
-rw-r--r-- | ext/mbstring/php_mbregex.c | 1470 |
1 files changed, 1470 insertions, 0 deletions
diff --git a/ext/mbstring/php_mbregex.c b/ext/mbstring/php_mbregex.c new file mode 100644 index 0000000..145ee0c --- /dev/null +++ b/ext/mbstring/php_mbregex.c @@ -0,0 +1,1470 @@ +/* + +----------------------------------------------------------------------+ + | PHP Version 5 | + +----------------------------------------------------------------------+ + | Copyright (c) 1997-2013 The PHP Group | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> | + +----------------------------------------------------------------------+ + */ + +/* $Id$ */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "php.h" +#include "php_ini.h" + +#if HAVE_MBREGEX + +#include "ext/standard/php_smart_str.h" +#include "ext/standard/info.h" +#include "php_mbregex.h" +#include "mbstring.h" + +#include "php_onig_compat.h" /* must come prior to the oniguruma header */ +#include <oniguruma.h> +#undef UChar + +ZEND_EXTERN_MODULE_GLOBALS(mbstring) + +struct _zend_mb_regex_globals { + OnigEncoding default_mbctype; + OnigEncoding current_mbctype; + HashTable ht_rc; + zval *search_str; + zval *search_str_val; + unsigned int search_pos; + php_mb_regex_t *search_re; + OnigRegion *search_regs; + OnigOptionType regex_default_options; + OnigSyntaxType *regex_default_syntax; +}; + +#define MBREX(g) (MBSTRG(mb_regex_globals)->g) + +/* {{{ static void php_mb_regex_free_cache() */ +static void php_mb_regex_free_cache(php_mb_regex_t **pre) +{ + onig_free(*pre); +} +/* }}} */ + +/* {{{ _php_mb_regex_globals_ctor */ +static int _php_mb_regex_globals_ctor(zend_mb_regex_globals *pglobals TSRMLS_DC) +{ + pglobals->default_mbctype = ONIG_ENCODING_EUC_JP; + pglobals->current_mbctype = ONIG_ENCODING_EUC_JP; + zend_hash_init(&(pglobals->ht_rc), 0, NULL, (void (*)(void *)) php_mb_regex_free_cache, 1); + pglobals->search_str = (zval*) NULL; + pglobals->search_re = (php_mb_regex_t*)NULL; + pglobals->search_pos = 0; + pglobals->search_regs = (OnigRegion*)NULL; + pglobals->regex_default_options = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE; + pglobals->regex_default_syntax = ONIG_SYNTAX_RUBY; + return SUCCESS; +} +/* }}} */ + +/* {{{ _php_mb_regex_globals_dtor */ +static void _php_mb_regex_globals_dtor(zend_mb_regex_globals *pglobals TSRMLS_DC) +{ + zend_hash_destroy(&pglobals->ht_rc); +} +/* }}} */ + +/* {{{ php_mb_regex_globals_alloc */ +zend_mb_regex_globals *php_mb_regex_globals_alloc(TSRMLS_D) +{ + zend_mb_regex_globals *pglobals = pemalloc( + sizeof(zend_mb_regex_globals), 1); + if (!pglobals) { + return NULL; + } + if (SUCCESS != _php_mb_regex_globals_ctor(pglobals TSRMLS_CC)) { + pefree(pglobals, 1); + return NULL; + } + return pglobals; +} +/* }}} */ + +/* {{{ php_mb_regex_globals_free */ +void php_mb_regex_globals_free(zend_mb_regex_globals *pglobals TSRMLS_DC) +{ + if (!pglobals) { + return; + } + _php_mb_regex_globals_dtor(pglobals TSRMLS_CC); + pefree(pglobals, 1); +} +/* }}} */ + +/* {{{ PHP_MINIT_FUNCTION(mb_regex) */ +PHP_MINIT_FUNCTION(mb_regex) +{ + onig_init(); + return SUCCESS; +} +/* }}} */ + +/* {{{ PHP_MSHUTDOWN_FUNCTION(mb_regex) */ +PHP_MSHUTDOWN_FUNCTION(mb_regex) +{ + onig_end(); + return SUCCESS; +} +/* }}} */ + +/* {{{ PHP_RINIT_FUNCTION(mb_regex) */ +PHP_RINIT_FUNCTION(mb_regex) +{ + return MBSTRG(mb_regex_globals) ? SUCCESS: FAILURE; +} +/* }}} */ + +/* {{{ PHP_RSHUTDOWN_FUNCTION(mb_regex) */ +PHP_RSHUTDOWN_FUNCTION(mb_regex) +{ + MBREX(current_mbctype) = MBREX(default_mbctype); + + if (MBREX(search_str) != NULL) { + zval_ptr_dtor(&MBREX(search_str)); + MBREX(search_str) = (zval *)NULL; + } + MBREX(search_pos) = 0; + + if (MBREX(search_regs) != NULL) { + onig_region_free(MBREX(search_regs), 1); + MBREX(search_regs) = (OnigRegion *)NULL; + } + zend_hash_clean(&MBREX(ht_rc)); + + return SUCCESS; +} +/* }}} */ + +/* {{{ PHP_MINFO_FUNCTION(mb_regex) */ +PHP_MINFO_FUNCTION(mb_regex) +{ + char buf[32]; + php_info_print_table_start(); + php_info_print_table_row(2, "Multibyte (japanese) regex support", "enabled"); + snprintf(buf, sizeof(buf), "%d.%d.%d", + ONIGURUMA_VERSION_MAJOR, + ONIGURUMA_VERSION_MINOR, + ONIGURUMA_VERSION_TEENY); +#ifdef PHP_ONIG_BUNDLED +#ifdef USE_COMBINATION_EXPLOSION_CHECK + php_info_print_table_row(2, "Multibyte regex (oniguruma) backtrack check", "On"); +#else /* USE_COMBINATION_EXPLOSION_CHECK */ + php_info_print_table_row(2, "Multibyte regex (oniguruma) backtrack check", "Off"); +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ +#endif /* PHP_BUNDLED_ONIG */ + php_info_print_table_row(2, "Multibyte regex (oniguruma) version", buf); + php_info_print_table_end(); +} +/* }}} */ + +/* + * encoding name resolver + */ + +/* {{{ encoding name map */ +typedef struct _php_mb_regex_enc_name_map_t { + const char *names; + OnigEncoding code; +} php_mb_regex_enc_name_map_t; + +php_mb_regex_enc_name_map_t enc_name_map[] = { +#ifdef ONIG_ENCODING_EUC_JP + { + "EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0", + ONIG_ENCODING_EUC_JP + }, +#endif +#ifdef ONIG_ENCODING_UTF8 + { + "UTF-8\0UTF8\0", + ONIG_ENCODING_UTF8 + }, +#endif +#ifdef ONIG_ENCODING_UTF16_BE + { + "UTF-16\0UTF-16BE\0", + ONIG_ENCODING_UTF16_BE + }, +#endif +#ifdef ONIG_ENCODING_UTF16_LE + { + "UTF-16LE\0", + ONIG_ENCODING_UTF16_LE + }, +#endif +#ifdef ONIG_ENCODING_UTF32_BE + { + "UCS-4\0UTF-32\0UTF-32BE\0", + ONIG_ENCODING_UTF32_BE + }, +#endif +#ifdef ONIG_ENCODING_UTF32_LE + { + "UCS-4LE\0UTF-32LE\0", + ONIG_ENCODING_UTF32_LE + }, +#endif +#ifdef ONIG_ENCODING_SJIS + { + "SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0", + ONIG_ENCODING_SJIS + }, +#endif +#ifdef ONIG_ENCODING_BIG5 + { + "BIG5\0BIG-5\0BIGFIVE\0CN-BIG5\0BIG-FIVE\0", + ONIG_ENCODING_BIG5 + }, +#endif +#ifdef ONIG_ENCODING_EUC_CN + { + "EUC-CN\0EUCCN\0EUC_CN\0GB-2312\0GB2312\0", + ONIG_ENCODING_EUC_CN + }, +#endif +#ifdef ONIG_ENCODING_EUC_TW + { + "EUC-TW\0EUCTW\0EUC_TW\0", + ONIG_ENCODING_EUC_TW + }, +#endif +#ifdef ONIG_ENCODING_EUC_KR + { + "EUC-KR\0EUCKR\0EUC_KR\0", + ONIG_ENCODING_EUC_KR + }, +#endif +#if defined(ONIG_ENCODING_KOI8) && !PHP_ONIG_BAD_KOI8_ENTRY + { + "KOI8\0KOI-8\0", + ONIG_ENCODING_KOI8 + }, +#endif +#ifdef ONIG_ENCODING_KOI8_R + { + "KOI8R\0KOI8-R\0KOI-8R\0", + ONIG_ENCODING_KOI8_R + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_1 + { + "ISO-8859-1\0ISO8859-1\0ISO_8859_1\0ISO8859_1\0", + ONIG_ENCODING_ISO_8859_1 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_2 + { + "ISO-8859-2\0ISO8859-2\0ISO_8859_2\0ISO8859_2\0", + ONIG_ENCODING_ISO_8859_2 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_3 + { + "ISO-8859-3\0ISO8859-3\0ISO_8859_3\0ISO8859_3\0", + ONIG_ENCODING_ISO_8859_3 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_4 + { + "ISO-8859-4\0ISO8859-4\0ISO_8859_4\0ISO8859_4\0", + ONIG_ENCODING_ISO_8859_4 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_5 + { + "ISO-8859-5\0ISO8859-5\0ISO_8859_5\0ISO8859_5\0", + ONIG_ENCODING_ISO_8859_5 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_6 + { + "ISO-8859-6\0ISO8859-6\0ISO_8859_6\0ISO8859_6\0", + ONIG_ENCODING_ISO_8859_6 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_7 + { + "ISO-8859-7\0ISO8859-7\0ISO_8859_7\0ISO8859_7\0", + ONIG_ENCODING_ISO_8859_7 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_8 + { + "ISO-8859-8\0ISO8859-8\0ISO_8859_8\0ISO8859_8\0", + ONIG_ENCODING_ISO_8859_8 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_9 + { + "ISO-8859-9\0ISO8859-9\0ISO_8859_9\0ISO8859_9\0", + ONIG_ENCODING_ISO_8859_9 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_10 + { + "ISO-8859-10\0ISO8859-10\0ISO_8859_10\0ISO8859_10\0", + ONIG_ENCODING_ISO_8859_10 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_11 + { + "ISO-8859-11\0ISO8859-11\0ISO_8859_11\0ISO8859_11\0", + ONIG_ENCODING_ISO_8859_11 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_13 + { + "ISO-8859-13\0ISO8859-13\0ISO_8859_13\0ISO8859_13\0", + ONIG_ENCODING_ISO_8859_13 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_14 + { + "ISO-8859-14\0ISO8859-14\0ISO_8859_14\0ISO8859_14\0", + ONIG_ENCODING_ISO_8859_14 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_15 + { + "ISO-8859-15\0ISO8859-15\0ISO_8859_15\0ISO8859_15\0", + ONIG_ENCODING_ISO_8859_15 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_16 + { + "ISO-8859-16\0ISO8859-16\0ISO_8859_16\0ISO8859_16\0", + ONIG_ENCODING_ISO_8859_16 + }, +#endif +#ifdef ONIG_ENCODING_ASCII + { + "ASCII\0US-ASCII\0US_ASCII\0ISO646\0", + ONIG_ENCODING_ASCII + }, +#endif + { NULL, ONIG_ENCODING_UNDEF } +}; +/* }}} */ + +/* {{{ php_mb_regex_name2mbctype */ +static OnigEncoding _php_mb_regex_name2mbctype(const char *pname) +{ + const char *p; + php_mb_regex_enc_name_map_t *mapping; + + if (pname == NULL || !*pname) { + return ONIG_ENCODING_UNDEF; + } + + for (mapping = enc_name_map; mapping->names != NULL; mapping++) { + for (p = mapping->names; *p != '\0'; p += (strlen(p) + 1)) { + if (strcasecmp(p, pname) == 0) { + return mapping->code; + } + } + } + + return ONIG_ENCODING_UNDEF; +} +/* }}} */ + +/* {{{ php_mb_regex_mbctype2name */ +static const char *_php_mb_regex_mbctype2name(OnigEncoding mbctype) +{ + php_mb_regex_enc_name_map_t *mapping; + + for (mapping = enc_name_map; mapping->names != NULL; mapping++) { + if (mapping->code == mbctype) { + return mapping->names; + } + } + + return NULL; +} +/* }}} */ + +/* {{{ php_mb_regex_set_mbctype */ +int php_mb_regex_set_mbctype(const char *encname TSRMLS_DC) +{ + OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname); + if (mbctype == ONIG_ENCODING_UNDEF) { + return FAILURE; + } + MBREX(current_mbctype) = mbctype; + return SUCCESS; +} +/* }}} */ + +/* {{{ php_mb_regex_set_default_mbctype */ +int php_mb_regex_set_default_mbctype(const char *encname TSRMLS_DC) +{ + OnigEncoding mbctype = _php_mb_regex_name2mbctype(encname); + if (mbctype == ONIG_ENCODING_UNDEF) { + return FAILURE; + } + MBREX(default_mbctype) = mbctype; + return SUCCESS; +} +/* }}} */ + +/* {{{ php_mb_regex_get_mbctype */ +const char *php_mb_regex_get_mbctype(TSRMLS_D) +{ + return _php_mb_regex_mbctype2name(MBREX(current_mbctype)); +} +/* }}} */ + +/* {{{ php_mb_regex_get_default_mbctype */ +const char *php_mb_regex_get_default_mbctype(TSRMLS_D) +{ + return _php_mb_regex_mbctype2name(MBREX(default_mbctype)); +} +/* }}} */ + +/* + * regex cache + */ +/* {{{ php_mbregex_compile_pattern */ +static php_mb_regex_t *php_mbregex_compile_pattern(const char *pattern, int patlen, OnigOptionType options, OnigEncoding enc, OnigSyntaxType *syntax TSRMLS_DC) +{ + int err_code = 0; + int found = 0; + php_mb_regex_t *retval = NULL, **rc = NULL; + OnigErrorInfo err_info; + OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN]; + + found = zend_hash_find(&MBREX(ht_rc), (char *)pattern, patlen+1, (void **) &rc); + if (found == FAILURE || (*rc)->options != options || (*rc)->enc != enc || (*rc)->syntax != syntax) { + if ((err_code = onig_new(&retval, (OnigUChar *)pattern, (OnigUChar *)(pattern + patlen), options, enc, syntax, &err_info)) != ONIG_NORMAL) { + onig_error_code_to_str(err_str, err_code, err_info); + php_error_docref(NULL TSRMLS_CC, E_WARNING, "mbregex compile err: %s", err_str); + retval = NULL; + goto out; + } + zend_hash_update(&MBREX(ht_rc), (char *) pattern, patlen + 1, (void *) &retval, sizeof(retval), NULL); + } else if (found == SUCCESS) { + retval = *rc; + } +out: + return retval; +} +/* }}} */ + +/* {{{ _php_mb_regex_get_option_string */ +static size_t _php_mb_regex_get_option_string(char *str, size_t len, OnigOptionType option, OnigSyntaxType *syntax) +{ + size_t len_left = len; + size_t len_req = 0; + char *p = str; + char c; + + if ((option & ONIG_OPTION_IGNORECASE) != 0) { + if (len_left > 0) { + --len_left; + *(p++) = 'i'; + } + ++len_req; + } + + if ((option & ONIG_OPTION_EXTEND) != 0) { + if (len_left > 0) { + --len_left; + *(p++) = 'x'; + } + ++len_req; + } + + if ((option & (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) == + (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) { + if (len_left > 0) { + --len_left; + *(p++) = 'p'; + } + ++len_req; + } else { + if ((option & ONIG_OPTION_MULTILINE) != 0) { + if (len_left > 0) { + --len_left; + *(p++) = 'm'; + } + ++len_req; + } + + if ((option & ONIG_OPTION_SINGLELINE) != 0) { + if (len_left > 0) { + --len_left; + *(p++) = 's'; + } + ++len_req; + } + } + if ((option & ONIG_OPTION_FIND_LONGEST) != 0) { + if (len_left > 0) { + --len_left; + *(p++) = 'l'; + } + ++len_req; + } + if ((option & ONIG_OPTION_FIND_NOT_EMPTY) != 0) { + if (len_left > 0) { + --len_left; + *(p++) = 'n'; + } + ++len_req; + } + + c = 0; + + if (syntax == ONIG_SYNTAX_JAVA) { + c = 'j'; + } else if (syntax == ONIG_SYNTAX_GNU_REGEX) { + c = 'u'; + } else if (syntax == ONIG_SYNTAX_GREP) { + c = 'g'; + } else if (syntax == ONIG_SYNTAX_EMACS) { + c = 'c'; + } else if (syntax == ONIG_SYNTAX_RUBY) { + c = 'r'; + } else if (syntax == ONIG_SYNTAX_PERL) { + c = 'z'; + } else if (syntax == ONIG_SYNTAX_POSIX_BASIC) { + c = 'b'; + } else if (syntax == ONIG_SYNTAX_POSIX_EXTENDED) { + c = 'd'; + } + + if (c != 0) { + if (len_left > 0) { + --len_left; + *(p++) = c; + } + ++len_req; + } + + + if (len_left > 0) { + --len_left; + *(p++) = '\0'; + } + ++len_req; + if (len < len_req) { + return len_req; + } + + return 0; +} +/* }}} */ + +/* {{{ _php_mb_regex_init_options */ +static void +_php_mb_regex_init_options(const char *parg, int narg, OnigOptionType *option, OnigSyntaxType **syntax, int *eval) +{ + int n; + char c; + int optm = 0; + + *syntax = ONIG_SYNTAX_RUBY; + + if (parg != NULL) { + n = 0; + while(n < narg) { + c = parg[n++]; + switch (c) { + case 'i': + optm |= ONIG_OPTION_IGNORECASE; + break; + case 'x': + optm |= ONIG_OPTION_EXTEND; + break; + case 'm': + optm |= ONIG_OPTION_MULTILINE; + break; + case 's': + optm |= ONIG_OPTION_SINGLELINE; + break; + case 'p': + optm |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE; + break; + case 'l': + optm |= ONIG_OPTION_FIND_LONGEST; + break; + case 'n': + optm |= ONIG_OPTION_FIND_NOT_EMPTY; + break; + case 'j': + *syntax = ONIG_SYNTAX_JAVA; + break; + case 'u': + *syntax = ONIG_SYNTAX_GNU_REGEX; + break; + case 'g': + *syntax = ONIG_SYNTAX_GREP; + break; + case 'c': + *syntax = ONIG_SYNTAX_EMACS; + break; + case 'r': + *syntax = ONIG_SYNTAX_RUBY; + break; + case 'z': + *syntax = ONIG_SYNTAX_PERL; + break; + case 'b': + *syntax = ONIG_SYNTAX_POSIX_BASIC; + break; + case 'd': + *syntax = ONIG_SYNTAX_POSIX_EXTENDED; + break; + case 'e': + if (eval != NULL) *eval = 1; + break; + default: + break; + } + } + if (option != NULL) *option|=optm; + } +} +/* }}} */ + +/* + * php funcions + */ + +/* {{{ proto string mb_regex_encoding([string encoding]) + Returns the current encoding for regex as a string. */ +PHP_FUNCTION(mb_regex_encoding) +{ + size_t argc = ZEND_NUM_ARGS(); + char *encoding; + int encoding_len; + OnigEncoding mbctype; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|s", &encoding, &encoding_len) == FAILURE) { + return; + } + + if (argc == 0) { + const char *retval = _php_mb_regex_mbctype2name(MBREX(current_mbctype)); + + if (retval == NULL) { + RETURN_FALSE; + } + + RETURN_STRING((char *)retval, 1); + } else if (argc == 1) { + mbctype = _php_mb_regex_name2mbctype(encoding); + + if (mbctype == ONIG_ENCODING_UNDEF) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown encoding \"%s\"", encoding); + RETURN_FALSE; + } + + MBREX(current_mbctype) = mbctype; + RETURN_TRUE; + } +} +/* }}} */ + +/* {{{ _php_mb_regex_ereg_exec */ +static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase) +{ + zval **arg_pattern, *array; + char *string; + int string_len; + php_mb_regex_t *re; + OnigRegion *regs = NULL; + int i, match_len, beg, end; + OnigOptionType options; + char *str; + + array = NULL; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "Zs|z", &arg_pattern, &string, &string_len, &array) == FAILURE) { + RETURN_FALSE; + } + + options = MBREX(regex_default_options); + if (icase) { + options |= ONIG_OPTION_IGNORECASE; + } + + /* compile the regular expression from the supplied regex */ + if (Z_TYPE_PP(arg_pattern) != IS_STRING) { + /* we convert numbers to integers and treat them as a string */ + if (Z_TYPE_PP(arg_pattern) == IS_DOUBLE) { + convert_to_long_ex(arg_pattern); /* get rid of decimal places */ + } + convert_to_string_ex(arg_pattern); + /* don't bother doing an extended regex with just a number */ + } + + if (!Z_STRVAL_PP(arg_pattern) || Z_STRLEN_PP(arg_pattern) == 0) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "empty pattern"); + RETVAL_FALSE; + goto out; + } + + re = php_mbregex_compile_pattern(Z_STRVAL_PP(arg_pattern), Z_STRLEN_PP(arg_pattern), options, MBREX(current_mbctype), MBREX(regex_default_syntax) TSRMLS_CC); + if (re == NULL) { + RETVAL_FALSE; + goto out; + } + + regs = onig_region_new(); + + /* actually execute the regular expression */ + if (onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, (OnigUChar *)(string + string_len), regs, 0) < 0) { + RETVAL_FALSE; + goto out; + } + + match_len = 1; + str = string; + if (array != NULL) { + match_len = regs->end[0] - regs->beg[0]; + zval_dtor(array); + array_init(array); + for (i = 0; i < regs->num_regs; i++) { + beg = regs->beg[i]; + end = regs->end[i]; + if (beg >= 0 && beg < end && end <= string_len) { + add_index_stringl(array, i, (char *)&str[beg], end - beg, 1); + } else { + add_index_bool(array, i, 0); + } + } + } + + if (match_len == 0) { + match_len = 1; + } + RETVAL_LONG(match_len); +out: + if (regs != NULL) { + onig_region_free(regs, 1); + } +} +/* }}} */ + +/* {{{ proto int mb_ereg(string pattern, string string [, array registers]) + Regular expression match for multibyte string */ +PHP_FUNCTION(mb_ereg) +{ + _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); +} +/* }}} */ + +/* {{{ proto int mb_eregi(string pattern, string string [, array registers]) + Case-insensitive regular expression match for multibyte string */ +PHP_FUNCTION(mb_eregi) +{ + _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); +} +/* }}} */ + +/* {{{ _php_mb_regex_ereg_replace_exec */ +static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOptionType options, int is_callable) +{ + zval **arg_pattern_zval; + + char *arg_pattern; + int arg_pattern_len; + + char *replace; + int replace_len; + + zend_fcall_info arg_replace_fci; + zend_fcall_info_cache arg_replace_fci_cache; + + char *string; + int string_len; + + char *p; + php_mb_regex_t *re; + OnigSyntaxType *syntax; + OnigRegion *regs = NULL; + smart_str out_buf = { 0 }; + smart_str eval_buf = { 0 }; + smart_str *pbuf; + int i, err, eval, n; + OnigUChar *pos; + OnigUChar *string_lim; + char *description = NULL; + char pat_buf[2]; + + const mbfl_encoding *enc; + + { + const char *current_enc_name; + current_enc_name = _php_mb_regex_mbctype2name(MBREX(current_mbctype)); + if (current_enc_name == NULL || + (enc = mbfl_name2encoding(current_enc_name)) == NULL) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown error"); + RETURN_FALSE; + } + } + eval = 0; + { + char *option_str = NULL; + int option_str_len = 0; + + if (!is_callable) { + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "Zss|s", + &arg_pattern_zval, + &replace, &replace_len, + &string, &string_len, + &option_str, &option_str_len) == FAILURE) { + RETURN_FALSE; + } + } else { + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "Zfs|s", + &arg_pattern_zval, + &arg_replace_fci, &arg_replace_fci_cache, + &string, &string_len, + &option_str, &option_str_len) == FAILURE) { + RETURN_FALSE; + } + } + + if (option_str != NULL) { + _php_mb_regex_init_options(option_str, option_str_len, &options, &syntax, &eval); + } else { + options |= MBREX(regex_default_options); + syntax = MBREX(regex_default_syntax); + } + } + if (Z_TYPE_PP(arg_pattern_zval) == IS_STRING) { + arg_pattern = Z_STRVAL_PP(arg_pattern_zval); + arg_pattern_len = Z_STRLEN_PP(arg_pattern_zval); + } else { + /* FIXME: this code is not multibyte aware! */ + convert_to_long_ex(arg_pattern_zval); + pat_buf[0] = (char)Z_LVAL_PP(arg_pattern_zval); + pat_buf[1] = '\0'; + + arg_pattern = pat_buf; + arg_pattern_len = 1; + } + /* create regex pattern buffer */ + re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, options, MBREX(current_mbctype), syntax TSRMLS_CC); + if (re == NULL) { + RETURN_FALSE; + } + + if (eval || is_callable) { + pbuf = &eval_buf; + description = zend_make_compiled_string_description("mbregex replace" TSRMLS_CC); + } else { + pbuf = &out_buf; + description = NULL; + } + + if (is_callable) { + if (eval) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Option 'e' cannot be used with replacement callback"); + RETURN_FALSE; + } + } + + /* do the actual work */ + err = 0; + pos = (OnigUChar *)string; + string_lim = (OnigUChar*)(string + string_len); + regs = onig_region_new(); + while (err >= 0) { + err = onig_search(re, (OnigUChar *)string, (OnigUChar *)string_lim, pos, (OnigUChar *)string_lim, regs, 0); + if (err <= -2) { + OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str(err_str, err); + php_error_docref(NULL TSRMLS_CC, E_WARNING, "mbregex search failure in php_mbereg_replace_exec(): %s", err_str); + break; + } + if (err >= 0) { +#if moriyoshi_0 + if (regs->beg[0] == regs->end[0]) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Empty regular expression"); + break; + } +#endif + /* copy the part of the string before the match */ + smart_str_appendl(&out_buf, pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos)); + + if (!is_callable) { + /* copy replacement and backrefs */ + i = 0; + p = replace; + while (i < replace_len) { + int fwd = (int) php_mb_mbchar_bytes_ex(p, enc); + n = -1; + if ((replace_len - i) >= 2 && fwd == 1 && + p[0] == '\\' && p[1] >= '0' && p[1] <= '9') { + n = p[1] - '0'; + } + if (n >= 0 && n < regs->num_regs) { + if (regs->beg[n] >= 0 && regs->beg[n] < regs->end[n] && regs->end[n] <= string_len) { + smart_str_appendl(pbuf, string + regs->beg[n], regs->end[n] - regs->beg[n]); + } + p += 2; + i += 2; + } else { + smart_str_appendl(pbuf, p, fwd); + p += fwd; + i += fwd; + } + } + } + + if (eval) { + zval v; + /* null terminate buffer */ + smart_str_0(&eval_buf); + /* do eval */ + if (zend_eval_stringl(eval_buf.c, eval_buf.len, &v, description TSRMLS_CC) == FAILURE) { + efree(description); + php_error_docref(NULL TSRMLS_CC,E_ERROR, "Failed evaluating code: %s%s", PHP_EOL, eval_buf.c); + /* zend_error() does not return in this case */ + } + + /* result of eval */ + convert_to_string(&v); + smart_str_appendl(&out_buf, Z_STRVAL(v), Z_STRLEN(v)); + /* Clean up */ + eval_buf.len = 0; + zval_dtor(&v); + } else if (is_callable) { + zval *retval_ptr; + zval **args[1]; + zval *subpats; + int i; + + MAKE_STD_ZVAL(subpats); + array_init(subpats); + + for (i = 0; i < regs->num_regs; i++) { + add_next_index_stringl(subpats, string + regs->beg[i], regs->end[i] - regs->beg[i], 1); + } + + args[0] = &subpats; + /* null terminate buffer */ + smart_str_0(&eval_buf); + + arg_replace_fci.param_count = 1; + arg_replace_fci.params = args; + arg_replace_fci.retval_ptr_ptr = &retval_ptr; + if (zend_call_function(&arg_replace_fci, &arg_replace_fci_cache TSRMLS_CC) == SUCCESS && arg_replace_fci.retval_ptr_ptr) { + convert_to_string_ex(&retval_ptr); + smart_str_appendl(&out_buf, Z_STRVAL_P(retval_ptr), Z_STRLEN_P(retval_ptr)); + eval_buf.len = 0; + zval_ptr_dtor(&retval_ptr); + } else { + efree(description); + if (!EG(exception)) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to call custom replacement function"); + } + } + zval_ptr_dtor(&subpats); + } + + n = regs->end[0]; + if ((pos - (OnigUChar *)string) < n) { + pos = (OnigUChar *)string + n; + } else { + if (pos < string_lim) { + smart_str_appendl(&out_buf, pos, 1); + } + pos++; + } + } else { /* nomatch */ + /* stick that last bit of string on our output */ + if (string_lim - pos > 0) { + smart_str_appendl(&out_buf, pos, string_lim - pos); + } + } + onig_region_free(regs, 0); + } + + if (description) { + efree(description); + } + if (regs != NULL) { + onig_region_free(regs, 1); + } + smart_str_free(&eval_buf); + + if (err <= -2) { + smart_str_free(&out_buf); + RETVAL_FALSE; + } else { + smart_str_appendc(&out_buf, '\0'); + RETVAL_STRINGL((char *)out_buf.c, out_buf.len - 1, 0); + } +} +/* }}} */ + +/* {{{ proto string mb_ereg_replace(string pattern, string replacement, string string [, string option]) + Replace regular expression for multibyte string */ +PHP_FUNCTION(mb_ereg_replace) +{ + _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0); +} +/* }}} */ + +/* {{{ proto string mb_eregi_replace(string pattern, string replacement, string string) + Case insensitive replace regular expression for multibyte string */ +PHP_FUNCTION(mb_eregi_replace) +{ + _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, ONIG_OPTION_IGNORECASE, 0); +} +/* }}} */ + +/* {{{ proto string mb_ereg_replace_callback(string pattern, string callback, string string [, string option]) + regular expression for multibyte string using replacement callback */ +PHP_FUNCTION(mb_ereg_replace_callback) +{ + _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1); +} +/* }}} */ + +/* {{{ proto array mb_split(string pattern, string string [, int limit]) + split multibyte string into array by regular expression */ +PHP_FUNCTION(mb_split) +{ + char *arg_pattern; + int arg_pattern_len; + php_mb_regex_t *re; + OnigRegion *regs = NULL; + char *string; + OnigUChar *pos, *chunk_pos; + int string_len; + + int n, err; + long count = -1; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", &arg_pattern, &arg_pattern_len, &string, &string_len, &count) == FAILURE) { + RETURN_FALSE; + } + + if (count > 0) { + count--; + } + + /* create regex pattern buffer */ + if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, MBREX(regex_default_options), MBREX(current_mbctype), MBREX(regex_default_syntax) TSRMLS_CC)) == NULL) { + RETURN_FALSE; + } + + array_init(return_value); + + chunk_pos = pos = (OnigUChar *)string; + err = 0; + regs = onig_region_new(); + /* churn through str, generating array entries as we go */ + while (count != 0 && (pos - (OnigUChar *)string) < string_len) { + int beg, end; + err = onig_search(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), pos, (OnigUChar *)(string + string_len), regs, 0); + if (err < 0) { + break; + } + beg = regs->beg[0], end = regs->end[0]; + /* add it to the array */ + if ((pos - (OnigUChar *)string) < end) { + if (beg < string_len && beg >= (chunk_pos - (OnigUChar *)string)) { + add_next_index_stringl(return_value, (char *)chunk_pos, ((OnigUChar *)(string + beg) - chunk_pos), 1); + --count; + } else { + err = -2; + break; + } + /* point at our new starting point */ + chunk_pos = pos = (OnigUChar *)string + end; + } else { + pos++; + } + onig_region_free(regs, 0); + } + + onig_region_free(regs, 1); + + /* see if we encountered an error */ + if (err <= -2) { + OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str(err_str, err); + php_error_docref(NULL TSRMLS_CC, E_WARNING, "mbregex search failure in mbsplit(): %s", err_str); + zval_dtor(return_value); + RETURN_FALSE; + } + + /* otherwise we just have one last element to add to the array */ + n = ((OnigUChar *)(string + string_len) - chunk_pos); + if (n > 0) { + add_next_index_stringl(return_value, (char *)chunk_pos, n, 1); + } else { + add_next_index_stringl(return_value, "", 0, 1); + } +} +/* }}} */ + +/* {{{ proto bool mb_ereg_match(string pattern, string string [,string option]) + Regular expression match for multibyte string */ +PHP_FUNCTION(mb_ereg_match) +{ + char *arg_pattern; + int arg_pattern_len; + + char *string; + int string_len; + + php_mb_regex_t *re; + OnigSyntaxType *syntax; + OnigOptionType option = 0; + int err; + + { + char *option_str = NULL; + int option_str_len = 0; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|s", + &arg_pattern, &arg_pattern_len, &string, &string_len, + &option_str, &option_str_len)==FAILURE) { + RETURN_FALSE; + } + + if (option_str != NULL) { + _php_mb_regex_init_options(option_str, option_str_len, &option, &syntax, NULL); + } else { + option |= MBREX(regex_default_options); + syntax = MBREX(regex_default_syntax); + } + } + + if ((re = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), syntax TSRMLS_CC)) == NULL) { + RETURN_FALSE; + } + + /* match */ + err = onig_match(re, (OnigUChar *)string, (OnigUChar *)(string + string_len), (OnigUChar *)string, NULL, 0); + if (err >= 0) { + RETVAL_TRUE; + } else { + RETVAL_FALSE; + } +} +/* }}} */ + +/* regex search */ +/* {{{ _php_mb_regex_ereg_search_exec */ +static void +_php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode) +{ + size_t argc = ZEND_NUM_ARGS(); + char *arg_pattern, *arg_options; + int arg_pattern_len, arg_options_len; + int n, i, err, pos, len, beg, end; + OnigOptionType option; + OnigUChar *str; + OnigSyntaxType *syntax; + + if (zend_parse_parameters(argc TSRMLS_CC, "|ss", &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) { + return; + } + + option = MBREX(regex_default_options); + + if (argc == 2) { + option = 0; + _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax, NULL); + } + + if (argc > 0) { + /* create regex pattern buffer */ + if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), MBREX(regex_default_syntax) TSRMLS_CC)) == NULL) { + RETURN_FALSE; + } + } + + pos = MBREX(search_pos); + str = NULL; + len = 0; + if (MBREX(search_str) != NULL && Z_TYPE_P(MBREX(search_str)) == IS_STRING){ + str = (OnigUChar *)Z_STRVAL_P(MBREX(search_str)); + len = Z_STRLEN_P(MBREX(search_str)); + } + + if (MBREX(search_re) == NULL) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "No regex given"); + RETURN_FALSE; + } + + if (str == NULL) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "No string given"); + RETURN_FALSE; + } + + if (MBREX(search_regs)) { + onig_region_free(MBREX(search_regs), 1); + } + MBREX(search_regs) = onig_region_new(); + + err = onig_search(MBREX(search_re), str, str + len, str + pos, str + len, MBREX(search_regs), 0); + if (err == ONIG_MISMATCH) { + MBREX(search_pos) = len; + RETVAL_FALSE; + } else if (err <= -2) { + OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str(err_str, err); + php_error_docref(NULL TSRMLS_CC, E_WARNING, "mbregex search failure in mbregex_search(): %s", err_str); + RETVAL_FALSE; + } else { + if (MBREX(search_regs)->beg[0] == MBREX(search_regs)->end[0]) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Empty regular expression"); + } + switch (mode) { + case 1: + array_init(return_value); + beg = MBREX(search_regs)->beg[0]; + end = MBREX(search_regs)->end[0]; + add_next_index_long(return_value, beg); + add_next_index_long(return_value, end - beg); + break; + case 2: + array_init(return_value); + n = MBREX(search_regs)->num_regs; + for (i = 0; i < n; i++) { + beg = MBREX(search_regs)->beg[i]; + end = MBREX(search_regs)->end[i]; + if (beg >= 0 && beg <= end && end <= len) { + add_index_stringl(return_value, i, (char *)&str[beg], end - beg, 1); + } else { + add_index_bool(return_value, i, 0); + } + } + break; + default: + RETVAL_TRUE; + break; + } + end = MBREX(search_regs)->end[0]; + if (pos < end) { + MBREX(search_pos) = end; + } else { + MBREX(search_pos) = pos + 1; + } + } + + if (err < 0) { + onig_region_free(MBREX(search_regs), 1); + MBREX(search_regs) = (OnigRegion *)NULL; + } +} +/* }}} */ + +/* {{{ proto bool mb_ereg_search([string pattern[, string option]]) + Regular expression search for multibyte string */ +PHP_FUNCTION(mb_ereg_search) +{ + _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); +} +/* }}} */ + +/* {{{ proto array mb_ereg_search_pos([string pattern[, string option]]) + Regular expression search for multibyte string */ +PHP_FUNCTION(mb_ereg_search_pos) +{ + _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); +} +/* }}} */ + +/* {{{ proto array mb_ereg_search_regs([string pattern[, string option]]) + Regular expression search for multibyte string */ +PHP_FUNCTION(mb_ereg_search_regs) +{ + _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAM_PASSTHRU, 2); +} +/* }}} */ + +/* {{{ proto bool mb_ereg_search_init(string string [, string pattern[, string option]]) + Initialize string and regular expression for search. */ +PHP_FUNCTION(mb_ereg_search_init) +{ + size_t argc = ZEND_NUM_ARGS(); + zval *arg_str; + char *arg_pattern = NULL, *arg_options = NULL; + int arg_pattern_len = 0, arg_options_len = 0; + OnigSyntaxType *syntax = NULL; + OnigOptionType option; + + if (zend_parse_parameters(argc TSRMLS_CC, "z|ss", &arg_str, &arg_pattern, &arg_pattern_len, &arg_options, &arg_options_len) == FAILURE) { + return; + } + + if (argc > 1 && arg_pattern_len == 0) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Empty pattern"); + RETURN_FALSE; + } + + option = MBREX(regex_default_options); + syntax = MBREX(regex_default_syntax); + + if (argc == 3) { + option = 0; + _php_mb_regex_init_options(arg_options, arg_options_len, &option, &syntax, NULL); + } + + if (argc > 1) { + /* create regex pattern buffer */ + if ((MBREX(search_re) = php_mbregex_compile_pattern(arg_pattern, arg_pattern_len, option, MBREX(current_mbctype), syntax TSRMLS_CC)) == NULL) { + RETURN_FALSE; + } + } + + if (MBREX(search_str) != NULL) { + zval_ptr_dtor(&MBREX(search_str)); + MBREX(search_str) = (zval *)NULL; + } + + MBREX(search_str) = arg_str; + Z_ADDREF_P(MBREX(search_str)); + SEPARATE_ZVAL_IF_NOT_REF(&MBREX(search_str)); + + MBREX(search_pos) = 0; + + if (MBREX(search_regs) != NULL) { + onig_region_free(MBREX(search_regs), 1); + MBREX(search_regs) = (OnigRegion *) NULL; + } + + RETURN_TRUE; +} +/* }}} */ + +/* {{{ proto array mb_ereg_search_getregs(void) + Get matched substring of the last time */ +PHP_FUNCTION(mb_ereg_search_getregs) +{ + int n, i, len, beg, end; + OnigUChar *str; + + if (MBREX(search_regs) != NULL && Z_TYPE_P(MBREX(search_str)) == IS_STRING && Z_STRVAL_P(MBREX(search_str)) != NULL) { + array_init(return_value); + + str = (OnigUChar *)Z_STRVAL_P(MBREX(search_str)); + len = Z_STRLEN_P(MBREX(search_str)); + n = MBREX(search_regs)->num_regs; + for (i = 0; i < n; i++) { + beg = MBREX(search_regs)->beg[i]; + end = MBREX(search_regs)->end[i]; + if (beg >= 0 && beg <= end && end <= len) { + add_index_stringl(return_value, i, (char *)&str[beg], end - beg, 1); + } else { + add_index_bool(return_value, i, 0); + } + } + } else { + RETVAL_FALSE; + } +} +/* }}} */ + +/* {{{ proto int mb_ereg_search_getpos(void) + Get search start position */ +PHP_FUNCTION(mb_ereg_search_getpos) +{ + RETVAL_LONG(MBREX(search_pos)); +} +/* }}} */ + +/* {{{ proto bool mb_ereg_search_setpos(int position) + Set search start position */ +PHP_FUNCTION(mb_ereg_search_setpos) +{ + long position; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "l", &position) == FAILURE) { + return; + } + + if (position < 0 || (MBREX(search_str) != NULL && Z_TYPE_P(MBREX(search_str)) == IS_STRING && position >= Z_STRLEN_P(MBREX(search_str)))) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Position is out of range"); + MBREX(search_pos) = 0; + RETURN_FALSE; + } + + MBREX(search_pos) = position; + RETURN_TRUE; +} +/* }}} */ + +/* {{{ php_mb_regex_set_options */ +static void _php_mb_regex_set_options(OnigOptionType options, OnigSyntaxType *syntax, OnigOptionType *prev_options, OnigSyntaxType **prev_syntax TSRMLS_DC) +{ + if (prev_options != NULL) { + *prev_options = MBREX(regex_default_options); + } + if (prev_syntax != NULL) { + *prev_syntax = MBREX(regex_default_syntax); + } + MBREX(regex_default_options) = options; + MBREX(regex_default_syntax) = syntax; +} +/* }}} */ + +/* {{{ proto string mb_regex_set_options([string options]) + Set or get the default options for mbregex functions */ +PHP_FUNCTION(mb_regex_set_options) +{ + OnigOptionType opt; + OnigSyntaxType *syntax; + char *string = NULL; + int string_len; + char buf[16]; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|s", + &string, &string_len) == FAILURE) { + RETURN_FALSE; + } + if (string != NULL) { + opt = 0; + syntax = NULL; + _php_mb_regex_init_options(string, string_len, &opt, &syntax, NULL); + _php_mb_regex_set_options(opt, syntax, NULL, NULL TSRMLS_CC); + } else { + opt = MBREX(regex_default_options); + syntax = MBREX(regex_default_syntax); + } + _php_mb_regex_get_option_string(buf, sizeof(buf), opt, syntax); + + RETVAL_STRING(buf, 1); +} +/* }}} */ + +#endif /* HAVE_MBREGEX */ + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: fdm=marker + * vim: noet sw=4 ts=4 + */ |