diff options
Diffstat (limited to 'ext')
| -rw-r--r-- | ext/ereg/CREDITS | 2 | ||||
| -rw-r--r-- | ext/ereg/config.w32 | 7 | ||||
| -rw-r--r-- | ext/ereg/config0.m4 | 56 | ||||
| -rw-r--r-- | ext/ereg/ereg.c (renamed from ext/standard/reg.c) | 168 | ||||
| -rw-r--r-- | ext/ereg/php_ereg.h (renamed from ext/standard/reg.h) | 27 | ||||
| -rw-r--r-- | ext/ereg/php_regex.h | 65 | ||||
| -rw-r--r-- | ext/ereg/regex/COPYRIGHT | 20 | ||||
| -rw-r--r-- | ext/ereg/regex/README | 32 | ||||
| -rw-r--r-- | ext/ereg/regex/WHATSNEW | 92 | ||||
| -rw-r--r-- | ext/ereg/regex/cclass.h | 30 | ||||
| -rw-r--r-- | ext/ereg/regex/cname.h | 102 | ||||
| -rw-r--r-- | ext/ereg/regex/debug.c | 242 | ||||
| -rw-r--r-- | ext/ereg/regex/debug.ih | 14 | ||||
| -rw-r--r-- | ext/ereg/regex/engine.c | 1019 | ||||
| -rw-r--r-- | ext/ereg/regex/engine.ih | 35 | ||||
| -rw-r--r-- | ext/ereg/regex/main.c | 510 | ||||
| -rw-r--r-- | ext/ereg/regex/main.ih | 19 | ||||
| -rw-r--r-- | ext/ereg/regex/mkh | 76 | ||||
| -rw-r--r-- | ext/ereg/regex/regcomp.c | 1613 | ||||
| -rw-r--r-- | ext/ereg/regex/regcomp.ih | 53 | ||||
| -rw-r--r-- | ext/ereg/regex/regerror.c | 126 | ||||
| -rw-r--r-- | ext/ereg/regex/regerror.ih | 12 | ||||
| -rw-r--r-- | ext/ereg/regex/regex.3 | 502 | ||||
| -rw-r--r-- | ext/ereg/regex/regex.7 | 233 | ||||
| -rw-r--r-- | ext/ereg/regex/regex.dsp | 106 | ||||
| -rw-r--r-- | ext/ereg/regex/regex.dsw | 29 | ||||
| -rw-r--r-- | ext/ereg/regex/regex.h | 83 | ||||
| -rw-r--r-- | ext/ereg/regex/regex.mak | 304 | ||||
| -rw-r--r-- | ext/ereg/regex/regex2.h | 140 | ||||
| -rw-r--r-- | ext/ereg/regex/regexec.c | 138 | ||||
| -rw-r--r-- | ext/ereg/regex/regfree.c | 37 | ||||
| -rw-r--r-- | ext/ereg/regex/split.c | 316 | ||||
| -rw-r--r-- | ext/ereg/regex/tests | 475 | ||||
| -rw-r--r-- | ext/ereg/regex/utils.h | 23 | ||||
| -rw-r--r-- | ext/ereg/tests/001.phpt (renamed from ext/standard/tests/reg/001.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/002.phpt (renamed from ext/standard/tests/reg/002.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/003.phpt (renamed from ext/standard/tests/reg/003.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/004.phpt (renamed from ext/standard/tests/reg/004.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/005.phpt (renamed from ext/standard/tests/reg/005.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/006.phpt (renamed from ext/standard/tests/reg/006.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/007.phpt (renamed from ext/standard/tests/reg/007.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/008.phpt (renamed from ext/standard/tests/reg/008.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/009.phpt (renamed from ext/standard/tests/reg/009.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/010.phpt (renamed from ext/standard/tests/reg/010.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/011.phpt (renamed from ext/standard/tests/reg/011.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/012.phpt (renamed from ext/standard/tests/reg/012.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/013.phpt (renamed from ext/standard/tests/reg/013.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/014.phpt (renamed from ext/standard/tests/reg/014.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/015.phpt (renamed from ext/standard/tests/reg/015.phpt) | 0 | ||||
| -rw-r--r-- | ext/ereg/tests/016.phpt (renamed from ext/standard/tests/reg/016.phpt) | 0 | ||||
| -rw-r--r-- | ext/pgsql/pgsql.c | 1 | ||||
| -rw-r--r-- | ext/standard/basic_functions.c | 62 | ||||
| -rw-r--r-- | ext/standard/browscap.c | 2 | ||||
| -rw-r--r-- | ext/standard/config.m4 | 48 | ||||
| -rw-r--r-- | ext/standard/config.w32 | 2 | ||||
| -rw-r--r-- | ext/standard/html.c | 1 | ||||
| -rw-r--r-- | ext/standard/php_standard.h | 1 | ||||
| -rw-r--r-- | ext/standard/string.c | 1 | 
58 files changed, 6656 insertions, 168 deletions
diff --git a/ext/ereg/CREDITS b/ext/ereg/CREDITS new file mode 100644 index 0000000000..9cd0ac2d1b --- /dev/null +++ b/ext/ereg/CREDITS @@ -0,0 +1,2 @@ +ereg +Rasmus Lerdorf, Jim Winstead, Jaakko Hyvätti diff --git a/ext/ereg/config.w32 b/ext/ereg/config.w32 new file mode 100644 index 0000000000..9a25368df0 --- /dev/null +++ b/ext/ereg/config.w32 @@ -0,0 +1,7 @@ +// $Id$ +// vim:ft=javascript + +EXTENSION("ereg", "ereg.c", false /* never shared */, "-Dregexec=php_regexec -Dregerror=php_regerror -Dregfree=php_regfree -Dregcomp=php_regcomp -Iext/ereg/regex"); +ADD_SOURCES("ext/ereg/regex", "regcomp.c regexec.c regerror.c regfree.c", "ereg"); +AC_DEFINE('REGEX', 1, 'Bundled regex'); +AC_DEFINE('HSREGEX', 1, 'Bundled regex'); diff --git a/ext/ereg/config0.m4 b/ext/ereg/config0.m4 new file mode 100644 index 0000000000..f4f8190932 --- /dev/null +++ b/ext/ereg/config0.m4 @@ -0,0 +1,56 @@ +dnl $Id$ +dnl config.m4 for extension ereg + +dnl +dnl Check for regex library type +dnl +PHP_ARG_WITH(regex,, +[  --with-regex=TYPE       regex library type: system, php. [TYPE=php] +                          WARNING: Do NOT use unless you know what you are doing!], php, no) + +case $PHP_REGEX in +  system) +    if test "$PHP_SAPI" = "apache" || test "$PHP_SAPI" = "apache2filter" || test "$PHP_SAPI" = "apache2handler"; then +      REGEX_TYPE=php +    else +      REGEX_TYPE=system +    fi +    ;; +  yes | php) +    REGEX_TYPE=php +    ;; +  *) +    REGEX_TYPE=php +    AC_MSG_WARN([Invalid regex library type selected. 
Using default value: php]) +    ;; +esac + +AC_MSG_CHECKING([which regex library to use]) +AC_MSG_RESULT([$REGEX_TYPE]) + +if test "$REGEX_TYPE" = "php"; then +  ereg_regex_sources="regex/regcomp.c regex/regexec.c regex/regerror.c regex/regfree.c" +  ereg_regex_headers="regex/" +  PHP_EREG_CFLAGS="-Dregexec=php_regexec -Dregerror=php_regerror -Dregfree=php_regfree -Dregcomp=php_regcomp" +fi + +PHP_NEW_EXTENSION(ereg, ereg.c $ereg_regex_sources, no,,$PHP_EREG_CFLAGS) +PHP_INSTALL_HEADERS([ext/ereg], [php_ereg.h php_regex.h $ereg_regex_headers]) + +if test "$REGEX_TYPE" = "php"; then +  AC_DEFINE(HAVE_REGEX_T_RE_MAGIC, 1, [ ]) +  AC_DEFINE(HSREGEX,1,[ ]) +  AC_DEFINE(REGEX,1,[ ])   +  PHP_ADD_BUILD_DIR([$ext_builddir/regex], 1) +  PHP_ADD_INCLUDE([$ext_srcdir/regex]) +elif test "$REGEX_TYPE" = "system"; then +  AC_DEFINE(REGEX,0,[ ]) +  dnl Check if field re_magic exists in struct regex_t +  AC_CACHE_CHECK([whether field re_magic exists in struct regex_t], ac_cv_regex_t_re_magic, [ +  AC_TRY_COMPILE([#include <sys/types.h> +#include <regex.h>], [regex_t rt; rt.re_magic;], +  [ac_cv_regex_t_re_magic=yes], [ac_cv_regex_t_re_magic=no])]) +  if test "$ac_cv_regex_t_re_magic" = "yes"; then +    AC_DEFINE([HAVE_REGEX_T_RE_MAGIC], [ ], 1)    +  fi  +fi    diff --git a/ext/standard/reg.c b/ext/ereg/ereg.c index f47f8d48af..d21d1b0c78 100644 --- a/ext/standard/reg.c +++ b/ext/ereg/ereg.c @@ -22,18 +22,96 @@  #include <stdio.h>  #include <ctype.h>  #include "php.h" -#include "php_string.h" -#include "reg.h" +#include "ext/standard/php_string.h" +#include "php_ereg.h"  #include "ext/standard/info.h" -ZEND_DECLARE_MODULE_GLOBALS(reg) +/* {{{ arginfo */ +static +ZEND_BEGIN_ARG_INFO_EX(arginfo_ereg, 0, 0, 2) +	ZEND_ARG_INFO(0, pattern) +	ZEND_ARG_INFO(0, string)  +	ZEND_ARG_INFO(1, registers) /* ARRAY_INFO(1, registers, 1) */ +ZEND_END_ARG_INFO() + +static +ZEND_BEGIN_ARG_INFO_EX(arginfo_eregi, 0, 0, 2) +	ZEND_ARG_INFO(0, pattern) +	ZEND_ARG_INFO(0, string)  +	ZEND_ARG_INFO(1, 
registers) /* ARRAY_INFO(1, registers, 1) */ +ZEND_END_ARG_INFO() + +static +ZEND_BEGIN_ARG_INFO(arginfo_ereg_replace, 0) +	ZEND_ARG_INFO(0, pattern) +	ZEND_ARG_INFO(0, replacement) +	ZEND_ARG_INFO(0, string) +ZEND_END_ARG_INFO() + +static +ZEND_BEGIN_ARG_INFO(arginfo_eregi_replace, 0) +	ZEND_ARG_INFO(0, pattern) +	ZEND_ARG_INFO(0, replacement) +	ZEND_ARG_INFO(0, string) +ZEND_END_ARG_INFO() + +static +ZEND_BEGIN_ARG_INFO_EX(arginfo_split, 0, 0, 2) +	ZEND_ARG_INFO(0, pattern) +	ZEND_ARG_INFO(0, string)  +	ZEND_ARG_INFO(0, limit)   +ZEND_END_ARG_INFO() + +static +ZEND_BEGIN_ARG_INFO_EX(arginfo_spliti, 0, 0, 2) +	ZEND_ARG_INFO(0, pattern) +	ZEND_ARG_INFO(0, string)  +	ZEND_ARG_INFO(0, limit)   +ZEND_END_ARG_INFO() + +static +ZEND_BEGIN_ARG_INFO(arginfo_sql_regcase, 0) +	ZEND_ARG_INFO(0, string) +ZEND_END_ARG_INFO() +/* }}} */ + +/* {{{ Function table */ +const zend_function_entry ereg_functions[] = { +	PHP_FE(ereg,						arginfo_ereg)   +	PHP_FE(ereg_replace,				arginfo_ereg_replace) +	PHP_FE(eregi,						arginfo_eregi) +	PHP_FE(eregi_replace,				arginfo_eregi_replace)    +	PHP_FE(split,						arginfo_split) +	PHP_FE(spliti,						arginfo_spliti) +	PHP_FE(sql_regcase,					arginfo_sql_regcase) +	{NULL, NULL, NULL} +}; +/* }}} */ +/* {{{ reg_cache */  typedef struct {  	regex_t preg;  	int cflags;  } reg_cache; -  static int reg_magic = 0; +/* }}} */ + +ZEND_DECLARE_MODULE_GLOBALS(ereg) + +/* {{{ Module entry */ +zend_module_entry ereg_module_entry = { +	STANDARD_MODULE_HEADER, +	"ereg", +	ereg_functions, +	PHP_MINIT(ereg), +	PHP_MSHUTDOWN(ereg), +	NULL, +	NULL, +	PHP_MINFO(ereg), +	NO_VERSION_YET, +	STANDARD_MODULE_PROPERTIES +}; +/* }}} */  /* {{{ _php_regcomp   */ @@ -44,7 +122,7 @@ static int _php_regcomp(regex_t *preg, const char *pattern, int cflags)  	reg_cache *rc = NULL;  	TSRMLS_FETCH(); -	if(zend_hash_find(&REG(ht_rc), (char *) pattern, patlen+1, (void **) &rc) == SUCCESS +	if(zend_hash_find(&EREG(ht_rc), (char *) pattern, patlen+1, (void **) &rc) == SUCCESS  	   
&& rc->cflags == cflags) {  #ifdef HAVE_REGEX_T_RE_MAGIC  		/* @@ -52,7 +130,7 @@ static int _php_regcomp(regex_t *preg, const char *pattern, int cflags)  		 * is, we flush it and compile the pattern from scratch.  		 */  		if (rc->preg.re_magic != reg_magic) { -			zend_hash_clean(&REG(ht_rc)); +			zend_hash_clean(&EREG(ht_rc));  		} else {  			memcpy(preg, &rc->preg, sizeof(*preg));  			return r; @@ -71,7 +149,7 @@ static int _php_regcomp(regex_t *preg, const char *pattern, int cflags)  		 * it's good.  		 */  		if (!reg_magic) reg_magic = preg->re_magic; -		zend_hash_update(&REG(ht_rc), (char *) pattern, patlen+1, +		zend_hash_update(&EREG(ht_rc), (char *) pattern, patlen+1,  						 (void *) &rcp, sizeof(rcp), NULL);  	}  #else @@ -83,7 +161,7 @@ static int _php_regcomp(regex_t *preg, const char *pattern, int cflags)  			rcp.cflags = cflags;  			memcpy(&rcp.preg, preg, sizeof(*preg)); -			zend_hash_update(&REG(ht_rc), (char *) pattern, patlen+1, +			zend_hash_update(&EREG(ht_rc), (char *) pattern, patlen+1,  							 (void *) &rcp, sizeof(rcp), NULL);  		}  	} @@ -92,7 +170,7 @@ static int _php_regcomp(regex_t *preg, const char *pattern, int cflags)  }  /* }}} */ -static void _free_reg_cache(reg_cache *rc)  +static void _free_ereg_cache(reg_cache *rc)   {  	regfree(&rc->preg);  } @@ -102,45 +180,47 @@ static void _free_reg_cache(reg_cache *rc)  #undef regcomp  #define regcomp(a, b, c) _php_regcomp(a, b, c) -static void php_reg_init_globals(zend_reg_globals *reg_globals TSRMLS_DC) +static void php_ereg_init_globals(zend_ereg_globals *ereg_globals TSRMLS_DC)  { -	zend_hash_init(&reg_globals->ht_rc, 0, NULL, (void (*)(void *)) _free_reg_cache, 1); +	zend_hash_init(&ereg_globals->ht_rc, 0, NULL, (void (*)(void *)) _free_ereg_cache, 1);  } -static void php_reg_destroy_globals(zend_reg_globals *reg_globals TSRMLS_DC) +static void php_ereg_destroy_globals(zend_ereg_globals *ereg_globals TSRMLS_DC)  { -	zend_hash_destroy(&reg_globals->ht_rc); +	zend_hash_destroy(&ereg_globals->ht_rc);  
} -PHP_MINIT_FUNCTION(regex) +PHP_MINIT_FUNCTION(ereg)  { -	ZEND_INIT_MODULE_GLOBALS(reg, php_reg_init_globals, php_reg_destroy_globals); +	ZEND_INIT_MODULE_GLOBALS(ereg, php_ereg_init_globals, php_ereg_destroy_globals);  	return SUCCESS;  } -PHP_MSHUTDOWN_FUNCTION(regex) +PHP_MSHUTDOWN_FUNCTION(ereg)  {  #ifndef ZTS -	php_reg_destroy_globals(&reg_globals TSRMLS_CC); +	php_ereg_destroy_globals(&ereg_globals TSRMLS_CC);  #endif  	return SUCCESS;  } -PHP_MINFO_FUNCTION(regex) +PHP_MINFO_FUNCTION(ereg)  { +	php_info_print_table_start();  #if HSREGEX  	php_info_print_table_row(2, "Regex Library", "Bundled library enabled");  #else  	php_info_print_table_row(2, "Regex Library", "System library enabled");  #endif +	php_info_print_table_end();  } -/* {{{ php_reg_eprint - * php_reg_eprint - convert error number to name +/* {{{ php_ereg_eprint + * php_ereg_eprint - convert error number to name   */ -static void php_reg_eprint(int err, regex_t *re) { +static void php_ereg_eprint(int err, regex_t *re) {  	char *buf = NULL, *message = NULL;  	size_t len;  	size_t buf_len; @@ -198,7 +278,7 @@ static void php_ereg(INTERNAL_FUNCTION_PARAMETERS, int icase)  	int   argc = ZEND_NUM_ARGS();  	if (argc < 2 || argc > 3 || -	    zend_get_parameters_ex(argc, &regex, &findin, &array) == FAILURE) { +		zend_get_parameters_ex(argc, &regex, &findin, &array) == FAILURE) {  		WRONG_PARAM_COUNT;  	} @@ -221,7 +301,7 @@ static void php_ereg(INTERNAL_FUNCTION_PARAMETERS, int icase)  	}  	if (err) { -		php_reg_eprint(err, &re); +		php_ereg_eprint(err, &re);  		RETURN_FALSE;  	} @@ -235,7 +315,7 @@ static void php_ereg(INTERNAL_FUNCTION_PARAMETERS, int icase)  	/* actually execute the regular expression */  	err = regexec(&re, string, re.re_nsub+1, subs, 0);  	if (err && err != REG_NOMATCH) { -		php_reg_eprint(err, &re); +		php_ereg_eprint(err, &re);  		regfree(&re);  		efree(subs);  		RETURN_FALSE; @@ -292,15 +372,15 @@ PHP_FUNCTION(eregi)  }  /* }}} */ -/* {{{ php_reg_replace +/* {{{ php_ereg_replace   * 
this is the meat and potatoes of regex replacement! */ -PHPAPI char *php_reg_replace(const char *pattern, const char *replace, const char *string, int icase, int extended) +PHPAPI char *php_ereg_replace(const char *pattern, const char *replace, const char *string, int icase, int extended)  {  	regex_t re;  	regmatch_t *subs;  	char *buf,	/* buf is where we build the replaced string */ -	     *nbuf,	/* nbuf is used when we grow the buffer */ +		 *nbuf,	/* nbuf is used when we grow the buffer */  		 *walkbuf; /* used to walk buf when replacing backrefs */  	const char *walk; /* used to walk replacement string for backrefs */  	int buf_len; @@ -318,7 +398,7 @@ PHPAPI char *php_reg_replace(const char *pattern, const char *replace, const cha  	err = regcomp(&re, pattern, copts);  	if (err) { -		php_reg_eprint(err, &re); +		php_ereg_eprint(err, &re);  		return ((char *) -1);  	} @@ -337,7 +417,7 @@ PHPAPI char *php_reg_replace(const char *pattern, const char *replace, const cha  		err = regexec(&re, &string[pos], re.re_nsub+1, subs, (pos ? 
REG_NOTBOL : 0));  		if (err && err != REG_NOMATCH) { -			php_reg_eprint(err, &re); +			php_ereg_eprint(err, &re);  			efree(subs);  			efree(buf);  			regfree(&re); @@ -358,7 +438,7 @@ PHPAPI char *php_reg_replace(const char *pattern, const char *replace, const cha  				if ('\\' == *walk && isdigit((unsigned char)walk[1]) && ((unsigned char)walk[1]) - '0' <= (int)re.re_nsub) {  					if (subs[walk[1] - '0'].rm_so > -1 && subs[walk[1] - '0'].rm_eo > -1) {  						new_l += subs[walk[1] - '0'].rm_eo - subs[walk[1] - '0'].rm_so; -					}     +					}  					walk += 2;  				} else {  					new_l++; @@ -438,9 +518,9 @@ PHPAPI char *php_reg_replace(const char *pattern, const char *replace, const cha  }  /* }}} */ -/* {{{ php_ereg_replace +/* {{{ php_do_ereg_replace   */ -static void php_ereg_replace(INTERNAL_FUNCTION_PARAMETERS, int icase) +static void php_do_ereg_replace(INTERNAL_FUNCTION_PARAMETERS, int icase)  {  	zval **arg_pattern,  		**arg_replace, @@ -451,7 +531,7 @@ static void php_ereg_replace(INTERNAL_FUNCTION_PARAMETERS, int icase)  	char *ret;  	if (ZEND_NUM_ARGS() != 3 ||  -	    zend_get_parameters_ex(3, &arg_pattern, &arg_replace, &arg_string) == FAILURE) { +		zend_get_parameters_ex(3, &arg_pattern, &arg_replace, &arg_string) == FAILURE) {  		WRONG_PARAM_COUNT;  	} @@ -486,7 +566,7 @@ static void php_ereg_replace(INTERNAL_FUNCTION_PARAMETERS, int icase)  		string = STR_EMPTY_ALLOC();  	/* do the actual work */ -	ret = php_reg_replace(pattern, replace, string, icase, 1); +	ret = php_ereg_replace(pattern, replace, string, icase, 1);  	if (ret == (char *) -1) {  		RETVAL_FALSE;  	} else { @@ -504,7 +584,7 @@ static void php_ereg_replace(INTERNAL_FUNCTION_PARAMETERS, int icase)     Replace regular expression */  PHP_FUNCTION(ereg_replace)  { -	php_ereg_replace(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); +	php_do_ereg_replace(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);  }  /* }}} */ @@ -512,7 +592,7 @@ PHP_FUNCTION(ereg_replace)     Case insensitive replace regular expression */  
PHP_FUNCTION(eregi_replace)  { -	php_ereg_replace(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); +	php_do_ereg_replace(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);  }  /* }}} */ @@ -528,27 +608,27 @@ static void php_split(INTERNAL_FUNCTION_PARAMETERS, int icase)  	int argc = ZEND_NUM_ARGS();  	if (argc < 2 || argc > 3 || -	    zend_get_parameters_ex(argc, &spliton, &str, &arg_count) == FAILURE) { +		zend_get_parameters_ex(argc, &spliton, &str, &arg_count) == FAILURE) {  		WRONG_PARAM_COUNT;  	} -     +  	if (argc > 2) {  		convert_to_long_ex(arg_count);  		count = Z_LVAL_PP(arg_count);  	} -     +  	if (icase)  		copts = REG_ICASE; -     -	convert_to_string_ex(spliton);                                         -	convert_to_string_ex(str);                                             + +	convert_to_string_ex(spliton); +	convert_to_string_ex(str);  	strp = Z_STRVAL_PP(str);  	endp = strp + Z_STRLEN_PP(str);  	err = regcomp(&re, Z_STRVAL_PP(spliton), REG_EXTENDED | copts);  	if (err) { -		php_reg_eprint(err, &re); +		php_ereg_eprint(err, &re);  		RETURN_FALSE;  	} @@ -590,7 +670,7 @@ static void php_split(INTERNAL_FUNCTION_PARAMETERS, int icase)  	/* see if we encountered an error */  	if (err && err != REG_NOMATCH) { -		php_reg_eprint(err, &re); +		php_ereg_eprint(err, &re);  		regfree(&re);  		zend_hash_destroy(Z_ARRVAL_P(return_value));  		efree(Z_ARRVAL_P(return_value)); diff --git a/ext/standard/reg.h b/ext/ereg/php_ereg.h index a577db8a96..19c7f426bf 100644 --- a/ext/standard/reg.h +++ b/ext/ereg/php_ereg.h @@ -19,10 +19,15 @@  /* $Id$ */ -#ifndef REG_H -#define REG_H +#ifndef EREG_H +#define EREG_H -PHPAPI char *php_reg_replace(const char *pattern, const char *replace, const char *string, int icase, int extended); +#include "php_regex.h" + +extern zend_module_entry ereg_module_entry; +#define phpext_ereg_ptr &ereg_module_entry + +PHPAPI char *php_ereg_replace(const char *pattern, const char *replace, const char *string, int icase, int extended);  PHP_FUNCTION(ereg);  
PHP_FUNCTION(eregi); @@ -32,19 +37,19 @@ PHP_FUNCTION(split);  PHP_FUNCTION(spliti);  PHPAPI PHP_FUNCTION(sql_regcase); -ZEND_BEGIN_MODULE_GLOBALS(reg) +ZEND_BEGIN_MODULE_GLOBALS(ereg)  	HashTable ht_rc; -ZEND_END_MODULE_GLOBALS(reg) - -PHP_MINIT_FUNCTION(regex); -PHP_MSHUTDOWN_FUNCTION(regex); -PHP_MINFO_FUNCTION(regex); +ZEND_END_MODULE_GLOBALS(ereg) +/* Module functions */ +PHP_MINIT_FUNCTION(ereg); +PHP_MSHUTDOWN_FUNCTION(ereg); +PHP_MINFO_FUNCTION(ereg);  #ifdef ZTS -#define REG(v) TSRMG(reg_globals_id, zend_reg_globals *, v) +#define EREG(v) TSRMG(ereg_globals_id, zend_ereg_globals *, v)  #else -#define REG(v) (reg_globals.v) +#define EREG(v) (ereg_globals.v)  #endif  #endif /* REG_H */ diff --git a/ext/ereg/php_regex.h b/ext/ereg/php_regex.h new file mode 100644 index 0000000000..8f6362dc5b --- /dev/null +++ b/ext/ereg/php_regex.h @@ -0,0 +1,65 @@ +/* +  +----------------------------------------------------------------------+ +  | PHP Version 5                                                        | +  +----------------------------------------------------------------------+ +  | Copyright (c) 1997-2007 The PHP Group                                | +  +----------------------------------------------------------------------+ +  | This source file is subject to version 3.01 of the PHP license,      | +  | that is bundled with this package in the file LICENSE, and is        | +  | available through the world-wide-web at the following url:           | +  | http://www.php.net/license/3_01.txt                                  | +  | If you did not receive a copy of the PHP license and are unable to   | +  | obtain it through the world-wide-web, please send a note to          | +  | license@php.net so we can mail you a copy immediately.               
| +  +----------------------------------------------------------------------+ +  | Author:                                                              | +  +----------------------------------------------------------------------+ +*/ + +/* $Id$ */ + +#ifndef PHP_REGEX_H +#define PHP_REGEX_H + +/* + * REGEX means: + * 0.. system regex + * 1.. bundled regex + */ + +#if (REGEX == 1) +/* Define aliases */ +#define regexec php_regexec +#define regerror php_regerror +#define regfree php_regfree +#define regcomp php_regcomp + +#include "ext/ereg/regex/regex.h" + +#undef _PCREPOSIX_H +#define _PCREPOSIX_H 1 + +#ifndef _REGEX_H +#define _REGEX_H 1				/* this should stop Apache from loading the system version of regex.h */ +#endif +#ifndef _REGEX_H_ +#define _REGEX_H_ 1 +#endif +#ifndef _RX_H +#define _RX_H 1				  	/* Try defining these for Linux to	*/ +#endif +#ifndef __REGEXP_LIBRARY_H__ +#define __REGEXP_LIBRARY_H__ 1 	/* avoid Apache including regex.h	*/ +#endif +#ifndef _H_REGEX +#define _H_REGEX 1              /* This one is for AIX */ +#endif + +#elif REGEX == 0 +#include <regex.h> +#ifndef _REGEX_H_ +#define _REGEX_H_ 1 +#endif +#endif + +#endif /* PHP_REGEX_H */ diff --git a/ext/ereg/regex/COPYRIGHT b/ext/ereg/regex/COPYRIGHT new file mode 100644 index 0000000000..d43362fbfc --- /dev/null +++ b/ext/ereg/regex/COPYRIGHT @@ -0,0 +1,20 @@ +Copyright 1992, 1993, 1994 Henry Spencer.  All rights reserved. +This software is not subject to any license of the American Telephone +and Telegraph Company or of the Regents of the University of California. + +Permission is granted to anyone to use this software for any purpose on +any computer system, and to alter it and redistribute it, subject +to the following restrictions: + +1. The author is not responsible for the consequences of use of this +   software, no matter how awful, even if they arise from flaws in it. + +2. The origin of this software must not be misrepresented, either by +   explicit claim or by omission.  
Since few users ever read sources, +   credits must appear in the documentation. + +3. Altered versions must be plainly marked as such, and must not be +   misrepresented as being the original software.  Since few users +   ever read sources, credits must appear in the documentation. + +4. This notice may not be removed or altered. diff --git a/ext/ereg/regex/README b/ext/ereg/regex/README new file mode 100644 index 0000000000..cea9b67b66 --- /dev/null +++ b/ext/ereg/regex/README @@ -0,0 +1,32 @@ +alpha3.4 release. +Thu Mar 17 23:17:18 EST 1994 +henry@zoo.toronto.edu + +See WHATSNEW for change listing. + +installation notes: +-------- +Read the comments at the beginning of Makefile before running. + +Utils.h contains some things that just might have to be modified on +some systems, as well as a nested include (ugh) of <assert.h>. + +The "fake" directory contains quick-and-dirty fakes for some header +files and routines that old systems may not have.  Note also that +-DUSEBCOPY will make utils.h substitute bcopy() for memmove(). + +After that, "make r" will build regcomp.o, regexec.o, regfree.o, +and regerror.o (the actual routines), bundle them together into a test +program, and run regression tests on them.  No output is good output. + +"make lib" builds just the .o files for the actual routines (when +you're happy with testing and have adjusted CFLAGS for production), +and puts them together into libregex.a.  You can pick up either the +library or *.o ("make lib" makes sure there are no other .o files left +around to confuse things). + +Main.c, debug.c, split.c are used for regression testing but are not part +of the RE routines themselves. + +Regex.h goes in /usr/include.  All other .h files are internal only. 
+-------- diff --git a/ext/ereg/regex/WHATSNEW b/ext/ereg/regex/WHATSNEW new file mode 100644 index 0000000000..6e82e1dae0 --- /dev/null +++ b/ext/ereg/regex/WHATSNEW @@ -0,0 +1,92 @@ +New in alpha3.4:  The complex bug alluded to below has been fixed (in a +slightly kludgey temporary way that may hurt efficiency a bit; this is +another "get it out the door for 4.4" release).  The tests at the end of +the tests file have accordingly been uncommented.  The primary sign of +the bug was that something like a?b matching ab matched b rather than ab. +(The bug was essentially specific to this exact situation, else it would +have shown up earlier.) + +New in alpha3.3:  The definition of word boundaries has been altered +slightly, to more closely match the usual programming notion that "_" +is an alphabetic.  Stuff used for pre-ANSI systems is now in a subdir, +and the makefile no longer alludes to it in mysterious ways.  The +makefile has generally been cleaned up some.  Fixes have been made +(again!) so that the regression test will run without -DREDEBUG, at +the cost of weaker checking.  A workaround for a bug in some folks' +<assert.h> has been added.  And some more things have been added to +tests, including a couple right at the end which are commented out +because the code currently flunks them (complex bug; fix coming). +Plus the usual minor cleanup. + +New in alpha3.2:  Assorted bits of cleanup and portability improvement +(the development base is now a BSDI system using GCC instead of an ancient +Sun system, and the newer compiler exposed some glitches).  Fix for a +serious bug that affected REs using many [] (including REG_ICASE REs +because of the way they are implemented), *sometimes*, depending on +memory-allocation patterns.  The header-file prototypes no longer name +the parameters, avoiding possible name conflicts.  The possibility that +some clot has defined CHAR_MIN as (say) `-128' instead of `(-128)' is +now handled gracefully.  
"uchar" is no longer used as an internal type +name (too many people have the same idea).  Still the same old lousy +performance, alas. + +New in alpha3.1:  Basically nothing, this release is just a bookkeeping +convenience.  Stay tuned. + +New in alpha3.0:  Performance is no better, alas, but some fixes have been +made and some functionality has been added.  (This is basically the "get +it out the door in time for 4.4" release.)  One bug fix:  regfree() didn't +free the main internal structure (how embarrassing).  It is now possible +to put NULs in either the RE or the target string, using (resp.) a new +REG_PEND flag and the old REG_STARTEND flag.  The REG_NOSPEC flag to +regcomp() makes all characters ordinary, so you can match a literal +string easily (this will become more useful when performance improves!). +There are now primitives to match beginnings and ends of words, although +the syntax is disgusting and so is the implementation.  The REG_ATOI +debugging interface has changed a bit.  And there has been considerable +internal cleanup of various kinds. + +New in alpha2.3:  Split change list out of README, and moved flags notes +into Makefile.  Macro-ized the name of regex(7) in regex(3), since it has +to change for 4.4BSD.  Cleanup work in engine.c, and some new regression +tests to catch tricky cases thereof. + +New in alpha2.2:  Out-of-date manpages updated.  Regerror() acquires two +small extensions -- REG_ITOA and REG_ATOI -- which avoid debugging kludges +in my own test program and might be useful to others for similar purposes. +The regression test will now compile (and run) without REDEBUG.  The +BRE \$ bug is fixed.  Most uses of "uchar" are gone; it's all chars now. +Char/uchar parameters are now written int/unsigned, to avoid possible +portability problems with unpromoted parameters.  Some unsigned casts have +been introduced to minimize portability problems with shifting into sign +bits. 
+ +New in alpha2.1:  Lots of little stuff, cleanup and fixes.  The one big +thing is that regex.h is now generated, using mkh, rather than being +supplied in the distribution; due to circularities in dependencies, +you have to build regex.h explicitly by "make h".  The two known bugs +have been fixed (and the regression test now checks for them), as has a +problem with assertions not being suppressed in the absence of REDEBUG. +No performance work yet. + +New in alpha2:  Backslash-anything is an ordinary character, not an +error (except, of course, for the handful of backslashed metacharacters +in BREs), which should reduce script breakage.  The regression test +checks *where* null strings are supposed to match, and has generally +been tightened up somewhat.  Small bug fixes in parameter passing (not +harmful, but technically errors) and some other areas.  Debugging +invoked by defining REDEBUG rather than not defining NDEBUG. + +New in alpha+3:  full prototyping for internal routines, using a little +helper program, mkh, which extracts prototypes given in stylized comments. +More minor cleanup.  Buglet fix:  it's CHAR_BIT, not CHAR_BITS.  Simple +pre-screening of input when a literal string is known to be part of the +RE; this does wonders for performance. + +New in alpha+2:  minor bits of cleanup.  Notably, the number "32" for the +word width isn't hardwired into regexec.c any more, the public header +file prototypes the functions if __STDC__ is defined, and some small typos +in the manpages have been fixed. + +New in alpha+1:  improvements to the manual pages, and an important +extension, the REG_STARTEND option to regexec(). 
diff --git a/ext/ereg/regex/cclass.h b/ext/ereg/regex/cclass.h new file mode 100644 index 0000000000..df41694b04 --- /dev/null +++ b/ext/ereg/regex/cclass.h @@ -0,0 +1,30 @@ +/* character-class table */ +static struct cclass { +	unsigned char *name; +	unsigned char *chars; +	unsigned char *multis; +} cclasses[] = { +	{"alnum",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",				""}, +	{"alpha",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", +					""}, +	{"blank",	" \t",		""}, +	{"cntrl",	"\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\ +\25\26\27\30\31\32\33\34\35\36\37\177",	""}, +	{"digit",	"0123456789",	""}, +	{"graph",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ +0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", +					""}, +	{"lower",	"abcdefghijklmnopqrstuvwxyz", +					""}, +	{"print",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ +0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", +					""}, +	{"punct",	"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", +					""}, +	{"space",	"\t\n\v\f\r ",	""}, +	{"upper",	"ABCDEFGHIJKLMNOPQRSTUVWXYZ", +					""}, +	{"xdigit",	"0123456789ABCDEFabcdef", +					""}, +	{NULL,		0,		""} +}; diff --git a/ext/ereg/regex/cname.h b/ext/ereg/regex/cname.h new file mode 100644 index 0000000000..670b273882 --- /dev/null +++ b/ext/ereg/regex/cname.h @@ -0,0 +1,102 @@ +/* character-name table */ +static struct cname { +	char *name; +	char code; +} cnames[] = { +	{"NUL",	'\0'}, +	{"SOH",	'\001'}, +	{"STX",	'\002'}, +	{"ETX",	'\003'}, +	{"EOT",	'\004'}, +	{"ENQ",	'\005'}, +	{"ACK",	'\006'}, +	{"BEL",	'\007'}, +	{"alert",	'\007'}, +	{"BS",		'\010'}, +	{"backspace",	'\b'}, +	{"HT",		'\011'}, +	{"tab",		'\t'}, +	{"LF",		'\012'}, +	{"newline",	'\n'}, +	{"VT",		'\013'}, +	{"vertical-tab",	'\v'}, +	{"FF",		'\014'}, +	{"form-feed",	'\f'}, +	{"CR",		'\015'}, +	{"carriage-return",	'\r'}, +	{"SO",	'\016'}, +	{"SI",	'\017'}, +	{"DLE",	'\020'}, +	{"DC1",	'\021'}, +	{"DC2",	'\022'}, +	{"DC3",	'\023'}, +	{"DC4",	'\024'}, +	
{"NAK",	'\025'}, +	{"SYN",	'\026'}, +	{"ETB",	'\027'}, +	{"CAN",	'\030'}, +	{"EM",	'\031'}, +	{"SUB",	'\032'}, +	{"ESC",	'\033'}, +	{"IS4",	'\034'}, +	{"FS",	'\034'}, +	{"IS3",	'\035'}, +	{"GS",	'\035'}, +	{"IS2",	'\036'}, +	{"RS",	'\036'}, +	{"IS1",	'\037'}, +	{"US",	'\037'}, +	{"space",		' '}, +	{"exclamation-mark",	'!'}, +	{"quotation-mark",	'"'}, +	{"number-sign",		'#'}, +	{"dollar-sign",		'$'}, +	{"percent-sign",		'%'}, +	{"ampersand",		'&'}, +	{"apostrophe",		'\''}, +	{"left-parenthesis",	'('}, +	{"right-parenthesis",	')'}, +	{"asterisk",	'*'}, +	{"plus-sign",	'+'}, +	{"comma",	','}, +	{"hyphen",	'-'}, +	{"hyphen-minus",	'-'}, +	{"period",	'.'}, +	{"full-stop",	'.'}, +	{"slash",	'/'}, +	{"solidus",	'/'}, +	{"zero",		'0'}, +	{"one",		'1'}, +	{"two",		'2'}, +	{"three",	'3'}, +	{"four",		'4'}, +	{"five",		'5'}, +	{"six",		'6'}, +	{"seven",	'7'}, +	{"eight",	'8'}, +	{"nine",		'9'}, +	{"colon",	':'}, +	{"semicolon",	';'}, +	{"less-than-sign",	'<'}, +	{"equals-sign",		'='}, +	{"greater-than-sign",	'>'}, +	{"question-mark",	'?'}, +	{"commercial-at",	'@'}, +	{"left-square-bracket",	'['}, +	{"backslash",		'\\'}, +	{"reverse-solidus",	'\\'}, +	{"right-square-bracket",	']'}, +	{"circumflex",		'^'}, +	{"circumflex-accent",	'^'}, +	{"underscore",		'_'}, +	{"low-line",		'_'}, +	{"grave-accent",		'`'}, +	{"left-brace",		'{'}, +	{"left-curly-bracket",	'{'}, +	{"vertical-line",	'|'}, +	{"right-brace",		'}'}, +	{"right-curly-bracket",	'}'}, +	{"tilde",		'~'}, +	{"DEL",	'\177'}, +	{NULL,	0}, +}; diff --git a/ext/ereg/regex/debug.c b/ext/ereg/regex/debug.c new file mode 100644 index 0000000000..3db93ef293 --- /dev/null +++ b/ext/ereg/regex/debug.c @@ -0,0 +1,242 @@ +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <limits.h> +#include <stdlib.h> +#include <sys/types.h> +#include <regex.h> + +#include "utils.h" +#include "regex2.h" +#include "debug.ih" + +/* + - regprint - print a regexp for debugging + == void regprint(regex_t *r, FILE *d); + */ +void 
+regprint(r, d) +regex_t *r; +FILE *d; +{ +	register struct re_guts *g = r->re_g; +	register int i; +	register int c; +	register int last; +	int nincat[NC]; + +	fprintf(d, "%ld states, %d categories", (long)g->nstates, +							g->ncategories); +	fprintf(d, ", first %ld last %ld", (long)g->firststate, +						(long)g->laststate); +	if (g->iflags&USEBOL) +		fprintf(d, ", USEBOL"); +	if (g->iflags&USEEOL) +		fprintf(d, ", USEEOL"); +	if (g->iflags&BAD) +		fprintf(d, ", BAD"); +	if (g->nsub > 0) +		fprintf(d, ", nsub=%ld", (long)g->nsub); +	if (g->must != NULL) +		fprintf(d, ", must(%ld) `%*s'", (long)g->mlen, (int)g->mlen, +								g->must); +	if (g->backrefs) +		fprintf(d, ", backrefs"); +	if (g->nplus > 0) +		fprintf(d, ", nplus %ld", (long)g->nplus); +	fprintf(d, "\n"); +	s_print(g, d); +	for (i = 0; i < g->ncategories; i++) { +		nincat[i] = 0; +		for (c = CHAR_MIN; c <= CHAR_MAX; c++) +			if (g->categories[c] == i) +				nincat[i]++; +	} +	fprintf(d, "cc0#%d", nincat[0]); +	for (i = 1; i < g->ncategories; i++) +		if (nincat[i] == 1) { +			for (c = CHAR_MIN; c <= CHAR_MAX; c++) +				if (g->categories[c] == i) +					break; +			fprintf(d, ", %d=%s", i, regchar(c)); +		} +	fprintf(d, "\n"); +	for (i = 1; i < g->ncategories; i++) +		if (nincat[i] != 1) { +			fprintf(d, "cc%d\t", i); +			last = -1; +			for (c = CHAR_MIN; c <= CHAR_MAX+1; c++)	/* +1 does flush */ +				if (c <= CHAR_MAX && g->categories[c] == i) { +					if (last < 0) { +						fprintf(d, "%s", regchar(c)); +						last = c; +					} +				} else { +					if (last >= 0) { +						if (last != c-1) +							fprintf(d, "-%s", +								regchar(c-1)); +						last = -1; +					} +				} +			fprintf(d, "\n"); +		} +} + +/* + - s_print - print the strip for debugging + == static void s_print(register struct re_guts *g, FILE *d); + */ +static void +s_print(g, d) +register struct re_guts *g; +FILE *d; +{ +	register sop *s; +	register cset *cs; +	register int i; +	register int done = 0; +	register sop opnd; +	register int col = 
0; +	register int last; +	register sopno offset = 2; +#	define	GAP()	{	if (offset % 5 == 0) { \ +					if (col > 40) { \ +						fprintf(d, "\n\t"); \ +						col = 0; \ +					} else { \ +						fprintf(d, " "); \ +						col++; \ +					} \ +				} else \ +					col++; \ +				offset++; \ +			} + +	if (OP(g->strip[0]) != OEND) +		fprintf(d, "missing initial OEND!\n"); +	for (s = &g->strip[1]; !done; s++) { +		opnd = OPND(*s); +		switch (OP(*s)) { +		case OEND: +			fprintf(d, "\n"); +			done = 1; +			break; +		case OCHAR: +			if (strchr("\\|()^$.[+*?{}!<> ", (char)opnd) != NULL) +				fprintf(d, "\\%c", (unsigned char)opnd); +			else +				fprintf(d, "%s", regchar((unsigned char)opnd)); +			break; +		case OBOL: +			fprintf(d, "^"); +			break; +		case OEOL: +			fprintf(d, "$"); +			break; +		case OBOW: +			fprintf(d, "\\{"); +			break; +		case OEOW: +			fprintf(d, "\\}"); +			break; +		case OANY: +			fprintf(d, "."); +			break; +		case OANYOF: +			fprintf(d, "[(%ld)", (long)opnd); +			cs = &g->sets[opnd]; +			last = -1; +			for (i = 0; i < g->csetsize+1; i++)	/* +1 flushes */ +				if (CHIN(cs, i) && i < g->csetsize) { +					if (last < 0) { +						fprintf(d, "%s", regchar(i)); +						last = i; +					} +				} else { +					if (last >= 0) { +						if (last != i-1) +							fprintf(d, "-%s", +								regchar(i-1)); +						last = -1; +					} +				} +			fprintf(d, "]"); +			break; +		case OBACK_: +			fprintf(d, "(\\<%ld>", (long)opnd); +			break; +		case O_BACK: +			fprintf(d, "<%ld>\\)", (long)opnd); +			break; +		case OPLUS_: +			fprintf(d, "(+"); +			if (OP(*(s+opnd)) != O_PLUS) +				fprintf(d, "<%ld>", (long)opnd); +			break; +		case O_PLUS: +			if (OP(*(s-opnd)) != OPLUS_) +				fprintf(d, "<%ld>", (long)opnd); +			fprintf(d, "+)"); +			break; +		case OQUEST_: +			fprintf(d, "(?"); +			if (OP(*(s+opnd)) != O_QUEST) +				fprintf(d, "<%ld>", (long)opnd); +			break; +		case O_QUEST: +			if (OP(*(s-opnd)) != OQUEST_) +				fprintf(d, "<%ld>", (long)opnd); +			fprintf(d, "?)"); +			
break; +		case OLPAREN: +			fprintf(d, "((<%ld>", (long)opnd); +			break; +		case ORPAREN: +			fprintf(d, "<%ld>))", (long)opnd); +			break; +		case OCH_: +			fprintf(d, "<"); +			if (OP(*(s+opnd)) != OOR2) +				fprintf(d, "<%ld>", (long)opnd); +			break; +		case OOR1: +			if (OP(*(s-opnd)) != OOR1 && OP(*(s-opnd)) != OCH_) +				fprintf(d, "<%ld>", (long)opnd); +			fprintf(d, "|"); +			break; +		case OOR2: +			fprintf(d, "|"); +			if (OP(*(s+opnd)) != OOR2 && OP(*(s+opnd)) != O_CH) +				fprintf(d, "<%ld>", (long)opnd); +			break; +		case O_CH: +			if (OP(*(s-opnd)) != OOR1) +				fprintf(d, "<%ld>", (long)opnd); +			fprintf(d, ">"); +			break; +		default: +			fprintf(d, "!%ld(%ld)!", OP(*s), opnd); +			break; +		} +		if (!done) +			GAP(); +	} +} + +/* + - regchar - make a character printable + == static char *regchar(int ch); + */ +static unsigned char *			/* -> representation */ +regchar(ch) +int ch; +{ +	static unsigned char buf[10]; + +	if (isprint(ch) || ch == ' ') +		sprintf(buf, "%c", ch); +	else +		sprintf(buf, "\\%o", ch); +	return(buf); +} diff --git a/ext/ereg/regex/debug.ih b/ext/ereg/regex/debug.ih new file mode 100644 index 0000000000..5f40ff7917 --- /dev/null +++ b/ext/ereg/regex/debug.ih @@ -0,0 +1,14 @@ +/* ========= begin header generated by ./mkh ========= */ +#ifdef __cplusplus +extern "C" { +#endif + +/* === debug.c === */ +void regprint(regex_t *r, FILE *d); +static void s_print(register struct re_guts *g, FILE *d); +static char *regchar(int ch); + +#ifdef __cplusplus +} +#endif +/* ========= end header generated by ./mkh ========= */ diff --git a/ext/ereg/regex/engine.c b/ext/ereg/regex/engine.c new file mode 100644 index 0000000000..0682267f61 --- /dev/null +++ b/ext/ereg/regex/engine.c @@ -0,0 +1,1019 @@ +/* + * The matching engine and friends.  This file is #included by regexec.c + * after suitable #defines of a variety of macros used herein, so that + * different state representations can be used without duplicating masses + * of code. 
+ */
+
+#ifdef SNAMES
+#define	matcher	smatcher
+#define	fast	sfast
+#define	slow	sslow
+#define	dissect	sdissect
+#define	backref	sbackref
+#define	step	sstep
+#define	print	sprint
+#define	at	sat
+#define	match	smat
+#endif
+#ifdef LNAMES
+#define	matcher	lmatcher
+#define	fast	lfast
+#define	slow	lslow
+#define	dissect	ldissect
+#define	backref	lbackref
+#define	step	lstep
+#define	print	lprint
+#define	at	lat
+#define	match	lmat
+#endif
+
+/* another structure passed up and down to avoid zillions of parameters */
+struct match {
+	struct re_guts *g;
+	int eflags;
+	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
+	unsigned char *offp;		/* offsets work from here */
+	unsigned char *beginp;		/* start of string -- virtual NUL precedes */
+	unsigned char *endp;		/* end of string -- virtual NUL here */
+	unsigned char *coldp;		/* can be no match starting before here */
+	unsigned char **lastpos;		/* [nplus+1] */
+	STATEVARS;
+	states st;		/* current states */
+	states fresh;		/* states for a fresh start */
+	states tmp;		/* temporary */
+	states empty;		/* empty set of states */
+};
+
+#include "engine.ih"
+
+/* tracing hooks; they expand to nothing unless REDEBUG is defined */
+#ifdef REDEBUG
+#define	SP(t, s, c)	print(m, t, s, c, stdout)
+#define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
+#define	NOTE(str)	{ if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
+#else
+#define	SP(t, s, c)	/* nothing */
+#define	AT(t, p1, p2, s1, s2)	/* nothing */
+#define	NOTE(s)	/* nothing */
+#endif
+
+/*
+ - matcher - the actual matching engine
+ == static int matcher(register struct re_guts *g, unsigned char *string, \
+ ==	size_t nmatch, regmatch_t pmatch[], int eflags);
+ */
+static int			/* 0 success, REG_NOMATCH failure */
+matcher(g, string, nmatch, pmatch, eflags)
+register struct re_guts *g;
+unsigned char *string;
+size_t nmatch;
+regmatch_t pmatch[];
+int eflags;
+{
+	register unsigned char *endp;
+	register size_t i;
+	struct match mv;
+	register struct match *m = &mv;
+	register unsigned char *dp;
+	const register sopno gf = 
g->firststate+1;	/* +1 for OEND */ +	const register sopno gl = g->laststate; +	unsigned char *start; +	unsigned char *stop; + +	/* simplify the situation where possible */ +	if (g->cflags®_NOSUB) +		nmatch = 0; +	if (eflags®_STARTEND) { +		start = string + pmatch[0].rm_so; +		stop = string + pmatch[0].rm_eo; +	} else { +		start = string; +		stop = start + strlen(start); +	} +	if (stop < start) +		return(REG_INVARG); + +	/* prescreening; this does wonders for this rather slow code */ +	if (g->must != NULL) { +		for (dp = start; dp < stop; dp++) +			if (*dp == g->must[0] && stop - dp >= g->mlen && +				memcmp(dp, g->must, (size_t)g->mlen) == 0) +				break; +		if (dp == stop)		/* we didn't find g->must */ +			return(REG_NOMATCH); +	} + +	/* match struct setup */ +	m->g = g; +	m->eflags = eflags; +	m->pmatch = NULL; +	m->lastpos = NULL; +	m->offp = string; +	m->beginp = start; +	m->endp = stop; +	STATESETUP(m, 4); +	SETUP(m->st); +	SETUP(m->fresh); +	SETUP(m->tmp); +	SETUP(m->empty); +	CLEAR(m->empty); + +	/* this loop does only one repetition except for backrefs */ +	for (;;) { +		endp = fast(m, start, stop, gf, gl); +		if (endp == NULL) {		/* a miss */ +			STATETEARDOWN(m); +			return(REG_NOMATCH); +		} +		if (nmatch == 0 && !g->backrefs) +			break;		/* no further info needed */ + +		/* where? */ +		assert(m->coldp != NULL); +		for (;;) { +			NOTE("finding start"); +			endp = slow(m, m->coldp, stop, gf, gl); +			if (endp != NULL) +				break; +			assert(m->coldp < m->endp); +			m->coldp++; +		} +		if (nmatch == 1 && !g->backrefs) +			break;		/* no further info needed */ + +		/* oh my, he wants the subexpressions... 
*/ +		if (m->pmatch == NULL) +			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * +							sizeof(regmatch_t)); +		if (m->pmatch == NULL) { +			STATETEARDOWN(m); +			return(REG_ESPACE); +		} +		for (i = 1; i <= m->g->nsub; i++) +			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; +		if (!g->backrefs && !(m->eflags®_BACKR)) { +			NOTE("dissecting"); +			dp = dissect(m, m->coldp, endp, gf, gl); +		} else { +			if (g->nplus > 0 && m->lastpos == NULL) +				m->lastpos = (unsigned char **)malloc((g->nplus+1) * +							sizeof(unsigned char *)); +			if (g->nplus > 0 && m->lastpos == NULL) { +				free((char *)m->pmatch); +				STATETEARDOWN(m); +				return(REG_ESPACE); +			} +			NOTE("backref dissect"); +			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); +		} +		if (dp != NULL) +			break; + +		/* uh-oh... we couldn't find a subexpression-level match */ +		assert(g->backrefs);	/* must be back references doing it */ +		assert(g->nplus == 0 || m->lastpos != NULL); +		for (;;) { +			if (dp != NULL || endp <= m->coldp) +				break;		/* defeat */ +			NOTE("backoff"); +			endp = slow(m, m->coldp, endp-1, gf, gl); +			if (endp == NULL) +				break;		/* defeat */ +			/* try it on a shorter possibility */ +#ifndef NDEBUG +			for (i = 1; i <= m->g->nsub; i++) { +				assert(m->pmatch[i].rm_so == -1); +				assert(m->pmatch[i].rm_eo == -1); +			} +#endif +			NOTE("backoff dissect"); +			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); +		} +		assert(dp == NULL || dp == endp); +		if (dp != NULL)		/* found a shorter one */ +			break; + +		/* despite initial appearances, there is no match here */ +		NOTE("false alarm"); +		start = m->coldp + 1;	/* recycle starting later */ +		assert(start <= stop); +	} + +	/* fill in the details if requested */ +	if (nmatch > 0) { +		pmatch[0].rm_so = m->coldp - m->offp; +		pmatch[0].rm_eo = endp - m->offp; +	} +	if (nmatch > 1) { +		assert(m->pmatch != NULL); +		for (i = 1; i < nmatch; i++) +			if (i <= m->g->nsub) +				pmatch[i] = m->pmatch[i]; +			else 
{ +				pmatch[i].rm_so = -1; +				pmatch[i].rm_eo = -1; +			} +	} + +	if (m->pmatch != NULL) +		free((char *)m->pmatch); +	if (m->lastpos != NULL) +		free((char *)m->lastpos); +	STATETEARDOWN(m); +	return(0); +} + +/* + - dissect - figure out what matched what, no back references + == static unsigned char *dissect(register struct match *m, unsigned char *start, \ + ==	unsigned char *stop, sopno startst, sopno stopst); + */ +static unsigned char *			/* == stop (success) always */ +dissect(m, start, stop, startst, stopst) +register struct match *m; +unsigned char *start; +unsigned char *stop; +sopno startst; +sopno stopst; +{ +	register int i; +	register sopno ss;	/* start sop of current subRE */ +	register sopno es;	/* end sop of current subRE */ +	register unsigned char *sp;	/* start of string matched by it */ +	register unsigned char *stp;	/* string matched by it cannot pass here */ +	register unsigned char *rest;	/* start of rest of string */ +	register unsigned char *tail;	/* string unmatched by rest of RE */ +	register sopno ssub;	/* start sop of subsubRE */ +	register sopno esub;	/* end sop of subsubRE */ +	register unsigned char *ssp;	/* start of string matched by subsubRE */ +	register unsigned char *sep;	/* end of string matched by subsubRE */ +	register unsigned char *oldssp;	/* previous ssp */ +	register unsigned char *dp; + +	AT("diss", start, stop, startst, stopst); +	sp = start; +	for (ss = startst; ss < stopst; ss = es) { +		/* identify end of subRE */ +		es = ss; +		switch (OP(m->g->strip[es])) { +		case OPLUS_: +		case OQUEST_: +			es += OPND(m->g->strip[es]); +			break; +		case OCH_: +			while (OP(m->g->strip[es]) != O_CH) +				es += OPND(m->g->strip[es]); +			break; +		} +		es++; + +		/* figure out what it matched */ +		switch (OP(m->g->strip[ss])) { +		case OEND: +			assert(PHP_REGEX_NOPE); +			break; +		case OCHAR: +			sp++; +			break; +		case OBOL: +		case OEOL: +		case OBOW: +		case OEOW: +			break; +		case OANY: +		case OANYOF: +			sp++; +			
break; +		case OBACK_: +		case O_BACK: +			assert(PHP_REGEX_NOPE); +			break; +		/* cases where length of match is hard to find */ +		case OQUEST_: +			stp = stop; +			for (;;) { +				/* how long could this one be? */ +				rest = slow(m, sp, stp, ss, es); +				assert(rest != NULL);	/* it did match */ +				/* could the rest match the rest? */ +				tail = slow(m, rest, stop, es, stopst); +				if (tail == stop) +					break;		/* yes! */ +				/* no -- try a shorter match for this one */ +				stp = rest - 1; +				assert(stp >= sp);	/* it did work */ +			} +			ssub = ss + 1; +			esub = es - 1; +			/* did innards match? */ +			if (slow(m, sp, rest, ssub, esub) != NULL) { +				dp = dissect(m, sp, rest, ssub, esub); +				assert(dp == rest); +			} else		/* no */ +				assert(sp == rest); +			sp = rest; +			break; +		case OPLUS_: +			stp = stop; +			for (;;) { +				/* how long could this one be? */ +				rest = slow(m, sp, stp, ss, es); +				assert(rest != NULL);	/* it did match */ +				/* could the rest match the rest? */ +				tail = slow(m, rest, stop, es, stopst); +				if (tail == stop) +					break;		/* yes! */ +				/* no -- try a shorter match for this one */ +				stp = rest - 1; +				assert(stp >= sp);	/* it did work */ +			} +			ssub = ss + 1; +			esub = es - 1; +			ssp = sp; +			oldssp = ssp; +			for (;;) {	/* find last match of innards */ +				sep = slow(m, ssp, rest, ssub, esub); +				if (sep == NULL || sep == ssp) +					break;	/* failed or matched null */ +				oldssp = ssp;	/* on to next try */ +				ssp = sep; +			} +			if (sep == NULL) { +				/* last successful match */ +				sep = ssp; +				ssp = oldssp; +			} +			assert(sep == rest);	/* must exhaust substring */ +			assert(slow(m, ssp, sep, ssub, esub) == rest); +			dp = dissect(m, ssp, sep, ssub, esub); +			assert(dp == sep); +			sp = rest; +			break; +		case OCH_: +			stp = stop; +			for (;;) { +				/* how long could this one be? 
*/ +				rest = slow(m, sp, stp, ss, es); +				assert(rest != NULL);	/* it did match */ +				/* could the rest match the rest? */ +				tail = slow(m, rest, stop, es, stopst); +				if (tail == stop) +					break;		/* yes! */ +				/* no -- try a shorter match for this one */ +				stp = rest - 1; +				assert(stp >= sp);	/* it did work */ +			} +			ssub = ss + 1; +			esub = ss + OPND(m->g->strip[ss]) - 1; +			assert(OP(m->g->strip[esub]) == OOR1); +			for (;;) {	/* find first matching branch */ +				if (slow(m, sp, rest, ssub, esub) == rest) +					break;	/* it matched all of it */ +				/* that one missed, try next one */ +				assert(OP(m->g->strip[esub]) == OOR1); +				esub++; +				assert(OP(m->g->strip[esub]) == OOR2); +				ssub = esub + 1; +				esub += OPND(m->g->strip[esub]); +				if (OP(m->g->strip[esub]) == OOR2) +					esub--; +				else +					assert(OP(m->g->strip[esub]) == O_CH); +			} +			dp = dissect(m, sp, rest, ssub, esub); +			assert(dp == rest); +			sp = rest; +			break; +		case O_PLUS: +		case O_QUEST: +		case OOR1: +		case OOR2: +		case O_CH: +			assert(PHP_REGEX_NOPE); +			break; +		case OLPAREN: +			i = OPND(m->g->strip[ss]); +			assert(0 < i && i <= m->g->nsub); +			m->pmatch[i].rm_so = sp - m->offp; +			break; +		case ORPAREN: +			i = OPND(m->g->strip[ss]); +			assert(0 < i && i <= m->g->nsub); +			m->pmatch[i].rm_eo = sp - m->offp; +			break; +		default:		/* uh oh */ +			assert(PHP_REGEX_NOPE); +			break; +		} +	} + +	assert(sp == stop); +	return(sp); +} + +/* + - backref - figure out what matched what, figuring in back references + == static unsigned char *backref(register struct match *m, unsigned char *start, \ + ==	unsigned char *stop, sopno startst, sopno stopst, sopno lev); + */ +static unsigned char *			/* == stop (success) or NULL (failure) */ +backref(m, start, stop, startst, stopst, lev) +register struct match *m; +unsigned char *start; +unsigned char *stop; +sopno startst; +sopno stopst; +sopno lev;			/* PLUS nesting level */ +{ +	register 
int i; +	register sopno ss;	/* start sop of current subRE */ +	register unsigned char *sp;	/* start of string matched by it */ +	register sopno ssub;	/* start sop of subsubRE */ +	register sopno esub;	/* end sop of subsubRE */ +	register unsigned char *ssp;	/* start of string matched by subsubRE */ +	register unsigned char *dp; +	register size_t len; +	register int hard; +	register sop s; +	register regoff_t offsave; +	register cset *cs; + +	AT("back", start, stop, startst, stopst); +	sp = start; + +	/* get as far as we can with easy stuff */ +	hard = 0; +	for (ss = startst; !hard && ss < stopst; ss++) +		switch (OP(s = m->g->strip[ss])) { +		case OCHAR: +			if (sp == stop || *sp++ != (unsigned char)OPND(s)) +				return(NULL); +			break; +		case OANY: +			if (sp == stop) +				return(NULL); +			sp++; +			break; +		case OANYOF: +			cs = &m->g->sets[OPND(s)]; +			if (sp == stop || !CHIN(cs, *sp++)) +				return(NULL); +			break; +		case OBOL: +			if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || +					(sp < m->endp && *(sp-1) == '\n' && +						(m->g->cflags®_NEWLINE)) ) +				{ /* yes */ } +			else +				return(NULL); +			break; +		case OEOL: +			if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || +					(sp < m->endp && *sp == '\n' && +						(m->g->cflags®_NEWLINE)) ) +				{ /* yes */ } +			else +				return(NULL); +			break; +		case OBOW: +			if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || +					(sp < m->endp && *(sp-1) == '\n' && +						(m->g->cflags®_NEWLINE)) || +					(sp > m->beginp && +							!ISWORD(*(sp-1))) ) && +					(sp < m->endp && ISWORD(*sp)) ) +				{ /* yes */ } +			else +				return(NULL); +			break; +		case OEOW: +			if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || +					(sp < m->endp && *sp == '\n' && +						(m->g->cflags®_NEWLINE)) || +					(sp < m->endp && !ISWORD(*sp)) ) && +					(sp > m->beginp && ISWORD(*(sp-1))) ) +				{ /* yes */ } +			else +				return(NULL); +			break; +		case O_QUEST: +			break; +		case OOR1:	/* matches null but needs to skip */ +	
		ss++; +			s = m->g->strip[ss]; +			do { +				assert(OP(s) == OOR2); +				ss += OPND(s); +			} while (OP(s = m->g->strip[ss]) != O_CH); +			/* note that the ss++ gets us past the O_CH */ +			break; +		default:	/* have to make a choice */ +			hard = 1; +			break; +		} +	if (!hard) {		/* that was it! */ +		if (sp != stop) +			return(NULL); +		return(sp); +	} +	ss--;			/* adjust for the for's final increment */ + +	/* the hard stuff */ +	AT("hard", sp, stop, ss, stopst); +	s = m->g->strip[ss]; +	switch (OP(s)) { +	case OBACK_:		/* the vilest depths */ +		i = OPND(s); +		assert(0 < i && i <= m->g->nsub); +		if (m->pmatch[i].rm_eo == -1) +			return(NULL); +		assert(m->pmatch[i].rm_so != -1); +		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; +		assert(stop - m->beginp >= len); +		if (sp > stop - len) +			return(NULL);	/* not enough left to match */ +		ssp = m->offp + m->pmatch[i].rm_so; +		if (memcmp(sp, ssp, len) != 0) +			return(NULL); +		while (m->g->strip[ss] != SOP(O_BACK, i)) +			ss++; +		return(backref(m, sp+len, stop, ss+1, stopst, lev)); +		break; +	case OQUEST_:		/* to null or not */ +		dp = backref(m, sp, stop, ss+1, stopst, lev); +		if (dp != NULL) +			return(dp);	/* not */ +		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); +		break; +	case OPLUS_: +		assert(m->lastpos != NULL); +		assert(lev+1 <= m->g->nplus); +		m->lastpos[lev+1] = sp; +		return(backref(m, sp, stop, ss+1, stopst, lev+1)); +		break; +	case O_PLUS: +		if (sp == m->lastpos[lev])	/* last pass matched null */ +			return(backref(m, sp, stop, ss+1, stopst, lev-1)); +		/* try another pass */ +		m->lastpos[lev] = sp; +		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); +		if (dp == NULL) +			return(backref(m, sp, stop, ss+1, stopst, lev-1)); +		else +			return(dp); +		break; +	case OCH_:		/* find the right one, if any */ +		ssub = ss + 1; +		esub = ss + OPND(s) - 1; +		assert(OP(m->g->strip[esub]) == OOR1); +		for (;;) {	/* find first matching branch */ +			dp = backref(m, sp, stop, 
ssub, esub, lev); +			if (dp != NULL) +				return(dp); +			/* that one missed, try next one */ +			if (OP(m->g->strip[esub]) == O_CH) +				return(NULL);	/* there is none */ +			esub++; +			assert(OP(m->g->strip[esub]) == OOR2); +			ssub = esub + 1; +			esub += OPND(m->g->strip[esub]); +			if (OP(m->g->strip[esub]) == OOR2) +				esub--; +			else +				assert(OP(m->g->strip[esub]) == O_CH); +		} +		break; +	case OLPAREN:		/* must undo assignment if rest fails */ +		i = OPND(s); +		assert(0 < i && i <= m->g->nsub); +		offsave = m->pmatch[i].rm_so; +		m->pmatch[i].rm_so = sp - m->offp; +		dp = backref(m, sp, stop, ss+1, stopst, lev); +		if (dp != NULL) +			return(dp); +		m->pmatch[i].rm_so = offsave; +		return(NULL); +		break; +	case ORPAREN:		/* must undo assignment if rest fails */ +		i = OPND(s); +		assert(0 < i && i <= m->g->nsub); +		offsave = m->pmatch[i].rm_eo; +		m->pmatch[i].rm_eo = sp - m->offp; +		dp = backref(m, sp, stop, ss+1, stopst, lev); +		if (dp != NULL) +			return(dp); +		m->pmatch[i].rm_eo = offsave; +		return(NULL); +		break; +	default:		/* uh oh */ +		assert(PHP_REGEX_NOPE); +		break; +	} + +	/* "can't happen" */ +	assert(PHP_REGEX_NOPE); +	/* NOTREACHED */ +	return((unsigned char *)NULL);	/* dummy */ +} + +/* + - fast - step through the string at top speed + == static unsigned char *fast(register struct match *m, unsigned char *start, \ + ==	unsigned char *stop, sopno startst, sopno stopst); + */ +static unsigned char *			/* where tentative match ended, or NULL */ +fast(m, start, stop, startst, stopst) +register struct match *m; +unsigned char *start; +unsigned char *stop; +sopno startst; +sopno stopst; +{ +	register states st = m->st; +	register states fresh = m->fresh; +	register states tmp = m->tmp; +	register unsigned char *p = start; +	register int c = (start == m->beginp) ? 
OUT : *(start-1); +	register int lastc;	/* previous c */ +	register int flagch; +	register int i; +	register unsigned char *coldp;	/* last p after which no match was underway */ + +	CLEAR(st); +	SET1(st, startst); +	st = step(m->g, startst, stopst, st, NOTHING, st); +	ASSIGN(fresh, st); +	SP("start", st, *p); +	coldp = NULL; +	for (;;) { +		/* next character */ +		lastc = c; +		c = (p == m->endp) ? OUT : *p; +		if (EQ(st, fresh)) +			coldp = p; + +		/* is there an EOL and/or BOL between lastc and c? */ +		flagch = '\0'; +		i = 0; +		if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || +				(lastc == OUT && !(m->eflags®_NOTBOL)) ) { +			flagch = BOL; +			i = m->g->nbol; +		} +		if ( (c == '\n' && m->g->cflags®_NEWLINE) || +				(c == OUT && !(m->eflags®_NOTEOL)) ) { +			flagch = (flagch == BOL) ? BOLEOL : EOL; +			i += m->g->neol; +		} +		if (i != 0) { +			for (; i > 0; i--) +				st = step(m->g, startst, stopst, st, flagch, st); +			SP("boleol", st, c); +		} + +		/* how about a word boundary? */ +		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && +					(c != OUT && ISWORD(c)) ) { +			flagch = BOW; +		} +		if ( (lastc != OUT && ISWORD(lastc)) && +				(flagch == EOL || (c != OUT && !ISWORD(c))) ) { +			flagch = EOW; +		} +		if (flagch == BOW || flagch == EOW) { +			st = step(m->g, startst, stopst, st, flagch, st); +			SP("boweow", st, c); +		} + +		/* are we done? 
*/ +		if (ISSET(st, stopst) || p == stop) +			break;		/* NOTE BREAK OUT */ + +		/* no, we must deal with this character */ +		ASSIGN(tmp, st); +		ASSIGN(st, fresh); +		assert(c != OUT); +		st = step(m->g, startst, stopst, tmp, c, st); +		SP("aft", st, c); +		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); +		p++; +	} + +	assert(coldp != NULL); +	m->coldp = coldp; +	if (ISSET(st, stopst)) +		return(p+1); +	else +		return(NULL); +} + +/* + - slow - step through the string more deliberately + == static unsigned char *slow(register struct match *m, unsigned char *start, \ + ==	unsigned char *stop, sopno startst, sopno stopst); + */ +static unsigned char *			/* where it ended */ +slow(m, start, stop, startst, stopst) +register struct match *m; +unsigned char *start; +unsigned char *stop; +sopno startst; +sopno stopst; +{ +	register states st = m->st; +	register states empty = m->empty; +	register states tmp = m->tmp; +	register unsigned char *p = start; +	register int c = (start == m->beginp) ? OUT : *(start-1); +	register int lastc;	/* previous c */ +	register int flagch; +	register int i; +	register unsigned char *matchp;	/* last p at which a match ended */ + +	AT("slow", start, stop, startst, stopst); +	CLEAR(st); +	SET1(st, startst); +	SP("sstart", st, *p); +	st = step(m->g, startst, stopst, st, NOTHING, st); +	matchp = NULL; +	for (;;) { +		/* next character */ +		lastc = c; +		c = (p == m->endp) ? OUT : *p; + +		/* is there an EOL and/or BOL between lastc and c? */ +		flagch = '\0'; +		i = 0; +		if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || +				(lastc == OUT && !(m->eflags®_NOTBOL)) ) { +			flagch = BOL; +			i = m->g->nbol; +		} +		if ( (c == '\n' && m->g->cflags®_NEWLINE) || +				(c == OUT && !(m->eflags®_NOTEOL)) ) { +			flagch = (flagch == BOL) ? BOLEOL : EOL; +			i += m->g->neol; +		} +		if (i != 0) { +			for (; i > 0; i--) +				st = step(m->g, startst, stopst, st, flagch, st); +			SP("sboleol", st, c); +		} + +		/* how about a word boundary? 
*/ +		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && +					(c != OUT && ISWORD(c)) ) { +			flagch = BOW; +		} +		if ( (lastc != OUT && ISWORD(lastc)) && +				(flagch == EOL || (c != OUT && !ISWORD(c))) ) { +			flagch = EOW; +		} +		if (flagch == BOW || flagch == EOW) { +			st = step(m->g, startst, stopst, st, flagch, st); +			SP("sboweow", st, c); +		} + +		/* are we done? */ +		if (ISSET(st, stopst)) +			matchp = p; +		if (EQ(st, empty) || p == stop) +			break;		/* NOTE BREAK OUT */ + +		/* no, we must deal with this character */ +		ASSIGN(tmp, st); +		ASSIGN(st, empty); +		assert(c != OUT); +		st = step(m->g, startst, stopst, tmp, c, st); +		SP("saft", st, c); +		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); +		p++; +	} + +	return(matchp); +} + + +/* + - step - map set of states reachable before char to set reachable after + == static states step(register struct re_guts *g, sopno start, sopno stop, \ + ==	register states bef, int ch, register states aft); + == #define	BOL	(OUT+1) + == #define	EOL	(BOL+1) + == #define	BOLEOL	(BOL+2) + == #define	NOTHING	(BOL+3) + == #define	BOW	(BOL+4) + == #define	EOW	(BOL+5) + == #define	CODEMAX	(BOL+5)		// highest code used + == #define	NONCHAR(c)	((c) > UCHAR_MAX) + == #define	NNONCHAR	(CODEMAX-UCHAR_MAX) + */ +static states +step(g, start, stop, bef, ch, aft) +register struct re_guts *g; +sopno start;			/* start state within strip */ +sopno stop;			/* state after stop state within strip */ +register states bef;		/* states reachable before */ +int ch;				/* character or NONCHAR code */ +register states aft;		/* states already known reachable after */ +{ +	register cset *cs; +	register sop s; +	register sopno pc; +	register onestate here;		/* note, macros know this name */ +	register sopno look; +	register long i; + +	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { +		s = g->strip[pc]; +		switch (OP(s)) { +		case OEND: +			assert(pc == stop-1); +			break; +		case OCHAR: +			/* only 
characters can match */ +			assert(!NONCHAR(ch) || ch != (unsigned char)OPND(s)); +			if (ch == (unsigned char)OPND(s)) +				FWD(aft, bef, 1); +			break; +		case OBOL: +			if (ch == BOL || ch == BOLEOL) +				FWD(aft, bef, 1); +			break; +		case OEOL: +			if (ch == EOL || ch == BOLEOL) +				FWD(aft, bef, 1); +			break; +		case OBOW: +			if (ch == BOW) +				FWD(aft, bef, 1); +			break; +		case OEOW: +			if (ch == EOW) +				FWD(aft, bef, 1); +			break; +		case OANY: +			if (!NONCHAR(ch)) +				FWD(aft, bef, 1); +			break; +		case OANYOF: +			cs = &g->sets[OPND(s)]; +			if (!NONCHAR(ch) && CHIN(cs, ch)) +				FWD(aft, bef, 1); +			break; +		case OBACK_:		/* ignored here */ +		case O_BACK: +			FWD(aft, aft, 1); +			break; +		case OPLUS_:		/* forward, this is just an empty */ +			FWD(aft, aft, 1); +			break; +		case O_PLUS:		/* both forward and back */ +			FWD(aft, aft, 1); +			i = ISSETBACK(aft, OPND(s)); +			BACK(aft, aft, OPND(s)); +			if (!i && ISSETBACK(aft, OPND(s))) { +				/* oho, must reconsider loop body */ +				pc -= OPND(s) + 1; +				INIT(here, pc); +			} +			break; +		case OQUEST_:		/* two branches, both forward */ +			FWD(aft, aft, 1); +			FWD(aft, aft, OPND(s)); +			break; +		case O_QUEST:		/* just an empty */ +			FWD(aft, aft, 1); +			break; +		case OLPAREN:		/* not significant here */ +		case ORPAREN: +			FWD(aft, aft, 1); +			break; +		case OCH_:		/* mark the first two branches */ +			FWD(aft, aft, 1); +			assert(OP(g->strip[pc+OPND(s)]) == OOR2); +			FWD(aft, aft, OPND(s)); +			break; +		case OOR1:		/* done a branch, find the O_CH */ +			if (ISSTATEIN(aft, here)) { +				for (look = 1; +						OP(s = g->strip[pc+look]) != O_CH; +						look += OPND(s)) +					assert(OP(s) == OOR2); +				FWD(aft, aft, look); +			} +			break; +		case OOR2:		/* propagate OCH_'s marking */ +			FWD(aft, aft, 1); +			if (OP(g->strip[pc+OPND(s)]) != O_CH) { +				assert(OP(g->strip[pc+OPND(s)]) == OOR2); +				FWD(aft, aft, OPND(s)); +			} +			break; +		case O_CH:		/* just empty */ +		
	FWD(aft, aft, 1); +			break; +		default:		/* ooooops... */ +			assert(PHP_REGEX_NOPE); +			break; +		} +	} + +	return(aft); +} + +#ifdef REDEBUG +/* + - print - print a set of states + == #ifdef REDEBUG + == static void print(struct match *m, unsigned char *caption, states st, \ + ==	int ch, FILE *d); + == #endif + */ +static void +print(m, caption, st, ch, d) +struct match *m; +unsigned char *caption; +states st; +int ch; +FILE *d; +{ +	register struct re_guts *g = m->g; +	register int i; +	register int first = 1; + +	if (!(m->eflags®_TRACE)) +		return; + +	fprintf(d, "%s", caption); +	if (ch != '\0') +		fprintf(d, " %s", pchar(ch)); +	for (i = 0; i < g->nstates; i++) +		if (ISSET(st, i)) { +			fprintf(d, "%s%d", (first) ? "\t" : ", ", i); +			first = 0; +		} +	fprintf(d, "\n"); +} + +/*  + - at - print current situation + == #ifdef REDEBUG + == static void at(struct match *m, unsigned char *title, unsigned char *start, unsigned char *stop, \ + ==						sopno startst, sopno stopst); + == #endif + */ +static void +at(m, title, start, stop, startst, stopst) +struct match *m; +unsigned char *title; +unsigned char *start; +unsigned char *stop; +sopno startst; +sopno stopst; +{ +	if (!(m->eflags®_TRACE)) +		return; + +	printf("%s %s-", title, pchar(*start)); +	printf("%s ", pchar(*stop)); +	printf("%ld-%ld\n", (long)startst, (long)stopst); +} + +#ifndef PCHARDONE +#define	PCHARDONE	/* never again */ +/* + - pchar - make a character printable + == #ifdef REDEBUG + == static unsigned char *pchar(int ch); + == #endif + * + * Is this identical to regchar() over in debug.c?  Well, yes.  But a + * duplicate here avoids having a debugging-capable regexec.o tied to + * a matching debug.o, and this is convenient.  It all disappears in + * the non-debug compilation anyway, so it doesn't matter much. 
+ */ +static unsigned char *			/* -> representation */ +pchar(ch) +int ch; +{ +	static unsigned char pbuf[10]; + +	if (isprint(ch) || ch == ' ') +		sprintf(pbuf, "%c", ch); +	else +		sprintf(pbuf, "\\%o", ch); +	return(pbuf); +} +#endif +#endif + +#undef	matcher +#undef	fast +#undef	slow +#undef	dissect +#undef	backref +#undef	step +#undef	print +#undef	at +#undef	match diff --git a/ext/ereg/regex/engine.ih b/ext/ereg/regex/engine.ih new file mode 100644 index 0000000000..9a301838bc --- /dev/null +++ b/ext/ereg/regex/engine.ih @@ -0,0 +1,35 @@ +/* ========= begin header generated by ./mkh ========= */ +#ifdef __cplusplus +extern "C" { +#endif + +/* === engine.c === */ +static int matcher(register struct re_guts *g, unsigned char *string, size_t nmatch, regmatch_t pmatch[], int eflags); +static unsigned char *dissect(register struct match *m, unsigned char *start, unsigned char *stop, sopno startst, sopno stopst); +static unsigned char *backref(register struct match *m, unsigned char *start, unsigned char *stop, sopno startst, sopno stopst, sopno lev); +static unsigned char *fast(register struct match *m, unsigned char *start, unsigned char *stop, sopno startst, sopno stopst); +static unsigned char *slow(register struct match *m, unsigned char *start, unsigned char *stop, sopno startst, sopno stopst); +static states step(register struct re_guts *g, sopno start, sopno stop, register states bef, int ch, register states aft); +#define	BOL	(OUT+1) +#define	EOL	(BOL+1) +#define	BOLEOL	(BOL+2) +#define	NOTHING	(BOL+3) +#define	BOW	(BOL+4) +#define	EOW	(BOL+5) +#define	CODEMAX	(BOL+5)		/* highest code used */ +#define	NONCHAR(c)	((c) > UCHAR_MAX) +#define	NNONCHAR	(CODEMAX-UCHAR_MAX) +#ifdef REDEBUG +static void print(struct match *m, unsigned char *caption, states st, int ch, FILE *d); +#endif +#ifdef REDEBUG +static void at(struct match *m, unsigned char *title, unsigned char *start, unsigned char *stop, sopno startst, sopno stopst); +#endif +#ifdef REDEBUG +static 
unsigned char *pchar(int ch); +#endif + +#ifdef __cplusplus +} +#endif +/* ========= end header generated by ./mkh ========= */ diff --git a/ext/ereg/regex/main.c b/ext/ereg/regex/main.c new file mode 100644 index 0000000000..657338a2c1 --- /dev/null +++ b/ext/ereg/regex/main.c @@ -0,0 +1,510 @@ +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <regex.h> +#include <assert.h> +#include <stdlib.h> + +#include "main.ih" + +char *progname; +int debug = 0; +int line = 0; +int status = 0; + +int copts = REG_EXTENDED; +int eopts = 0; +regoff_t startoff = 0; +regoff_t endoff = 0; + + +extern int split(); +extern void regprint(); + +/* + - main - do the simple case, hand off to regress() for regression + */ +int main(argc, argv) +int argc; +char *argv[]; +{ +	regex_t re; +#	define	NS	10 +	regmatch_t subs[NS]; +	char erbuf[100]; +	int err; +	size_t len; +	int c; +	int errflg = 0; +	register int i; +	extern int optind; +	extern char *optarg; + +	progname = argv[0]; + +	while ((c = getopt(argc, argv, "c:e:S:E:x")) != EOF) +		switch (c) { +		case 'c':	/* compile options */ +			copts = options('c', optarg); +			break; +		case 'e':	/* execute options */ +			eopts = options('e', optarg); +			break; +		case 'S':	/* start offset */ +			startoff = (regoff_t)atoi(optarg); +			break; +		case 'E':	/* end offset */ +			endoff = (regoff_t)atoi(optarg); +			break; +		case 'x':	/* Debugging. 
*/ +			debug++; +			break; +		case '?': +		default: +			errflg++; +			break; +		} +	if (errflg) { +		fprintf(stderr, "usage: %s ", progname); +		fprintf(stderr, "[-c copt][-C][-d] [re]\n"); +		exit(2); +	} + +	if (optind >= argc) { +		regress(stdin); +		exit(status); +	} + +	err = regcomp(&re, argv[optind++], copts); +	if (err) { +		len = regerror(err, &re, erbuf, sizeof(erbuf)); +		fprintf(stderr, "error %s, %d/%d `%s'\n", +			eprint(err), len, sizeof(erbuf), erbuf); +		exit(status); +	} +	regprint(&re, stdout);	 + +	if (optind >= argc) { +		regfree(&re); +		exit(status); +	} + +	if (eopts&REG_STARTEND) { +		subs[0].rm_so = startoff; +		subs[0].rm_eo = strlen(argv[optind]) - endoff; +	} +	err = regexec(&re, argv[optind], (size_t)NS, subs, eopts); +	if (err) { +		len = regerror(err, &re, erbuf, sizeof(erbuf)); +		fprintf(stderr, "error %s, %d/%d `%s'\n", +			eprint(err), len, sizeof(erbuf), erbuf); +		exit(status); +	} +	if (!(copts&REG_NOSUB)) { +		len = (int)(subs[0].rm_eo - subs[0].rm_so); +		if (subs[0].rm_so != -1) { +			if (len != 0) +				printf("match `%.*s'\n", (int)len, +					argv[optind] + subs[0].rm_so); +			else +				printf("match `'@%.1s\n", +					argv[optind] + subs[0].rm_so); +		} +		for (i = 1; i < NS; i++) +			if (subs[i].rm_so != -1) +				printf("(%d) `%.*s'\n", i, +					(int)(subs[i].rm_eo - subs[i].rm_so), +					argv[optind] + subs[i].rm_so); +	} +	exit(status); +} + +/* + - regress - main loop of regression test + == void regress(FILE *in); + */ +void +regress(in) +FILE *in; +{ +	char inbuf[1000]; +#	define	MAXF	10 +	char *f[MAXF]; +	int nf; +	int i; +	char erbuf[100]; +	size_t ne; +	char *badpat = "invalid regular expression"; +#	define	SHORT	10 +	char *bpname = "REG_BADPAT"; +	regex_t re; + +	while (fgets(inbuf, sizeof(inbuf), in) != NULL) { +		line++; +		if (inbuf[0] == '#' || inbuf[0] == '\n') +			continue;			/* NOTE CONTINUE */ +		inbuf[strlen(inbuf)-1] = '\0';	/* get rid of stupid \n */ +		if (debug) +			fprintf(stdout, "%d:\n", line); +		nf = 
split(inbuf, f, MAXF, "\t\t"); +		if (nf < 3) { +			fprintf(stderr, "bad input, line %d\n", line); +			exit(1); +		} +		for (i = 0; i < nf; i++) +			if (strcmp(f[i], "\"\"") == 0) +				f[i] = ""; +		if (nf <= 3) +			f[3] = NULL; +		if (nf <= 4) +			f[4] = NULL; +		try(f[0], f[1], f[2], f[3], f[4], options('c', f[1])); +		if (opt('&', f[1]))	/* try with either type of RE */ +			try(f[0], f[1], f[2], f[3], f[4], +					options('c', f[1]) &~ REG_EXTENDED); +	} + +	ne = regerror(REG_BADPAT, (regex_t *)NULL, erbuf, sizeof(erbuf)); +	if (strcmp(erbuf, badpat) != 0 || ne != strlen(badpat)+1) { +		fprintf(stderr, "end: regerror() test gave `%s' not `%s'\n", +							erbuf, badpat); +		status = 1; +	} +	ne = regerror(REG_BADPAT, (regex_t *)NULL, erbuf, (size_t)SHORT); +	if (strncmp(erbuf, badpat, SHORT-1) != 0 || erbuf[SHORT-1] != '\0' || +						ne != strlen(badpat)+1) { +		fprintf(stderr, "end: regerror() short test gave `%s' not `%.*s'\n", +						erbuf, SHORT-1, badpat); +		status = 1; +	} +	ne = regerror(REG_ITOA|REG_BADPAT, (regex_t *)NULL, erbuf, sizeof(erbuf)); +	if (strcmp(erbuf, bpname) != 0 || ne != strlen(bpname)+1) { +		fprintf(stderr, "end: regerror() ITOA test gave `%s' not `%s'\n", +						erbuf, bpname); +		status = 1; +	} +	re.re_endp = bpname; +	ne = regerror(REG_ATOI, &re, erbuf, sizeof(erbuf)); +	if (atoi(erbuf) != (int)REG_BADPAT) { +		fprintf(stderr, "end: regerror() ATOI test gave `%s' not `%ld'\n", +						erbuf, (long)REG_BADPAT); +		status = 1; +	} else if (ne != strlen(erbuf)+1) { +		fprintf(stderr, "end: regerror() ATOI test len(`%s') = %ld\n", +						erbuf, (long)REG_BADPAT); +		status = 1; +	} +} + +/* + - try - try it, and report on problems + == void try(char *f0, char *f1, char *f2, char *f3, char *f4, int opts); + */ +void +try(f0, f1, f2, f3, f4, opts) +char *f0; +char *f1; +char *f2; +char *f3; +char *f4; +int opts;			/* may not match f1 */ +{ +	regex_t re; +#	define	NSUBS	10 +	regmatch_t subs[NSUBS]; +#	define	NSHOULD	15 +	char 
*should[NSHOULD]; +	int nshould; +	char erbuf[100]; +	int err; +	int len; +	char *type = (opts & REG_EXTENDED) ? "ERE" : "BRE"; +	register int i; +	char *grump; +	char f0copy[1000]; +	char f2copy[1000]; + +	strcpy(f0copy, f0); +	re.re_endp = (opts&REG_PEND) ? f0copy + strlen(f0copy) : NULL; +	fixstr(f0copy); +	err = regcomp(&re, f0copy, opts); +	if (err != 0 && (!opt('C', f1) || err != efind(f2))) { +		/* unexpected error or wrong error */ +		len = regerror(err, &re, erbuf, sizeof(erbuf)); +		fprintf(stderr, "%d: %s error %s, %d/%d `%s'\n", +					line, type, eprint(err), len, +					sizeof(erbuf), erbuf); +		status = 1; +	} else if (err == 0 && opt('C', f1)) { +		/* unexpected success */ +		fprintf(stderr, "%d: %s should have given REG_%s\n", +						line, type, f2); +		status = 1; +		err = 1;	/* so we won't try regexec */ +	} + +	if (err != 0) { +		regfree(&re); +		return; +	} + +	strcpy(f2copy, f2); +	fixstr(f2copy); + +	if (options('e', f1)&REG_STARTEND) { +		if (strchr(f2, '(') == NULL || strchr(f2, ')') == NULL) +			fprintf(stderr, "%d: bad STARTEND syntax\n", line); +		subs[0].rm_so = strchr(f2, '(') - f2 + 1; +		subs[0].rm_eo = strchr(f2, ')') - f2; +	} +	err = regexec(&re, f2copy, NSUBS, subs, options('e', f1)); + +	if (err != 0 && (f3 != NULL || err != REG_NOMATCH)) { +		/* unexpected error or wrong error */ +		len = regerror(err, &re, erbuf, sizeof(erbuf)); +		fprintf(stderr, "%d: %s exec error %s, %d/%d `%s'\n", +					line, type, eprint(err), len, +					sizeof(erbuf), erbuf); +		status = 1; +	} else if (err != 0) { +		/* nothing more to check */ +	} else if (f3 == NULL) { +		/* unexpected success */ +		fprintf(stderr, "%d: %s exec should have failed\n", +						line, type); +		status = 1; +		err = 1;		/* just on principle */ +	} else if (opts&REG_NOSUB) { +		/* nothing more to check */ +	} else if ((grump = check(f2, subs[0], f3)) != NULL) { +		fprintf(stderr, "%d: %s %s\n", line, type, grump); +		status = 1; +		err = 1; +	} + +	if (err != 0 || f4 == NULL) { +		
regfree(&re); +		return; +	} + +	for (i = 1; i < NSHOULD; i++) +		should[i] = NULL; +	nshould = split(f4, should+1, NSHOULD-1, ","); +	if (nshould == 0) { +		nshould = 1; +		should[1] = ""; +	} +	for (i = 1; i < NSUBS; i++) { +		grump = check(f2, subs[i], should[i]); +		if (grump != NULL) { +			fprintf(stderr, "%d: %s $%d %s\n", line, +							type, i, grump); +			status = 1; +			err = 1; +		} +	} + +	regfree(&re); +} + +/* + - options - pick options out of a regression-test string + == int options(int type, char *s); + */ +int +options(type, s) +int type;			/* 'c' compile, 'e' exec */ +char *s; +{ +	register char *p; +	register int o = (type == 'c') ? copts : eopts; +	register char *legal = (type == 'c') ? "bisnmp" : "^$#tl"; + +	for (p = s; *p != '\0'; p++) +		if (strchr(legal, *p) != NULL) +			switch (*p) { +			case 'b': +				o &= ~REG_EXTENDED; +				break; +			case 'i': +				o |= REG_ICASE; +				break; +			case 's': +				o |= REG_NOSUB; +				break; +			case 'n': +				o |= REG_NEWLINE; +				break; +			case 'm': +				o &= ~REG_EXTENDED; +				o |= REG_NOSPEC; +				break; +			case 'p': +				o |= REG_PEND; +				break; +			case '^': +				o |= REG_NOTBOL; +				break; +			case '$': +				o |= REG_NOTEOL; +				break; +			case '#': +				o |= REG_STARTEND; +				break; +			case 't':	/* trace */ +				o |= REG_TRACE; +				break; +			case 'l':	/* force long representation */ +				o |= REG_LARGE; +				break; +			case 'r':	/* force backref use */ +				o |= REG_BACKR; +				break; +			} +	return(o); +} + +/* + - opt - is a particular option in a regression string? 
+ == int opt(int c, char *s); + */ +int				/* predicate */ +opt(c, s) +int c; +char *s; +{ +	return(strchr(s, c) != NULL); +} + +/* + - fixstr - transform magic characters in strings + == void fixstr(register char *p); + */ +void +fixstr(p) +register char *p; +{ +	if (p == NULL) +		return; + +	for (; *p != '\0'; p++) +		if (*p == 'N') +			*p = '\n'; +		else if (*p == 'T') +			*p = '\t'; +		else if (*p == 'S') +			*p = ' '; +		else if (*p == 'Z') +			*p = '\0'; +} + +/* + - check - check a substring match + == char *check(char *str, regmatch_t sub, char *should); + */ +char *				/* NULL or complaint */ +check(str, sub, should) +char *str; +regmatch_t sub; +char *should; +{ +	register int len; +	register int shlen; +	register char *p; +	static char grump[500]; +	register char *at = NULL; + +	if (should != NULL && strcmp(should, "-") == 0) +		should = NULL; +	if (should != NULL && should[0] == '@') { +		at = should + 1; +		should = ""; +	} + +	/* check rm_so and rm_eo for consistency */ +	if (sub.rm_so > sub.rm_eo || (sub.rm_so == -1 && sub.rm_eo != -1) || +				(sub.rm_so != -1 && sub.rm_eo == -1) || +				(sub.rm_so != -1 && sub.rm_so < 0) || +				(sub.rm_eo != -1 && sub.rm_eo < 0) ) { +		sprintf(grump, "start %ld end %ld", (long)sub.rm_so, +							(long)sub.rm_eo); +		return(grump); +	} + +	/* check for no match */ +	if (sub.rm_so == -1 && should == NULL) +		return(NULL); +	if (sub.rm_so == -1) +		return("did not match"); + +	/* check for in range */ +	if (sub.rm_eo > strlen(str)) { +		sprintf(grump, "start %ld end %ld, past end of string", +					(long)sub.rm_so, (long)sub.rm_eo); +		return(grump); +	} + +	len = (int)(sub.rm_eo - sub.rm_so); +	shlen = (int)strlen(should); +	p = str + sub.rm_so; + +	/* check for not supposed to match */ +	if (should == NULL) { +		sprintf(grump, "matched `%.*s'", len, p); +		return(grump); +	} + +	/* check for wrong match */ +	if (len != shlen || strncmp(p, should, (size_t)shlen) != 0) { +		sprintf(grump, "matched `%.*s' instead", len, 
p); +		return(grump); +	} +	if (shlen > 0) +		return(NULL); + +	/* check null match in right place */ +	if (at == NULL) +		return(NULL); +	shlen = strlen(at); +	if (shlen == 0) +		shlen = 1;	/* force check for end-of-string */ +	if (strncmp(p, at, shlen) != 0) { +		sprintf(grump, "matched null at `%.20s'", p); +		return(grump); +	} +	return(NULL); +} + +/* + - eprint - convert error number to name + == static char *eprint(int err); + */ +static char * +eprint(err) +int err; +{ +	static char epbuf[100]; +	size_t len; + +	len = regerror(REG_ITOA|err, (regex_t *)NULL, epbuf, sizeof(epbuf)); +	assert(len <= sizeof(epbuf)); +	return(epbuf); +} + +/* + - efind - convert error name to number + == static int efind(char *name); + */ +static int +efind(name) +char *name; +{ +	static char efbuf[100]; +	regex_t re; + +	sprintf(efbuf, "REG_%s", name); +	assert(strlen(efbuf) < sizeof(efbuf)); +	re.re_endp = efbuf; +	(void) regerror(REG_ATOI, &re, efbuf, sizeof(efbuf)); +	return(atoi(efbuf)); +} diff --git a/ext/ereg/regex/main.ih b/ext/ereg/regex/main.ih new file mode 100644 index 0000000000..5a0118ac44 --- /dev/null +++ b/ext/ereg/regex/main.ih @@ -0,0 +1,19 @@ +/* ========= begin header generated by ./mkh ========= */ +#ifdef __cplusplus +extern "C" { +#endif + +/* === main.c === */ +void regress(FILE *in); +void try(char *f0, char *f1, char *f2, char *f3, char *f4, int opts); +int options(int type, char *s); +int opt(int c, char *s); +void fixstr(register char *p); +char *check(char *str, regmatch_t sub, char *should); +static char *eprint(int err); +static int efind(char *name); + +#ifdef __cplusplus +} +#endif +/* ========= end header generated by ./mkh ========= */ diff --git a/ext/ereg/regex/mkh b/ext/ereg/regex/mkh new file mode 100644 index 0000000000..252b246c7b --- /dev/null +++ b/ext/ereg/regex/mkh @@ -0,0 +1,76 @@ +#! 
/bin/sh +# mkh - pull headers out of C source +PATH=/bin:/usr/bin ; export PATH + +# egrep pattern to pick out marked lines +egrep='^ =([ 	]|$)' + +# Sed program to process marked lines into lines for the header file. +# The markers have already been removed.  Two things are done here:  removal +# of backslashed newlines, and some fudging of comments.  The first is done +# because -o needs to have prototypes on one line to strip them down. +# Getting comments into the output is tricky; we turn C++-style // comments +# into /* */ comments, after altering any existing */'s to avoid trouble. +peel='	/\\$/N +	/\\\n[ 	]*/s///g +	/\/\//s;\*/;* /;g +	/\/\//s;//\(.*\);/*\1 */;' + +for a +do +	case "$a" in +	-o)	# old (pre-function-prototype) compiler +		# add code to comment out argument lists +		peel="$peel +			"'/^\([^#\/][^\/]*[a-zA-Z0-9_)]\)(\(.*\))/s;;\1(/*\2*/);' +		shift +		;; +	-b)	# funny Berkeley __P macro +		peel="$peel +			"'/^\([^#\/][^\/]*[a-zA-Z0-9_)]\)(\(.*\))/s;;\1 __P((\2));' +		shift +		;; +	-s)	# compiler doesn't like `static foo();' +		# add code to get rid of the `static' +		peel="$peel +			"'/^static[ 	][^\/]*[a-zA-Z0-9_)](.*)/s;static.;;' +		shift +		;; +	-p)	# private declarations +		egrep='^ ==([ 	]|$)' +		shift +		;; +	-i)	# wrap in #ifndef, argument is name +		ifndef="$2" +		shift ; shift +		;; +	*)	break +		;; +	esac +done + +if test " $ifndef" != " " +then +	echo "#ifndef $ifndef" +	echo "#define	$ifndef	/* never again */" +fi +echo "/* ========= begin header generated by $0 ========= */" +echo '#ifdef __cplusplus' +echo 'extern "C" {' +echo '#endif' +for f +do +	echo +	echo "/* === $f === */" +	egrep "$egrep" $f | sed 's/^ ==*[ 	]//;s/^ ==*$//' | sed "$peel" +	echo +done +echo '#ifdef __cplusplus' +echo '}' +echo '#endif' +echo "/* ========= end header generated by $0 ========= */" +if test " $ifndef" != " " +then +	echo "#endif" +fi +exit 0 diff --git a/ext/ereg/regex/regcomp.c b/ext/ereg/regex/regcomp.c new file mode 100644 index 
0000000000..d72cc82940 --- /dev/null +++ b/ext/ereg/regex/regcomp.c @@ -0,0 +1,1613 @@ +#include <sys/types.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <limits.h> +#include <stdlib.h> + +#define POSIX_MISTAKE + +#include "utils.h" +#include "regex.h" +#include "regex2.h" + +#include "cclass.h" +#include "cname.h" + +/* + * parse structure, passed up and down to avoid global variables and + * other clumsinesses + */ +struct parse { +	unsigned char *next;		/* next character in RE */ +	unsigned char *end;		/* end of string (-> NUL normally) */ +	int error;		/* has an error been seen? */ +	sop *strip;		/* malloced strip */ +	sopno ssize;		/* malloced strip size (allocated) */ +	sopno slen;		/* malloced strip length (used) */ +	int ncsalloc;		/* number of csets allocated */ +	struct re_guts *g; +#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */ +	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */ +	sopno pend[NPAREN];	/* -> ) ([0] unused) */ +}; + +#include "regcomp.ih" + +static unsigned char nuls[10];		/* place to point scanner in event of error */ + +/* + * macros for use with parse structure + * BEWARE:  these know that the parse structure is named `p' !!! + */ +#define	PEEK()	(*p->next) +#define	PEEK2()	(*(p->next+1)) +#define	MORE()	(p->next < p->end) +#define	MORE2()	(p->next+1 < p->end) +#define	SEE(c)	(MORE() && PEEK() == (c)) +#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) +#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0) +#define	EATTWO(a, b)	((SEETWO(a, b)) ? 
(NEXT2(), 1) : 0) +#define	NEXT()	(p->next++) +#define	NEXT2()	(p->next += 2) +#define	NEXTn(n)	(p->next += (n)) +#define	GETNEXT()	(*p->next++) +#define	SETERROR(e)	seterr(p, (e)) +#define	REQUIRE(co, e)	(void) ((co) || SETERROR(e)) +#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e)) +#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e)) +#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e)) +#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd)) +#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos) +#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos)) +#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos) +#define	HERE()		(p->slen) +#define	THERE()		(p->slen - 1) +#define	THERETHERE()	(p->slen - 2) +#define	DROP(n)	(p->slen -= (n)) + +#ifndef NDEBUG +static int never = 0;		/* for use in asserts; shuts lint up */ +#else +#define	never	0		/* some <assert.h>s have bugs too */ +#endif + +/* + - regcomp - interface for parser and compilation + = API_EXPORT(int) regcomp(regex_t *, const char *, int); + = #define	REG_BASIC	0000 + = #define	REG_EXTENDED	0001 + = #define	REG_ICASE	0002 + = #define	REG_NOSUB	0004 + = #define	REG_NEWLINE	0010 + = #define	REG_NOSPEC	0020 + = #define	REG_PEND	0040 + = #define	REG_DUMP	0200 + */ +API_EXPORT(int)			/* 0 success, otherwise REG_something */ +regcomp(preg, pattern, cflags) +regex_t *preg; +const char *pattern; +int cflags; +{ +	struct parse pa; +	register struct re_guts *g; +	register struct parse *p = &pa; +	register int i; +	register size_t len; +#ifdef REDEBUG +#	define	GOODFLAGS(f)	(f) +#else +#	define	GOODFLAGS(f)	((f)&~REG_DUMP) +#endif + +	cflags = GOODFLAGS(cflags); +	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC)) +		return(REG_INVARG); + +	if (cflags&REG_PEND) { +		if (preg->re_endp < pattern) +			return(REG_INVARG); +		len = preg->re_endp - pattern; +	} else +		len = strlen((char *)pattern); + +	/* do the mallocs early so failure handling is easy */ +	g = (struct re_guts 
*)malloc(sizeof(struct re_guts) + +							(NC-1)*sizeof(cat_t)); +	if (g == NULL) +		return(REG_ESPACE); +	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */ +	p->strip = (sop *)malloc(p->ssize * sizeof(sop)); +	p->slen = 0; +	if (p->strip == NULL) { +		free((char *)g); +		return(REG_ESPACE); +	} + +	/* set things up */ +	p->g = g; +	p->next = (unsigned char *)pattern;	/* convenience; we do not modify it */ +	p->end = p->next + len; +	p->error = 0; +	p->ncsalloc = 0; +	for (i = 0; i < NPAREN; i++) { +		p->pbegin[i] = 0; +		p->pend[i] = 0; +	} +	g->csetsize = NC; +	g->sets = NULL; +	g->setbits = NULL; +	g->ncsets = 0; +	g->cflags = cflags; +	g->iflags = 0; +	g->nbol = 0; +	g->neol = 0; +	g->must = NULL; +	g->mlen = 0; +	g->nsub = 0; +	g->ncategories = 1;	/* category 0 is "everything else" */ +	g->categories = &g->catspace[0]; +	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); +	g->backrefs = 0; + +	/* do it */ +	EMIT(OEND, 0); +	g->firststate = THERE(); +	if (cflags&REG_EXTENDED) +		p_ere(p, OUT); +	else if (cflags&REG_NOSPEC) +		p_str(p); +	else +		p_bre(p, OUT, OUT); +	EMIT(OEND, 0); +	g->laststate = THERE(); + +	/* tidy up loose ends and fill things in */ +	categorize(p, g); +	stripsnug(p, g); +	findmust(p, g); +	g->nplus = pluscount(p, g); +	g->magic = MAGIC2; +	preg->re_nsub = g->nsub; +	preg->re_g = g; +	preg->re_magic = MAGIC1; +#ifndef REDEBUG +	/* not debugging, so can't rely on the assert() in regexec() */ +	if (g->iflags&BAD) +		SETERROR(REG_ASSERT); +#endif + +	/* win or lose, we're done */ +	if (p->error != 0)	/* lose */ +		regfree(preg); +	return(p->error); +} + +/* + - p_ere - ERE parser top level, concatenation and alternation + == static void p_ere(register struct parse *p, int stop); + */ +static void +p_ere(p, stop) +register struct parse *p; +int stop;			/* character this ERE should end at */ +{ +	register unsigned char c; +	register sopno prevback = 0; +	register sopno prevfwd = 0; +	register sopno conc; +	register int first = 1;		/* is 
this the first alternative? */ + +	for (;;) { +		/* do a bunch of concatenated expressions */ +		conc = HERE(); +		while (MORE() && (c = PEEK()) != '|' && c != stop) +			p_ere_exp(p); +		(void) REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */ + +		if (!EAT('|')) +			break;		/* NOTE BREAK OUT */ + +		if (first) { +			INSERT(OCH_, conc);	/* offset is wrong */ +			prevfwd = conc; +			prevback = conc; +			first = 0; +		} +		ASTERN(OOR1, prevback); +		prevback = THERE(); +		AHEAD(prevfwd);			/* fix previous offset */ +		prevfwd = HERE(); +		EMIT(OOR2, 0);			/* offset is very wrong */ +	} + +	if (!first) {		/* tail-end fixups */ +		AHEAD(prevfwd); +		ASTERN(O_CH, prevback); +	} + +	assert(!MORE() || SEE(stop)); +} + +/* + - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op + == static void p_ere_exp(register struct parse *p); + */ +static void +p_ere_exp(p) +register struct parse *p; +{ +	register unsigned char c; +	register sopno pos; +	register int count; +	register int count2; +	register sopno subno; +	int wascaret = 0; + +	assert(MORE());		/* caller should have ensured this */ +	c = GETNEXT(); + +	pos = HERE(); +	switch (c) { +	case '(': +		REQUIRE(MORE(), REG_EPAREN); +		p->g->nsub++; +		subno = p->g->nsub; +		if (subno < NPAREN) +			p->pbegin[subno] = HERE(); +		EMIT(OLPAREN, subno); +		if (!SEE(')')) +			p_ere(p, ')'); +		if (subno < NPAREN) { +			p->pend[subno] = HERE(); +			assert(p->pend[subno] != 0); +		} +		EMIT(ORPAREN, subno); +		MUSTEAT(')', REG_EPAREN); +		break; +#ifndef POSIX_MISTAKE +	case ')':		/* happens only if no current unmatched ( */ +		/* +		 * You may ask, why the ifndef?  Because I didn't notice +		 * this until slightly too late for 1003.2, and none of the +		 * other 1003.2 regular-expression reviewers noticed it at +		 * all.  So an unmatched ) is legal POSIX, at least until +		 * we can get it fixed. 
+		 */ +		SETERROR(REG_EPAREN); +		break; +#endif +	case '^': +		EMIT(OBOL, 0); +		p->g->iflags |= USEBOL; +		p->g->nbol++; +		wascaret = 1; +		break; +	case '$': +		EMIT(OEOL, 0); +		p->g->iflags |= USEEOL; +		p->g->neol++; +		break; +	case '|': +		SETERROR(REG_EMPTY); +		break; +	case '*': +	case '+': +	case '?': +		SETERROR(REG_BADRPT); +		break; +	case '.': +		if (p->g->cflags&REG_NEWLINE) +			nonnewline(p); +		else +			EMIT(OANY, 0); +		break; +	case '[': +		p_bracket(p); +		break; +	case '\\': +		REQUIRE(MORE(), REG_EESCAPE); +		c = GETNEXT(); +		ordinary(p, c); +		break; +	case '{':		/* okay as ordinary except if digit follows */ +		REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); +		/* FALLTHROUGH */ +	default: +		ordinary(p, c); +		break; +	} + +	if (!MORE()) +		return; +	c = PEEK(); +	/* we call { a repetition if followed by a digit */ +	if (!( c == '*' || c == '+' || c == '?' || +				(c == '{' && MORE2() && isdigit(PEEK2())) )) +		return;		/* no repetition, we're done */ +	NEXT(); + +	REQUIRE(!wascaret, REG_BADRPT); +	switch (c) { +	case '*':	/* implemented as +? */ +		/* this case does not require the (y|) trick, noKLUDGE */ +		INSERT(OPLUS_, pos); +		ASTERN(O_PLUS, pos); +		INSERT(OQUEST_, pos); +		ASTERN(O_QUEST, pos); +		break; +	case '+': +		INSERT(OPLUS_, pos); +		ASTERN(O_PLUS, pos); +		break; +	case '?': +		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ +		INSERT(OCH_, pos);		/* offset slightly wrong */ +		ASTERN(OOR1, pos);		/* this one's right */ +		AHEAD(pos);			/* fix the OCH_ */ +		EMIT(OOR2, 0);			/* offset very wrong... 
*/ +		AHEAD(THERE());			/* ...so fix it */ +		ASTERN(O_CH, THERETHERE()); +		break; +	case '{': +		count = p_count(p); +		if (EAT(',')) { +			if (isdigit(PEEK())) { +				count2 = p_count(p); +				REQUIRE(count <= count2, REG_BADBR); +			} else		/* single number with comma */ +				count2 = INFINITY; +		} else		/* just a single number */ +			count2 = count; +		repeat(p, pos, count, count2); +		if (!EAT('}')) {	/* error heuristics */ +			while (MORE() && PEEK() != '}') +				NEXT(); +			REQUIRE(MORE(), REG_EBRACE); +			SETERROR(REG_BADBR); +		} +		break; +	} + +	if (!MORE()) +		return; +	c = PEEK(); +	if (!( c == '*' || c == '+' || c == '?' || +				(c == '{' && MORE2() && isdigit(PEEK2())) ) ) +		return; +	SETERROR(REG_BADRPT); +} + +/* + - p_str - string (no metacharacters) "parser" + == static void p_str(register struct parse *p); + */ +static void +p_str(p) +register struct parse *p; +{ +	REQUIRE(MORE(), REG_EMPTY); +	while (MORE()) +		ordinary(p, GETNEXT()); +} + +/* + - p_bre - BRE parser top level, anchoring and concatenation + == static void p_bre(register struct parse *p, register int end1, \ + ==	register int end2); + * Giving end1 as OUT essentially eliminates the end1/end2 check. + * + * This implementation is a bit of a kludge, in that a trailing $ is first + * taken as an ordinary character and then revised to be an anchor.  The + * only undesirable side effect is that '$' gets included as a character + * category in such cases.  This is fairly harmless; not worth fixing. + * The amount of lookahead needed to avoid this kludge is excessive. + */ +static void +p_bre(p, end1, end2) +register struct parse *p; +register int end1;		/* first terminating character */ +register int end2;		/* second terminating character */ +{ +	register sopno start = HERE(); +	register int first = 1;			/* first subexpression? 
*/ +	register int wasdollar = 0; + +	if (EAT('^')) { +		EMIT(OBOL, 0); +		p->g->iflags |= USEBOL; +		p->g->nbol++; +	} +	while (MORE() && !SEETWO(end1, end2)) { +		wasdollar = p_simp_re(p, first); +		first = 0; +	} +	if (wasdollar) {	/* oops, that was a trailing anchor */ +		DROP(1); +		EMIT(OEOL, 0); +		p->g->iflags |= USEEOL; +		p->g->neol++; +	} + +	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */ +} + +/* + - p_simp_re - parse a simple RE, an atom possibly followed by a repetition + == static int p_simp_re(register struct parse *p, int starordinary); + */ +static int			/* was the simple RE an unbackslashed $? */ +p_simp_re(p, starordinary) +register struct parse *p; +int starordinary;		/* is a leading * an ordinary character? */ +{ +	register int c; +	register int count; +	register int count2; +	register sopno pos; +	register int i; +	register sopno subno; +#	define	BACKSL	(1<<CHAR_BIT) + +	pos = HERE();		/* repetition op, if any, covers from here */ + +	assert(MORE());		/* caller should have ensured this */ +	c = GETNEXT(); +	if (c == '\\') { +		REQUIRE(MORE(), REG_EESCAPE); +		c = BACKSL | (unsigned char)GETNEXT(); +	} +	switch (c) { +	case '.': +		if (p->g->cflags&REG_NEWLINE) +			nonnewline(p); +		else +			EMIT(OANY, 0); +		break; +	case '[': +		p_bracket(p); +		break; +	case BACKSL|'{': +		SETERROR(REG_BADRPT); +		break; +	case BACKSL|'(': +		p->g->nsub++; +		subno = p->g->nsub; +		if (subno < NPAREN) +			p->pbegin[subno] = HERE(); +		EMIT(OLPAREN, subno); +		/* the MORE here is an error heuristic */ +		if (MORE() && !SEETWO('\\', ')')) +			p_bre(p, '\\', ')'); +		if (subno < NPAREN) { +			p->pend[subno] = HERE(); +			assert(p->pend[subno] != 0); +		} +		EMIT(ORPAREN, subno); +		REQUIRE(EATTWO('\\', ')'), REG_EPAREN); +		break; +	case BACKSL|')':	/* should not get here -- must be user */ +	case BACKSL|'}': +		SETERROR(REG_EPAREN); +		break; +	case BACKSL|'1': +	case BACKSL|'2': +	case BACKSL|'3': +	case BACKSL|'4': +	case BACKSL|'5': +	case 
BACKSL|'6': +	case BACKSL|'7': +	case BACKSL|'8': +	case BACKSL|'9': +		i = (c&~BACKSL) - '0'; +		assert(i < NPAREN); +		if (p->pend[i] != 0) { +			assert(i <= p->g->nsub); +			EMIT(OBACK_, i); +			assert(p->pbegin[i] != 0); +			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); +			assert(OP(p->strip[p->pend[i]]) == ORPAREN); +			(void) dupl(p, p->pbegin[i]+1, p->pend[i]); +			EMIT(O_BACK, i); +		} else +			SETERROR(REG_ESUBREG); +		p->g->backrefs = 1; +		break; +	case '*': +		REQUIRE(starordinary, REG_BADRPT); +		/* FALLTHROUGH */ +	default: +		ordinary(p, (unsigned char)c);	/* takes off BACKSL, if any */ +		break; +	} + +	if (EAT('*')) {		/* implemented as +? */ +		/* this case does not require the (y|) trick, noKLUDGE */ +		INSERT(OPLUS_, pos); +		ASTERN(O_PLUS, pos); +		INSERT(OQUEST_, pos); +		ASTERN(O_QUEST, pos); +	} else if (EATTWO('\\', '{')) { +		count = p_count(p); +		if (EAT(',')) { +			if (MORE() && isdigit(PEEK())) { +				count2 = p_count(p); +				REQUIRE(count <= count2, REG_BADBR); +			} else		/* single number with comma */ +				count2 = INFINITY; +		} else		/* just a single number */ +			count2 = count; +		repeat(p, pos, count, count2); +		if (!EATTWO('\\', '}')) {	/* error heuristics */ +			while (MORE() && !SEETWO('\\', '}')) +				NEXT(); +			REQUIRE(MORE(), REG_EBRACE); +			SETERROR(REG_BADBR); +		} +	} else if (c == (unsigned char)'$')	/* $ (but not \$) ends it */ +		return(1); + +	return(0); +} + +/* + - p_count - parse a repetition count + == static int p_count(register struct parse *p); + */ +static int			/* the value */ +p_count(p) +register struct parse *p; +{ +	register int count = 0; +	register int ndigits = 0; + +	while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { +		count = count*10 + (GETNEXT() - '0'); +		ndigits++; +	} + +	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); +	return(count); +} + +/* + - p_bracket - parse a bracketed character list + == static void p_bracket(register struct parse *p); + * + * Note a significant 
property of this code:  if the allocset() did SETERROR, + * no set operations are done. + */ +static void +p_bracket(p) +register struct parse *p; +{ +	register cset *cs = allocset(p); +	register int invert = 0; + +	/* Dept of Truly Sickening Special-Case Kludges */ +	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { +		EMIT(OBOW, 0); +		NEXTn(6); +		return; +	} +	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { +		EMIT(OEOW, 0); +		NEXTn(6); +		return; +	} + +	if (EAT('^')) +		invert++;	/* make note to invert set at end */ +	if (EAT(']')) +		CHadd(cs, ']'); +	else if (EAT('-')) +		CHadd(cs, '-'); +	while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) +		p_b_term(p, cs); +	if (EAT('-')) +		CHadd(cs, '-'); +	MUSTEAT(']', REG_EBRACK); + +	if (p->error != 0)	/* don't mess things up further */ +		return; + +	if (p->g->cflags&REG_ICASE) { +		register int i; +		register int ci; + +		for (i = p->g->csetsize - 1; i >= 0; i--) +			if (CHIN(cs, i) && isalpha(i)) { +				ci = othercase(i); +				if (ci != i) +					CHadd(cs, ci); +			} +		if (cs->multis != NULL) +			mccase(p, cs); +	} +	if (invert) { +		register int i; + +		for (i = p->g->csetsize - 1; i >= 0; i--) +			if (CHIN(cs, i)) +				CHsub(cs, i); +			else +				CHadd(cs, i); +		if (p->g->cflags&REG_NEWLINE) +			CHsub(cs, '\n'); +		if (cs->multis != NULL) +			mcinvert(p, cs); +	} + +	assert(cs->multis == NULL);		/* xxx */ + +	if (nch(p, cs) == 1) {		/* optimize singleton sets */ +		ordinary(p, firstch(p, cs)); +		freeset(p, cs); +	} else +		EMIT(OANYOF, freezeset(p, cs)); +} + +/* + - p_b_term - parse one term of a bracketed character list + == static void p_b_term(register struct parse *p, register cset *cs); + */ +static void +p_b_term(p, cs) +register struct parse *p; +register cset *cs; +{ +	register unsigned char c; +	register unsigned char start, finish; +	register int i; + +	/* classify what we've got */ +	switch ((MORE()) ? PEEK() : '\0') { +	case '[': +		c = (MORE2()) ? 
PEEK2() : '\0'; +		break; +	case '-': +		SETERROR(REG_ERANGE); +		return;			/* NOTE RETURN */ +		break; +	default: +		c = '\0'; +		break; +	} + +	switch (c) { +	case ':':		/* character class */ +		NEXT2(); +		REQUIRE(MORE(), REG_EBRACK); +		c = PEEK(); +		REQUIRE(c != '-' && c != ']', REG_ECTYPE); +		p_b_cclass(p, cs); +		REQUIRE(MORE(), REG_EBRACK); +		REQUIRE(EATTWO(':', ']'), REG_ECTYPE); +		break; +	case '=':		/* equivalence class */ +		NEXT2(); +		REQUIRE(MORE(), REG_EBRACK); +		c = PEEK(); +		REQUIRE(c != '-' && c != ']', REG_ECOLLATE); +		p_b_eclass(p, cs); +		REQUIRE(MORE(), REG_EBRACK); +		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); +		break; +	default:		/* symbol, ordinary character, or range */ +/* xxx revision needed for multichar stuff */ +		start = p_b_symbol(p); +		if (SEE('-') && MORE2() && PEEK2() != ']') { +			/* range */ +			NEXT(); +			if (EAT('-')) +				finish = '-'; +			else +				finish = p_b_symbol(p); +		} else +			finish = start; +/* xxx what about signed chars here... 
*/ +		REQUIRE(start <= finish, REG_ERANGE); +		for (i = start; i <= finish; i++) +			CHadd(cs, i); +		break; +	} +} + +/* + - p_b_cclass - parse a character-class name and deal with it + == static void p_b_cclass(register struct parse *p, register cset *cs); + */ +static void +p_b_cclass(p, cs) +register struct parse *p; +register cset *cs; +{ +	register unsigned char *sp = p->next; +	register struct cclass *cp; +	register size_t len; +	register unsigned char *u; +	register unsigned char c; + +	while (MORE() && isalpha(PEEK())) +		NEXT(); +	len = p->next - sp; +	for (cp = cclasses; cp->name != NULL; cp++) +		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') +			break; +	if (cp->name == NULL) { +		/* oops, didn't find it */ +		SETERROR(REG_ECTYPE); +		return; +	} + +	u = cp->chars; +	while ((c = *u++) != '\0') +		CHadd(cs, c); +	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) +		MCadd(p, cs, u); +} + +/* + - p_b_eclass - parse an equivalence-class name and deal with it + == static void p_b_eclass(register struct parse *p, register cset *cs); + * + * This implementation is incomplete. 
xxx + */ +static void +p_b_eclass(p, cs) +register struct parse *p; +register cset *cs; +{ +	register unsigned char c; + +	c = p_b_coll_elem(p, '='); +	CHadd(cs, c); +} + +/* + - p_b_symbol - parse a character or [..]ed multicharacter collating symbol + == static char p_b_symbol(register struct parse *p); + */ +static unsigned char			/* value of symbol */ +p_b_symbol(p) +register struct parse *p; +{ +	register unsigned char value; + +	REQUIRE(MORE(), REG_EBRACK); +	if (!EATTWO('[', '.')) +		return(GETNEXT()); + +	/* collating symbol */ +	value = p_b_coll_elem(p, '.'); +	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); +	return(value); +} + +/* + - p_b_coll_elem - parse a collating-element name and look it up + == static char p_b_coll_elem(register struct parse *p, int endc); + */ +static unsigned char			/* value of collating element */ +p_b_coll_elem(p, endc) +register struct parse *p; +int endc;			/* name ended by endc,']' */ +{ +	register unsigned char *sp = p->next; +	register struct cname *cp; +	register int len; + +	while (MORE() && !SEETWO(endc, ']')) +		NEXT(); +	if (!MORE()) { +		SETERROR(REG_EBRACK); +		return(0); +	} +	len = p->next - sp; +	for (cp = cnames; cp->name != NULL; cp++) +		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') +			return(cp->code);	/* known name */ +	if (len == 1) +		return(*sp);	/* single character */ +	SETERROR(REG_ECOLLATE);			/* neither */ +	return(0); +} + +/* + - othercase - return the case counterpart of an alphabetic + == static char othercase(int ch); + */ +static unsigned char			/* if no counterpart, return ch */ +othercase(ch) +int ch; +{ +	assert(isalpha(ch)); +	if (isupper(ch)) +		return(tolower(ch)); +	else if (islower(ch)) +		return(toupper(ch)); +	else			/* peculiar, but could happen */ +		return(ch); +} + +/* + - bothcases - emit a dualcase version of a two-case character + == static void bothcases(register struct parse *p, int ch); + * + * Boy, is this implementation ever a kludge... 
+ */ +static void +bothcases(p, ch) +register struct parse *p; +int ch; +{ +	register unsigned char *oldnext = p->next; +	register unsigned char *oldend = p->end; +	unsigned char bracket[3]; + +	assert(othercase(ch) != ch);	/* p_bracket() would recurse */ +	p->next = bracket; +	p->end = bracket+2; +	bracket[0] = ch; +	bracket[1] = ']'; +	bracket[2] = '\0'; +	p_bracket(p); +	assert(p->next == bracket+2); +	p->next = oldnext; +	p->end = oldend; +} + +/* + - ordinary - emit an ordinary character + == static void ordinary(register struct parse *p, register int ch); + */ +static void +ordinary(p, ch) +register struct parse *p; +register int ch; +{ +	register cat_t *cap = p->g->categories; + +	if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) +		bothcases(p, ch); +	else { +		EMIT(OCHAR, (unsigned char)ch); +		if (cap[ch] == 0) +			cap[ch] = p->g->ncategories++; +	} +} + +/* + - nonnewline - emit REG_NEWLINE version of OANY + == static void nonnewline(register struct parse *p); + * + * Boy, is this implementation ever a kludge... + */ +static void +nonnewline(p) +register struct parse *p; +{ +	register unsigned char *oldnext = p->next; +	register unsigned char *oldend = p->end; +	unsigned char bracket[4]; + +	p->next = bracket; +	p->end = bracket+3; +	bracket[0] = '^'; +	bracket[1] = '\n'; +	bracket[2] = ']'; +	bracket[3] = '\0'; +	p_bracket(p); +	assert(p->next == bracket+3); +	p->next = oldnext; +	p->end = oldend; +} + +/* + - repeat - generate code for a bounded repetition, recursively if needed + == static void repeat(register struct parse *p, sopno start, int from, int to); + */ +static void +repeat(p, start, from, to) +register struct parse *p; +sopno start;			/* operand from here to end of strip */ +int from;			/* repeated from this number */ +int to;				/* to this number of times (maybe INFINITY) */ +{ +	register sopno finish = HERE(); +#	define	N	2 +#	define	INF	3 +#	define	REP(f, t)	((f)*8 + (t)) +#	define	MAP(n)	(((n) <= 1) ? 
(n) : ((n) == INFINITY) ? INF : N) +	register sopno copy; + +	if (p->error != 0)	/* head off possible runaway recursion */ +		return; + +	assert(from <= to); + +	switch (REP(MAP(from), MAP(to))) { +	case REP(0, 0):			/* must be user doing this */ +		DROP(finish-start);	/* drop the operand */ +		break; +	case REP(0, 1):			/* as x{1,1}? */ +	case REP(0, N):			/* as x{1,n}? */ +	case REP(0, INF):		/* as x{1,}? */ +		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ +		INSERT(OCH_, start);		/* offset is wrong... */ +		repeat(p, start+1, 1, to); +		ASTERN(OOR1, start); +		AHEAD(start);			/* ... fix it */ +		EMIT(OOR2, 0); +		AHEAD(THERE()); +		ASTERN(O_CH, THERETHERE()); +		break; +	case REP(1, 1):			/* trivial case */ +		/* done */ +		break; +	case REP(1, N):			/* as x?x{1,n-1} */ +		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ +		INSERT(OCH_, start); +		ASTERN(OOR1, start); +		AHEAD(start); +		EMIT(OOR2, 0);			/* offset very wrong... */ +		AHEAD(THERE());			/* ...so fix it */ +		ASTERN(O_CH, THERETHERE()); +		copy = dupl(p, start+1, finish+1); +		assert(copy == finish+4); +		repeat(p, copy, 1, to-1); +		break; +	case REP(1, INF):		/* as x+ */ +		INSERT(OPLUS_, start); +		ASTERN(O_PLUS, start); +		break; +	case REP(N, N):			/* as xx{m-1,n-1} */ +		copy = dupl(p, start, finish); +		repeat(p, copy, from-1, to-1); +		break; +	case REP(N, INF):		/* as xx{n-1,INF} */ +		copy = dupl(p, start, finish); +		repeat(p, copy, from-1, to); +		break; +	default:			/* "can't happen" */ +		SETERROR(REG_ASSERT);	/* just in case */ +		break; +	} +} + +/* + - seterr - set an error condition + == static int seterr(register struct parse *p, int e); + */ +static int			/* useless but makes type checking happy */ +seterr(p, e) +register struct parse *p; +int e; +{ +	if (p->error == 0)	/* keep earliest error condition */ +		p->error = e; +	p->next = nuls;		/* try to bring things to a halt */ +	p->end = nuls; +	return(0);		/* make the return value well-defined */ +} + +/* + 
- allocset - allocate a set of characters for [] + == static cset *allocset(register struct parse *p); + */ +static cset * +allocset(p) +register struct parse *p; +{ +	register int no = p->g->ncsets++; +	register size_t nc; +	register size_t nbytes; +	register cset *cs; +	register size_t css = (size_t)p->g->csetsize; +	register int i; + +	if (no >= p->ncsalloc) {	/* need another column of space */ +		p->ncsalloc += CHAR_BIT; +		nc = p->ncsalloc; +		assert(nc % CHAR_BIT == 0); +		nbytes = nc / CHAR_BIT * css; +		if (p->g->sets == NULL) +			p->g->sets = (cset *)malloc(nc * sizeof(cset)); +		else +			p->g->sets = (cset *)realloc((unsigned char *)p->g->sets, +							nc * sizeof(cset)); +		if (p->g->setbits == NULL) +			p->g->setbits = (uch *)malloc(nbytes); +		else { +			p->g->setbits = (uch *)realloc((unsigned char *)p->g->setbits, +								nbytes); +			/* xxx this isn't right if setbits is now NULL */ +			for (i = 0; i < no; i++) +				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); +		} +		if (p->g->sets != NULL && p->g->setbits != NULL) +			(void) memset((unsigned char *)p->g->setbits + (nbytes - css), +								0, css); +		else { +			no = 0; +			SETERROR(REG_ESPACE); +			/* caller's responsibility not to do set ops */ +		} +	} + +	assert(p->g->sets != NULL);	/* xxx */ +	cs = &p->g->sets[no]; +	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); +	cs->mask = 1 << ((no) % CHAR_BIT); +	cs->hash = 0; +	cs->smultis = 0; +	cs->multis = NULL; + +	return(cs); +} + +/* + - freeset - free a now-unused set + == static void freeset(register struct parse *p, register cset *cs); + */ +static void +freeset(p, cs) +register struct parse *p; +register cset *cs; +{ +	register size_t i; +	register cset *top = &p->g->sets[p->g->ncsets]; +	register size_t css = (size_t)p->g->csetsize; + +	for (i = 0; i < css; i++) +		CHsub(cs, i); +	if (cs == top-1)	/* recover only the easy case */ +		p->g->ncsets--; +} + +/* + - freezeset - final processing on a set of characters + == static int 
freezeset(register struct parse *p, register cset *cs); + * + * The main task here is merging identical sets.  This is usually a waste + * of time (although the hash code minimizes the overhead), but can win + * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash + * is done using addition rather than xor -- all ASCII [aA] sets xor to + * the same value! + */ +static int			/* set number */ +freezeset(p, cs) +register struct parse *p; +register cset *cs; +{ +	register uch h = cs->hash; +	register size_t i; +	register cset *top = &p->g->sets[p->g->ncsets]; +	register cset *cs2; +	register size_t css = (size_t)p->g->csetsize; + +	/* look for an earlier one which is the same */ +	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) +		if (cs2->hash == h && cs2 != cs) { +			/* maybe */ +			for (i = 0; i < css; i++) +				if (!!CHIN(cs2, i) != !!CHIN(cs, i)) +					break;		/* no */ +			if (i == css) +				break;			/* yes */ +		} + +	if (cs2 < top) {	/* found one */ +		freeset(p, cs); +		cs = cs2; +	} + +	return((int)(cs - p->g->sets)); +} + +/* + - firstch - return first character in a set (which must have at least one) + == static int firstch(register struct parse *p, register cset *cs); + */ +static int			/* character; there is no "none" value */ +firstch(p, cs) +register struct parse *p; +register cset *cs; +{ +	register size_t i; +	register size_t css = (size_t)p->g->csetsize; + +	for (i = 0; i < css; i++) +		if (CHIN(cs, i)) +			return((unsigned char)i); +	assert(never); +	return(0);		/* arbitrary */ +} + +/* + - nch - number of characters in a set + == static int nch(register struct parse *p, register cset *cs); + */ +static int +nch(p, cs) +register struct parse *p; +register cset *cs; +{ +	register size_t i; +	register size_t css = (size_t)p->g->csetsize; +	register int n = 0; + +	for (i = 0; i < css; i++) +		if (CHIN(cs, i)) +			n++; +	return(n); +} + +/* + - mcadd - add a collating element to a cset + == static void mcadd(register struct parse *p, 
register cset *cs, \ + ==	register char *cp); + */ +static void +mcadd(p, cs, cp) +register struct parse *p; +register cset *cs; +register unsigned char *cp; +{ +	register size_t oldend = cs->smultis; + +	cs->smultis += strlen(cp) + 1; +	if (cs->multis == NULL) +		cs->multis = malloc(cs->smultis); +	else +		cs->multis = realloc(cs->multis, cs->smultis); +	if (cs->multis == NULL) { +		SETERROR(REG_ESPACE); +		return; +	} + +	(void) strcpy(cs->multis + oldend - 1, cp); +	cs->multis[cs->smultis - 1] = '\0'; +} + +#if 0 +/* + - mcsub - subtract a collating element from a cset + == static void mcsub(register cset *cs, register unsigned char *cp); + */ +static void +mcsub(cs, cp) +register unsigned cset *cs; +register unsigned char *cp; +{ +	register unsigned char *fp = mcfind(cs, cp); +	register size_t len = strlen(fp); + +	assert(fp != NULL); +	(void) memmove(fp, fp + len + 1, +				cs->smultis - (fp + len + 1 - cs->multis)); +	cs->smultis -= len; + +	if (cs->smultis == 0) { +		free(cs->multis); +		cs->multis = NULL; +		return; +	} + +	cs->multis = realloc(cs->multis, cs->smultis); +	assert(cs->multis != NULL); +} + +/* + - mcin - is a collating element in a cset? + == static int mcin(register cset *cs, register unsigned char *cp); + */ +static int +mcin(cs, cp) +register cset *cs; +register unsigned char *cp; +{ +	return(mcfind(cs, cp) != NULL); +} + + +/* + - mcfind - find a collating element in a cset + == static unsigned char *mcfind(register cset *cs, register unsigned char *cp); + */ +static unsigned char * +mcfind(cs, cp) +register cset *cs; +register unsigned char *cp; +{ +	register unsigned char *p; + +	if (cs->multis == NULL) +		return(NULL); +	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) +		if (strcmp(cp, p) == 0) +			return(p); +	return(NULL); +} +#endif + +/* + - mcinvert - invert the list of collating elements in a cset + == static void mcinvert(register struct parse *p, register cset *cs); + * + * This would have to know the set of possibilities.  
Implementation + * is deferred. + */ +static void +mcinvert(p, cs) +register struct parse *p; +register cset *cs; +{ +	assert(cs->multis == NULL);	/* xxx */ +} + +/* + - mccase - add case counterparts of the list of collating elements in a cset + == static void mccase(register struct parse *p, register cset *cs); + * + * This would have to know the set of possibilities.  Implementation + * is deferred. + */ +static void +mccase(p, cs) +register struct parse *p; +register cset *cs; +{ +	assert(cs->multis == NULL);	/* xxx */ +} + +/* + - isinsets - is this character in any sets? + == static int isinsets(register struct re_guts *g, int c); + */ +static int			/* predicate */ +isinsets(g, c) +register struct re_guts *g; +int c; +{ +	register uch *col; +	register int i; +	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; +	register unsigned uc = (unsigned char)c; + +	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) +		if (col[uc] != 0) +			return(1); +	return(0); +} + +/* + - samesets - are these two characters in exactly the same sets? 
+ == static int samesets(register struct re_guts *g, int c1, int c2); + */ +static int			/* predicate */ +samesets(g, c1, c2) +register struct re_guts *g; +int c1; +int c2; +{ +	register uch *col; +	register int i; +	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; +	register unsigned uc1 = (unsigned char)c1; +	register unsigned uc2 = (unsigned char)c2; + +	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) +		if (col[uc1] != col[uc2]) +			return(0); +	return(1); +} + +/* + - categorize - sort out character categories + == static void categorize(struct parse *p, register struct re_guts *g); + */ +static void +categorize(p, g) +struct parse *p; +register struct re_guts *g; +{ +	register cat_t *cats = g->categories; +	register int c; +	register int c2; +	register cat_t cat; + +	/* avoid making error situations worse */ +	if (p->error != 0) +		return; + +	for (c = 0; c <= UCHAR_MAX; c++) +		if (cats[c] == 0 && isinsets(g, c)) { +			cat = g->ncategories++; +			cats[c] = cat; +			for (c2 = c+1; c2 <= UCHAR_MAX; c2++) +				if (cats[c2] == 0 && samesets(g, c, c2)) +					cats[c2] = cat; +		} +} + +/* + - dupl - emit a duplicate of a bunch of sops + == static sopno dupl(register struct parse *p, sopno start, sopno finish); + */ +static sopno			/* start of duplicate */ +dupl(p, start, finish) +register struct parse *p; +sopno start;			/* from here */ +sopno finish;			/* to this less one */ +{ +	register sopno ret = HERE(); +	register sopno len = finish - start; + +	assert(finish >= start); +	if (len == 0) +		return(ret); +	enlarge(p, p->ssize + len);	/* this many unexpected additions */ +	assert(p->ssize >= p->slen + len); +	(void) memcpy((char *)(p->strip + p->slen), +		(char *)(p->strip + start), (size_t)len*sizeof(sop)); +	p->slen += len; +	return(ret); +} + +/* + - doemit - emit a strip operator + == static void doemit(register struct parse *p, sop op, size_t opnd); + * + * It might seem better to implement this as a macro with a function as + * 
hard-case backup, but it's just too big and messy unless there are + * some changes to the data structures.  Maybe later. + */ +static void +doemit(p, op, opnd) +register struct parse *p; +sop op; +size_t opnd; +{ +	/* avoid making error situations worse */ +	if (p->error != 0) +		return; + +	/* deal with oversize operands ("can't happen", more or less) */ +	assert(opnd < 1<<OPSHIFT); + +	/* deal with undersized strip */ +	if (p->slen >= p->ssize) +		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */ +	assert(p->slen < p->ssize); + +	/* finally, it's all reduced to the easy case */ +	p->strip[p->slen++] = SOP(op, opnd); +} + +/* + - doinsert - insert a sop into the strip + == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); + */ +static void +doinsert(p, op, opnd, pos) +register struct parse *p; +sop op; +size_t opnd; +sopno pos; +{ +	register sopno sn; +	register sop s; +	register int i; + +	/* avoid making error situations worse */ +	if (p->error != 0) +		return; + +	sn = HERE(); +	EMIT(op, opnd);		/* do checks, ensure space */ +	assert(HERE() == sn+1); +	s = p->strip[sn]; + +	/* adjust paren pointers */ +	assert(pos > 0); +	for (i = 1; i < NPAREN; i++) { +		if (p->pbegin[i] >= pos) { +			p->pbegin[i]++; +		} +		if (p->pend[i] >= pos) { +			p->pend[i]++; +		} +	} + +	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], +						(HERE()-pos-1)*sizeof(sop)); +	p->strip[pos] = s; +} + +/* + - dofwd - complete a forward reference + == static void dofwd(register struct parse *p, sopno pos, sop value); + */ +static void +dofwd(p, pos, value) +register struct parse *p; +register sopno pos; +sop value; +{ +	/* avoid making error situations worse */ +	if (p->error != 0) +		return; + +	assert(value < 1<<OPSHIFT); +	p->strip[pos] = OP(p->strip[pos]) | value; +} + +/* + - enlarge - enlarge the strip + == static void enlarge(register struct parse *p, sopno size); + */ +static void +enlarge(p, size) +register struct parse *p; +register sopno size; 
+{ +	register sop *sp; + +	if (p->ssize >= size) +		return; + +	sp = (sop *)realloc(p->strip, size*sizeof(sop)); +	if (sp == NULL) { +		SETERROR(REG_ESPACE); +		return; +	} +	p->strip = sp; +	p->ssize = size; +} + +/* + - stripsnug - compact the strip + == static void stripsnug(register struct parse *p, register struct re_guts *g); + */ +static void +stripsnug(p, g) +register struct parse *p; +register struct re_guts *g; +{ +	g->nstates = p->slen; +	g->strip = (sop *)realloc((unsigned char *)p->strip, p->slen * sizeof(sop)); +	if (g->strip == NULL) { +		SETERROR(REG_ESPACE); +		g->strip = p->strip; +	} +} + +/* + - findmust - fill in must and mlen with longest mandatory literal string + == static void findmust(register struct parse *p, register struct re_guts *g); + * + * This algorithm could do fancy things like analyzing the operands of | + * for common subsequences.  Someday.  This code is simple and finds most + * of the interesting cases. + * + * Note that must and mlen got initialized during setup. 
+ */ +static void +findmust(p, g) +struct parse *p; +register struct re_guts *g; +{ +	register sop *scan; +	sop *start = NULL; +	register sop *newstart = NULL; +	register sopno newlen; +	register sop s; +	register unsigned char *cp; +	register sopno i; + +	/* avoid making error situations worse */ +	if (p->error != 0) +		return; + +	/* find the longest OCHAR sequence in strip */ +	newlen = 0; +	scan = g->strip + 1; +	do { +		s = *scan++; +		switch (OP(s)) { +		case OCHAR:		/* sequence member */ +			if (newlen == 0)		/* new sequence */ +				newstart = scan - 1; +			newlen++; +			break; +		case OPLUS_:		/* things that don't break one */ +		case OLPAREN: +		case ORPAREN: +			break; +		case OQUEST_:		/* things that must be skipped */ +		case OCH_: +			scan--; +			do { +				scan += OPND(s); +				s = *scan; +				/* assert() interferes w debug printouts */ +				if (OP(s) != O_QUEST && OP(s) != O_CH && +							OP(s) != OOR2) { +					g->iflags |= BAD; +					return; +				} +			} while (OP(s) != O_QUEST && OP(s) != O_CH); +			/* fallthrough */ +		default:		/* things that break a sequence */ +			if (newlen > g->mlen) {		/* ends one */ +				start = newstart; +				g->mlen = newlen; +			} +			newlen = 0; +			break; +		} +	} while (OP(s) != OEND); + +	if (g->mlen == 0)		/* there isn't one */ +		return; + +	if (!start) { +		g->mlen = 0; +		return; +	} + +	/* turn it into a character string */ +	g->must = malloc((size_t)g->mlen + 1); +	if (g->must == NULL) {		/* argh; just forget it */ +		g->mlen = 0; +		return; +	} +	cp = g->must; +	scan = start; +	for (i = g->mlen; i > 0; i--) { +		while (OP(s = *scan++) != OCHAR) +			continue; +		assert(cp < g->must + g->mlen); +		*cp++ = (unsigned char)OPND(s); +	} +	assert(cp == g->must + g->mlen); +	*cp++ = '\0';		/* just on general principles */ +} + +/* + - pluscount - count + nesting + == static sopno pluscount(register struct parse *p, register struct re_guts *g); + */ +static sopno			/* nesting depth */ +pluscount(p, g) +struct parse *p; 
+register struct re_guts *g; +{ +	register sop *scan; +	register sop s; +	register sopno plusnest = 0; +	register sopno maxnest = 0; + +	if (p->error != 0) +		return(0);	/* there may not be an OEND */ + +	scan = g->strip + 1; +	do { +		s = *scan++; +		switch (OP(s)) { +		case OPLUS_: +			plusnest++; +			break; +		case O_PLUS: +			if (plusnest > maxnest) +				maxnest = plusnest; +			plusnest--; +			break; +		} +	} while (OP(s) != OEND); +	if (plusnest != 0) +		g->iflags |= BAD; +	return(maxnest); +} diff --git a/ext/ereg/regex/regcomp.ih b/ext/ereg/regex/regcomp.ih new file mode 100644 index 0000000000..c93d32e51d --- /dev/null +++ b/ext/ereg/regex/regcomp.ih @@ -0,0 +1,53 @@ +/* ========= begin header generated by ./mkh ========= */ +#ifdef __cplusplus +extern "C" { +#endif + +/* === regcomp.c === */ +static void p_ere(register struct parse *p, int stop); +static void p_ere_exp(register struct parse *p); +static void p_str(register struct parse *p); +static void p_bre(register struct parse *p, register int end1, register int end2); +static int p_simp_re(register struct parse *p, int starordinary); +static int p_count(register struct parse *p); +static void p_bracket(register struct parse *p); +static void p_b_term(register struct parse *p, register cset *cs); +static void p_b_cclass(register struct parse *p, register cset *cs); +static void p_b_eclass(register struct parse *p, register cset *cs); +static unsigned char p_b_symbol(register struct parse *p); +static unsigned char p_b_coll_elem(register struct parse *p, int endc); +static unsigned char othercase(int ch); +static void bothcases(register struct parse *p, int ch); +static void ordinary(register struct parse *p, register int ch); +static void nonnewline(register struct parse *p); +static void repeat(register struct parse *p, sopno start, int from, int to); +static int seterr(register struct parse *p, int e); +static cset *allocset(register struct parse *p); +static void freeset(register struct parse *p, 
register cset *cs); +static int freezeset(register struct parse *p, register cset *cs); +static int firstch(register struct parse *p, register cset *cs); +static int nch(register struct parse *p, register cset *cs); +static void mcadd(register struct parse *p, register cset *cs, register unsigned char *cp); +#if 0 +static void mcsub(register cset *cs, register unsigned char *cp); +static int mcin(register cset *cs, register unsigned char *cp); +static unsigned char *mcfind(register cset *cs, register unsigned char *cp); +#endif +static void mcinvert(register struct parse *p, register cset *cs); +static void mccase(register struct parse *p, register cset *cs); +static int isinsets(register struct re_guts *g, int c); +static int samesets(register struct re_guts *g, int c1, int c2); +static void categorize(struct parse *p, register struct re_guts *g); +static sopno dupl(register struct parse *p, sopno start, sopno finish); +static void doemit(register struct parse *p, sop op, size_t opnd); +static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); +static void dofwd(register struct parse *p, sopno pos, sop value); +static void enlarge(register struct parse *p, sopno size); +static void stripsnug(register struct parse *p, register struct re_guts *g); +static void findmust(register struct parse *p, register struct re_guts *g); +static sopno pluscount(register struct parse *p, register struct re_guts *g); + +#ifdef __cplusplus +} +#endif +/* ========= end header generated by ./mkh ========= */ diff --git a/ext/ereg/regex/regerror.c b/ext/ereg/regex/regerror.c new file mode 100644 index 0000000000..6c8e064c4a --- /dev/null +++ b/ext/ereg/regex/regerror.c @@ -0,0 +1,126 @@ +#include <sys/types.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <limits.h> +#include <stdlib.h> +#include <regex.h> + +#include "utils.h" +#include "regerror.ih" + +/* + = #define	REG_OKAY	 0 + = #define	REG_NOMATCH	 1 + = #define	REG_BADPAT	 2 + = 
#define	REG_ECOLLATE	 3 + = #define	REG_ECTYPE	 4 + = #define	REG_EESCAPE	 5 + = #define	REG_ESUBREG	 6 + = #define	REG_EBRACK	 7 + = #define	REG_EPAREN	 8 + = #define	REG_EBRACE	 9 + = #define	REG_BADBR	10 + = #define	REG_ERANGE	11 + = #define	REG_ESPACE	12 + = #define	REG_BADRPT	13 + = #define	REG_EMPTY	14 + = #define	REG_ASSERT	15 + = #define	REG_INVARG	16 + = #define	REG_ATOI	255	// convert name to number (!) + = #define	REG_ITOA	0400	// convert number to name (!) + */ +static struct rerr { +	int code; +	char *name; +	char *explain; +} rerrs[] = { +	{REG_OKAY,	"REG_OKAY",	"no errors detected"}, +	{REG_NOMATCH,	"REG_NOMATCH",	"regexec() failed to match"}, +	{REG_BADPAT,	"REG_BADPAT",	"invalid regular expression"}, +	{REG_ECOLLATE,	"REG_ECOLLATE",	"invalid collating element"}, +	{REG_ECTYPE,	"REG_ECTYPE",	"invalid character class"}, +	{REG_EESCAPE,	"REG_EESCAPE",	"trailing backslash (\\)"}, +	{REG_ESUBREG,	"REG_ESUBREG",	"invalid backreference number"}, +	{REG_EBRACK,	"REG_EBRACK",	"brackets ([ ]) not balanced"}, +	{REG_EPAREN,	"REG_EPAREN",	"parentheses not balanced"}, +	{REG_EBRACE,	"REG_EBRACE",	"braces not balanced"}, +	{REG_BADBR,	"REG_BADBR",	"invalid repetition count(s)"}, +	{REG_ERANGE,	"REG_ERANGE",	"invalid character range"}, +	{REG_ESPACE,	"REG_ESPACE",	"out of memory"}, +	{REG_BADRPT,	"REG_BADRPT",	"repetition-operator operand invalid"}, +	{REG_EMPTY,	"REG_EMPTY",	"empty (sub)expression"}, +	{REG_ASSERT,	"REG_ASSERT",	"\"can't happen\" -- you found a bug"}, +	{REG_INVARG,	"REG_INVARG",	"invalid argument to regex routine"}, +	{-1,		"",		"*** unknown regexp error code ***"}, +}; + +/* + - regerror - the interface to error numbers + = API_EXPORT(size_t) regerror(int, const regex_t *, char *, size_t); + */ +/* ARGSUSED */ +API_EXPORT(size_t) +regerror( +int errcode, +const regex_t *preg, +char *errbuf, +size_t errbuf_size) +{ +	register struct rerr *r; +	register size_t len; +	register int target = errcode &~ REG_ITOA; +	register char *s; +	char 
convbuf[50]; + +	if (errcode == REG_ATOI) +		s = regatoi(preg, convbuf); +	else { +		for (r = rerrs; r->code >= 0; r++) +			if (r->code == target) +				break; +	 +		if (errcode&REG_ITOA) { +			if (r->code >= 0) +				(void) strcpy(convbuf, r->name); +			else +				sprintf(convbuf, "REG_0x%x", target); +			assert(strlen(convbuf) < sizeof(convbuf)); +			s = convbuf; +		} else +			s = r->explain; +	} + +	len = strlen(s) + 1; +	if (errbuf_size > 0) { +		if (errbuf_size > len) +			(void) strcpy(errbuf, s); +		else { +			(void) strncpy(errbuf, s, errbuf_size-1); +			errbuf[errbuf_size-1] = '\0'; +		} +	} + +	return(len); +} + +/* + - regatoi - internal routine to implement REG_ATOI + == static char *regatoi(const regex_t *preg, char *localbuf); + */ +static char * +regatoi(preg, localbuf) +const regex_t *preg; +char *localbuf; +{ +	register struct rerr *r; + +	for (r = rerrs; r->code >= 0; r++) +		if (strcmp(r->name, preg->re_endp) == 0) +			break; +	if (r->code < 0) +		return("0"); + +	sprintf(localbuf, "%d", r->code); +	return(localbuf); +} diff --git a/ext/ereg/regex/regerror.ih b/ext/ereg/regex/regerror.ih new file mode 100644 index 0000000000..2cb668c24f --- /dev/null +++ b/ext/ereg/regex/regerror.ih @@ -0,0 +1,12 @@ +/* ========= begin header generated by ./mkh ========= */ +#ifdef __cplusplus +extern "C" { +#endif + +/* === regerror.c === */ +static char *regatoi(const regex_t *preg, char *localbuf); + +#ifdef __cplusplus +} +#endif +/* ========= end header generated by ./mkh ========= */ diff --git a/ext/ereg/regex/regex.3 b/ext/ereg/regex/regex.3 new file mode 100644 index 0000000000..100c8a7f71 --- /dev/null +++ b/ext/ereg/regex/regex.3 @@ -0,0 +1,502 @@ +.TH REGEX 3 "17 May 1993" +.BY "Henry Spencer" +.de ZR +.\" one other place knows this name:  the SEE ALSO section +.IR regex (7) \\$1 +.. 
+.SH NAME +regcomp, regexec, regerror, regfree \- regular-expression library +.SH SYNOPSIS +.ft B +.\".na +#include <sys/types.h> +.br +#include <regex.h> +.HP 10 +int regcomp(regex_t\ *preg, const\ char\ *pattern, int\ cflags); +.HP +int\ regexec(const\ regex_t\ *preg, const\ char\ *string, +size_t\ nmatch, regmatch_t\ pmatch[], int\ eflags); +.HP +size_t\ regerror(int\ errcode, const\ regex_t\ *preg, +char\ *errbuf, size_t\ errbuf_size); +.HP +void\ regfree(regex_t\ *preg); +.\".ad +.ft +.SH DESCRIPTION +These routines implement POSIX 1003.2 regular expressions (``RE''s); +see +.ZR . +.I Regcomp +compiles an RE written as a string into an internal form, +.I regexec +matches that internal form against a string and reports results, +.I regerror +transforms error codes from either into human-readable messages, +and +.I regfree +frees any dynamically-allocated storage used by the internal form +of an RE. +.PP +The header +.I <regex.h> +declares two structure types, +.I regex_t +and +.IR regmatch_t , +the former for compiled internal forms and the latter for match reporting. +It also declares the four functions, +a type +.IR regoff_t , +and a number of constants with names starting with ``REG_''. +.PP +.I Regcomp +compiles the regular expression contained in the +.I pattern +string, +subject to the flags in +.IR cflags , +and places the results in the +.I regex_t +structure pointed to by +.IR preg . +.I Cflags +is the bitwise OR of zero or more of the following flags: +.IP REG_EXTENDED \w'REG_EXTENDED'u+2n +Compile modern (``extended'') REs, +rather than the obsolete (``basic'') REs that +are the default. +.IP REG_BASIC +This is a synonym for 0, +provided as a counterpart to REG_EXTENDED to improve readability. +.IP REG_NOSPEC +Compile with recognition of all special characters turned off. +All characters are thus considered ordinary, +so the ``RE'' is a literal string. 
+This is an extension, +compatible with but not specified by POSIX 1003.2, +and should be used with +caution in software intended to be portable to other systems. +REG_EXTENDED and REG_NOSPEC may not be used +in the same call to +.IR regcomp . +.IP REG_ICASE +Compile for matching that ignores upper/lower case distinctions. +See +.ZR . +.IP REG_NOSUB +Compile for matching that need only report success or failure, +not what was matched. +.IP REG_NEWLINE +Compile for newline-sensitive matching. +By default, newline is a completely ordinary character with no special +meaning in either REs or strings. +With this flag, +`[^' bracket expressions and `.' never match newline, +a `^' anchor matches the null string after any newline in the string +in addition to its normal function, +and the `$' anchor matches the null string before any newline in the +string in addition to its normal function. +.IP REG_PEND +The regular expression ends, +not at the first NUL, +but just before the character pointed to by the +.I re_endp +member of the structure pointed to by +.IR preg . +The +.I re_endp +member is of type +.IR const\ char\ * . +This flag permits inclusion of NULs in the RE; +they are considered ordinary characters. +This is an extension, +compatible with but not specified by POSIX 1003.2, +and should be used with +caution in software intended to be portable to other systems. +.PP +When successful, +.I regcomp +returns 0 and fills in the structure pointed to by +.IR preg . +One member of that structure +(other than +.IR re_endp ) +is publicized: +.IR re_nsub , +of type +.IR size_t , +contains the number of parenthesized subexpressions within the RE +(except that the value of this member is undefined if the +REG_NOSUB flag was used). +If +.I regcomp +fails, it returns a non-zero error code; +see DIAGNOSTICS. 
+.PP +.I Regexec +matches the compiled RE pointed to by +.I preg +against the +.IR string , +subject to the flags in +.IR eflags , +and reports results using +.IR nmatch , +.IR pmatch , +and the returned value. +The RE must have been compiled by a previous invocation of +.IR regcomp . +The compiled form is not altered during execution of +.IR regexec , +so a single compiled RE can be used simultaneously by multiple threads. +.PP +By default, +the NUL-terminated string pointed to by +.I string +is considered to be the text of an entire line, minus any terminating +newline. +The +.I eflags +argument is the bitwise OR of zero or more of the following flags: +.IP REG_NOTBOL \w'REG_STARTEND'u+2n +The first character of +the string +is not the beginning of a line, so the `^' anchor should not match before it. +This does not affect the behavior of newlines under REG_NEWLINE. +.IP REG_NOTEOL +The NUL terminating +the string +does not end a line, so the `$' anchor should not match before it. +This does not affect the behavior of newlines under REG_NEWLINE. +.IP REG_STARTEND +The string is considered to start at +\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_so\fR +and to have a terminating NUL located at +\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_eo\fR +(there need not actually be a NUL at that location), +regardless of the value of +.IR nmatch . +See below for the definition of +.IR pmatch +and +.IR nmatch . +This is an extension, +compatible with but not specified by POSIX 1003.2, +and should be used with +caution in software intended to be portable to other systems. +Note that a non-zero \fIrm_so\fR does not imply REG_NOTBOL; +REG_STARTEND affects only the location of the string, +not how it is matched. +.PP +See +.ZR +for a discussion of what is matched in situations where an RE or a +portion thereof could match any of several substrings of +.IR string . +.PP +Normally, +.I regexec +returns 0 for success and the non-zero code REG_NOMATCH for failure. 
+Other non-zero error codes may be returned in exceptional situations; +see DIAGNOSTICS. +.PP +If REG_NOSUB was specified in the compilation of the RE, +or if +.I nmatch +is 0, +.I regexec +ignores the +.I pmatch +argument (but see below for the case where REG_STARTEND is specified). +Otherwise, +.I pmatch +points to an array of +.I nmatch +structures of type +.IR regmatch_t . +Such a structure has at least the members +.I rm_so +and +.IR rm_eo , +both of type +.I regoff_t +(a signed arithmetic type at least as large as an +.I off_t +and a +.IR ssize_t ), +containing respectively the offset of the first character of a substring +and the offset of the first character after the end of the substring. +Offsets are measured from the beginning of the +.I string +argument given to +.IR regexec . +An empty substring is denoted by equal offsets, +both indicating the character following the empty substring. +.PP +The 0th member of the +.I pmatch +array is filled in to indicate what substring of +.I string +was matched by the entire RE. +Remaining members report what substring was matched by parenthesized +subexpressions within the RE; +member +.I i +reports subexpression +.IR i , +with subexpressions counted (starting at 1) by the order of their opening +parentheses in the RE, left to right. +Unused entries in the array\(emcorresponding either to subexpressions that +did not participate in the match at all, or to subexpressions that do not +exist in the RE (that is, \fIi\fR\ > \fIpreg\fR\->\fIre_nsub\fR)\(emhave both +.I rm_so +and +.I rm_eo +set to \-1. +If a subexpression participated in the match several times, +the reported substring is the last one it matched. +(Note, as an example in particular, that when the RE `(b*)+' matches `bbb', +the parenthesized subexpression matches each of the three `b's and then +an infinite number of empty strings following the last `b', +so the reported substring is one of the empties.) 
+.PP +If REG_STARTEND is specified, +.I pmatch +must point to at least one +.I regmatch_t +(even if +.I nmatch +is 0 or REG_NOSUB was specified), +to hold the input offsets for REG_STARTEND. +Use for output is still entirely controlled by +.IR nmatch ; +if +.I nmatch +is 0 or REG_NOSUB was specified, +the value of +.IR pmatch [0] +will not be changed by a successful +.IR regexec . +.PP +.I Regerror +maps a non-zero +.I errcode +from either +.I regcomp +or +.I regexec +to a human-readable, printable message. +If +.I preg +is non-NULL, +the error code should have arisen from use of +the +.I regex_t +pointed to by +.IR preg , +and if the error code came from +.IR regcomp , +it should have been the result from the most recent +.I regcomp +using that +.IR regex_t . +.RI ( Regerror +may be able to supply a more detailed message using information +from the +.IR regex_t .) +.I Regerror +places the NUL-terminated message into the buffer pointed to by +.IR errbuf , +limiting the length (including the NUL) to at most +.I errbuf_size +bytes. +If the whole message won't fit, +as much of it as will fit before the terminating NUL is supplied. +In any case, +the returned value is the size of buffer needed to hold the whole +message (including terminating NUL). +If +.I errbuf_size +is 0, +.I errbuf +is ignored but the return value is still correct. +.PP +If the +.I errcode +given to +.I regerror +is first ORed with REG_ITOA, +the ``message'' that results is the printable name of the error code, +e.g. ``REG_NOMATCH'', +rather than an explanation thereof. +If +.I errcode +is REG_ATOI, +then +.I preg +shall be non-NULL and the +.I re_endp +member of the structure it points to +must point to the printable name of an error code; +in this case, the result in +.I errbuf +is the decimal digits of +the numeric value of the error code +(0 if the name is not recognized). 
+REG_ITOA and REG_ATOI are intended primarily as debugging facilities; +they are extensions, +compatible with but not specified by POSIX 1003.2, +and should be used with +caution in software intended to be portable to other systems. +Be warned also that they are considered experimental and changes are possible. +.PP +.I Regfree +frees any dynamically-allocated storage associated with the compiled RE +pointed to by +.IR preg . +The remaining +.I regex_t +is no longer a valid compiled RE +and the effect of supplying it to +.I regexec +or +.I regerror +is undefined. +.PP +None of these functions references global variables except for tables +of constants; +all are safe for use from multiple threads if the arguments are safe. +.SH IMPLEMENTATION CHOICES +There are a number of decisions that 1003.2 leaves up to the implementor, +either by explicitly saying ``undefined'' or by virtue of them being +forbidden by the RE grammar. +This implementation treats them as follows. +.PP +See +.ZR +for a discussion of the definition of case-independent matching. +.PP +There is no particular limit on the length of REs, +except insofar as memory is limited. +Memory usage is approximately linear in RE size, and largely insensitive +to RE complexity, except for bounded repetitions. +See BUGS for one short RE using them +that will run almost any system out of memory. +.PP +A backslashed character other than one specifically given a magic meaning +by 1003.2 (such magic meanings occur only in obsolete [``basic''] REs) +is taken as an ordinary character. +.PP +Any unmatched [ is a REG_EBRACK error. +.PP +Equivalence classes cannot begin or end bracket-expression ranges. +The endpoint of one range cannot begin another. +.PP +RE_DUP_MAX, the limit on repetition counts in bounded repetitions, is 255. +.PP +A repetition operator (?, *, +, or bounds) cannot follow another +repetition operator. +A repetition operator cannot begin an expression or subexpression +or follow `^' or `|'. 
+.PP +`|' cannot appear first or last in a (sub)expression or after another `|', +i.e. an operand of `|' cannot be an empty subexpression. +An empty parenthesized subexpression, `()', is legal and matches an +empty (sub)string. +An empty string is not a legal RE. +.PP +A `{' followed by a digit is considered the beginning of bounds for a +bounded repetition, which must then follow the syntax for bounds. +A `{' \fInot\fR followed by a digit is considered an ordinary character. +.PP +`^' and `$' beginning and ending subexpressions in obsolete (``basic'') +REs are anchors, not ordinary characters. +.SH SEE ALSO +grep(1), regex(7) +.PP +POSIX 1003.2, sections 2.8 (Regular Expression Notation) +and +B.5 (C Binding for Regular Expression Matching). +.SH DIAGNOSTICS +Non-zero error codes from +.I regcomp +and +.I regexec +include the following: +.PP +.nf +.ta \w'REG_ECOLLATE'u+3n +REG_NOMATCH	regexec() failed to match +REG_BADPAT	invalid regular expression +REG_ECOLLATE	invalid collating element +REG_ECTYPE	invalid character class +REG_EESCAPE	\e applied to unescapable character +REG_ESUBREG	invalid backreference number +REG_EBRACK	brackets [ ] not balanced +REG_EPAREN	parentheses ( ) not balanced +REG_EBRACE	braces { } not balanced +REG_BADBR	invalid repetition count(s) in { } +REG_ERANGE	invalid character range in [ ] +REG_ESPACE	ran out of memory +REG_BADRPT	?, *, or + operand invalid +REG_EMPTY	empty (sub)expression +REG_ASSERT	``can't happen''\(emyou found a bug +REG_INVARG	invalid argument, e.g. negative-length string +.fi +.SH HISTORY +Written by Henry Spencer at University of Toronto, +henry@zoo.toronto.edu. +.SH BUGS +This is an alpha release with known defects. +Please report problems. +.PP +There is one known functionality bug. +The implementation of internationalization is incomplete: +the locale is always assumed to be the default one of 1003.2, +and only the collating elements etc. of that locale are available. 
+.PP +The back-reference code is subtle and doubts linger about its correctness +in complex cases. +.PP +.I Regexec +performance is poor. +This will improve with later releases. +.I Nmatch +exceeding 0 is expensive; +.I nmatch +exceeding 1 is worse. +.I Regexec +is largely insensitive to RE complexity \fIexcept\fR that back +references are massively expensive. +RE length does matter; in particular, there is a strong speed bonus +for keeping RE length under about 30 characters, +with most special characters counting roughly double. +.PP +.I Regcomp +implements bounded repetitions by macro expansion, +which is costly in time and space if counts are large +or bounded repetitions are nested. +An RE like, say, +`((((a{1,100}){1,100}){1,100}){1,100}){1,100}' +will (eventually) run almost any existing machine out of swap space. +.PP +There are suspected problems with response to obscure error conditions. +Notably, +certain kinds of internal overflow, +produced only by truly enormous REs or by multiply nested bounded repetitions, +are probably not handled well. +.PP +Due to a mistake in 1003.2, things like `a)b' are legal REs because `)' is +a special character only in the presence of a previous unmatched `('. +This can't be fixed until the spec is fixed. +.PP +The standard's definition of back references is vague. +For example, does +`a\e(\e(b\e)*\e2\e)*d' match `abbbd'? +Until the standard is clarified, +behavior in such cases should not be relied on. +.PP +The implementation of word-boundary matching is a bit of a kludge, +and bugs may lurk in combinations of word-boundary matching and anchoring. 
diff --git a/ext/ereg/regex/regex.7 b/ext/ereg/regex/regex.7 new file mode 100644 index 0000000000..d89012bda1 --- /dev/null +++ b/ext/ereg/regex/regex.7 @@ -0,0 +1,233 @@ +.TH REGEX 7 "7 Feb 1994" +.BY "Henry Spencer" +.SH NAME +regex \- POSIX 1003.2 regular expressions +.SH DESCRIPTION +Regular expressions (``RE''s), +as defined in POSIX 1003.2, come in two forms: +modern REs (roughly those of +.IR egrep ; +1003.2 calls these ``extended'' REs) +and obsolete REs (roughly those of +.IR ed ; +1003.2 ``basic'' REs). +Obsolete REs mostly exist for backward compatibility in some old programs; +they will be discussed at the end. +1003.2 leaves some aspects of RE syntax and semantics open; +`\(dg' marks decisions on these aspects that +may not be fully portable to other 1003.2 implementations. +.PP +A (modern) RE is one\(dg or more non-empty\(dg \fIbranches\fR, +separated by `|'. +It matches anything that matches one of the branches. +.PP +A branch is one\(dg or more \fIpieces\fR, concatenated. +It matches a match for the first, followed by a match for the second, etc. +.PP +A piece is an \fIatom\fR possibly followed +by a single\(dg `*', `+', `?', or \fIbound\fR. +An atom followed by `*' matches a sequence of 0 or more matches of the atom. +An atom followed by `+' matches a sequence of 1 or more matches of the atom. +An atom followed by `?' matches a sequence of 0 or 1 matches of the atom. +.PP +A \fIbound\fR is `{' followed by an unsigned decimal integer, +possibly followed by `,' +possibly followed by another unsigned decimal integer, +always followed by `}'. +The integers must lie between 0 and RE_DUP_MAX (255\(dg) inclusive, +and if there are two of them, the first may not exceed the second. +An atom followed by a bound containing one integer \fIi\fR +and no comma matches +a sequence of exactly \fIi\fR matches of the atom. +An atom followed by a bound +containing one integer \fIi\fR and a comma matches +a sequence of \fIi\fR or more matches of the atom. 
+An atom followed by a bound +containing two integers \fIi\fR and \fIj\fR matches +a sequence of \fIi\fR through \fIj\fR (inclusive) matches of the atom. +.PP +An atom is a regular expression enclosed in `()' (matching a match for the +regular expression), +an empty set of `()' (matching the null string)\(dg, +a \fIbracket expression\fR (see below), `.' +(matching any single character), `^' (matching the null string at the +beginning of a line), `$' (matching the null string at the +end of a line), a `\e' followed by one of the characters +`^.[$()|*+?{\e' +(matching that character taken as an ordinary character), +a `\e' followed by any other character\(dg +(matching that character taken as an ordinary character, +as if the `\e' had not been present\(dg), +or a single character with no other significance (matching that character). +A `{' followed by a character other than a digit is an ordinary +character, not the beginning of a bound\(dg. +It is illegal to end an RE with `\e'. +.PP +A \fIbracket expression\fR is a list of characters enclosed in `[]'. +It normally matches any single character from the list (but see below). +If the list begins with `^', +it matches any single character +(but see below) \fInot\fR from the rest of the list. +If two characters in the list are separated by `\-', this is shorthand +for the full \fIrange\fR of characters between those two (inclusive) in the +collating sequence, +e.g. `[0-9]' in ASCII matches any decimal digit. +It is illegal\(dg for two ranges to share an +endpoint, e.g. `a-c-e'. +Ranges are very collating-sequence-dependent, +and portable programs should avoid relying on them. +.PP +To include a literal `]' in the list, make it the first character +(following a possible `^'). +To include a literal `\-', make it the first or last character, +or the second endpoint of a range. +To use a literal `\-' as the first endpoint of a range, +enclose it in `[.' and `.]' to make it a collating element (see below). 
+With the exception of these and some combinations using `[' (see next +paragraphs), all other special characters, including `\e', lose their +special significance within a bracket expression. +.PP +Within a bracket expression, a collating element (a character, +a multi-character sequence that collates as if it were a single character, +or a collating-sequence name for either) +enclosed in `[.' and `.]' stands for the +sequence of characters of that collating element. +The sequence is a single element of the bracket expression's list. +A bracket expression containing a multi-character collating element  +can thus match more than one character, +e.g. if the collating sequence includes a `ch' collating element, +then the RE `[[.ch.]]*c' matches the first five characters +of `chchcc'. +.PP +Within a bracket expression, a collating element enclosed in `[=' and +`=]' is an equivalence class, standing for the sequences of characters +of all collating elements equivalent to that one, including itself. +(If there are no other equivalent collating elements, +the treatment is as if the enclosing delimiters were `[.' and `.]'.) +For example, if o and \o'o^' are the members of an equivalence class, +then `[[=o=]]', `[[=\o'o^'=]]', and `[o\o'o^']' are all synonymous. +An equivalence class may not\(dg be an endpoint +of a range. +.PP +Within a bracket expression, the name of a \fIcharacter class\fR enclosed +in `[:' and `:]' stands for the list of all characters belonging to that +class. +Standard character class names are: +.PP +.RS +.nf +.ta 3c 6c 9c +alnum	digit	punct +alpha	graph	space +blank	lower	upper +cntrl	print	xdigit +.fi +.RE +.PP +These stand for the character classes defined in +.IR ctype (3). +A locale may provide others. +A character class may not be used as an endpoint of a range. +.PP +There are two special cases\(dg of bracket expressions: +the bracket expressions `[[:<:]]' and `[[:>:]]' match the null string at +the beginning and end of a word respectively. 
+A word is defined as a sequence of +word characters +which is neither preceded nor followed by +word characters. +A word character is an +.I alnum +character (as defined by +.IR ctype (3)) +or an underscore. +This is an extension, +compatible with but not specified by POSIX 1003.2, +and should be used with +caution in software intended to be portable to other systems. +.PP +In the event that an RE could match more than one substring of a given +string, +the RE matches the one starting earliest in the string. +If the RE could match more than one substring starting at that point, +it matches the longest. +Subexpressions also match the longest possible substrings, subject to +the constraint that the whole match be as long as possible, +with subexpressions starting earlier in the RE taking priority over +ones starting later. +Note that higher-level subexpressions thus take priority over +their lower-level component subexpressions. +.PP +Match lengths are measured in characters, not collating elements. +A null string is considered longer than no match at all. +For example, +`bb*' matches the three middle characters of `abbbc', +`(wee|week)(knights|nights)' matches all ten characters of `weeknights', +when `(.*).*' is matched against `abc' the parenthesized subexpression +matches all three characters, and +when `(a*)*' is matched against `bc' both the whole RE and the parenthesized +subexpression match the null string. +.PP +If case-independent matching is specified, +the effect is much as if all case distinctions had vanished from the +alphabet. +When an alphabetic that exists in multiple cases appears as an +ordinary character outside a bracket expression, it is effectively +transformed into a bracket expression containing both cases, +e.g. `x' becomes `[xX]'. +When it appears inside a bracket expression, all case counterparts +of it are added to the bracket expression, so that (e.g.) `[x]' +becomes `[xX]' and `[^x]' becomes `[^xX]'. 
+.PP +No particular limit is imposed on the length of REs\(dg. +Programs intended to be portable should not employ REs longer +than 256 bytes, +as an implementation can refuse to accept such REs and remain +POSIX-compliant. +.PP +Obsolete (``basic'') regular expressions differ in several respects. +`|', `+', and `?' are ordinary characters and there is no equivalent +for their functionality. +The delimiters for bounds are `\e{' and `\e}', +with `{' and `}' by themselves ordinary characters. +The parentheses for nested subexpressions are `\e(' and `\e)', +with `(' and `)' by themselves ordinary characters. +`^' is an ordinary character except at the beginning of the +RE or\(dg the beginning of a parenthesized subexpression, +`$' is an ordinary character except at the end of the +RE or\(dg the end of a parenthesized subexpression, +and `*' is an ordinary character if it appears at the beginning of the +RE or the beginning of a parenthesized subexpression +(after a possible leading `^'). +Finally, there is one new type of atom, a \fIback reference\fR: +`\e' followed by a non-zero decimal digit \fId\fR +matches the same sequence of characters +matched by the \fId\fRth parenthesized subexpression +(numbering subexpressions by the positions of their opening parentheses, +left to right), +so that (e.g.) `\e([bc]\e)\e1' matches `bb' or `cc' but not `bc'. +.SH SEE ALSO +regex(3) +.PP +POSIX 1003.2, section 2.8 (Regular Expression Notation). +.SH BUGS +Having two kinds of REs is a botch. +.PP +The current 1003.2 spec says that `)' is an ordinary character in +the absence of an unmatched `('; +this was an unintentional result of a wording error, +and change is likely. +Avoid relying on it. +.PP +Back references are a dreadful botch, +posing major problems for efficient implementations. +They are also somewhat vaguely defined +(does +`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?). +Avoid using them. +.PP +1003.2's specification of case-independent matching is vague. 
+The ``one case implies all cases'' definition given above +is current consensus among implementors as to the right interpretation. +.PP +The syntax for word boundaries is incredibly ugly. diff --git a/ext/ereg/regex/regex.dsp b/ext/ereg/regex/regex.dsp new file mode 100644 index 0000000000..e8f1ad4299 --- /dev/null +++ b/ext/ereg/regex/regex.dsp @@ -0,0 +1,106 @@ +# Microsoft Developer Studio Project File - Name="regex" - Package Owner=<4> +# Microsoft Developer Studio Generated Build File, Format Version 5.00 +# ** DO NOT EDIT ** + +# TARGTYPE "Win32 (x86) Dynamic-Link Library" 0x0102 + +CFG=regex - Win32 Debug +!MESSAGE This is not a valid makefile. To build this project using NMAKE, +!MESSAGE use the Export Makefile command and run +!MESSAGE  +!MESSAGE NMAKE /f "regex.mak". +!MESSAGE  +!MESSAGE You can specify a configuration when running NMAKE +!MESSAGE by defining the macro CFG on the command line. For example: +!MESSAGE  +!MESSAGE NMAKE /f "regex.mak" CFG="regex - Win32 Debug" +!MESSAGE  +!MESSAGE Possible choices for configuration are: +!MESSAGE  +!MESSAGE "regex - Win32 Release" (based on "Win32 (x86) Dynamic-Link Library") +!MESSAGE "regex - Win32 Debug" (based on "Win32 (x86) Dynamic-Link Library") +!MESSAGE  + +# Begin Project +# PROP Scc_ProjName "" +# PROP Scc_LocalPath "" +CPP=cl.exe +MTL=midl.exe +RSC=rc.exe + +!IF  "$(CFG)" == "regex - Win32 Release" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 0 +# PROP BASE Output_Dir "Release" +# PROP BASE Intermediate_Dir "Release" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 0 +# PROP Output_Dir "Release" +# PROP Intermediate_Dir "Release" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +# ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /YX /FD /c +# ADD CPP /nologo /MT /W3 /GX /O2 /I "." 
/D "WIN32" /D "NDEBUG" /D "_WINDOWS" /YX /FD /c +# ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /o NUL /win32 +# ADD MTL /nologo /D "NDEBUG" /mktyplib203 /o NUL /win32 +# ADD BASE RSC /l 0x409 /d "NDEBUG" +# ADD RSC /l 0x409 /d "NDEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:windows /dll /machine:I386 +# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:windows /dll /machine:I386 + +!ELSEIF  "$(CFG)" == "regex - Win32 Debug" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 1 +# PROP BASE Output_Dir "Debug" +# PROP BASE Intermediate_Dir "Debug" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 1 +# PROP Output_Dir "Debug" +# PROP Intermediate_Dir "Debug" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /Zi /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /YX /FD /c +# ADD CPP /nologo /MTd /W3 /Gm /GX /Zi /Od /I "." 
/D "WIN32" /D "_DEBUG" /D "_WINDOWS" /YX /FD /c +# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /o NUL /win32 +# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /o NUL /win32 +# ADD BASE RSC /l 0x409 /d "_DEBUG" +# ADD RSC /l 0x409 /d "_DEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:windows /dll /debug /machine:I386 /pdbtype:sept +# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:windows /dll /debug /machine:I386 /pdbtype:sept + +!ENDIF  + +# Begin Target + +# Name "regex - Win32 Release" +# Name "regex - Win32 Debug" +# Begin Source File + +SOURCE=.\regcomp.c +# End Source File +# Begin Source File + +SOURCE=.\regerror.c +# End Source File +# Begin Source File + +SOURCE=.\regexec.c +# End Source File +# Begin Source File + +SOURCE=.\regfree.c +# End Source File +# End Target +# End Project diff --git a/ext/ereg/regex/regex.dsw b/ext/ereg/regex/regex.dsw new file mode 100644 index 0000000000..7b7df8126c --- /dev/null +++ b/ext/ereg/regex/regex.dsw @@ -0,0 +1,29 @@ +Microsoft Developer Studio Workspace File, Format Version 5.00 +# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE! 
+ +############################################################################### + +Project: "regex"=.\regex.dsp - Package Owner=<4> + +Package=<5> +{{{ +}}} + +Package=<4> +{{{ +}}} + +############################################################################### + +Global: + +Package=<5> +{{{ +}}} + +Package=<3> +{{{ +}}} + +############################################################################### + diff --git a/ext/ereg/regex/regex.h b/ext/ereg/regex/regex.h new file mode 100644 index 0000000000..b39c5e178c --- /dev/null +++ b/ext/ereg/regex/regex.h @@ -0,0 +1,83 @@ +#ifndef _HSREGEX_H_ +#define _HSREGEX_H_ +#ifndef _HSREGEX_H +#define	_HSREGEX_H	/* never again */ +/* ========= begin header generated by ././mkh ========= */ +#ifdef __cplusplus +extern "C" { +#endif + +/* === regex2.h === */ +#ifdef WIN32 +#define API_EXPORT(type)    __declspec(dllexport) type __stdcall +#else +#define API_EXPORT(type)    type +#endif + +typedef off_t regoff_t; +typedef struct { +	int re_magic; +	size_t re_nsub;		/* number of parenthesized subexpressions */ +	const char *re_endp;	/* end pointer for REG_PEND */ +	struct re_guts *re_g;	/* none of your business :-) */ +} regex_t; +typedef struct { +	regoff_t rm_so;		/* start of match */ +	regoff_t rm_eo;		/* end of match */ +} regmatch_t; + + +/* === regcomp.c === */ +API_EXPORT(int) regcomp(regex_t *, const char *, int); +#define	REG_BASIC	0000 +#define	REG_EXTENDED	0001 +#define	REG_ICASE	0002 +#define	REG_NOSUB	0004 +#define	REG_NEWLINE	0010 +#define	REG_NOSPEC	0020 +#define	REG_PEND	0040 +#define	REG_DUMP	0200 + + +/* === regerror.c === */ +#define	REG_OKAY	 0 +#define	REG_NOMATCH	 1 +#define	REG_BADPAT	 2 +#define	REG_ECOLLATE	 3 +#define	REG_ECTYPE	 4 +#define	REG_EESCAPE	 5 +#define	REG_ESUBREG	 6 +#define	REG_EBRACK	 7 +#define	REG_EPAREN	 8 +#define	REG_EBRACE	 9 +#define	REG_BADBR	10 +#define	REG_ERANGE	11 +#define	REG_ESPACE	12 +#define	REG_BADRPT	13 +#define	REG_EMPTY	14 +#define	REG_ASSERT	15 +#define	
REG_INVARG	16 +#define	REG_ATOI	255	/* convert name to number (!) */ +#define	REG_ITOA	0400	/* convert number to name (!) */ +API_EXPORT(size_t) regerror(int, const regex_t *, char *, size_t); + + +/* === regexec.c === */ +API_EXPORT(int) regexec(const regex_t *, const char *, size_t, regmatch_t [], int); +#define	REG_NOTBOL	00001 +#define	REG_NOTEOL	00002 +#define	REG_STARTEND	00004 +#define	REG_TRACE	00400	/* tracing of execution */ +#define	REG_LARGE	01000	/* force large representation */ +#define	REG_BACKR	02000	/* force use of backref code */ + + +/* === regfree.c === */ +API_EXPORT(void) regfree(regex_t *); + +#ifdef __cplusplus +} +#endif +/* ========= end header generated by ././mkh ========= */ +#endif +#endif diff --git a/ext/ereg/regex/regex.mak b/ext/ereg/regex/regex.mak new file mode 100644 index 0000000000..b87ded340b --- /dev/null +++ b/ext/ereg/regex/regex.mak @@ -0,0 +1,304 @@ +# Microsoft Developer Studio Generated NMAKE File, Based on regex.dsp +!IF "$(CFG)" == "" +CFG=regex - Win32 Release +!MESSAGE No configuration specified. Defaulting to regex - Win32 Release. +!ENDIF  + +!IF "$(CFG)" != "regex - Win32 Release" && "$(CFG)" != "regex - Win32 Debug" +!MESSAGE Invalid configuration "$(CFG)" specified. +!MESSAGE You can specify a configuration when running NMAKE +!MESSAGE by defining the macro CFG on the command line. For example: +!MESSAGE  +!MESSAGE NMAKE /f "regex.mak" CFG="regex - Win32 Release" +!MESSAGE  +!MESSAGE Possible choices for configuration are: +!MESSAGE  +!MESSAGE "regex - Win32 Release" (based on "Win32 (x86) Static Library") +!MESSAGE "regex - Win32 Debug" (based on "Win32 (x86) Static Library") +!MESSAGE  +!ERROR An invalid configuration is specified. 
+!ENDIF  + +!IF "$(OS)" == "Windows_NT" +NULL= +!ELSE  +NULL=nul +!ENDIF  + +CPP=cl.exe + +!IF  "$(CFG)" == "regex - Win32 Release" + +OUTDIR=.\Release +INTDIR=.\Release +# Begin Custom Macros +OutDir=.\.\Release +# End Custom Macros + +!IF "$(RECURSE)" == "0"  + +ALL : "$(OUTDIR)\regex.lib" + +!ELSE  + +ALL : "$(OUTDIR)\regex.lib" + +!ENDIF  + +CLEAN : +	-@erase "$(INTDIR)\regcomp.obj" +	-@erase "$(INTDIR)\regerror.obj" +	-@erase "$(INTDIR)\regexec.obj" +	-@erase "$(INTDIR)\regfree.obj" +	-@erase "$(INTDIR)\vc50.idb" +	-@erase "$(OUTDIR)\regex.lib" + +"$(OUTDIR)" : +    if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)" + +CPP_PROJ=/nologo /MD /W3 /GX /O2 /I "." /D "WIN32" /D "NDEBUG" /D "_WINDOWS"\ + /Fp"$(INTDIR)\regex.pch" /YX /Fo"$(INTDIR)\\" /Fd"$(INTDIR)\\" /FD /c  +CPP_OBJS=.\Release/ +CPP_SBRS=. +BSC32=bscmake.exe +BSC32_FLAGS=/nologo /o"$(OUTDIR)\regex.bsc"  +BSC32_SBRS= \ +	 +LIB32=link.exe -lib +LIB32_FLAGS=/nologo /out:"$(OUTDIR)\regex.lib"  +LIB32_OBJS= \ +	"$(INTDIR)\regcomp.obj" \ +	"$(INTDIR)\regerror.obj" \ +	"$(INTDIR)\regexec.obj" \ +	"$(INTDIR)\regfree.obj" + +"$(OUTDIR)\regex.lib" : "$(OUTDIR)" $(DEF_FILE) $(LIB32_OBJS) +    $(LIB32) @<< +  $(LIB32_FLAGS) $(DEF_FLAGS) $(LIB32_OBJS) +<< + +!ELSEIF  "$(CFG)" == "regex - Win32 Debug" + +OUTDIR=.\Debug +INTDIR=.\Debug +# Begin Custom Macros +OutDir=.\.\Debug +# End Custom Macros + +!IF "$(RECURSE)" == "0"  + +ALL : "$(OUTDIR)\regex.lib" "$(OUTDIR)\regex.bsc" + +!ELSE  + +ALL : "$(OUTDIR)\regex.lib" "$(OUTDIR)\regex.bsc" + +!ENDIF  + +CLEAN : +	-@erase "$(INTDIR)\regcomp.obj" +	-@erase "$(INTDIR)\regcomp.sbr" +	-@erase "$(INTDIR)\regerror.obj" +	-@erase "$(INTDIR)\regerror.sbr" +	-@erase "$(INTDIR)\regexec.obj" +	-@erase "$(INTDIR)\regexec.sbr" +	-@erase "$(INTDIR)\regfree.obj" +	-@erase "$(INTDIR)\regfree.sbr" +	-@erase "$(INTDIR)\vc50.idb" +	-@erase "$(OUTDIR)\regex.bsc" +	-@erase "$(OUTDIR)\regex.lib" + +"$(OUTDIR)" : +    if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)" + +CPP_PROJ=/nologo 
/MDd /W3 /GX /Z7 /Od /I "." /D "WIN32" /D "_DEBUG" /D\ + "_WINDOWS" /FR"$(INTDIR)\\" /Fp"$(INTDIR)\regex.pch" /YX /Fo"$(INTDIR)\\"\ + /Fd"$(INTDIR)\\" /FD /c  +CPP_OBJS=.\Debug/ +CPP_SBRS=.\Debug/ +BSC32=bscmake.exe +BSC32_FLAGS=/nologo /o"$(OUTDIR)\regex.bsc"  +BSC32_SBRS= \ +	"$(INTDIR)\regcomp.sbr" \ +	"$(INTDIR)\regerror.sbr" \ +	"$(INTDIR)\regexec.sbr" \ +	"$(INTDIR)\regfree.sbr" + +"$(OUTDIR)\regex.bsc" : "$(OUTDIR)" $(BSC32_SBRS) +    $(BSC32) @<< +  $(BSC32_FLAGS) $(BSC32_SBRS) +<< + +LIB32=link.exe -lib +LIB32_FLAGS=/nologo /out:"$(OUTDIR)\regex.lib"  +LIB32_OBJS= \ +	"$(INTDIR)\regcomp.obj" \ +	"$(INTDIR)\regerror.obj" \ +	"$(INTDIR)\regexec.obj" \ +	"$(INTDIR)\regfree.obj" + +"$(OUTDIR)\regex.lib" : "$(OUTDIR)" $(DEF_FILE) $(LIB32_OBJS) +    $(LIB32) @<< +  $(LIB32_FLAGS) $(DEF_FLAGS) $(LIB32_OBJS) +<< + +!ENDIF  + +.c{$(CPP_OBJS)}.obj:: +   $(CPP) @<< +   $(CPP_PROJ) $<  +<< + +.cpp{$(CPP_OBJS)}.obj:: +   $(CPP) @<< +   $(CPP_PROJ) $<  +<< + +.cxx{$(CPP_OBJS)}.obj:: +   $(CPP) @<< +   $(CPP_PROJ) $<  +<< + +.c{$(CPP_SBRS)}.sbr:: +   $(CPP) @<< +   $(CPP_PROJ) $<  +<< + +.cpp{$(CPP_SBRS)}.sbr:: +   $(CPP) @<< +   $(CPP_PROJ) $<  +<< + +.cxx{$(CPP_SBRS)}.sbr:: +   $(CPP) @<< +   $(CPP_PROJ) $<  +<< + + +!IF "$(CFG)" == "regex - Win32 Release" || "$(CFG)" == "regex - Win32 Debug" +SOURCE=.\regcomp.c + +!IF  "$(CFG)" == "regex - Win32 Release" + +DEP_CPP_REGCO=\ +	".\cclass.h"\ +	".\cname.h"\ +	".\regcomp.ih"\ +	".\regex.h"\ +	".\regex2.h"\ +	".\utils.h"\ +	 + +"$(INTDIR)\regcomp.obj" : $(SOURCE) $(DEP_CPP_REGCO) "$(INTDIR)" + + +!ELSEIF  "$(CFG)" == "regex - Win32 Debug" + +DEP_CPP_REGCO=\ +	".\cclass.h"\ +	".\cname.h"\ +	".\regcomp.ih"\ +	".\regex.h"\ +	".\regex2.h"\ +	".\utils.h"\ +	{$(INCLUDE)}"sys\types.h"\ +	 + +"$(INTDIR)\regcomp.obj"	"$(INTDIR)\regcomp.sbr" : $(SOURCE) $(DEP_CPP_REGCO)\ + "$(INTDIR)" + + +!ENDIF  + +SOURCE=.\regerror.c + +!IF  "$(CFG)" == "regex - Win32 Release" + +DEP_CPP_REGER=\ +	".\regerror.ih"\ +	".\regex.h"\ +	".\utils.h"\ +	 
+ +"$(INTDIR)\regerror.obj" : $(SOURCE) $(DEP_CPP_REGER) "$(INTDIR)" + + +!ELSEIF  "$(CFG)" == "regex - Win32 Debug" + +DEP_CPP_REGER=\ +	".\regerror.ih"\ +	".\regex.h"\ +	".\utils.h"\ +	{$(INCLUDE)}"sys\types.h"\ +	 + +"$(INTDIR)\regerror.obj"	"$(INTDIR)\regerror.sbr" : $(SOURCE) $(DEP_CPP_REGER)\ + "$(INTDIR)" + + +!ENDIF  + +SOURCE=.\regexec.c + +!IF  "$(CFG)" == "regex - Win32 Release" + +DEP_CPP_REGEX=\ +	".\engine.c"\ +	".\engine.ih"\ +	".\regex.h"\ +	".\regex2.h"\ +	".\utils.h"\ +	 + +"$(INTDIR)\regexec.obj" : $(SOURCE) $(DEP_CPP_REGEX) "$(INTDIR)" + + +!ELSEIF  "$(CFG)" == "regex - Win32 Debug" + +DEP_CPP_REGEX=\ +	".\engine.c"\ +	".\engine.ih"\ +	".\regex.h"\ +	".\regex2.h"\ +	".\utils.h"\ +	{$(INCLUDE)}"sys\types.h"\ +	 + +"$(INTDIR)\regexec.obj"	"$(INTDIR)\regexec.sbr" : $(SOURCE) $(DEP_CPP_REGEX)\ + "$(INTDIR)" + + +!ENDIF  + +SOURCE=.\regfree.c + +!IF  "$(CFG)" == "regex - Win32 Release" + +DEP_CPP_REGFR=\ +	".\regex.h"\ +	".\regex2.h"\ +	".\utils.h"\ +	 + +"$(INTDIR)\regfree.obj" : $(SOURCE) $(DEP_CPP_REGFR) "$(INTDIR)" + + +!ELSEIF  "$(CFG)" == "regex - Win32 Debug" + +DEP_CPP_REGFR=\ +	".\regex.h"\ +	".\regex2.h"\ +	".\utils.h"\ +	{$(INCLUDE)}"sys\types.h"\ +	 + +"$(INTDIR)\regfree.obj"	"$(INTDIR)\regfree.sbr" : $(SOURCE) $(DEP_CPP_REGFR)\ + "$(INTDIR)" + + +!ENDIF  + +SOURCE=.\engine.c + +!ENDIF  + diff --git a/ext/ereg/regex/regex2.h b/ext/ereg/regex/regex2.h new file mode 100644 index 0000000000..4996f96ecd --- /dev/null +++ b/ext/ereg/regex/regex2.h @@ -0,0 +1,140 @@ +/* + * First, the stuff that ends up in the outside-world include file + = #ifdef WIN32 + = #define API_EXPORT(type)    __declspec(dllexport) type __stdcall + = #else + = #define API_EXPORT(type)    type + = #endif + = + = typedef off_t regoff_t; + = typedef struct { + = 	int re_magic; + = 	size_t re_nsub;		// number of parenthesized subexpressions + = 	const unsigned char *re_endp;	// end pointer for REG_PEND + = 	struct re_guts *re_g;	// none of your business :-) + = } regex_t; + 
= typedef struct { + = 	regoff_t rm_so;		// start of match + = 	regoff_t rm_eo;		// end of match + = } regmatch_t; + */ +/* + * internals of regex_t + */ +#define	MAGIC1	((('r'^0200)<<8) | 'e') + +/* + * The internal representation is a *strip*, a sequence of + * operators ending with an endmarker.  (Some terminology etc. is a + * historical relic of earlier versions which used multiple strips.) + * Certain oddities in the representation are there to permit running + * the machinery backwards; in particular, any deviation from sequential + * flow must be marked at both its source and its destination.  Some + * fine points: + * + * - OPLUS_ and O_PLUS are *inside* the loop they create. + * - OQUEST_ and O_QUEST are *outside* the bypass they create. + * - OCH_ and O_CH are *outside* the multi-way branch they create, while + *   OOR1 and OOR2 are respectively the end and the beginning of one of + *   the branches.  Note that there is an implicit OOR2 following OCH_ + *   and an implicit OOR1 preceding O_CH. + * + * In state representations, an operator's bit is on to signify a state + * immediately *preceding* "execution" of that operator. + */ +typedef long sop;		/* strip operator */ +typedef long sopno; +#define	OPRMASK	0x7c000000 +#define	OPDMASK	0x03ffffff +#define	OPSHIFT	(26) +#define	OP(n)	((n)&OPRMASK) +#define	OPND(n)	((n)&OPDMASK) +#define	SOP(op, opnd)	((op)|(opnd)) +/* operators			   meaning	operand			*/ +/*						(back, fwd are offsets)	*/ +#define	OEND	(1<<OPSHIFT)	/* endmarker	-			*/ +#define	OCHAR	(2<<OPSHIFT)	/* character	unsigned char		*/ +#define	OBOL	(3<<OPSHIFT)	/* left anchor	-			*/ +#define	OEOL	(4<<OPSHIFT)	/* right anchor	-			*/ +#define	OANY	(5<<OPSHIFT)	/* .		-			*/ +#define	OANYOF	(6<<OPSHIFT)	/* [...]	
set number		*/ +#define	OBACK_	(7<<OPSHIFT)	/* begin \d	paren number		*/ +#define	O_BACK	(8<<OPSHIFT)	/* end \d	paren number		*/ +#define	OPLUS_	(9<<OPSHIFT)	/* + prefix	fwd to suffix		*/ +#define	O_PLUS	(10<<OPSHIFT)	/* + suffix	back to prefix		*/ +#define	OQUEST_	(11<<OPSHIFT)	/* ? prefix	fwd to suffix		*/ +#define	O_QUEST	(12<<OPSHIFT)	/* ? suffix	back to prefix		*/ +#define	OLPAREN	(13<<OPSHIFT)	/* (		fwd to )		*/ +#define	ORPAREN	(14<<OPSHIFT)	/* )		back to (		*/ +#define	OCH_	(15<<OPSHIFT)	/* begin choice	fwd to OOR2		*/ +#define	OOR1	(16<<OPSHIFT)	/* | pt. 1	back to OOR1 or OCH_	*/ +#define	OOR2	(17<<OPSHIFT)	/* | pt. 2	fwd to OOR2 or O_CH	*/ +#define	O_CH	(18<<OPSHIFT)	/* end choice	back to OOR1		*/ +#define	OBOW	(19<<OPSHIFT)	/* begin word	-			*/ +#define	OEOW	(20<<OPSHIFT)	/* end word	-			*/ + +/* + * Structure for [] character-set representation.  Character sets are + * done as bit vectors, grouped 8 to a byte vector for compactness. + * The individual set therefore has both a pointer to the byte vector + * and a mask to pick out the relevant bit of each byte.  A hash code + * simplifies testing whether two sets could be identical. + * + * This will get trickier for multicharacter collating elements.  As + * preliminary hooks for dealing with such things, we also carry along + * a string of multi-character elements, and decide the size of the + * vectors at run time. 
+ */ +typedef struct { +	uch *ptr;		/* -> uch [csetsize] */ +	uch mask;		/* bit within array */ +	uch hash;		/* hash code */ +	size_t smultis; +	unsigned char *multis;		/* -> char[smulti]  ab\0cd\0ef\0\0 */ +} cset; +/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */ +#define	CHadd(cs, c)	((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c)) +#define	CHsub(cs, c)	((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c)) +#define	CHIN(cs, c)	((cs)->ptr[(uch)(c)] & (cs)->mask) +#define	MCadd(p, cs, cp)	mcadd(p, cs, cp)	/* regcomp() internal fns */ +#define	MCsub(p, cs, cp)	mcsub(p, cs, cp) +#define	MCin(p, cs, cp)	mcin(p, cs, cp) + +/* stuff for character categories */ +typedef unsigned char cat_t; + +/* + * main compiled-expression structure + */ +struct re_guts { +	int magic; +#		define	MAGIC2	((('R'^0200)<<8)|'E') +	sop *strip;		/* malloced area for strip */ +	int csetsize;		/* number of bits in a cset vector */ +	int ncsets;		/* number of csets in use */ +	cset *sets;		/* -> cset [ncsets] */ +	uch *setbits;		/* -> uch[csetsize][ncsets/CHAR_BIT] */ +	int cflags;		/* copy of regcomp() cflags argument */ +	sopno nstates;		/* = number of sops */ +	sopno firststate;	/* the initial OEND (normally 0) */ +	sopno laststate;	/* the final OEND */ +	int iflags;		/* internal flags */ +#		define	USEBOL	01	/* used ^ */ +#		define	USEEOL	02	/* used $ */ +#		define	BAD	04	/* something wrong */ +	int nbol;		/* number of ^ used */ +	int neol;		/* number of $ used */ +	int ncategories;	/* how many character categories */ +	cat_t *categories;	/* ->catspace[-UCHAR_MIN] */ +	unsigned char *must;		/* match must contain this string */ +	int mlen;		/* length of must */ +	size_t nsub;		/* copy of re_nsub */ +	int backrefs;		/* does it use back references? */ +	sopno nplus;		/* how deep does it nest +s? 
*/ +	/* catspace must be last */ +	cat_t catspace[1];	/* actually [NC] */ +}; + +/* misc utilities */ +#define	OUT	(UCHAR_MAX+1)	/* a non-character value */ +#define	ISWORD(c)	(isalnum(c) || (c) == '_') diff --git a/ext/ereg/regex/regexec.c b/ext/ereg/regex/regexec.c new file mode 100644 index 0000000000..bbfe094c96 --- /dev/null +++ b/ext/ereg/regex/regexec.c @@ -0,0 +1,138 @@ +/* + * the outer shell of regexec() + * + * This file includes engine.c *twice*, after muchos fiddling with the + * macros that code uses.  This lets the same code operate on two different + * representations for state sets. + */ +#include <sys/types.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include <ctype.h> +#include <regex.h> + +#include "utils.h" +#include "regex2.h" + +#define PHP_REGEX_NOPE 0;		/* for use in asserts; shuts lint up */ + +/* macros for manipulating states, small version */ +#define	states	unsigned +#define	states1	unsigned	/* for later use in regexec() decision */ +#define	CLEAR(v)	((v) = 0) +#define	SET0(v, n)	((v) &= ~((unsigned)1 << (n))) +#define	SET1(v, n)	((v) |= (unsigned)1 << (n)) +#define	ISSET(v, n)	((v) & ((unsigned)1 << (n))) +#define	ASSIGN(d, s)	((d) = (s)) +#define	EQ(a, b)	((a) == (b)) +#define	STATEVARS	int dummy	/* dummy version */ +#define	STATESETUP(m, n)	/* nothing */ +#define	STATETEARDOWN(m)	/* nothing */ +#define	SETUP(v)	((v) = 0) +#define	onestate	unsigned +#define	INIT(o, n)	((o) = (unsigned)1 << (n)) +#define	INC(o)	((o) <<= 1) +#define	ISSTATEIN(v, o)	((v) & (o)) +/* some abbreviations; note that some of these know variable names! 
*/ +/* do "if I'm here, I can also be there" etc without branches */ +#define	FWD(dst, src, n)	((dst) |= ((unsigned)(src)&(here)) << (n)) +#define	BACK(dst, src, n)	((dst) |= ((unsigned)(src)&(here)) >> (n)) +#define	ISSETBACK(v, n)	((v) & ((unsigned)here >> (n))) +/* function names */ +#define SNAMES			/* engine.c looks after details */ + +#include "engine.c" + +/* now undo things */ +#undef	states +#undef	CLEAR +#undef	SET0 +#undef	SET1 +#undef	ISSET +#undef	ASSIGN +#undef	EQ +#undef	STATEVARS +#undef	STATESETUP +#undef	STATETEARDOWN +#undef	SETUP +#undef	onestate +#undef	INIT +#undef	INC +#undef	ISSTATEIN +#undef	FWD +#undef	BACK +#undef	ISSETBACK +#undef	SNAMES + +/* macros for manipulating states, large version */ +#define	states	unsigned char * +#define	CLEAR(v)	memset(v, 0, m->g->nstates) +#define	SET0(v, n)	((v)[n] = 0) +#define	SET1(v, n)	((v)[n] = 1) +#define	ISSET(v, n)	((v)[n]) +#define	ASSIGN(d, s)	memcpy(d, s, m->g->nstates) +#define	EQ(a, b)	(memcmp(a, b, m->g->nstates) == 0) +#define	STATEVARS	int vn; unsigned char *space +#define	STATESETUP(m, nv)	{ (m)->space = malloc((nv)*(m)->g->nstates); \ +				if ((m)->space == NULL) return(REG_ESPACE); \ +				(m)->vn = 0; } +#define	STATETEARDOWN(m)	{ free((m)->space); } +#define	SETUP(v)	((v) = &m->space[m->vn++ * m->g->nstates]) +#define	onestate	int +#define	INIT(o, n)	((o) = (n)) +#define	INC(o)	((o)++) +#define	ISSTATEIN(v, o)	((v)[o]) +/* some abbreviations; note that some of these know variable names! 
*/ +/* do "if I'm here, I can also be there" etc without branches */ +#define	FWD(dst, src, n)	((dst)[here+(n)] |= (src)[here]) +#define	BACK(dst, src, n)	((dst)[here-(n)] |= (src)[here]) +#define	ISSETBACK(v, n)	((v)[here - (n)]) +/* function names */ +#define	LNAMES			/* flag */ + +#include "engine.c" + +/* + - regexec - interface for matching + = API_EXPORT(int) regexec(const regex_t *, const char *, size_t, \ + =					regmatch_t [], int); + = #define	REG_NOTBOL	00001 + = #define	REG_NOTEOL	00002 + = #define	REG_STARTEND	00004 + = #define	REG_TRACE	00400	// tracing of execution + = #define	REG_LARGE	01000	// force large representation + = #define	REG_BACKR	02000	// force use of backref code + * + * We put this here so we can exploit knowledge of the state representation + * when choosing which matcher to call.  Also, by this point the matchers + * have been prototyped. + */ +API_EXPORT(int)				/* 0 success, REG_NOMATCH failure */ +regexec(preg, string, nmatch, pmatch, eflags) +const regex_t *preg; +const char *string; +size_t nmatch; +regmatch_t pmatch[]; +int eflags; +{ +	register struct re_guts *g = preg->re_g; +#ifdef REDEBUG +#	define	GOODFLAGS(f)	(f) +#else +#	define	GOODFLAGS(f)	((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND)) +#endif + +	if (preg->re_magic != MAGIC1 || g->magic != MAGIC2) +		return(REG_BADPAT); +	assert(!(g->iflags&BAD)); +	if (g->iflags&BAD)		/* backstop for no-debug case */ +		return(REG_BADPAT); +	eflags = GOODFLAGS(eflags); + +	if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE)) +		return(smatcher(g, (unsigned char *)string, nmatch, pmatch, eflags)); +	else +		return(lmatcher(g, (unsigned char *)string, nmatch, pmatch, eflags)); +} diff --git a/ext/ereg/regex/regfree.c b/ext/ereg/regex/regfree.c new file mode 100644 index 0000000000..9fd618a13b --- /dev/null +++ b/ext/ereg/regex/regfree.c @@ -0,0 +1,37 @@ +#include <sys/types.h> +#include <stdio.h> +#include <stdlib.h> +#include <regex.h> + +#include "utils.h" +#include 
"regex2.h" + +/* + - regfree - free everything + = API_EXPORT(void) regfree(regex_t *); + */ +API_EXPORT(void) +regfree(preg) +regex_t *preg; +{ +	register struct re_guts *g; + +	if (preg->re_magic != MAGIC1)	/* oops */ +		return;			/* nice to complain, but hard */ + +	g = preg->re_g; +	if (g == NULL || g->magic != MAGIC2)	/* oops again */ +		return; +	preg->re_magic = 0;		/* mark it invalid */ +	g->magic = 0;			/* mark it invalid */ + +	if (g->strip != NULL) +		free((char *)g->strip); +	if (g->sets != NULL) +		free((char *)g->sets); +	if (g->setbits != NULL) +		free((char *)g->setbits); +	if (g->must != NULL) +		free(g->must); +	free((char *)g); +} diff --git a/ext/ereg/regex/split.c b/ext/ereg/regex/split.c new file mode 100644 index 0000000000..188bdb775b --- /dev/null +++ b/ext/ereg/regex/split.c @@ -0,0 +1,316 @@ +#include <stdio.h> +#include <string.h> + +/* + - split - divide a string into fields, like awk split() + = int split(char *string, char *fields[], int nfields, char *sep); + */ +int				/* number of fields, including overflow */ +split(string, fields, nfields, sep) +char *string; +char *fields[];			/* list is not NULL-terminated */ +int nfields;			/* number of entries available in fields[] */ +char *sep;			/* "" white, "c" single char, "ab" [ab]+ */ +{ +	register char *p = string; +	register char c;			/* latest character */ +	register char sepc = sep[0]; +	register char sepc2; +	register int fn; +	register char **fp = fields; +	register char *sepp; +	register int trimtrail; + +	/* white space */ +	if (sepc == '\0') { +		while ((c = *p++) == ' ' || c == '\t') +			continue; +		p--; +		trimtrail = 1; +		sep = " \t";	/* note, code below knows this is 2 long */ +		sepc = ' '; +	} else +		trimtrail = 0; +	sepc2 = sep[1];		/* now we can safely pick this up */ + +	/* catch empties */ +	if (*p == '\0') +		return(0); + +	/* single separator */ +	if (sepc2 == '\0') { +		fn = nfields; +		for (;;) { +			*fp++ = p; +			fn--; +			if (fn == 0) +				break; +			while 
((c = *p++) != sepc) +				if (c == '\0') +					return(nfields - fn); +			*(p-1) = '\0'; +		} +		/* we have overflowed the fields vector -- just count them */ +		fn = nfields; +		for (;;) { +			while ((c = *p++) != sepc) +				if (c == '\0') +					return(fn); +			fn++; +		} +		/* not reached */ +	} + +	/* two separators */ +	if (sep[2] == '\0') { +		fn = nfields; +		for (;;) { +			*fp++ = p; +			fn--; +			while ((c = *p++) != sepc && c != sepc2) +				if (c == '\0') { +					if (trimtrail && **(fp-1) == '\0') +						fn++; +					return(nfields - fn); +				} +			if (fn == 0) +				break; +			*(p-1) = '\0'; +			while ((c = *p++) == sepc || c == sepc2) +				continue; +			p--; +		} +		/* we have overflowed the fields vector -- just count them */ +		fn = nfields; +		while (c != '\0') { +			while ((c = *p++) == sepc || c == sepc2) +				continue; +			p--; +			fn++; +			while ((c = *p++) != '\0' && c != sepc && c != sepc2) +				continue; +		} +		/* might have to trim trailing white space */ +		if (trimtrail) { +			p--; +			while ((c = *--p) == sepc || c == sepc2) +				continue; +			p++; +			if (*p != '\0') { +				if (fn == nfields+1) +					*p = '\0'; +				fn--; +			} +		} +		return(fn); +	} + +	/* n separators */ +	fn = 0; +	for (;;) { +		if (fn < nfields) +			*fp++ = p; +		fn++; +		for (;;) { +			c = *p++; +			if (c == '\0') +				return(fn); +			sepp = sep; +			while ((sepc = *sepp++) != '\0' && sepc != c) +				continue; +			if (sepc != '\0')	/* it was a separator */ +				break; +		} +		if (fn < nfields) +			*(p-1) = '\0'; +		for (;;) { +			c = *p++; +			sepp = sep; +			while ((sepc = *sepp++) != '\0' && sepc != c) +				continue; +			if (sepc == '\0')	/* it wasn't a separator */ +				break; +		} +		p--; +	} + +	/* not reached */ +} + +#ifdef TEST_SPLIT + + +/* + * test program + * pgm		runs regression + * pgm sep	splits stdin lines by sep + * pgm str sep	splits str by sep + * pgm str sep n	splits str by sep n times + */ +int +main(argc, argv) +int argc; +char *argv[]; +{ +	char 
buf[512]; +	register int n; +#	define	MNF	10 +	char *fields[MNF]; + +	if (argc > 4) +		for (n = atoi(argv[3]); n > 0; n--) { +			(void) strcpy(buf, argv[1]); +		} +	else if (argc > 3) +		for (n = atoi(argv[3]); n > 0; n--) { +			(void) strcpy(buf, argv[1]); +			(void) split(buf, fields, MNF, argv[2]); +		} +	else if (argc > 2) +		dosplit(argv[1], argv[2]); +	else if (argc > 1) +		while (fgets(buf, sizeof(buf), stdin) != NULL) { +			buf[strlen(buf)-1] = '\0';	/* stomp newline */ +			dosplit(buf, argv[1]); +		} +	else +		regress(); + +	exit(0); +} + +dosplit(string, seps) +char *string; +char *seps; +{ +#	define	NF	5 +	char *fields[NF]; +	register int nf; + +	nf = split(string, fields, NF, seps); +	print(nf, NF, fields); +} + +print(nf, nfp, fields) +int nf; +int nfp; +char *fields[]; +{ +	register int fn; +	register int bound; + +	bound = (nf > nfp) ? nfp : nf; +	printf("%d:\t", nf); +	for (fn = 0; fn < bound; fn++) +		printf("\"%s\"%s", fields[fn], (fn+1 < nf) ? ", " : "\n"); +} + +#define	RNF	5		/* some table entries know this */ +struct { +	char *str; +	char *seps; +	int nf; +	char *fi[RNF]; +} tests[] = { +	"",		" ",	0,	{ "" }, +	" ",		" ",	2,	{ "", "" }, +	"x",		" ",	1,	{ "x" }, +	"xy",		" ",	1,	{ "xy" }, +	"x y",		" ",	2,	{ "x", "y" }, +	"abc def  g ",	" ",	5,	{ "abc", "def", "", "g", "" }, +	"  a bcd",	" ",	4,	{ "", "", "a", "bcd" }, +	"a b c d e f",	" ",	6,	{ "a", "b", "c", "d", "e f" }, +	" a b c d ",	" ",	6,	{ "", "a", "b", "c", "d " }, + +	"",		" _",	0,	{ "" }, +	" ",		" _",	2,	{ "", "" }, +	"x",		" _",	1,	{ "x" }, +	"x y",		" _",	2,	{ "x", "y" }, +	"ab _ cd",	" _",	2,	{ "ab", "cd" }, +	" a_b  c ",	" _",	5,	{ "", "a", "b", "c", "" }, +	"a b c_d e f",	" _",	6,	{ "a", "b", "c", "d", "e f" }, +	" a b c d ",	" _",	6,	{ "", "a", "b", "c", "d " }, + +	"",		" _~",	0,	{ "" }, +	" ",		" _~",	2,	{ "", "" }, +	"x",		" _~",	1,	{ "x" }, +	"x y",		" _~",	2,	{ "x", "y" }, +	"ab _~ cd",	" _~",	2,	{ "ab", "cd" }, +	" a_b  c~",	" _~",	5,	{ "", "a", "b", "c", "" }, +	"a b_c 
d~e f",	" _~",	6,	{ "a", "b", "c", "d", "e f" }, +	"~a b c d ",	" _~",	6,	{ "", "a", "b", "c", "d " }, + +	"",		" _~-",	0,	{ "" }, +	" ",		" _~-",	2,	{ "", "" }, +	"x",		" _~-",	1,	{ "x" }, +	"x y",		" _~-",	2,	{ "x", "y" }, +	"ab _~- cd",	" _~-",	2,	{ "ab", "cd" }, +	" a_b  c~",	" _~-",	5,	{ "", "a", "b", "c", "" }, +	"a b_c-d~e f",	" _~-",	6,	{ "a", "b", "c", "d", "e f" }, +	"~a-b c d ",	" _~-",	6,	{ "", "a", "b", "c", "d " }, + +	"",		"  ",	0,	{ "" }, +	" ",		"  ",	2,	{ "", "" }, +	"x",		"  ",	1,	{ "x" }, +	"xy",		"  ",	1,	{ "xy" }, +	"x y",		"  ",	2,	{ "x", "y" }, +	"abc def  g ",	"  ",	4,	{ "abc", "def", "g", "" }, +	"  a bcd",	"  ",	3,	{ "", "a", "bcd" }, +	"a b c d e f",	"  ",	6,	{ "a", "b", "c", "d", "e f" }, +	" a b c d ",	"  ",	6,	{ "", "a", "b", "c", "d " }, + +	"",		"",	0,	{ "" }, +	" ",		"",	0,	{ "" }, +	"x",		"",	1,	{ "x" }, +	"xy",		"",	1,	{ "xy" }, +	"x y",		"",	2,	{ "x", "y" }, +	"abc def  g ",	"",	3,	{ "abc", "def", "g" }, +	"\t a bcd",	"",	2,	{ "a", "bcd" }, +	"  a \tb\t c ",	"",	3,	{ "a", "b", "c" }, +	"a b c d e ",	"",	5,	{ "a", "b", "c", "d", "e" }, +	"a b\tc d e f",	"",	6,	{ "a", "b", "c", "d", "e f" }, +	" a b c d e f ",	"",	6,	{ "a", "b", "c", "d", "e f " }, + +	NULL,		NULL,	0,	{ NULL }, +}; + +regress() +{ +	char buf[512]; +	register int n; +	char *fields[RNF+1]; +	register int nf; +	register int i; +	register int printit; +	register char *f; + +	for (n = 0; tests[n].str != NULL; n++) { +		(void) strcpy(buf, tests[n].str); +		fields[RNF] = NULL; +		nf = split(buf, fields, RNF, tests[n].seps); +		printit = 0; +		if (nf != tests[n].nf) { +			printf("split `%s' by `%s' gave %d fields, not %d\n", +				tests[n].str, tests[n].seps, nf, tests[n].nf); +			printit = 1; +		} else if (fields[RNF] != NULL) { +			printf("split() went beyond array end\n"); +			printit = 1; +		} else { +			for (i = 0; i < nf && i < RNF; i++) { +				f = fields[i]; +				if (f == NULL) +					f = "(NULL)"; +				if (strcmp(f, tests[n].fi[i]) != 0) { +					printf("split `%s' 
by `%s', field %d is `%s', not `%s'\n", +						tests[n].str, tests[n].seps, +						i, fields[i], tests[n].fi[i]); +					printit = 1; +				} +			} +		} +		if (printit) +			print(nf, RNF, fields); +	} +} +#endif diff --git a/ext/ereg/regex/tests b/ext/ereg/regex/tests new file mode 100644 index 0000000000..c05846177f --- /dev/null +++ b/ext/ereg/regex/tests @@ -0,0 +1,475 @@ +# regular expression test set +# Lines are at least three fields, separated by one or more tabs.  "" stands +# for an empty field.  First field is an RE.  Second field is flags.  If +# C flag given, regcomp() is expected to fail, and the third field is the +# error name (minus the leading REG_). +# +# Otherwise it is expected to succeed, and the third field is the string to +# try matching it against.  If there is no fourth field, the match is +# expected to fail.  If there is a fourth field, it is the substring that +# the RE is expected to match.  If there is a fifth field, it is a comma- +# separated list of what the subexpressions should match, with - indicating +# no match for that one.  In both the fourth and fifth fields, a (sub)field +# starting with @ indicates that the (sub)expression is expected to match +# a null string followed by the stuff after the @; this provides a way to +# test where null strings match.  The character `N' in REs and strings +# is newline, `S' is space, `T' is tab, `Z' is NUL. +# +# The full list of flags: +#	-	placeholder, does nothing +#	b	RE is a BRE, not an ERE +#	&	try it as both an ERE and a BRE +#	C	regcomp() error expected, third field is error name +#	i	REG_ICASE +#	m	("mundane") REG_NOSPEC +#	s	REG_NOSUB (not really testable) +#	n	REG_NEWLINE +#	^	REG_NOTBOL +#	$	REG_NOTEOL +#	#	REG_STARTEND (see below) +#	p	REG_PEND +# +# For REG_STARTEND, the start/end offsets are those of the substring +# enclosed in (). 
+ +# basics +a		&	a	a +abc		&	abc	abc +abc|de		-	abc	abc +a|b|c		-	abc	a + +# parentheses and perversions thereof +a(b)c		-	abc	abc +a\(b\)c		b	abc	abc +a(		C	EPAREN +a(		b	a(	a( +a\(		-	a(	a( +a\(		bC	EPAREN +a\(b		bC	EPAREN +a(b		C	EPAREN +a(b		b	a(b	a(b +# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly) +a)		-	a)	a) +)		-	)	) +# end gagging (in a just world, those *should* give EPAREN) +a)		b	a)	a) +a\)		bC	EPAREN +\)		bC	EPAREN +a()b		-	ab	ab +a\(\)b		b	ab	ab + +# anchoring and REG_NEWLINE +^abc$		&	abc	abc +a^b		-	a^b +a^b		b	a^b	a^b +a$b		-	a$b +a$b		b	a$b	a$b +^		&	abc	@abc +$		&	abc	@ +^$		&	""	@ +$^		-	""	@ +\($\)\(^\)	b	""	@ +# stop retching, those are legitimate (although disgusting) +^^		-	""	@ +$$		-	""	@ +b$		&	abNc +b$		&n	abNc	b +^b$		&	aNbNc +^b$		&n	aNbNc	b +^$		&n	aNNb	@Nb +^$		n	abc +^$		n	abcN	@ +$^		n	aNNb	@Nb +\($\)\(^\)	bn	aNNb	@Nb +^^		n^	aNNb	@Nb +$$		n	aNNb	@NN +^a		^	a +a$		$	a +^a		^n	aNb +^b		^n	aNb	b +a$		$n	bNa +b$		$n	bNa	b +a*(^b$)c*	-	b	b +a*\(^b$\)c*	b	b	b + +# certain syntax errors and non-errors +|		C	EMPTY +|		b	|	| +*		C	BADRPT +*		b	*	* ++		C	BADRPT +?		C	BADRPT +""		&C	EMPTY +()		-	abc	@abc +\(\)		b	abc	@abc +a||b		C	EMPTY +|ab		C	EMPTY +ab|		C	EMPTY +(|a)b		C	EMPTY +(a|)b		C	EMPTY +(*a)		C	BADRPT +(+a)		C	BADRPT +(?a)		C	BADRPT +({1}a)		C	BADRPT +\(\{1\}a\)	bC	BADRPT +(a|*b)		C	BADRPT +(a|+b)		C	BADRPT +(a|?b)		C	BADRPT +(a|{1}b)	C	BADRPT +^*		C	BADRPT +^*		b	*	* +^+		C	BADRPT +^?		
C	BADRPT +^{1}		C	BADRPT +^\{1\}		bC	BADRPT + +# metacharacters, backslashes +a.c		&	abc	abc +a[bc]d		&	abd	abd +a\*c		&	a*c	a*c +a\\b		&	a\b	a\b +a\\\*b		&	a\*b	a\*b +a\bc		&	abc	abc +a\		&C	EESCAPE +a\\bc		&	a\bc	a\bc +\{		bC	BADRPT +a\[b		&	a[b	a[b +a[b		&C	EBRACK +# trailing $ is a peculiar special case for the BRE code +a$		&	a	a +a$		&	a$ +a\$		&	a +a\$		&	a$	a$ +a\\$		&	a +a\\$		&	a$ +a\\$		&	a\$ +a\\$		&	a\	a\ + +# back references, ugh +a\(b\)\2c	bC	ESUBREG +a\(b\1\)c	bC	ESUBREG +a\(b*\)c\1d	b	abbcbbd	abbcbbd	bb +a\(b*\)c\1d	b	abbcbd +a\(b*\)c\1d	b	abbcbbbd +^\(.\)\1	b	abc +a\([bc]\)\1d	b	abcdabbd	abbd	b +a\(\([bc]\)\2\)*d	b	abbccd	abbccd +a\(\([bc]\)\2\)*d	b	abbcbd +# actually, this next one probably ought to fail, but the spec is unclear +a\(\(b\)*\2\)*d		b	abbbd	abbbd +# here is a case that no NFA implementation does right +\(ab*\)[ab]*\1	b	ababaaa	ababaaa	a +# check out normal matching in the presence of back refs +\(a\)\1bcd	b	aabcd	aabcd +\(a\)\1bc*d	b	aabcd	aabcd +\(a\)\1bc*d	b	aabd	aabd +\(a\)\1bc*d	b	aabcccd	aabcccd +\(a\)\1bc*[ce]d	b	aabcccd	aabcccd +^\(a\)\1b\(c\)*cd$	b	aabcccd	aabcccd + +# ordinary repetitions +ab*c		&	abc	abc +ab+c		-	abc	abc +ab?c		-	abc	abc +a\(*\)b		b	a*b	a*b +a\(**\)b	b	ab	ab +a\(***\)b	bC	BADRPT +*a		b	*a	*a +**a		b	a	a +***a		bC	BADRPT + +# the dreaded bounded repetitions +{		&	{	{ +{abc		&	{abc	{abc +{1		C	BADRPT +{1}		C	BADRPT +a{b		&	a{b	a{b +a{1}b		-	ab	ab +a\{1\}b		b	ab	ab +a{1,}b		-	ab	ab +a\{1,\}b	b	ab	ab +a{1,2}b		-	aab	aab +a\{1,2\}b	b	aab	aab +a{1		C	EBRACE +a\{1		bC	EBRACE +a{1a		C	EBRACE +a\{1a		bC	EBRACE +a{1a}		C	BADBR +a\{1a\}		bC	BADBR +a{,2}		-	a{,2}	a{,2} +a\{,2\}		bC	BADBR +a{,}		-	a{,}	a{,} +a\{,\}		bC	BADBR +a{1,x}		C	BADBR +a\{1,x\}	bC	BADBR +a{1,x		C	EBRACE +a\{1,x		bC	EBRACE +a{300}		C	BADBR +a\{300\}	bC	BADBR +a{1,0}		C	BADBR +a\{1,0\}	bC	BADBR +ab{0,0}c	-	abcac	ac +ab\{0,0\}c	b	abcac	ac +ab{0,1}c	-	abcac	abc +ab\{0,1\}c	b	abcac	abc +ab{0,3}c	-	abbcac	abbc +ab\{0,3\}c	b	abbcac	abbc +ab{1,1}c	-	
acabc	abc +ab\{1,1\}c	b	acabc	abc +ab{1,3}c	-	acabc	abc +ab\{1,3\}c	b	acabc	abc +ab{2,2}c	-	abcabbc	abbc +ab\{2,2\}c	b	abcabbc	abbc +ab{2,4}c	-	abcabbc	abbc +ab\{2,4\}c	b	abcabbc	abbc +((a{1,10}){1,10}){1,10}	-	a	a	a,a + +# multiple repetitions +a**		&C	BADRPT +a++		C	BADRPT +a??		C	BADRPT +a*+		C	BADRPT +a*?		C	BADRPT +a+*		C	BADRPT +a+?		C	BADRPT +a?*		C	BADRPT +a?+		C	BADRPT +a{1}{1}		C	BADRPT +a*{1}		C	BADRPT +a+{1}		C	BADRPT +a?{1}		C	BADRPT +a{1}*		C	BADRPT +a{1}+		C	BADRPT +a{1}?		C	BADRPT +a*{b}		-	a{b}	a{b} +a\{1\}\{1\}	bC	BADRPT +a*\{1\}		bC	BADRPT +a\{1\}*		bC	BADRPT + +# brackets, and numerous perversions thereof +a[b]c		&	abc	abc +a[ab]c		&	abc	abc +a[^ab]c		&	adc	adc +a[]b]c		&	a]c	a]c +a[[b]c		&	a[c	a[c +a[-b]c		&	a-c	a-c +a[^]b]c		&	adc	adc +a[^-b]c		&	adc	adc +a[b-]c		&	a-c	a-c +a[b		&C	EBRACK +a[]		&C	EBRACK +a[1-3]c		&	a2c	a2c +a[3-1]c		&C	ERANGE +a[1-3-5]c	&C	ERANGE +a[[.-.]--]c	&	a-c	a-c +a[1-		&C	ERANGE +a[[.		&C	EBRACK +a[[.x		&C	EBRACK +a[[.x.		&C	EBRACK +a[[.x.]		&C	EBRACK +a[[.x.]]	&	ax	ax +a[[.x,.]]	&C	ECOLLATE +a[[.one.]]b	&	a1b	a1b +a[[.notdef.]]b	&C	ECOLLATE +a[[.].]]b	&	a]b	a]b +a[[:alpha:]]c	&	abc	abc +a[[:notdef:]]c	&C	ECTYPE +a[[:		&C	EBRACK +a[[:alpha	&C	EBRACK +a[[:alpha:]	&C	EBRACK +a[[:alpha,:]	&C	ECTYPE +a[[:]:]]b	&C	ECTYPE +a[[:-:]]b	&C	ECTYPE +a[[:alph:]]	&C	ECTYPE +a[[:alphabet:]]	&C	ECTYPE +[[:alnum:]]+	-	-%@a0X-	a0X +[[:alpha:]]+	-	-%@aX0-	aX +[[:blank:]]+	-	aSSTb	SST +[[:cntrl:]]+	-	aNTb	NT +[[:digit:]]+	-	a019b	019 +[[:graph:]]+	-	Sa%bS	a%b +[[:lower:]]+	-	AabC	ab +[[:print:]]+	-	NaSbN	aSb +[[:punct:]]+	-	S%-&T	%-& +[[:space:]]+	-	aSNTb	SNT +[[:upper:]]+	-	aBCd	BC +[[:xdigit:]]+	-	p0f3Cq	0f3C +a[[=b=]]c	&	abc	abc +a[[=		&C	EBRACK +a[[=b		&C	EBRACK +a[[=b=		&C	EBRACK +a[[=b=]		&C	EBRACK +a[[=b,=]]	&C	ECOLLATE +a[[=one=]]b	&	a1b	a1b + +# complexities +a(((b)))c	-	abc	abc +a(b|(c))d	-	abd	abd +a(b*|c)d	-	abbd	abbd +# just gotta have one DFA-buster, of course +a[ab]{20}	-	aaaaabaaaabaaaabaaaab	aaaaabaaaabaaaabaaaab +# and 
an inline expansion in case somebody gets tricky +a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab]	-	aaaaabaaaabaaaabaaaab	aaaaabaaaabaaaabaaaab +# and in case somebody just slips in an NFA... +a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night)	-	aaaaabaaaabaaaabaaaabweeknights	aaaaabaaaabaaaabaaaabweeknights +# fish for anomalies as the number of states passes 32 +12345678901234567890123456789	-	a12345678901234567890123456789b	12345678901234567890123456789 +123456789012345678901234567890	-	a123456789012345678901234567890b	123456789012345678901234567890 +1234567890123456789012345678901	-	a1234567890123456789012345678901b	1234567890123456789012345678901 +12345678901234567890123456789012	-	a12345678901234567890123456789012b	12345678901234567890123456789012 +123456789012345678901234567890123	-	a123456789012345678901234567890123b	123456789012345678901234567890123 +# and one really big one, beyond any plausible word width +1234567890123456789012345678901234567890123456789012345678901234567890	-	a1234567890123456789012345678901234567890123456789012345678901234567890b	1234567890123456789012345678901234567890123456789012345678901234567890 +# fish for problems as brackets go past 8 +[ab][cd][ef][gh][ij][kl][mn]	-	xacegikmoq	acegikm +[ab][cd][ef][gh][ij][kl][mn][op]	-	xacegikmoq	acegikmo +[ab][cd][ef][gh][ij][kl][mn][op][qr]	-	xacegikmoqy	acegikmoq +[ab][cd][ef][gh][ij][kl][mn][op][q]	-	xacegikmoqy	acegikmoq + +# subtleties of matching +abc		&	xabcy	abc +a\(b\)?c\1d	b	acd +aBc		i	Abc	Abc +a[Bc]*d		i	abBCcd	abBCcd +0[[:upper:]]1	&i	0a1	0a1 +0[[:lower:]]1	&i	0A1	0A1 +a[^b]c		&i	abc +a[^b]c		&i	aBc +a[^b]c		&i	adc	adc +[a]b[c]		-	abc	abc +[a]b[a]		-	aba	aba +[abc]b[abc]	-	abc	abc +[abc]b[abd]	-	abd	abd +a(b?c)+d	-	accd	accd +(wee|week)(knights|night)	-	weeknights	weeknights +(we|wee|week|frob)(knights|night|day)	-	weeknights	weeknights +a[bc]d		-	xyzaaabcaababdacd	abd +a[ab]c		-	aaabc	abc 
+abc		s	abc	abc +a*		&	b	@b + +# Let's have some fun -- try to match a C comment. +# first the obvious, which looks okay at first glance... +/\*.*\*/	-	/*x*/	/*x*/ +# but... +/\*.*\*/	-	/*x*/y/*z*/	/*x*/y/*z*/ +# okay, we must not match */ inside; try to do that... +/\*([^*]|\*[^/])*\*/	-	/*x*/	/*x*/ +/\*([^*]|\*[^/])*\*/	-	/*x*/y/*z*/	/*x*/ +# but... +/\*([^*]|\*[^/])*\*/	-	/*x**/y/*z*/	/*x**/y/*z*/ +# and a still fancier version, which does it right (I think)... +/\*([^*]|\*+[^*/])*\*+/	-	/*x*/	/*x*/ +/\*([^*]|\*+[^*/])*\*+/	-	/*x*/y/*z*/	/*x*/ +/\*([^*]|\*+[^*/])*\*+/	-	/*x**/y/*z*/	/*x**/ +/\*([^*]|\*+[^*/])*\*+/	-	/*x****/y/*z*/	/*x****/ +/\*([^*]|\*+[^*/])*\*+/	-	/*x**x*/y/*z*/	/*x**x*/ +/\*([^*]|\*+[^*/])*\*+/	-	/*x***x/y/*z*/	/*x***x/y/*z*/ + +# subexpressions +a(b)(c)d	-	abcd	abcd	b,c +a(((b)))c	-	abc	abc	b,b,b +a(b|(c))d	-	abd	abd	b,- +a(b*|c|e)d	-	abbd	abbd	bb +a(b*|c|e)d	-	acd	acd	c +a(b*|c|e)d	-	ad	ad	@d +a(b?)c		-	abc	abc	b +a(b?)c		-	ac	ac	@c +a(b+)c		-	abc	abc	b +a(b+)c		-	abbbc	abbbc	bbb +a(b*)c		-	ac	ac	@c +(a|ab)(bc([de]+)f|cde)	-	abcdef	abcdef	a,bcdef,de +# the regression tester only asks for 9 subexpressions +a(b)(c)(d)(e)(f)(g)(h)(i)(j)k	-	abcdefghijk	abcdefghijk	b,c,d,e,f,g,h,i,j +a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l	-	abcdefghijkl	abcdefghijkl	b,c,d,e,f,g,h,i,j,k +a([bc]?)c	-	abc	abc	b +a([bc]?)c	-	ac	ac	@c +a([bc]+)c	-	abc	abc	b +a([bc]+)c	-	abcc	abcc	bc +a([bc]+)bc	-	abcbc	abcbc	bc +a(bb+|b)b	-	abb	abb	b +a(bbb+|bb+|b)b	-	abb	abb	b +a(bbb+|bb+|b)b	-	abbb	abbb	bb +a(bbb+|bb+|b)bb	-	abbb	abbb	b +(.*).*		-	abcdef	abcdef	abcdef +(a*)*		-	bc	@b	@b + +# do we get the right subexpression when it is used more than once? 
+a(b|c)*d	-	ad	ad	- +a(b|c)*d	-	abcd	abcd	c +a(b|c)+d	-	abd	abd	b +a(b|c)+d	-	abcd	abcd	c +a(b|c?)+d	-	ad	ad	@d +a(b|c?)+d	-	abcd	abcd	@d +a(b|c){0,0}d	-	ad	ad	- +a(b|c){0,1}d	-	ad	ad	- +a(b|c){0,1}d	-	abd	abd	b +a(b|c){0,2}d	-	ad	ad	- +a(b|c){0,2}d	-	abcd	abcd	c +a(b|c){0,}d	-	ad	ad	- +a(b|c){0,}d	-	abcd	abcd	c +a(b|c){1,1}d	-	abd	abd	b +a(b|c){1,1}d	-	acd	acd	c +a(b|c){1,2}d	-	abd	abd	b +a(b|c){1,2}d	-	abcd	abcd	c +a(b|c){1,}d	-	abd	abd	b +a(b|c){1,}d	-	abcd	abcd	c +a(b|c){2,2}d	-	acbd	acbd	b +a(b|c){2,2}d	-	abcd	abcd	c +a(b|c){2,4}d	-	abcd	abcd	c +a(b|c){2,4}d	-	abcbd	abcbd	b +a(b|c){2,4}d	-	abcbcd	abcbcd	c +a(b|c){2,}d	-	abcd	abcd	c +a(b|c){2,}d	-	abcbd	abcbd	b +a(b+|((c)*))+d	-	abd	abd	@d,@d,- +a(b+|((c)*))+d	-	abcd	abcd	@d,@d,- + +# check out the STARTEND option +[abc]		&#	a(b)c	b +[abc]		&#	a(d)c +[abc]		&#	a(bc)d	b +[abc]		&#	a(dc)d	c +.		&#	a()c +b.*c		&#	b(bc)c	bc +b.*		&#	b(bc)c	bc +.*c		&#	b(bc)c	bc + +# plain strings, with the NOSPEC flag +abc		m	abc	abc +abc		m	xabcy	abc +abc		m	xyz +a*b		m	aba*b	a*b +a*b		m	ab +""		mC	EMPTY + +# cases involving NULs +aZb		&	a	a +aZb		&p	a +aZb		&p#	(aZb)	aZb +aZ*b		&p#	(ab)	ab +a.b		&#	(aZb)	aZb +a.*		&#	(aZb)c	aZb + +# word boundaries (ick) +[[:<:]]a	&	a	a +[[:<:]]a	&	ba +[[:<:]]a	&	-a	a +a[[:>:]]	&	a	a +a[[:>:]]	&	ab +a[[:>:]]	&	a-	a +[[:<:]]a.c[[:>:]]	&	axcd-dayc-dazce-abc	abc +[[:<:]]a.c[[:>:]]	&	axcd-dayc-dazce-abc-q	abc +[[:<:]]a.c[[:>:]]	&	axc-dayc-dazce-abc	axc +[[:<:]]b.c[[:>:]]	&	a_bxc-byc_d-bzc-q	bzc +[[:<:]].x..[[:>:]]	&	y_xa_-_xb_y-_xc_-axdc	_xc_ +[[:<:]]a_b[[:>:]]	&	x_a_b + +# past problems, and suspected problems +(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A])	-	A1	A1 +abcdefghijklmnop	i	abcdefghijklmnop	abcdefghijklmnop +abcdefghijklmnopqrstuv	i	abcdefghijklmnopqrstuv	abcdefghijklmnopqrstuv +(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN])	-	CC11	CC11 
+CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a	-	CC11	CC11 +Char \([a-z0-9_]*\)\[.*	b	Char xyz[k	Char xyz[k	xyz +a?b	-	ab	ab +-\{0,1\}[0-9]*$	b	-5	-5 diff --git a/ext/ereg/regex/utils.h b/ext/ereg/regex/utils.h new file mode 100644 index 0000000000..66ae78437e --- /dev/null +++ b/ext/ereg/regex/utils.h @@ -0,0 +1,23 @@ +/* utility definitions */ + +#ifdef _POSIX2_RE_DUP_MAX +#define	DUPMAX	_POSIX2_RE_DUP_MAX +#else +#define	DUPMAX	255 +#endif +#define	INFINITY	(DUPMAX + 1) +#define	NC		(CHAR_MAX - CHAR_MIN + 1) +typedef unsigned char uch; + +/* switch off assertions (if not already off) if no REDEBUG */ +#ifndef REDEBUG +#ifndef NDEBUG +#define	NDEBUG	/* no assertions please */ +#endif +#endif +#include <assert.h> + +/* for old systems with bcopy() but no memmove() */ +#ifdef USEBCOPY +#define	memmove(d, s, c)	bcopy(s, d, c) +#endif diff --git a/ext/standard/tests/reg/001.phpt b/ext/ereg/tests/001.phpt index 13c50d0d1e..13c50d0d1e 100644 --- a/ext/standard/tests/reg/001.phpt +++ b/ext/ereg/tests/001.phpt diff --git a/ext/standard/tests/reg/002.phpt b/ext/ereg/tests/002.phpt index abe9e063d8..abe9e063d8 100644 --- a/ext/standard/tests/reg/002.phpt +++ b/ext/ereg/tests/002.phpt diff --git a/ext/standard/tests/reg/003.phpt b/ext/ereg/tests/003.phpt index 4257f0d27e..4257f0d27e 100644 --- a/ext/standard/tests/reg/003.phpt +++ b/ext/ereg/tests/003.phpt diff --git a/ext/standard/tests/reg/004.phpt b/ext/ereg/tests/004.phpt index 3e535c6a9d..3e535c6a9d 100644 --- a/ext/standard/tests/reg/004.phpt +++ b/ext/ereg/tests/004.phpt diff --git a/ext/standard/tests/reg/005.phpt b/ext/ereg/tests/005.phpt index ee9ccc6da4..ee9ccc6da4 100644 --- a/ext/standard/tests/reg/005.phpt +++ b/ext/ereg/tests/005.phpt diff --git a/ext/standard/tests/reg/006.phpt b/ext/ereg/tests/006.phpt index cae349672e..cae349672e 100644 --- a/ext/standard/tests/reg/006.phpt +++ b/ext/ereg/tests/006.phpt diff --git a/ext/standard/tests/reg/007.phpt b/ext/ereg/tests/007.phpt index 
beb3cdc159..beb3cdc159 100644 --- a/ext/standard/tests/reg/007.phpt +++ b/ext/ereg/tests/007.phpt diff --git a/ext/standard/tests/reg/008.phpt b/ext/ereg/tests/008.phpt index 9a0cedabc5..9a0cedabc5 100644 --- a/ext/standard/tests/reg/008.phpt +++ b/ext/ereg/tests/008.phpt diff --git a/ext/standard/tests/reg/009.phpt b/ext/ereg/tests/009.phpt index 528606f3c8..528606f3c8 100644 --- a/ext/standard/tests/reg/009.phpt +++ b/ext/ereg/tests/009.phpt diff --git a/ext/standard/tests/reg/010.phpt b/ext/ereg/tests/010.phpt index f6f8909f73..f6f8909f73 100644 --- a/ext/standard/tests/reg/010.phpt +++ b/ext/ereg/tests/010.phpt diff --git a/ext/standard/tests/reg/011.phpt b/ext/ereg/tests/011.phpt index 65554b302f..65554b302f 100644 --- a/ext/standard/tests/reg/011.phpt +++ b/ext/ereg/tests/011.phpt diff --git a/ext/standard/tests/reg/012.phpt b/ext/ereg/tests/012.phpt index 88ad5992ad..88ad5992ad 100644 --- a/ext/standard/tests/reg/012.phpt +++ b/ext/ereg/tests/012.phpt diff --git a/ext/standard/tests/reg/013.phpt b/ext/ereg/tests/013.phpt index a2d9ee0099..a2d9ee0099 100644 --- a/ext/standard/tests/reg/013.phpt +++ b/ext/ereg/tests/013.phpt diff --git a/ext/standard/tests/reg/014.phpt b/ext/ereg/tests/014.phpt index d2a32451a9..d2a32451a9 100644 --- a/ext/standard/tests/reg/014.phpt +++ b/ext/ereg/tests/014.phpt diff --git a/ext/standard/tests/reg/015.phpt b/ext/ereg/tests/015.phpt index c255ddf05b..c255ddf05b 100644 --- a/ext/standard/tests/reg/015.phpt +++ b/ext/ereg/tests/015.phpt diff --git a/ext/standard/tests/reg/016.phpt b/ext/ereg/tests/016.phpt index c354ab26fc..c354ab26fc 100644 --- a/ext/standard/tests/reg/016.phpt +++ b/ext/ereg/tests/016.phpt diff --git a/ext/pgsql/pgsql.c b/ext/pgsql/pgsql.c index 92ceec2683..4a405471a7 100644 --- a/ext/pgsql/pgsql.c +++ b/ext/pgsql/pgsql.c @@ -36,6 +36,7 @@  #include "php_ini.h"  #include "ext/standard/php_standard.h"  #include "ext/standard/php_smart_str.h" +#include "ext/ereg/php_regex.h"  #undef PACKAGE_BUGREPORT  #undef 
PACKAGE_NAME diff --git a/ext/standard/basic_functions.c b/ext/standard/basic_functions.c index 588af8b595..6119e74fb8 100644 --- a/ext/standard/basic_functions.c +++ b/ext/standard/basic_functions.c @@ -2203,54 +2203,6 @@ static  ZEND_BEGIN_ARG_INFO(arginfo_mt_getrandmax, 0)  ZEND_END_ARG_INFO()  /* }}} */ -/* {{{ reg.c */ -static -ZEND_BEGIN_ARG_INFO_EX(arginfo_ereg, 0, 0, 2) -	ZEND_ARG_INFO(0, pattern) -	ZEND_ARG_INFO(0, string) -	ZEND_ARG_INFO(1, registers) /* ARRAY_INFO(1, registers, 1) */ -ZEND_END_ARG_INFO() - -static -ZEND_BEGIN_ARG_INFO_EX(arginfo_eregi, 0, 0, 2) -	ZEND_ARG_INFO(0, pattern) -	ZEND_ARG_INFO(0, string) -	ZEND_ARG_INFO(1, registers) /* ARRAY_INFO(1, registers, 1) */ -ZEND_END_ARG_INFO() - -static -ZEND_BEGIN_ARG_INFO(arginfo_ereg_replace, 0) -	ZEND_ARG_INFO(0, pattern) -	ZEND_ARG_INFO(0, replacement) -	ZEND_ARG_INFO(0, string) -ZEND_END_ARG_INFO() - -static -ZEND_BEGIN_ARG_INFO(arginfo_eregi_replace, 0) -	ZEND_ARG_INFO(0, pattern) -	ZEND_ARG_INFO(0, replacement) -	ZEND_ARG_INFO(0, string) -ZEND_END_ARG_INFO() - -static -ZEND_BEGIN_ARG_INFO_EX(arginfo_split, 0, 0, 2) -	ZEND_ARG_INFO(0, pattern) -	ZEND_ARG_INFO(0, string) -	ZEND_ARG_INFO(0, limit) -ZEND_END_ARG_INFO() - -static -ZEND_BEGIN_ARG_INFO_EX(arginfo_spliti, 0, 0, 2) -	ZEND_ARG_INFO(0, pattern) -	ZEND_ARG_INFO(0, string) -	ZEND_ARG_INFO(0, limit) -ZEND_END_ARG_INFO() - -static -ZEND_BEGIN_ARG_INFO(arginfo_sql_regcase, 0) -	ZEND_ARG_INFO(0, string) -ZEND_END_ARG_INFO() -/* }}} */  /* {{{ sha1.c */  static  ZEND_BEGIN_ARG_INFO_EX(arginfo_sha1, 0, 0, 1) @@ -3202,6 +3154,7 @@ const zend_function_entry basic_functions[] = { /* {{{ */  	PHP_FE(similar_text,													arginfo_similar_text)  	PHP_FE(explode,															arginfo_explode)  	PHP_FE(implode,															arginfo_implode) +	PHP_FALIAS(join,				implode,								arginfo_implode)  	PHP_FE(setlocale,														arginfo_setlocale)  	PHP_FE(localeconv,														arginfo_localeconv) @@ -3478,16 +3431,6 @@ const 
zend_function_entry basic_functions[] = { /* {{{ */  	PHP_FE(is_scalar,														arginfo_is_scalar)  	PHP_FE(is_callable,														arginfo_is_callable) -	/* functions from reg.c */ -	PHP_FE(ereg,															arginfo_ereg) -	PHP_FE(ereg_replace,													arginfo_ereg_replace) -	PHP_FE(eregi,															arginfo_eregi) -	PHP_FE(eregi_replace,													arginfo_eregi_replace) -	PHP_FE(split,															arginfo_split) -	PHP_FE(spliti,															arginfo_spliti) -	PHP_FALIAS(join,				implode,								arginfo_implode) -	PHP_FE(sql_regcase,														arginfo_sql_regcase) -  	/* functions from dl.c */  	PHP_FE(dl,																arginfo_dl) @@ -4043,7 +3986,6 @@ PHP_MINIT_FUNCTION(basic) /* {{{ */  	register_html_constants(INIT_FUNC_ARGS_PASSTHRU);  	register_string_constants(INIT_FUNC_ARGS_PASSTHRU); -	PHP_MINIT(regex)(INIT_FUNC_ARGS_PASSTHRU);  	PHP_MINIT(file)(INIT_FUNC_ARGS_PASSTHRU);  	PHP_MINIT(pack)(INIT_FUNC_ARGS_PASSTHRU);  	PHP_MINIT(browscap)(INIT_FUNC_ARGS_PASSTHRU); @@ -4121,7 +4063,6 @@ PHP_MSHUTDOWN_FUNCTION(basic) /* {{{ */  	UNREGISTER_INI_ENTRIES(); -	PHP_MSHUTDOWN(regex)(SHUTDOWN_FUNC_ARGS_PASSTHRU);  	PHP_MSHUTDOWN(browscap)(SHUTDOWN_FUNC_ARGS_PASSTHRU);  	PHP_MSHUTDOWN(array)(SHUTDOWN_FUNC_ARGS_PASSTHRU);  	PHP_MSHUTDOWN(assert)(SHUTDOWN_FUNC_ARGS_PASSTHRU); @@ -4234,7 +4175,6 @@ PHP_RSHUTDOWN_FUNCTION(basic) /* {{{ */  PHP_MINFO_FUNCTION(basic) /* {{{ */  {  	php_info_print_table_start(); -	PHP_MINFO(regex)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);  	PHP_MINFO(dl)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);  	PHP_MINFO(mail)(ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU);  	php_info_print_table_end(); diff --git a/ext/standard/browscap.c b/ext/standard/browscap.c index 9737007974..96f4d95bf1 100644 --- a/ext/standard/browscap.c +++ b/ext/standard/browscap.c @@ -19,7 +19,7 @@  /* $Id$ */  #include "php.h" -#include "php_regex.h" +#include "ext/ereg/php_regex.h"  #include "php_browscap.h"  #include "php_ini.h"  #include "php_string.h" diff --git 
a/ext/standard/config.m4 b/ext/standard/config.m4 index 11c3ee3191..462ac15dd9 100644 --- a/ext/standard/config.m4 +++ b/ext/standard/config.m4 @@ -208,52 +208,6 @@ AC_FUNC_FNMATCH  divert(5)dnl  dnl -dnl Check for regex library type -dnl -PHP_ARG_WITH(regex,, -[  --with-regex=TYPE       regex library type: system, apache, php. [TYPE=php] -                          WARNING: Do NOT use unless you know what you are doing!], php, no) - -case $PHP_REGEX in -  system) -    if test "$PHP_SAPI" = "apache" || test "$PHP_SAPI" = "apache2filter" || test "$PHP_SAPI" = "apache2handler"; then -      REGEX_TYPE=php -    else -      REGEX_TYPE=system -    fi -    ;; -  apache) -    REGEX_TYPE=apache -    ;; -  php) -    REGEX_TYPE=php -    ;; -  *) -    REGEX_TYPE=php -    AC_MSG_WARN(Invalid regex library type selected. Using default value: php) -    ;; -esac - -if test "$REGEX_TYPE" = "php"; then -  AC_DEFINE(HAVE_REGEX_T_RE_MAGIC, 1, [ ]) -  AC_DEFINE(HSREGEX,1,[ ]) -  AC_DEFINE(REGEX,1,[ ])   -  PHP_ADD_SOURCES(regex, regcomp.c regexec.c regerror.c regfree.c) -elif test "$REGEX_TYPE" = "system"; then -  AC_DEFINE(REGEX,0,[ ]) -  dnl Check if field re_magic exists in struct regex_t -  AC_CACHE_CHECK([whether field re_magic exists in struct regex_t], ac_cv_regex_t_re_magic, [ -  AC_TRY_COMPILE([#include <sys/types.h> -#include <regex.h>], [regex_t rt; rt.re_magic;], -  [ac_cv_regex_t_re_magic=yes], [ac_cv_regex_t_re_magic=no])]) -  if test "$ac_cv_regex_t_re_magic" = "yes"; then -    AC_DEFINE([HAVE_REGEX_T_RE_MAGIC], [ ], 1)    -  fi  -fi    -AC_MSG_CHECKING([which regex library to use]) -AC_MSG_RESULT([$REGEX_TYPE]) - -dnl  dnl round fuzz  dnl  AC_MSG_CHECKING([whether rounding works as expected]) @@ -502,7 +456,7 @@ PHP_NEW_EXTENSION(standard, array.c base64.c basic_functions.c browscap.c crc32.                              
flock_compat.c formatted_print.c fsock.c head.c html.c image.c \                              info.c iptc.c lcg.c link.c mail.c math.c md5.c metaphone.c \                              microtime.c pack.c pageinfo.c quot_print.c rand.c \ -                            reg.c soundex.c string.c scanf.c syslog.c type.c uniqid.c url.c \ +                            soundex.c string.c scanf.c syslog.c type.c uniqid.c url.c \                              url_scanner.c var.c versioning.c assert.c strnatcmp.c levenshtein.c \                              incomplete_class.c url_scanner_ex.c ftp_fopen_wrapper.c \                              http_fopen_wrapper.c php_fopen_wrapper.c credits.c css.c \ diff --git a/ext/standard/config.w32 b/ext/standard/config.w32 index bbfde2c88c..81352ea5e3 100644 --- a/ext/standard/config.w32 +++ b/ext/standard/config.w32 @@ -10,7 +10,7 @@ EXTENSION("standard", "array.c base64.c basic_functions.c browscap.c \  	crc32.c crypt.c cyr_convert.c datetime.c dir.c dl.c dns.c exec.c \  	file.c filestat.c formatted_print.c fsock.c head.c html.c image.c \  	info.c iptc.c lcg.c link.c mail.c math.c md5.c metaphone.c microtime.c \ -	pack.c pageinfo.c quot_print.c rand.c reg.c soundex.c \ +	pack.c pageinfo.c quot_print.c rand.c soundex.c \  	string.c scanf.c syslog.c type.c uniqid.c url.c url_scanner.c var.c \  	versioning.c assert.c strnatcmp.c levenshtein.c incomplete_class.c \  	url_scanner_ex.c ftp_fopen_wrapper.c http_fopen_wrapper.c \ diff --git a/ext/standard/html.c b/ext/standard/html.c index 0160cef571..dbacf6bb13 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -37,7 +37,6 @@  #else  #include <php_config.h>  #endif -#include "reg.h"  #include "html.h"  #include "php_string.h"  #include "SAPI.h" diff --git a/ext/standard/php_standard.h b/ext/standard/php_standard.h index 2ad546b128..9474eb32e8 100644 --- a/ext/standard/php_standard.h +++ b/ext/standard/php_standard.h @@ -24,7 +24,6 @@  #include "base64.h"  #include "php_dir.h"  #include 
"dns.h" -#include "reg.h"  #include "php_mail.h"  #include "md5.h"  #include "sha1.h" diff --git a/ext/standard/string.c b/ext/standard/string.c index bb9fc79b5e..45c36d5c0c 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -24,7 +24,6 @@  #include <stdio.h>  #include "php.h" -#include "reg.h"  #include "php_rand.h"  #include "php_string.h"  #include "php_variables.h"  | 
