diff options
Diffstat (limited to 'ext/mbstring/oniguruma/regparse.c')
| -rw-r--r-- | ext/mbstring/oniguruma/regparse.c | 1774 |
1 files changed, 1031 insertions, 743 deletions
diff --git a/ext/mbstring/oniguruma/regparse.c b/ext/mbstring/oniguruma/regparse.c index 2260df4155..58e122f486 100644 --- a/ext/mbstring/oniguruma/regparse.c +++ b/ext/mbstring/oniguruma/regparse.c @@ -1,120 +1,36 @@ /********************************************************************** - regparse.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regparse.h" #define WARN_BUFSIZE 256 -#define SYN_POSIX_COMMON_OP \ - ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_POSIX_BRACKET | \ - ONIG_SYN_OP_DECIMAL_BACKREF | \ - ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_ASTERISK_ZERO_INF | \ - ONIG_SYN_OP_LINE_ANCHOR | \ - ONIG_SYN_OP_ESC_CONTROL_CHARS ) - -#define SYN_GNU_REGEX_OP \ - ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | \ - ONIG_SYN_OP_POSIX_BRACKET | ONIG_SYN_OP_DECIMAL_BACKREF | \ - ONIG_SYN_OP_BRACE_INTERVAL | ONIG_SYN_OP_LPAREN_SUBEXP | \ - ONIG_SYN_OP_VBAR_ALT | \ - ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_PLUS_ONE_INF | \ - ONIG_SYN_OP_QMARK_ZERO_ONE | \ - ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR | ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR | \ - ONIG_SYN_OP_ESC_W_WORD | \ - ONIG_SYN_OP_ESC_B_WORD_BOUND | ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | \ - ONIG_SYN_OP_ESC_S_WHITE_SPACE | ONIG_SYN_OP_ESC_D_DIGIT | \ - ONIG_SYN_OP_LINE_ANCHOR ) - -#define SYN_GNU_REGEX_BV \ - ( ONIG_SYN_CONTEXT_INDEP_ANCHORS | ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS | \ - ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS | ONIG_SYN_ALLOW_INVALID_INTERVAL | \ - ONIG_SYN_BACKSLASH_ESCAPE_IN_CC | ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC ) - -#ifdef USE_VARIABLE_SYNTAX -OnigSyntaxType OnigSyntaxPosixBasic = { - ( SYN_POSIX_COMMON_OP | ONIG_SYN_OP_ESC_LPAREN_SUBEXP | - ONIG_SYN_OP_ESC_BRACE_INTERVAL ) - , 0 - , 0 - , ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_MULTILINE ) -}; - -OnigSyntaxType OnigSyntaxPosixExtended = { - ( SYN_POSIX_COMMON_OP | ONIG_SYN_OP_LPAREN_SUBEXP | - ONIG_SYN_OP_BRACE_INTERVAL | - ONIG_SYN_OP_PLUS_ONE_INF | ONIG_SYN_OP_QMARK_ZERO_ONE | ONIG_SYN_OP_VBAR_ALT ) - , 0 - , ( ONIG_SYN_CONTEXT_INDEP_ANCHORS | - ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS | ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS | - ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP | - ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC ) - , ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_MULTILINE ) -}; - -OnigSyntaxType OnigSyntaxEmacs = { - ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | - ONIG_SYN_OP_ESC_BRACE_INTERVAL | - ONIG_SYN_OP_ESC_LPAREN_SUBEXP | ONIG_SYN_OP_ESC_VBAR_ALT | - ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_PLUS_ONE_INF | - ONIG_SYN_OP_QMARK_ZERO_ONE | ONIG_SYN_OP_DECIMAL_BACKREF | - ONIG_SYN_OP_LINE_ANCHOR | ONIG_SYN_OP_ESC_CONTROL_CHARS ) - , ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR - , ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC - , ONIG_OPTION_NONE -}; - -OnigSyntaxType OnigSyntaxGrep = { - ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_POSIX_BRACKET | - ONIG_SYN_OP_BRACE_INTERVAL | ONIG_SYN_OP_ESC_LPAREN_SUBEXP | - ONIG_SYN_OP_ESC_VBAR_ALT | - ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_ESC_PLUS_ONE_INF | - ONIG_SYN_OP_ESC_QMARK_ZERO_ONE | ONIG_SYN_OP_LINE_ANCHOR | - ONIG_SYN_OP_ESC_W_WORD | ONIG_SYN_OP_ESC_B_WORD_BOUND | - ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | ONIG_SYN_OP_DECIMAL_BACKREF ) - , 0 - , ( ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC | ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC ) - , ONIG_OPTION_NONE -}; - -OnigSyntaxType OnigSyntaxGnuRegex = { - SYN_GNU_REGEX_OP - , 0 - , SYN_GNU_REGEX_BV - , ONIG_OPTION_NONE -}; - -OnigSyntaxType OnigSyntaxJava = { - (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | - ONIG_SYN_OP_ESC_CONTROL_CHARS | ONIG_SYN_OP_ESC_C_CONTROL | - ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 ) - & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) - , ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE | ONIG_SYN_OP2_QMARK_GROUP_EFFECT | - ONIG_SYN_OP2_OPTION_PERL | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | - ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL | ONIG_SYN_OP2_CCLASS_SET_OP | - ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 | - ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY ) - , ( SYN_GNU_REGEX_BV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND ) - , ONIG_OPTION_SINGLELINE -}; - -OnigSyntaxType OnigSyntaxPerl = { - (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | - ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | - ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS | - ONIG_SYN_OP_ESC_C_CONTROL ) - & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) - , ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE | - ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL | - ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY ) - , SYN_GNU_REGEX_BV - , ONIG_OPTION_SINGLELINE -}; -#endif /* USE_VARIABLE_SYNTAX */ - OnigSyntaxType OnigSyntaxRuby = { (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | @@ -127,12 +43,14 @@ OnigSyntaxType OnigSyntaxRuby = { ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | - ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB ) + ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | + ONIG_SYN_OP2_ESC_H_XDIGIT ) , ( SYN_GNU_REGEX_BV | ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | + ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) , ONIG_OPTION_NONE @@ -140,89 +58,7 @@ OnigSyntaxType OnigSyntaxRuby = { OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; -#ifdef USE_VARIABLE_SYNTAX -extern int -onig_set_default_syntax(OnigSyntaxType* syntax) -{ - if (IS_NULL(syntax)) - syntax = ONIG_SYNTAX_RUBY; - - OnigDefaultSyntax = syntax; - return 0; -} - -extern void -onig_copy_syntax(OnigSyntaxType* to, OnigSyntaxType* from) -{ - *to = *from; -} - -extern void -onig_set_syntax_op(OnigSyntaxType* syntax, unsigned int op) -{ - syntax->op = op; -} - -extern void -onig_set_syntax_op2(OnigSyntaxType* syntax, unsigned int op2) -{ - syntax->op2 = op2; -} - -extern void -onig_set_syntax_behavior(OnigSyntaxType* syntax, unsigned int behavior) -{ - syntax->behavior = behavior; -} - -extern void -onig_set_syntax_options(OnigSyntaxType* syntax, OnigOptionType options) -{ - syntax->options = options; -} -#endif - -OnigMetaCharTableType OnigMetaCharTable = { - (OnigCodePoint )'\\' /* esc */ - , (OnigCodePoint )0 /* anychar '.' */ - , (OnigCodePoint )0 /* anytime '*' */ - , (OnigCodePoint )0 /* zero or one time '?' */ - , (OnigCodePoint )0 /* one or more time '+' */ - , (OnigCodePoint )0 /* anychar anytime */ -}; - -#ifdef USE_VARIABLE_META_CHARS -extern int onig_set_meta_char(unsigned int what, unsigned int c) -{ - switch (what) { - case ONIG_META_CHAR_ESCAPE: - OnigMetaCharTable.esc = c; - break; - case ONIG_META_CHAR_ANYCHAR: - OnigMetaCharTable.anychar = c; - break; - case ONIG_META_CHAR_ANYTIME: - OnigMetaCharTable.anytime = c; - break; - case ONIG_META_CHAR_ZERO_OR_ONE_TIME: - OnigMetaCharTable.zero_or_one_time = c; - break; - case ONIG_META_CHAR_ONE_OR_MORE_TIME: - OnigMetaCharTable.one_or_more_time = c; - break; - case ONIG_META_CHAR_ANYCHAR_ANYTIME: - OnigMetaCharTable.anychar_anytime = c; - break; - default: - return ONIGERR_INVALID_ARGUMENT; - break; - } - return 0; -} -#endif /* USE_VARIABLE_META_CHARS */ - - -extern void onig_null_warn(char* s) { } +extern void onig_null_warn(const char* s) { } #ifdef DEFAULT_WARN_FUNCTION static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; @@ -272,12 +108,15 @@ bbuf_clone(BBuf** rto, BBuf* from) #define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f)) -#define SET_ALL_MULTI_BYTE_RANGE(pbuf) \ - add_code_range_to_buf(pbuf, (OnigCodePoint )0x80, ~((OnigCodePoint )0)) +#define MBCODE_START_POS(enc) \ + (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80) + +#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \ + add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0)) -#define ADD_ALL_MULTI_BYTE_RANGE(code, mbuf) do {\ - if (! ONIGENC_IS_SINGLEBYTE(code)) {\ - r = SET_ALL_MULTI_BYTE_RANGE(&(mbuf));\ +#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\ + if (! ONIGENC_IS_SINGLEBYTE(enc)) {\ + r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\ if (r) return r;\ }\ } while (0) @@ -359,7 +198,7 @@ bitset_copy(BitSetRef dest, BitSetRef bs) } extern int -onig_strncmp(UChar* s1, UChar* s2, int n) +onig_strncmp(const UChar* s1, const UChar* s2, int n) { int x; @@ -371,7 +210,7 @@ onig_strncmp(UChar* s1, UChar* s2, int n) } static void -k_strcpy(UChar* dest, UChar* src, UChar* end) +k_strcpy(UChar* dest, const UChar* src, const UChar* end) { int len = end - src; if (len > 0) { @@ -380,33 +219,47 @@ k_strcpy(UChar* dest, UChar* src, UChar* end) } } -extern UChar* -onig_strdup(UChar* s, UChar* end) +static UChar* +strdup_with_null(OnigEncoding enc, UChar* s, UChar* end) { - int len = end - s; + int slen, term_len, i; + UChar *r; - if (len > 0) { - UChar* r = (UChar* )xmalloc(len + 1); - CHECK_NULL_RETURN(r); - xmemcpy(r, s, len); - r[len] = (UChar )0; - return r; - } - else return NULL; + slen = end - s; + term_len = ONIGENC_MBC_MINLEN(enc); + + r = (UChar* )xmalloc(slen + term_len); + CHECK_NULL_RETURN(r); + xmemcpy(r, s, slen); + + for (i = 0; i < term_len; i++) + r[slen + i] = (UChar )0; + + return r; } -/* scan pattern methods */ -#define PEND_VALUE -1 -#define PFETCH(c) do { (c) = *p++; } while (0) -#define PUNFETCH p-- -#define PINC p++ -#define PPEEK (p < end ? *p : PEND_VALUE) -#define PEND (p < end ? 0 : 1) +/* scan pattern methods */ +#define PEND_VALUE 0 + +#define PFETCH_READY UChar* pfetch_prev +#define PEND (p < end ? 0 : 1) +#define PUNFETCH p = pfetch_prev +#define PINC do { \ + pfetch_prev = p; \ + p += ONIGENC_MBC_ENC_LEN(enc, p); \ +} while (0) +#define PFETCH(c) do { \ + c = ONIGENC_MBC_TO_CODE(enc, p, end); \ + pfetch_prev = p; \ + p += ONIGENC_MBC_ENC_LEN(enc, p); \ +} while (0) +#define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE) +#define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c) static UChar* -k_strcat_capa(UChar* dest, UChar* dest_end, UChar* src, UChar* src_end, +k_strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end, int capa) { UChar* r; @@ -424,7 +277,7 @@ k_strcat_capa(UChar* dest, UChar* dest_end, UChar* src, UChar* src_end, /* dest on static area */ static UChar* strcat_capa_from_static(UChar* dest, UChar* dest_end, - UChar* src, UChar* src_end, int capa) + const UChar* src, const UChar* src_end, int capa) { UChar* r; @@ -450,7 +303,7 @@ typedef struct { #ifdef USE_ST_HASH_TABLE -#include <st.h> +#include "st.h" typedef st_table NameTable; typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ @@ -487,7 +340,7 @@ onig_print_names(FILE* fp, regex_t* reg) if (IS_NOT_NULL(t)) { fprintf(fp, "name table\n"); - st_foreach(t, i_print_name_entry, (HashDataType )fp); + onig_st_foreach(t, i_print_name_entry, (HashDataType )fp); fputs("\n", fp); } return 0; @@ -508,7 +361,7 @@ names_clear(regex_t* reg) NameTable* t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t)) { - st_foreach(t, i_free_name_entry, 0); + onig_st_foreach(t, i_free_name_entry, 0); } return 0; } @@ -523,56 +376,39 @@ onig_names_free(regex_t* reg) if (r) return r; t = (NameTable* )reg->name_table; - if (IS_NOT_NULL(t)) st_free_table(t); + if (IS_NOT_NULL(t)) onig_st_free_table(t); reg->name_table = (void* )NULL; return 0; } static NameEntry* -name_find(regex_t* reg, UChar* name, UChar* name_end) +name_find(regex_t* reg, const UChar* name, const UChar* name_end) { - int len; - UChar namebuf[NAMEBUF_SIZE_1]; - UChar *key; NameEntry* e; NameTable* t = (NameTable* )reg->name_table; e = (NameEntry* )NULL; if (IS_NOT_NULL(t)) { - if (*name_end == '\0') { - key = name; - } - else { - /* dirty, but st.c API claims NULL terminated key. */ - len = name_end - name; - if (len <= NAMEBUF_SIZE) { - xmemcpy(namebuf, name, len); - namebuf[len] = '\0'; - key = namebuf; - } - else { - key = onig_strdup(name, name_end); - if (IS_NULL(key)) return (NameEntry* )NULL; - } - } - - st_lookup(t, (HashDataType )key, (HashDataType * )&e); - if (key != name && key != namebuf) xfree(key); + onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e))); } return e; } typedef struct { - int (*func)(UChar*,UChar*,int,int*,regex_t*,void*); + int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*); regex_t* reg; void* arg; int ret; + OnigEncoding enc; } INamesArg; static int i_names(UChar* key, NameEntry* e, INamesArg* arg) { - int r = (*(arg->func))(e->name, e->name + strlen(e->name), e->back_num, + int r = (*(arg->func))(e->name, + /*e->name + onigenc_str_bytelen_null(arg->enc, e->name), */ + e->name + e->name_len, + e->back_num, (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), arg->reg, arg->arg); if (r != 0) { @@ -584,8 +420,8 @@ i_names(UChar* key, NameEntry* e, INamesArg* arg) extern int onig_foreach_name(regex_t* reg, - int (*func)(UChar*,UChar*,int,int*,regex_t*,void*), - void* arg) + int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), + void* arg) { INamesArg narg; NameTable* t = (NameTable* )reg->name_table; @@ -595,11 +431,41 @@ onig_foreach_name(regex_t* reg, narg.func = func; narg.reg = reg; narg.arg = arg; - st_foreach(t, i_names, (HashDataType )&narg); + narg.enc = reg->enc; /* should be pattern encoding. */ + onig_st_foreach(t, i_names, (HashDataType )&narg); } return narg.ret; } +static int +i_renumber_name(UChar* key, NameEntry* e, GroupNumRemap* map) +{ + int i; + + if (e->back_num > 1) { + for (i = 0; i < e->back_num; i++) { + e->back_refs[i] = map[e->back_refs[i]].new_val; + } + } + else if (e->back_num == 1) { + e->back_ref1 = map[e->back_ref1].new_val; + } + + return ST_CONTINUE; +} + +extern int +onig_renumber_name_table(regex_t* reg, GroupNumRemap* map) +{ + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) { + onig_st_foreach(t, i_renumber_name, (HashDataType )map); + } + return 0; +} + + extern int onig_number_of_names(regex_t* reg) { @@ -719,8 +585,8 @@ name_find(regex_t* reg, UChar* name, UChar* name_end) extern int onig_foreach_name(regex_t* reg, - int (*func)(UChar*,UChar*,int,int*,regex_t*,void*), - void* arg) + int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), + void* arg) { int i, r; NameEntry* e; @@ -765,14 +631,16 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) if (IS_NULL(e)) { #ifdef USE_ST_HASH_TABLE if (IS_NULL(t)) { - reg->name_table = t = st_init_strtable(); + t = onig_st_init_strend_table_with_size(5); + reg->name_table = (void* )t; } e = (NameEntry* )xmalloc(sizeof(NameEntry)); CHECK_NULL_RETURN_VAL(e, ONIGERR_MEMORY); - e->name = onig_strdup(name, name_end); + e->name = strdup_with_null(reg->enc, name, name_end); if (IS_NULL(e->name)) return ONIGERR_MEMORY; - st_insert(t, (HashDataType )e->name, (HashDataType )e); + onig_st_insert_strend(t, e->name, (e->name + (name_end - name)), + (HashDataType )e); e->name_len = name_end - name; e->back_num = 0; @@ -817,7 +685,7 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) } e = &(t->e[t->num]); t->num++; - e->name = onig_strdup(name, name_end); + e->name = strdup_with_null(reg->enc, name, name_end); e->name_len = name_end - name; #endif } @@ -857,8 +725,8 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) } extern int -onig_name_to_group_numbers(regex_t* reg, UChar* name, UChar* name_end, - int** nums) +onig_name_to_group_numbers(regex_t* reg, const UChar* name, + const UChar* name_end, int** nums) { NameEntry* e; @@ -879,8 +747,8 @@ onig_name_to_group_numbers(regex_t* reg, UChar* name, UChar* name_end, } extern int -onig_name_to_backref_number(regex_t* reg, UChar* name, UChar* name_end, - OnigRegion *region) +onig_name_to_backref_number(regex_t* reg, const UChar* name, + const UChar* name_end, OnigRegion *region) { int i, n, *nums; @@ -905,23 +773,23 @@ onig_name_to_backref_number(regex_t* reg, UChar* name, UChar* name_end, #else /* USE_NAMED_GROUP */ extern int -onig_name_to_group_numbers(regex_t* reg, UChar* name, UChar* name_end, - int** nums) +onig_name_to_group_numbers(regex_t* reg, const UChar* name, + const UChar* name_end, int** nums) { return ONIG_NO_SUPPORT_CONFIG; } extern int -onig_name_to_backref_number(regex_t* reg, UChar* name, UChar* name_end, - OnigRegion* region) +onig_name_to_backref_number(regex_t* reg, const UChar* name, + const UChar* name_end, OnigRegion* region) { return ONIG_NO_SUPPORT_CONFIG; } extern int onig_foreach_name(regex_t* reg, - int (*func)(UChar*,UChar*,int,int*,regex_t*,void*), - void* arg) + int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), + void* arg) { return ONIG_NO_SUPPORT_CONFIG; } @@ -1014,6 +882,7 @@ static FreeNode* FreeNodeList = (FreeNode* )NULL; extern void onig_node_free(Node* node) { + start: if (IS_NULL(node)) return ; switch (NTYPE(node)) { @@ -1026,12 +895,38 @@ onig_node_free(Node* node) case N_LIST: case N_ALT: onig_node_free(NCONS(node).left); - onig_node_free(NCONS(node).right); + /* onig_node_free(NCONS(node).right); */ + { + Node* next_node = NCONS(node).right; + +#ifdef USE_RECYCLE_NODE + { + FreeNode* n = (FreeNode* )node; + + THREAD_ATOMIC_START; + n->next = FreeNodeList; + FreeNodeList = n; + THREAD_ATOMIC_END; + } +#else + xfree(node); +#endif + + node = next_node; + goto start; + } break; case N_CCLASS: - if (NCCLASS(node).mbuf) - bbuf_free(NCCLASS(node).mbuf); + { + CClassNode* cc = &(NCCLASS(node)); + + if (IS_CCLASS_SHARE(cc)) + return ; + + if (cc->mbuf) + bbuf_free(cc->mbuf); + } break; case N_QUALIFIER: @@ -1057,11 +952,12 @@ onig_node_free(Node* node) #ifdef USE_RECYCLE_NODE { - FreeNode* n; + FreeNode* n = (FreeNode* )node; - n = (FreeNode* )node; + THREAD_ATOMIC_START; n->next = FreeNodeList; FreeNodeList = n; + THREAD_ATOMIC_END; } #else xfree(node); @@ -1092,8 +988,10 @@ node_new() #ifdef USE_RECYCLE_NODE if (IS_NOT_NULL(FreeNodeList)) { + THREAD_ATOMIC_START; node = (Node* )FreeNodeList; FreeNodeList = FreeNodeList->next; + THREAD_ATOMIC_END; return node; } #endif @@ -1107,8 +1005,8 @@ static void initialize_cclass(CClassNode* cc) { BITSET_CLEAR(cc->bs); - cc->not = 0; - cc->mbuf = NULL; + cc->flags = 0; + cc->mbuf = NULL; } static Node* @@ -1122,6 +1020,54 @@ node_new_cclass() return node; } +extern Node* +node_new_cclass_by_codepoint_range(int not, + OnigCodePoint sbr[], OnigCodePoint mbr[]) +{ + CClassNode* cc; + int n, i, j; + + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_CCLASS; + + cc = &(NCCLASS(node)); + cc->flags = 0; + if (not != 0) CCLASS_SET_NOT(cc); + + BITSET_CLEAR(cc->bs); + if (IS_NOT_NULL(sbr)) { + n = ONIGENC_CODE_RANGE_NUM(sbr); + for (i = 0; i < n; i++) { + for (j = ONIGENC_CODE_RANGE_FROM(sbr, i); + j <= (int )ONIGENC_CODE_RANGE_TO(sbr, i); j++) { + BITSET_SET_BIT(cc->bs, j); + } + } + } + + if (IS_NULL(mbr)) { + is_null: + cc->mbuf = NULL; + } + else { + BBuf* bbuf; + + n = ONIGENC_CODE_RANGE_NUM(mbr); + if (n == 0) goto is_null; + + bbuf = (BBuf* )xmalloc(sizeof(BBuf)); + CHECK_NULL_RETURN_VAL(bbuf, NULL); + bbuf->alloc = n + 1; + bbuf->used = n + 1; + bbuf->p = (UChar* )((void* )mbr); + + cc->mbuf = bbuf; + } + + return node; +} + static Node* node_new_ctype(int type) { @@ -1152,6 +1098,12 @@ node_new_list(Node* left, Node* right) return node; } +extern Node* +onig_node_new_list(Node* left, Node* right) +{ + return node_new_list(left, right); +} + static Node* node_new_alt(Node* left, Node* right) { @@ -1237,6 +1189,7 @@ node_new_qualifier(int lower, int upper, int by_number) Node* node = node_new(); CHECK_NULL_RETURN(node); node->type = N_QUALIFIER; + NQUALIFIER(node).state = 0; NQUALIFIER(node).target = NULL; NQUALIFIER(node).lower = lower; NQUALIFIER(node).upper = upper; @@ -1295,7 +1248,7 @@ node_new_option(OnigOptionType option) } extern int -onig_node_str_cat(Node* node, UChar* s, UChar* end) +onig_node_str_cat(Node* node, const UChar* s, const UChar* end) { int addlen = end - s; @@ -1350,8 +1303,22 @@ onig_node_conv_to_str_node(Node* node, int flag) NSTRING(node).end = NSTRING(node).buf; } +extern void +onig_node_str_clear(Node* node) +{ + if (NSTRING(node).capa != 0 && + IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) { + xfree(NSTRING(node).s); + } + + NSTRING(node).capa = 0; + NSTRING(node).flag = 0; + NSTRING(node).s = NSTRING(node).buf; + NSTRING(node).end = NSTRING(node).buf; +} + static Node* -node_new_str(UChar* s, UChar* end) +node_new_str(const UChar* s, const UChar* end) { Node* node = node_new(); CHECK_NULL_RETURN(node); @@ -1368,6 +1335,12 @@ node_new_str(UChar* s, UChar* end) return node; } +extern Node* +onig_node_new_str(const UChar* s, const UChar* end) +{ + return node_new_str(s, end); +} + static Node* node_new_str_raw(UChar* s, UChar* end) { @@ -1383,15 +1356,6 @@ node_new_empty() } static Node* -node_new_str_char(UChar c) -{ - UChar p[1]; - - p[0] = c; - return node_new_str(p, p + 1); -} - -static Node* node_new_str_raw_char(UChar c) { UChar p[1]; @@ -1403,7 +1367,7 @@ node_new_str_raw_char(UChar c) static Node* str_node_split_last_char(StrNode* sn, OnigEncoding enc) { - UChar *p; + const UChar *p; Node* n = NULL_NODE; if (sn->end > sn->s) { @@ -1412,7 +1376,7 @@ str_node_split_last_char(StrNode* sn, OnigEncoding enc) n = node_new_str(p, sn->end); if ((sn->flag & NSTR_RAW) != 0) NSTRING_SET_RAW(n); - sn->end = p; + sn->end = (UChar* )p; } } return n; @@ -1422,17 +1386,18 @@ static int str_node_can_be_split(StrNode* sn, OnigEncoding enc) { if (sn->end > sn->s) { - return ((enc_len(enc, *(sn->s)) < sn->end - sn->s) ? 1 : 0); + return ((enc_len(enc, sn->s) < sn->end - sn->s) ? 1 : 0); } return 0; } extern int -onig_scan_unsigned_number(UChar** src, UChar* end, OnigEncoding enc) +onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) { unsigned int num, val; - int c; + OnigCodePoint c; UChar* p = *src; + PFETCH_READY; num = 0; while (!PEND) { @@ -1457,9 +1422,10 @@ static int scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen, OnigEncoding enc) { - int c; + OnigCodePoint c; unsigned int num, val; UChar* p = *src; + PFETCH_READY; num = 0; while (!PEND && maxlen-- != 0) { @@ -1484,9 +1450,10 @@ static int scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, OnigEncoding enc) { - int c; + OnigCodePoint c; unsigned int num, val; UChar* p = *src; + PFETCH_READY; num = 0; while (!PEND && maxlen-- != 0) { @@ -1622,15 +1589,15 @@ add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) } static int -not_code_range_buf(BBuf* bbuf, BBuf** pbuf) +not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf) { int r, i, n; - OnigCodePoint pre, from, to, *data; + OnigCodePoint pre, from, *data, to = 0; *pbuf = (BBuf* )NULL; if (IS_NULL(bbuf)) { set_all: - return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); } data = (OnigCodePoint* )(bbuf->p); @@ -1639,7 +1606,7 @@ not_code_range_buf(BBuf* bbuf, BBuf** pbuf) if (n <= 0) goto set_all; r = 0; - pre = 0x80; + pre = MBCODE_START_POS(enc); for (i = 0; i < n; i++) { from = data[i*2]; to = data[i*2+1]; @@ -1664,7 +1631,8 @@ not_code_range_buf(BBuf* bbuf, BBuf** pbuf) } while (0) static int -or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) +or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1, + BBuf* bbuf2, int not2, BBuf** pbuf) { int r; OnigCodePoint i, n1, *data1; @@ -1673,7 +1641,7 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) *pbuf = (BBuf* )NULL; if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) { if (not1 != 0 || not2 != 0) - return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); return 0; } @@ -1683,14 +1651,14 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) if (IS_NULL(bbuf1)) { if (not1 != 0) { - return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); } else { if (not2 == 0) { return bbuf_clone(pbuf, bbuf2); } else { - return not_code_range_buf(bbuf2, pbuf); + return not_code_range_buf(enc, bbuf2, pbuf); } } } @@ -1706,7 +1674,7 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) r = bbuf_clone(pbuf, bbuf2); } else if (not1 == 0) { /* 1 OR (not 2) */ - r = not_code_range_buf(bbuf2, pbuf); + r = not_code_range_buf(enc, bbuf2, pbuf); } if (r != 0) return r; @@ -1817,6 +1785,29 @@ and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) } static int +clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) +{ + BBuf *tbuf; + int r; + + if (IS_CCLASS_NOT(cc)) { + bitset_invert(cc->bs); + + if (! ONIGENC_IS_SINGLEBYTE(enc)) { + r = not_code_range_buf(enc, cc->mbuf, &tbuf); + if (r != 0) return r; + + bbuf_free(cc->mbuf); + cc->mbuf = tbuf; + } + + CCLASS_CLEAR_NOT(cc); + } + + return 0; +} + +static int and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) { int r, not1, not2; @@ -1824,10 +1815,10 @@ and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) BitSetRef bsr1, bsr2; BitSet bs1, bs2; - not1 = dest->not; + not1 = IS_CCLASS_NOT(dest); bsr1 = dest->bs; buf1 = dest->mbuf; - not2 = cc->not; + not2 = IS_CCLASS_NOT(cc); bsr2 = cc->bs; buf2 = cc->mbuf; @@ -1850,13 +1841,13 @@ and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) if (! ONIGENC_IS_SINGLEBYTE(enc)) { if (not1 != 0 && not2 != 0) { - r = or_code_range_buf(buf1, 0, buf2, 0, &pbuf); + r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf); } else { r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf); if (r == 0 && not1 != 0) { BBuf *tbuf; - r = not_code_range_buf(pbuf, &tbuf); + r = not_code_range_buf(enc, pbuf, &tbuf); if (r != 0) { bbuf_free(pbuf); return r; @@ -1882,10 +1873,10 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) BitSetRef bsr1, bsr2; BitSet bs1, bs2; - not1 = dest->not; + not1 = IS_CCLASS_NOT(dest); bsr1 = dest->bs; buf1 = dest->mbuf; - not2 = cc->not; + not2 = IS_CCLASS_NOT(cc); bsr2 = cc->bs; buf2 = cc->mbuf; @@ -1911,10 +1902,10 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf); } else { - r = or_code_range_buf(buf1, not1, buf2, not2, &pbuf); + r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf); if (r == 0 && not1 != 0) { BBuf *tbuf; - r = not_code_range_buf(pbuf, &tbuf); + r = not_code_range_buf(enc, pbuf, &tbuf); if (r != 0) { bbuf_free(pbuf); return r; @@ -2014,26 +2005,29 @@ popular_qualifier_num(QualifierNode* qf) return -1; } + +enum ReduceType { + RQ_ASIS = 0, /* as is */ + RQ_DEL = 1, /* delete parent */ + RQ_A, /* to '*' */ + RQ_AQ, /* to '*?' */ + RQ_QQ, /* to '??' */ + RQ_P_QQ, /* to '+)??' */ + RQ_PQ_Q, /* to '+?)?' */ +}; + +static enum ReduceType ReduceTypeTable[6][6] = { + {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */ + {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */ + {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */ + {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */ + {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */ + {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */ +}; + extern void onig_reduce_nested_qualifier(Node* pnode, Node* cnode) { -#define NQ_ASIS 0 /* as is */ -#define NQ_DEL 1 /* delete parent */ -#define NQ_A 2 /* to '*' */ -#define NQ_AQ 3 /* to '*?' */ -#define NQ_QQ 4 /* to '??' */ -#define NQ_P_QQ 5 /* to '+)??' */ -#define NQ_PQ_Q 6 /* to '+?)?' */ - - static char reduces[][6] = { - {NQ_DEL, NQ_A, NQ_A, NQ_QQ, NQ_AQ, NQ_ASIS}, /* '?' */ - {NQ_DEL, NQ_DEL, NQ_DEL, NQ_P_QQ, NQ_P_QQ, NQ_DEL}, /* '*' */ - {NQ_A, NQ_A, NQ_DEL, NQ_ASIS, NQ_P_QQ, NQ_DEL}, /* '+' */ - {NQ_DEL, NQ_AQ, NQ_AQ, NQ_DEL, NQ_AQ, NQ_AQ}, /* '??' */ - {NQ_DEL, NQ_DEL, NQ_DEL, NQ_DEL, NQ_DEL, NQ_DEL}, /* '*?' */ - {NQ_ASIS, NQ_PQ_Q, NQ_DEL, NQ_AQ, NQ_AQ, NQ_DEL} /* '+?' */ - }; - int pnum, cnum; QualifierNode *p, *c; @@ -2042,35 +2036,35 @@ onig_reduce_nested_qualifier(Node* pnode, Node* cnode) pnum = popular_qualifier_num(p); cnum = popular_qualifier_num(c); - switch(reduces[cnum][pnum]) { - case NQ_DEL: + switch(ReduceTypeTable[cnum][pnum]) { + case RQ_DEL: *p = *c; break; - case NQ_A: + case RQ_A: p->target = c->target; p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; break; - case NQ_AQ: + case RQ_AQ: p->target = c->target; p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; break; - case NQ_QQ: + case RQ_QQ: p->target = c->target; p->lower = 0; p->upper = 1; p->greedy = 0; break; - case NQ_P_QQ: + case RQ_P_QQ: p->target = cnode; p->lower = 0; p->upper = 1; p->greedy = 0; c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; return ; break; - case NQ_PQ_Q: + case RQ_PQ_Q: p->target = cnode; p->lower = 0; p->upper = 1; p->greedy = 1; c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; return ; break; - case NQ_ASIS: + case RQ_ASIS: p->target = cnode; return ; break; @@ -2083,8 +2077,9 @@ onig_reduce_nested_qualifier(Node* pnode, Node* cnode) enum TokenSyms { TK_EOT = 0, /* end of token */ - TK_BYTE = 1, - TK_RAW_BYTE = 2, + TK_RAW_BYTE = 1, + TK_CHAR, + TK_STRING, TK_CODE_POINT, TK_ANYCHAR, TK_CHAR_TYPE, @@ -2114,6 +2109,7 @@ typedef struct { int base; /* is number: 8, 16 (used in [....]) */ UChar* backp; union { + UChar* s; int c; OnigCodePoint code; int anchor; @@ -2145,8 +2141,11 @@ static int fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) { int low, up, syn_allow, non_low = 0; - int c; + int r = 0; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar* p = *src; + PFETCH_READY; syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL); @@ -2200,12 +2199,13 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) PUNFETCH; up = low; /* {n} : exact n times */ + r = 2; /* fixed */ } if (PEND) goto invalid; PFETCH(c); if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { - if (c != MC_ESC) goto invalid; + if (c != MC_ESC(enc)) goto invalid; PFETCH(c); } if (c != '}') goto invalid; @@ -2218,7 +2218,7 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) tok->u.repeat.lower = low; tok->u.repeat.upper = up; *src = p; - return 0; + return r; /* 0: normal {n,m}, 2: fixed {n} */ invalid: if (syn_allow) @@ -2231,10 +2231,13 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) static int fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) { - int c; + int v; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar* p = *src; + PFETCH_READY; - if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; + if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; PFETCH(c); switch (c) { @@ -2245,9 +2248,10 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) if (c != '-') return ONIGERR_META_CODE_SYNTAX; if (PEND) return ONIGERR_END_PATTERN_AT_META; PFETCH(c); - if (c == MC_ESC) { - c = fetch_escaped_value(&p, end, env); - if (c < 0) return c; + if (c == MC_ESC(enc)) { + v = fetch_escaped_value(&p, end, env); + if (v < 0) return v; + c = (OnigCodePoint )v; } c = ((c & 0xff) | 0x80); } @@ -2270,9 +2274,10 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) control: if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; PFETCH(c); - if (c == MC_ESC) { - c = fetch_escaped_value(&p, end, env); - if (c < 0) return c; + if (c == MC_ESC(enc)) { + v = fetch_escaped_value(&p, end, env); + if (v < 0) return v; + c = (OnigCodePoint )v; } else if (c == '?') c = 0177; @@ -2304,10 +2309,13 @@ static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); static int fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) { - int r, len, is_num; - int c = 0; + int r, is_num; + OnigCodePoint c = 0; + OnigCodePoint first_code; + OnigEncoding enc = env->enc; UChar *name_end; UChar *p = *src; + PFETCH_READY; name_end = end; r = 0; @@ -2317,19 +2325,20 @@ fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) } else { PFETCH(c); + first_code = c; if (c == '>') return ONIGERR_EMPTY_GROUP_NAME; - if (ONIGENC_IS_CODE_DIGIT(env->enc, c)) { + if (ONIGENC_IS_CODE_DIGIT(enc, c)) { if (ref == 1) is_num = 1; else { r = ONIGERR_INVALID_GROUP_NAME; } } - len = enc_len(env->enc, c); - while (!PEND && len-- > 1) - PFETCH(c); + else if (!ONIGENC_IS_CODE_WORD(enc, c)) { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + } } while (!PEND) { @@ -2337,35 +2346,28 @@ fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) PFETCH(c); if (c == '>' || c == ')') break; - len = enc_len(env->enc, c); if (is_num == 1) { - if (! ONIGENC_IS_CODE_DIGIT(env->enc, c)) { - if (!ONIGENC_IS_CODE_ALPHA(env->enc, c) && c != '_') + if (! ONIGENC_IS_CODE_DIGIT(enc, c)) { + if (!ONIGENC_IS_CODE_WORD(enc, c)) r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; else r = ONIGERR_INVALID_GROUP_NAME; } } else { - if (len == 1) { - if (!ONIGENC_IS_CODE_ALPHA(env->enc, c) && - !ONIGENC_IS_CODE_DIGIT(env->enc, c) && - c != '_') { - r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; - } + if (!ONIGENC_IS_CODE_WORD(enc, c)) { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; } } - - while (!PEND && len-- > 1) - PFETCH(c); } + if (c != '>') { r = ONIGERR_INVALID_GROUP_NAME; name_end = end; } else { - c = **src; - if (ONIGENC_IS_CODE_UPPER(env->enc, c)) + if (ONIGENC_IS_CODE_ASCII(first_code) && + ONIGENC_IS_CODE_UPPER(enc, first_code)) r = ONIGERR_INVALID_GROUP_NAME; } @@ -2384,19 +2386,21 @@ static int fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) { int r, len; - int c = 0; + OnigCodePoint c = 0; UChar *name_end; + OnigEncoding enc = env->enc; UChar *p = *src; + PFETCH_READY; r = 0; while (!PEND) { name_end = p; - PFETCH(c); - if (enc_len(env->enc, c) > 1) + if (enc_len(enc, p) > 1) r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + PFETCH(c); if (c == '>' || c == ')') break; - if (! ONIGENC_IS_CODE_DIGIT(env->enc, c)) + if (! ONIGENC_IS_CODE_DIGIT(enc, c)) r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; } if (c != '>') { @@ -2457,12 +2461,12 @@ find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, while (p < to) { x = ONIGENC_MBC_TO_CODE(enc, p, to); - q = p + enc_len(enc, *p); + q = p + enc_len(enc, p); if (x == s[0]) { for (i = 1; i < n && q < to; i++) { x = ONIGENC_MBC_TO_CODE(enc, q, to); if (x != s[i]) break; - q += enc_len(enc, *q); + q += enc_len(enc, q); } if (i >= n) { if (IS_NOT_NULL(next)) @@ -2488,24 +2492,24 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, while (p < to) { if (in_esc) { in_esc = 0; - p += enc_len(enc, *p); + p += enc_len(enc, p); } else { x = ONIGENC_MBC_TO_CODE(enc, p, to); - q = p + enc_len(enc, *p); + q = p + enc_len(enc, p); if (x == s[0]) { for (i = 1; i < n && q < to; i++) { x = ONIGENC_MBC_TO_CODE(enc, q, to); if (x != s[i]) break; - q += enc_len(enc, *q); + q += enc_len(enc, q); } if (i >= n) return 1; - p += enc_len(enc, *p); + p += enc_len(enc, p); } else { x = ONIGENC_MBC_TO_CODE(enc, p, to); if (x == bad) return 0; - else if (x == MC_ESC) in_esc = 1; + else if (x == MC_ESC(enc)) in_esc = 1; p = q; } } @@ -2516,10 +2520,13 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, static int fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int c, num; + int num; + OnigCodePoint c, c2; OnigSyntaxType* syn = env->syntax; + OnigEncoding enc = env->enc; UChar* prev; UChar* p = *src; + PFETCH_READY; if (PEND) { tok->type = TK_EOT; @@ -2527,7 +2534,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } PFETCH(c); - tok->type = TK_BYTE; + tok->type = TK_CHAR; tok->base = 0; tok->u.c = c; if (c == ']') { @@ -2536,11 +2543,11 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) else if (c == '-') { tok->type = TK_CC_RANGE; } - else if (c == MC_ESC) { + else if (c == MC_ESC(enc)) { if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) goto end; - if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; + if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; PFETCH(c); tok->escaped = 1; @@ -2570,14 +2577,34 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->type = TK_CHAR_TYPE; tok->u.subtype = CTYPE_NOT_WHITE_SPACE; break; + case 'h': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_XDIGIT; + break; + case 'H': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_XDIGIT; + break; case 'p': case 'P': - if (PPEEK == '{' && - IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY)) { + c2 = PPEEK; + if (c2 == '{' && + IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { PINC; tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); + + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { + PFETCH(c2); + if (c2 == '^') { + tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); + } + else + PUNFETCH; + } } break; @@ -2585,14 +2612,17 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (PEND) break; prev = p; - if (PPEEK == '{' && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { + if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - if (!PEND && ONIGENC_IS_CODE_XDIGIT(env->enc, *p) && p - prev >= 9) - return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + if (!PEND) { + c2 = PPEEK; + if (ONIGENC_IS_CODE_XDIGIT(enc, c2)) + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + } - if (p > prev + 1 && !PEND && PPEEK == '}') { + if (p > prev + enc_len(enc, prev) && !PEND && (PPEEK_IS('}'))) { PINC; tok->type = TK_CODE_POINT; tok->base = 16; @@ -2604,7 +2634,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2620,14 +2650,14 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; - tok->base = 16; - tok->u.c = num; + tok->type = TK_CODE_POINT; + tok->base = 16; + tok->u.code = (OnigCodePoint )num; } break; @@ -2636,7 +2666,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { PUNFETCH; prev = p; - num = scan_unsigned_octal_number(&p, end, 3, env->enc); + num = scan_unsigned_octal_number(&p, end, 3, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2652,19 +2682,19 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) num = fetch_escaped_value(&p, end, env); if (num < 0) return num; if (tok->u.c != num) { - tok->u.c = num; - tok->type = TK_RAW_BYTE; + tok->u.code = (OnigCodePoint )num; + tok->type = TK_CODE_POINT; } break; } } else if (c == '[') { - if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && PPEEK == ':') { + if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) { OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' }; tok->backp = p; /* point at '[' is readed */ PINC; - if (str_exist_check_with_esc(send, 2, p, end, (OnigCodePoint )']', - env->enc)) { + if (str_exist_check_with_esc(send, 2, p, end, + (OnigCodePoint )']', enc)) { tok->type = TK_POSIX_BRACKET_OPEN; } else { @@ -2684,7 +2714,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } else if (c == '&') { if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) && - !PEND && PPEEK == '&') { + !PEND && (PPEEK_IS('&'))) { PINC; tok->type = TK_CC_AND; } @@ -2698,10 +2728,13 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int r, c, num; + int r, num; + OnigCodePoint c; + OnigEncoding enc = env->enc; OnigSyntaxType* syn = env->syntax; UChar* prev; UChar* p = *src; + PFETCH_READY; start: if (PEND) { @@ -2709,13 +2742,17 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) return tok->type; } - tok->type = TK_BYTE; - tok->base = 0; + tok->type = TK_STRING; + tok->base = 0; + tok->backp = p; + PFETCH(c); - if (c == MC_ESC) { - if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; + if (c == MC_ESC(enc)) { + if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; + tok->backp = p; PFETCH(c); + tok->u.c = c; tok->escaped = 1; switch (c) { @@ -2741,37 +2778,42 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.repeat.lower = 0; tok->u.repeat.upper = 1; greedy_check: - if (!PEND && PPEEK == '?' && + if (!PEND && PPEEK_IS('?') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) { PFETCH(c); tok->u.repeat.greedy = 0; tok->u.repeat.possessive = 0; } - else if (!PEND && PPEEK == '+' && - ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && - tok->type != TK_INTERVAL) || - (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && - tok->type == TK_INTERVAL))) { - PFETCH(c); - tok->u.repeat.greedy = 1; - tok->u.repeat.possessive = 1; - } else { - tok->u.repeat.greedy = 1; - tok->u.repeat.possessive = 0; + possessive_check: + if (!PEND && PPEEK_IS('+') && + ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && + tok->type != TK_INTERVAL) || + (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && + tok->type == TK_INTERVAL))) { + PFETCH(c); + tok->u.repeat.greedy = 1; + tok->u.repeat.possessive = 1; + } + else { + tok->u.repeat.greedy = 1; + tok->u.repeat.possessive = 0; + } } break; case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; - tok->backp = p; r = fetch_range_qualifier(&p, end, tok, env); if (r < 0) return r; /* error */ - if (r > 0) { - /* normal char */ - } - else + if (r == 0) goto greedy_check; + else if (r == 2) { /* {n} */ + if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) + goto possessive_check; + goto greedy_check; + } + /* r == 1 : normal char */ break; case '|': @@ -2851,6 +2893,18 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.subtype = CTYPE_NOT_DIGIT; break; + case 'h': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_XDIGIT; + break; + + case 'H': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_XDIGIT; + break; + case 'A': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; begin_buf: @@ -2891,14 +2945,16 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (PEND) break; prev = p; - if (PPEEK == '{' && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { + if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - if (!PEND && ONIGENC_IS_CODE_XDIGIT(env->enc, *p) && p - prev >= 9) - return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + if (!PEND) { + if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK)) + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + } - if (p > prev + 1 && !PEND && PPEEK == '}') { + if ((p > prev + enc_len(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; tok->u.code = (OnigCodePoint )num; @@ -2909,7 +2965,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2925,14 +2981,14 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; - tok->base = 16; - tok->u.c = num; + tok->type = TK_CODE_POINT; + tok->base = 16; + tok->u.code = (OnigCodePoint )num; } break; @@ -2940,9 +2996,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '5': case '6': case '7': case '8': case '9': PUNFETCH; prev = p; - num = onig_scan_unsigned_number(&p, end, env->enc); - if (num < 0) return ONIGERR_TOO_BIG_NUMBER; - if (num > ONIG_MAX_BACKREF_NUM) return ONIGERR_TOO_BIG_BACKREF_NUMBER; + num = onig_scan_unsigned_number(&p, end, enc); + if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { + goto skip_backref; + } if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ @@ -2957,7 +3014,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.by_name = 0; break; } - else if (c == '8' || c == '9') { + + skip_backref: + if (c == '8' || c == '9') { /* normal char */ p = prev; PINC; break; @@ -2968,7 +3027,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '0': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { prev = p; - num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), env->enc); + num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -3054,11 +3113,20 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'p': case 'P': - if (PPEEK == '{' && - IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY)) { + if (PPEEK_IS('{') && + IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { PINC; tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); + + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { + PFETCH(c); + if (c == '^') { + tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); + } + else + PUNFETCH; + } } break; @@ -3068,8 +3136,11 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (num < 0) return num; /* set_raw: */ if (tok->u.c != num) { - tok->type = TK_RAW_BYTE; - tok->u.c = num; + tok->type = TK_CODE_POINT; + tok->u.code = (OnigCodePoint )num; + } + else { /* string */ + p = tok->backp + enc_len(enc, tok->backp); } break; } @@ -3081,15 +3152,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) #ifdef USE_VARIABLE_META_CHARS if ((c != ONIG_INEFFECTIVE_META_CHAR) && IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { - if (c == MC_ANYCHAR) + if (c == MC_ANYCHAR(enc)) goto any_char; - else if (c == MC_ANYTIME) + else if (c == MC_ANYTIME(enc)) goto anytime; - else if (c == MC_ZERO_OR_ONE_TIME) + else if (c == MC_ZERO_OR_ONE_TIME(enc)) goto zero_or_one_time; - else if (c == MC_ONE_OR_MORE_TIME) + else if (c == MC_ONE_OR_MORE_TIME(enc)) goto one_or_more_time; - else if (c == MC_ANYCHAR_ANYTIME) { + else if (c == MC_ANYCHAR_ANYTIME(enc)) { tok->type = TK_ANYCHAR_ANYTIME; goto out; } @@ -3132,14 +3203,16 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; - tok->backp = p; r = fetch_range_qualifier(&p, end, tok, env); if (r < 0) return r; /* error */ - if (r > 0) { - /* normal char */ - } - else + if (r == 0) goto greedy_check; + else if (r == 2) { /* {n} */ + if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) + goto possessive_check; + goto greedy_check; + } + /* r == 1 : normal char */ break; case '|': @@ -3148,6 +3221,26 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; case '(': + if (PPEEK_IS('?') && + IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { + PINC; + if (PPEEK_IS('#')) { + PFETCH(c); + while (1) { + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + if (c == MC_ESC(enc)) { + if (!PEND) PFETCH(c); + } + else { + if (c == ')') break; + } + } + goto start; + } + PUNFETCH; + } + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; tok->type = TK_SUBEXP_OPEN; break; @@ -3185,7 +3278,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_EXTEND(env->option)) { while (!PEND) { PFETCH(c); - if (ONIG_IS_NEWLINE(c)) + if (ONIGENC_IS_CODE_NEWLINE(enc, c)) break; } goto start; @@ -3199,6 +3292,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; default: + /* string */ break; } } @@ -3209,48 +3303,57 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } static int -add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not, - OnigEncoding enc) +add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc, + OnigCodePoint sbr[], OnigCodePoint mbr[]) { - int i, r, nsb, nmb; - OnigCodePointRange *sbr, *mbr; + int i, r; OnigCodePoint j; - r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &nsb, &nmb, &sbr, &mbr); - if (r != 0) return r; + int nsb = ONIGENC_CODE_RANGE_NUM(sbr); + int nmb = ONIGENC_CODE_RANGE_NUM(mbr); if (not == 0) { for (i = 0; i < nsb; i++) { - for (j = sbr[i].from; j <= sbr[i].to; j++) { - BITSET_SET_BIT(cc->bs, j); + for (j = ONIGENC_CODE_RANGE_FROM(sbr, i); + j <= ONIGENC_CODE_RANGE_TO(sbr, i); j++) { + BITSET_SET_BIT(cc->bs, j); } } + for (i = 0; i < nmb; i++) { - r = add_code_range_to_buf(&(cc->mbuf), mbr[i].from, mbr[i].to); + r = add_code_range_to_buf(&(cc->mbuf), + ONIGENC_CODE_RANGE_FROM(mbr, i), + ONIGENC_CODE_RANGE_TO(mbr, i)); if (r != 0) return r; } } else { OnigCodePoint prev = 0; - for (i = 0; i < nsb; i++) { - for (j = prev; j < sbr[i].from; j++) { - BITSET_SET_BIT(cc->bs, j); + + if (ONIGENC_MBC_MINLEN(enc) == 1) { + for (i = 0; i < nsb; i++) { + for (j = prev; + j < ONIGENC_CODE_RANGE_FROM(sbr, i); j++) { + BITSET_SET_BIT(cc->bs, j); + } + prev = ONIGENC_CODE_RANGE_TO(sbr, i) + 1; } - prev = sbr[i].to + 1; - } - if (prev < 0x7f) { - for (j = prev; j < 0x7f; j++) { - BITSET_SET_BIT(cc->bs, j); + if (prev < 0x7f) { + for (j = prev; j < 0x7f; j++) { + BITSET_SET_BIT(cc->bs, j); + } } + + prev = 0x80; } - prev = 0x80; for (i = 0; i < nmb; i++) { - if (prev < mbr[i].from) { - r = add_code_range_to_buf(&(cc->mbuf), prev, mbr[i].from - 1); + if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) { + r = add_code_range_to_buf(&(cc->mbuf), prev, + ONIGENC_CODE_RANGE_FROM(mbr, i) - 1); if (r != 0) return r; } - prev = mbr[i].to + 1; + prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; } if (prev < 0x7fffffff) { r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff); @@ -3258,17 +3361,21 @@ add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not, } } - return r; + return 0; } static int add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) { int c, r; + OnigCodePoint *sbr, *mbr; OnigEncoding enc = env->enc; - if (ONIGENC_CTYPE_SUPPORT_LEVEL(enc) != ONIGENC_CTYPE_SUPPORT_LEVEL_SB) { - r = add_ctype_to_cc_by_list(cc, ctype, not, env->enc); + r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sbr, &mbr); + if (r == 0) { + return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sbr, mbr); + } + else if (r != ONIG_NO_SUPPORT_CONFIG) { return r; } @@ -3326,7 +3433,8 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) } else { for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (! ONIGENC_IS_CODE_SB_WORD(enc, c) && ! ONIGENC_IS_MBC_HEAD(enc, c)) + if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* 0: invalid code point */ + && ! ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c); } } @@ -3370,6 +3478,14 @@ parse_ctype_to_enc_ctype(int pctype, int* not) ctype = ONIGENC_CTYPE_DIGIT; *not = 1; break; + case CTYPE_XDIGIT: + ctype = ONIGENC_CTYPE_XDIGIT; + *not = 0; + break; + case CTYPE_NOT_XDIGIT: + ctype = ONIGENC_CTYPE_XDIGIT; + *not = 1; + break; default: return ONIGERR_PARSER_BUG; break; @@ -3407,23 +3523,26 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) }; PosixBracketEntryType *pb; - int not, i, c, r; + int not, i, r; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar *p = *src; + PFETCH_READY; - if (PPEEK == '^') { + if (PPEEK_IS('^')) { PINC; not = 1; } else not = 0; - if (end - p < POSIX_BRACKET_NAME_MAX_LEN + 1) + if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MAX_LEN + 2) goto not_posix_bracket; for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { - if (onig_strncmp(p, pb->name, pb->len) == 0) { - p += pb->len; - if (end - p < 2 || *p != ':' || *(p+1) != ']') + if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) { + p = (UChar* )onigenc_step(enc, p, end, pb->len); + if (onigenc_with_ascii_strncmp(enc, p, end, ":]", 2) != 0) return ONIGERR_INVALID_POSIX_BRACKET_TYPE; r = add_ctype_to_cc(cc, pb->ctype, not, env); @@ -3442,9 +3561,9 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) PINC; if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; } - if (c == ':' && !PEND) { + if (c == ':' && ! PEND) { PINC; - if (!PEND) { + if (! PEND) { PFETCH(c); if (c == ']') return ONIGERR_INVALID_POSIX_BRACKET_TYPE; @@ -3455,7 +3574,7 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) } static int -property_name_to_ctype(UChar* p, UChar* end) +property_name_to_ctype(UChar* p, UChar* end, OnigEncoding enc) { static PosixBracketEntryType PBS[] = { { "Alnum", ONIGENC_CTYPE_ALNUM, 5 }, @@ -3477,28 +3596,49 @@ property_name_to_ctype(UChar* p, UChar* end) PosixBracketEntryType *pb; int len; - len = end - p; + len = onigenc_strlen(enc, p, end); for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { - if (len == pb->len && onig_strncmp(p, pb->name, pb->len) == 0) + if (len == pb->len && + onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) return pb->ctype; } - return ONIGERR_INVALID_CHAR_PROPERTY_NAME; + return -1; } static int fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) { int ctype; - UChar *prev, *p = *src; - int c = 0; + OnigCodePoint c; + OnigEncoding enc = env->enc; + UChar *prev, *start, *p = *src; + PFETCH_READY; + + /* 'IsXXXX' => 'XXXX' */ + if (!PEND && + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS)) { + c = PPEEK; + if (c == 'I') { + PINC; + if (! PEND) { + c = PPEEK; + if (c == 's') + PINC; + else + PUNFETCH; + } + } + } + + start = prev = p; while (!PEND) { prev = p; PFETCH(c); if (c == '}') { - ctype = property_name_to_ctype(*src, prev); - if (ctype < 0) return ctype; + ctype = property_name_to_ctype(start, prev, enc); + if (ctype < 0) break; *src = p; return ctype; @@ -3507,6 +3647,8 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) break; } + onig_scan_env_set_error_string(env, ONIGERR_INVALID_CHAR_PROPERTY_NAME, + *src, prev); return ONIGERR_INVALID_CHAR_PROPERTY_NAME; } @@ -3588,6 +3730,9 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, case CCS_RANGE: if (intype == *type) { if (intype == CCV_SB) { + if (*vs > 0xff || v > 0xff) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + if (*vs > v) { if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) goto ccs_range_end; @@ -3602,14 +3747,23 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, } } else { - if (intype == CCV_CODE_POINT && *type == CCV_SB && - ONIGENC_IS_CONTINUOUS_SB_MB(env->enc)) { - bitset_set_range(cc->bs, (int )*vs, 0x7f); - r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )0x80, v); +#if 0 + if (intype == CCV_CODE_POINT && *type == CCV_SB) { +#endif + if (*vs > v) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + goto ccs_range_end; + else + return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff)); + r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v); if (r < 0) return r; +#if 0 } else return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE; +#endif } ccs_range_end: *state = CCS_COMPLETE; @@ -3631,22 +3785,24 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, } static int -char_exist_check(UChar c, UChar* from, UChar* to, int ignore_escaped, +code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, OnigEncoding enc) { int in_esc; + OnigCodePoint code; UChar* p = from; + PFETCH_READY; in_esc = 0; - while (p < to) { + while (! PEND) { if (ignore_escaped && in_esc) { in_esc = 0; } else { - if (*p == c) return 1; - if (*p == MC_ESC) in_esc = 1; + PFETCH(code); + if (code == c) return 1; + if (code == MC_ESC(enc)) in_esc = 1; } - p += enc_len(enc, *p); } return 0; } @@ -3669,7 +3825,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, prev_cc = (CClassNode* )NULL; *np = NULL_NODE; r = fetch_token_in_cc(tok, src, end, env); - if (r == TK_BYTE && tok->u.c == '^' && tok->escaped == 0) { + if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { neg = 1; r = fetch_token_in_cc(tok, src, end, env); } @@ -3679,11 +3835,12 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, if (r < 0) return r; if (r == TK_CC_CLOSE) { - if (! char_exist_check(']', *src, env->pattern_end, 1, env->enc)) + if (! code_exist_check((OnigCodePoint )']', + *src, env->pattern_end, 1, env->enc)) return ONIGERR_EMPTY_CHAR_CLASS; CC_ESC_WARN(env, "]"); - r = tok->type = TK_BYTE; /* allow []...] */ + r = tok->type = TK_CHAR; /* allow []...] */ } *np = node = node_new_cclass(); @@ -3696,58 +3853,69 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, while (r != TK_CC_CLOSE) { fetched = 0; switch (r) { - case TK_BYTE: - len = enc_len(env->enc, tok->u.c); + case TK_CHAR: + len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c); if (len > 1) { - PUNFETCH; - v = ONIGENC_MBC_TO_CODE(env->enc, p, end); - p += len; in_type = CCV_CODE_POINT; } else { sb_char: - v = (OnigCodePoint )tok->u.c; in_type = CCV_SB; } + v = (OnigCodePoint )tok->u.c; in_israw = 0; goto val_entry2; break; case TK_RAW_BYTE: - len = enc_len(env->enc, tok->u.c); - if (len > 1 && tok->base != 0) { /* tok->base != 0 : octal or hexadec. */ + /* tok->base != 0 : octal or hexadec. */ + if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - UChar* bufp = buf; UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; + UChar* psave = p; int i, base = tok->base; - if (len > ONIGENC_CODE_TO_MBC_MAXLEN) { - bufp = (UChar* )xmalloc(len); - if (IS_NULL(bufp)) { - r = ONIGERR_MEMORY; - goto err; + buf[0] = tok->u.c; + for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto err; + if (r != TK_RAW_BYTE || tok->base != base) { + fetched = 1; + break; } - bufe = bufp + len; + buf[i] = tok->u.c; } - bufp[0] = tok->u.c; - for (i = 1; i < len; i++) { - r = fetch_token_in_cc(tok, &p, end, env); - if (r < 0) goto raw_byte_err; - if (r != TK_RAW_BYTE || tok->base != base) break; - bufp[i] = tok->u.c; + + if (i < ONIGENC_MBC_MINLEN(env->enc)) { + r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; + goto err; } + + len = enc_len(env->enc, buf); if (i < len) { r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; - raw_byte_err: - if (bufp != buf) xfree(bufp); goto err; } - v = ONIGENC_MBC_TO_CODE(env->enc, bufp, bufe); - if (bufp != buf) xfree(bufp); - in_type = CCV_CODE_POINT; + else if (i > len) { /* fetch back */ + p = psave; + for (i = 1; i < len; i++) { + r = fetch_token_in_cc(tok, &p, end, env); + } + fetched = 0; + } + + if (i == 1) { + v = (OnigCodePoint )buf[0]; + goto raw_single; + } + else { + v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); + in_type = CCV_CODE_POINT; + } } else { v = (OnigCodePoint )tok->u.c; + raw_single: in_type = CCV_SB; } in_israw = 1; @@ -3881,7 +4049,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, case TK_CC_AND: /* && */ { if (state == CCS_VALUE) { - r = next_state_val(cc, &vs, 0, &val_israw, 0, CCV_SB, + r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, &val_type, &state, env); if (r != 0) goto err; } @@ -3921,7 +4089,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, } if (state == CCS_VALUE) { - r = next_state_val(cc, &vs, 0, &val_israw, 0, CCV_SB, + r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, &val_type, &state, env); if (r != 0) goto err; } @@ -3933,16 +4101,28 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, cc = prev_cc; } - cc->not = neg; - if (cc->not != 0 && + if (neg != 0) + CCLASS_SET_NOT(cc); + else + CCLASS_CLEAR_NOT(cc); + if (IS_CCLASS_NOT(cc) && IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) { int is_empty; is_empty = (IS_NULL(cc->mbuf) ? 1 : 0); if (is_empty != 0) BITSET_IS_EMPTY(cc->bs, is_empty); - if (is_empty == 0) - BITSET_SET_BIT(cc->bs, ONIG_NEWLINE); + + if (is_empty == 0) { +#define NEWLINE_CODE 0x0a + + if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { + if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) + BITSET_SET_BIT(cc->bs, NEWLINE_CODE); + else + add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); + } + } } *src = p; return 0; @@ -3961,33 +4141,26 @@ static int parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, ScanEnv* env) { + int r, num; + int list_capture; Node *target; OnigOptionType option; - int r, c, num; - int list_capture; + OnigEncoding enc = env->enc; + OnigCodePoint c; UChar* p = *src; + PFETCH_READY; *np = NULL; if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; option = env->option; - if (PPEEK == '?' && + if (PPEEK_IS('?') && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { PINC; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); switch (c) { - case '#': /* (?#...) comment */ - while (1) { - if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; - PFETCH(c); - if (c == ')') break; - } - *src = p; - return 3; /* 3: comment */ - break; - case ':': /* (?:...) grouping only */ group: r = fetch_token(tok, &p, end, env); @@ -4129,7 +4302,7 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, else if (c == ':') { OnigOptionType prev = env->option; - env->option = option; + env->option = option; r = fetch_token(tok, &p, end, env); if (r < 0) return r; r = parse_subexp(&target, tok, term, &p, end, env); @@ -4185,6 +4358,14 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, return 0; } +static char* PopularQStr[] = { + "?", "*", "+", "??", "*?", "+?" +}; + +static char* ReduceQStr[] = { + "", "", "*", "*?", "??", "+ and ??", "+? and ?" +}; + static int set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) { @@ -4217,38 +4398,38 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR if (qn->by_number == 0 && qnt->by_number == 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { - if (IS_REPEAT_INFINITE(qn->upper)) { - if (qn->lower == 0) { /* '*' */ - redundant: - { - char buf[WARN_BUFSIZE]; - if (onig_verb_warn != onig_null_warn) { - onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, - env->pattern, env->pattern_end, - "redundant nested repeat operator"); - (*onig_verb_warn)(buf); - } - goto warn_exit; - } - } - else if (qn->lower == 1) { /* '+' */ - /* (?:a?)+? only allowed. */ - if (qn->greedy || !(qnt->upper == 1 && qnt->greedy)) - goto redundant; - } - } - else if (qn->upper == 1 && qn->lower == 0) { - if (qn->greedy) { /* '?' */ - if (!(qnt->lower == 1 && qnt->greedy == 0)) /* not '+?' */ - goto redundant; - } - else { /* '??' */ - /* '(?:a+)?? only allowd. (?:a*)?? can be replaced to (?:a+)?? */ - if (!(qnt->greedy && qnt->lower == 1 && - IS_REPEAT_INFINITE(qnt->upper))) - goto redundant; - } - } + int nestq_num, targetq_num; + char buf[WARN_BUFSIZE]; + + nestq_num = popular_qualifier_num(qn); + targetq_num = popular_qualifier_num(qnt); + + switch(ReduceTypeTable[targetq_num][nestq_num]) { + case RQ_ASIS: + break; + + case RQ_DEL: + if (onig_verb_warn != onig_null_warn) { + onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, + env->pattern, env->pattern_end, + "redundant nested repeat operator"); + (*onig_verb_warn)(buf); + } + goto warn_exit; + break; + + default: + if (onig_verb_warn != onig_null_warn) { + onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, + env->pattern, env->pattern_end, + "nested repeat operator %s and %s was replaced with '%s'", + PopularQStr[targetq_num], PopularQStr[nestq_num], + ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); + (*onig_verb_warn)(buf); + } + goto warn_exit; + break; + } } warn_exit: @@ -4269,74 +4450,151 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) return 0; } -#ifdef USE_FOLD_MATCH -static int -make_alt_node_from_fold_info(OnigEncFoldMatchInfo* info, Node** node) -{ - int i; - UChar *s, *end; - Node *root, **ptail, *snode; - - ptail = &root; - for (i = 0; i < info->target_num; i++) { - s = info->target_str[i]; - end = s + info->target_byte_len[i]; - /* ex. - U+00DF match "ss" and "SS, but not match "Ss". - So, string nodes must be raw. - */ - snode = node_new_str_raw(s, end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - - *ptail = node_new_alt(snode, NULL_NODE); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - ptail = &(NCONS(*ptail).right); - } - *ptail = NULL_NODE; - *node = root; - return 0; -} - static int -make_fold_alt_node_from_cc(OnigEncoding enc, CClassNode* cc, Node** root) +make_compound_alt_node_from_cc(OnigAmbigType ambig_flag, OnigEncoding enc, + CClassNode* cc, Node** root) { - int i, j, flen, len, ncode, n; - UChar *s, *end, buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - OnigCodePoint* codes; - Node **ptail, *snode; - OnigEncFoldMatchInfo* info; + int r, i, j, k, clen, len, ncode, n; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + Node **ptail, *snode = NULL_NODE; + OnigCompAmbigCodes* ccs; + OnigCompAmbigCodeItem* ci; + OnigAmbigType amb; + n = 0; *root = NULL_NODE; ptail = root; - ncode = ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc, &codes); - n = 0; - for (i = 0; i < ncode; i++) { - if (onig_is_code_in_cc(enc, codes[i], cc)) { - len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf); - flen = ONIGENC_GET_FOLD_MATCH_INFO(enc, buf, buf + len, &info); - if (flen > 0) { /* fold */ - for (j = 0; j < info->target_num; j++) { - s = info->target_str[j]; - end = s + info->target_byte_len[j]; - if (onig_strncmp(s, buf, enc_len(enc, *s)) == 0) - continue; /* ignore single char. */ - - snode = node_new_str_raw(s, end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - - *ptail = node_new_alt(snode, NULL_NODE); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - ptail = &(NCONS(*ptail).right); - n++; - } + + for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { + if ((amb & ambig_flag) == 0) continue; + + ncode = ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, amb, &ccs); + for (i = 0; i < ncode; i++) { + if (onig_is_code_in_cc(enc, ccs[i].code, cc)) { + for (j = 0; j < ccs[i].n; j++) { + ci = &(ccs[i].items[j]); + if (ci->len > 1) { /* compound only */ + if (IS_CCLASS_NOT(cc)) clear_not_flag_cclass(cc, enc); + + clen = ci->len; + for (k = 0; k < clen; k++) { + len = ONIGENC_CODE_TO_MBC(enc, ci->code[k], buf); + + if (k == 0) { + snode = node_new_str_raw(buf, buf + len); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + else { + r = onig_node_str_cat(snode, buf, buf + len); + if (r < 0) return r; + } + } + + *ptail = node_new_alt(snode, NULL_NODE); + CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); + ptail = &(NCONS(*ptail).right); + n++; + } + } } } } return n; } -#endif + + +#ifdef USE_SHARED_CCLASS_TABLE + +#define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8 + +/* for ctype node hash table */ + +typedef struct { + OnigEncoding enc; + int not; + int type; +} type_cclass_key; + +static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y) +{ + if (x->type != y->type) return 1; + if (x->enc != y->enc) return 1; + if (x->not != y->not) return 1; + return 0; +} + +static int type_cclass_hash(type_cclass_key* key) +{ + int i, val; + unsigned char *p; + + val = 0; + + p = (unsigned char* )&(key->enc); + for (i = 0; i < sizeof(key->enc); i++) { + val = val * 997 + (int )*p++; + } + + p = (unsigned char* )(&key->type); + for (i = 0; i < sizeof(key->type); i++) { + val = val * 997 + (int )*p++; + } + + val += key->not; + return val + (val >> 5); +} + +static int type_cclass_key_free(st_data_t x) +{ + xfree((void* )x); + return 0; +} + +static st_data_t type_cclass_key_clone(st_data_t x) +{ + type_cclass_key* new_key; + type_cclass_key* key = (type_cclass_key* )x; + + new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key)); + *new_key = *key; + return (st_data_t )new_key; +} + +static struct st_hash_type type_type_cclass_hash = { + type_cclass_cmp, + type_cclass_hash, + type_cclass_key_free, + type_cclass_key_clone +}; + +static st_table* OnigTypeCClassTable; + + +static int +i_free_shared_class(type_cclass_key* key, Node* node, void* arg) +{ + if (IS_NOT_NULL(node)) { + CClassNode* cc = &(NCCLASS(node)); + if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf); + xfree(node); + } + return ST_DELETE; +} + +extern int +onig_free_shared_cclass_table() +{ + if (IS_NOT_NULL(OnigTypeCClassTable)) { + onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0); + } + + return 0; +} + +#endif /* USE_SHARED_CCLASS_TABLE */ + static int parse_exp(Node** np, OnigToken* tok, int term, @@ -4346,7 +4604,6 @@ parse_exp(Node** np, OnigToken* tok, int term, Node* qn; Node** targetp; - start: *np = NULL; if (tok->type == term) goto end_of_token; @@ -4376,11 +4633,6 @@ parse_exp(Node** np, OnigToken* tok, int term, NEFFECT(*np).target = target; return tok->type; } - else if (r == 3) { /* comment */ - r = fetch_token(tok, src, end, env); - if (r < 0) return r; - goto start; - } break; case TK_SUBEXP_CLOSE: @@ -4391,76 +4643,22 @@ parse_exp(Node** np, OnigToken* tok, int term, else goto tk_byte; break; - case TK_BYTE: + case TK_STRING: tk_byte: { - *np = node_new_str_char((UChar )tok->u.c); + *np = node_new_str(tok->backp, *src); CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); while (1) { - len = enc_len(env->enc, tok->u.c); - if (len > 1) { - r = onig_node_str_cat(*np, *src, *src + len - 1); - if (r < 0) return r; - *src += (len - 1); - } - r = fetch_token(tok, src, end, env); if (r < 0) return r; - if (r != TK_BYTE) break; + if (r != TK_STRING) break; - r = node_str_cat_char(*np, (UChar )tok->u.c); + r = onig_node_str_cat(*np, tok->backp, *src); if (r < 0) return r; } - fold_entry: -#ifdef USE_FOLD_MATCH - if (IS_IGNORECASE(env->option) && ONIGENC_IS_FOLD_MATCH(env->enc)) { - int flen, ret; - Node *root, **ptail, *work, *snode, *anode; - UChar *p, *pprev; - OnigEncFoldMatchInfo* fold_info; - StrNode* sn = &(NSTRING(*np)); - - ptail = &root; - pprev = sn->s; - for (p = sn->s; p < sn->end; ) { - flen = ONIGENC_GET_FOLD_MATCH_INFO(env->enc, p, sn->end, &fold_info); - if (flen > 0) { /* fold */ - ret = make_alt_node_from_fold_info(fold_info, &anode); - if (ret != 0) return ret; - work = node_new_list(anode, NULL); - CHECK_NULL_RETURN_VAL(work, ONIGERR_MEMORY); - - if (pprev < p) { - snode = node_new_str(pprev, p); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - *ptail = node_new_list(snode, work); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - } - else { - *ptail = work; - } - ptail = &(NCONS(work).right); - p += flen; - pprev = p; - } - else - p += enc_len(env->enc, *p); - } - *ptail = NULL_NODE; - if (IS_NOT_NULL(root)) { - if (pprev < sn->end) { - snode = node_new_str(pprev, sn->end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - *ptail = node_new_list(snode, NULL_NODE); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - } - onig_node_free(*np); - *np = root; - } - } -#endif + string_end: targetp = np; goto repeat; } @@ -4469,22 +4667,19 @@ parse_exp(Node** np, OnigToken* tok, int term, case TK_RAW_BYTE: tk_raw_byte: { - int expect_len; - *np = node_new_str_raw_char((UChar )tok->u.c); CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); - expect_len = enc_len(env->enc, tok->u.c); len = 1; while (1) { r = fetch_token(tok, src, end, env); if (r < 0) return r; if (r != TK_RAW_BYTE) { #ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG - if (len >= expect_len) { + if (len >= enc_len(env->enc, NSTRING(*np).s)) { NSTRING_CLEAR_RAW(*np); } #endif - goto fold_entry; + goto string_end; } r = node_str_cat_char(*np, (UChar )tok->u.c); @@ -4510,9 +4705,11 @@ parse_exp(Node** np, OnigToken* tok, int term, case TK_QUOTE_OPEN: { - OnigCodePoint end_op[] = { (OnigCodePoint )MC_ESC, (OnigCodePoint )'E' }; + OnigCodePoint end_op[2]; UChar *qstart, *qend, *nextp; + end_op[0] = (OnigCodePoint )MC_ESC(env->enc); + end_op[1] = (OnigCodePoint )'E'; qstart = *src; qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); if (IS_NULL(qend)) { @@ -4537,17 +4734,69 @@ parse_exp(Node** np, OnigToken* tok, int term, case CTYPE_NOT_WHITE_SPACE: case CTYPE_DIGIT: case CTYPE_NOT_DIGIT: + case CTYPE_XDIGIT: + case CTYPE_NOT_XDIGIT: { CClassNode* cc; int ctype, not; - ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬); +#ifdef USE_SHARED_CCLASS_TABLE + OnigCodePoint *sbr, *mbr; - *np = node_new_cclass(); - CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); - cc = &(NCCLASS(*np)); - add_ctype_to_cc(cc, ctype, 0, env); - if (not != 0) CCLASS_SET_NOT(cc); + ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬); + r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, ctype, &sbr, &mbr); + if (r == 0 && + ONIGENC_CODE_RANGE_NUM(mbr) + >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) { + type_cclass_key key; + type_cclass_key* new_key; + + key.enc = env->enc; + key.not = not; + key.type = ctype; + + THREAD_ATOMIC_START; + + if (IS_NULL(OnigTypeCClassTable)) { + OnigTypeCClassTable + = onig_st_init_table_with_size(&type_type_cclass_hash, 10); + if (IS_NULL(OnigTypeCClassTable)) { + THREAD_ATOMIC_END; + return ONIGERR_MEMORY; + } + } + else { + if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key, + (st_data_t* )np)) { + THREAD_ATOMIC_END; + break; + } + } + + *np = node_new_cclass_by_codepoint_range(not, sbr, mbr); + if (IS_NULL(*np)) { + THREAD_ATOMIC_END; + return ONIGERR_MEMORY; + } + + CCLASS_SET_SHARE(&(NCCLASS(*np))); + new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key)); + onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key, + (st_data_t )*np); + + THREAD_ATOMIC_END; + } + else { +#endif + ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬); + *np = node_new_cclass(); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + cc = &(NCCLASS(*np)); + add_ctype_to_cc(cc, ctype, 0, env); + if (not != 0) CCLASS_SET_NOT(cc); +#ifdef USE_SHARED_CCLASS_TABLE + } +#endif } break; @@ -4564,27 +4813,66 @@ parse_exp(Node** np, OnigToken* tok, int term, break; case TK_CC_OPEN: - r = parse_char_class(np, tok, src, end, env); - if (r != 0) return r; + { + CClassNode* cc; -#ifdef USE_FOLD_MATCH - if (IS_IGNORECASE(env->option) && ONIGENC_IS_FOLD_MATCH(env->enc)) { - int res; - Node *alt_root, *work; - CClassNode* cc = &(NCCLASS(*np)); - - res = make_fold_alt_node_from_cc(env->enc, cc, &alt_root); - if (res < 0) return res; - if (res > 0) { - work = node_new_alt(*np, alt_root); - if (IS_NULL(work)) { - onig_node_free(alt_root); - return ONIGERR_MEMORY; - } - *np = work; + r = parse_char_class(np, tok, src, end, env); + if (r != 0) return r; + + cc = &(NCCLASS(*np)); + + if (IS_IGNORECASE(env->option)) { + int i, n, in_cc; + OnigPairAmbigCodes* ccs; + BitSetRef bs = cc->bs; + OnigAmbigType amb; + + for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { + if ((amb & env->ambig_flag) == 0) continue; + + n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(env->enc, amb, &ccs); + for (i = 0; i < n; i++) { + in_cc = onig_is_code_in_cc(env->enc, ccs[i].from, cc); + + if ((in_cc != 0 && !IS_CCLASS_NOT(cc)) || + (in_cc == 0 && IS_CCLASS_NOT(cc))) { + if (ONIGENC_MBC_MINLEN(env->enc) > 1 || + ccs[i].from >= SINGLE_BYTE_SIZE) { + /* if (cc->not) clear_not_flag_cclass(cc, env->enc); */ + add_code_range(&(cc->mbuf), env, ccs[i].to, ccs[i].to); + } + else { + if (BITSET_AT(bs, ccs[i].from)) { + /* /(?i:[^A-C])/.match("a") ==> fail. */ + BITSET_SET_BIT(bs, ccs[i].to); + } + if (BITSET_AT(bs, ccs[i].to)) { + BITSET_SET_BIT(bs, ccs[i].from); + } + } + } + } + } + } + + if (IS_IGNORECASE(env->option) && + (env->ambig_flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + int res; + Node *alt_root, *work; + + res = make_compound_alt_node_from_cc(env->ambig_flag, env->enc, + cc, &alt_root); + if (res < 0) return res; + if (res > 0) { + work = node_new_alt(*np, alt_root); + if (IS_NULL(work)) { + onig_node_free(alt_root); + return ONIGERR_MEMORY; + } + *np = work; + } } } -#endif break; case TK_ANYCHAR: @@ -4630,7 +4918,6 @@ parse_exp(Node** np, OnigToken* tok, int term, *np = node_new_empty(); } else { - *src = tok->backp; goto tk_byte; } break; @@ -4781,7 +5068,7 @@ parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) } extern int -onig_parse_make_tree(Node** root, UChar* pattern, UChar* end, regex_t* reg, +onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env) { int r; @@ -4793,15 +5080,16 @@ onig_parse_make_tree(Node** root, UChar* pattern, UChar* end, regex_t* reg, scan_env_clear(env); env->option = reg->options; + env->ambig_flag = reg->ambig_flag; env->enc = reg->enc; env->syntax = reg->syntax; - env->pattern = pattern; - env->pattern_end = end; + env->pattern = (UChar* )pattern; + env->pattern_end = (UChar* )end; env->reg = reg; *root = NULL; - p = pattern; - r = parse_regexp(root, &p, end, env); + p = (UChar* )pattern; + r = parse_regexp(root, &p, (UChar* )end, env); reg->num_mem = env->num_mem; return r; } |
