1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
|
/**********************************************************************
oniguruma.h - Oniguruma (regular expression library)
Copyright (C) 2002-2003 K.Kosako (kosako@sofnec.co.jp)
**********************************************************************/
#ifndef ONIGURUMA_H
#define ONIGURUMA_H
#include "php_compat.h"
#define ONIGURUMA
#define ONIGURUMA_VERSION_MAJOR 1
#define ONIGURUMA_VERSION_MINOR 9
#define ONIGURUMA_VERSION_TEENY 1
/* config parameters */
#ifndef RE_NREGS
#define RE_NREGS 10
#endif
#define REG_NREGION RE_NREGS
#define REG_MAX_BACKREF_NUM 1000
#define REG_MAX_REPEAT_NUM 100000
#define REG_MAX_MULTI_BYTE_RANGES_NUM 1000
/* constants */
#define REG_MAX_ERROR_MESSAGE_LEN 90
#ifndef P_
#ifdef __STDC__
# define P_(args) args
#else
# define P_(args) ()
#endif
#endif
#ifndef PV_
#ifdef HAVE_STDARG_PROTOTYPES
# define PV_(args) args
#else
# define PV_(args) ()
#endif
#endif
#ifndef REG_EXTERN
#if defined(_WIN32) && !defined(__CYGWIN__)
#if defined(EXPORT) || defined(RUBY_EXPORT)
#define REG_EXTERN extern __declspec(dllexport)
#elif defined(IMPORT)
#define REG_EXTERN extern __declspec(dllimport)
#endif
#endif
#endif
#ifndef REG_EXTERN
#define REG_EXTERN extern
#endif
#define REG_CHAR_TABLE_SIZE 256
#define REGCODE_UNDEF ((RegCharEncoding )0)
#if defined(RUBY_PLATFORM) && defined(M17N_H)
#define REG_RUBY_M17N
typedef m17n_encoding* RegCharEncoding;
#define REGCODE_DEFAULT REGCODE_UNDEF
#else
typedef const char* RegCharEncoding;
#define MBCTYPE_ASCII 0
#define MBCTYPE_EUC 1
#define MBCTYPE_SJIS 2
#define MBCTYPE_UTF8 3
#define REGCODE_ASCII REG_MBLEN_TABLE[MBCTYPE_ASCII]
#define REGCODE_UTF8 REG_MBLEN_TABLE[MBCTYPE_UTF8]
#define REGCODE_EUCJP REG_MBLEN_TABLE[MBCTYPE_EUC]
#define REGCODE_SJIS REG_MBLEN_TABLE[MBCTYPE_SJIS]
#define REGCODE_DEFAULT REGCODE_ASCII
REG_EXTERN const char REG_MBLEN_TABLE[][REG_CHAR_TABLE_SIZE];
#endif /* else RUBY && M17N */
REG_EXTERN RegCharEncoding RegDefaultCharEncoding;
#if defined(RUBY_PLATFORM) && !defined(M17N_H)
#undef ismbchar
#define ismbchar(c) (mbclen((c)) != 1)
#define mbclen(c) RegDefaultCharEncoding[(unsigned char )(c)]
#endif
typedef unsigned int RegOptionType;
typedef unsigned char* RegTransTableType;
typedef unsigned int RegDistance;
typedef unsigned char UChar;
#define REG_OPTION_DEFAULT REG_OPTION_NONE
/* GNU regex options */
#define RE_OPTION_IGNORECASE (1L)
#define RE_OPTION_EXTENDED (RE_OPTION_IGNORECASE << 1)
#define RE_OPTION_MULTILINE (RE_OPTION_EXTENDED << 1)
#define RE_OPTION_SINGLELINE (RE_OPTION_MULTILINE << 1)
#define RE_OPTION_POSIXLINE (RE_OPTION_MULTILINE|RE_OPTION_SINGLELINE)
#define RE_OPTION_LONGEST (RE_OPTION_SINGLELINE << 1)
/* options */
#define REG_OPTION_NONE 0
#define REG_OPTION_SINGLELINE RE_OPTION_SINGLELINE
#define REG_OPTION_MULTILINE RE_OPTION_MULTILINE
#define REG_OPTION_IGNORECASE RE_OPTION_IGNORECASE
#define REG_OPTION_EXTEND RE_OPTION_EXTENDED
#define REG_OPTION_FIND_LONGEST RE_OPTION_LONGEST
#define REG_OPTION_FIND_NOT_EMPTY (REG_OPTION_FIND_LONGEST << 1)
#define REG_OPTION_NEGATE_SINGLELINE (REG_OPTION_FIND_NOT_EMPTY << 1)
#define REG_OPTION_CAPTURE_ONLY_NAMED_GROUP (REG_OPTION_NEGATE_SINGLELINE << 1)
/* options (search time) */
#define REG_OPTION_NOTBOL (REG_OPTION_CAPTURE_ONLY_NAMED_GROUP << 1)
#define REG_OPTION_NOTEOL (REG_OPTION_NOTBOL << 1)
#define REG_OPTION_POSIX_REGION (REG_OPTION_NOTEOL << 1)
#define REG_OPTION_ON(options,regopt) ((options) |= (regopt))
#define REG_OPTION_OFF(options,regopt) ((options) &= ~(regopt))
#define IS_REG_OPTION_ON(options,option) ((options) & (option))
/* syntax */
typedef struct {
unsigned int op;
unsigned int op2;
unsigned int behavior;
RegOptionType options; /* default option */
} RegSyntaxType;
REG_EXTERN RegSyntaxType RegSyntaxPosixBasic;
REG_EXTERN RegSyntaxType RegSyntaxPosixExtended;
REG_EXTERN RegSyntaxType RegSyntaxEmacs;
REG_EXTERN RegSyntaxType RegSyntaxGrep;
REG_EXTERN RegSyntaxType RegSyntaxGnuRegex;
REG_EXTERN RegSyntaxType RegSyntaxJava;
REG_EXTERN RegSyntaxType RegSyntaxPerl;
REG_EXTERN RegSyntaxType RegSyntaxRuby;
/* predefined syntaxes (see regcomp.c) */
#define REG_SYNTAX_POSIX_BASIC (&RegSyntaxPosixBasic)
#define REG_SYNTAX_POSIX_EXTENDED (&RegSyntaxPosixExtended)
#define REG_SYNTAX_EMACS (&RegSyntaxEmacs)
#define REG_SYNTAX_GREP (&RegSyntaxGrep)
#define REG_SYNTAX_GNU_REGEX (&RegSyntaxGnuRegex)
#define REG_SYNTAX_JAVA (&RegSyntaxJava)
#define REG_SYNTAX_PERL (&RegSyntaxPerl)
#define REG_SYNTAX_RUBY (&RegSyntaxRuby)
/* default syntax */
#define REG_SYNTAX_DEFAULT RegDefaultSyntax
REG_EXTERN RegSyntaxType* RegDefaultSyntax;
/* syntax (operators) */
#define REG_SYN_OP_ANYCHAR 1 /* . */
#define REG_SYN_OP_0INF (1<<1) /* * */
#define REG_SYN_OP_ESC_0INF (1<<2)
#define REG_SYN_OP_1INF (1<<3) /* + */
#define REG_SYN_OP_ESC_1INF (1<<4)
#define REG_SYN_OP_01 (1<<5) /* ? */
#define REG_SYN_OP_ESC_01 (1<<6)
#define REG_SYN_OP_INTERVAL (1<<7) /* {lower,upper} */
#define REG_SYN_OP_ESC_INTERVAL (1<<8)
#define REG_SYN_OP_ALT (1<<9) /* | */
#define REG_SYN_OP_ESC_ALT (1<<10)
#define REG_SYN_OP_SUBEXP (1<<11) /* (...) */
#define REG_SYN_OP_ESC_SUBEXP (1<<12)
#define REG_SYN_OP_ESC_BUF_ANCHOR (1<<13) /* \A, \Z, \z */
#define REG_SYN_OP_ESC_GNU_BUF_ANCHOR (1<<14) /* \`, \' */
#define REG_SYN_OP_BACK_REF (1<<15) /* \num */
#define REG_SYN_OP_CC (1<<16) /* [...] */
#define REG_SYN_OP_ESC_WORD (1<<17) /* \w, \W */
#define REG_SYN_OP_ESC_WORD_BEGIN_END (1<<18) /* \<. \> */
#define REG_SYN_OP_ESC_WORD_BOUND (1<<19) /* \b, \B */
#define REG_SYN_OP_ESC_WHITE_SPACE (1<<20) /* \s, \S */
#define REG_SYN_OP_ESC_DIGIT (1<<21) /* \d, \D */
#define REG_SYN_OP_LINE_ANCHOR (1<<22) /* ^, $ */
#define REG_SYN_OP_POSIX_BRACKET (1<<23) /* [:xxxx:] */
#define REG_SYN_OP_NON_GREEDY (1<<24) /* ??,*?,+?,{n,m}? */
#define REG_SYN_OP_ESC_CONTROL_CHAR (1<<25) /* \n,\r,\t,\a ... */
#define REG_SYN_OP_ESC_C_CONTROL (1<<26) /* \cx */
#define REG_SYN_OP_ESC_OCTAL3 (1<<27) /* \OOO */
#define REG_SYN_OP_ESC_X_HEX2 (1<<28) /* \xHH */
#define REG_SYN_OP_ESC_X_BRACE_HEX8 (1<<29) /* \x{7HHHHHHH} */
#define REG_SYN_OP_SUBEXP_EFFECT (1<<30) /* (?...) */
#define REG_SYN_OP_QUOTE (1<<31) /* \Q...\E */
#define REG_SYN_OP2_OPTION_PERL (1<<0) /* (?imsx), (?-imsx) */
#define REG_SYN_OP2_OPTION_RUBY (1<<1) /* (?imx), (?-imx) */
#define REG_SYN_OP2_POSSESSIVE_REPEAT (1<<2) /* ?+,*+,++ */
#define REG_SYN_OP2_POSSESSIVE_INTERVAL (1<<3) /* {n,m}+ */
#define REG_SYN_OP2_CCLASS_SET (1<<4) /* [...&&..[..].] */
#define REG_SYN_OP2_NAMED_SUBEXP (1<<5) /*(?<name>.),\k<name>*/
#define REG_SYN_OP2_SUBEXP_CALL (1<<6) /* \g<name> */
#define REG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL (1<<7) /* \C-x */
#define REG_SYN_OP2_ESC_M_BAR_META (1<<8) /* \M-x */
#define REG_SYN_OP2_ESC_V_VTAB (1<<9) /* \v as VTAB */
#define REG_SYN_OP2_ESC_U_HEX4 (1<<10) /* \uHHHH */
/* syntax (behavior) */
#define REG_SYN_CONTEXT_INDEP_ANCHORS (1<<0) /* not implemented */
#define REG_SYN_CONTEXT_INDEP_OPS (1<<1) /* ?, *, +, {n,m} */
#define REG_SYN_CONTEXT_INVALID_OPS (1<<2) /* error or ignore */
#define REG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP (1<<3) /* ...)... */
#define REG_SYN_ALLOW_INVALID_INTERVAL (1<<4) /* {??? */
#define REG_SYN_STRICT_CHECK_BACKREF (1<<5) /* /(\1)/,/\1()/ etc.*/
#define REG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (1<<6) /* (?<=a|bc) */
/* syntax in char class [...] */
#define REG_SYN_WARN_FOR_CC_OP_NOT_ESCAPED (1<<10) /* [,-,] */
#define REG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1<<11)
#define REG_SYN_ESCAPE_IN_CC (1<<12) /* [...\w..] etc.. */
#define REG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1<<13)
#define REG_SYN_ALLOW_RANGE_OP_IN_CC (1<<14) /* [0-9-a] */
/* error codes */
#define REG_IS_PATTERN_ERROR(ecode) ((ecode) <= -100 && (ecode) > -300)
/* normal return */
#define REG_NORMAL 0
#define REG_MISMATCH -1
#define REG_NO_SUPPORT_CONFIG -2
/* internal error */
#define REGERR_MEMORY -5
#define REGERR_MATCH_STACK_LIMIT_OVER -6
#define REGERR_TYPE_BUG -10
#define REGERR_PARSER_BUG -11
#define REGERR_STACK_BUG -12
#define REGERR_UNDEFINED_BYTECODE -13
#define REGERR_UNEXPECTED_BYTECODE -14
#define REGERR_DEFAULT_ENCODING_IS_NOT_SETTED -21
#define REGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR -22
/* syntax error */
#define REGERR_END_PATTERN_AT_LEFT_BRACE -100
#define REGERR_END_PATTERN_AT_LEFT_BRACKET -101
#define REGERR_EMPTY_CHAR_CLASS -102
#define REGERR_PREMATURE_END_OF_CHAR_CLASS -103
#define REGERR_END_PATTERN_AT_BACKSLASH -104
#define REGERR_END_PATTERN_AT_META -105
#define REGERR_END_PATTERN_AT_CONTROL -106
#define REGERR_META_CODE_SYNTAX -108
#define REGERR_CONTROL_CODE_SYNTAX -109
#define REGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE -110
#define REGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE -111
#define REGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS -112
#define REGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED -113
#define REGERR_TARGET_OF_REPEAT_OPERATOR_INVALID -114
#define REGERR_NESTED_REPEAT_OPERATOR -115
#define REGERR_UNMATCHED_CLOSE_PARENTHESIS -116
#define REGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS -117
#define REGERR_END_PATTERN_IN_GROUP -118
#define REGERR_UNDEFINED_GROUP_OPTION -119
#define REGERR_INVALID_POSIX_BRACKET_TYPE -121
#define REGERR_INVALID_LOOK_BEHIND_PATTERN -122
#define REGERR_INVALID_REPEAT_RANGE_PATTERN -123
/* values error (syntax error) */
#define REGERR_TOO_BIG_NUMBER -200
#define REGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE -201
#define REGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE -202
#define REGERR_EMPTY_RANGE_IN_CHAR_CLASS -203
#define REGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE -204
#define REGERR_TOO_MANY_MULTI_BYTE_RANGES -205
#define REGERR_TOO_SHORT_MULTI_BYTE_STRING -206
#define REGERR_TOO_BIG_BACKREF_NUMBER -207
#define REGERR_INVALID_BACKREF -208
#define REGERR_TOO_BIG_WIDE_CHAR_VALUE -209
#define REGERR_TOO_LONG_WIDE_CHAR_VALUE -210
#define REGERR_INVALID_WIDE_CHAR_VALUE -211
#define REGERR_INVALID_SUBEXP_NAME -212
#define REGERR_UNDEFINED_NAME_REFERENCE -213
#define REGERR_UNDEFINED_GROUP_REFERENCE -214
#define REGERR_MULTIPLEX_DEFINITION_NAME_CALL -215
#define REGERR_NEVER_ENDING_RECURSION -216
/* errors related to thread */
#define REGERR_OVER_THREAD_PASS_LIMIT_COUNT -301
/* match result region type */
struct re_registers {
int allocated;
int num_regs;
int* beg;
int* end;
};
#define REG_REGION_NOTPOS -1
typedef struct re_registers RegRegion;
typedef struct {
UChar* par;
UChar* par_end;
} RegErrorInfo;
typedef struct {
int lower;
int upper;
} RegRepeatRange;
/* regex_t state */
#define REG_STATE_NORMAL 0
#define REG_STATE_SEARCHING 1
#define REG_STATE_COMPILING -1
#define REG_STATE_MODIFY -2
#define REG_STATE(regex) \
((regex)->state > 0 ? REG_STATE_SEARCHING : (regex)->state)
typedef struct re_pattern_buffer {
/* common members in BBuf(bytes-buffer) type */
unsigned char* p; /* compiled pattern */
unsigned int used; /* used space for p */
unsigned int alloc; /* allocated space for p */
int state; /* normal, searching, compiling */
int num_mem; /* used memory(...) num counted from 1 */
int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */
int num_null_check; /* OP_NULL_CHECK_START/END id counter */
int num_call; /* number of subexp call */
unsigned int backtrack_mem;
int stack_pop_level;
int repeat_range_alloc;
RegRepeatRange* repeat_range;
RegCharEncoding enc;
RegOptionType options;
RegSyntaxType* syntax;
void* name_table;
/* optimize info (string search and char-map and anchor) */
int optimize; /* optimize flag */
int threshold_len; /* search str-length for apply optimize */
int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */
RegDistance anchor_dmin; /* (SEMI_)END_BUF anchor distance */
RegDistance anchor_dmax; /* (SEMI_)END_BUF anchor distance */
int sub_anchor; /* start-anchor for exact or map */
unsigned char *exact;
unsigned char *exact_end;
unsigned char map[REG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */
int *int_map; /* BM skip for exact_len > 255 */
int *int_map_backward; /* BM skip for backward search */
RegDistance dmin; /* min-distance of exact or map */
RegDistance dmax; /* max-distance of exact or map */
/* regex_t link chain */
struct re_pattern_buffer* chain; /* escape compile-conflict on multi-thread */
} regex_t;
#ifdef RUBY_PLATFORM
#define re_mbcinit ruby_re_mbcinit
#define re_compile_pattern ruby_re_compile_pattern
#define re_recompile_pattern ruby_re_recompile_pattern
#define re_free_pattern ruby_re_free_pattern
#define re_adjust_startpos ruby_re_adjust_startpos
#define re_search ruby_re_search
#define re_match ruby_re_match
#define re_set_casetable ruby_re_set_casetable
#define re_copy_registers ruby_re_copy_registers
#define re_free_registers ruby_re_free_registers
#define register_info_type ruby_register_info_type
#define re_error_code_to_str ruby_error_code_to_str
#define ruby_error_code_to_str regex_error_code_to_str
#define ruby_re_copy_registers regex_region_copy
#else
#define re_error_code_to_str regex_error_code_to_str
#define re_copy_registers regex_region_copy
#endif
/* Oniguruma Native API */
REG_EXTERN
int regex_init P_((void));
REG_EXTERN
int regex_error_code_to_str PV_((UChar* s, int err_code, ...));
REG_EXTERN
int regex_new P_((regex_t**, UChar* pattern, UChar* pattern_end, RegOptionType option, RegCharEncoding code, RegSyntaxType* syntax, RegErrorInfo* einfo));
REG_EXTERN
void regex_free P_((regex_t*));
REG_EXTERN
int regex_recompile P_((regex_t*, UChar* pattern, UChar* pattern_end, RegOptionType option, RegCharEncoding code, RegSyntaxType* syntax, RegErrorInfo* einfo));
REG_EXTERN
int regex_search P_((regex_t*, UChar* str, UChar* end, UChar* start, UChar* range, RegRegion* region, RegOptionType option));
REG_EXTERN
int regex_match P_((regex_t*, UChar* str, UChar* end, UChar* at, RegRegion* region, RegOptionType option));
REG_EXTERN
RegRegion* regex_region_new P_((void));
REG_EXTERN
void regex_region_free P_((RegRegion* region, int free_self));
REG_EXTERN
void regex_region_copy P_((RegRegion* to, RegRegion* from));
REG_EXTERN
void regex_region_clear P_((RegRegion* region));
REG_EXTERN
int regex_region_resize P_((RegRegion* region, int n));
REG_EXTERN
int regex_name_to_group_numbers P_((regex_t* reg, UChar* name, UChar* name_end,
int** nums));
REG_EXTERN
int regex_foreach_name P_((regex_t* reg, int (*func)(UChar*,int,int*,void*), void* arg));
REG_EXTERN
UChar* regex_get_prev_char_head P_((RegCharEncoding code, UChar* start, UChar* s));
REG_EXTERN
UChar* regex_get_left_adjust_char_head P_((RegCharEncoding code, UChar* start, UChar* s));
REG_EXTERN
UChar* regex_get_right_adjust_char_head P_((RegCharEncoding code, UChar* start, UChar* s));
REG_EXTERN
void regex_set_default_trans_table P_((UChar* table));
REG_EXTERN
int regex_set_default_syntax P_((RegSyntaxType* syntax));
REG_EXTERN
int regex_end P_((void));
REG_EXTERN
const char* regex_version P_((void));
/* GNU regex API */
#ifdef REG_RUBY_M17N
REG_EXTERN
void re_mbcinit P_((RegCharEncoding));
#else
REG_EXTERN
void re_mbcinit P_((int));
#endif
REG_EXTERN
int re_compile_pattern P_((const char*, int, struct re_pattern_buffer*, char* err_buf));
REG_EXTERN
int re_recompile_pattern P_((const char*, int, struct re_pattern_buffer*, char* err_buf));
REG_EXTERN
void re_free_pattern P_((struct re_pattern_buffer*));
REG_EXTERN
int re_adjust_startpos P_((struct re_pattern_buffer*, const char*, int, int, int));
REG_EXTERN
int re_search P_((struct re_pattern_buffer*, const char*, int, int, int, struct re_registers*));
REG_EXTERN
int re_match P_((struct re_pattern_buffer*, const char *, int, int, struct re_registers*));
REG_EXTERN
void re_set_casetable P_((const char*));
REG_EXTERN
void re_free_registers P_((struct re_registers*));
REG_EXTERN
int re_alloc_pattern P_((struct re_pattern_buffer**)); /* added */
#endif /* ONIGURUMA_H */
|