diff options
| author | Guido van Rossum <guido@python.org> | 1997-10-06 14:45:17 +0000 | 
|---|---|---|
| committer | Guido van Rossum <guido@python.org> | 1997-10-06 14:45:17 +0000 | 
| commit | bf9d353babbd4126fc080a9a84ae65cc529b209d (patch) | |
| tree | 9f4cf02319a56dc553466f5e338c90c0954982d9 /Lib | |
| parent | 51b3aa3d38a49be5cc060d783dd1c80b6082b640 (diff) | |
| download | cpython-git-bf9d353babbd4126fc080a9a84ae65cc529b209d.tar.gz | |
New "re" regular expression support.
The new re module was written by Andrew Kuchling and uses the pcre
code in ../Modules/.  The old re module has been renamed to re1,
just in case you need it for comparison.
Diffstat (limited to 'Lib')
| -rw-r--r-- | Lib/re.py | 1212 | ||||
| -rw-r--r-- | Lib/re1.py | 1508 | 
2 files changed, 1575 insertions, 1145 deletions
@@ -2,42 +2,27 @@  # -*- mode: python -*-  # $Id$ -import string -import reop - -# reop.error and re.error should be the same, since exceptions can be -# raised from  either module. -error = reop.error # 're error' -from reop import NORMAL, CHARCLASS, REPLACEMENT -from reop import CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET -from reop import WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER - -# compilation flags - -IGNORECASE = I = 0x01 - -MULTILINE = M = 0x02 -DOTALL = S = 0x04 -VERBOSE = X = 0x08 +import sys +import string +from pcre import * -repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?', -			'{n,}', '{n,}?', '{n,m}', '{n,m}?'] +[ NORMAL, CHARCLASS, REPLACEMENT ] = range(3) +[ CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER ] = range(9)  # -# +# First, the public part of the interface:  # -def valid_identifier(id): -    if len(id) == 0: -	return 0 -    if (not reop.syntax_table[id[0]] & reop.word) or \ -       (reop.syntax_table[id[0]] & reop.digit): -	return 0 -    for char in id[1:]: -	if not reop.syntax_table[char] & reop.word: -	    return 0 -    return 1 +# pcre.error and re.error should be the same, since exceptions can be +# raised from  either module. + +# compilation flags + +I = IGNORECASE +M = MULTILINE +S = DOTALL  +X = VERBOSE   #  # @@ -83,60 +68,17 @@ def split(pattern, string, maxsplit=0):  #  # -def _expand(m, repl): -    results = [] -    index = 0 -    size = len(repl) -    while index < size: -	found = string.find(repl, '\\', index) -	if found < 0: -	    results.append(repl[index:]) -	    break -	if found > index: -	    results.append(repl[index:found]) -	escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT) -	if escape_type == CHAR: -	    results.append(value) -	elif escape_type == MEMORY_REFERENCE: -	    r = m.group(value) -	    if r is None: -		raise error, ('group "' + str(value) + '" did not contribute ' -			      'to the match') -	    results.append(m.group(value)) -	else: -	    raise error, "bad escape in replacement" -    return string.join(results, '') -  class RegexObject: -    def __init__(self, pattern, flags, code, num_regs, groupindex): -	self.code = code -	self.num_regs = num_regs +    def __init__(self, pattern, flags, code, groupindex): +	self.code = code   	self.flags = flags  	self.pattern = pattern  	self.groupindex = groupindex -	self.fastmap = build_fastmap(code) -	 -	if code[0].name == 'bol': -	    self.anchor = 1 -	     -	elif code[0].name == 'begbuf': -	    self.anchor = 2 -	     -	else: -	    self.anchor = 0 -	     -	self.buffer = assemble(code)      def search(self, string, pos=0): -	regs = reop.search(self.buffer, -			   self.num_regs, -			   self.flags, -			   self.fastmap.can_be_null, -			   self.fastmap.fastmap(), -			   self.anchor, -			   string, -			   pos) +	regs = self.code.match(string, pos, 0)  	if regs is None:  	    return None +	self.num_regs=len(regs)  	return MatchObject(self,  			   string, @@ -144,17 +86,10 @@ class RegexObject:  			   regs)      def match(self, string, pos=0): -	regs = reop.match(self.buffer, -			  self.num_regs, -			  self.flags, -			  self.fastmap.can_be_null, -			  self.fastmap.fastmap(), -			  self.anchor, -			  string, -			  pos) +	regs = self.code.match(string, pos, ANCHORED)  	if regs is None:  	    return None -	 +	self.num_regs=len(regs)/2  	return MatchObject(self,  			   string,  			   pos, @@ -165,13 +100,13 @@ class RegexObject:      def subn(self, repl, source, count=0):  	if count < 0: -	    raise ValueError, "negative substibution count" +	    raise error, "negative substitution count"  	if count == 0:  	    import sys  	    count = sys.maxint  	if type(repl) == type(''):  	    if '\\' in repl: -		repl = lambda m, r=repl: _expand(m, r) +		repl = lambda m, r=repl: pcre_expand(m, r)  	    else:  		repl = lambda m, r=repl: r  	n = 0           # Number of matches @@ -275,7 +210,8 @@ class MatchObject:  		    g = self.re.groupindex[g]  		except (KeyError, TypeError):  		    raise IndexError, ('group "' + g + '" is undefined') -	    if (self.regs[g][0] == -1) or (self.regs[g][1] == -1): +	    if len(self.regs)<=g: raise IndexError, ('group "' + str(g) + '" is undefined') +	    elif (self.regs[g][0] == -1) or (self.regs[g][1] == -1):  		result.append(None)  	    else:  		result.append(self.string[self.regs[g][0]:self.regs[g][1]]) @@ -286,364 +222,56 @@ class MatchObject:  	else:  	    return () -# -# A set of classes to make assembly a bit easier, if a bit verbose. -# - -class Instruction: -    def __init__(self, opcode, size=1): -	self.opcode = opcode -	self.size = size -    def assemble(self, position, labels): -	return self.opcode -    def __repr__(self): -	return '%-15s' % (self.name) - -class End(Instruction): -    name = 'end' -    def __init__(self): -	Instruction.__init__(self, chr(0)) - -class Bol(Instruction): -    name = 'bol' -    def __init__(self): -	self.name = 'bol' -	Instruction.__init__(self, chr(1)) - -class Eol(Instruction): -    name = 'eol' -    def __init__(self): -	Instruction.__init__(self, chr(2)) - -class Set(Instruction): -    name = 'set' -    def __init__(self, set, flags=0): -	self.set = set -	if flags & IGNORECASE: self.set=map(string.lower, self.set) -	if len(set)==1:  -	    # If only one element, use the "exact" opcode (it'll be faster) -	    Instruction.__init__(self, chr(4), 2) -	else: -	    # Use the "set" opcode -	    Instruction.__init__(self, chr(3), 33) -    def assemble(self, position, labels): -	if len(self.set)==1: -	    # If only one character in set, generate an "exact" opcode -	    return self.opcode + self.set[0] -	result = self.opcode -	temp = 0 -	for i, c in map(lambda x: (x, chr(x)), range(256)): -	    if c in self.set: -		temp = temp | (1 << (i & 7)) -	    if (i % 8) == 7: -		result = result + chr(temp) -		temp = 0 -	return result -    def __repr__(self): -	result = '%-15s' % (self.name) -	self.set.sort() -	# XXX this should print more intelligently -	for char in self.set: -	    result = result + char -	return result -     -class Exact(Instruction): -    name = 'exact' -    def __init__(self, char, flags): -	self.char = char -	if flags & IGNORECASE: self.char=string.lower(self.char) -	Instruction.__init__(self, chr(4), 2) -    def assemble(self, position, labels): -	return self.opcode + self.char -    def __repr__(self): -	return '%-15s %s' % (self.name, `self.char`) -     -class AnyChar(Instruction): -    name = 'anychar' -    def __init__(self): -	Instruction.__init__(self, chr(5)) -    def assemble(self, position, labels): -	return self.opcode - -class MemoryInstruction(Instruction): -    def __init__(self, opcode, register): -	self.register = register -	Instruction.__init__(self, opcode, 2) -    def assemble(self, position, labels): -	return self.opcode + chr(self.register) -    def __repr__(self): -	return '%-15s %i' % (self.name, self.register) - -class StartMemory(MemoryInstruction): -    name = 'start_memory' -    def __init__(self, register): -	MemoryInstruction.__init__(self, chr(6), register) - -class EndMemory(MemoryInstruction): -    name = 'end_memory' -    def __init__(self, register): -	MemoryInstruction.__init__(self, chr(7), register) - -class MatchMemory(MemoryInstruction): -    name = 'match_memory' -    def __init__(self, register): -	MemoryInstruction.__init__(self, chr(8), register) - -class JumpInstruction(Instruction): -    def __init__(self, opcode, label): -	self.label = label -	Instruction.__init__(self, opcode, 3) -    def compute_offset(self, start, dest): -	return dest - (start + 3) -    def pack_offset(self, offset): -	if offset > 32767: -	    raise error, 'offset out of range (pos)' -	elif offset < -32768: -	    raise error, 'offset out of range (neg)' -	elif offset < 0: -	    offset = offset + 65536 -	return chr(offset & 0xff) + chr((offset >> 8) & 0xff) -    def assemble(self, position, labels): -	return self.opcode + \ -	       self.pack_offset(self.compute_offset(position, -						    labels[self.label])) -    def __repr__(self): -	return '%-15s %i' % (self.name, self.label) - -class Jump(JumpInstruction): -    name = 'jump' -    def __init__(self, label): -	JumpInstruction.__init__(self, chr(9), label) - -class StarJump(JumpInstruction): -    name = 'star_jump' -    def __init__(self, label): -	JumpInstruction.__init__(self, chr(10), label) - -class FailureJump(JumpInstruction): -    name = 'failure_jump' -    def __init__(self, label): -	JumpInstruction.__init__(self, chr(11), label) - -class UpdateFailureJump(JumpInstruction): -    name = 'update_failure_jump' -    def __init__(self, label): -	JumpInstruction.__init__(self, chr(12), label) - -class DummyFailureJump(JumpInstruction): -    name = 'dummy_failure_jump' -    def __init__(self, label): -	JumpInstruction.__init__(self, chr(13), label) -	 -class BegBuf(Instruction): -    name = 'begbuf' -    def __init__(self): -	Instruction.__init__(self, chr(14)) - -class EndBuf(Instruction): -    name = 'endbuf' -    def __init__(self): -	Instruction.__init__(self, chr(15)) - -class WordBeg(Instruction): -    name = 'wordbeg' -    def __init__(self): -	Instruction.__init__(self, chr(16)) - -class WordEnd(Instruction): -    name = 'wordend' -    def __init__(self): -	Instruction.__init__(self, chr(17)) - -class WordBound(Instruction): -    name = 'wordbound' -    def __init__(self): -	Instruction.__init__(self, chr(18)) - -class NotWordBound(Instruction): -    name = 'notwordbound' -    def __init__(self): -	Instruction.__init__(self, chr(19)) - -class SyntaxSpec(Instruction): -    name = 'syntaxspec' -    def __init__(self, syntax): -	self.syntax = syntax -	Instruction.__init__(self, chr(20), 2) -    def assemble(self, postition, labels): -	return self.opcode + chr(self.syntax) - -class NotSyntaxSpec(Instruction): -    name = 'notsyntaxspec' -    def __init__(self, syntax): -	self.syntax = syntax -	Instruction.__init__(self, chr(21), 2) -    def assemble(self, postition, labels): -	return self.opcode + chr(self.syntax) - -class Label(Instruction): -    name = 'label' -    def __init__(self, label): -	self.label = label -	Instruction.__init__(self, '', 0) -    def __repr__(self): -	return '%-15s %i' % (self.name, self.label) - -class OpenParen(Instruction): -    name = '(' -    def __init__(self, register): -	self.register = register -	Instruction.__init__(self, '', 0) -    def assemble(self, position, labels): -	raise error, 'unmatched open parenthesis' - -class Alternation(Instruction): -    name = '|' -    def __init__(self): -	Instruction.__init__(self, '', 0) -    def assemble(self, position, labels): -	raise error, 'an alternation was not taken care of' - -# -# -# - -def assemble(instructions): -    labels = {} -    position = 0 -    pass1 = [] -    for instruction in instructions: -	if instruction.name == 'label': -	    labels[instruction.label] = position -	else: -	    pass1.append((position, instruction)) -	    position = position + instruction.size -    pass2 = '' -    for position, instruction in pass1: -	pass2 = pass2 + instruction.assemble(position, labels) -    return pass2 - -# -# -# -  def escape(pattern):      result = [] +    alphanum=string.letters+'_'+string.digits      for char in pattern: -	if not reop.syntax_table[char] & reop.word: +	if char not in alphanum:  	    result.append('\\')  	result.append(char)      return string.join(result, '') -# -# -# - -def registers_used(instructions): -    result = [] -    for instruction in instructions: -	if (instruction.name in ['set_memory', 'end_memory']) and \ -	   (instruction.register not in result): -	    result.append(instruction.register) -    return result - -# -# -# - -class Fastmap: -    def __init__(self): -	self.map = ['\000']*256 -	self.can_be_null = 0 -    def add(self, char): -	self.map[ord(char)] = '\001' -    def fastmap(self): -	return string.join(self.map, '') -    def __getitem__(self, char): -	return ord(self.map[ord(char)]) -    def __repr__(self): -	self.map.sort() -	return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')' - -# -# -# - -def find_label(code, label): -    line = 0 -    for instruction in code: -	if (instruction.name == 'label') and (instruction.label == label): -	    return line + 1 -	line = line + 1 -	 -def build_fastmap_aux(code, pos, visited, fastmap): -    if visited[pos]: -	return -    while 1: -	instruction = code[pos] -	visited[pos] = 1 -	pos = pos + 1 -	if instruction.name == 'end': -	    fastmap.can_be_null = 1 -	    return -	elif instruction.name == 'syntaxspec': -	    for char in map(chr, range(256)): -		if reop.syntax_table[char] & instruction.syntax: -		    fastmap.add(char) -	    return -	elif instruction.name == 'notsyntaxspec': -	    for char in map(chr, range(256)): -		if not reop.syntax_table[char] & instruction.syntax: -		    fastmap.add(char) -	    return -	elif instruction.name == 'eol': -	    fastmap.add('\n') -	    if fastmap.can_be_null == 0: -		fastmap.can_be_null = 2 -	    return -	elif instruction.name == 'set': -	    for char in instruction.set: -		fastmap.add(char) -	    return -	elif instruction.name == 'exact': -	    fastmap.add(instruction.char) -	elif instruction.name == 'anychar': -	    for char in map(chr, range(256)): -		if char != '\n': -		    fastmap.add(char) -	    return -	elif instruction.name == 'match_memory': -	    for char in map(chr, range(256)): -		fastmap.add(char) -	    fastmap.can_be_null = 1 -	    return -	elif instruction.name in ['jump', 'dummy_failure_jump', \ -				  'update_failure_jump', 'star_jump']: -	    pos = find_label(code, instruction.label) -	    if visited[pos]: -		return -	    visited[pos] = 1 -	elif instruction.name  == 'failure_jump': -	    build_fastmap_aux(code, -			      find_label(code, instruction.label), -			      visited, -			      fastmap) -	 -def build_fastmap(code, pos=0): -    visited = [0] * len(code) -    fastmap = Fastmap() -    build_fastmap_aux(code, pos, visited, fastmap) -    return fastmap - -# -# -# +def valid_identifier(id): +    import string +    if len(id) == 0: +	return 0 +    if id[0] not in string.letters+'_': +	return 0 +    for char in id[1:]: +	if not syntax_table[char] & word: +	    return 0 +    return 1 -#[NORMAL, CHARCLASS, REPLACEMENT] = range(3) -#[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, -# NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9) +def compile(pattern, flags=0): +    groupindex={} +    code=pcre_compile(pattern, flags, groupindex) +    return RegexObject(pattern, flags, code, groupindex) +     +def _expand(m, repl): +    results = [] +    index = 0 +    size = len(repl) +    while index < size: +	found = string.find(repl, '\\', index) +	if found < 0: +	    results.append(repl[index:]) +	    break +	if found > index: +	    results.append(repl[index:found]) +	escape_type, value, index = _expand_escape(repl, found+1, REPLACEMENT) +	if escape_type == CHAR: +	    results.append(value) +	elif escape_type == MEMORY_REFERENCE: +	    r = m.group(value) +	    if r is None: +		raise error, ('group "' + str(value) + '" did not contribute ' +			      'to the match') +	    results.append(m.group(value)) +	else: +	    raise error, "bad escape in replacement" +    return string.join(results, '') -def expand_escape(pattern, index, context=NORMAL): +def _expand_escape(pattern, index, context=NORMAL):      if index >= len(pattern):  	raise error, 'escape ends too soon' @@ -676,7 +304,7 @@ def expand_escape(pattern, index, context=NORMAL):  	# what it's doing (and Python in turn passes it on to sscanf,  	# so that *it* doesn't incorrectly 2nd-guess what C does!)  	char = eval ('"' + pattern[index-1:end] + '"') -	assert len(char) == 1 +#	assert len(char) == 1  	return CHAR, char, end      elif pattern[index] == 'b': @@ -707,75 +335,21 @@ def expand_escape(pattern, index, context=NORMAL):  	raise error, ('\\' + pattern[index] + ' is not allowed')      elif pattern[index] == 'w': -	if context == NORMAL: -	    return SYNTAX, reop.word, index + 1 -	elif context == CHARCLASS: -	    set = [] -	    for char in reop.syntax_table.keys(): -		if reop.syntax_table[char] & reop.word: -		    set.append(char) -	    return SET, set, index + 1 -	else:  	    return CHAR, 'w', index + 1      elif pattern[index] == 'W': -	if context == NORMAL: -	    return NOT_SYNTAX, reop.word, index + 1 -	elif context == CHARCLASS: -	    set = [] -	    for char in reop.syntax_table.keys(): -		if not reop.syntax_table[char] & reop.word: -		    set.append(char) -	    return SET, set, index + 1 -	else:  	    return CHAR, 'W', index + 1      elif pattern[index] == 's': -	if context == NORMAL: -	    return SYNTAX, reop.whitespace, index + 1 -	elif context == CHARCLASS: -	    set = [] -	    for char in reop.syntax_table.keys(): -		if reop.syntax_table[char] & reop.whitespace: -		    set.append(char) -	    return SET, set, index + 1 -	else:  	    return CHAR, 's', index + 1      elif pattern[index] == 'S': -	if context == NORMAL: -	    return NOT_SYNTAX, reop.whitespace, index + 1 -	elif context == CHARCLASS: -	    set = [] -	    for char in reop.syntax_table.keys(): -		if not reop.syntax_table[char] & reop.whitespace: -		    set.append(char) -	    return SET, set, index + 1 -	else:  	    return CHAR, 'S', index + 1      elif pattern[index] == 'd': -	if context == NORMAL: -	    return SYNTAX, reop.digit, index + 1 -	elif context == CHARCLASS: -	    set = [] -	    for char in reop.syntax_table.keys(): -		if reop.syntax_table[char] & reop.digit: -		    set.append(char) -	    return SET, set, index + 1 -	else:  	    return CHAR, 'd', index + 1      elif pattern[index] == 'D': -	if context == NORMAL: -	    return NOT_SYNTAX, reop.digit, index + 1 -	elif context == CHARCLASS: -	    set = [] -	    for char in reop.syntax_table.keys(): -		if not reop.syntax_table[char] & reop.digit: -		    set.append(char) -	    return SET, set, index + 1 -	else:  	    return CHAR, 'D', index + 1      elif pattern[index] in '0123456789': @@ -854,655 +428,3 @@ def expand_escape(pattern, index, context=NORMAL):      else:  	return CHAR, pattern[index], index + 1 -def compile(pattern, flags=0): -    stack = [] -    label = 0 -    register = 1 -    groupindex = {} -    lastop = '' - -    # look for embedded pattern modifiers at the beginning of the pattern - -    index = 0 - -    if len(pattern) >= 3 and \ -       (pattern[:2] == '(?') and \ -       (pattern[2] in 'iImMsSxX'): -	index = 2 -	while (index < len(pattern)) and (pattern[index] != ')'): -	    if pattern[index] in 'iI': -		flags = flags | IGNORECASE -	    elif pattern[index] in 'mM': -		flags = flags | MULTILINE -	    elif pattern[index] in 'sS': -		flags = flags | DOTALL -	    elif pattern[index] in 'xX': -		flags = flags | VERBOSE -	    else: -		raise error, 'unknown modifier' -	    index = index + 1 -	index = index + 1 - -    # compile the rest of the pattern -     -    while (index < len(pattern)): -	char = pattern[index] -	index = index + 1 -	if char == '\\': -	    escape_type, value, index = expand_escape(pattern, index) - -	    if escape_type == CHAR: -		stack.append([Exact(value, flags)]) -		lastop = '\\' + value -		 -	    elif escape_type == MEMORY_REFERENCE: -		if value >= register: -		    raise error, ('cannot reference a register ' -				  'not yet used') -		stack.append([MatchMemory(value)]) -		lastop = '\\1' -		 -	    elif escape_type == BEGINNING_OF_BUFFER: -		stack.append([BegBuf()]) -		lastop = '\\A' -		 -	    elif escape_type == END_OF_BUFFER: -		stack.append([EndBuf()]) -		lastop = '\\Z' -		 -	    elif escape_type == WORD_BOUNDARY: -		stack.append([WordBound()]) -		lastop = '\\b' -		 -	    elif escape_type == NOT_WORD_BOUNDARY: -		stack.append([NotWordBound()]) -		lastop = '\\B' -		 -	    elif escape_type == SYNTAX: -		stack.append([SyntaxSpec(value)]) -		if value == reop.word: -		    lastop = '\\w' -		elif value == reop.whitespace: -		    lastop = '\\s' -		elif value == reop.digit: -		    lastop = '\\d' -		else: -		    lastop = '\\?' -		     -	    elif escape_type == NOT_SYNTAX: -		stack.append([NotSyntaxSpec(value)]) -		if value == reop.word: -		    lastop = '\\W' -		elif value == reop.whitespace: -		    lastop = '\\S' -		elif value == reop.digit: -		    lastop = '\\D' -		else: -		    lastop = '\\?' -		 -	    elif escape_type == SET: -		raise error, 'cannot use set escape type here' -	     -	    else: -		raise error, 'unknown escape type' - -	elif char == '|': -	    expr = [] -	     -	    while (len(stack) != 0) and \ -		  (stack[-1][0].name != '(') and \ -		  (stack[-1][0].name != '|'): -		expr = stack[-1] + expr -		del stack[-1] -	    stack.append([FailureJump(label)] + \ -			 expr + \ -			 [Jump(-1), -			  Label(label)]) -	    stack.append([Alternation()]) -	    label = label + 1 -	    lastop = '|' -	     -	elif char == '(': -	    if index >= len(pattern): -		raise error, 'no matching close paren' - -	    elif pattern[index] == '?': -		# Perl style (?...) extensions -		index = index + 1 -		if index >= len(pattern): -		    raise error, 'extension ends prematurely' - -		elif pattern[index] == 'P': -		    # Python extensions -		    index = index + 1 -		    if index >= len(pattern): -			raise error, 'extension ends prematurely' - -		    elif pattern[index] == '<': -			# Handle Python symbolic group names (?P<...>...) -			index = index + 1 -			end = string.find(pattern, '>', index) -			if end == -1: -			    raise error, 'no end to symbolic group name' -			name = pattern[index:end] -			if not valid_identifier(name): -			    raise error, ('symbolic group name must be a ' -					  'valid identifier') -			index = end + 1 -			groupindex[name] = register -			stack.append([OpenParen(register)]) -			register = register + 1 -			lastop = '(' - -		    elif pattern[index] == '=': -			# backreference to symbolic group name -			if index >= len(pattern): -			    raise error, '(?P= at the end of the pattern' -			start = index + 1 -			end = string.find(pattern, ')', start) -			if end == -1: -			    raise error, 'no ) to end symbolic group name' -			name = pattern[start:end] -			if name not in groupindex.keys(): -			    raise error, ('symbolic group name ' + name + \ -					  ' has not been used yet') -			stack.append([MatchMemory(groupindex[name])]) -			index = end + 1 -			lastop = '(?P=)' -			 -		    else: -			raise error, ('unknown Python extension: ' + \ -				      pattern[index]) -		     -		elif pattern[index] == ':': -		    # grouping, but no registers -		    index = index + 1 -		    stack.append([OpenParen(-1)]) -		    lastop = '(' -		     -		elif pattern[index] == '#': -		    # comment -		    index = index + 1 -		    end = string.find(pattern, ')', index) -		    if end == -1: -			raise error, 'no end to comment' -		    index = end + 1 -		    # do not change lastop -		     -		elif pattern[index] == '=': -		    raise error, ('zero-width positive lookahead ' -				  'assertion is unsupported') - -		elif pattern[index] == '!': -		    raise error, ('zero-width negative lookahead ' -				  'assertion is unsupported') - -		elif pattern[index] in 'iImMsSxX': -		    raise error, ('embedded pattern modifiers are only ' -				  'allowed at the beginning of the pattern') - -		else: -		    raise error, 'unknown extension' - -	    else: -		stack.append([OpenParen(register)]) -		register = register + 1 -		lastop = '(' - -	elif char == ')': -	    # make one expression out of everything on the stack up to -	    # the marker left by the last parenthesis -	    expr = [] -	    while (len(stack) > 0) and (stack[-1][0].name != '('): -		expr = stack[-1] + expr -		del stack[-1] - -	    if len(stack) == 0: -		raise error, 'too many close parens' -	     -	    # remove markers left by alternation -	    expr = filter(lambda x: x.name != '|', expr) - -	    # clean up jumps inserted by alternation -	    need_label = 0 -	    for i in range(len(expr)): -		if (expr[i].name == 'jump') and (expr[i].label == -1): -		    expr[i] = Jump(label) -		    need_label = 1 -	    if need_label: -		expr.append(Label(label)) -		label = label + 1 - -	    if stack[-1][0].register > 0: -		expr = [StartMemory(stack[-1][0].register)] + \ -		       expr + \ -		       [EndMemory(stack[-1][0].register)] -	    del stack[-1] -	    stack.append(expr) -	    lastop = ')' - -	elif char == '{': -	    if len(stack) == 0: -		raise error, 'no expression to repeat' -	    end = string.find(pattern, '}', index) -	    if end == -1: -		raise error, ('no close curly bracket to match' -			      ' open curly bracket') - -	    fields = map(string.strip, -			 string.split(pattern[index:end], ',')) -	    index = end + 1 - -	    minimal = 0 -	    if (index < len(pattern)) and (pattern[index] == '?'): -		minimal = 1 -		index = index + 1 - -	    if len(fields) == 1: -		# {n} or {n}? (there's really no difference) -		try: -		    count = string.atoi(fields[0]) -		except ValueError: -		    raise error, ('count must be an integer ' -				  'inside curly braces') -		if count > 65535: -		    raise error, 'repeat count out of range' -		expr = [] -		while count > 0: -		    expr = expr + stack[-1] -		    count = count - 1 -		del stack[-1] -		stack.append(expr) -		if minimal: -		    lastop = '{n}?' -		else: -		    lastop = '{n}' - -	    elif len(fields) == 2: -		# {n,} or {n,m} -		if fields[1] == '': -		    # {n,} -		    try: -			min = string.atoi(fields[0]) -		    except ValueError: -			raise error, ('minimum must be an integer ' -				      'inside curly braces') -		    if min > 65535: -			raise error, 'minimum repeat count out of range' - -		    expr = [] -		    while min > 0: -			expr = expr + stack[-1] -			min = min - 1 -		    if minimal: -			expr = expr + \ -			       ([Jump(label + 1), -				 Label(label)] + \ -				stack[-1] + \ -				[Label(label + 1), -				 FailureJump(label)]) -			lastop = '{n,}?' -		    else: -			expr = expr + \ -			       ([Label(label), -				 FailureJump(label + 1)] + -				stack[-1] + -				[StarJump(label), -				 Label(label + 1)]) -			lastop = '{n,}' - -		    del stack[-1] -		    stack.append(expr) -		    label = label + 2 - -		else: -		    # {n,m} -		    try: -			min = string.atoi(fields[0]) -		    except ValueError: -			raise error, ('minimum must be an integer ' -				      'inside curly braces') -		    try: -			max = string.atoi(fields[1]) -		    except ValueError: -			raise error, ('maximum must be an integer ' -				      'inside curly braces') -		    if min > 65535: -			raise error, ('minumim repeat count out ' -				      'of range') -		    if max > 65535: -			raise error, ('maximum repeat count out ' -				      'of range') -		    if min > max: -			raise error, ('minimum repeat count must be ' -				      'less than the maximum ' -				      'repeat count') -		    expr = [] -		    while min > 0: -			expr = expr + stack[-1] -			min = min - 1 -			max = max - 1 -		    if minimal: -			while max > 0: -			    expr = expr + \ -				   [FailureJump(label), -				    Jump(label + 1), -				    Label(label)] + \ -				   stack[-1] + \ -				   [Label(label + 1)] -			    max = max - 1 -			    label = label + 2 -			del stack[-1] -			stack.append(expr) -			lastop = '{n,m}?' -		    else: -			while max > 0: -			    expr = expr + \ -				   [FailureJump(label)] + \ -				   stack[-1] -			    max = max - 1 -			del stack[-1] -			stack.append(expr + [Label(label)]) -			label = label + 1 -			lastop = '{n,m}' - -	    else: -		raise error, ('there need to be one or two fields ' -			      'in a {} expression') - -	elif char == '}': -	    raise error, 'unbalanced close curly brace' - -	elif char == '*': -	    # Kleene closure -	    if len(stack) == 0: -		raise error, '* needs something to repeat' - -	    if lastop in ['(', '|']: -		raise error, '* needs something to repeat' - -	    if lastop in repetition_operators: -		raise error, 'nested repetition operators' -	     -	    if (index < len(pattern)) and (pattern[index] == '?'): -		# non-greedy matching -		expr = [Jump(label + 1), -			Label(label)] + \ -		       stack[-1] + \ -		       [Label(label + 1), -			FailureJump(label)] -		index = index + 1 -		lastop = '*?' -	    else: -		# greedy matching -		expr = [Label(label), -			FailureJump(label + 1)] + \ -		       stack[-1] + \ -		       [StarJump(label), -			Label(label + 1)] -		lastop = '*' -	    del stack[-1] -	    stack.append(expr) -	    label = label + 2 - -	elif char == '+': -	    # positive closure -	    if len(stack) == 0: -		raise error, '+ needs something to repeat' -	     -	    if lastop in ['(', '|']: -		raise error, '+ needs something to repeat' - -	    if lastop in repetition_operators: -		raise error, 'nested repetition operators' - -	    if (index < len(pattern)) and (pattern[index] == '?'): -		# non-greedy -		expr = [Label(label)] + \ -		       stack[-1] + \ -		       [FailureJump(label)] -		label = label + 1 -		index = index + 1 -		lastop = '+?' -		 -	    else: -		# greedy -		expr = [DummyFailureJump(label + 1), -			Label(label), -			FailureJump(label + 2), -			Label(label + 1)] + \ -		       stack[-1] + \ -		       [StarJump(label), -			Label(label + 2)] -		label = label + 3 -		lastop = '+' -		 -	    del stack[-1] -	    stack.append(expr) - -	elif char == '?': -	    if len(stack) == 0: -		raise error, 'need something to be optional' -	     -	    if len(stack) == 0: -		raise error, '? needs something to repeat' -	     -	    if lastop in ['(', '|']: -		raise error, '? needs something to repeat' - -	    if lastop in repetition_operators: -		raise error, 'nested repetition operators' - -	    if (index < len(pattern)) and (pattern[index] == '?'): -		# non-greedy matching -		expr = [FailureJump(label), -			Jump(label + 1), -			Label(label)] + \ -		       stack[-1] + \ -		       [Label(label + 1)] -		label = label + 2 -		index = index + 1 -		lastop = '??' -		 -	    else: -		# greedy matching -		expr = [FailureJump(label)] + \ -		       stack[-1] + \ -		       [Label(label)] -		label = label + 1 -		lastop = '?' -		 -	    del stack[-1] -	    stack.append(expr) - -	elif char == '.': -	    if flags & DOTALL: -		stack.append([Set(map(chr, range(256)), flags)]) -	    else: -		stack.append([AnyChar()]) -	    lastop = '.' - -	elif char == '^': -	    if flags & MULTILINE: -		stack.append([Bol()]) -	    else: -		stack.append([BegBuf()]) -	    lastop = '^' - -	elif char == '$': -	    if flags & MULTILINE: -		stack.append([Eol()]) -	    else: -		stack.append([EndBuf()]) -	    lastop = '$' - -	elif char == '#': -	    if flags & VERBOSE: -		# comment -		index = index + 1 -		end = string.find(pattern, '\n', index) -		if end == -1: -		    index = len(pattern) -		else: -		    index = end + 1 -		# do not change lastop -	    else: -		stack.append([Exact(char, flags)]) -		lastop = '#' - -	elif char in string.whitespace: -	    if not (flags & VERBOSE): -		stack.append([Exact(char, flags)]) -		lastop = char - -	elif char == '[': -	    # compile character class -	     -	    if index >= len(pattern): -		raise error, 'unclosed character class' -	     -	    negate = 0 -	    last = '' -	    set = [] - -	    if pattern[index] == '^': -		negate = 1 -		index = index + 1 -		if index >= len(pattern): -		    raise error, 'unclosed character class' - -	    if pattern[index] == ']': -		set.append(']') -		index = index + 1 -		if index >= len(pattern): -		    raise error, 'unclosed character class' -		 -	    elif pattern[index] == '-': -		set.append('-') -		index = index + 1 -		if index >= len(pattern): -		    raise error, 'unclosed character class' -		 -	    while (index < len(pattern)) and (pattern[index] != ']'): -		next = pattern[index] -		index = index + 1 -		if next == '-': -		    if index >= len(pattern): -			raise error, 'incomplete range in character class' - -		    elif pattern[index] == ']': -			set.append('-') -			 -		    else: -			if last == '': -			    raise error, ('improper use of range in ' -					  'character class') - -			start = last -			 -			if pattern[index] == '\\': -			    escape_type, -			    value, -			    index = expand_escape(pattern, -						  index + 1, -						  CHARCLASS) - -			    if escape_type == CHAR: -				end = value -				 -			    else: -				raise error, ('illegal escape in character ' -					      'class range') -			else: -			    end = pattern[index] -			    index = index + 1 - -			if start > end: -			    raise error, ('range arguments out of order ' -					  'in character class') -			 -			for char in map(chr, range(ord(start), ord(end) + 1)): -			    if char not in set: -				set.append(char) -			     -			last = '' - -		elif next == '\\': -		    # expand syntax meta-characters and add to set -		    if index >= len(pattern): -			raise error, 'incomplete set' - -		    escape_type, value, index = expand_escape(pattern, -							      index, -							      CHARCLASS) - -		    if escape_type == CHAR: -			set.append(value) -			last = value - -		    elif escape_type == SET: -			for char in value: -			    if char not in set: -				set.append(char) -			last = '' - -		    else: -			raise error, 'illegal escape type in character class' - -		else: -		    if next not in set: -			set.append(next) -		    last = next -		     -	    if (index >= len(pattern)) or ( pattern[index] != ']'): -		raise error, 'incomplete set' - -	    index = index + 1 - -	    if negate: -		# If case is being ignored, then both upper- and lowercase -		# versions of the letters must be excluded. -		if flags & IGNORECASE: set=set+map(string.upper, set) -		notset = [] -		for char in map(chr, range(256)): -		    if char not in set: -			notset.append(char) -		if len(notset) == 0: -		    raise error, 'empty negated set' -		stack.append([Set(notset, flags)]) -	    else: -		if len(set) == 0: -		    raise error, 'empty set' -		stack.append([Set(set, flags)]) - -	    lastop = '[]' - -	else: -	    stack.append([Exact(char, flags)]) -	    lastop = char - -    code = [] -    while len(stack) > 0: -	if stack[-1][0].name == '(': -	    raise error, 'too many open parens' -	code = stack[-1] + code -	del stack[-1] -    if len(code) == 0: -	raise error, 'no code generated' -    code = filter(lambda x: x.name != '|', code) -    need_label = 0 -    for i in range(len(code)): -	if (code[i].name == 'jump') and (code[i].label == -1): -	    code[i] = Jump(label) -	    need_label = 1 -    if need_label: -	code.append(Label(label)) -	label = label + 1 -    code.append(End()) -#    print code -    return RegexObject(pattern, flags, code, register, groupindex) - -# Replace expand_escape and _expand functions with their C equivalents. -# If you suspect bugs in the C versions, comment out the next two lines -expand_escape = reop.expand_escape -_expand  = reop._expand diff --git a/Lib/re1.py b/Lib/re1.py new file mode 100644 index 0000000000..6c24797ffd --- /dev/null +++ b/Lib/re1.py @@ -0,0 +1,1508 @@ +#!/usr/bin/env python +# -*- mode: python -*- +# $Id$ + +import string +import reop + +# reop.error and re.error should be the same, since exceptions can be +# raised from  either module. +error = reop.error # 're error' + +from reop import NORMAL, CHARCLASS, REPLACEMENT +from reop import CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET +from reop import WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER + +# compilation flags + +IGNORECASE = I = 0x01 + +MULTILINE = M = 0x02 +DOTALL = S = 0x04 +VERBOSE = X = 0x08 + +repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?', +			'{n,}', '{n,}?', '{n,m}', '{n,m}?'] + +# +# +# + +def valid_identifier(id): +    if len(id) == 0: +	return 0 +    if (not reop.syntax_table[id[0]] & reop.word) or \ +       (reop.syntax_table[id[0]] & reop.digit): +	return 0 +    for char in id[1:]: +	if not reop.syntax_table[char] & reop.word: +	    return 0 +    return 1 + +# +# +# + +_cache = {} +_MAXCACHE = 20 + +def _cachecompile(pattern, flags=0): +    key = (pattern, flags) +    try: +	return _cache[key] +    except KeyError: +	pass +    value = compile(pattern, flags) +    if len(_cache) >= _MAXCACHE: +	_cache.clear() +    _cache[key] = value +    return value + +def match(pattern, string, flags=0): +    return _cachecompile(pattern, flags).match(string) +   +def search(pattern, string, flags=0): +    return _cachecompile(pattern, flags).search(string) +   +def sub(pattern, repl, string, count=0): +    if type(pattern) == type(''): +	pattern = _cachecompile(pattern) +    return pattern.sub(repl, string, count) + +def subn(pattern, repl, string, count=0): +    if type(pattern) == type(''): +	pattern = _cachecompile(pattern) +    return pattern.subn(repl, string, count) +   +def split(pattern, string, maxsplit=0): +    if type(pattern) == type(''): +	pattern = _cachecompile(pattern) +    return pattern.split(string, maxsplit) + +# +# +# + +def _expand(m, repl): +    results = [] +    index = 0 +    size = len(repl) +    while index < size: +	found = string.find(repl, '\\', index) +	if found < 0: +	    results.append(repl[index:]) +	    break +	if found > index: +	    results.append(repl[index:found]) +	escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT) +	if escape_type == CHAR: +	    results.append(value) +	elif escape_type == MEMORY_REFERENCE: +	    r = m.group(value) +	    if r is None: +		raise error, ('group "' + str(value) + '" did not contribute ' +			      'to the match') +	    results.append(m.group(value)) +	else: +	    raise error, "bad escape in replacement" +    return string.join(results, '') + +class RegexObject: +    def __init__(self, pattern, flags, code, num_regs, groupindex): +	self.code = code +	self.num_regs = num_regs +	self.flags = flags +	self.pattern = pattern +	self.groupindex = groupindex +	self.fastmap = build_fastmap(code) +	 +	if code[0].name == 'bol': +	    self.anchor = 1 +	     +	elif code[0].name == 'begbuf': +	    self.anchor = 2 +	     +	else: +	    self.anchor = 0 +	     +	self.buffer = assemble(code) +    def search(self, string, pos=0): +	regs = reop.search(self.buffer, +			   self.num_regs, +			   self.flags, +			   self.fastmap.can_be_null, +			   self.fastmap.fastmap(), +			   self.anchor, +			   string, +			   pos) +	if regs is None: +	    return None +	 +	return MatchObject(self, +			   string, +			   pos, +			   regs) +     +    def match(self, string, pos=0): +	regs = reop.match(self.buffer, +			  self.num_regs, +			  self.flags, +			  self.fastmap.can_be_null, +			  self.fastmap.fastmap(), +			  self.anchor, +			  string, +			  pos) +	if regs is None: +	    return None +	 +	return MatchObject(self, +			   string, +			   pos, +			   regs) +     +    def sub(self, repl, string, count=0): +        return self.subn(repl, string, count)[0] +     +    def subn(self, repl, source, count=0): +	if count < 0: +	    raise ValueError, "negative substibution count" +	if count == 0: +	    import sys +	    count = sys.maxint +	if type(repl) == type(''): +	    if '\\' in repl: +		repl = lambda m, r=repl: _expand(m, r) +	    else: +		repl = lambda m, r=repl: r +	n = 0           # Number of matches +	pos = 0         # Where to start searching +	lastmatch = -1  # End of last match +	results = []    # Substrings making up the result +	end = len(source) +	while n < count and pos <= end: +	    m = self.search(source, pos) +	    if not m: +		break +	    i, j = m.span(0) +	    if i == j == lastmatch: +		# Empty match adjacent to previous match +		pos = pos + 1 +		results.append(source[lastmatch:pos]) +		continue +	    if pos < i: +		results.append(source[pos:i]) +	    results.append(repl(m)) +	    pos = lastmatch = j +	    if i == j: +		# Last match was empty; don't try here again +		pos = pos + 1 +		results.append(source[lastmatch:pos]) +	    n = n + 1 +	results.append(source[pos:]) +	return (string.join(results, ''), n) +									     +    def split(self, source, maxsplit=0): +	if maxsplit < 0: +	    raise error, "negative split count" +	if maxsplit == 0: +	    import sys +	    maxsplit = sys.maxint +	n = 0 +	pos = 0 +	lastmatch = 0 +	results = [] +	end = len(source) +	while n < maxsplit: +	    m = self.search(source, pos) +	    if not m: +		break +	    i, j = m.span(0) +	    if i == j: +		# Empty match +		if pos >= end: +		    break +		pos = pos+1 +		continue +	    results.append(source[lastmatch:i]) +	    g = m.group() +	    if g: +		results[len(results):] = list(g) +	    pos = lastmatch = j +	results.append(source[lastmatch:]) +	return results + +class MatchObject: +    def __init__(self, re, string, pos, regs): +	self.re = re +	self.string = string +	self.pos = pos +	self.regs = regs +	 +    def start(self, g): +	if type(g) == type(''): +	    try: +		g = self.re.groupindex[g] +	    except (KeyError, TypeError): +		raise IndexError, ('group "' + g + '" is undefined') +	return self.regs[g][0] +     +    def end(self, g): +	if type(g) == type(''): +	    try: +		g = self.re.groupindex[g] +	    except (KeyError, TypeError): +		raise IndexError, ('group "' + g + '" is undefined') +	return self.regs[g][1] +     +    def span(self, g): +	if type(g) == type(''): +	    try: +		g = self.re.groupindex[g] +	    except (KeyError, TypeError): +		raise IndexError, ('group "' + g + '" is undefined') +	return self.regs[g] +     +    def group(self, *groups): +	if len(groups) == 0: +	    groups = range(1, self.re.num_regs) +	    use_all = 1 +	else: +	    use_all = 0 +	result = [] +	for g in groups: +	    if type(g) == type(''): +		try: +		    g = self.re.groupindex[g] +		except (KeyError, TypeError): +		    raise IndexError, ('group "' + g + '" is undefined') +	    if (self.regs[g][0] == -1) or (self.regs[g][1] == -1): +		result.append(None) +	    else: +		result.append(self.string[self.regs[g][0]:self.regs[g][1]]) +	if use_all or len(result) > 1: +	    return tuple(result) +	elif len(result) == 1: +	    return result[0] +	else: +	    return () + +# +# A set of classes to make assembly a bit easier, if a bit verbose. +# + +class Instruction: +    def __init__(self, opcode, size=1): +	self.opcode = opcode +	self.size = size +    def assemble(self, position, labels): +	return self.opcode +    def __repr__(self): +	return '%-15s' % (self.name) + +class End(Instruction): +    name = 'end' +    def __init__(self): +	Instruction.__init__(self, chr(0)) + +class Bol(Instruction): +    name = 'bol' +    def __init__(self): +	self.name = 'bol' +	Instruction.__init__(self, chr(1)) + +class Eol(Instruction): +    name = 'eol' +    def __init__(self): +	Instruction.__init__(self, chr(2)) + +class Set(Instruction): +    name = 'set' +    def __init__(self, set, flags=0): +	self.set = set +	if flags & IGNORECASE: self.set=map(string.lower, self.set) +	if len(set)==1:  +	    # If only one element, use the "exact" opcode (it'll be faster) +	    Instruction.__init__(self, chr(4), 2) +	else: +	    # Use the "set" opcode +	    Instruction.__init__(self, chr(3), 33) +    def assemble(self, position, labels): +	if len(self.set)==1: +	    # If only one character in set, generate an "exact" opcode +	    return self.opcode + self.set[0] +	result = self.opcode +	temp = 0 +	for i, c in map(lambda x: (x, chr(x)), range(256)): +	    if c in self.set: +		temp = temp | (1 << (i & 7)) +	    if (i % 8) == 7: +		result = result + chr(temp) +		temp = 0 +	return result +    def __repr__(self): +	result = '%-15s' % (self.name) +	self.set.sort() +	# XXX this should print more intelligently +	for char in self.set: +	    result = result + char +	return result +     +class Exact(Instruction): +    name = 'exact' +    def __init__(self, char, flags): +	self.char = char +	if flags & IGNORECASE: self.char=string.lower(self.char) +	Instruction.__init__(self, chr(4), 2) +    def assemble(self, position, labels): +	return self.opcode + self.char +    def __repr__(self): +	return '%-15s %s' % (self.name, `self.char`) +     +class AnyChar(Instruction): +    name = 'anychar' +    def __init__(self): +	Instruction.__init__(self, chr(5)) +    def assemble(self, position, labels): +	return self.opcode + +class MemoryInstruction(Instruction): +    def __init__(self, opcode, register): +	self.register = register +	Instruction.__init__(self, opcode, 2) +    def assemble(self, position, labels): +	return self.opcode + chr(self.register) +    def __repr__(self): +	return '%-15s %i' % (self.name, self.register) + +class StartMemory(MemoryInstruction): +    name = 'start_memory' +    def __init__(self, register): +	MemoryInstruction.__init__(self, chr(6), register) + +class EndMemory(MemoryInstruction): +    name = 'end_memory' +    def __init__(self, register): +	MemoryInstruction.__init__(self, chr(7), register) + +class MatchMemory(MemoryInstruction): +    name = 'match_memory' +    def __init__(self, register): +	MemoryInstruction.__init__(self, chr(8), register) + +class JumpInstruction(Instruction): +    def __init__(self, opcode, label): +	self.label = label +	Instruction.__init__(self, opcode, 3) +    def compute_offset(self, start, dest): +	return dest - (start + 3) +    def pack_offset(self, offset): +	if offset > 32767: +	    raise error, 'offset out of range (pos)' +	elif offset < -32768: +	    raise error, 'offset out of range (neg)' +	elif offset < 0: +	    offset = offset + 65536 +	return chr(offset & 0xff) + chr((offset >> 8) & 0xff) +    def assemble(self, position, labels): +	return self.opcode + \ +	       self.pack_offset(self.compute_offset(position, +						    labels[self.label])) +    def __repr__(self): +	return '%-15s %i' % (self.name, self.label) + +class Jump(JumpInstruction): +    name = 'jump' +    def __init__(self, label): +	JumpInstruction.__init__(self, chr(9), label) + +class StarJump(JumpInstruction): +    name = 'star_jump' +    def __init__(self, label): +	JumpInstruction.__init__(self, chr(10), label) + +class FailureJump(JumpInstruction): +    name = 'failure_jump' +    def __init__(self, label): +	JumpInstruction.__init__(self, chr(11), label) + +class UpdateFailureJump(JumpInstruction): +    name = 'update_failure_jump' +    def __init__(self, label): +	JumpInstruction.__init__(self, chr(12), label) + +class DummyFailureJump(JumpInstruction): +    name = 'dummy_failure_jump' +    def __init__(self, label): +	JumpInstruction.__init__(self, chr(13), label) +	 +class BegBuf(Instruction): +    name = 'begbuf' +    def __init__(self): +	Instruction.__init__(self, chr(14)) + +class EndBuf(Instruction): +    name = 'endbuf' +    def __init__(self): +	Instruction.__init__(self, chr(15)) + +class WordBeg(Instruction): +    name = 'wordbeg' +    def __init__(self): +	Instruction.__init__(self, chr(16)) + +class WordEnd(Instruction): +    name = 'wordend' +    def __init__(self): +	Instruction.__init__(self, chr(17)) + +class WordBound(Instruction): +    name = 'wordbound' +    def __init__(self): +	Instruction.__init__(self, chr(18)) + +class NotWordBound(Instruction): +    name = 'notwordbound' +    def __init__(self): +	Instruction.__init__(self, chr(19)) + +class SyntaxSpec(Instruction): +    name = 'syntaxspec' +    def __init__(self, syntax): +	self.syntax = syntax +	Instruction.__init__(self, chr(20), 2) +    def assemble(self, postition, labels): +	return self.opcode + chr(self.syntax) + +class NotSyntaxSpec(Instruction): +    name = 'notsyntaxspec' +    def __init__(self, syntax): +	self.syntax = syntax +	Instruction.__init__(self, chr(21), 2) +    def assemble(self, postition, labels): +	return self.opcode + chr(self.syntax) + +class Label(Instruction): +    name = 'label' +    def __init__(self, label): +	self.label = label +	Instruction.__init__(self, '', 0) +    def __repr__(self): +	return '%-15s %i' % (self.name, self.label) + +class OpenParen(Instruction): +    name = '(' +    def __init__(self, register): +	self.register = register +	Instruction.__init__(self, '', 0) +    def assemble(self, position, labels): +	raise error, 'unmatched open parenthesis' + +class Alternation(Instruction): +    name = '|' +    def __init__(self): +	Instruction.__init__(self, '', 0) +    def assemble(self, position, labels): +	raise error, 'an alternation was not taken care of' + +# +# +# + +def assemble(instructions): +    labels = {} +    position = 0 +    pass1 = [] +    for instruction in instructions: +	if instruction.name == 'label': +	    labels[instruction.label] = position +	else: +	    pass1.append((position, instruction)) +	    position = position + instruction.size +    pass2 = '' +    for position, instruction in pass1: +	pass2 = pass2 + instruction.assemble(position, labels) +    return pass2 + +# +# +# + +def escape(pattern): +    result = [] +    for char in pattern: +	if not reop.syntax_table[char] & reop.word: +	    result.append('\\') +	result.append(char) +    return string.join(result, '') + +# +# +# + +def registers_used(instructions): +    result = [] +    for instruction in instructions: +	if (instruction.name in ['set_memory', 'end_memory']) and \ +	   (instruction.register not in result): +	    result.append(instruction.register) +    return result + +# +# +# + +class Fastmap: +    def __init__(self): +	self.map = ['\000']*256 +	self.can_be_null = 0 +    def add(self, char): +	self.map[ord(char)] = '\001' +    def fastmap(self): +	return string.join(self.map, '') +    def __getitem__(self, char): +	return ord(self.map[ord(char)]) +    def __repr__(self): +	self.map.sort() +	return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')' + +# +# +# + +def find_label(code, label): +    line = 0 +    for instruction in code: +	if (instruction.name == 'label') and (instruction.label == label): +	    return line + 1 +	line = line + 1 +	 +def build_fastmap_aux(code, pos, visited, fastmap): +    if visited[pos]: +	return +    while 1: +	instruction = code[pos] +	visited[pos] = 1 +	pos = pos + 1 +	if instruction.name == 'end': +	    fastmap.can_be_null = 1 +	    return +	elif instruction.name == 'syntaxspec': +	    for char in map(chr, range(256)): +		if reop.syntax_table[char] & instruction.syntax: +		    fastmap.add(char) +	    return +	elif instruction.name == 'notsyntaxspec': +	    for char in map(chr, range(256)): +		if not reop.syntax_table[char] & instruction.syntax: +		    fastmap.add(char) +	    return +	elif instruction.name == 'eol': +	    fastmap.add('\n') +	    if fastmap.can_be_null == 0: +		fastmap.can_be_null = 2 +	    return +	elif instruction.name == 'set': +	    for char in instruction.set: +		fastmap.add(char) +	    return +	elif instruction.name == 'exact': +	    fastmap.add(instruction.char) +	elif instruction.name == 'anychar': +	    for char in map(chr, range(256)): +		if char != '\n': +		    fastmap.add(char) +	    return +	elif instruction.name == 'match_memory': +	    for char in map(chr, range(256)): +		fastmap.add(char) +	    fastmap.can_be_null = 1 +	    return +	elif instruction.name in ['jump', 'dummy_failure_jump', \ +				  'update_failure_jump', 'star_jump']: +	    pos = find_label(code, instruction.label) +	    if visited[pos]: +		return +	    visited[pos] = 1 +	elif instruction.name  == 'failure_jump': +	    build_fastmap_aux(code, +			      find_label(code, instruction.label), +			      visited, +			      fastmap) +	 +def build_fastmap(code, pos=0): +    visited = [0] * len(code) +    fastmap = Fastmap() +    build_fastmap_aux(code, pos, visited, fastmap) +    return fastmap + +# +# +# + +#[NORMAL, CHARCLASS, REPLACEMENT] = range(3) +#[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, +# NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9) + +def expand_escape(pattern, index, context=NORMAL): +    if index >= len(pattern): +	raise error, 'escape ends too soon' + +    elif pattern[index] == 't': +	return CHAR, chr(9), index + 1 +     +    elif pattern[index] == 'n': +	return CHAR, chr(10), index + 1 +     +    elif pattern[index] == 'v': +	return CHAR, chr(11), index + 1 +     +    elif pattern[index] == 'r': +	return CHAR, chr(13), index + 1 +     +    elif pattern[index] == 'f': +	return CHAR, chr(12), index + 1 +     +    elif pattern[index] == 'a': +	return CHAR, chr(7), index + 1 +     +    elif pattern[index] == 'x': +	# CAUTION: this is the Python rule, not the Perl rule! +	end = index + 1  # Skip over the 'x' character +	while (end < len(pattern)) and (pattern[end] in string.hexdigits): +	    end = end + 1 +	if end == index: +	    raise error, "\\x must be followed by hex digit(s)" +	# let Python evaluate it, so we don't incorrectly 2nd-guess +	# what it's doing (and Python in turn passes it on to sscanf, +	# so that *it* doesn't incorrectly 2nd-guess what C does!) +	char = eval ('"' + pattern[index-1:end] + '"') +	assert len(char) == 1 +	return CHAR, char, end + +    elif pattern[index] == 'b': +	if context != NORMAL: +	    return CHAR, chr(8), index + 1 +	else: +	    return WORD_BOUNDARY, '', index + 1 +	     +    elif pattern[index] == 'B': +	if context != NORMAL: +	    return CHAR, 'B', index + 1 +	else: +	    return NOT_WORD_BOUNDARY, '', index + 1 +	     +    elif pattern[index] == 'A': +	if context != NORMAL: +	    return CHAR, 'A', index + 1 +	else: +	    return BEGINNING_OF_BUFFER, '', index + 1 +	     +    elif pattern[index] == 'Z': +	if context != NORMAL: +	    return CHAR, 'Z', index + 1 +	else: +	    return END_OF_BUFFER, '', index + 1 +	     +    elif pattern[index] in 'GluLUQE': +	raise error, ('\\' + pattern[index] + ' is not allowed') +     +    elif pattern[index] == 'w': +	if context == NORMAL: +	    return SYNTAX, reop.word, index + 1 +	elif context == CHARCLASS: +	    set = [] +	    for char in reop.syntax_table.keys(): +		if reop.syntax_table[char] & reop.word: +		    set.append(char) +	    return SET, set, index + 1 +	else: +	    return CHAR, 'w', index + 1 +	 +    elif pattern[index] == 'W': +	if context == NORMAL: +	    return NOT_SYNTAX, reop.word, index + 1 +	elif context == CHARCLASS: +	    set = [] +	    for char in reop.syntax_table.keys(): +		if not reop.syntax_table[char] & reop.word: +		    set.append(char) +	    return SET, set, index + 1 +	else: +	    return CHAR, 'W', index + 1 +	 +    elif pattern[index] == 's': +	if context == NORMAL: +	    return SYNTAX, reop.whitespace, index + 1 +	elif context == CHARCLASS: +	    set = [] +	    for char in reop.syntax_table.keys(): +		if reop.syntax_table[char] & reop.whitespace: +		    set.append(char) +	    return SET, set, index + 1 +	else: +	    return CHAR, 's', index + 1 +	 +    elif pattern[index] == 'S': +	if context == NORMAL: +	    return NOT_SYNTAX, reop.whitespace, index + 1 +	elif context == CHARCLASS: +	    set = [] +	    for char in reop.syntax_table.keys(): +		if not reop.syntax_table[char] & reop.whitespace: +		    set.append(char) +	    return SET, set, index + 1 +	else: +	    return CHAR, 'S', index + 1 +	 +    elif pattern[index] == 'd': +	if context == NORMAL: +	    return SYNTAX, reop.digit, index + 1 +	elif context == CHARCLASS: +	    set = [] +	    for char in reop.syntax_table.keys(): +		if reop.syntax_table[char] & reop.digit: +		    set.append(char) +	    return SET, set, index + 1 +	else: +	    return CHAR, 'd', index + 1 +	 +    elif pattern[index] == 'D': +	if context == NORMAL: +	    return NOT_SYNTAX, reop.digit, index + 1 +	elif context == CHARCLASS: +	    set = [] +	    for char in reop.syntax_table.keys(): +		if not reop.syntax_table[char] & reop.digit: +		    set.append(char) +	    return SET, set, index + 1 +	else: +	    return CHAR, 'D', index + 1 + +    elif pattern[index] in '0123456789': + +	if pattern[index] == '0': +	    if (index + 1 < len(pattern)) and \ +	       (pattern[index + 1] in string.octdigits): +		if (index + 2 < len(pattern)) and \ +		   (pattern[index + 2] in string.octdigits): +		    value = string.atoi(pattern[index:index + 3], 8) +		    index = index + 3 + +		else: +		    value = string.atoi(pattern[index:index + 2], 8) +		    index = index + 2 + +	    else: +		value = 0 +		index = index + 1 + +	    if value > 255: +		raise error, 'octal value out of range' + +	    return CHAR, chr(value), index +	 +	else: +	    if (index + 1 < len(pattern)) and \ +	       (pattern[index + 1] in string.digits): +		if (index + 2 < len(pattern)) and \ +		   (pattern[index + 2] in string.octdigits) and \ +		   (pattern[index + 1] in string.octdigits) and \ +		   (pattern[index] in string.octdigits): +		    value = string.atoi(pattern[index:index + 3], 8) +		    if value > 255: +			raise error, 'octal value out of range' + +		    return CHAR, chr(value), index + 3 + +		else: +		    value = string.atoi(pattern[index:index + 2]) +		    if (value < 1) or (value > 99): +			raise error, 'memory reference out of range' + +		    if context == CHARCLASS: +			raise error, ('cannot reference a register from ' +				      'inside a character class') +		    return MEMORY_REFERENCE, value, index + 2 + +	    else: +		if context == CHARCLASS: +		    raise error, ('cannot reference a register from ' +				  'inside a character class') + +		value = string.atoi(pattern[index]) +		return MEMORY_REFERENCE, value, index + 1 +	     +    elif pattern[index] == 'g': +	if context != REPLACEMENT: +	    return CHAR, 'g', index + 1 + +	index = index + 1 +	if index >= len(pattern): +	    raise error, 'unfinished symbolic reference' +	if pattern[index] != '<': +	    raise error, 'missing < in symbolic reference' + +	index = index + 1 +	end = string.find(pattern, '>', index) +	if end == -1: +	    raise error, 'unfinished symbolic reference' +	value = pattern[index:end] +	if not valid_identifier(value): +	    raise error, 'illegal symbolic reference' +	return MEMORY_REFERENCE, value, end + 1 +     +    else: +	return CHAR, pattern[index], index + 1 + +def compile(pattern, flags=0): +    stack = [] +    label = 0 +    register = 1 +    groupindex = {} +    lastop = '' + +    # look for embedded pattern modifiers at the beginning of the pattern + +    index = 0 + +    if len(pattern) >= 3 and \ +       (pattern[:2] == '(?') and \ +       (pattern[2] in 'iImMsSxX'): +	index = 2 +	while (index < len(pattern)) and (pattern[index] != ')'): +	    if pattern[index] in 'iI': +		flags = flags | IGNORECASE +	    elif pattern[index] in 'mM': +		flags = flags | MULTILINE +	    elif pattern[index] in 'sS': +		flags = flags | DOTALL +	    elif pattern[index] in 'xX': +		flags = flags | VERBOSE +	    else: +		raise error, 'unknown modifier' +	    index = index + 1 +	index = index + 1 + +    # compile the rest of the pattern +     +    while (index < len(pattern)): +	char = pattern[index] +	index = index + 1 +	if char == '\\': +	    escape_type, value, index = expand_escape(pattern, index) + +	    if escape_type == CHAR: +		stack.append([Exact(value, flags)]) +		lastop = '\\' + value +		 +	    elif escape_type == MEMORY_REFERENCE: +		if value >= register: +		    raise error, ('cannot reference a register ' +				  'not yet used') +		stack.append([MatchMemory(value)]) +		lastop = '\\1' +		 +	    elif escape_type == BEGINNING_OF_BUFFER: +		stack.append([BegBuf()]) +		lastop = '\\A' +		 +	    elif escape_type == END_OF_BUFFER: +		stack.append([EndBuf()]) +		lastop = '\\Z' +		 +	    elif escape_type == WORD_BOUNDARY: +		stack.append([WordBound()]) +		lastop = '\\b' +		 +	    elif escape_type == NOT_WORD_BOUNDARY: +		stack.append([NotWordBound()]) +		lastop = '\\B' +		 +	    elif escape_type == SYNTAX: +		stack.append([SyntaxSpec(value)]) +		if value == reop.word: +		    lastop = '\\w' +		elif value == reop.whitespace: +		    lastop = '\\s' +		elif value == reop.digit: +		    lastop = '\\d' +		else: +		    lastop = '\\?' +		     +	    elif escape_type == NOT_SYNTAX: +		stack.append([NotSyntaxSpec(value)]) +		if value == reop.word: +		    lastop = '\\W' +		elif value == reop.whitespace: +		    lastop = '\\S' +		elif value == reop.digit: +		    lastop = '\\D' +		else: +		    lastop = '\\?' +		 +	    elif escape_type == SET: +		raise error, 'cannot use set escape type here' +	     +	    else: +		raise error, 'unknown escape type' + +	elif char == '|': +	    expr = [] +	     +	    while (len(stack) != 0) and \ +		  (stack[-1][0].name != '(') and \ +		  (stack[-1][0].name != '|'): +		expr = stack[-1] + expr +		del stack[-1] +	    stack.append([FailureJump(label)] + \ +			 expr + \ +			 [Jump(-1), +			  Label(label)]) +	    stack.append([Alternation()]) +	    label = label + 1 +	    lastop = '|' +	     +	elif char == '(': +	    if index >= len(pattern): +		raise error, 'no matching close paren' + +	    elif pattern[index] == '?': +		# Perl style (?...) extensions +		index = index + 1 +		if index >= len(pattern): +		    raise error, 'extension ends prematurely' + +		elif pattern[index] == 'P': +		    # Python extensions +		    index = index + 1 +		    if index >= len(pattern): +			raise error, 'extension ends prematurely' + +		    elif pattern[index] == '<': +			# Handle Python symbolic group names (?P<...>...) +			index = index + 1 +			end = string.find(pattern, '>', index) +			if end == -1: +			    raise error, 'no end to symbolic group name' +			name = pattern[index:end] +			if not valid_identifier(name): +			    raise error, ('symbolic group name must be a ' +					  'valid identifier') +			index = end + 1 +			groupindex[name] = register +			stack.append([OpenParen(register)]) +			register = register + 1 +			lastop = '(' + +		    elif pattern[index] == '=': +			# backreference to symbolic group name +			if index >= len(pattern): +			    raise error, '(?P= at the end of the pattern' +			start = index + 1 +			end = string.find(pattern, ')', start) +			if end == -1: +			    raise error, 'no ) to end symbolic group name' +			name = pattern[start:end] +			if name not in groupindex.keys(): +			    raise error, ('symbolic group name ' + name + \ +					  ' has not been used yet') +			stack.append([MatchMemory(groupindex[name])]) +			index = end + 1 +			lastop = '(?P=)' +			 +		    else: +			raise error, ('unknown Python extension: ' + \ +				      pattern[index]) +		     +		elif pattern[index] == ':': +		    # grouping, but no registers +		    index = index + 1 +		    stack.append([OpenParen(-1)]) +		    lastop = '(' +		     +		elif pattern[index] == '#': +		    # comment +		    index = index + 1 +		    end = string.find(pattern, ')', index) +		    if end == -1: +			raise error, 'no end to comment' +		    index = end + 1 +		    # do not change lastop +		     +		elif pattern[index] == '=': +		    raise error, ('zero-width positive lookahead ' +				  'assertion is unsupported') + +		elif pattern[index] == '!': +		    raise error, ('zero-width negative lookahead ' +				  'assertion is unsupported') + +		elif pattern[index] in 'iImMsSxX': +		    raise error, ('embedded pattern modifiers are only ' +				  'allowed at the beginning of the pattern') + +		else: +		    raise error, 'unknown extension' + +	    else: +		stack.append([OpenParen(register)]) +		register = register + 1 +		lastop = '(' + +	elif char == ')': +	    # make one expression out of everything on the stack up to +	    # the marker left by the last parenthesis +	    expr = [] +	    while (len(stack) > 0) and (stack[-1][0].name != '('): +		expr = stack[-1] + expr +		del stack[-1] + +	    if len(stack) == 0: +		raise error, 'too many close parens' +	     +	    # remove markers left by alternation +	    expr = filter(lambda x: x.name != '|', expr) + +	    # clean up jumps inserted by alternation +	    need_label = 0 +	    for i in range(len(expr)): +		if (expr[i].name == 'jump') and (expr[i].label == -1): +		    expr[i] = Jump(label) +		    need_label = 1 +	    if need_label: +		expr.append(Label(label)) +		label = label + 1 + +	    if stack[-1][0].register > 0: +		expr = [StartMemory(stack[-1][0].register)] + \ +		       expr + \ +		       [EndMemory(stack[-1][0].register)] +	    del stack[-1] +	    stack.append(expr) +	    lastop = ')' + +	elif char == '{': +	    if len(stack) == 0: +		raise error, 'no expression to repeat' +	    end = string.find(pattern, '}', index) +	    if end == -1: +		raise error, ('no close curly bracket to match' +			      ' open curly bracket') + +	    fields = map(string.strip, +			 string.split(pattern[index:end], ',')) +	    index = end + 1 + +	    minimal = 0 +	    if (index < len(pattern)) and (pattern[index] == '?'): +		minimal = 1 +		index = index + 1 + +	    if len(fields) == 1: +		# {n} or {n}? (there's really no difference) +		try: +		    count = string.atoi(fields[0]) +		except ValueError: +		    raise error, ('count must be an integer ' +				  'inside curly braces') +		if count > 65535: +		    raise error, 'repeat count out of range' +		expr = [] +		while count > 0: +		    expr = expr + stack[-1] +		    count = count - 1 +		del stack[-1] +		stack.append(expr) +		if minimal: +		    lastop = '{n}?' +		else: +		    lastop = '{n}' + +	    elif len(fields) == 2: +		# {n,} or {n,m} +		if fields[1] == '': +		    # {n,} +		    try: +			min = string.atoi(fields[0]) +		    except ValueError: +			raise error, ('minimum must be an integer ' +				      'inside curly braces') +		    if min > 65535: +			raise error, 'minimum repeat count out of range' + +		    expr = [] +		    while min > 0: +			expr = expr + stack[-1] +			min = min - 1 +		    if minimal: +			expr = expr + \ +			       ([Jump(label + 1), +				 Label(label)] + \ +				stack[-1] + \ +				[Label(label + 1), +				 FailureJump(label)]) +			lastop = '{n,}?' +		    else: +			expr = expr + \ +			       ([Label(label), +				 FailureJump(label + 1)] + +				stack[-1] + +				[StarJump(label), +				 Label(label + 1)]) +			lastop = '{n,}' + +		    del stack[-1] +		    stack.append(expr) +		    label = label + 2 + +		else: +		    # {n,m} +		    try: +			min = string.atoi(fields[0]) +		    except ValueError: +			raise error, ('minimum must be an integer ' +				      'inside curly braces') +		    try: +			max = string.atoi(fields[1]) +		    except ValueError: +			raise error, ('maximum must be an integer ' +				      'inside curly braces') +		    if min > 65535: +			raise error, ('minumim repeat count out ' +				      'of range') +		    if max > 65535: +			raise error, ('maximum repeat count out ' +				      'of range') +		    if min > max: +			raise error, ('minimum repeat count must be ' +				      'less than the maximum ' +				      'repeat count') +		    expr = [] +		    while min > 0: +			expr = expr + stack[-1] +			min = min - 1 +			max = max - 1 +		    if minimal: +			while max > 0: +			    expr = expr + \ +				   [FailureJump(label), +				    Jump(label + 1), +				    Label(label)] + \ +				   stack[-1] + \ +				   [Label(label + 1)] +			    max = max - 1 +			    label = label + 2 +			del stack[-1] +			stack.append(expr) +			lastop = '{n,m}?' +		    else: +			while max > 0: +			    expr = expr + \ +				   [FailureJump(label)] + \ +				   stack[-1] +			    max = max - 1 +			del stack[-1] +			stack.append(expr + [Label(label)]) +			label = label + 1 +			lastop = '{n,m}' + +	    else: +		raise error, ('there need to be one or two fields ' +			      'in a {} expression') + +	elif char == '}': +	    raise error, 'unbalanced close curly brace' + +	elif char == '*': +	    # Kleene closure +	    if len(stack) == 0: +		raise error, '* needs something to repeat' + +	    if lastop in ['(', '|']: +		raise error, '* needs something to repeat' + +	    if lastop in repetition_operators: +		raise error, 'nested repetition operators' +	     +	    if (index < len(pattern)) and (pattern[index] == '?'): +		# non-greedy matching +		expr = [Jump(label + 1), +			Label(label)] + \ +		       stack[-1] + \ +		       [Label(label + 1), +			FailureJump(label)] +		index = index + 1 +		lastop = '*?' +	    else: +		# greedy matching +		expr = [Label(label), +			FailureJump(label + 1)] + \ +		       stack[-1] + \ +		       [StarJump(label), +			Label(label + 1)] +		lastop = '*' +	    del stack[-1] +	    stack.append(expr) +	    label = label + 2 + +	elif char == '+': +	    # positive closure +	    if len(stack) == 0: +		raise error, '+ needs something to repeat' +	     +	    if lastop in ['(', '|']: +		raise error, '+ needs something to repeat' + +	    if lastop in repetition_operators: +		raise error, 'nested repetition operators' + +	    if (index < len(pattern)) and (pattern[index] == '?'): +		# non-greedy +		expr = [Label(label)] + \ +		       stack[-1] + \ +		       [FailureJump(label)] +		label = label + 1 +		index = index + 1 +		lastop = '+?' +		 +	    else: +		# greedy +		expr = [DummyFailureJump(label + 1), +			Label(label), +			FailureJump(label + 2), +			Label(label + 1)] + \ +		       stack[-1] + \ +		       [StarJump(label), +			Label(label + 2)] +		label = label + 3 +		lastop = '+' +		 +	    del stack[-1] +	    stack.append(expr) + +	elif char == '?': +	    if len(stack) == 0: +		raise error, 'need something to be optional' +	     +	    if len(stack) == 0: +		raise error, '? needs something to repeat' +	     +	    if lastop in ['(', '|']: +		raise error, '? needs something to repeat' + +	    if lastop in repetition_operators: +		raise error, 'nested repetition operators' + +	    if (index < len(pattern)) and (pattern[index] == '?'): +		# non-greedy matching +		expr = [FailureJump(label), +			Jump(label + 1), +			Label(label)] + \ +		       stack[-1] + \ +		       [Label(label + 1)] +		label = label + 2 +		index = index + 1 +		lastop = '??' +		 +	    else: +		# greedy matching +		expr = [FailureJump(label)] + \ +		       stack[-1] + \ +		       [Label(label)] +		label = label + 1 +		lastop = '?' +		 +	    del stack[-1] +	    stack.append(expr) + +	elif char == '.': +	    if flags & DOTALL: +		stack.append([Set(map(chr, range(256)), flags)]) +	    else: +		stack.append([AnyChar()]) +	    lastop = '.' + +	elif char == '^': +	    if flags & MULTILINE: +		stack.append([Bol()]) +	    else: +		stack.append([BegBuf()]) +	    lastop = '^' + +	elif char == '$': +	    if flags & MULTILINE: +		stack.append([Eol()]) +	    else: +		stack.append([EndBuf()]) +	    lastop = '$' + +	elif char == '#': +	    if flags & VERBOSE: +		# comment +		index = index + 1 +		end = string.find(pattern, '\n', index) +		if end == -1: +		    index = len(pattern) +		else: +		    index = end + 1 +		# do not change lastop +	    else: +		stack.append([Exact(char, flags)]) +		lastop = '#' + +	elif char in string.whitespace: +	    if not (flags & VERBOSE): +		stack.append([Exact(char, flags)]) +		lastop = char + +	elif char == '[': +	    # compile character class +	     +	    if index >= len(pattern): +		raise error, 'unclosed character class' +	     +	    negate = 0 +	    last = '' +	    set = [] + +	    if pattern[index] == '^': +		negate = 1 +		index = index + 1 +		if index >= len(pattern): +		    raise error, 'unclosed character class' + +	    if pattern[index] == ']': +		set.append(']') +		index = index + 1 +		if index >= len(pattern): +		    raise error, 'unclosed character class' +		 +	    elif pattern[index] == '-': +		set.append('-') +		index = index + 1 +		if index >= len(pattern): +		    raise error, 'unclosed character class' +		 +	    while (index < len(pattern)) and (pattern[index] != ']'): +		next = pattern[index] +		index = index + 1 +		if next == '-': +		    if index >= len(pattern): +			raise error, 'incomplete range in character class' + +		    elif pattern[index] == ']': +			set.append('-') +			 +		    else: +			if last == '': +			    raise error, ('improper use of range in ' +					  'character class') + +			start = last +			 +			if pattern[index] == '\\': +			    escape_type, +			    value, +			    index = expand_escape(pattern, +						  index + 1, +						  CHARCLASS) + +			    if escape_type == CHAR: +				end = value +				 +			    else: +				raise error, ('illegal escape in character ' +					      'class range') +			else: +			    end = pattern[index] +			    index = index + 1 + +			if start > end: +			    raise error, ('range arguments out of order ' +					  'in character class') +			 +			for char in map(chr, range(ord(start), ord(end) + 1)): +			    if char not in set: +				set.append(char) +			     +			last = '' + +		elif next == '\\': +		    # expand syntax meta-characters and add to set +		    if index >= len(pattern): +			raise error, 'incomplete set' + +		    escape_type, value, index = expand_escape(pattern, +							      index, +							      CHARCLASS) + +		    if escape_type == CHAR: +			set.append(value) +			last = value + +		    elif escape_type == SET: +			for char in value: +			    if char not in set: +				set.append(char) +			last = '' + +		    else: +			raise error, 'illegal escape type in character class' + +		else: +		    if next not in set: +			set.append(next) +		    last = next +		     +	    if (index >= len(pattern)) or ( pattern[index] != ']'): +		raise error, 'incomplete set' + +	    index = index + 1 + +	    if negate: +		# If case is being ignored, then both upper- and lowercase +		# versions of the letters must be excluded. +		if flags & IGNORECASE: set=set+map(string.upper, set) +		notset = [] +		for char in map(chr, range(256)): +		    if char not in set: +			notset.append(char) +		if len(notset) == 0: +		    raise error, 'empty negated set' +		stack.append([Set(notset, flags)]) +	    else: +		if len(set) == 0: +		    raise error, 'empty set' +		stack.append([Set(set, flags)]) + +	    lastop = '[]' + +	else: +	    stack.append([Exact(char, flags)]) +	    lastop = char + +    code = [] +    while len(stack) > 0: +	if stack[-1][0].name == '(': +	    raise error, 'too many open parens' +	code = stack[-1] + code +	del stack[-1] +    if len(code) == 0: +	raise error, 'no code generated' +    code = filter(lambda x: x.name != '|', code) +    need_label = 0 +    for i in range(len(code)): +	if (code[i].name == 'jump') and (code[i].label == -1): +	    code[i] = Jump(label) +	    need_label = 1 +    if need_label: +	code.append(Label(label)) +	label = label + 1 +    code.append(End()) +#    print code +    return RegexObject(pattern, flags, code, register, groupindex) + +# Replace expand_escape and _expand functions with their C equivalents. +# If you suspect bugs in the C versions, comment out the next two lines +expand_escape = reop.expand_escape +_expand  = reop._expand  | 
