Source code for reCompiler

""" Module re_compile -- compile a regular expression into an FSA

To Do
-----
New features:
    - add \-, \~
    - add remaining metachars
    - char set with ^ as first char will print wrong
    - figure out when to print spaces between operators
"""

__author__  = "Oliver Steele <steele@osteele.com>"

from functools import reduce
import charm.toolbox.FSA as FSA

[docs]def compileSymbolRE(str): return SymbolRECompiler(str).toFSA()
[docs]def dummy_func(a, b): return a, b
[docs]class SymbolRECompiler: EOF = -1 def __init__(self, str, recordSourcePositions=0): self.str = str self.recordSourcePositions = recordSourcePositions
[docs] def toFSA(self, minimize=1): self.index = 0 self.nextToken = None fsa = self.compileExpr() if self.index < len(self.str): raise ValueError('extra ' + str(')')) del self.index fsa.label = self.str if minimize: fsa = fsa.minimized() return fsa
[docs] def readChar(self): if self.index < len(self.str): c, self.index = self.str[self.index], self.index + 1 return c
[docs] def peekChar(self): if self.index < len(self.str): return self.str[self.index]
[docs] def readToken(self): token = self.nextToken or self._readNextToken() self.nextToken = None return token != self.EOF and token
[docs] def peekToken(self): token = self.nextToken = self.nextToken or self._readNextToken() #print 'peekToken', token return token != self.EOF and token
def _readNextToken(self): c = self.readChar() if not c: return self.EOF elif c in '()|&': return c elif c == '.': return ANY return c
[docs] def skipTokens(self, bag): while self.peekToken() and self.peekToken() in bag: self.readToken()
[docs] def compileExpr(self): fsa = FSA.NULL_FSA while self.peekToken() and self.peekToken() != ')': fsa = FSA.union(fsa, self.compileConjunction()) self.skipTokens(['|']) return fsa
[docs] def compileConjunction(self): fsa = None while self.peekToken() and self.peekToken() not in (')', '|'): sequence = self.compileSequence() fsa = fsa and FSA.intersection(fsa, sequence) or sequence self.skipTokens(['&']) return fsa
[docs] def compileSequence(self): fsa = FSA.EMPTY_STRING_FSA while self.peekToken() and self.peekToken() not in (')', '|', '&'): fsa = FSA.concatenation(fsa, self.compileItem()) return fsa
[docs] def compileItem(self): startPosition = self.index c = self.readToken() if c == '(': fsa = self.compileExpr() if self.readToken() != ')': raise ValueError("missing ')'") elif c == '~': fsa = FSA.complement(self.compileItem()) else: fsa = FSA.singleton(c, arcMetadata=self.recordSourcePositions and [startPosition]) while self.peekChar() and self.peekChar() in '?*+': c = self.readChar() if c == '*': fsa = FSA.closure(fsa) elif c == '?': fsa = FSA.union(fsa, FSA.EMPTY_STRING_FSA) elif c == '+': fsa = FSA.iteration(fsa) else: raise ValueError('program error') return fsa
# # Character REs #
[docs]class CharacterSet: def __init__(self, ranges): if type(ranges) == str: ranges = self.convertString(ranges) accum = [] # copy, so sort doesn't destroy the arg for item in ranges: if type(item) == tuple: if len(item) == 1: accum.append((item, item)) elif len(item) == 2: accum.append(item) else: raise ValueError("invalid argument to CharacterSet") elif type(item) == str: for c in item: accum.append((c, c)) else: raise ValueError("invalid argument to CharacterSet") ranges = accum ranges.sort() index = 0 while index < len(ranges) - 1: [(c0, c1), (c2, c3)] = ranges[index:index + 2] if c1 >= c2: ranges[index:index + 2] = [(c0, max(c1, c3))] else: index = index + 1 self.ranges = ranges def __cmp__(self, other): return cmp(type(self), type(other)) or cmp(self.__class__, other.__class__) or cmp(self.ranges, other.ranges) def __hash__(self): return reduce(lambda a, b:a ^ b, map(hash, self.ranges))
[docs] def convertString(self, _str): ranges = [] index = 0 while index < len(_str): c0 = c1 = _str[index] index = index + 1 if index + 1 < len(_str) and _str[index ] == '-': c1 = _str[index + 1] index = index + 2 ranges.append((c0, c1)) return ranges
[docs] def matches(self, c): for c0, c1 in self.ranges: if c0 <= c and c <= c1: return 1 return 0
[docs] def complement(self): results = [] for (_, c0), (c1, _) in map(dummy_func, [(None, None)] + self.ranges, self.ranges + [(None, None)]): i0 = c0 and ord(c0) + 1 or 0 i1 = c1 and ord(c1) - 1 or 255 if i0 <= i1: results.append((chr(i0), chr(i1))) if results: return CharacterSet(results)
[docs] def union(self, other): a = self.complement() b = other.complement() if a and b: c = a.intersection(b) if c: return c.complement() else: return self.ANY else: return a or b
def __add__(self, other): return self.union(other)
[docs] def intersection(self, other): if self.ranges == other.ranges: return self results = [] for (a0, a1) in self.ranges: for (b0, b1) in other.ranges: c0 = max(a0, b0) c1 = min(a1, b1) if c0 <= c1: results.append((c0, c1)) results.sort() if results: return CharacterSet(results)
def __str__(self): """ ### print(CharacterSet([('a', 'a')])) a ### print(CharacterSet([('a', 'b')])) [ab] """ if self == self.ANY: return '.' elif not self.ranges: return '[^.]' for key, value in METACHARS.items(): if self == value: return '\\' + key ranges = self.ranges if len(ranges) == 1 and ranges[0][0] == ranges[0][1]: return ranges[0][0] if ranges[0][0] == chr(0) and ranges[-1][1] == chr(255): s = str(self.complement()) if s[0] == '[' and s[-1] == ']': s = s[1:-1] return '[^' + s + ']' s = '' for c0, c1 in ranges: if c0 == c1 and c0 != '-': s = s + self.crep(c0) elif ord(c0) + 1 == ord(c1) and c0 != '-' and c1 != '-': s = s + "%s%s" % (self.crep(c0), self.crep(c1)) else: s = s + "%s-%s" % (self.crep(c0), self.crep(c1)) return '[' + s + ']'
[docs] def crep(self, c): return {'\t': '\\t', '\n': '\\n', '\r': '\\r', '\f': '\\f', '\v': '\\v'}.get(c, c)
def __repr__(self): return '<' + self.__class__.__name__ + ' ' + str(self) + '>'
METACHARS = { 'd': CharacterSet('0-9'), 's': CharacterSet(' \t\n\r\f\v'), 'w': CharacterSet('a-zA-Z0-9')} METACHARS['D'] = METACHARS['d'].complement() METACHARS['S'] = METACHARS['s'].complement() METACHARS['W'] = METACHARS['w'].complement() CharacterSet.ANY = CharacterSet([(chr(0), chr(255))])
[docs]class RECompiler(SymbolRECompiler): def _readNextToken(self): c = self.readChar() if not c: return self.EOF elif c in '()|': return c elif c == '.': return CharacterSet.ANY elif c == '[': if self.peekChar() == '~': self.readChar() return self.readCSetInnards().complement() else: return self.readCSetInnards() elif c == '\\': c = self.readChar() if METACHARS.get(c): return METACHARS.get(c) elif c == '&': return c else: return CharacterSet([(c,c)]) else: return CharacterSet([(c,c)])
[docs] def readCSetInnards(self): cset = CharacterSet([]) while 1: c = self.readChar() if c == ']': return cset if self.peekChar() == '-': self.readChar() cset = cset.union(CharacterSet([(c, self.readChar())])) else: cset = cset.union(CharacterSet([(c, c)]))
[docs]def compileRE(_str, minimize=1, recordSourcePositions=0): return RECompiler(_str, recordSourcePositions=recordSourcePositions).toFSA(minimize=minimize)
# # testing # def _printCompiledREs(): print (compileRE('a')) print (compileRE('ab')) print (compileRE('a|b')) print (compileRE('abc')) print (compileRE('ab*c')) print (compileRE('ab?c')) print (compileRE('ab+c')) print (compileRE('ab|c')) print (compileRE('a(b|c)')) #print compileRE('a\&a') #print compileRE('ab+\&a+b') #print compileRE('ab*\&a*b') print (compileRE('ab|c?')) print (compileRE('ab|bc?')) print (compileRE('a?')) print (compileRE('abc|acb|bac|bca|cab|cba')) print (compileRE('abc|acb|bac|bca|cab|cba', 0).determinized()) print (compileRE('abc|acb|bac|bca|cab|cba', 0).determinized()) print (compileRE('abc|acb|bac|bca|cab|cba', 0).minimized()) print (compileRE('abc|acb|bac|bca|cab', 0).determinized()) print (compileRE('a', 0)) print (compileRE('a', 0).determinized()) print (compileRE('ab', 0).determinized()) print (compileRE('a', 0).minimized()) print (compileRE('ab', 0).minimized()) print (compileRE('a')) print (compileRE('a|b', 0).determinized()) print (compileRE('a|b', 0).minimized().getArcMetadata()) print (compileRE('a|b', 0).minimized()) def _test(reset=0): import doctest, compileRE if reset: doctest.master = None # This keeps doctest from complaining after a reload. return doctest.testmod(compileRE)