/tags/v1_0_2_Build_899/extensions/extScripting/scripts/jython/Lib/sre_parse.py - Application: gvSIG desktop - gvSIG

svn-gvsig-desktop / tags / v1_0_2_Build_899 / extensions / extScripting / scripts / jython / Lib / sre_parse.py @ 10517

History | View | Annotate | Download (23.8 KB)

+      #
       # Secret Labs' Regular Expression Engine
+      #
       # convert re-style regular expression to sre pattern
+      #
       # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
+      #
       # See the sre.py file for information on usage and redistribution.
+      #
       # XXX: show string offset and offending character for all errors
       # this module works under 1.5.2 and later.  don't use string methods
       import string, sys
       from sre_constants import *
       SPECIAL_CHARS = ".\\[{()*+?^$|"
       REPEAT_CHARS = "*+?{"
       DIGITS = tuple("0123456789")
       OCTDIGITS = tuple("01234567")
       HEXDIGITS = tuple("0123456789abcdefABCDEF")
       WHITESPACE = tuple(" \t\n\r\v\f")
       ESCAPES = {
           r"\a": (LITERAL, ord("\a")),
           r"\b": (LITERAL, ord("\b")),
           r"\f": (LITERAL, ord("\f")),
           r"\n": (LITERAL, ord("\n")),
           r"\r": (LITERAL, ord("\r")),
           r"\t": (LITERAL, ord("\t")),
           r"\v": (LITERAL, ord("\v")),
           r"\\": (LITERAL, ord("\\"))
+      }
       CATEGORIES = {
           r"\A": (AT, AT_BEGINNING_STRING), # start of string
           r"\b": (AT, AT_BOUNDARY),
           r"\B": (AT, AT_NON_BOUNDARY),
           r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
           r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
           r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
           r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
           r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
           r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
           r"\Z": (AT, AT_END_STRING), # end of string
+      }
       FLAGS = {
           # standard flags
           "i": SRE_FLAG_IGNORECASE,
           "L": SRE_FLAG_LOCALE,
           "m": SRE_FLAG_MULTILINE,
           "s": SRE_FLAG_DOTALL,
           "x": SRE_FLAG_VERBOSE,
           # extensions
           "t": SRE_FLAG_TEMPLATE,
           "u": SRE_FLAG_UNICODE,
+      }
       # figure out best way to convert hex/octal numbers to integers
       try:
           int("10", 8)
           atoi = int # 2.0 and later
       except TypeError:
           atoi = string.atoi # 1.5.2
       class Pattern:
           # master pattern object.  keeps track of global attributes
           def __init__(self):
               self.flags = 0
               self.open = []
               self.groups = 1
               self.groupdict = {}
           def opengroup(self, name=None):
               gid = self.groups
               self.groups = gid + 1
               if name:
                   self.groupdict[name] = gid
               self.open.append(gid)
               return gid
           def closegroup(self, gid):
               self.open.remove(gid)
           def checkgroup(self, gid):
               return gid < self.groups and gid not in self.open
       class SubPattern:
           # a subpattern, in intermediate form
           def __init__(self, pattern, data=None):
               self.pattern = pattern
               if not data:
                   data = []
               self.data = data
               self.width = None
           def dump(self, level=0):
               nl = 1
               for op, av in self.data:
                   print level*"  " + op,; nl = 0
                   if op == "in":
                       # member sublanguage
                       print; nl = 1
                       for op, a in av:
                           print (level+1)*"  " + op, a
                   elif op == "branch":
                       print; nl = 1
                       i = 0
                       for a in av[1]:
                           if i > 0:
                               print level*"  " + "or"
                           a.dump(level+1); nl = 1
                           i = i + 1
                   elif type(av) in (type(()), type([])):
                       for a in av:
                           if isinstance(a, SubPattern):
                               if not nl: print
                               a.dump(level+1); nl = 1
                           else:
                               print a, ; nl = 0
                   else:
                       print av, ; nl = 0
                   if not nl: print
           def __repr__(self):
               return repr(self.data)
           def __len__(self):
               return len(self.data)
           def __delitem__(self, index):
               del self.data[index]
           def __getitem__(self, index):
               return self.data[index]
           def __setitem__(self, index, code):
               self.data[index] = code
           def __getslice__(self, start, stop):
               return SubPattern(self.pattern, self.data[start:stop])
           def insert(self, index, code):
               self.data.insert(index, code)
           def append(self, code):
               self.data.append(code)
           def getwidth(self):
               # determine the width (min, max) for this subpattern
               if self.width:
                   return self.width
               lo = hi = 0L
               for op, av in self.data:
                   if op is BRANCH:
                       i = sys.maxint
                       j = 0
                       for av in av[1]:
                           l, h = av.getwidth()
                           i = min(i, l)
                           j = max(j, h)
                       lo = lo + i
                       hi = hi + j
                   elif op is CALL:
                       i, j = av.getwidth()
                       lo = lo + i
                       hi = hi + j
                   elif op is SUBPATTERN:
                       i, j = av[1].getwidth()
                       lo = lo + i
                       hi = hi + j
                   elif op in (MIN_REPEAT, MAX_REPEAT):
                       i, j = av[2].getwidth()
                       lo = lo + long(i) * av[0]
                       hi = hi + long(j) * av[1]
                   elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
                       lo = lo + 1
                       hi = hi + 1
                   elif op == SUCCESS:
                       break
               self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
               return self.width
       class Tokenizer:
           def __init__(self, string):
               self.string = string
               self.index = 0
               self.__next()
           def __next(self):
               if self.index >= len(self.string):
                   self.next = None
                   return
               char = self.string[self.index]
               if char[0] == "\\":
                   try:
                       c = self.string[self.index + 1]
                   except IndexError:
                       raise error, "bogus escape"
                   char = char + c
               self.index = self.index + len(char)
               self.next = char
           def match(self, char, skip=1):
               if char == self.next:
                   if skip:
                       self.__next()
                   return 1
               return 0
           def get(self):
               this = self.next
               self.__next()
               return this
           def tell(self):
               return self.index, self.next
           def seek(self, index):
               self.index, self.next = index
       def isident(char):
           return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
       def isdigit(char):
           return "0" <= char <= "9"
       def isname(name):
           # check that group name is a valid string
           if not isident(name[0]):
               return 0
           for char in name:
               if not isident(char) and not isdigit(char):
                   return 0
           return 1
       def _group(escape, groups):
           # check if the escape string represents a valid group
           try:
               gid = atoi(escape[1:])
               if gid and gid < groups:
                   return gid
           except ValueError:
               pass
           return None # not a valid group
       def _class_escape(source, escape):
           # handle escape code inside character class
           code = ESCAPES.get(escape)
           if code:
               return code
           code = CATEGORIES.get(escape)
           if code:
               return code
           try:
               if escape[1:2] == "x":
                   # hexadecimal escape (exactly two digits)
                   while source.next in HEXDIGITS and len(escape) < 4:
                       escape = escape + source.get()
                   escape = escape[2:]
                   if len(escape) != 2:
                       raise error, "bogus escape: %s" % repr("\\" + escape)
                   return LITERAL, atoi(escape, 16) & 0xff
               elif str(escape[1:2]) in OCTDIGITS:
                   # octal escape (up to three digits)
                   while source.next in OCTDIGITS and len(escape) < 5:
                       escape = escape + source.get()
                   escape = escape[1:]
                   return LITERAL, atoi(escape, 8) & 0xff
               if len(escape) == 2:
                   return LITERAL, ord(escape[1])
           except ValueError:
               pass
           raise error, "bogus escape: %s" % repr(escape)
       def _escape(source, escape, state):
           # handle escape code in expression
           code = CATEGORIES.get(escape)
           if code:
               return code
           code = ESCAPES.get(escape)
           if code:
               return code
           try:
               if escape[1:2] == "x":
                   # hexadecimal escape
                   while source.next in HEXDIGITS and len(escape) < 4:
                       escape = escape + source.get()
                   if len(escape) != 4:
                       raise ValueError
                   return LITERAL, atoi(escape[2:], 16) & 0xff
               elif escape[1:2] == "0":
                   # octal escape
                   while source.next in OCTDIGITS and len(escape) < 4:
                       escape = escape + source.get()
                   return LITERAL, atoi(escape[1:], 8) & 0xff
               elif escape[1:2] in DIGITS:
                   # octal escape *or* decimal group reference (sigh)
                   here = source.tell()
                   if source.next in DIGITS:
                       escape = escape + source.get()
                       if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                           source.next in OCTDIGITS):
                           # got three octal digits; this is an octal escape
                           escape = escape + source.get()
                           return LITERAL, atoi(escape[1:], 8) & 0xff
                   # got at least one decimal digit; this is a group reference
                   group = _group(escape, state.groups)
                   if group:
                       if not state.checkgroup(group):
                           raise error, "cannot refer to open group"
                       return GROUPREF, group
                   raise ValueError
               if len(escape) == 2:
                   return LITERAL, ord(escape[1])
           except ValueError:
               pass
           raise error, "bogus escape: %s" % repr(escape)
       def _parse_sub(source, state, nested=1):
           # parse an alternation: a|b|c
           items = []
           while 1:
               items.append(_parse(source, state))
               if source.match("|"):
                   continue
               if not nested:
                   break
               if not source.next or source.match(")", 0):
                   break
               else:
                   raise error, "pattern not properly closed"
           if len(items) == 1:
               return items[0]
           subpattern = SubPattern(state)
           # check if all items share a common prefix
           while 1:
               prefix = None
               for item in items:
                   if not item:
                       break
                   if prefix is None:
                       prefix = item[0]
                   elif item[0] != prefix:
                       break
               else:
                   # all subitems start with a common "prefix".
                   # move it out of the branch
                   for item in items:
                       del item[0]
                   subpattern.append(prefix)
                   continue # check next one
               break
           # check if the branch can be replaced by a character set
           for item in items:
               if len(item) != 1 or item[0][0] != LITERAL:
                   break
           else:
               # we can store this as a character set instead of a
               # branch (the compiler may optimize this even more)
               set = []
               for item in items:
                   set.append(item[0])
               subpattern.append((IN, set))
               return subpattern
           subpattern.append((BRANCH, (None, items)))
           return subpattern
       def _parse(source, state):
           # parse a simple pattern
           subpattern = SubPattern(state)
           while 1:
               if source.next in ("|", ")"):
                   break # end of subpattern
               this = source.get()
               if this is None:
                   break # end of pattern
               if state.flags & SRE_FLAG_VERBOSE:
                   # skip whitespace and comments
                   if this in WHITESPACE:
                       continue
                   if this == "#":
                       while 1:
                           this = source.get()
                           if this in (None, "\n"):
                               break
                       continue
               if this and this[0] not in SPECIAL_CHARS:
                   subpattern.append((LITERAL, ord(this)))
               elif this == "[":
                   # character set
                   set = []
       ##          if source.match(":"):
       ##              pass # handle character classes
                   if source.match("^"):
                       set.append((NEGATE, None))
                   # check remaining characters
                   start = set[:]
                   while 1:
                       this = source.get()
                       if this == "]" and set != start:
                           break
                       elif this and this[0] == "\\":
                           code1 = _class_escape(source, this)
                       elif this:
                           code1 = LITERAL, ord(this)
                       else:
                           raise error, "unexpected end of regular expression"
                       if source.match("-"):
                           # potential range
                           this = source.get()
                           if this == "]":
                               if code1[0] is IN:
                                   code1 = code1[1][0]
                               set.append(code1)
                               set.append((LITERAL, ord("-")))
                               break
                           else:
                               if this[0] == "\\":
                                   code2 = _class_escape(source, this)
                               else:
                                   code2 = LITERAL, ord(this)
                               if code1[0] != LITERAL or code2[0] != LITERAL:
                                   raise error, "bad character range"
                               lo = code1[1]
                               hi = code2[1]
                               if hi < lo:
                                   raise error, "bad character range"
                               set.append((RANGE, (lo, hi)))
                       else:
                           if code1[0] is IN:
                               code1 = code1[1][0]
                           set.append(code1)
                   # XXX: <fl> should move set optimization to compiler!
                   if len(set)==1 and set[0][0] is LITERAL:
                       subpattern.append(set[0]) # optimization
                   elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
                       subpattern.append((NOT_LITERAL, set[1][1])) # optimization
                   else:
                       # XXX: <fl> should add charmap optimization here
                       subpattern.append((IN, set))
               elif this and this[0] in REPEAT_CHARS:
                   # repeat previous item
                   if this == "?":
                       min, max = 0, 1
                   elif this == "*":
                       min, max = 0, MAXREPEAT
                   elif this == "+":
                       min, max = 1, MAXREPEAT
                   elif this == "{":
                       here = source.tell()
                       min, max = 0, MAXREPEAT
                       lo = hi = ""
                       while source.next in DIGITS:
                           lo = lo + source.get()
                       if source.match(","):
                           while source.next in DIGITS:
                               hi = hi + source.get()
                       else:
                           hi = lo
                       if not source.match("}"):
                           subpattern.append((LITERAL, ord(this)))
                           source.seek(here)
                           continue
                       if lo:
                           min = atoi(lo)
                       if hi:
                           max = atoi(hi)
                       if max < min:
                           raise error, "bad repeat interval"
                   else:
                       raise error, "not supported"
                   # figure out which item to repeat
                   if subpattern:
                       item = subpattern[-1:]
                   else:
                       item = None
                   if not item or (len(item) == 1 and item[0][0] == AT):
                       raise error, "nothing to repeat"
                   if item[0][0] in (MIN_REPEAT, MAX_REPEAT):
                       raise error, "multiple repeat"
                   if source.match("?"):
                       subpattern[-1] = (MIN_REPEAT, (min, max, item))
                   else:
                       subpattern[-1] = (MAX_REPEAT, (min, max, item))
               elif this == ".":
                   subpattern.append((ANY, None))
               elif this == "(":
                   group = 1
                   name = None
                   if source.match("?"):
                       group = 0
                       # options
                       if source.match("P"):
                           # python extensions
                           if source.match("<"):
                               # named group: skip forward to end of name
                               name = ""
                               while 1:
                                   char = source.get()
                                   if char is None:
                                       raise error, "unterminated name"
                                   if char == ">":
                                       break
                                   name = name + char
                               group = 1
                               if not isname(name):
                                   raise error, "bad character in group name"
                           elif source.match("="):
                               # named backreference
                               name = ""
                               while 1:
                                   char = source.get()
                                   if char is None:
                                       raise error, "unterminated name"
                                   if char == ")":
                                       break
                                   name = name + char
                               if not isname(name):
                                   raise error, "bad character in group name"
                               gid = state.groupdict.get(name)
                               if gid is None:
                                   raise error, "unknown group name"
                               subpattern.append((GROUPREF, gid))
                               continue
                           else:
                               char = source.get()
                               if char is None:
                                   raise error, "unexpected end of pattern"
                               raise error, "unknown specifier: ?P%s" % char
                       elif source.match(":"):
                           # non-capturing group
                           group = 2
                       elif source.match("#"):
                           # comment
                           while 1:
                               if source.next is None or source.next == ")":
                                   break
                               source.get()
                           if not source.match(")"):
                               raise error, "unbalanced parenthesis"
                           continue
                       elif source.next in ("=", "!", "<"):
                           # lookahead assertions
                           char = source.get()
                           dir = 1
                           if char == "<":
                               if source.next not in ("=", "!"):
                                   raise error, "syntax error"
                               dir = -1 # lookbehind
                               char = source.get()
                           p = _parse_sub(source, state)
                           if not source.match(")"):
                               raise error, "unbalanced parenthesis"
                           if char == "=":
                               subpattern.append((ASSERT, (dir, p)))
                           else:
                               subpattern.append((ASSERT_NOT, (dir, p)))
                           continue
                       else:
                           # flags
                           if not FLAGS.has_key(source.next):
                               raise error, "unexpected end of pattern"
                           while FLAGS.has_key(source.next):
                               state.flags = state.flags | FLAGS[source.get()]
                   if group:
                       # parse group contents
                       if group == 2:
                           # anonymous group
                           group = None
                       else:
                           group = state.opengroup(name)
                       p = _parse_sub(source, state)
                       if not source.match(")"):
                           raise error, "unbalanced parenthesis"
                       if group is not None:
                           state.closegroup(group)
                       subpattern.append((SUBPATTERN, (group, p)))
                   else:
                       while 1:
                           char = source.get()
                           if char is None:
                               raise error, "unexpected end of pattern"
                           if char == ")":
                               break
                           raise error, "unknown extension"
               elif this == "^":
                   subpattern.append((AT, AT_BEGINNING))
               elif this == "$":
                   subpattern.append((AT, AT_END))
               elif this and this[0] == "\\":
                   code = _escape(source, this, state)
                   subpattern.append(code)
               else:
                   raise error, "parser error"
           return subpattern
       def parse(str, flags=0, pattern=None):
           # parse 're' pattern into list of (opcode, argument) tuples
           source = Tokenizer(str)
           if pattern is None:
               pattern = Pattern()
           pattern.flags = flags
           pattern.str = str
           p = _parse_sub(source, pattern, 0)
           tail = source.get()
           if tail == ")":
               raise error, "unbalanced parenthesis"
           elif tail:
               raise error, "bogus characters at end of regular expression"
           if flags & SRE_FLAG_DEBUG:
               p.dump()
           if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
               # the VERBOSE flag was switched on inside the pattern.  to be
               # on the safe side, we'll parse the whole thing again...
               return parse(str, p.pattern.flags)
           return p
       def parse_template(source, pattern):
           # parse 're' replacement string into list of literals and
           # group references
           s = Tokenizer(source)
           p = []
           a = p.append
           def literal(literal, p=p):
               if p and p[-1][0] is LITERAL:
                   p[-1] = LITERAL, p[-1][1] + literal
               else:
                   p.append((LITERAL, literal))
           sep = source[:0]
           if type(sep) is type(""):
               char = chr
           else:
               char = unichr
           while 1:
               this = s.get()
               if this is None:
                   break # end of replacement string
               if this and this[0] == "\\":
                   # group
                   if this == "\\g":
                       name = ""
                       if s.match("<"):
                           while 1:
                               char = s.get()
                               if char is None:
                                   raise error, "unterminated group name"
                               if char == ">":
                                   break
                               name = name + char
                       if not name:
                           raise error, "bad group name"
                       try:
                           index = atoi(name)
                       except ValueError:
                           if not isname(name):
                               raise error, "bad character in group name"
                           try:
                               index = pattern.groupindex[name]
                           except KeyError:
                               raise IndexError, "unknown group name"
                       a((MARK, index))
                   elif len(this) > 1 and this[1] in DIGITS:
                       code = None
                       while 1:
                           group = _group(this, pattern.groups+1)
                           if group:
                               if (s.next not in DIGITS or
                                   not _group(this + s.next, pattern.groups+1)):
                                   code = MARK, group
                                   break
                           elif s.next in OCTDIGITS:
                               this = this + s.get()
                           else:
                               break
                       if not code:
                           this = this[1:]
                           code = LITERAL, char(atoi(this[-6:], 8) & 0xff)
                       if code[0] is LITERAL:
                           literal(code[1])
                       else:
                           a(code)
                   else:
                       try:
                           this = char(ESCAPES[this][1])
                       except KeyError:
                           pass
                       literal(this)
               else:
                   literal(this)
           # convert template to groups and literals lists
           i = 0
           groups = []
           literals = []
           for c, s in p:
               if c is MARK:
                   groups.append((i, s))
                   literals.append(None)
               else:
                   literals.append(s)
               i = i + 1
           return groups, literals
       def expand_template(template, match):
           g = match.group
           sep = match.string[:0]
           groups, literals = template
           literals = literals[:]
           try:
               for index, group in groups:
                   literals[index] = s = g(group)
                   if s is None:
                       raise IndexError
           except IndexError:
               raise error, "empty group"
           return string.join(literals, sep)