Statistics
| Revision:

svn-gvsig-desktop / tags / v1_0_2_Build_899 / extensions / extScripting / scripts / jython / Lib / sre_parse.py @ 10517

History | View | Annotate | Download (23.8 KB)

1
#
2
# Secret Labs' Regular Expression Engine
3
#
4
# convert re-style regular expression to sre pattern
5
#
6
# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
7
#
8
# See the sre.py file for information on usage and redistribution.
9
#
10

    
11
# XXX: show string offset and offending character for all errors
12

    
13
# this module works under 1.5.2 and later.  don't use string methods
14
import string, sys
15

    
16
from sre_constants import *
17

    
18
SPECIAL_CHARS = ".\\[{()*+?^$|"
19
REPEAT_CHARS = "*+?{"
20

    
21
DIGITS = tuple("0123456789")
22

    
23
OCTDIGITS = tuple("01234567")
24
HEXDIGITS = tuple("0123456789abcdefABCDEF")
25

    
26
WHITESPACE = tuple(" \t\n\r\v\f")
27

    
28
ESCAPES = {
29
    r"\a": (LITERAL, ord("\a")),
30
    r"\b": (LITERAL, ord("\b")),
31
    r"\f": (LITERAL, ord("\f")),
32
    r"\n": (LITERAL, ord("\n")),
33
    r"\r": (LITERAL, ord("\r")),
34
    r"\t": (LITERAL, ord("\t")),
35
    r"\v": (LITERAL, ord("\v")),
36
    r"\\": (LITERAL, ord("\\"))
37
}
38

    
39
CATEGORIES = {
40
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
41
    r"\b": (AT, AT_BOUNDARY),
42
    r"\B": (AT, AT_NON_BOUNDARY),
43
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
44
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
45
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
46
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
47
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
48
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
49
    r"\Z": (AT, AT_END_STRING), # end of string
50
}
51

    
52
FLAGS = {
53
    # standard flags
54
    "i": SRE_FLAG_IGNORECASE,
55
    "L": SRE_FLAG_LOCALE,
56
    "m": SRE_FLAG_MULTILINE,
57
    "s": SRE_FLAG_DOTALL,
58
    "x": SRE_FLAG_VERBOSE,
59
    # extensions
60
    "t": SRE_FLAG_TEMPLATE,
61
    "u": SRE_FLAG_UNICODE,
62
}
63

    
64
# figure out best way to convert hex/octal numbers to integers
65
try:
66
    int("10", 8)
67
    atoi = int # 2.0 and later
68
except TypeError:
69
    atoi = string.atoi # 1.5.2
70

    
71
class Pattern:
72
    # master pattern object.  keeps track of global attributes
73
    def __init__(self):
74
        self.flags = 0
75
        self.open = []
76
        self.groups = 1
77
        self.groupdict = {}
78
    def opengroup(self, name=None):
79
        gid = self.groups
80
        self.groups = gid + 1
81
        if name:
82
            self.groupdict[name] = gid
83
        self.open.append(gid)
84
        return gid
85
    def closegroup(self, gid):
86
        self.open.remove(gid)
87
    def checkgroup(self, gid):
88
        return gid < self.groups and gid not in self.open
89

    
90
class SubPattern:
91
    # a subpattern, in intermediate form
92
    def __init__(self, pattern, data=None):
93
        self.pattern = pattern
94
        if not data:
95
            data = []
96
        self.data = data
97
        self.width = None
98
    def dump(self, level=0):
99
        nl = 1
100
        for op, av in self.data:
101
            print level*"  " + op,; nl = 0
102
            if op == "in":
103
                # member sublanguage
104
                print; nl = 1
105
                for op, a in av:
106
                    print (level+1)*"  " + op, a
107
            elif op == "branch":
108
                print; nl = 1
109
                i = 0
110
                for a in av[1]:
111
                    if i > 0:
112
                        print level*"  " + "or"
113
                    a.dump(level+1); nl = 1
114
                    i = i + 1
115
            elif type(av) in (type(()), type([])):
116
                for a in av:
117
                    if isinstance(a, SubPattern):
118
                        if not nl: print
119
                        a.dump(level+1); nl = 1
120
                    else:
121
                        print a, ; nl = 0
122
            else:
123
                print av, ; nl = 0
124
            if not nl: print
125
    def __repr__(self):
126
        return repr(self.data)
127
    def __len__(self):
128
        return len(self.data)
129
    def __delitem__(self, index):
130
        del self.data[index]
131
    def __getitem__(self, index):
132
        return self.data[index]
133
    def __setitem__(self, index, code):
134
        self.data[index] = code
135
    def __getslice__(self, start, stop):
136
        return SubPattern(self.pattern, self.data[start:stop])
137
    def insert(self, index, code):
138
        self.data.insert(index, code)
139
    def append(self, code):
140
        self.data.append(code)
141
    def getwidth(self):
142
        # determine the width (min, max) for this subpattern
143
        if self.width:
144
            return self.width
145
        lo = hi = 0L
146
        for op, av in self.data:
147
            if op is BRANCH:
148
                i = sys.maxint
149
                j = 0
150
                for av in av[1]:
151
                    l, h = av.getwidth()
152
                    i = min(i, l)
153
                    j = max(j, h)
154
                lo = lo + i
155
                hi = hi + j
156
            elif op is CALL:
157
                i, j = av.getwidth()
158
                lo = lo + i
159
                hi = hi + j
160
            elif op is SUBPATTERN:
161
                i, j = av[1].getwidth()
162
                lo = lo + i
163
                hi = hi + j
164
            elif op in (MIN_REPEAT, MAX_REPEAT):
165
                i, j = av[2].getwidth()
166
                lo = lo + long(i) * av[0]
167
                hi = hi + long(j) * av[1]
168
            elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
169
                lo = lo + 1
170
                hi = hi + 1
171
            elif op == SUCCESS:
172
                break
173
        self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
174
        return self.width
175

    
176
class Tokenizer:
177
    def __init__(self, string):
178
        self.string = string
179
        self.index = 0
180
        self.__next()
181
    def __next(self):
182
        if self.index >= len(self.string):
183
            self.next = None
184
            return
185
        char = self.string[self.index]
186
        if char[0] == "\\":
187
            try:
188
                c = self.string[self.index + 1]
189
            except IndexError:
190
                raise error, "bogus escape"
191
            char = char + c
192
        self.index = self.index + len(char)
193
        self.next = char
194
    def match(self, char, skip=1):
195
        if char == self.next:
196
            if skip:
197
                self.__next()
198
            return 1
199
        return 0
200
    def get(self):
201
        this = self.next
202
        self.__next()
203
        return this
204
    def tell(self):
205
        return self.index, self.next
206
    def seek(self, index):
207
        self.index, self.next = index
208

    
209
def isident(char):
210
    return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
211

    
212
def isdigit(char):
213
    return "0" <= char <= "9"
214

    
215
def isname(name):
216
    # check that group name is a valid string
217
    if not isident(name[0]):
218
        return 0
219
    for char in name:
220
        if not isident(char) and not isdigit(char):
221
            return 0
222
    return 1
223

    
224
def _group(escape, groups):
225
    # check if the escape string represents a valid group
226
    try:
227
        gid = atoi(escape[1:])
228
        if gid and gid < groups:
229
            return gid
230
    except ValueError:
231
        pass
232
    return None # not a valid group
233

    
234
def _class_escape(source, escape):
235
    # handle escape code inside character class
236
    code = ESCAPES.get(escape)
237
    if code:
238
        return code
239
    code = CATEGORIES.get(escape)
240
    if code:
241
        return code
242
    try:
243
        if escape[1:2] == "x":
244
            # hexadecimal escape (exactly two digits)
245
            while source.next in HEXDIGITS and len(escape) < 4:
246
                escape = escape + source.get()
247
            escape = escape[2:]
248
            if len(escape) != 2:
249
                raise error, "bogus escape: %s" % repr("\\" + escape)
250
            return LITERAL, atoi(escape, 16) & 0xff
251
        elif str(escape[1:2]) in OCTDIGITS:
252
            # octal escape (up to three digits)
253
            while source.next in OCTDIGITS and len(escape) < 5:
254
                escape = escape + source.get()
255
            escape = escape[1:]
256
            return LITERAL, atoi(escape, 8) & 0xff
257
        if len(escape) == 2:
258
            return LITERAL, ord(escape[1])
259
    except ValueError:
260
        pass
261
    raise error, "bogus escape: %s" % repr(escape)
262

    
263
def _escape(source, escape, state):
264
    # handle escape code in expression
265
    code = CATEGORIES.get(escape)
266
    if code:
267
        return code
268
    code = ESCAPES.get(escape)
269
    if code:
270
        return code
271
    try:
272
        if escape[1:2] == "x":
273
            # hexadecimal escape
274
            while source.next in HEXDIGITS and len(escape) < 4:
275
                escape = escape + source.get()
276
            if len(escape) != 4:
277
                raise ValueError
278
            return LITERAL, atoi(escape[2:], 16) & 0xff
279
        elif escape[1:2] == "0":
280
            # octal escape
281
            while source.next in OCTDIGITS and len(escape) < 4:
282
                escape = escape + source.get()
283
            return LITERAL, atoi(escape[1:], 8) & 0xff
284
        elif escape[1:2] in DIGITS:
285
            # octal escape *or* decimal group reference (sigh)
286
            here = source.tell()
287
            if source.next in DIGITS:
288
                escape = escape + source.get()
289
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
290
                    source.next in OCTDIGITS):
291
                    # got three octal digits; this is an octal escape
292
                    escape = escape + source.get()
293
                    return LITERAL, atoi(escape[1:], 8) & 0xff
294
            # got at least one decimal digit; this is a group reference
295
            group = _group(escape, state.groups)
296
            if group:
297
                if not state.checkgroup(group):
298
                    raise error, "cannot refer to open group"
299
                return GROUPREF, group
300
            raise ValueError
301
        if len(escape) == 2:
302
            return LITERAL, ord(escape[1])
303
    except ValueError:
304
        pass
305
    raise error, "bogus escape: %s" % repr(escape)
306

    
307
def _parse_sub(source, state, nested=1):
308
    # parse an alternation: a|b|c
309

    
310
    items = []
311
    while 1:
312
        items.append(_parse(source, state))
313
        if source.match("|"):
314
            continue
315
        if not nested:
316
            break
317
        if not source.next or source.match(")", 0):
318
            break
319
        else:
320
            raise error, "pattern not properly closed"
321

    
322
    if len(items) == 1:
323
        return items[0]
324

    
325
    subpattern = SubPattern(state)
326

    
327
    # check if all items share a common prefix
328
    while 1:
329
        prefix = None
330
        for item in items:
331
            if not item:
332
                break
333
            if prefix is None:
334
                prefix = item[0]
335
            elif item[0] != prefix:
336
                break
337
        else:
338
            # all subitems start with a common "prefix".
339
            # move it out of the branch
340
            for item in items:
341
                del item[0]
342
            subpattern.append(prefix)
343
            continue # check next one
344
        break
345

    
346
    # check if the branch can be replaced by a character set
347
    for item in items:
348
        if len(item) != 1 or item[0][0] != LITERAL:
349
            break
350
    else:
351
        # we can store this as a character set instead of a
352
        # branch (the compiler may optimize this even more)
353
        set = []
354
        for item in items:
355
            set.append(item[0])
356
        subpattern.append((IN, set))
357
        return subpattern
358

    
359
    subpattern.append((BRANCH, (None, items)))
360
    return subpattern
361

    
362
def _parse(source, state):
363
    # parse a simple pattern
364

    
365
    subpattern = SubPattern(state)
366

    
367
    while 1:
368

    
369
        if source.next in ("|", ")"):
370
            break # end of subpattern
371
        this = source.get()
372
        if this is None:
373
            break # end of pattern
374

    
375
        if state.flags & SRE_FLAG_VERBOSE:
376
            # skip whitespace and comments
377
            if this in WHITESPACE:
378
                continue
379
            if this == "#":
380
                while 1:
381
                    this = source.get()
382
                    if this in (None, "\n"):
383
                        break
384
                continue
385

    
386
        if this and this[0] not in SPECIAL_CHARS:
387
            subpattern.append((LITERAL, ord(this)))
388

    
389
        elif this == "[":
390
            # character set
391
            set = []
392
##          if source.match(":"):
393
##              pass # handle character classes
394
            if source.match("^"):
395
                set.append((NEGATE, None))
396
            # check remaining characters
397
            start = set[:]
398
            while 1:
399
                this = source.get()
400
                if this == "]" and set != start:
401
                    break
402
                elif this and this[0] == "\\":
403
                    code1 = _class_escape(source, this)
404
                elif this:
405
                    code1 = LITERAL, ord(this)
406
                else:
407
                    raise error, "unexpected end of regular expression"
408
                if source.match("-"):
409
                    # potential range
410
                    this = source.get()
411
                    if this == "]":
412
                        if code1[0] is IN:
413
                            code1 = code1[1][0]
414
                        set.append(code1)
415
                        set.append((LITERAL, ord("-")))
416
                        break
417
                    else:
418
                        if this[0] == "\\":
419
                            code2 = _class_escape(source, this)
420
                        else:
421
                            code2 = LITERAL, ord(this)
422
                        if code1[0] != LITERAL or code2[0] != LITERAL:
423
                            raise error, "bad character range"
424
                        lo = code1[1]
425
                        hi = code2[1]
426
                        if hi < lo:
427
                            raise error, "bad character range"
428
                        set.append((RANGE, (lo, hi)))
429
                else:
430
                    if code1[0] is IN:
431
                        code1 = code1[1][0]
432
                    set.append(code1)
433

    
434
            # XXX: <fl> should move set optimization to compiler!
435
            if len(set)==1 and set[0][0] is LITERAL:
436
                subpattern.append(set[0]) # optimization
437
            elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
438
                subpattern.append((NOT_LITERAL, set[1][1])) # optimization
439
            else:
440
                # XXX: <fl> should add charmap optimization here
441
                subpattern.append((IN, set))
442

    
443
        elif this and this[0] in REPEAT_CHARS:
444
            # repeat previous item
445
            if this == "?":
446
                min, max = 0, 1
447
            elif this == "*":
448
                min, max = 0, MAXREPEAT
449

    
450
            elif this == "+":
451
                min, max = 1, MAXREPEAT
452
            elif this == "{":
453
                here = source.tell()
454
                min, max = 0, MAXREPEAT
455
                lo = hi = ""
456
                while source.next in DIGITS:
457
                    lo = lo + source.get()
458
                if source.match(","):
459
                    while source.next in DIGITS:
460
                        hi = hi + source.get()
461
                else:
462
                    hi = lo
463
                if not source.match("}"):
464
                    subpattern.append((LITERAL, ord(this)))
465
                    source.seek(here)
466
                    continue
467
                if lo:
468
                    min = atoi(lo)
469
                if hi:
470
                    max = atoi(hi)
471
                if max < min:
472
                    raise error, "bad repeat interval"
473
            else:
474
                raise error, "not supported"
475
            # figure out which item to repeat
476
            if subpattern:
477
                item = subpattern[-1:]
478
            else:
479
                item = None
480
            if not item or (len(item) == 1 and item[0][0] == AT):
481
                raise error, "nothing to repeat"
482
            if item[0][0] in (MIN_REPEAT, MAX_REPEAT):
483
                raise error, "multiple repeat"
484
            if source.match("?"):
485
                subpattern[-1] = (MIN_REPEAT, (min, max, item))
486
            else:
487
                subpattern[-1] = (MAX_REPEAT, (min, max, item))
488

    
489
        elif this == ".":
490
            subpattern.append((ANY, None))
491

    
492
        elif this == "(":
493
            group = 1
494
            name = None
495
            if source.match("?"):
496
                group = 0
497
                # options
498
                if source.match("P"):
499
                    # python extensions
500
                    if source.match("<"):
501
                        # named group: skip forward to end of name
502
                        name = ""
503
                        while 1:
504
                            char = source.get()
505
                            if char is None:
506
                                raise error, "unterminated name"
507
                            if char == ">":
508
                                break
509
                            name = name + char
510
                        group = 1
511
                        if not isname(name):
512
                            raise error, "bad character in group name"
513
                    elif source.match("="):
514
                        # named backreference
515
                        name = ""
516
                        while 1:
517
                            char = source.get()
518
                            if char is None:
519
                                raise error, "unterminated name"
520
                            if char == ")":
521
                                break
522
                            name = name + char
523
                        if not isname(name):
524
                            raise error, "bad character in group name"
525
                        gid = state.groupdict.get(name)
526
                        if gid is None:
527
                            raise error, "unknown group name"
528
                        subpattern.append((GROUPREF, gid))
529
                        continue
530
                    else:
531
                        char = source.get()
532
                        if char is None:
533
                            raise error, "unexpected end of pattern"
534
                        raise error, "unknown specifier: ?P%s" % char
535
                elif source.match(":"):
536
                    # non-capturing group
537
                    group = 2
538
                elif source.match("#"):
539
                    # comment
540
                    while 1:
541
                        if source.next is None or source.next == ")":
542
                            break
543
                        source.get()
544
                    if not source.match(")"):
545
                        raise error, "unbalanced parenthesis"
546
                    continue
547
                elif source.next in ("=", "!", "<"):
548
                    # lookahead assertions
549
                    char = source.get()
550
                    dir = 1
551
                    if char == "<":
552
                        if source.next not in ("=", "!"):
553
                            raise error, "syntax error"
554
                        dir = -1 # lookbehind
555
                        char = source.get()
556
                    p = _parse_sub(source, state)
557
                    if not source.match(")"):
558
                        raise error, "unbalanced parenthesis"
559
                    if char == "=":
560
                        subpattern.append((ASSERT, (dir, p)))
561
                    else:
562
                        subpattern.append((ASSERT_NOT, (dir, p)))
563
                    continue
564
                else:
565
                    # flags
566
                    if not FLAGS.has_key(source.next):
567
                        raise error, "unexpected end of pattern"
568
                    while FLAGS.has_key(source.next):
569
                        state.flags = state.flags | FLAGS[source.get()]
570
            if group:
571
                # parse group contents
572
                if group == 2:
573
                    # anonymous group
574
                    group = None
575
                else:
576
                    group = state.opengroup(name)
577
                p = _parse_sub(source, state)
578
                if not source.match(")"):
579
                    raise error, "unbalanced parenthesis"
580
                if group is not None:
581
                    state.closegroup(group)
582
                subpattern.append((SUBPATTERN, (group, p)))
583
            else:
584
                while 1:
585
                    char = source.get()
586
                    if char is None:
587
                        raise error, "unexpected end of pattern"
588
                    if char == ")":
589
                        break
590
                    raise error, "unknown extension"
591

    
592
        elif this == "^":
593
            subpattern.append((AT, AT_BEGINNING))
594

    
595
        elif this == "$":
596
            subpattern.append((AT, AT_END))
597

    
598
        elif this and this[0] == "\\":
599
            code = _escape(source, this, state)
600
            subpattern.append(code)
601

    
602
        else:
603
            raise error, "parser error"
604

    
605
    return subpattern
606

    
607
def parse(str, flags=0, pattern=None):
608
    # parse 're' pattern into list of (opcode, argument) tuples
609

    
610
    source = Tokenizer(str)
611

    
612
    if pattern is None:
613
        pattern = Pattern()
614
    pattern.flags = flags
615
    pattern.str = str
616

    
617
    p = _parse_sub(source, pattern, 0)
618

    
619
    tail = source.get()
620
    if tail == ")":
621
        raise error, "unbalanced parenthesis"
622
    elif tail:
623
        raise error, "bogus characters at end of regular expression"
624

    
625
    if flags & SRE_FLAG_DEBUG:
626
        p.dump()
627

    
628
    if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
629
        # the VERBOSE flag was switched on inside the pattern.  to be
630
        # on the safe side, we'll parse the whole thing again...
631
        return parse(str, p.pattern.flags)
632

    
633
    return p
634

    
635
def parse_template(source, pattern):
636
    # parse 're' replacement string into list of literals and
637
    # group references
638
    s = Tokenizer(source)
639
    p = []
640
    a = p.append
641
    def literal(literal, p=p):
642
        if p and p[-1][0] is LITERAL:
643
            p[-1] = LITERAL, p[-1][1] + literal
644
        else:
645
            p.append((LITERAL, literal))
646
    sep = source[:0]
647
    if type(sep) is type(""):
648
        char = chr
649
    else:
650
        char = unichr
651
    while 1:
652
        this = s.get()
653
        if this is None:
654
            break # end of replacement string
655
        if this and this[0] == "\\":
656
            # group
657
            if this == "\\g":
658
                name = ""
659
                if s.match("<"):
660
                    while 1:
661
                        char = s.get()
662
                        if char is None:
663
                            raise error, "unterminated group name"
664
                        if char == ">":
665
                            break
666
                        name = name + char
667
                if not name:
668
                    raise error, "bad group name"
669
                try:
670
                    index = atoi(name)
671
                except ValueError:
672
                    if not isname(name):
673
                        raise error, "bad character in group name"
674
                    try:
675
                        index = pattern.groupindex[name]
676
                    except KeyError:
677
                        raise IndexError, "unknown group name"
678
                a((MARK, index))
679
            elif len(this) > 1 and this[1] in DIGITS:
680
                code = None
681
                while 1:
682
                    group = _group(this, pattern.groups+1)
683
                    if group:
684
                        if (s.next not in DIGITS or
685
                            not _group(this + s.next, pattern.groups+1)):
686
                            code = MARK, group
687
                            break
688
                    elif s.next in OCTDIGITS:
689
                        this = this + s.get()
690
                    else:
691
                        break
692
                if not code:
693
                    this = this[1:]
694
                    code = LITERAL, char(atoi(this[-6:], 8) & 0xff)
695
                if code[0] is LITERAL:
696
                    literal(code[1])
697
                else:
698
                    a(code)
699
            else:
700
                try:
701
                    this = char(ESCAPES[this][1])
702
                except KeyError:
703
                    pass
704
                literal(this)
705
        else:
706
            literal(this)
707
    # convert template to groups and literals lists
708
    i = 0
709
    groups = []
710
    literals = []
711
    for c, s in p:
712
        if c is MARK:
713
            groups.append((i, s))
714
            literals.append(None)
715
        else:
716
            literals.append(s)
717
        i = i + 1
718
    return groups, literals
719

    
720
def expand_template(template, match):
721
    g = match.group
722
    sep = match.string[:0]
723
    groups, literals = template
724
    literals = literals[:]
725
    try:
726
        for index, group in groups:
727
            literals[index] = s = g(group)
728
            if s is None:
729
                raise IndexError
730
    except IndexError:
731
        raise error, "empty group"
732
    return string.join(literals, sep)