svn-gvsig-desktop / tags / v1_0_2_Build_899 / extensions / extScripting / scripts / jython / Lib / sre_parse.py @ 10517
History | View | Annotate | Download (23.8 KB)
1 |
#
|
---|---|
2 |
# Secret Labs' Regular Expression Engine
|
3 |
#
|
4 |
# convert re-style regular expression to sre pattern
|
5 |
#
|
6 |
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
|
7 |
#
|
8 |
# See the sre.py file for information on usage and redistribution.
|
9 |
#
|
10 |
|
11 |
# XXX: show string offset and offending character for all errors
|
12 |
|
13 |
# this module works under 1.5.2 and later. don't use string methods
|
14 |
import string, sys |
15 |
|
16 |
from sre_constants import * |
17 |
|
18 |
SPECIAL_CHARS = ".\\[{()*+?^$|"
|
19 |
REPEAT_CHARS = "*+?{"
|
20 |
|
21 |
DIGITS = tuple("0123456789") |
22 |
|
23 |
OCTDIGITS = tuple("01234567") |
24 |
HEXDIGITS = tuple("0123456789abcdefABCDEF") |
25 |
|
26 |
WHITESPACE = tuple(" \t\n\r\v\f") |
27 |
|
28 |
ESCAPES = { |
29 |
r"\a": (LITERAL, ord("\a")), |
30 |
r"\b": (LITERAL, ord("\b")), |
31 |
r"\f": (LITERAL, ord("\f")), |
32 |
r"\n": (LITERAL, ord("\n")), |
33 |
r"\r": (LITERAL, ord("\r")), |
34 |
r"\t": (LITERAL, ord("\t")), |
35 |
r"\v": (LITERAL, ord("\v")), |
36 |
r"\\": (LITERAL, ord("\\")) |
37 |
} |
38 |
|
39 |
CATEGORIES = { |
40 |
r"\A": (AT, AT_BEGINNING_STRING), # start of string |
41 |
r"\b": (AT, AT_BOUNDARY),
|
42 |
r"\B": (AT, AT_NON_BOUNDARY),
|
43 |
r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
|
44 |
r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
|
45 |
r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
|
46 |
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
|
47 |
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
|
48 |
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
|
49 |
r"\Z": (AT, AT_END_STRING), # end of string |
50 |
} |
51 |
|
52 |
FLAGS = { |
53 |
# standard flags
|
54 |
"i": SRE_FLAG_IGNORECASE,
|
55 |
"L": SRE_FLAG_LOCALE,
|
56 |
"m": SRE_FLAG_MULTILINE,
|
57 |
"s": SRE_FLAG_DOTALL,
|
58 |
"x": SRE_FLAG_VERBOSE,
|
59 |
# extensions
|
60 |
"t": SRE_FLAG_TEMPLATE,
|
61 |
"u": SRE_FLAG_UNICODE,
|
62 |
} |
63 |
|
64 |
# figure out best way to convert hex/octal numbers to integers
|
65 |
try:
|
66 |
int("10", 8) |
67 |
atoi = int # 2.0 and later |
68 |
except TypeError: |
69 |
atoi = string.atoi # 1.5.2
|
70 |
|
71 |
class Pattern: |
72 |
# master pattern object. keeps track of global attributes
|
73 |
def __init__(self): |
74 |
self.flags = 0 |
75 |
self.open = []
|
76 |
self.groups = 1 |
77 |
self.groupdict = {}
|
78 |
def opengroup(self, name=None): |
79 |
gid = self.groups
|
80 |
self.groups = gid + 1 |
81 |
if name:
|
82 |
self.groupdict[name] = gid
|
83 |
self.open.append(gid)
|
84 |
return gid
|
85 |
def closegroup(self, gid): |
86 |
self.open.remove(gid)
|
87 |
def checkgroup(self, gid): |
88 |
return gid < self.groups and gid not in self.open |
89 |
|
90 |
class SubPattern: |
91 |
# a subpattern, in intermediate form
|
92 |
def __init__(self, pattern, data=None): |
93 |
self.pattern = pattern
|
94 |
if not data: |
95 |
data = [] |
96 |
self.data = data
|
97 |
self.width = None |
98 |
def dump(self, level=0): |
99 |
nl = 1
|
100 |
for op, av in self.data: |
101 |
print level*" " + op,; nl = 0 |
102 |
if op == "in": |
103 |
# member sublanguage
|
104 |
print; nl = 1 |
105 |
for op, a in av: |
106 |
print (level+1)*" " + op, a |
107 |
elif op == "branch": |
108 |
print; nl = 1 |
109 |
i = 0
|
110 |
for a in av[1]: |
111 |
if i > 0: |
112 |
print level*" " + "or" |
113 |
a.dump(level+1); nl = 1 |
114 |
i = i + 1
|
115 |
elif type(av) in (type(()), type([])): |
116 |
for a in av: |
117 |
if isinstance(a, SubPattern): |
118 |
if not nl: print |
119 |
a.dump(level+1); nl = 1 |
120 |
else:
|
121 |
print a, ; nl = 0 |
122 |
else:
|
123 |
print av, ; nl = 0 |
124 |
if not nl: print |
125 |
def __repr__(self): |
126 |
return repr(self.data) |
127 |
def __len__(self): |
128 |
return len(self.data) |
129 |
def __delitem__(self, index): |
130 |
del self.data[index] |
131 |
def __getitem__(self, index): |
132 |
return self.data[index] |
133 |
def __setitem__(self, index, code): |
134 |
self.data[index] = code
|
135 |
def __getslice__(self, start, stop): |
136 |
return SubPattern(self.pattern, self.data[start:stop]) |
137 |
def insert(self, index, code): |
138 |
self.data.insert(index, code)
|
139 |
def append(self, code): |
140 |
self.data.append(code)
|
141 |
def getwidth(self): |
142 |
# determine the width (min, max) for this subpattern
|
143 |
if self.width: |
144 |
return self.width |
145 |
lo = hi = 0L
|
146 |
for op, av in self.data: |
147 |
if op is BRANCH: |
148 |
i = sys.maxint |
149 |
j = 0
|
150 |
for av in av[1]: |
151 |
l, h = av.getwidth() |
152 |
i = min(i, l)
|
153 |
j = max(j, h)
|
154 |
lo = lo + i |
155 |
hi = hi + j |
156 |
elif op is CALL: |
157 |
i, j = av.getwidth() |
158 |
lo = lo + i |
159 |
hi = hi + j |
160 |
elif op is SUBPATTERN: |
161 |
i, j = av[1].getwidth()
|
162 |
lo = lo + i |
163 |
hi = hi + j |
164 |
elif op in (MIN_REPEAT, MAX_REPEAT): |
165 |
i, j = av[2].getwidth()
|
166 |
lo = lo + long(i) * av[0] |
167 |
hi = hi + long(j) * av[1] |
168 |
elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY): |
169 |
lo = lo + 1
|
170 |
hi = hi + 1
|
171 |
elif op == SUCCESS:
|
172 |
break
|
173 |
self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) |
174 |
return self.width |
175 |
|
176 |
class Tokenizer: |
177 |
def __init__(self, string): |
178 |
self.string = string
|
179 |
self.index = 0 |
180 |
self.__next()
|
181 |
def __next(self): |
182 |
if self.index >= len(self.string): |
183 |
self.next = None |
184 |
return
|
185 |
char = self.string[self.index] |
186 |
if char[0] == "\\": |
187 |
try:
|
188 |
c = self.string[self.index + 1] |
189 |
except IndexError: |
190 |
raise error, "bogus escape" |
191 |
char = char + c |
192 |
self.index = self.index + len(char) |
193 |
self.next = char
|
194 |
def match(self, char, skip=1): |
195 |
if char == self.next: |
196 |
if skip:
|
197 |
self.__next()
|
198 |
return 1 |
199 |
return 0 |
200 |
def get(self): |
201 |
this = self.next
|
202 |
self.__next()
|
203 |
return this
|
204 |
def tell(self): |
205 |
return self.index, self.next |
206 |
def seek(self, index): |
207 |
self.index, self.next = index |
208 |
|
209 |
def isident(char): |
210 |
return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" |
211 |
|
212 |
def isdigit(char): |
213 |
return "0" <= char <= "9" |
214 |
|
215 |
def isname(name): |
216 |
# check that group name is a valid string
|
217 |
if not isident(name[0]): |
218 |
return 0 |
219 |
for char in name: |
220 |
if not isident(char) and not isdigit(char): |
221 |
return 0 |
222 |
return 1 |
223 |
|
224 |
def _group(escape, groups): |
225 |
# check if the escape string represents a valid group
|
226 |
try:
|
227 |
gid = atoi(escape[1:])
|
228 |
if gid and gid < groups: |
229 |
return gid
|
230 |
except ValueError: |
231 |
pass
|
232 |
return None # not a valid group |
233 |
|
234 |
def _class_escape(source, escape): |
235 |
# handle escape code inside character class
|
236 |
code = ESCAPES.get(escape) |
237 |
if code:
|
238 |
return code
|
239 |
code = CATEGORIES.get(escape) |
240 |
if code:
|
241 |
return code
|
242 |
try:
|
243 |
if escape[1:2] == "x": |
244 |
# hexadecimal escape (exactly two digits)
|
245 |
while source.next in HEXDIGITS and len(escape) < 4: |
246 |
escape = escape + source.get() |
247 |
escape = escape[2:]
|
248 |
if len(escape) != 2: |
249 |
raise error, "bogus escape: %s" % repr("\\" + escape) |
250 |
return LITERAL, atoi(escape, 16) & 0xff |
251 |
elif str(escape[1:2]) in OCTDIGITS: |
252 |
# octal escape (up to three digits)
|
253 |
while source.next in OCTDIGITS and len(escape) < 5: |
254 |
escape = escape + source.get() |
255 |
escape = escape[1:]
|
256 |
return LITERAL, atoi(escape, 8) & 0xff |
257 |
if len(escape) == 2: |
258 |
return LITERAL, ord(escape[1]) |
259 |
except ValueError: |
260 |
pass
|
261 |
raise error, "bogus escape: %s" % repr(escape) |
262 |
|
263 |
def _escape(source, escape, state): |
264 |
# handle escape code in expression
|
265 |
code = CATEGORIES.get(escape) |
266 |
if code:
|
267 |
return code
|
268 |
code = ESCAPES.get(escape) |
269 |
if code:
|
270 |
return code
|
271 |
try:
|
272 |
if escape[1:2] == "x": |
273 |
# hexadecimal escape
|
274 |
while source.next in HEXDIGITS and len(escape) < 4: |
275 |
escape = escape + source.get() |
276 |
if len(escape) != 4: |
277 |
raise ValueError |
278 |
return LITERAL, atoi(escape[2:], 16) & 0xff |
279 |
elif escape[1:2] == "0": |
280 |
# octal escape
|
281 |
while source.next in OCTDIGITS and len(escape) < 4: |
282 |
escape = escape + source.get() |
283 |
return LITERAL, atoi(escape[1:], 8) & 0xff |
284 |
elif escape[1:2] in DIGITS: |
285 |
# octal escape *or* decimal group reference (sigh)
|
286 |
here = source.tell() |
287 |
if source.next in DIGITS: |
288 |
escape = escape + source.get() |
289 |
if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and |
290 |
source.next in OCTDIGITS):
|
291 |
# got three octal digits; this is an octal escape
|
292 |
escape = escape + source.get() |
293 |
return LITERAL, atoi(escape[1:], 8) & 0xff |
294 |
# got at least one decimal digit; this is a group reference
|
295 |
group = _group(escape, state.groups) |
296 |
if group:
|
297 |
if not state.checkgroup(group): |
298 |
raise error, "cannot refer to open group" |
299 |
return GROUPREF, group
|
300 |
raise ValueError |
301 |
if len(escape) == 2: |
302 |
return LITERAL, ord(escape[1]) |
303 |
except ValueError: |
304 |
pass
|
305 |
raise error, "bogus escape: %s" % repr(escape) |
306 |
|
307 |
def _parse_sub(source, state, nested=1): |
308 |
# parse an alternation: a|b|c
|
309 |
|
310 |
items = [] |
311 |
while 1: |
312 |
items.append(_parse(source, state)) |
313 |
if source.match("|"): |
314 |
continue
|
315 |
if not nested: |
316 |
break
|
317 |
if not source.next or source.match(")", 0): |
318 |
break
|
319 |
else:
|
320 |
raise error, "pattern not properly closed" |
321 |
|
322 |
if len(items) == 1: |
323 |
return items[0] |
324 |
|
325 |
subpattern = SubPattern(state) |
326 |
|
327 |
# check if all items share a common prefix
|
328 |
while 1: |
329 |
prefix = None
|
330 |
for item in items: |
331 |
if not item: |
332 |
break
|
333 |
if prefix is None: |
334 |
prefix = item[0]
|
335 |
elif item[0] != prefix: |
336 |
break
|
337 |
else:
|
338 |
# all subitems start with a common "prefix".
|
339 |
# move it out of the branch
|
340 |
for item in items: |
341 |
del item[0] |
342 |
subpattern.append(prefix) |
343 |
continue # check next one |
344 |
break
|
345 |
|
346 |
# check if the branch can be replaced by a character set
|
347 |
for item in items: |
348 |
if len(item) != 1 or item[0][0] != LITERAL: |
349 |
break
|
350 |
else:
|
351 |
# we can store this as a character set instead of a
|
352 |
# branch (the compiler may optimize this even more)
|
353 |
set = [] |
354 |
for item in items: |
355 |
set.append(item[0]) |
356 |
subpattern.append((IN, set))
|
357 |
return subpattern
|
358 |
|
359 |
subpattern.append((BRANCH, (None, items)))
|
360 |
return subpattern
|
361 |
|
362 |
def _parse(source, state): |
363 |
# parse a simple pattern
|
364 |
|
365 |
subpattern = SubPattern(state) |
366 |
|
367 |
while 1: |
368 |
|
369 |
if source.next in ("|", ")"): |
370 |
break # end of subpattern |
371 |
this = source.get() |
372 |
if this is None: |
373 |
break # end of pattern |
374 |
|
375 |
if state.flags & SRE_FLAG_VERBOSE:
|
376 |
# skip whitespace and comments
|
377 |
if this in WHITESPACE: |
378 |
continue
|
379 |
if this == "#": |
380 |
while 1: |
381 |
this = source.get() |
382 |
if this in (None, "\n"): |
383 |
break
|
384 |
continue
|
385 |
|
386 |
if this and this[0] not in SPECIAL_CHARS: |
387 |
subpattern.append((LITERAL, ord(this)))
|
388 |
|
389 |
elif this == "[": |
390 |
# character set
|
391 |
set = [] |
392 |
## if source.match(":"):
|
393 |
## pass # handle character classes
|
394 |
if source.match("^"): |
395 |
set.append((NEGATE, None)) |
396 |
# check remaining characters
|
397 |
start = set[:]
|
398 |
while 1: |
399 |
this = source.get() |
400 |
if this == "]" and set != start: |
401 |
break
|
402 |
elif this and this[0] == "\\": |
403 |
code1 = _class_escape(source, this) |
404 |
elif this:
|
405 |
code1 = LITERAL, ord(this)
|
406 |
else:
|
407 |
raise error, "unexpected end of regular expression" |
408 |
if source.match("-"): |
409 |
# potential range
|
410 |
this = source.get() |
411 |
if this == "]": |
412 |
if code1[0] is IN: |
413 |
code1 = code1[1][0] |
414 |
set.append(code1)
|
415 |
set.append((LITERAL, ord("-"))) |
416 |
break
|
417 |
else:
|
418 |
if this[0] == "\\": |
419 |
code2 = _class_escape(source, this) |
420 |
else:
|
421 |
code2 = LITERAL, ord(this)
|
422 |
if code1[0] != LITERAL or code2[0] != LITERAL: |
423 |
raise error, "bad character range" |
424 |
lo = code1[1]
|
425 |
hi = code2[1]
|
426 |
if hi < lo:
|
427 |
raise error, "bad character range" |
428 |
set.append((RANGE, (lo, hi)))
|
429 |
else:
|
430 |
if code1[0] is IN: |
431 |
code1 = code1[1][0] |
432 |
set.append(code1)
|
433 |
|
434 |
# XXX: <fl> should move set optimization to compiler!
|
435 |
if len(set)==1 and set[0][0] is LITERAL: |
436 |
subpattern.append(set[0]) # optimization |
437 |
elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: |
438 |
subpattern.append((NOT_LITERAL, set[1][1])) # optimization |
439 |
else:
|
440 |
# XXX: <fl> should add charmap optimization here
|
441 |
subpattern.append((IN, set))
|
442 |
|
443 |
elif this and this[0] in REPEAT_CHARS: |
444 |
# repeat previous item
|
445 |
if this == "?": |
446 |
min, max = 0, 1 |
447 |
elif this == "*": |
448 |
min, max = 0, MAXREPEAT |
449 |
|
450 |
elif this == "+": |
451 |
min, max = 1, MAXREPEAT |
452 |
elif this == "{": |
453 |
here = source.tell() |
454 |
min, max = 0, MAXREPEAT |
455 |
lo = hi = ""
|
456 |
while source.next in DIGITS: |
457 |
lo = lo + source.get() |
458 |
if source.match(","): |
459 |
while source.next in DIGITS: |
460 |
hi = hi + source.get() |
461 |
else:
|
462 |
hi = lo |
463 |
if not source.match("}"): |
464 |
subpattern.append((LITERAL, ord(this)))
|
465 |
source.seek(here) |
466 |
continue
|
467 |
if lo:
|
468 |
min = atoi(lo) |
469 |
if hi:
|
470 |
max = atoi(hi) |
471 |
if max < min: |
472 |
raise error, "bad repeat interval" |
473 |
else:
|
474 |
raise error, "not supported" |
475 |
# figure out which item to repeat
|
476 |
if subpattern:
|
477 |
item = subpattern[-1:]
|
478 |
else:
|
479 |
item = None
|
480 |
if not item or (len(item) == 1 and item[0][0] == AT): |
481 |
raise error, "nothing to repeat" |
482 |
if item[0][0] in (MIN_REPEAT, MAX_REPEAT): |
483 |
raise error, "multiple repeat" |
484 |
if source.match("?"): |
485 |
subpattern[-1] = (MIN_REPEAT, (min, max, item)) |
486 |
else:
|
487 |
subpattern[-1] = (MAX_REPEAT, (min, max, item)) |
488 |
|
489 |
elif this == ".": |
490 |
subpattern.append((ANY, None))
|
491 |
|
492 |
elif this == "(": |
493 |
group = 1
|
494 |
name = None
|
495 |
if source.match("?"): |
496 |
group = 0
|
497 |
# options
|
498 |
if source.match("P"): |
499 |
# python extensions
|
500 |
if source.match("<"): |
501 |
# named group: skip forward to end of name
|
502 |
name = ""
|
503 |
while 1: |
504 |
char = source.get() |
505 |
if char is None: |
506 |
raise error, "unterminated name" |
507 |
if char == ">": |
508 |
break
|
509 |
name = name + char |
510 |
group = 1
|
511 |
if not isname(name): |
512 |
raise error, "bad character in group name" |
513 |
elif source.match("="): |
514 |
# named backreference
|
515 |
name = ""
|
516 |
while 1: |
517 |
char = source.get() |
518 |
if char is None: |
519 |
raise error, "unterminated name" |
520 |
if char == ")": |
521 |
break
|
522 |
name = name + char |
523 |
if not isname(name): |
524 |
raise error, "bad character in group name" |
525 |
gid = state.groupdict.get(name) |
526 |
if gid is None: |
527 |
raise error, "unknown group name" |
528 |
subpattern.append((GROUPREF, gid)) |
529 |
continue
|
530 |
else:
|
531 |
char = source.get() |
532 |
if char is None: |
533 |
raise error, "unexpected end of pattern" |
534 |
raise error, "unknown specifier: ?P%s" % char |
535 |
elif source.match(":"): |
536 |
# non-capturing group
|
537 |
group = 2
|
538 |
elif source.match("#"): |
539 |
# comment
|
540 |
while 1: |
541 |
if source.next is None or source.next == ")": |
542 |
break
|
543 |
source.get() |
544 |
if not source.match(")"): |
545 |
raise error, "unbalanced parenthesis" |
546 |
continue
|
547 |
elif source.next in ("=", "!", "<"): |
548 |
# lookahead assertions
|
549 |
char = source.get() |
550 |
dir = 1
|
551 |
if char == "<": |
552 |
if source.next not in ("=", "!"): |
553 |
raise error, "syntax error" |
554 |
dir = -1 # lookbehind |
555 |
char = source.get() |
556 |
p = _parse_sub(source, state) |
557 |
if not source.match(")"): |
558 |
raise error, "unbalanced parenthesis" |
559 |
if char == "=": |
560 |
subpattern.append((ASSERT, (dir, p)))
|
561 |
else:
|
562 |
subpattern.append((ASSERT_NOT, (dir, p)))
|
563 |
continue
|
564 |
else:
|
565 |
# flags
|
566 |
if not FLAGS.has_key(source.next): |
567 |
raise error, "unexpected end of pattern" |
568 |
while FLAGS.has_key(source.next):
|
569 |
state.flags = state.flags | FLAGS[source.get()] |
570 |
if group:
|
571 |
# parse group contents
|
572 |
if group == 2: |
573 |
# anonymous group
|
574 |
group = None
|
575 |
else:
|
576 |
group = state.opengroup(name) |
577 |
p = _parse_sub(source, state) |
578 |
if not source.match(")"): |
579 |
raise error, "unbalanced parenthesis" |
580 |
if group is not None: |
581 |
state.closegroup(group) |
582 |
subpattern.append((SUBPATTERN, (group, p))) |
583 |
else:
|
584 |
while 1: |
585 |
char = source.get() |
586 |
if char is None: |
587 |
raise error, "unexpected end of pattern" |
588 |
if char == ")": |
589 |
break
|
590 |
raise error, "unknown extension" |
591 |
|
592 |
elif this == "^": |
593 |
subpattern.append((AT, AT_BEGINNING)) |
594 |
|
595 |
elif this == "$": |
596 |
subpattern.append((AT, AT_END)) |
597 |
|
598 |
elif this and this[0] == "\\": |
599 |
code = _escape(source, this, state) |
600 |
subpattern.append(code) |
601 |
|
602 |
else:
|
603 |
raise error, "parser error" |
604 |
|
605 |
return subpattern
|
606 |
|
607 |
def parse(str, flags=0, pattern=None): |
608 |
# parse 're' pattern into list of (opcode, argument) tuples
|
609 |
|
610 |
source = Tokenizer(str)
|
611 |
|
612 |
if pattern is None: |
613 |
pattern = Pattern() |
614 |
pattern.flags = flags |
615 |
pattern.str = str
|
616 |
|
617 |
p = _parse_sub(source, pattern, 0)
|
618 |
|
619 |
tail = source.get() |
620 |
if tail == ")": |
621 |
raise error, "unbalanced parenthesis" |
622 |
elif tail:
|
623 |
raise error, "bogus characters at end of regular expression" |
624 |
|
625 |
if flags & SRE_FLAG_DEBUG:
|
626 |
p.dump() |
627 |
|
628 |
if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: |
629 |
# the VERBOSE flag was switched on inside the pattern. to be
|
630 |
# on the safe side, we'll parse the whole thing again...
|
631 |
return parse(str, p.pattern.flags) |
632 |
|
633 |
return p
|
634 |
|
635 |
def parse_template(source, pattern): |
636 |
# parse 're' replacement string into list of literals and
|
637 |
# group references
|
638 |
s = Tokenizer(source) |
639 |
p = [] |
640 |
a = p.append |
641 |
def literal(literal, p=p): |
642 |
if p and p[-1][0] is LITERAL: |
643 |
p[-1] = LITERAL, p[-1][1] + literal |
644 |
else:
|
645 |
p.append((LITERAL, literal)) |
646 |
sep = source[:0]
|
647 |
if type(sep) is type(""): |
648 |
char = chr
|
649 |
else:
|
650 |
char = unichr
|
651 |
while 1: |
652 |
this = s.get() |
653 |
if this is None: |
654 |
break # end of replacement string |
655 |
if this and this[0] == "\\": |
656 |
# group
|
657 |
if this == "\\g": |
658 |
name = ""
|
659 |
if s.match("<"): |
660 |
while 1: |
661 |
char = s.get() |
662 |
if char is None: |
663 |
raise error, "unterminated group name" |
664 |
if char == ">": |
665 |
break
|
666 |
name = name + char |
667 |
if not name: |
668 |
raise error, "bad group name" |
669 |
try:
|
670 |
index = atoi(name) |
671 |
except ValueError: |
672 |
if not isname(name): |
673 |
raise error, "bad character in group name" |
674 |
try:
|
675 |
index = pattern.groupindex[name] |
676 |
except KeyError: |
677 |
raise IndexError, "unknown group name" |
678 |
a((MARK, index)) |
679 |
elif len(this) > 1 and this[1] in DIGITS: |
680 |
code = None
|
681 |
while 1: |
682 |
group = _group(this, pattern.groups+1)
|
683 |
if group:
|
684 |
if (s.next not in DIGITS or |
685 |
not _group(this + s.next, pattern.groups+1)): |
686 |
code = MARK, group |
687 |
break
|
688 |
elif s.next in OCTDIGITS: |
689 |
this = this + s.get() |
690 |
else:
|
691 |
break
|
692 |
if not code: |
693 |
this = this[1:]
|
694 |
code = LITERAL, char(atoi(this[-6:], 8) & 0xff) |
695 |
if code[0] is LITERAL: |
696 |
literal(code[1])
|
697 |
else:
|
698 |
a(code) |
699 |
else:
|
700 |
try:
|
701 |
this = char(ESCAPES[this][1])
|
702 |
except KeyError: |
703 |
pass
|
704 |
literal(this) |
705 |
else:
|
706 |
literal(this) |
707 |
# convert template to groups and literals lists
|
708 |
i = 0
|
709 |
groups = [] |
710 |
literals = [] |
711 |
for c, s in p: |
712 |
if c is MARK: |
713 |
groups.append((i, s)) |
714 |
literals.append(None)
|
715 |
else:
|
716 |
literals.append(s) |
717 |
i = i + 1
|
718 |
return groups, literals
|
719 |
|
720 |
def expand_template(template, match): |
721 |
g = match.group |
722 |
sep = match.string[:0]
|
723 |
groups, literals = template |
724 |
literals = literals[:] |
725 |
try:
|
726 |
for index, group in groups: |
727 |
literals[index] = s = g(group) |
728 |
if s is None: |
729 |
raise IndexError |
730 |
except IndexError: |
731 |
raise error, "empty group" |
732 |
return string.join(literals, sep)
|