gvsig-scripting / org.gvsig.scripting / trunk / org.gvsig.scripting / org.gvsig.scripting.app / org.gvsig.scripting.app.mainplugin / src / main / resources-plugin / scripting / lib / BeautifulSoup.py @ 475

"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup parses a (possibly invalid) XML or HTML document into a
tree representation. It provides methods and Pythonic idioms that make
it easy to navigate, search, and modify the tree.

A well-formed XML/HTML document yields a well-formed data
structure. An ill-formed XML/HTML document yields a correspondingly
ill-formed data structure. If your document is only locally
well-formed, you can use this library to find and process the
well-formed part of it.

Beautiful Soup works with Python 2.2 and up. It has no external
dependencies, but you'll have more success at converting data to UTF-8
if you also install these three packages:

* chardet, for auto-detecting character encodings
  http://chardet.feedparser.org/
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
  by stock Python.
  http://cjkpython.i18n.org/

Beautiful Soup defines classes for two main parsing strategies:

 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
   language that kind of looks like XML.

 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
   or invalid. This class has web browser-like heuristics for
   obtaining a sensible parse tree in the face of common HTML errors.

Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
the encoding of an HTML or XML document, and converting it to
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/documentation.html

Here, have some legalese:

Copyright (c) 2004-2010, Leonard Richardson

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

  * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.

  * Redistributions in binary form must reproduce the above
    copyright notice, this list of conditions and the following
    disclaimer in the documentation and/or other materials provided
    with the distribution.

  * Neither the name of the the Beautiful Soup Consortium and All
    Night Kosher Bakery nor the names of its contributors may be
    used to endorse or promote products derived from this software
    without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.

"""
from __future__ import generators

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "3.2.1"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "New-style BSD"

from sgmllib import SGMLParser, SGMLParseError
import codecs
import markupbase
import types
import re
import sgmllib
try:
  from htmlentitydefs import name2codepoint
except ImportError:
  name2codepoint = {}
try:
    set
except NameError:
    from sets import Set as set

#These hacks make Beautiful Soup able to parse XML with namespaces
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

DEFAULT_OUTPUT_ENCODING = "utf-8"

def _match_css_class(str):
    """Build a RE to match the given CSS class."""
    return re.compile(r"(^|.*\s)%s($|\s)" % str)
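
# Example (illustrative): the pattern built above matches one class name inside
# a whitespace-separated "class" attribute value.
#
#   _match_css_class("boldest").match("boldest")         # matches
#   _match_css_class("boldest").match("first boldest")   # matches
#   _match_css_class("bold").match("boldest")            # no match
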
# First, the classes that represent markup elements.
112

    
113
class PageElement(object):
114
    """Contains the navigational information for some part of the page
115
    (either a tag or a piece of text)"""
116

    
117
    def _invert(h):
118
        "Cheap function to invert a hash."
119
        i = {}
120
        for k,v in h.items():
121
            i[v] = k
122
        return i
123

    
124
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
125
                                      "quot" : '"',
126
                                      "amp" : "&",
127
                                      "lt" : "<",
128
                                      "gt" : ">" }
129

    
130
    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
131

    
132
    def setup(self, parent=None, previous=None):
133
        """Sets up the initial relations between this element and
134
        other elements."""
135
        self.parent = parent
136
        self.previous = previous
137
        self.next = None
138
        self.previousSibling = None
139
        self.nextSibling = None
140
        if self.parent and self.parent.contents:
141
            self.previousSibling = self.parent.contents[-1]
142
            self.previousSibling.nextSibling = self
143

    
144
    def replaceWith(self, replaceWith):
145
        oldParent = self.parent
146
        myIndex = self.parent.index(self)
147
        if hasattr(replaceWith, "parent")\
148
                  and replaceWith.parent is self.parent:
149
            # We're replacing this element with one of its siblings.
150
            index = replaceWith.parent.index(replaceWith)
151
            if index and index < myIndex:
152
                # Furthermore, it comes before this element. That
153
                # means that when we extract it, the index of this
154
                # element will change.
155
                myIndex = myIndex - 1
156
        self.extract()
157
        oldParent.insert(myIndex, replaceWith)
158

    
159
    def replaceWithChildren(self):
160
        myParent = self.parent
161
        myIndex = self.parent.index(self)
162
        self.extract()
163
        reversedChildren = list(self.contents)
164
        reversedChildren.reverse()
165
        for child in reversedChildren:
166
            myParent.insert(myIndex, child)
167

    
168
    def extract(self):
169
        """Destructively rips this element out of the tree."""
170
        if self.parent:
171
            try:
172
                del self.parent.contents[self.parent.index(self)]
173
            except ValueError:
174
                pass
175

    
176
        #Find the two elements that would be next to each other if
177
        #this element (and any children) hadn't been parsed. Connect
178
        #the two.
179
        lastChild = self._lastRecursiveChild()
180
        nextElement = lastChild.next
181

    
182
        if self.previous:
183
            self.previous.next = nextElement
184
        if nextElement:
185
            nextElement.previous = self.previous
186
        self.previous = None
187
        lastChild.next = None
188

    
189
        self.parent = None
190
        if self.previousSibling:
191
            self.previousSibling.nextSibling = self.nextSibling
192
        if self.nextSibling:
193
            self.nextSibling.previousSibling = self.previousSibling
194
        self.previousSibling = self.nextSibling = None
195
        return self
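
    # Tree-editing sketch (illustrative; BeautifulStoneSoup is defined later
    # in this module):
    #
    #   soup = BeautifulStoneSoup("<a><b>1</b><c>2</c></a>")
    #   b = soup.b.extract()       # <b>1</b> is ripped out of the tree
    #   str(soup)                  # -> '<a><c>2</c></a>'
    #   soup.c.replaceWith(b)      # put it back, replacing <c>
    #   str(soup)                  # -> '<a><b>1</b></a>'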
196

    
197
    def _lastRecursiveChild(self):
198
        "Finds the last element beneath this object to be parsed."
199
        lastChild = self
200
        while hasattr(lastChild, 'contents') and lastChild.contents:
201
            lastChild = lastChild.contents[-1]
202
        return lastChild
203

    
204
    def insert(self, position, newChild):
205
        if isinstance(newChild, basestring) \
206
            and not isinstance(newChild, NavigableString):
207
            newChild = NavigableString(newChild)
208

    
209
        position =  min(position, len(self.contents))
210
        if hasattr(newChild, 'parent') and newChild.parent is not None:
211
            # We're 'inserting' an element that's already one
212
            # of this object's children.
213
            if newChild.parent is self:
214
                index = self.index(newChild)
215
                if index > position:
216
                    # Furthermore we're moving it further down the
217
                    # list of this object's children. That means that
218
                    # when we extract this element, our target index
219
                    # will jump down one.
220
                    position = position - 1
221
            newChild.extract()
222

    
223
        newChild.parent = self
224
        previousChild = None
225
        if position == 0:
226
            newChild.previousSibling = None
227
            newChild.previous = self
228
        else:
229
            previousChild = self.contents[position-1]
230
            newChild.previousSibling = previousChild
231
            newChild.previousSibling.nextSibling = newChild
232
            newChild.previous = previousChild._lastRecursiveChild()
233
        if newChild.previous:
234
            newChild.previous.next = newChild
235

    
236
        newChildsLastElement = newChild._lastRecursiveChild()
237

    
238
        if position >= len(self.contents):
239
            newChild.nextSibling = None
240

    
241
            parent = self
242
            parentsNextSibling = None
243
            while not parentsNextSibling:
244
                parentsNextSibling = parent.nextSibling
245
                parent = parent.parent
246
                if not parent: # This is the last element in the document.
247
                    break
248
            if parentsNextSibling:
249
                newChildsLastElement.next = parentsNextSibling
250
            else:
251
                newChildsLastElement.next = None
252
        else:
253
            nextChild = self.contents[position]
254
            newChild.nextSibling = nextChild
255
            if newChild.nextSibling:
256
                newChild.nextSibling.previousSibling = newChild
257
            newChildsLastElement.next = nextChild
258

    
259
        if newChildsLastElement.next:
260
            newChildsLastElement.next.previous = newChildsLastElement
261
        self.contents.insert(position, newChild)
262

    
263
    def append(self, tag):
264
        """Appends the given tag to the contents of this tag."""
265
        self.insert(len(self.contents), tag)
266

    
267
    def findNext(self, name=None, attrs={}, text=None, **kwargs):
268
        """Returns the first item that matches the given criteria and
269
        appears after this Tag in the document."""
270
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
271

    
272
    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
273
                    **kwargs):
274
        """Returns all items that match the given criteria and appear
275
        after this Tag in the document."""
276
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
277
                             **kwargs)
278

    
279
    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
280
        """Returns the closest sibling to this Tag that matches the
281
        given criteria and appears after this Tag in the document."""
282
        return self._findOne(self.findNextSiblings, name, attrs, text,
283
                             **kwargs)
284

    
285
    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
286
                         **kwargs):
287
        """Returns the siblings of this Tag that match the given
288
        criteria and appear after this Tag in the document."""
289
        return self._findAll(name, attrs, text, limit,
290
                             self.nextSiblingGenerator, **kwargs)
291
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
292

    
293
    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
294
        """Returns the first item that matches the given criteria and
295
        appears before this Tag in the document."""
296
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
297

    
298
    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
299
                        **kwargs):
300
        """Returns all items that match the given criteria and appear
301
        before this Tag in the document."""
302
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
303
                           **kwargs)
304
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x
305

    
306
    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
307
        """Returns the closest sibling to this Tag that matches the
308
        given criteria and appears before this Tag in the document."""
309
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
310
                             **kwargs)
311

    
312
    def findPreviousSiblings(self, name=None, attrs={}, text=None,
313
                             limit=None, **kwargs):
314
        """Returns the siblings of this Tag that match the given
315
        criteria and appear before this Tag in the document."""
316
        return self._findAll(name, attrs, text, limit,
317
                             self.previousSiblingGenerator, **kwargs)
318
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
319

    
320
    def findParent(self, name=None, attrs={}, **kwargs):
321
        """Returns the closest parent of this Tag that matches the given
322
        criteria."""
323
        # NOTE: We can't use _findOne because findParents takes a different
324
        # set of arguments.
325
        r = None
326
        l = self.findParents(name, attrs, 1)
327
        if l:
328
            r = l[0]
329
        return r
330

    
331
    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
332
        """Returns the parents of this Tag that match the given
333
        criteria."""
334

    
335
        return self._findAll(name, attrs, None, limit, self.parentGenerator,
336
                             **kwargs)
337
    fetchParents = findParents # Compatibility with pre-3.x
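
    # Navigation sketch (illustrative):
    #
    #   soup = BeautifulStoneSoup("<a><b>1</b><c>2</c><b>3</b></a>")
    #   first_b = soup.find('b')
    #   first_b.findNextSibling('b')   # -> <b>3</b>
    #   first_b.findParent('a')        # -> the enclosing <a> tag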
338

    
339
    #These methods do the real heavy lifting.
340

    
341
    def _findOne(self, method, name, attrs, text, **kwargs):
342
        r = None
343
        l = method(name, attrs, text, 1, **kwargs)
344
        if l:
345
            r = l[0]
346
        return r
347

    
348
    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
349
        "Iterates over a generator looking for things that match."
350

    
351
        if isinstance(name, SoupStrainer):
352
            strainer = name
353
        # (Possibly) special case some findAll*(...) searches
354
        elif text is None and not limit and not attrs and not kwargs:
355
            # findAll*(True)
356
            if name is True:
357
                return [element for element in generator()
358
                        if isinstance(element, Tag)]
359
            # findAll*('tag-name')
360
            elif isinstance(name, basestring):
361
                return [element for element in generator()
362
                        if isinstance(element, Tag) and
363
                        element.name == name]
364
            else:
365
                strainer = SoupStrainer(name, attrs, text, **kwargs)
366
        # Build a SoupStrainer
367
        else:
368
            strainer = SoupStrainer(name, attrs, text, **kwargs)
369
        results = ResultSet(strainer)
370
        g = generator()
371
        while True:
372
            try:
373
                i = g.next()
374
            except StopIteration:
375
                break
376
            if i:
377
                found = strainer.search(i)
378
                if found:
379
                    results.append(found)
380
                    if limit and len(results) >= limit:
381
                        break
382
        return results
383

    
384
    #These Generators can be used to navigate starting from both
385
    #NavigableStrings and Tags.
386
    def nextGenerator(self):
387
        i = self
388
        while i is not None:
389
            i = i.next
390
            yield i
391

    
392
    def nextSiblingGenerator(self):
393
        i = self
394
        while i is not None:
395
            i = i.nextSibling
396
            yield i
397

    
398
    def previousGenerator(self):
399
        i = self
400
        while i is not None:
401
            i = i.previous
402
            yield i
403

    
404
    def previousSiblingGenerator(self):
405
        i = self
406
        while i is not None:
407
            i = i.previousSibling
408
            yield i
409

    
410
    def parentGenerator(self):
411
        i = self
412
        while i is not None:
413
            i = i.parent
414
            yield i
415

    
416
    # Utility methods
417
    def substituteEncoding(self, str, encoding=None):
418
        encoding = encoding or "utf-8"
419
        return str.replace("%SOUP-ENCODING%", encoding)
420

    
421
    def toEncoding(self, s, encoding=None):
422
        """Encodes an object to a string in some encoding, or to Unicode.
423
        ."""
424
        if isinstance(s, unicode):
425
            if encoding:
426
                s = s.encode(encoding)
427
        elif isinstance(s, str):
428
            if encoding:
429
                s = s.encode(encoding)
430
            else:
431
                s = unicode(s)
432
        else:
433
            if encoding:
434
                s  = self.toEncoding(str(s), encoding)
435
            else:
436
                s = unicode(s)
437
        return s
438

    
439
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
440
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
441
                                           + ")")
442

    
443
    def _sub_entity(self, x):
444
        """Used with a regular expression to substitute the
445
        appropriate XML entity for an XML special character."""
446
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
447

    
448

    
449
class NavigableString(unicode, PageElement):
450

    
451
    def __new__(cls, value):
452
        """Create a new NavigableString.
453

454
        When unpickling a NavigableString, this method is called with
455
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
456
        passed in to the superclass's __new__ or the superclass won't know
457
        how to handle non-ASCII characters.
458
        """
459
        if isinstance(value, unicode):
460
            return unicode.__new__(cls, value)
461
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
462

    
463
    def __getnewargs__(self):
464
        return (NavigableString.__str__(self),)
465

    
466
    def __getattr__(self, attr):
467
        """text.string gives you text. This is for backwards
468
        compatibility for Navigable*String, but for CData* it lets you
469
        get the string without the CData wrapper."""
470
        if attr == 'string':
471
            return self
472
        else:
473
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
474

    
475
    def __unicode__(self):
476
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
477

    
478
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
479
        # Substitute outgoing XML entities.
480
        data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
481
        if encoding:
482
            return data.encode(encoding)
483
        else:
484
            return data
485

    
486
class CData(NavigableString):
487

    
488
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
489
        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
490

    
491
class ProcessingInstruction(NavigableString):
492
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
493
        output = self
494
        if "%SOUP-ENCODING%" in output:
495
            output = self.substituteEncoding(output, encoding)
496
        return "<?%s?>" % self.toEncoding(output, encoding)
497

    
498
class Comment(NavigableString):
499
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
500
        return "<!--%s-->" % NavigableString.__str__(self, encoding)
501

    
502
class Declaration(NavigableString):
503
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
504
        return "<!%s>" % NavigableString.__str__(self, encoding)
505

    
506
class Tag(PageElement):
507

    
508
    """Represents a found HTML tag with its attributes and contents."""
509

    
510
    def _convertEntities(self, match):
511
        """Used in a call to re.sub to replace HTML, XML, and numeric
512
        entities with the appropriate Unicode characters. If HTML
513
        entities are being converted, any unrecognized entities are
514
        escaped."""
515
        x = match.group(1)
516
        if self.convertHTMLEntities and x in name2codepoint:
517
            return unichr(name2codepoint[x])
518
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
519
            if self.convertXMLEntities:
520
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
521
            else:
522
                return u'&%s;' % x
523
        elif len(x) > 0 and x[0] == '#':
524
            # Handle numeric entities
525
            if len(x) > 1 and x[1] == 'x':
526
                return unichr(int(x[2:], 16))
527
            else:
528
                return unichr(int(x[1:]))
529

    
530
        elif self.escapeUnrecognizedEntities:
531
            return u'&amp;%s;' % x
532
        else:
533
            return u'&%s;' % x
534

    
535
    def __init__(self, parser, name, attrs=None, parent=None,
536
                 previous=None):
537
        "Basic constructor."
538

    
539
        # We don't actually store the parser object: that lets extracted
540
        # chunks be garbage-collected
541
        self.parserClass = parser.__class__
542
        self.isSelfClosing = parser.isSelfClosingTag(name)
543
        self.name = name
544
        if attrs is None:
545
            attrs = []
546
        elif isinstance(attrs, dict):
547
            attrs = attrs.items()
548
        self.attrs = attrs
549
        self.contents = []
550
        self.setup(parent, previous)
551
        self.hidden = False
552
        self.containsSubstitutions = False
553
        self.convertHTMLEntities = parser.convertHTMLEntities
554
        self.convertXMLEntities = parser.convertXMLEntities
555
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
556

    
557
        # Convert any HTML, XML, or numeric entities in the attribute values.
558
        convert = lambda(k, val): (k,
559
                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
560
                                          self._convertEntities,
561
                                          val))
562
        self.attrs = map(convert, self.attrs)
563

    
564
    def getString(self):
565
        if (len(self.contents) == 1
566
            and isinstance(self.contents[0], NavigableString)):
567
            return self.contents[0]
568

    
569
    def setString(self, string):
570
        """Replace the contents of the tag with a string"""
571
        self.clear()
572
        self.append(string)
573

    
574
    string = property(getString, setString)
575

    
576
    def getText(self, separator=u""):
577
        if not len(self.contents):
578
            return u""
579
        stopNode = self._lastRecursiveChild().next
580
        strings = []
581
        current = self.contents[0]
582
        while current is not stopNode:
583
            if isinstance(current, NavigableString):
584
                strings.append(current.strip())
585
            current = current.next
586
        return separator.join(strings)
587

    
588
    text = property(getText)
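
    # string vs. text sketch (illustrative):
    #
    #   soup = BeautifulStoneSoup("<p>one <b>two</b></p>")
    #   soup.p.string          # -> None: <p> has more than one child node
    #   soup.b.string          # -> u'two'
    #   soup.p.getText(u' ')   # -> u'one two'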
589

    
590
    def get(self, key, default=None):
591
        """Returns the value of the 'key' attribute for the tag, or
592
        the value given for 'default' if it doesn't have that
593
        attribute."""
594
        return self._getAttrMap().get(key, default)
595

    
596
    def clear(self):
597
        """Extract all children."""
598
        for child in self.contents[:]:
599
            child.extract()
600

    
601
    def index(self, element):
602
        for i, child in enumerate(self.contents):
603
            if child is element:
604
                return i
605
        raise ValueError("Tag.index: element not in tag")
606

    
607
    def has_key(self, key):
608
        return self._getAttrMap().has_key(key)
609

    
610
    def __getitem__(self, key):
611
        """tag[key] returns the value of the 'key' attribute for the tag,
612
        and throws an exception if it's not there."""
613
        return self._getAttrMap()[key]
614

    
615
    def __iter__(self):
616
        "Iterating over a tag iterates over its contents."
617
        return iter(self.contents)
618

    
619
    def __len__(self):
620
        "The length of a tag is the length of its list of contents."
621
        return len(self.contents)
622

    
623
    def __contains__(self, x):
624
        return x in self.contents
625

    
626
    def __nonzero__(self):
627
        "A tag is non-None even if it has no contents."
628
        return True
629

    
630
    def __setitem__(self, key, value):
631
        """Setting tag[key] sets the value of the 'key' attribute for the
632
        tag."""
633
        self._getAttrMap()
634
        self.attrMap[key] = value
635
        found = False
636
        for i in range(0, len(self.attrs)):
637
            if self.attrs[i][0] == key:
638
                self.attrs[i] = (key, value)
639
                found = True
640
        if not found:
641
            self.attrs.append((key, value))
642
        self._getAttrMap()[key] = value
643

    
644
    def __delitem__(self, key):
645
        "Deleting tag[key] deletes all 'key' attributes for the tag."
646
        for item in self.attrs:
647
            if item[0] == key:
648
                self.attrs.remove(item)
649
                #We don't break because bad HTML can define the same
650
                #attribute multiple times.
651
            self._getAttrMap()
652
            if self.attrMap.has_key(key):
653
                del self.attrMap[key]
654

    
655
    def __call__(self, *args, **kwargs):
656
        """Calling a tag like a function is the same as calling its
657
        findAll() method. Eg. tag('a') returns a list of all the A tags
658
        found within this tag."""
659
        return apply(self.findAll, args, kwargs)
660

    
661
    def __getattr__(self, tag):
662
        #print "Getattr %s.%s" % (self.__class__, tag)
663
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
664
            return self.find(tag[:-3])
665
        elif tag.find('__') != 0:
666
            return self.find(tag)
667
        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
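
    # Attribute-access sketch (illustrative):
    #
    #   soup = BeautifulStoneSoup("<doc><a href='x'>link</a><a>plain</a></doc>")
    #   soup.a           # same as soup.find('a')
    #   soup('a')        # same as soup.findAll('a')
    #   soup.a['href']   # -> u'x'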
668

    
669
    def __eq__(self, other):
670
        """Returns true iff this tag has the same name, the same attributes,
671
        and the same contents (recursively) as the given tag.
672

673
        NOTE: right now this will return false if two tags have the
674
        same attributes in a different order. Should this be fixed?"""
675
        if other is self:
676
            return True
677
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
678
            return False
679
        for i in range(0, len(self.contents)):
680
            if self.contents[i] != other.contents[i]:
681
                return False
682
        return True
683

    
684
    def __ne__(self, other):
685
        """Returns true iff this tag is not identical to the other tag,
686
        as defined in __eq__."""
687
        return not self == other
688

    
689
    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
690
        """Renders this tag as a string."""
691
        return self.__str__(encoding)
692

    
693
    def __unicode__(self):
694
        return self.__str__(None)
695

    
696
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
697
                prettyPrint=False, indentLevel=0):
698
        """Returns a string or Unicode representation of this tag and
699
        its contents. To get Unicode, pass None for encoding.
700

701
        NOTE: since Python's HTML parser consumes whitespace, this
702
        method is not certain to reproduce the whitespace present in
703
        the original string."""
704

    
705
        encodedName = self.toEncoding(self.name, encoding)
706

    
707
        attrs = []
708
        if self.attrs:
709
            for key, val in self.attrs:
710
                fmt = '%s="%s"'
711
                if isinstance(val, basestring):
712
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
713
                        val = self.substituteEncoding(val, encoding)
714

    
715
                    # The attribute value either:
716
                    #
717
                    # * Contains no embedded double quotes or single quotes.
718
                    #   No problem: we enclose it in double quotes.
719
                    # * Contains embedded single quotes. No problem:
720
                    #   double quotes work here too.
721
                    # * Contains embedded double quotes. No problem:
722
                    #   we enclose it in single quotes.
723
                    # * Embeds both single _and_ double quotes. This
724
                    #   can't happen naturally, but it can happen if
725
                    #   you modify an attribute value after parsing
726
                    #   the document. Now we have a bit of a
727
                    #   problem. We solve it by enclosing the
728
                    #   attribute in single quotes, and escaping any
729
                    #   embedded single quotes to XML entities.
730
                    if '"' in val:
731
                        fmt = "%s='%s'"
732
                        if "'" in val:
733
                            # TODO: replace with apos when
734
                            # appropriate.
735
                            val = val.replace("'", "&squot;")
736

    
737
                    # Now we're okay w/r/t quotes. But the attribute
738
                    # value might also contain angle brackets, or
739
                    # ampersands that aren't part of entities. We need
740
                    # to escape those to XML entities too.
741
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
742

    
743
                attrs.append(fmt % (self.toEncoding(key, encoding),
744
                                    self.toEncoding(val, encoding)))
745
        close = ''
746
        closeTag = ''
747
        if self.isSelfClosing:
748
            close = ' /'
749
        else:
750
            closeTag = '</%s>' % encodedName
751

    
752
        indentTag, indentContents = 0, 0
753
        if prettyPrint:
754
            indentTag = indentLevel
755
            space = (' ' * (indentTag-1))
756
            indentContents = indentTag + 1
757
        contents = self.renderContents(encoding, prettyPrint, indentContents)
758
        if self.hidden:
759
            s = contents
760
        else:
761
            s = []
762
            attributeString = ''
763
            if attrs:
764
                attributeString = ' ' + ' '.join(attrs)
765
            if prettyPrint:
766
                s.append(space)
767
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
768
            if prettyPrint:
769
                s.append("\n")
770
            s.append(contents)
771
            if prettyPrint and contents and contents[-1] != "\n":
772
                s.append("\n")
773
            if prettyPrint and closeTag:
774
                s.append(space)
775
            s.append(closeTag)
776
            if prettyPrint and closeTag and self.nextSibling:
777
                s.append("\n")
778
            s = ''.join(s)
779
        return s
780

    
781
    def decompose(self):
782
        """Recursively destroys the contents of this tree."""
783
        self.extract()
784
        if len(self.contents) == 0:
785
            return
786
        current = self.contents[0]
787
        while current is not None:
788
            next = current.next
789
            if isinstance(current, Tag):
790
                del current.contents[:]
791
            current.parent = None
792
            current.previous = None
793
            current.previousSibling = None
794
            current.next = None
795
            current.nextSibling = None
796
            current = next
797

    
798
    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
799
        return self.__str__(encoding, True)
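
    # Rendering sketch (illustrative):
    #
    #   soup = BeautifulStoneSoup("<a><b>text</b></a>")
    #   str(soup)         # -> '<a><b>text</b></a>'
    #   soup.prettify()   # -> the same markup, one tag per line, indented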
800

    
801
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
802
                       prettyPrint=False, indentLevel=0):
803
        """Renders the contents of this tag as a string in the given
804
        encoding. If encoding is None, returns a Unicode string.."""
805
        s=[]
806
        for c in self:
807
            text = None
808
            if isinstance(c, NavigableString):
809
                text = c.__str__(encoding)
810
            elif isinstance(c, Tag):
811
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
812
            if text and prettyPrint:
813
                text = text.strip()
814
            if text:
815
                if prettyPrint:
816
                    s.append(" " * (indentLevel-1))
817
                s.append(text)
818
                if prettyPrint:
819
                    s.append("\n")
820
        return ''.join(s)
821

    
822
    #Soup methods
823

    
824
    def find(self, name=None, attrs={}, recursive=True, text=None,
825
             **kwargs):
826
        """Return only the first child of this Tag matching the given
827
        criteria."""
828
        r = None
829
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
830
        if l:
831
            r = l[0]
832
        return r
833
    findChild = find
834

    
835
    def findAll(self, name=None, attrs={}, recursive=True, text=None,
836
                limit=None, **kwargs):
837
        """Extracts a list of Tag objects that match the given
838
        criteria.  You can specify the name of the Tag and any
839
        attributes you want the Tag to have.
840

841
        The value of a key-value pair in the 'attrs' map can be a
842
        string, a list of strings, a regular expression object, or a
843
        callable that takes a string and returns whether or not the
844
        string matches for some custom definition of 'matches'. The
845
        same is true of the tag name."""
846
        generator = self.recursiveChildGenerator
847
        if not recursive:
848
            generator = self.childGenerator
849
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
850
    findChildren = findAll
851

    
852
    # Pre-3.x compatibility methods
853
    first = find
854
    fetch = findAll
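
    # findAll sketch (illustrative): the name and attribute values may be
    # strings, regular expressions, lists, or callables.
    #
    #   import re
    #   soup = BeautifulStoneSoup(
    #       "<doc><a id='1'>x</a><a id='2'>y</a><b id='3'>z</b></doc>")
    #   soup.findAll('a')                 # both <a> tags
    #   soup.findAll(re.compile('^a$'))   # same, matched by regexp
    #   soup.findAll(['a', 'b'])          # all three tags
    #   soup.findAll(True, id='2')        # only the tag whose id is '2'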
855

    
856
    def fetchText(self, text=None, recursive=True, limit=None):
857
        return self.findAll(text=text, recursive=recursive, limit=limit)
858

    
859
    def firstText(self, text=None, recursive=True):
860
        return self.find(text=text, recursive=recursive)
861

    
862
    #Private methods
863

    
864
    def _getAttrMap(self):
865
        """Initializes a map representation of this tag's attributes,
866
        if not already initialized."""
867
        if not getattr(self, 'attrMap'):
868
            self.attrMap = {}
869
            for (key, value) in self.attrs:
870
                self.attrMap[key] = value
871
        return self.attrMap
872

    
873
    #Generator methods
874
    def childGenerator(self):
875
        # Just use the iterator from the contents
876
        return iter(self.contents)
877

    
878
    def recursiveChildGenerator(self):
879
        if not len(self.contents):
880
            raise StopIteration
881
        stopNode = self._lastRecursiveChild().next
882
        current = self.contents[0]
883
        while current is not stopNode:
884
            yield current
885
            current = current.next
886

    
887

    
888
# Next, a couple classes to represent queries and their results.
889
class SoupStrainer:
890
    """Encapsulates a number of ways of matching a markup element (tag or
891
    text)."""
892

    
893
    def __init__(self, name=None, attrs={}, text=None, **kwargs):
894
        self.name = name
895
        if isinstance(attrs, basestring):
896
            kwargs['class'] = _match_css_class(attrs)
897
            attrs = None
898
        if kwargs:
899
            if attrs:
900
                attrs = attrs.copy()
901
                attrs.update(kwargs)
902
            else:
903
                attrs = kwargs
904
        self.attrs = attrs
905
        self.text = text
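
    # SoupStrainer sketch (illustrative): a strainer bundles matching criteria
    # and can be passed wherever findAll() accepts a tag name, or to the soup
    # constructor as parseOnlyThese.
    #
    #   only_links = SoupStrainer('a', href=True)
    #   soup.findAll(only_links)   # every <a> tag that has an href attribute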
906

    
907
    def __str__(self):
908
        if self.text:
909
            return self.text
910
        else:
911
            return "%s|%s" % (self.name, self.attrs)
912

    
913
    def searchTag(self, markupName=None, markupAttrs={}):
914
        found = None
915
        markup = None
916
        if isinstance(markupName, Tag):
917
            markup = markupName
918
            markupAttrs = markup
919
        callFunctionWithTagData = callable(self.name) \
920
                                and not isinstance(markupName, Tag)
921

    
922
        if (not self.name) \
923
               or callFunctionWithTagData \
924
               or (markup and self._matches(markup, self.name)) \
925
               or (not markup and self._matches(markupName, self.name)):
926
            if callFunctionWithTagData:
927
                match = self.name(markupName, markupAttrs)
928
            else:
929
                match = True
930
                markupAttrMap = None
931
                for attr, matchAgainst in self.attrs.items():
932
                    if not markupAttrMap:
933
                         if hasattr(markupAttrs, 'get'):
934
                            markupAttrMap = markupAttrs
935
                         else:
936
                            markupAttrMap = {}
937
                            for k,v in markupAttrs:
938
                                markupAttrMap[k] = v
939
                    attrValue = markupAttrMap.get(attr)
940
                    if not self._matches(attrValue, matchAgainst):
941
                        match = False
942
                        break
943
            if match:
944
                if markup:
945
                    found = markup
946
                else:
947
                    found = markupName
948
        return found
949

    
950
    def search(self, markup):
951
        #print 'looking for %s in %s' % (self, markup)
952
        found = None
953
        # If given a list of items, scan it for a text element that
954
        # matches.
955
        if hasattr(markup, "__iter__") \
956
                and not isinstance(markup, Tag):
957
            for element in markup:
958
                if isinstance(element, NavigableString) \
959
                       and self.search(element):
960
                    found = element
961
                    break
962
        # If it's a Tag, make sure its name or attributes match.
963
        # Don't bother with Tags if we're searching for text.
964
        elif isinstance(markup, Tag):
965
            if not self.text:
966
                found = self.searchTag(markup)
967
        # If it's text, make sure the text matches.
968
        elif isinstance(markup, NavigableString) or \
969
                 isinstance(markup, basestring):
970
            if self._matches(markup, self.text):
971
                found = markup
972
        else:
973
            raise Exception, "I don't know how to match against a %s" \
974
                  % markup.__class__
975
        return found
976

    
977
    def _matches(self, markup, matchAgainst):
978
        #print "Matching %s against %s" % (markup, matchAgainst)
979
        result = False
980
        if matchAgainst is True:
981
            result = markup is not None
982
        elif callable(matchAgainst):
983
            result = matchAgainst(markup)
984
        else:
985
            #Custom match methods take the tag as an argument, but all
986
            #other ways of matching match the tag name as a string.
987
            if isinstance(markup, Tag):
988
                markup = markup.name
989
            if markup and not isinstance(markup, basestring):
990
                markup = unicode(markup)
991
            #Now we know that chunk is either a string, or None.
992
            if hasattr(matchAgainst, 'match'):
993
                # It's a regexp object.
994
                result = markup and matchAgainst.search(markup)
995
            elif hasattr(matchAgainst, '__iter__'): # list-like
996
                result = markup in matchAgainst
997
            elif hasattr(matchAgainst, 'items'):
998
                result = markup.has_key(matchAgainst)
999
            elif matchAgainst and isinstance(markup, basestring):
1000
                if isinstance(markup, unicode):
1001
                    matchAgainst = unicode(matchAgainst)
1002
                else:
1003
                    matchAgainst = str(matchAgainst)
1004

    
1005
            if not result:
1006
                result = matchAgainst == markup
1007
        return result
1008

    
1009
class ResultSet(list):
1010
    """A ResultSet is just a list that keeps track of the SoupStrainer
1011
    that created it."""
1012
    def __init__(self, source):
1013
        list.__init__([])
1014
        self.source = source
1015

    
1016
# Now, some helper functions.
1017

    
1018
def buildTagMap(default, *args):
1019
    """Turns a list of maps, lists, or scalars into a single map.
1020
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
1021
    NESTING_RESET_TAGS maps out of lists and partial maps."""
1022
    built = {}
1023
    for portion in args:
1024
        if hasattr(portion, 'items'):
1025
            #It's a map. Merge it.
1026
            for k,v in portion.items():
1027
                built[k] = v
1028
        elif hasattr(portion, '__iter__'): # is a list
1029
            #It's a list. Map each item to the default.
1030
            for k in portion:
1031
                built[k] = default
1032
        else:
1033
            #It's a scalar. Map it to the default.
1034
            built[portion] = default
1035
    return built
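
# buildTagMap sketch (illustrative):
#
#   buildTagMap(None, ['br', 'hr'], {'p': ['p']})
#   # -> {'br': None, 'hr': None, 'p': ['p']}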
1036

    
1037
# Now, the parser classes.
1038

    
1039
class BeautifulStoneSoup(Tag, SGMLParser):
1040

    
1041
    """This class contains the basic parser and search code. It defines
1042
    a parser that knows nothing about tag behavior except for the
1043
    following:
1044

1045
      You can't close a tag without closing all the tags it encloses.
1046
      That is, "<foo><bar></foo>" actually means
1047
      "<foo><bar></bar></foo>".
1048

1049
    [Another possible explanation is "<foo><bar /></foo>", but since
1050
    this class defines no SELF_CLOSING_TAGS, it will never use that
1051
    explanation.]
1052

1053
    This class is useful for parsing XML or made-up markup languages,
1054
    or when BeautifulSoup makes an assumption counter to what you were
1055
    expecting."""
1056

    
1057
    SELF_CLOSING_TAGS = {}
1058
    NESTABLE_TAGS = {}
1059
    RESET_NESTING_TAGS = {}
1060
    QUOTE_TAGS = {}
1061
    PRESERVE_WHITESPACE_TAGS = []
1062

    
1063
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1064
                       lambda x: x.group(1) + ' />'),
1065
                      (re.compile('<!\s+([^<>]*)>'),
1066
                       lambda x: '<!' + x.group(1) + '>')
1067
                      ]
1068

    
1069
    ROOT_TAG_NAME = u'[document]'
1070

    
1071
    HTML_ENTITIES = "html"
1072
    XML_ENTITIES = "xml"
1073
    XHTML_ENTITIES = "xhtml"
1074
    # TODO: This only exists for backwards-compatibility
1075
    ALL_ENTITIES = XHTML_ENTITIES
1076

    
1077
    # Used when determining whether a text node is all whitespace and
1078
    # can be replaced with a single space. A text node that contains
1079
    # fancy Unicode spaces (usually non-breaking) should be left
1080
    # alone.
1081
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1082

    
1083
    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1084
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
1085
                 convertEntities=None, selfClosingTags=None, isHTML=False):
1086
        """The Soup object is initialized as the 'root tag', and the
1087
        provided markup (which can be a string or a file-like object)
1088
        is fed into the underlying parser.
1089

1090
        sgmllib will process most bad HTML, and the BeautifulSoup
1091
        class has some tricks for dealing with some HTML that kills
1092
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
1093
        if your data uses self-closing tags or declarations
1094
        incorrectly.
1095

1096
        By default, Beautiful Soup uses regexes to sanitize input,
1097
        avoiding the vast majority of these problems. If the problems
1098
        don't apply to you, pass in False for markupMassage, and
1099
        you'll get better performance.
1100

1101
        The default parser massage techniques fix the two most common
1102
        instances of invalid HTML that choke sgmllib:
1103

1104
         <br/> (No space between name of closing tag and tag close)
1105
         <! --Comment--> (Extraneous whitespace in declaration)
1106

1107
        You can pass in a custom list of (RE object, replace method)
1108
        tuples to get Beautiful Soup to scrub your input the way you
1109
        want."""
1110

    
1111
        self.parseOnlyThese = parseOnlyThese
1112
        self.fromEncoding = fromEncoding
1113
        self.smartQuotesTo = smartQuotesTo
1114
        self.convertEntities = convertEntities
1115
        # Set the rules for how we'll deal with the entities we
1116
        # encounter
1117
        if self.convertEntities:
1118
            # It doesn't make sense to convert encoded characters to
1119
            # entities even while you're converting entities to Unicode.
1120
            # Just convert it all to Unicode.
1121
            self.smartQuotesTo = None
1122
            if convertEntities == self.HTML_ENTITIES:
1123
                self.convertXMLEntities = False
1124
                self.convertHTMLEntities = True
1125
                self.escapeUnrecognizedEntities = True
1126
            elif convertEntities == self.XHTML_ENTITIES:
1127
                self.convertXMLEntities = True
1128
                self.convertHTMLEntities = True
1129
                self.escapeUnrecognizedEntities = False
1130
            elif convertEntities == self.XML_ENTITIES:
1131
                self.convertXMLEntities = True
1132
                self.convertHTMLEntities = False
1133
                self.escapeUnrecognizedEntities = False
1134
        else:
1135
            self.convertXMLEntities = False
1136
            self.convertHTMLEntities = False
1137
            self.escapeUnrecognizedEntities = False
1138

    
1139
        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1140
        SGMLParser.__init__(self)
1141

    
1142
        if hasattr(markup, 'read'):        # It's a file-type object.
1143
            markup = markup.read()
1144
        self.markup = markup
1145
        self.markupMassage = markupMassage
1146
        try:
1147
            self._feed(isHTML=isHTML)
1148
        except StopParsing:
1149
            pass
1150
        self.markup = None                 # The markup can now be GCed
1151

    
1152
    def convert_charref(self, name):
1153
        """This method fixes a bug in Python's SGMLParser."""
1154
        try:
1155
            n = int(name)
1156
        except ValueError:
1157
            return
1158
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1159
            return
1160
        return self.convert_codepoint(n)
1161

    
1162
    def _feed(self, inDocumentEncoding=None, isHTML=False):
1163
        # Convert the document to Unicode.
1164
        markup = self.markup
1165
        if isinstance(markup, unicode):
1166
            if not hasattr(self, 'originalEncoding'):
1167
                self.originalEncoding = None
1168
        else:
1169
            dammit = UnicodeDammit\
1170
                     (markup, [self.fromEncoding, inDocumentEncoding],
1171
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
1172
            markup = dammit.unicode
1173
            self.originalEncoding = dammit.originalEncoding
1174
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
1175
        if markup:
1176
            if self.markupMassage:
1177
                if not hasattr(self.markupMassage, "__iter__"):
1178
                    self.markupMassage = self.MARKUP_MASSAGE
1179
                for fix, m in self.markupMassage:
1180
                    markup = fix.sub(m, markup)
1181
                # TODO: We get rid of markupMassage so that the
1182
                # soup object can be deepcopied later on. Some
1183
                # Python installations can't copy regexes. If anyone
1184
                # was relying on the existence of markupMassage, this
1185
                # might cause problems.
1186
                del(self.markupMassage)
1187
        self.reset()
1188

    
1189
        SGMLParser.feed(self, markup)
1190
        # Close out any unfinished strings and close all the open tags.
1191
        self.endData()
1192
        while self.currentTag.name != self.ROOT_TAG_NAME:
1193
            self.popTag()
1194

    
1195
    def __getattr__(self, methodName):
1196
        """This method routes method call requests to either the SGMLParser
1197
        superclass or the Tag superclass, depending on the method name."""
1198
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1199

    
1200
        if methodName.startswith('start_') or methodName.startswith('end_') \
1201
               or methodName.startswith('do_'):
1202
            return SGMLParser.__getattr__(self, methodName)
1203
        elif not methodName.startswith('__'):
1204
            return Tag.__getattr__(self, methodName)
1205
        else:
1206
            raise AttributeError
1207

    
1208
    def isSelfClosingTag(self, name):
1209
        """Returns true iff the given string is the name of a
1210
        self-closing tag according to this parser."""
1211
        return self.SELF_CLOSING_TAGS.has_key(name) \
1212
               or self.instanceSelfClosingTags.has_key(name)
1213

    
1214
    def reset(self):
1215
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
1216
        self.hidden = 1
1217
        SGMLParser.reset(self)
1218
        self.currentData = []
1219
        self.currentTag = None
1220
        self.tagStack = []
1221
        self.quoteStack = []
1222
        self.pushTag(self)
1223

    
1224
    def popTag(self):
1225
        tag = self.tagStack.pop()
1226

    
1227
        #print "Pop", tag.name
1228
        if self.tagStack:
1229
            self.currentTag = self.tagStack[-1]
1230
        return self.currentTag
1231

    
1232
    def pushTag(self, tag):
1233
        #print "Push", tag.name
1234
        if self.currentTag:
1235
            self.currentTag.contents.append(tag)
1236
        self.tagStack.append(tag)
1237
        self.currentTag = self.tagStack[-1]
1238

    
1239
    def endData(self, containerClass=NavigableString):
1240
        if self.currentData:
1241
            currentData = u''.join(self.currentData)
1242
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
1243
                not set([tag.name for tag in self.tagStack]).intersection(
1244
                    self.PRESERVE_WHITESPACE_TAGS)):
1245
                if '\n' in currentData:
1246
                    currentData = '\n'
1247
                else:
1248
                    currentData = ' '
1249
            self.currentData = []
1250
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1251
                   (not self.parseOnlyThese.text or \
1252
                    not self.parseOnlyThese.search(currentData)):
1253
                return
1254
            o = containerClass(currentData)
1255
            o.setup(self.currentTag, self.previous)
1256
            if self.previous:
1257
                self.previous.next = o
1258
            self.previous = o
1259
            self.currentTag.contents.append(o)
1260

    
1261

    
1262
    def _popToTag(self, name, inclusivePop=True):
1263
        """Pops the tag stack up to and including the most recent
1264
        instance of the given tag. If inclusivePop is false, pops the tag
1265
        stack up to but *not* including the most recent instance of
1266
        the given tag."""
1267
        #print "Popping to %s" % name
1268
        if name == self.ROOT_TAG_NAME:
1269
            return
1270

    
1271
        numPops = 0
1272
        mostRecentTag = None
1273
        for i in range(len(self.tagStack)-1, 0, -1):
1274
            if name == self.tagStack[i].name:
1275
                numPops = len(self.tagStack)-i
1276
                break
1277
        if not inclusivePop:
1278
            numPops = numPops - 1
1279

    
1280
        for i in range(0, numPops):
1281
            mostRecentTag = self.popTag()
1282
        return mostRecentTag
1283

    
1284
    def _smartPop(self, name):
1285

    
1286
        """We need to pop up to the previous tag of this type, unless
1287
        one of this tag's nesting reset triggers comes between this
1288
        tag and the previous tag of this type, OR unless this tag is a
1289
        generic nesting trigger and another generic nesting trigger
1290
        comes between this tag and the previous tag of this type.
1291

1292
        Examples:
1293
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1294
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1295
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1296

1297
         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1298
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1299
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1300
        """
1301

    
1302
        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1303
        isNestable = nestingResetTriggers != None
1304
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1305
        popTo = None
1306
        inclusive = True
1307
        for i in range(len(self.tagStack)-1, 0, -1):
1308
            p = self.tagStack[i]
1309
            if (not p or p.name == name) and not isNestable:
1310
                #Non-nestable tags get popped to the top or to their
1311
                #last occurrence.
1312
                popTo = name
1313
                break
1314
            if (nestingResetTriggers is not None
1315
                and p.name in nestingResetTriggers) \
1316
                or (nestingResetTriggers is None and isResetNesting
1317
                    and self.RESET_NESTING_TAGS.has_key(p.name)):
1318

    
1319
                #If we encounter one of the nesting reset triggers
1320
                #peculiar to this tag, or we encounter another tag
1321
                #that causes nesting to reset, pop up to but not
1322
                #including that tag.
1323
                popTo = p.name
1324
                inclusive = False
1325
                break
1326
            p = p.parent
1327
        if popTo:
1328
            self._popToTag(popTo, inclusive)
1329

    
1330
    def unknown_starttag(self, name, attrs, selfClosing=0):
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

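    # Illustrative sketch of the entity handling above (not part of the
    # original source): with entity conversion enabled, known HTML entities
    # become Unicode characters, while unknown references and stray
    # ampersands are escaped as described in the comments.
    #
    #   from BeautifulSoup import BeautifulStoneSoup
    #   soup = BeautifulStoneSoup("caf&eacute; AT&T &carol;",
    #                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    #   # "&eacute;" -> u'\xe9'; "AT&T" and "&carol;" come out escaped as "&amp;..."
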
    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
             k = self.rawdata.find(']]>', i)
             if k == -1:
                 k = len(self.rawdata)
             data = self.rawdata[i+9:k]
             j = k+3
             self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j

class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (i.e. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup does not
    treat as nestable a tag that your page's author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

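    # Illustrative sketch of the nesting rules above (not part of the
    # original source), assuming this file is importable as the module
    # "BeautifulSoup":
    #
    #   from BeautifulSoup import BeautifulSoup
    #   soup = BeautifulSoup("<p>Para1<p>Para2")
    #   len(soup.findAll('p'))   # -> 2: the second <p> implicitly closed the first
    #
    #   soup = BeautifulSoup("Alice said: <blockquote>Bob said: <blockquote>Blah")
    #   soup.blockquote.blockquote is not None   # -> True: blockquotes stay nested
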
    def __init__(self, *args, **kwargs):
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True

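    # Illustrative sketch of the META-charset re-parse described above (not
    # part of the original source); the declared charset ends up as the
    # document's originalEncoding, assuming the named codec is available:
    #
    #   markup = ('<html><head><meta http-equiv="Content-Type" '
    #             'content="text/html; charset=iso-8859-1"></head>'
    #             '<body>Sacr\xe9 bleu</body></html>')
    #   soup = BeautifulSoup(markup)
    #   soup.originalEncoding   # -> 'iso-8859-1'
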
class StopParsing(Exception):
    pass

class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

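    # Illustrative sketch (not part of the original source):
    #
    #   from BeautifulSoup import BeautifulSoup, ICantBelieveItsBeautifulSoup
    #   markup = "<b>Foo<b>Bar</b></b>"
    #   BeautifulSoup(markup).b.b                 # -> None: second <b> became a sibling
    #   ICantBelieveItsBeautifulSoup(markup).b.b  # -> the nested <b>Bar</b> tag
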
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)

class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}

class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

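    # Illustrative sketch (not part of the original source):
    #
    #   from BeautifulSoup import BeautifulSOAP
    #   soup = BeautifulSOAP("<foo><bar>baz</bar></foo>")
    #   soup.foo['bar']           # -> u'baz' (promoted to an attribute)
    #   soup.foo.barTag.string    # -> u'baz' (the child tag is still there)
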
    def popTag(self):
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)

#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisiness,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    pass
class RobustHTMLParser(BeautifulSoup):
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    pass

######################################################
#
# Bonus library: Unicode, Dammit
#
# This class forces XML data into a standard format (usually to UTF-8
# or Unicode).  It is heavily based on code from Mark Pilgrim's
# Universal Feed Parser. It does not rewrite the XML or HTML to
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
# (XML) and BeautifulSoup.start_meta (HTML).

# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except ImportError:
    chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass

class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

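    # Illustrative sketch (not part of the original source):
    #
    #   from BeautifulSoup import UnicodeDammit
    #   dammit = UnicodeDammit("Sacr\xe9 bleu!")
    #   dammit.unicode            # -> u'Sacr\xe9 bleu!'
    #   dammit.originalEncoding   # whichever proposed codec decoded it first,
    #                             # e.g. 'windows-1252'
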
    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have an auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),
                      markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        except:
            xml_encoding_match = None
        xml_encoding_match = re.compile(
            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding

    def find_codec(self, charset):
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}

#######################################################################


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
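
# Illustrative invocation of the pretty-printer above (not part of the
# original source): feed an HTML document on standard input, e.g.
#
#   $ python BeautifulSoup.py < index.html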