Statistics
| Revision:

gvsig-scripting / org.gvsig.scripting / trunk / org.gvsig.scripting / org.gvsig.scripting.app / org.gvsig.scripting.app.mainplugin / src / main / resources-plugin / scripting / lib / encutils / __init__.py @ 475

History | View | Annotate | Download (22.2 KB)

1
# -*- coding: utf-8 -*-
2
#!/usr/bin/env python
3
"""encutils - encoding detection collection for Python
4

5
:Version: 0.9.8
6
:Author: Christof Hoeke, see http://cthedot.de/encutils/
7
:Contributor: Robert Siemer, Fredrik Hedman <fredrik.hedman@me.com> ported to python3
8
:Copyright: 2005-2012: Christof Hoeke
9
:License: encutils has a dual-license, please choose whatever you prefer:
10

11
    * encutils is published under the
12
      `LGPL 3 or later <http://cthedot.de/encutils/license/>`__
13
    * encutils is published under the
14
      `Creative Commons License <http://creativecommons.org/licenses/by/3.0/>`__.
15

16
    encutils is free software: you can redistribute it and/or modify
17
    it under the terms of the GNU Lesser General Public License as published by
18
    the Free Software Foundation, either version 3 of the License, or
19
    (at your option) any later version.
20

21
    encutils is distributed in the hope that it will be useful,
22
    but WITHOUT ANY WARRANTY; without even the implied warranty of
23
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
    GNU Lesser General Public License for more details.
25

26
    You should have received a copy of the GNU Lesser General Public License
27
    along with encutils.  If not, see <http://www.gnu.org/licenses/>.
28

29

30
A collection of helper functions to detect encodings of text files (like HTML, XHTML, XML, CSS, etc.) retrieved via HTTP, file or string.
31

32
:func:`getEncodingInfo` is probably the main function of interest which uses
33
other supplied functions itself and gathers all information together and
34
supplies an :class:`EncodingInfo` object.
35

36
example::
37

38
    >>> import encutils
39
    >>> info = encutils.getEncodingInfo(url='http://cthedot.de/encutils/')
40

41
    >>> str(info)
42
    utf-8
43

44
    >>> repr(info) # doctest:+ELLIPSIS
45
    <encutils.EncodingInfo object encoding='utf-8' mismatch=False at...>
46

47
    >>> info.logtext
48
    HTTP media_type: text/html
49
    HTTP encoding: utf-8
50
    Encoding (probably): utf-8 (Mismatch: False)
51
    <BLANKLINE>
52

53
references
54
    XML
55
        RFC 3023 (http://www.ietf.org/rfc/rfc3023.txt)
56

57
        easier explained in
58
            - http://feedparser.org/docs/advanced.html
59
            - http://www.xml.com/pub/a/2004/07/21/dive.html
60

61
    HTML
62
        http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2
63

64
TODO
65
    - parse @charset of HTML elements?
66
    - check for more texttypes if only text given
67
"""
68
# Names exported by ``from encutils import *``.
__all__ = ['buildlog',
           'encodingByMediaType',
           'getHTTPInfo',
           'getMetaInfo',
           'detectXMLEncoding',
           'getEncodingInfo',
           'tryEncodings',
           'EncodingInfo']
# Docstrings in this module use reStructuredText markup.
__docformat__ = 'restructuredtext'
__author__ = 'Christof Hoeke, Robert Siemer, Fredrik Hedman'
# NOTE(review): looks like a version-control keyword placeholder -- confirm.
__version__ = '$Id$'
79

    
80
import HTMLParser
81
import StringIO
82
import cgi
83
import httplib
84
import re
85
import sys
86
import types
87
import urllib
88

    
89
# Release number of this encutils module.
VERSION = '0.9.8'
# True when the running interpreter is a Python 2.x.
PY2x = sys.version_info[0] < 3
91

    
92

    
93
class _MetaHTMLParser(HTMLParser.HTMLParser):
    """Parse given data for <meta http-equiv="content-type">.

    After markup has been fed, ``content_type`` holds the lowercased
    ``content`` attribute of the first such element that has a non-empty
    value, or ``None`` if none was seen.
    """
    content_type = None

    def handle_starttag(self, tag, attrs):
        """Remember the ``content`` value of a content-type ``<meta>``.

        :param tag:
            lowercased element name as reported by the parser
        :param attrs:
            list of ``(name, value)`` pairs; ``value`` is ``None`` for
            attributes written without a value
        """
        if tag != 'meta' or self.content_type:
            return
        # A valueless attribute (e.g. ``<meta charset>``) has the value
        # None, which cannot be lowercased; treat it as an empty string.
        atts = dict((name.lower(), (value or u'').lower())
                    for name, value in attrs)
        if atts.get('http-equiv', u'').strip() == u'content-type':
            self.content_type = atts.get('content')
102

    
103

    
104
# Text type categories used to pick default encodings.

# XML served as application/*: application/xml, application/xml-dtd,
# application/xml-external-parsed-entity, or a subtype such as
# application/rss+xml.
_XML_APPLICATION_TYPE = 0

# XML served as text/*: text/xml, text/xml-external-parsed-entity, or a
# subtype such as text/AnythingAtAll+xml.
_XML_TEXT_TYPE = 1

# text/html
_HTML_TEXT_TYPE = 2

# Every other text/* type, e.g. text/plain.
_TEXT_TYPE = 3

# Anything that fits none of the other categories.
_OTHER_TYPE = 4

# text/* types that default to UTF-8; for now only text/css.
_TEXT_UTF8 = 5
121

    
122
class EncodingInfo(object):
    """
    Result object of :func:`getEncodingInfo` holding all encoding
    related findings.

    Attributes:
        - ``encoding``: the guessed encoding
            The explicit or implicit encoding, always lowercase, or
            ``None``.

        - taken from the HTTP response
            * ``http_encoding``
            * ``http_media_type``

        - taken from an HTML <meta> element
            * ``meta_encoding``
            * ``meta_media_type``

        - taken from the XML declaration
            * ``xml_encoding``

        - ``mismatch``: True if the HTTP header, the XML declaration or
            the text content (meta) disagree with each other. Detailed
            mismatch reports go to the optional log or to ``logtext``.

            A mismatch is not necessarily an error because the
            specifications define which source takes precedence.

        - ``logtext``: the log messages, filled if no log was supplied
    """
    def __init__(self):
        """Create an instance with every attribute set to ``None``."""
        self.encoding = None
        self.mismatch = None
        self.logtext = None
        self.http_encoding = None
        self.http_media_type = None
        self.meta_encoding = None
        self.meta_media_type = None
        self.xml_encoding = None

    def __str__(self):
        """Return the guessed encoding, or the empty string if unknown."""
        return self.encoding or u''

    def __repr__(self):
        cls = self.__class__
        details = (cls.__module__, cls.__name__,
                   self.encoding, self.mismatch, id(self))
        return "<%s.%s object encoding=%r mismatch=%s at 0x%x>" % details
174

    
175

    
176
def buildlog(logname='encutils', level='INFO', stream=sys.stderr,
            filename=None, filemode="w",
            format='%(levelname)s\t%(message)s'):
    """Helper to build a basic log

    - if `filename` is given returns a log logging to `filename` with
      mode `filemode`
    - else uses a log streaming to `stream` which defaults to
      `sys.stderr`
    - `level` defines the level of the log; either a level name such as
      ``'DEBUG'`` (case is ignored) or a numeric logging level. Anything
      that is not a known level falls back to ``logging.INFO``.
    - `format` defines the formatter format of the log

    Every call adds one more handler to the logger named `logname`.

    :returns:
        a log with the name `logname`
    """
    import logging

    log = logging.getLogger(logname)

    if filename:
        hdlr = logging.FileHandler(filename, filemode)
    else:
        hdlr = logging.StreamHandler(stream)
    hdlr.setFormatter(logging.Formatter(format))
    log.addHandler(hdlr)

    if isinstance(level, int):
        # numeric levels like logging.DEBUG are used as given
        numeric = level
    else:
        name = level.upper() if hasattr(level, 'upper') else None
        numeric = vars(logging).get(name)
        # only accept real level constants, not arbitrary module attributes
        if not isinstance(numeric, int):
            numeric = logging.INFO
    log.setLevel(numeric)

    return log
207

    
208

    
209
def _getTextTypeByMediaType(media_type, log=None):
    """
    Classify `media_type` into one of the text type constants.

    :param media_type:
        a media type like "text/html"; surrounding whitespace and case
        are ignored
    :param log:
        accepted for a uniform signature, not used here
    :returns:
        type as defined by the module level ``_*_TYPE`` constants
    """
    if not media_type:
        return _OTHER_TYPE

    # In both tuples the first entry is the pattern for "+xml" subtypes,
    # the remaining entries are the literal media types.
    app_xml = (
        u'application/.*?\\+xml',
        u'application/xml',
        u'application/xml-dtd',
        u'application/xml-external-parsed-entity')
    text_xml = (
        u'text\\/.*?\\+xml',
        u'text/xml',
        u'text/xml-external-parsed-entity')
    flags = re.I | re.S | re.X

    media_type = media_type.strip().lower()

    if (media_type in app_xml
            or re.match(app_xml[0], media_type, flags)):
        return _XML_APPLICATION_TYPE
    if (media_type in text_xml
            or re.match(text_xml[0], media_type, flags)):
        return _XML_TEXT_TYPE
    if media_type == u'text/html':
        return _HTML_TEXT_TYPE
    if media_type == u'text/css':
        return _TEXT_UTF8
    if media_type.startswith(u'text/'):
        return _TEXT_TYPE
    return _OTHER_TYPE
242

    
243

    
244
def _getTextType(text, log=None):
    """Check if given text is XML (**naive test!**)
    used if no content-type given

    :param text:
        byte string or unicode string to inspect; only the first 30
        characters are looked at
    :param log:
        accepted for a uniform signature, not used here
    :returns:
        ``_XML_APPLICATION_TYPE`` if an XML declaration starts within the
        first 30 characters, else ``_OTHER_TYPE``
    """
    head = text[:30]
    # Search with a needle of the same string type as `text`: mixing a
    # unicode needle with a byte string makes Python 2 decode the bytes as
    # ASCII, which fails on e.g. a UTF-8 BOM in front of the declaration.
    if isinstance(head, bytes):
        needle = b'<?xml version='
    else:
        needle = u'<?xml version='

    if head.find(needle) != -1:
        return _XML_APPLICATION_TYPE
    return _OTHER_TYPE
252

    
253

    
254
def encodingByMediaType(media_type, log=None):
    """
    Look up the default encoding of a media type.

    :param media_type:
        a media type like "text/html"
    :param log:
        optional logger, receives a debug message about the result
    :returns:
        a default encoding for given `media_type`. For example
        ``"utf-8"`` for ``media_type="application/xml"``.

        If no default encoding is available returns ``None``.

        Refers to RFC 3023 and HTTP MIME specification.
    """
    texttype = _getTextTypeByMediaType(media_type)

    if texttype in (_XML_APPLICATION_TYPE, _TEXT_UTF8):
        encoding = u'utf-8'
    elif texttype == _XML_TEXT_TYPE:
        encoding = u'ascii'
    elif texttype in (_HTML_TEXT_TYPE, _TEXT_TYPE):
        encoding = u'iso-8859-1' # should be None?
    else:
        # _OTHER_TYPE or anything unknown
        encoding = None

    if log:
        if encoding:
            log.debug(
                u'Default encoding for Media Type "%s": %s',
                media_type, encoding)
        else:
            log.debug(u'"%s" Media-Type has no default encoding',
                media_type)
    return encoding
286

    
287

    
288
def getHTTPInfo(response, log=None):
    """
    :param response:
        a HTTP response object; its ``info()`` method must return the
        message headers
    :param log:
        optional logger, receives the found media type and encoding
    :returns:
        ``(media_type, encoding)`` information from the `response`
        Content-Type HTTP header. (Case of headers is ignored.)
        May be ``(None, None)`` e.g. if no Content-Type header is
        available.
    """
    info = response.info()
    # Choose the accessor by what the header object offers instead of by
    # interpreter version, so either message flavour works on any Python.
    if hasattr(info, 'gettype'):
        # Python 2 style message (gettype/getparam)
        media_type, encoding = info.gettype(), info.getparam('charset')
    else:
        # email.message style message (Python 3)
        media_type = info.get_content_type()
        encoding = info.get_content_charset()

    if encoding:
        encoding = encoding.lower()

    if log:
        log.info(u'HTTP media_type: %s', media_type)
        log.info(u'HTTP encoding: %s', encoding)

    return media_type, encoding
312

    
313

    
314
def getMetaInfo(text, log=None):
    """
    Read the Content-Type declared inside the markup itself.

    :param text:
        a byte string
    :param log:
        optional logger, receives the found media type and encoding
    :returns:
        ``(media_type, encoding)`` information from (first)
        X/HTML Content-Type ``<meta>`` element if available in `text`,
        else ``(None, None)``.

        XHTML format::

            <meta http-equiv="Content-Type"
                  content="media_type;charset=encoding" />
    """
    parser = _MetaHTMLParser()
    try:
        parser.feed(text)
    except HTMLParser.HTMLParseError:
        # broken markup: use whatever was found before the error
        pass

    declared = parser.content_type
    if not declared:
        return None, None

    media_type, params = cgi.parse_header(declared)
    encoding = params.get('charset') # defaults to None
    if encoding:
        encoding = encoding.lower()

    if log:
        log.info(u'HTML META media_type: %s', media_type)
        log.info(u'HTML META encoding: %s', encoding)

    return media_type, encoding
346

    
347

    
348
def detectXMLEncoding(fp, log=None, includeDefault=True):
    """Attempt to detect the character encoding of the xml file
    given by a file object `fp`. `fp` must not be a codec wrapped file
    object! `fp` may be a string or unicode string though.

    Based on a recipe by Lars Tiede:
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/363841
    which itself is based on Paul Prescotts recipe:
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52257

    :param fp:
        seekable file object or string; the position of a file object is
        restored before returning
    :param log:
        optional logger, receives the detection result
    :param includeDefault:
        if False, ``None`` is returned instead of the utf-8 default
    :returns:
        - if detection of the BOM succeeds, the codec name of the
          corresponding unicode charset is returned

        - if BOM detection fails, the xml declaration is searched for
          the encoding attribute and its value returned. the "<"
          character has to be the very first in the file then (it's xml
          standard after all).

        - if BOM and xml declaration fail, utf-8 is returned according
          to XML 1.0 (or ``None`` if `includeDefault` is False).
    """
    # plain strings are wrapped so they can be read like a file
    if isinstance(fp, getattr(types, 'StringTypes', (str,))):
        fp = StringIO.StringIO(fp)

    ### detection using BOM

    ## the BOMs we know, by their pattern
    bomDict={ # bytepattern: name
             (0x00, 0x00, 0xFE, 0xFF) : "utf_32_be",
             (0xFF, 0xFE, 0x00, 0x00) : "utf_32_le",
             (0xFE, 0xFF, None, None) : "utf_16_be",
             (0xFF, 0xFE, None, None) : "utf_16_le",
             (0xEF, 0xBB, 0xBF, None) : "utf-8",
            }

    oldFP = fp.tell()
    try:
        ## go to beginning of file and get the first 4 bytes
        fp.seek(0)
        # items are 1-char strings, or ints when reading bytes on Python 3
        head = [c if isinstance(c, int) else ord(c) for c in fp.read(4)]
        # pad short input, so fewer than 4 bytes is not an error
        head += [None] * (4 - len(head))
        byte1, byte2, byte3, byte4 = head

        ## try bom detection using 4 bytes, 3 bytes, or 2 bytes
        bomDetection = (bomDict.get((byte1, byte2, byte3, byte4))
                        or bomDict.get((byte1, byte2, byte3, None))
                        or bomDict.get((byte1, byte2, None, None)))

        ## if BOM detected, we're done :-)
        if bomDetection:
            if log:
                log.info(u'XML BOM encoding: %s' % bomDetection)
            return bomDetection

        ## still here? BOM detection failed.
        ##  now that BOM detection has failed we assume one byte character
        ##  encoding behaving ASCII

        ## assume xml declaration fits into the first 2 KB (*cough*)
        fp.seek(0)
        xmlhead = fp.read(2048)
    finally:
        # restore the caller's file position, on errors too
        fp.seek(oldFP)

    ### search xml declaration for encoding attribute

    ## set up regular expression
    xmlDeclPattern = r"""
    ^<\?xml             # w/o BOM, xmldecl starts with <?xml at the first byte
    .+?                 # some chars (version info), matched minimal
    encoding=           # encoding attribute begins
    ["']                # attribute start delimiter
    (?P<encstr>         # what's matched in the brackets will be named encstr
     [^"']+              # every character not delimiter (not overly exact!)
    )                   # closes the brackets pair for the named group
    ["']                # attribute end delimiter
    .*?                 # some chars optionally (standalone decl or whitespace)
    \?>                 # xmldecl end
    """
    xmlDeclRE = re.compile(xmlDeclPattern, re.VERBOSE)

    ## search and extract encoding string
    match = xmlDeclRE.search(xmlhead)
    if match:
        enc = match.group("encstr").lower()
        if log:
            log.info(u'XML encoding="%s"' % enc)
        return enc

    if not includeDefault:
        return None
    if log:
        log.info(u'XML encoding default utf-8')
    return u'utf-8'
445

    
446

    
447
def tryEncodings(text, log=None):
    """If installed uses chardet http://chardet.feedparser.org/ to detect
    encoding, else tries different encodings on `text` and returns the one
    that does not raise an exception which is not very advanced or may
    be totally wrong. The tried encoding are in order 'ascii', 'iso-8859-1',
    'windows-1252' (only reported if the decoded text contains a euro sign,
    as 'iso-8859-1' can decode these strings too) and lastly 'utf-8'.

    :param text:
        a byte string
    :param log:
        optional logger; without it the hint about the simplified
        detection is printed instead
    :returns:
        Working encoding or ``None`` if no encoding does work at all.

        The returned encoding might nevertheless be not the one intended by
        the author as it is only checked if the text might be encoded in
        that encoding. Some texts might be working in "iso-8859-1" *and*
        "windows-1252" *and* "ascii" *and* "utf-8" and ...
    """
    try:
        import chardet
        encoding = chardet.detect(text)["encoding"]

    except ImportError:
        msg = 'Using simplified encoding detection, you might want to install chardet.'
        if log:
            log.warning(msg)
        else:
            # single-argument form works as statement and as function
            print(msg)

        encoding = None
        # NOTE: 'iso-8859-1' decodes any byte string, so 'utf-8' is in
        # practice never reached.
        for candidate in ('ascii', 'iso-8859-1', 'utf-8'):
            try:
                text.decode(candidate)
            except UnicodeDecodeError:
                continue

            if candidate == 'iso-8859-1':
                # A euro sign exists in windows-1252 but not in
                # iso-8859-1, so it hints at windows-1252.
                try:
                    if u'\u20ac' in text.decode('windows-1252'):
                        return 'windows-1252'
                except UnicodeDecodeError:
                    pass

            return candidate

    return encoding
499

    
500

    
501
def getEncodingInfo(response=None, text=u'', log=None, url=None):
    """Find all encoding related information in given `text`.

    Information in headers of supplied HTTPResponse, possible XML
    declaration and X/HTML ``<meta>`` elements are used.

    :param response:
        HTTP response object, e.g. via ``urllib.urlopen('url')``.
        It is only read if `text` is explicitly given as ``None``.
    :param text:
        a byte string to guess encoding for. XML prolog with
        encoding pseudo attribute or HTML meta element will be used to detect
        the encoding
    :param url:
        When given fetches document at `url` and all needed information.
        No `response` or `text` parameters are needed in this case.
        Opening the URL may raise ``IOError``.
    :param log:
        an optional logging logger to which messages may go, if
        no log given all log messages are available from resulting
        ``EncodingInfo``

    :returns:
        instance of :class:`EncodingInfo`.

    How the resulting encoding is retrieved:

    XML
        RFC 3023 states if media type given in the Content-Type HTTP header is
        application/xml, application/xml-dtd,
        application/xml-external-parsed-entity, or any one of the subtypes of
        application/xml such as application/atom+xml or application/rss+xml
        etc then the character encoding is determined in this order:

        1. the encoding given in the charset parameter of the Content-Type HTTP
        header, or
        2. the encoding given in the encoding attribute of the XML declaration
        within the document, or
        3. utf-8.

        Mismatch possibilities:
            - HTTP + XMLdecla
            - HTTP + HTMLmeta

            application/xhtml+xml ?
                XMLdecla + HTMLmeta


        If the media type given in the Content-Type HTTP header is text/xml,
        text/xml-external-parsed-entity, or a subtype like text/Anything+xml,
        the encoding attribute of the XML declaration is ignored completely
        and the character encoding is determined in the order:
        1. the encoding given in the charset parameter of the Content-Type HTTP
        header, or
        2. ascii.

        No mismatch possible.


        If no media type is given the XML encoding pseudo attribute is used
        if present.

        No mismatch possible.

    HTML
        For HTML served as text/html:
            http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2

        1. An HTTP "charset" parameter in a "Content-Type" field.
           (maybe defaults to ISO-8859-1, but should not assume this)
        2. A META declaration with "http-equiv" set to "Content-Type" and a
           value set for "charset".
        3. The charset attribute set on an element that designates an external
           resource. (NOT IMPLEMENTED HERE YET)

        Mismatch possibilities:
            - HTTP + HTMLmeta

    TEXT
        For most text/* types the encoding will be reported as iso-8859-1.
        Exceptions are XML formats send as text/* mime type (see above) and
        text/css which has a default encoding of UTF-8.
    """
    encinfo = EncodingInfo()

    logstream = StringIO.StringIO()
    own_handlers = []
    if not log:
        log = buildlog(stream=logstream, format='%(message)s')
        # Remember the handler(s) writing to our private stream, so they
        # can be detached again from the shared, named logger.
        own_handlers = [h for h in log.handlers
                        if getattr(h, 'stream', None) is logstream]

    fetched = False
    try:
        if url:
            # may cause IOError which is raised
            response = urllib.urlopen(url)
            fetched = True

        # Read the body if the caller asked for it (text=None) or if the
        # document was fetched here and no text was supplied. A response
        # handed in together with the default text is left unread.
        if response is not None and (text is None or (fetched and not text)):
            try:
                text = response.read()
            except IOError as e:
                log.warning(u'Could not read text from response: %s', e)

        if text is None:
            # text must be a string (not None)
            text = ''

        # HTTP
        if response:
            encinfo.http_media_type, encinfo.http_encoding = getHTTPInfo(
                response, log)
            texttype = _getTextTypeByMediaType(encinfo.http_media_type, log)
        else:
            # check if maybe XML or (TODO:) HTML
            texttype = _getTextType(text, log)

        # XML only served as application/xml ! #(also XHTML served as text/html)
        if texttype == _XML_APPLICATION_TYPE:# or texttype == _XML_TEXT_TYPE:
            try:
                encinfo.xml_encoding = detectXMLEncoding(text, log)
            except (AttributeError, ValueError):
                encinfo.xml_encoding = None

        # XML (also XHTML served as text/html)
        if texttype == _HTML_TEXT_TYPE:
            try:
                encinfo.xml_encoding = detectXMLEncoding(
                    text, log, includeDefault=False)
            except (AttributeError, ValueError):
                encinfo.xml_encoding = None

        # HTML
        if texttype == _HTML_TEXT_TYPE or texttype == _TEXT_TYPE:
            encinfo.meta_media_type, encinfo.meta_encoding = getMetaInfo(
                text, log)

        # guess
        # 1. HTTP charset?
        encinfo.encoding = encinfo.http_encoding
        encinfo.mismatch = False

        # 2. media_type?
        #   XML application/...
        if texttype == _XML_APPLICATION_TYPE:
            if not encinfo.encoding:
                encinfo.encoding = encinfo.xml_encoding
                # xml_encoding has default of utf-8

        #   text/html
        elif texttype == _HTML_TEXT_TYPE:
            if not encinfo.encoding:
                encinfo.encoding = encinfo.meta_encoding
            if not encinfo.encoding:
                encinfo.encoding = encodingByMediaType(encinfo.http_media_type)
            if not encinfo.encoding:
                encinfo.encoding = tryEncodings(text)

        #   text/... + xml or text/*
        elif texttype == _XML_TEXT_TYPE or texttype == _TEXT_TYPE:
            if not encinfo.encoding:
                encinfo.encoding = encodingByMediaType(encinfo.http_media_type)

        elif texttype == _TEXT_UTF8:
            if not encinfo.encoding:
                encinfo.encoding = encodingByMediaType(encinfo.http_media_type)

        # possible mismatches, checks if present at all and then if equal
        # HTTP + XML
        if encinfo.http_encoding and encinfo.xml_encoding and\
           encinfo.http_encoding != encinfo.xml_encoding:
            encinfo.mismatch = True
            log.warning(u'"%s" (HTTP) != "%s" (XML) encoding mismatch' %
                     (encinfo.http_encoding, encinfo.xml_encoding))
        # HTTP + Meta
        if encinfo.http_encoding and encinfo.meta_encoding and\
             encinfo.http_encoding != encinfo.meta_encoding:
            encinfo.mismatch = True
            log.warning(u'"%s" (HTTP) != "%s" (HTML <meta>) encoding mismatch' %
                     (encinfo.http_encoding, encinfo.meta_encoding))
        # XML + Meta
        if encinfo.xml_encoding and encinfo.meta_encoding and\
             encinfo.xml_encoding != encinfo.meta_encoding:
            encinfo.mismatch = True
            log.warning(u'"%s" (XML) != "%s" (HTML <meta>) encoding mismatch' %
                     (encinfo.xml_encoding, encinfo.meta_encoding))

        log.info(u'Encoding (probably): %s (Mismatch: %s)',
                 encinfo.encoding, encinfo.mismatch)

        encinfo.logtext = logstream.getvalue()
        return encinfo
    finally:
        # Without this every call would leave one more handler (and its
        # buffer) attached to the shared logger.
        for hdlr in own_handlers:
            log.removeHandler(hdlr)
        # close only a connection opened here, never the caller's response
        if fetched and hasattr(response, 'close'):
            response.close()
686

    
687

    
688
if __name__ == '__main__':
    # Run as a script: show the documentation of this module.
    import pydoc
    module_name = __name__
    pydoc.help(module_name)