gvsig-scripting / org.gvsig.scripting / trunk / org.gvsig.scripting / org.gvsig.scripting.app / org.gvsig.scripting.app.mainplugin / src / main / resources-plugin / scripting / lib / encutils / __init__.py @ 475
History | View | Annotate | Download (22.2 KB)
1 |
# -*- coding: utf-8 -*-
|
---|---|
2 |
#!/usr/bin/env python
|
3 |
"""encutils - encoding detection collection for Python
|
4 |
|
5 |
:Version: 0.9.8
|
6 |
:Author: Christof Hoeke, see http://cthedot.de/encutils/
|
7 |
:Contributor: Robert Siemer, Fredrik Hedman <fredrik.hedman@me.com> ported to python3
|
8 |
:Copyright: 2005-2012: Christof Hoeke
|
9 |
:License: encutils has a dual-license, please choose whatever you prefer:
|
10 |
|
11 |
* encutils is published under the
|
12 |
`LGPL 3 or later <http://cthedot.de/encutils/license/>`__
|
13 |
* encutils is published under the
|
14 |
`Creative Commons License <http://creativecommons.org/licenses/by/3.0/>`__.
|
15 |
|
16 |
encutils is free software: you can redistribute it and/or modify
|
17 |
it under the terms of the GNU Lesser General Public License as published by
|
18 |
the Free Software Foundation, either version 3 of the License, or
|
19 |
(at your option) any later version.
|
20 |
|
21 |
encutils is distributed in the hope that it will be useful,
|
22 |
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
23 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
24 |
GNU Lesser General Public License for more details.
|
25 |
|
26 |
You should have received a copy of the GNU Lesser General Public License
|
27 |
along with encutils. If not, see <http://www.gnu.org/licenses/>.
|
28 |
|
29 |
|
30 |
A collection of helper functions to detect encodings of text files (like HTML, XHTML, XML, CSS, etc.) retrieved via HTTP, file or string.
|
31 |
|
32 |
:func:`getEncodingInfo` is probably the main function of interest which uses
|
33 |
other supplied functions itself and gathers all information together and
|
34 |
supplies an :class:`EncodingInfo` object.
|
35 |
|
36 |
example::
|
37 |
|
38 |
>>> import encutils
|
39 |
>>> info = encutils.getEncodingInfo(url='http://cthedot.de/encutils/')
|
40 |
|
41 |
>>> str(info)
|
42 |
utf-8
|
43 |
|
44 |
>>> repr(info) # doctest:+ELLIPSIS
|
45 |
<encutils.EncodingInfo object encoding='utf-8' mismatch=False at...>
|
46 |
|
47 |
>>> info.logtext
|
48 |
HTTP media_type: text/html
|
49 |
HTTP encoding: utf-8
|
50 |
Encoding (probably): utf-8 (Mismatch: False)
|
51 |
<BLANKLINE>
|
52 |
|
53 |
references
|
54 |
XML
|
55 |
RFC 3023 (http://www.ietf.org/rfc/rfc3023.txt)
|
56 |
|
57 |
easier explained in
|
58 |
- http://feedparser.org/docs/advanced.html
|
59 |
- http://www.xml.com/pub/a/2004/07/21/dive.html
|
60 |
|
61 |
HTML
|
62 |
http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2
|
63 |
|
64 |
TODO
|
65 |
- parse @charset of HTML elements?
|
66 |
- check for more texttypes if only text given
|
67 |
"""
|
68 |
# Public names of this module (what ``from encutils import *`` exposes).
__all__ = ['buildlog',
           'encodingByMediaType',
           'getHTTPInfo',
           'getMetaInfo',
           'detectXMLEncoding',
           'getEncodingInfo',
           'tryEncodings',
           'EncodingInfo']
# Markup format used by the docstrings in this module.
__docformat__ = 'restructuredtext'
__author__ = 'Christof Hoeke, Robert Siemer, Fredrik Hedman'
# NOTE(review): '$Id$' looks like an unexpanded version-control keyword;
# the release number itself is kept in VERSION.
__version__ = '$Id$'
|
79 |
|
80 |
import HTMLParser |
81 |
import StringIO |
82 |
import cgi |
83 |
import httplib |
84 |
import re |
85 |
import sys |
86 |
import types |
87 |
import urllib |
88 |
|
89 |
# Release number of encutils.
VERSION = '0.9.8'
# True when the running interpreter is older than Python 3; used to select
# the Python 2 specific code paths in this module.
PY2x = sys.version_info[0] < 3
91 |
|
92 |
|
93 |
class _MetaHTMLParser(HTMLParser.HTMLParser): |
94 |
"""Parse given data for <meta http-equiv="content-type">."""
|
95 |
content_type = None
|
96 |
|
97 |
def handle_starttag(self, tag, attrs): |
98 |
if tag == 'meta' and not self.content_type: |
99 |
atts = dict([(a.lower(), v.lower()) for a, v in attrs]) |
100 |
if atts.get('http-equiv', u'').strip() == u'content-type': |
101 |
self.content_type = atts.get('content') |
102 |
|
103 |
|
104 |
# application/xml, application/xml-dtd, application/xml-external-parsed-entity, or a subtype like application/rss+xml.
|
105 |
_XML_APPLICATION_TYPE = 0
|
106 |
|
107 |
# text/xml, text/xml-external-parsed-entity, or a subtype like text/AnythingAtAll+xml
|
108 |
_XML_TEXT_TYPE = 1
|
109 |
|
110 |
# text/html
|
111 |
_HTML_TEXT_TYPE = 2
|
112 |
|
113 |
# any other of text/* like text/plain, ...
|
114 |
_TEXT_TYPE = 3
|
115 |
|
116 |
# any text/* like which defaults to UTF-8 encoding, for now only text/css
|
117 |
_TEXT_UTF8 = 5
|
118 |
|
119 |
# types not fitting in above types
|
120 |
_OTHER_TYPE = 4
|
121 |
|
122 |
class EncodingInfo(object):
    """
    Container for all encoding related information, as returned by
    :func:`getEncodingInfo`.

    Attributes filled:
        - ``encoding``: The guessed encoding
            Encoding is the explicit or implicit encoding or None and
            always lowercase.

        - from HTTP response
            * ``http_encoding``
            * ``http_media_type``

        - from HTML <meta> element
            * ``meta_encoding``
            * ``meta_media_type``

        - from XML declaration
            * ``xml_encoding``

        - ``mismatch``: True if mismatch between XML declaration and HTTP
            header.
            Mismatch is True if any mismatches between HTTP header, XML
            declaration or textcontent (meta) are found. More detailed
            mismatch reports are written to the optional log or ``logtext``

            Mismatches are not necessarily errors as preferences are defined.
            For details see the specifications.

        - ``logtext``: if no log was given log reports are given here
    """

    def __init__(self):
        """Start with every attribute listed in the class description set
        to ``None``.
        """
        self.encoding = None
        self.mismatch = None
        self.logtext = None
        self.http_encoding = None
        self.http_media_type = None
        self.meta_encoding = None
        self.meta_media_type = None
        self.xml_encoding = None

    def __str__(self):
        """Return the guessed encoding, or the empty string if there is
        none.
        """
        return self.encoding or u''

    def __repr__(self):
        """Return a debugging representation showing ``encoding`` and
        ``mismatch``.
        """
        cls = self.__class__
        return "<%s.%s object encoding=%r mismatch=%s at 0x%x>" % (
            cls.__module__, cls.__name__,
            self.encoding, self.mismatch, id(self))
174 |
|
175 |
|
176 |
def buildlog(logname='encutils', level='INFO', stream=sys.stderr,
             filename=None, filemode="w",
             format='%(levelname)s\t%(message)s'):
    """Helper to build a basic log

    Every call attaches one more handler to the logger named `logname`.

    :param logname:
        name of the logger to configure
    :param level:
        name of a level constant of the ``logging`` module (e.g.
        ``'DEBUG'``); names not found in ``logging`` fall back to ``INFO``
    :param stream:
        stream to log to when no `filename` is given, defaults to
        `sys.stderr`
    :param filename:
        if given the log writes to this file instead of `stream`
    :param filemode:
        mode used to open `filename`
    :param format:
        formatter format of the log
    :returns:
        a log with the name `logname`
    """
    import logging

    logger = logging.getLogger(logname)

    # A file target takes precedence over the stream target.
    handler = (logging.FileHandler(filename, filemode) if filename
               else logging.StreamHandler(stream))
    handler.setFormatter(logging.Formatter(format))
    logger.addHandler(handler)

    # Look the level up by name among the logging module's attributes.
    logger.setLevel(vars(logging).get(level, logging.INFO))

    return logger
|
207 |
|
208 |
|
209 |
def _getTextTypeByMediaType(media_type, log=None):
    """Classify `media_type` into one of the module's text type constants.

    :param media_type:
        a media type like "text/html"; surrounding whitespace and case are
        ignored. An empty or ``None`` value yields ``_OTHER_TYPE``.
    :param log:
        accepted for signature compatibility, not used here
    :returns:
        type as defined by constants in this module
    """
    if not media_type:
        return _OTHER_TYPE

    # In both tuples the first entry is the regular expression for "+xml"
    # subtypes, the remaining entries are exact media type names.
    app_xml = (
        u'application/.*?\\+xml',
        u'application/xml',
        u'application/xml-dtd',
        u'application/xml-external-parsed-entity')
    text_xml = (
        u'text\\/.*?\\+xml',
        u'text/xml',
        u'text/xml-external-parsed-entity')
    flags = re.I | re.S | re.X

    normalized = media_type.strip().lower()

    if normalized in app_xml or re.match(app_xml[0], normalized, flags):
        return _XML_APPLICATION_TYPE
    if normalized in text_xml or re.match(text_xml[0], normalized, flags):
        return _XML_TEXT_TYPE
    if normalized == u'text/html':
        return _HTML_TEXT_TYPE
    if normalized == u'text/css':
        return _TEXT_UTF8
    if normalized.startswith(u'text/'):
        return _TEXT_TYPE
    return _OTHER_TYPE
|
242 |
|
243 |
|
244 |
def _getTextType(text, log=None):
    """Check if given text is XML (**naive test!**)
    used if no content-type given

    :param text:
        byte string or unicode string to inspect
    :param log:
        accepted for signature compatibility, not used here
    :returns:
        ``_XML_APPLICATION_TYPE`` if ``<?xml version=`` occurs within the
        first 30 characters of `text`, else ``_OTHER_TYPE``
    """
    # Use a marker of the same string type as `text`.  Searching a unicode
    # marker in a byte string makes Python 2 decode the bytes as ASCII,
    # which fails for any non-ASCII byte (e.g. a UTF-8 BOM) in the first
    # 30 bytes.
    if isinstance(text, bytes):
        marker = b'<?xml version='
    else:
        marker = u'<?xml version='

    if marker in text[:30]:
        return _XML_APPLICATION_TYPE
    return _OTHER_TYPE
|
252 |
|
253 |
|
254 |
def encodingByMediaType(media_type, log=None):
    """Look up the default encoding of a media type.

    :param media_type:
        a media type like "text/html"
    :param log:
        optional log; the outcome of the lookup is reported to it at
        debug level
    :returns:
        a default encoding for given `media_type`. For example
        ``"utf-8"`` for ``media_type="application/xml"``.

        If no default encoding is available returns ``None``.

        Refers to RFC 3023 and HTTP MIME specification.
    """
    fallback_by_type = {
        _XML_APPLICATION_TYPE: u'utf-8',
        _XML_TEXT_TYPE: u'ascii',
        _HTML_TEXT_TYPE: u'iso-8859-1',  # should be None?
        _TEXT_TYPE: u'iso-8859-1',  # should be None?
        _TEXT_UTF8: u'utf-8',
        _OTHER_TYPE: None,
    }

    encoding = fallback_by_type.get(_getTextTypeByMediaType(media_type),
                                    None)

    if log:
        if encoding:
            log.debug(u'Default encoding for Media Type "%s": %s',
                      media_type, encoding)
        else:
            log.debug(u'"%s" Media-Type has no default encoding',
                      media_type)
    return encoding
|
286 |
|
287 |
|
288 |
def getHTTPInfo(response, log=None):
    """Extract media type and charset from the headers of `response`.

    :param response:
        a HTTP response object; its ``info()`` method must return the
        header object
    :param log:
        optional log; media type and encoding are reported to it at info
        level
    :returns:
        ``(media_type, encoding)`` information from the `response`
        Content-Type HTTP header. (Case of headers is ignored.)
        ``encoding`` is lowercased when present and may be ``None`` if the
        header carries no charset.
    """
    headers = response.info()

    # The header object offers a different API on Python 2 and Python 3.
    if PY2x:
        media_type = headers.gettype()
        encoding = headers.getparam('charset')
    else:
        media_type = headers.get_content_type()
        encoding = headers.get_content_charset()

    encoding = encoding.lower() if encoding else encoding

    if log:
        log.info(u'HTTP media_type: %s', media_type)
        log.info(u'HTTP encoding: %s', encoding)

    return media_type, encoding
|
312 |
|
313 |
|
314 |
def getMetaInfo(text, log=None):
    """Extract media type and charset from an X/HTML ``<meta>`` element.

    :param text:
        a byte string
    :param log:
        optional log; media type and encoding are reported to it at info
        level when a Content-Type ``<meta>`` element was found
    :returns:
        ``(media_type, encoding)`` information from (first)
        X/HTML Content-Type ``<meta>`` element if available in `text`,
        else ``(None, None)``. ``encoding`` is lowercased when present.

        XHTML format::

            <meta http-equiv="Content-Type"
                  content="media_type;charset=encoding" />
    """
    parser = _MetaHTMLParser()

    try:
        parser.feed(text)
    except HTMLParser.HTMLParseError:
        # Best effort: use whatever was found before the parse error.
        pass

    if not parser.content_type:
        return None, None

    media_type, params = cgi.parse_header(parser.content_type)
    encoding = params.get('charset')  # defaults to None
    if encoding:
        encoding = encoding.lower()
    if log:
        log.info(u'HTML META media_type: %s', media_type)
        log.info(u'HTML META encoding: %s', encoding)

    return media_type, encoding
|
346 |
|
347 |
|
348 |
def detectXMLEncoding(fp, log=None, includeDefault=True):
    """Attempt to detect the character encoding of the xml file
    given by a file object `fp`. `fp` must not be a codec wrapped file
    object! `fp` may be a string or unicode string though.

    Based on a recipe by Lars Tiede:
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/363841
    which itself is based on Paul Prescotts recipe:
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52257

    :param fp:
        file object (needs ``tell``, ``seek`` and ``read``) or a string.
        The read position of a file object is restored before returning.
    :param log:
        optional log; the detected encoding is reported at info level
    :param includeDefault:
        if True (default) ``utf-8`` is returned when neither a BOM nor an
        encoding declaration is found, else ``None``
    :returns:
        - if detection of the BOM succeeds, the codec name of the
          corresponding unicode charset is returned

        - if BOM detection fails, the xml declaration is searched for
          the encoding attribute and its value returned (lowercased). the
          "<" character has to be the very first in the file then (it's
          xml standard after all).

        - if BOM and xml declaration fail, utf-8 is returned according
          to XML 1.0 (or ``None`` if `includeDefault` is False).
    """
    # ``types.StringTypes`` (str and unicode) only exists on Python 2;
    # elsewhere plain ``str`` is the string type that gets wrapped.
    string_types = getattr(types, 'StringTypes', (str,))
    if isinstance(fp, string_types):
        fp = StringIO.StringIO(fp)

    ### detection using BOM

    ## the BOMs we know, by their pattern
    bom_encodings = {  # bytepattern: name
        (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
        (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
        (0xFE, 0xFF, None, None): "utf_16_be",
        (0xFF, 0xFE, None, None): "utf_16_le",
        (0xEF, 0xBB, 0xBF, None): "utf-8",
    }

    ## go to beginning of file and get the first 4 bytes
    start_pos = fp.tell()
    fp.seek(0)
    codes = [ord(char) for char in fp.read(4)]
    # Input shorter than four bytes is padded with None so that it is
    # handled like any other text instead of failing while unpacking.
    codes.extend([None] * (4 - len(codes)))
    byte1, byte2, byte3, byte4 = codes

    ## try bom detection using 4 bytes, 3 bytes, or 2 bytes
    bom_encoding = (bom_encodings.get((byte1, byte2, byte3, byte4))
                    or bom_encodings.get((byte1, byte2, byte3, None))
                    or bom_encodings.get((byte1, byte2, None, None)))

    ## if BOM detected, we're done :-)
    if bom_encoding:
        if log:
            log.info(u'XML BOM encoding: %s' % bom_encoding)
        fp.seek(start_pos)
        return bom_encoding

    ## still here? BOM detection failed.
    ## now that BOM detection has failed we assume one byte character
    ## encoding behaving ASCII

    ### search xml declaration for encoding attribute

    ## assume xml declaration fits into the first 2 KB (*cough*)
    fp.seek(0)
    head = fp.read(2048)
    fp.seek(start_pos)

    ## set up regular expression
    xml_decl_re = re.compile(r"""
        ^<\?xml             # w/o BOM, xmldecl starts with <?xml at the first byte
        .+?                 # some chars (version info), matched minimal
        encoding=           # encoding attribute begins
        ["']                # attribute start delimiter
        (?P<encstr>         # what's matched in the brackets will be named encstr
         [^"']+             # every character not delimiter (not overly exact!)
        )                   # closes the brackets pair for the named group
        ["']                # attribute end delimiter
        .*?                 # some chars optionally (standalone decl or whitespace)
        \?>                 # xmldecl end
        """, re.VERBOSE)

    ## search and extract encoding string
    match = xml_decl_re.search(head)
    if match:
        enc = match.group("encstr").lower()
        if log:
            log.info(u'XML encoding="%s"' % enc)
        return enc

    if not includeDefault:
        return None
    if log:
        log.info(u'XML encoding default utf-8')
    return u'utf-8'
445 |
|
446 |
|
447 |
def tryEncodings(text, log=None):
    """If installed uses chardet http://chardet.feedparser.org/ to detect
    encoding, else tries different encodings on `text` and returns the one
    that does not raise an exception which is not very advanced or may
    be totally wrong. The tried encodings are in order 'ascii',
    'iso-8859-1' and lastly 'utf-8'. If the text decodes as 'iso-8859-1'
    and, decoded as 'windows-1252', contains a Euro sign,
    'windows-1252' is returned instead.

    :param text:
        a byte string
    :param log:
        optional log which receives a warning when chardet is missing;
        without a log the warning is printed
    :returns:
        Working encoding (lowercase) or ``None`` if no encoding does work
        at all.

        The returned encoding might nevertheless be not the one intended by
        the author as it is only checked if the text might be encoded in
        that encoding. Some texts might be working in "iso-8859-1" *and*
        "windows-1252" *and* "ascii" *and* "utf-8" and ...
    """
    try:
        import chardet
    except ImportError:
        chardet = None

    if chardet is not None:
        encoding = chardet.detect(text)["encoding"]
        # Detector output may be mixed case; the rest of this module
        # reports encodings in lowercase.
        if encoding:
            encoding = encoding.lower()
        return encoding

    msg = 'Using simplified encoding detection, you might want to install chardet.'
    if log:
        log.warning(msg)
    else:
        print(msg)

    for candidate in ('ascii', 'iso-8859-1', 'utf-8'):
        try:
            text.decode(candidate)
        except UnicodeDecodeError:
            continue

        if candidate == 'iso-8859-1':
            # iso-8859-1 accepts any byte; prefer windows-1252 when the
            # text then contains a Euro sign (U+20AC).
            try:
                if u'\u20ac' in text.decode('windows-1252'):
                    return 'windows-1252'
            except UnicodeDecodeError:
                pass

        return candidate

    return None
|
499 |
|
500 |
|
501 |
def getEncodingInfo(response=None, text=u'', log=None, url=None):
    """Find all encoding related information in given `text`.

    Information in headers of supplied HTTPResponse, possible XML
    declaration and X/HTML ``<meta>`` elements are used.

    :param response:
        HTTP response object, e.g. via ``urllib.urlopen('url')``. It is
        only read from if `text` is ``None``.
    :param text:
        a byte string to guess encoding for. XML prolog with
        encoding pseudo attribute or HTML meta element will be used to detect
        the encoding
    :param url:
        When given fetches document at `url` and all needed information.
        No `response` or `text` parameters are needed in this case: if no
        `text` is supplied the document body is read from the fetched
        response, which is closed again before returning.
    :param log:
        an optional logging logger to which messages may go, if
        no log given all log messages are available from resulting
        ``EncodingInfo``

    :returns:
        instance of :class:`EncodingInfo`.

    How the resulting encoding is retrieved:

    XML
        RFC 3023 states if media type given in the Content-Type HTTP header is
        application/xml, application/xml-dtd,
        application/xml-external-parsed-entity, or any one of the subtypes of
        application/xml such as application/atom+xml or application/rss+xml
        etc then the character encoding is determined in this order:

        1. the encoding given in the charset parameter of the Content-Type HTTP
           header, or
        2. the encoding given in the encoding attribute of the XML declaration
           within the document, or
        3. utf-8.

        Mismatch possibilities:
            - HTTP + XMLdecla
            - HTTP + HTMLmeta

            application/xhtml+xml ?
                XMLdecla + HTMLmeta


        If the media type given in the Content-Type HTTP header is text/xml,
        text/xml-external-parsed-entity, or a subtype like text/Anything+xml,
        the encoding attribute of the XML declaration is ignored completely
        and the character encoding is determined in the order:
        1. the encoding given in the charset parameter of the Content-Type HTTP
           header, or
        2. ascii.

        No mismatch possible.


        If no media type is given the XML encoding pseudo attribute is used
        if present.

        No mismatch possible.

    HTML
        For HTML served as text/html:
            http://www.w3.org/TR/REC-html40/charset.html#h-5.2.2

        1. An HTTP "charset" parameter in a "Content-Type" field.
           (maybe defaults to ISO-8859-1, but should not assume this)
        2. A META declaration with "http-equiv" set to "Content-Type" and a
           value set for "charset".
        3. The charset attribute set on an element that designates an external
           resource. (NOT IMPLEMENTED HERE YET)

        Mismatch possibilities:
            - HTTP + HTMLmeta

    TEXT
        For most text/* types the encoding will be reported as iso-8859-1.
        Exceptions are XML formats sent as text/* mime type (see above) and
        text/css which has a default encoding of UTF-8.
    """
    fetched = False
    if url:
        # may cause IOError which is raised
        response = urllib.urlopen(url)
        fetched = True

    logstream = StringIO.StringIO()
    own_handler = None
    try:
        # Read the body if the caller asked for it (text=None) or if this
        # call opened the URL itself and no text was supplied.  A response
        # owned by the caller is otherwise left unread.
        if response is not None and (
                text is None or (fetched and not text)):
            try:
                text = response.read()
            except IOError:
                pass

        if text is None:
            # text must be a string (not None)
            text = ''

        if not log:
            log = buildlog(stream=logstream, format='%(message)s')
            # Remember the handler writing to our private stream so it can
            # be detached again; otherwise every call would leave one more
            # handler on the shared logger.
            for hdlr in log.handlers:
                if getattr(hdlr, 'stream', None) is logstream:
                    own_handler = hdlr

        encinfo = EncodingInfo()
        _fillEncodingInfo(encinfo, response, text, log)
        encinfo.logtext = logstream.getvalue()
        return encinfo
    finally:
        if own_handler is not None:
            log.removeHandler(own_handler)
        if fetched:
            response.close()


def _fillEncodingInfo(encinfo, response, text, log):
    """Collect HTTP, XML and <meta> encoding data into `encinfo` and
    derive ``encinfo.encoding`` and ``encinfo.mismatch`` from it.
    """
    # HTTP
    if response:
        encinfo.http_media_type, encinfo.http_encoding = getHTTPInfo(
            response, log)
        texttype = _getTextTypeByMediaType(encinfo.http_media_type, log)
    else:
        # check if maybe XML or (TODO:) HTML
        texttype = _getTextType(text, log)

    # XML declaration: used for XML served as application/xml (with the
    # utf-8 default) and for XHTML served as text/html (without default)
    if texttype in (_XML_APPLICATION_TYPE, _HTML_TEXT_TYPE):
        try:
            if texttype == _XML_APPLICATION_TYPE:
                encinfo.xml_encoding = detectXMLEncoding(text, log)
            else:
                encinfo.xml_encoding = detectXMLEncoding(
                    text, log, includeDefault=False)
        except (AttributeError, ValueError):
            encinfo.xml_encoding = None

    # HTML
    if texttype == _HTML_TEXT_TYPE or texttype == _TEXT_TYPE:
        encinfo.meta_media_type, encinfo.meta_encoding = getMetaInfo(
            text, log)

    # guess
    # 1. HTTP charset?
    encinfo.encoding = encinfo.http_encoding
    encinfo.mismatch = False

    # 2. media_type?
    #   XML application/...
    if texttype == _XML_APPLICATION_TYPE:
        if not encinfo.encoding:
            encinfo.encoding = encinfo.xml_encoding
            # xml_encoding has default of utf-8

    #   text/html
    elif texttype == _HTML_TEXT_TYPE:
        if not encinfo.encoding:
            encinfo.encoding = encinfo.meta_encoding
        if not encinfo.encoding:
            encinfo.encoding = encodingByMediaType(encinfo.http_media_type)
        if not encinfo.encoding:
            encinfo.encoding = tryEncodings(text)

    #   text/... + xml, text/* or text/css
    elif texttype in (_XML_TEXT_TYPE, _TEXT_TYPE, _TEXT_UTF8):
        if not encinfo.encoding:
            encinfo.encoding = encodingByMediaType(encinfo.http_media_type)

    # possible mismatches, checks if present at all and then if equal
    # HTTP + XML
    if encinfo.http_encoding and encinfo.xml_encoding and\
       encinfo.http_encoding != encinfo.xml_encoding:
        encinfo.mismatch = True
        log.warning(u'"%s" (HTTP) != "%s" (XML) encoding mismatch' %
                    (encinfo.http_encoding, encinfo.xml_encoding))
    # HTTP + Meta
    if encinfo.http_encoding and encinfo.meta_encoding and\
       encinfo.http_encoding != encinfo.meta_encoding:
        encinfo.mismatch = True
        log.warning(u'"%s" (HTTP) != "%s" (HTML <meta>) encoding mismatch' %
                    (encinfo.http_encoding, encinfo.meta_encoding))
    # XML + Meta
    if encinfo.xml_encoding and encinfo.meta_encoding and\
       encinfo.xml_encoding != encinfo.meta_encoding:
        encinfo.mismatch = True
        log.warning(u'"%s" (XML) != "%s" (HTML <meta>) encoding mismatch' %
                    (encinfo.xml_encoding, encinfo.meta_encoding))

    log.info(u'Encoding (probably): %s (Mismatch: %s)',
             encinfo.encoding, encinfo.mismatch)
|
686 |
|
687 |
|
688 |
if __name__ == '__main__':
    # Executed as a script: show the documentation of this module.
    import pydoc
    show_docs = pydoc.help
    show_docs(__name__)