Statistics
| Revision:

svn-gvsig-desktop / tags / v1_1_Build_1012 / extensions / extScripting / scripts / jython / Lib / codecs.py @ 12987

History | View | Annotate | Download (17.4 KB)

1
""" codecs -- Python Codec Registry, API and helpers.
2

3

4
Written by Marc-Andre Lemburg (mal@lemburg.com).
5

6
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7

8
"""#"
9

    
10
import struct,types,__builtin__
11

    
12
### Registry and builtin stateless codec functions
13

    
14
try:
15
    from _codecs import *
16
except ImportError,why:
17
    raise SystemError,\
18
          'Failed to load the builtin codecs: %s' % why
19

    
20
# Names exported by "from codecs import *".
__all__ = [
    "register", "lookup", "open", "EncodedFile",
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
]
22

    
23
### Constants
24

    
25
#
# Byte Order Mark (BOM) constants
#

# U+FEFF packed as an unsigned 16-bit value in the byte order of the
# running platform, so BOM equals either BOM_BE or BOM_LE.
BOM = struct.pack('=H', 0xFEFF)

# Two byte marks.
#
# Bytes FE FF: U+FEFF (ZERO WIDTH NO-BREAK SPACE) stored big endian.
BOM_BE = '\376\377'
BOM32_BE = BOM_BE

# Bytes FF FE: U+FEFF stored little endian. Read with the wrong (big
# endian) byte order the same bytes give U+FFFE, which is defined as
# being an illegal Unicode character.
BOM_LE = '\377\376'
BOM32_LE = BOM_LE

# Four byte marks (kept under their historical "64" names).
#
# Bytes 00 00 FE FF: U+0000FEFF stored big endian in UCS-4.
BOM64_BE = '\000\000\376\377'

# Bytes FF FE 00 00: U+0000FEFF stored little endian in UCS-4.
BOM64_LE = '\377\376\000\000'
44

    
45

    
46
### Codec base classes (defining the API)
47

    
48
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme. The following string
        values are defined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - skip the offending character and carry on with
                     the next one
         'replace' - substitute a suitable replacement character;
                     the builtin Unicode codecs use the official
                     U+FFFD REPLACEMENT CHARACTER

        This base class implements nothing: both methods raise
        NotImplementedError and have to be overridden.

    """

    def encode(self, input, errors='strict'):

        """ Encode the object input.

            Returns a tuple (output object, length consumed).

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations may not keep state in the Codec instance.
            Codecs which need state to encode efficiently should be
            written as StreamWriter/StreamReader classes instead.

            Zero length input must be accepted; the result is then
            an empty object of the output object type.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input.

            Returns a tuple (output object, length consumed).

            input must be an object providing the bf_getreadbuf
            buffer slot, e.g. Python strings, buffer objects and
            memory mapped files.

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations may not keep state in the Codec instance.
            Codecs which need state to decode efficiently should be
            written as StreamWriter/StreamReader classes instead.

            Zero length input must be accepted; the result is then
            an empty object of the output object type.

        """
        raise NotImplementedError
104

    
105
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
111

    
112
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream has to be a file-like object which is open for
            writing (binary) data.

            The errors keyword argument selects the error handling
            scheme used when encoding. These values are defined:

             'strict'  - raise a ValueError (or a subclass)
             'ignore'  - skip the character and continue with the next
             'replace' - substitute a suitable replacement character

        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() using self.errors and
            write the encoded data to self.stream.
        """
        encoded, consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list into one string and hand the
            result to .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state,
            so that fresh data can be appended without rescanning
            the whole stream to recover state.

            The base implementation keeps no state and does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the writer itself
            on the underlying stream.
        """
        return getattr(self.stream, name)
166

    
167
###
168

    
169
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            If the size bytes read first cannot be decoded (the
            decoder raises ValueError), single bytes are appended and
            decoding is retried. The ValueError is propagated once the
            stream is exhausted or more than 10 retries were needed.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        tries = 0
        while 1:
            try:
                decoded, decodedbytes = decode(data, self.errors)
            except ValueError:
                # The slice may have ended in the middle of a
                # character: extend it byte by byte and try again.
                # This is slow but should work under pretty much all
                # conditions.
                tries = tries + 1
                newdata = read(1)
                if not newdata or tries > 10:
                    raise
                data = data + newdata
            else:
                return decoded

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line, self.errors)[0]

    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as list of lines.

            The data is decoded in one go and then split using the
            decoded object's .splitlines() method; line breaks are
            included in the list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method. With the default of None the
            stream is read until its .read() returns.

        """
        # The default has to be None: a default of 0 would request
        # zero bytes from the stream and always produce an empty list.
        if sizehint is None:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data, self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            The base implementation keeps no state and does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
292

    
293
###
294

    
295
class StreamReaderWriter:

    """ Wrapper for streams which are used for reading as well as
        for writing.

        The constructor arguments are laid out so that the factory
        functions returned by the codec.lookup() function can be
        passed in directly.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter on top of stream.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader resp. StreamWriter interface;
            both are called with (stream, errors).

            errors has the same meaning as for StreamWriter and
            StreamReader.

        """
        self.errors = errors
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Forward to the reader's .read().
        """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Forward to the reader's .readline().
        """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Forward to the reader's .readlines().
        """
        return self.reader.readlines(sizehint)

    def write(self, data):

        """ Forward to the writer's .write().
        """
        return self.writer.write(data)

    def writelines(self, list):

        """ Forward to the writer's .writelines().
        """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset the reader first, then the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the wrapper itself
            on the underlying stream.
        """
        return getattr(self.stream, name)
358

    
359
###
360

    
361
class StreamRecoder:

    """ StreamRecoder instances give a frontend - backend view of
        encoded data.

        They make use of the complete set of APIs returned by the
        codecs.lookup() function.

        Data passed to .write() is first decoded into an
        intermediate format (which depends on the codec combination
        in use) and then written to the stream through an instance
        of the given Writer class.

        In the other direction, data is read from the stream
        through a Reader instance, encoded and then returned to the
        caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Set up a StreamRecoder implementing a two-way conversion:
            encode and decode work on the frontend (the output of
            .read() and the input to .write()) while Reader and
            Writer work on the backend (reading from and writing to
            the stream).

            These objects can be used for transparent direct
            recodings, e.g. from latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface;
            Reader and Writer must be factory functions or classes
            providing the StreamReader resp. StreamWriter interface.

            Unicode is used as intermediate encoding.

            errors has the same meaning as for StreamWriter and
            StreamReader.

        """
        self.errors = errors
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            with the frontend encoder.
        """
        intermediate = self.reader.read(size)
        encoded, consumed = self.encode(intermediate, self.errors)
        return encoded

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            encoded with the frontend encoder. size is only passed
            on to the reader if it is not None.
        """
        if size is not None:
            intermediate = self.reader.readline(size)
        else:
            intermediate = self.reader.readline()
        encoded, consumed = self.encode(intermediate, self.errors)
        return encoded

    def readlines(self, sizehint=None):

        """ Read via the backend reader's .read(), encode the data
            with the frontend encoder and return it split into a
            list of lines with the line breaks included. sizehint
            is only passed on to the reader if it is not None.
        """
        if sizehint is not None:
            intermediate = self.reader.read(sizehint)
        else:
            intermediate = self.reader.read()
        encoded, consumed = self.encode(intermediate, self.errors)
        return encoded.splitlines(1)

    def write(self, data):

        """ Decode data with the frontend decoder and write the
            result through the backend writer.
        """
        intermediate, consumed = self.decode(data, self.errors)
        return self.writer.write(intermediate)

    def writelines(self, list):

        """ Join the strings in list, decode the result with the
            frontend decoder and write it through the backend
            writer.
        """
        joined = ''.join(list)
        intermediate, consumed = self.decode(joined, self.errors)
        return self.writer.write(intermediate)

    def reset(self):

        """ Reset the reader first, then the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the recoder itself
            on the underlying stream.
        """
        return getattr(self.stream, name)
461

    
462
### Shortcuts
463

    
464
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper
        fails, the file is closed again before the error is
        propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (encode, decode, Reader, Writer) = lookup(encoding)
        srw = StreamReaderWriter(file, Reader, Writer, errors)
    except:
        # Do not leak the open file if the encoding is unknown or
        # the wrapper cannot be built. The bare except is deliberate:
        # whatever went wrong is re-raised unchanged after closing.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
507

    
508
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates between two
        encodings transparently.

        Strings written to the wrapper are interpreted according to
        data_encoding and then written to the original file as
        strings using file_encoding. The intermediate encoding will
        usually be Unicode but depends on the specified codecs.

        Strings read from the file are interpreted using
        file_encoding and handed back to the caller as strings using
        data_encoding.

        file_encoding defaults to data_encoding if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The wrapper carries the two extra attributes .data_encoding
        and .file_encoding, set to the values used, for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding

    # Frontend: the stateless encode/decode pair of the data encoding
    frontend = lookup(data_encoding)
    encode, decode = frontend[:2]

    # Backend: the stream reader/writer factories of the file encoding
    backend = lookup(file_encoding)
    Reader, Writer = backend[2:]

    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)

    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
544

    
545
### Helpers for charmap-based codecs
546

    
547
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the sequence
        rng is both key and value.

    """
    identity = {}
    for element in rng:
        identity[element] = element
    return identity
559

    
560
### Tests
561

    
562
if __name__ == '__main__':

    import sys

    # Latin-1 data written to stdout ends up as UTF-8 on the real stdout
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Latin-1 data arriving on the real stdin is handed out as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')