# pack.py -- For dealing with packed git objects.
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@samba.org>
#
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version
# 2.0 or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#

"""Classes for dealing with packed git objects.

A pack is a compact representation of a bunch of objects, stored
using deltas where possible.

They have two parts, the pack file, which stores the data, and an index
that tells you where the data is.

To find an object you look in each of the index files until you find a
match for the object name. The index then gives you an offset into the
corresponding pack file, where the object data can be read.
"""

from collections import (
    defaultdict,
    deque,
    )

import binascii
from io import BytesIO, UnsupportedOperation
import difflib
import struct

from itertools import chain
try:
    from itertools import imap, izip
except ImportError:
    # Python3
    imap = map
    izip = zip

import os
import sys

try:
    import mmap
except ImportError:
    has_mmap = False
else:
    has_mmap = True

# For some reason the above try, except fails to set has_mmap = False for plan9
if sys.platform == 'Plan9':
    has_mmap = False

from hashlib import sha1
from os import (
    SEEK_CUR,
    SEEK_END,
    )
from struct import unpack_from
import zlib

from dulwich.errors import (
    ApplyDeltaError,
    ChecksumMismatch,
    )
from dulwich.file import GitFile
from dulwich.lru_cache import (
    LRUSizeCache,
    )
from dulwich.objects import (
    ShaFile,
    hex_to_sha,
    sha_to_hex,
    object_header,
    )


OFS_DELTA = 6
REF_DELTA = 7

DELTA_TYPES = (OFS_DELTA, REF_DELTA)


DEFAULT_PACK_DELTA_WINDOW_SIZE = 10


def take_msb_bytes(read, crc32=None):
    """Read bytes marked with most significant bit.

    :param read: Read function
    :param crc32: Starting CRC32 checksum to update, or None to skip
    :return: Tuple of (list of byte values read, updated CRC32 or None)
    """
    ret = []
    while len(ret) == 0 or ret[-1] & 0x80:
        b = read(1)
        if crc32 is not None:
            crc32 = binascii.crc32(b, crc32)
        ret.append(ord(b[:1]))
    return ret, crc32
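
# Illustrative example (not in the upstream source): the MSB of each byte
# is a continuation flag, so b'\x91\x2e' decodes as two bytes -- 0x91 has
# its MSB set, 0x2e ends the run.
#
#     take_msb_bytes(BytesIO(b'\x91\x2e').read)
#     # -> ([0x91, 0x2e], None), i.e. ([145, 46], None)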


class UnpackedObject(object):
    """Class encapsulating an object unpacked from a pack file.

    These objects should only be created from within unpack_object. Most
    members start out as empty and are filled in at various points by
    read_zlib_chunks, unpack_object, DeltaChainIterator, etc.

    End users of this object should take care that the function they're getting
    this object from is guaranteed to set the members they need.
    """

    __slots__ = [
      'offset',         # Offset in its pack.
      '_sha',           # Cached binary SHA.
      'obj_type_num',   # Type of this object.
      'obj_chunks',     # Decompressed and delta-resolved chunks.
      'pack_type_num',  # Type of this object in the pack (may be a delta).
      'delta_base',     # Delta base offset or SHA.
      'comp_chunks',    # Compressed object chunks.
      'decomp_chunks',  # Decompressed object chunks.
      'decomp_len',     # Decompressed length of this object.
      'crc32',          # CRC32.
      ]

    # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
    # methods of this object.
    def __init__(self, pack_type_num, delta_base, decomp_len, crc32):
        self.offset = None
        self._sha = None
        self.pack_type_num = pack_type_num
        self.delta_base = delta_base
        self.comp_chunks = None
        self.decomp_chunks = []
        self.decomp_len = decomp_len
        self.crc32 = crc32

        if pack_type_num in DELTA_TYPES:
            self.obj_type_num = None
            self.obj_chunks = None
        else:
            self.obj_type_num = pack_type_num
            self.obj_chunks = self.decomp_chunks
            self.delta_base = delta_base

    def sha(self):
        """Return the binary SHA of this object."""
        if self._sha is None:
            self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
        return self._sha

    def sha_file(self):
        """Return a ShaFile from this object."""
        return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)

    # Only provided for backwards compatibility with code that expects either
    # chunks or a delta tuple.
    def _obj(self):
        """Return the decompressed chunks, or (delta base, delta chunks)."""
        if self.pack_type_num in DELTA_TYPES:
            return (self.delta_base, self.decomp_chunks)
        else:
            return self.decomp_chunks

    def __eq__(self, other):
        if not isinstance(other, UnpackedObject):
            return False
        for slot in self.__slots__:
            if getattr(self, slot) != getattr(other, slot):
                return False
        return True

    def __ne__(self, other):
        return not (self == other)

    def __repr__(self):
        data = ['%s=%r' % (s, getattr(self, s)) for s in self.__slots__]
        return '%s(%s)' % (self.__class__.__name__, ', '.join(data))


_ZLIB_BUFSIZE = 4096


def read_zlib_chunks(read_some, unpacked, include_comp=False,
                     buffer_size=_ZLIB_BUFSIZE):
    """Read zlib data from a buffer.

    This function requires that the buffer have additional data following the
    compressed data, which is guaranteed to be the case for git pack files.

    :param read_some: Read function that returns at least one byte, but may
        return less than the requested size.
    :param unpacked: An UnpackedObject to write result data to. If its crc32
        attr is not None, the CRC32 of the compressed bytes will be computed
        using this starting CRC32.
        After this function, will have the following attrs set:
        * comp_chunks    (if include_comp is True)
        * decomp_chunks
        * decomp_len
        * crc32
    :param include_comp: If True, include compressed data in the result.
    :param buffer_size: Size of the read buffer.
    :return: Leftover unused data from the decompression.
    :raise zlib.error: if a decompression error occurred.
    """
    if unpacked.decomp_len <= -1:
        raise ValueError('non-negative zlib data stream size expected')
    decomp_obj = zlib.decompressobj()

    comp_chunks = []
    decomp_chunks = unpacked.decomp_chunks
    decomp_len = 0
    crc32 = unpacked.crc32

    while True:
        add = read_some(buffer_size)
        if not add:
            raise zlib.error('EOF before end of zlib stream')
        comp_chunks.append(add)
        decomp = decomp_obj.decompress(add)
        decomp_len += len(decomp)
        decomp_chunks.append(decomp)
        unused = decomp_obj.unused_data
        if unused:
            left = len(unused)
            if crc32 is not None:
                crc32 = binascii.crc32(add[:-left], crc32)
            if include_comp:
                comp_chunks[-1] = add[:-left]
            break
        elif crc32 is not None:
            crc32 = binascii.crc32(add, crc32)
    if crc32 is not None:
        crc32 &= 0xffffffff

    if decomp_len != unpacked.decomp_len:
        raise zlib.error('decompressed data does not match expected size')

    unpacked.crc32 = crc32
    if include_comp:
        unpacked.comp_chunks = comp_chunks
    return unused
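
# Minimal sketch (illustrative): a pack entry's zlib stream is always
# followed by more pack data, which comes back as the leftover. Type 3 is
# a blob; 5 is the expected decompressed length.
#
#     data = zlib.compress(b'hello') + b'more pack data'
#     unpacked = UnpackedObject(3, None, 5, None)
#     read_zlib_chunks(BytesIO(data).read, unpacked)
#     # -> b'more pack data'; unpacked.decomp_chunks == [b'hello']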


def iter_sha1(iter):
    """Return the hexdigest of the SHA1 over a set of names.

    :param iter: Iterator over string objects
    :return: 40-byte hex sha1 digest
    """
    sha = sha1()
    for name in iter:
        sha.update(name)
    return sha.hexdigest().encode('ascii')


def load_pack_index(path):
    """Load an index file by path.

    :param path: Path to the index file
    :return: A PackIndex loaded from the given path
    """
    with GitFile(path, 'rb') as f:
        return load_pack_index_file(path, f)


def _load_file_contents(f, size=None):
    try:
        fd = f.fileno()
    except (UnsupportedOperation, AttributeError):
        fd = None
    # Attempt to use mmap if possible
    if fd is not None:
        if size is None:
            size = os.fstat(fd).st_size
        if has_mmap:
            try:
                contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
            except mmap.error:
                # Perhaps a socket?
                pass
            else:
                return contents, size
    contents = f.read()
    size = len(contents)
    return contents, size


def load_pack_index_file(path, f):
    """Load an index file from a file-like object.

    :param path: Path for the index file
    :param f: File-like object
    :return: A PackIndex loaded from the given file
    """
    contents, size = _load_file_contents(f)
    if contents[:4] == b'\377tOc':
        version = struct.unpack(b'>L', contents[4:8])[0]
        if version == 2:
            return PackIndex2(path, file=f, contents=contents,
                              size=size)
        else:
            raise KeyError('Unknown pack index format %d' % version)
    else:
        return PackIndex1(path, file=f, contents=contents, size=size)


def bisect_find_sha(start, end, sha, unpack_name):
    """Find a SHA in a data blob with sorted SHAs.

    :param start: Start index of range to search
    :param end: End index of range to search
    :param sha: Sha to find
    :param unpack_name: Callback to retrieve SHA by index
    :return: Index of the SHA, or None if it wasn't found
    """
    assert start <= end
    while start <= end:
        i = (start + end) // 2
        file_sha = unpack_name(i)
        if file_sha < sha:
            start = i + 1
        elif file_sha > sha:
            end = i - 1
        else:
            return i
    return None
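
# Illustrative: given a callback over a sorted sequence of binary shas,
# bisect_find_sha returns the position of a match (end is inclusive).
#
#     shas = sorted(...)                  # 20-byte binary shas
#     bisect_find_sha(0, len(shas) - 1, shas[3], shas.__getitem__)
#     # -> 3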


class PackIndex(object):
    """An index into a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.
    """

    def __eq__(self, other):
        if not isinstance(other, PackIndex):
            return False

        for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
                                                 other.iterentries()):
            if name1 != name2:
                return False
        return True

    def __ne__(self, other):
        return not self.__eq__(other)

    def __len__(self):
        """Return the number of entries in this pack index."""
        raise NotImplementedError(self.__len__)

    def __iter__(self):
        """Iterate over the SHAs in this pack."""
        return imap(sha_to_hex, self._itersha())

    def iterentries(self):
        """Iterate over the entries in this pack index.

        :return: iterator over tuples with object name, offset in packfile and
            crc32 checksum.
        """
        raise NotImplementedError(self.iterentries)

    def get_pack_checksum(self):
        """Return the SHA1 checksum stored for the corresponding packfile.

        :return: 20-byte binary digest
        """
        raise NotImplementedError(self.get_pack_checksum)

    def object_index(self, sha):
        """Return the offset into the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file. If the pack file doesn't
        have the object then a KeyError is raised.
        """
        if len(sha) == 40:
            sha = hex_to_sha(sha)
        return self._object_index(sha)

    def _object_index(self, sha):
        """See object_index.

        :param sha: A *binary* SHA string (20 bytes long).
        """
        raise NotImplementedError(self._object_index)

    def objects_sha1(self):
        """Return the hex SHA1 over all the shas of all objects in this pack.

        :note: This is used for the filename of the pack.
        """
        return iter_sha1(self._itersha())

    def _itersha(self):
        """Yield all the SHA1's of the objects in the index, sorted."""
        raise NotImplementedError(self._itersha)


class MemoryPackIndex(PackIndex):
    """Pack index that is stored entirely in memory."""

    def __init__(self, entries, pack_checksum=None):
        """Create a new MemoryPackIndex.

        :param entries: Sequence of name, idx, crc32 (sorted)
        :param pack_checksum: Optional pack checksum
        """
        self._by_sha = {}
        for name, idx, crc32 in entries:
            self._by_sha[name] = idx
        self._entries = entries
        self._pack_checksum = pack_checksum

    def get_pack_checksum(self):
        return self._pack_checksum

    def __len__(self):
        return len(self._entries)

    def _object_index(self, sha):
        return self._by_sha[sha]

    def _itersha(self):
        return iter(self._by_sha)

    def iterentries(self):
        return iter(self._entries)


class FilePackIndex(PackIndex):
    """Pack index that is based on a file.

    To do the lookup, the index stores a fan-out table of 256 four-byte
    counts, indexed by the first byte of the sha id. The value at a given
    byte is the end of the group of entries whose shas start with that byte;
    the value at the previous byte gives the start of the group. Entries
    within a group are sorted by sha id, so once the start and end offsets
    are known, a bisection over that range determines whether the value is
    present.
    """

    def __init__(self, filename, file=None, contents=None, size=None):
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will map
        it whenever required.
        """
        self._filename = filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        if file is None:
            self._file = GitFile(filename, 'rb')
        else:
            self._file = file
        if contents is None:
            self._contents, self._size = _load_file_contents(self._file, size)
        else:
            self._contents, self._size = (contents, size)

    def __eq__(self, other):
        # Quick optimization:
        if (isinstance(other, FilePackIndex) and
            self._fan_out_table != other._fan_out_table):
            return False

        return super(FilePackIndex, self).__eq__(other)

    def close(self):
        self._file.close()
        if getattr(self._contents, "close", None) is not None:
            self._contents.close()

    def __len__(self):
        """Return the number of entries in this pack index."""
        return self._fan_out_table[-1]

    def _unpack_entry(self, i):
        """Unpack the i-th entry in the index file.

        :return: Tuple with object name (SHA), offset in pack file and CRC32
            checksum (if known).
        """
        raise NotImplementedError(self._unpack_entry)

    def _unpack_name(self, i):
        """Unpack the i-th name from the index file."""
        raise NotImplementedError(self._unpack_name)

    def _unpack_offset(self, i):
        """Unpack the i-th object offset from the index file."""
        raise NotImplementedError(self._unpack_offset)

    def _unpack_crc32_checksum(self, i):
        """Unpack the crc32 checksum for the i-th object from the index file."""
        raise NotImplementedError(self._unpack_crc32_checksum)

    def _itersha(self):
        for i in range(len(self)):
            yield self._unpack_name(i)

    def iterentries(self):
        """Iterate over the entries in this pack index.

        :return: iterator over tuples with object name, offset in packfile and
            crc32 checksum.
        """
        for i in range(len(self)):
            yield self._unpack_entry(i)

    def _read_fan_out_table(self, start_offset):
        ret = []
        for i in range(0x100):
            fanout_entry = self._contents[start_offset+i*4:start_offset+(i+1)*4]
            ret.append(struct.unpack('>L', fanout_entry)[0])
        return ret

    def check(self):
        """Check that the stored checksum matches the actual checksum."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        if actual != stored:
            raise ChecksumMismatch(stored, actual)

    def calculate_checksum(self):
        """Calculate the SHA1 checksum over this pack index.

        :return: This is a 20-byte binary digest
        """
        return sha1(self._contents[:-20]).digest()

    def get_pack_checksum(self):
        """Return the SHA1 checksum stored for the corresponding packfile.

        :return: 20-byte binary digest
        """
        return bytes(self._contents[-40:-20])

    def get_stored_checksum(self):
        """Return the SHA1 checksum stored for this index.

        :return: 20-byte binary digest
        """
        return bytes(self._contents[-20:])

    def _object_index(self, sha):
        """See object_index.

        :param sha: A *binary* SHA string (20 bytes long).
        """
        assert len(sha) == 20
        idx = ord(sha[:1])
        if idx == 0:
            start = 0
        else:
            start = self._fan_out_table[idx-1]
        end = self._fan_out_table[idx]
        i = bisect_find_sha(start, end, sha, self._unpack_name)
        if i is None:
            raise KeyError(sha)
        return self._unpack_offset(i)


class PackIndex1(FilePackIndex):
    """Version 1 Pack Index file."""

    def __init__(self, filename, file=None, contents=None, size=None):
        super(PackIndex1, self).__init__(filename, file, contents, size)
        self.version = 1
        self._fan_out_table = self._read_fan_out_table(0)

    def _unpack_entry(self, i):
        (offset, name) = unpack_from('>L20s', self._contents,
                                     (0x100 * 4) + (i * 24))
        return (name, offset, None)

    def _unpack_name(self, i):
        offset = (0x100 * 4) + (i * 24) + 4
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = (0x100 * 4) + (i * 24)
        return unpack_from('>L', self._contents, offset)[0]

    def _unpack_crc32_checksum(self, i):
        # Not stored in v1 index files
        return None


class PackIndex2(FilePackIndex):
    """Version 2 Pack Index file."""

    def __init__(self, filename, file=None, contents=None, size=None):
        super(PackIndex2, self).__init__(filename, file, contents, size)
        if self._contents[:4] != b'\377tOc':
            raise AssertionError('Not a v2 pack index file')
        (self.version, ) = unpack_from(b'>L', self._contents, 4)
        if self.version != 2:
            raise AssertionError('Version was %d' % self.version)
        self._fan_out_table = self._read_fan_out_table(8)
        self._name_table_offset = 8 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + 20 * len(self)
        self._pack_offset_table_offset = (self._crc32_table_offset +
                                          4 * len(self))
        self._pack_offset_largetable_offset = (
            self._pack_offset_table_offset + 4 * len(self))

    def _unpack_entry(self, i):
        return (self._unpack_name(i), self._unpack_offset(i),
                self._unpack_crc32_checksum(i))

    def _unpack_name(self, i):
        offset = self._name_table_offset + i * 20
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = self._pack_offset_table_offset + i * 4
        offset = unpack_from('>L', self._contents, offset)[0]
        if offset & (2**31):
            offset = (self._pack_offset_largetable_offset +
                      (offset & (2**31 - 1)) * 8)
            offset = unpack_from('>Q', self._contents, offset)[0]
        return offset

    def _unpack_crc32_checksum(self, i):
        return unpack_from('>L', self._contents,
                           self._crc32_table_offset + i * 4)[0]


def read_pack_header(read):
    """Read the header of a pack file.

    :param read: Read function
    :return: Tuple of (pack version, number of objects). If no data is
        available to read, returns (None, None).
    """
    header = read(12)
    if not header:
        return None, None
    if header[:4] != b'PACK':
        raise AssertionError('Invalid pack header %r' % header)
    (version,) = unpack_from(b'>L', header, 4)
    if version not in (2, 3):
        raise AssertionError('Version was %d' % version)
    (num_objects,) = unpack_from(b'>L', header, 8)
    return (version, num_objects)
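
# Illustrative (not part of the upstream module): a version 2 header that
# announces two objects.
#
#     read_pack_header(BytesIO(b'PACK\x00\x00\x00\x02\x00\x00\x00\x02').read)
#     # -> (2, 2)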


def chunks_length(chunks):
    if isinstance(chunks, bytes):
        return len(chunks)
    else:
        return sum(imap(len, chunks))


def unpack_object(read_all, read_some=None, compute_crc32=False,
                  include_comp=False, zlib_bufsize=_ZLIB_BUFSIZE):
    """Unpack a Git object.

    :param read_all: Read function that blocks until the number of requested
        bytes are read.
    :param read_some: Read function that returns at least one byte, but may not
        return the number of bytes requested.
    :param compute_crc32: If True, compute the CRC32 of the compressed data. If
        False, the returned CRC32 will be None.
    :param include_comp: If True, include compressed data in the result.
    :param zlib_bufsize: An optional buffer size for zlib operations.
    :return: A tuple of (unpacked, unused), where unused is the unused data
        leftover from decompression, and unpacked is an UnpackedObject with
        the following attrs set:

        * obj_chunks     (for non-delta types)
        * pack_type_num
        * delta_base     (for delta types)
        * comp_chunks    (if include_comp is True)
        * decomp_chunks
        * decomp_len
        * crc32          (if compute_crc32 is True)
    """
    if read_some is None:
        read_some = read_all
    if compute_crc32:
        crc32 = 0
    else:
        crc32 = None

    bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
    type_num = (bytes[0] >> 4) & 0x07
    size = bytes[0] & 0x0f
    for i, byte in enumerate(bytes[1:]):
        size += (byte & 0x7f) << ((i * 7) + 4)

    raw_base = len(bytes)
    if type_num == OFS_DELTA:
        bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
        raw_base += len(bytes)
        if bytes[-1] & 0x80:
            raise AssertionError
        delta_base_offset = bytes[0] & 0x7f
        for byte in bytes[1:]:
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += (byte & 0x7f)
        delta_base = delta_base_offset
    elif type_num == REF_DELTA:
        delta_base = read_all(20)
        if compute_crc32:
            crc32 = binascii.crc32(delta_base, crc32)
        raw_base += 20
    else:
        delta_base = None

    unpacked = UnpackedObject(type_num, delta_base, size, crc32)
    unused = read_zlib_chunks(read_some, unpacked, buffer_size=zlib_bufsize,
                              include_comp=include_comp)
    return unpacked, unused
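
# Round-trip sketch (illustrative): encode a 5-byte blob header with
# pack_object_header (defined later in this module), append a zlib stream
# and some trailing filler -- read_zlib_chunks requires data after the
# stream -- and read it back.
#
#     raw = (bytes(pack_object_header(3, None, 5)) +
#            zlib.compress(b'hello') + b'\x00' * 20)
#     unpacked, unused = unpack_object(BytesIO(raw).read)
#     # unpacked.pack_type_num == 3
#     # b''.join(unpacked.decomp_chunks) == b'hello'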


def _compute_object_size(value):
    """Compute the size of an unresolved object for use with LRUSizeCache."""
    (num, obj) = value
    if num in DELTA_TYPES:
        return chunks_length(obj[1])
    return chunks_length(obj)


class PackStreamReader(object):
    """Class to read a pack stream.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate.
    """

    def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE):
        self.read_all = read_all
        if read_some is None:
            self.read_some = read_all
        else:
            self.read_some = read_some
        self.sha = sha1()
        self._offset = 0
        self._rbuf = BytesIO()
        # trailer is a deque to avoid memory allocation on small reads
        self._trailer = deque()
        self._zlib_bufsize = zlib_bufsize

    def _read(self, read, size):
        """Read up to size bytes using the given callback.

        As a side effect, update the verifier's hash (excluding the last 20
        bytes read).

        :param read: The read callback to read from.
        :param size: The maximum number of bytes to read; the particular
            behavior is callback-specific.
        """
        data = read(size)

        # maintain a trailer of the last 20 bytes we've read
        n = len(data)
        self._offset += n
        tn = len(self._trailer)
        if n >= 20:
            to_pop = tn
            to_add = 20
        else:
            to_pop = max(n + tn - 20, 0)
            to_add = n
        self.sha.update(
            bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)])))
        self._trailer.extend(data[-to_add:])

        # hash everything but the trailer
        self.sha.update(data[:-to_add])
        return data

    def _buf_len(self):
        buf = self._rbuf
        start = buf.tell()
        buf.seek(0, SEEK_END)
        end = buf.tell()
        buf.seek(start)
        return end - start

    @property
    def offset(self):
        return self._offset - self._buf_len()

    def read(self, size):
        """Read, blocking until size bytes are read."""
        buf_len = self._buf_len()
        if buf_len >= size:
            return self._rbuf.read(size)
        buf_data = self._rbuf.read()
        self._rbuf = BytesIO()
        return buf_data + self._read(self.read_all, size - buf_len)

    def recv(self, size):
        """Read up to size bytes, blocking until one byte is read."""
        buf_len = self._buf_len()
        if buf_len:
            data = self._rbuf.read(size)
            if size >= buf_len:
                self._rbuf = BytesIO()
            return data
        return self._read(self.read_some, size)

    def __len__(self):
        return self._num_objects

    def read_objects(self, compute_crc32=False):
        """Read the objects in this pack file.

        :param compute_crc32: If True, compute the CRC32 of the compressed
            data. If False, the returned CRC32 will be None.
        :return: Iterator over UnpackedObjects with the following members set:

            * offset
            * obj_type_num
            * obj_chunks (for non-delta types)
            * delta_base (for delta types)
            * decomp_chunks
            * decomp_len
            * crc32 (if compute_crc32 is True)
        :raise ChecksumMismatch: if the checksum of the pack contents does not
            match the checksum in the pack trailer.
        :raise zlib.error: if an error occurred during zlib decompression.
        :raise IOError: if an error occurred writing to the output file.
        """
        pack_version, self._num_objects = read_pack_header(self.read)
        if pack_version is None:
            return

        for i in range(self._num_objects):
            offset = self.offset
            unpacked, unused = unpack_object(
              self.read, read_some=self.recv, compute_crc32=compute_crc32,
              zlib_bufsize=self._zlib_bufsize)
            unpacked.offset = offset

            # prepend any unused data to current read buffer
            buf = BytesIO()
            buf.write(unused)
            buf.write(self._rbuf.read())
            buf.seek(0)
            self._rbuf = buf

            yield unpacked

        if self._buf_len() < 20:
            # If the read buffer is full, then the last read() got the whole
            # trailer off the wire. If not, it means there is still some of the
            # trailer to read. We need to read() all 20 bytes; N come from the
            # read buffer and (20 - N) come from the wire.
            self.read(20)

        pack_sha = bytearray(self._trailer)
        if pack_sha != self.sha.digest():
            raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())


class PackStreamCopier(PackStreamReader):
    """Class to verify a pack stream as it is being read.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate and written out to the given file-like object.
    """

    def __init__(self, read_all, read_some, outfile, delta_iter=None):
        """Initialize the copier.

        :param read_all: Read function that blocks until the number of
            requested bytes are read.
        :param read_some: Read function that returns at least one byte, but may
            not return the number of bytes requested.
        :param outfile: File-like object to write output through.
        :param delta_iter: Optional DeltaChainIterator to record deltas as we
            read them.
        """
        super(PackStreamCopier, self).__init__(read_all, read_some=read_some)
        self.outfile = outfile
        self._delta_iter = delta_iter

    def _read(self, read, size):
        """Read data from the read callback and write it to the file."""
        data = super(PackStreamCopier, self)._read(read, size)
        self.outfile.write(data)
        return data

    def verify(self):
        """Verify a pack stream and write it to the output file.

        See PackStreamReader.read_objects for a list of exceptions this may
        throw.
        """
        if self._delta_iter:
            for unpacked in self.read_objects():
                self._delta_iter.record(unpacked)
        else:
            for _ in self.read_objects():
                pass


def obj_sha(type, chunks):
    """Compute the SHA for a numeric type and object chunks."""
    sha = sha1()
    sha.update(object_header(type, chunks_length(chunks)))
    if isinstance(chunks, bytes):
        sha.update(chunks)
    else:
        for chunk in chunks:
            sha.update(chunk)
    return sha.digest()


def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1<<16):
    """Hash a portion of a file into a new SHA.

    :param f: A file-like object to read from that supports seek().
    :param start_ofs: The offset in the file to start reading at.
    :param end_ofs: The offset in the file to end reading at, relative to the
        end of the file.
    :param buffer_size: A buffer size for reading.
    :return: A new SHA object updated with data read from the file.
    """
    sha = sha1()
    f.seek(0, SEEK_END)
    length = f.tell()
    if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
        raise AssertionError(
            "Attempt to read beyond file length. "
            "start_ofs: %d, end_ofs: %d, file length: %d" % (
                start_ofs, end_ofs, length))
    todo = length + end_ofs - start_ofs
    f.seek(start_ofs)
    while todo:
        data = f.read(min(todo, buffer_size))
        if not data:
            # Bail out instead of looping forever on a truncated file.
            raise AssertionError('unexpected end of file while hashing')
        sha.update(data)
        todo -= len(data)
    return sha


class PackData(object):
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next most significant 3 bits are the type, which
    tells you the type of object, and whether it is a delta. The least
    significant 4 bits are the lowest bits of the size. For each subsequent
    byte the LS 7 bits are the next MS bits of the size, i.e. the last byte
    of the header contains the MS bits of the size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to
    try and detect the delta case, or a request for an object at the wrong
    position. It will all just throw a zlib or KeyError.
    """

    def __init__(self, filename, file=None, size=None):
        """Create a PackData object representing the pack in the given filename.

        The file must exist and stay readable until the object is disposed of.
        It must also stay the same size. It will be mapped whenever needed.

        Currently there is a restriction on the size of the pack as the python
        mmap implementation is flawed.
        """
        self._filename = filename
        self._size = size
        self._header_size = 12
        if file is None:
            self._file = GitFile(self._filename, 'rb')
        else:
            self._file = file
        (version, self._num_objects) = read_pack_header(self._file.read)
        self._offset_cache = LRUSizeCache(1024*1024*20,
            compute_size=_compute_object_size)
        self.pack = None

    @property
    def filename(self):
        return os.path.basename(self._filename)

    @classmethod
    def from_file(cls, file, size):
        return cls(str(file), file=file, size=size)

    @classmethod
    def from_path(cls, path):
        return cls(filename=path)

    def close(self):
        self._file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _get_size(self):
        if self._size is not None:
            return self._size
        self._size = os.path.getsize(self._filename)
        if self._size < self._header_size:
            errmsg = ('%s is too small for a packfile (%d < %d)' %
                      (self._filename, self._size, self._header_size))
            raise AssertionError(errmsg)
        return self._size

    def __len__(self):
        """Returns the number of objects in this pack."""
        return self._num_objects

    def calculate_checksum(self):
        """Calculate the checksum for this pack.

        :return: 20-byte binary SHA1 digest
        """
        return compute_file_sha(self._file, end_ofs=-20).digest()

    def get_ref(self, sha):
        """Get the object for a ref SHA, only looking in this pack."""
        # TODO: cache these results
        if self.pack is None:
            raise KeyError(sha)
        try:
            offset = self.pack.index.object_index(sha)
        except KeyError:
            offset = None
        if offset:
            type, obj = self.get_object_at(offset)
        elif self.pack is not None and self.pack.resolve_ext_ref:
            type, obj = self.pack.resolve_ext_ref(sha)
        else:
            raise KeyError(sha)
        return offset, type, obj

    def resolve_object(self, offset, type, obj, get_ref=None):
        """Resolve an object, possibly resolving deltas when necessary.

        :return: Tuple with object type and contents.
        """
        # Walk down the delta chain, building a stack of deltas to reach
        # the requested object.
        base_offset = offset
        base_type = type
        base_obj = obj
        delta_stack = []
        while base_type in DELTA_TYPES:
            prev_offset = base_offset
            if get_ref is None:
                get_ref = self.get_ref
            if base_type == OFS_DELTA:
                (delta_offset, delta) = base_obj
                # TODO: clean up asserts and replace with nicer error messages
                assert (
                    isinstance(base_offset, int)
                    or isinstance(base_offset, long))
                assert (
                    isinstance(delta_offset, int)
                    or isinstance(delta_offset, long))
                base_offset = base_offset - delta_offset
                base_type, base_obj = self.get_object_at(base_offset)
                assert isinstance(base_type, int)
            elif base_type == REF_DELTA:
                (basename, delta) = base_obj
                assert isinstance(basename, bytes) and len(basename) == 20
                base_offset, base_type, base_obj = get_ref(basename)
                assert isinstance(base_type, int)
            delta_stack.append((prev_offset, base_type, delta))

        # Now grab the base object (mustn't be a delta) and apply the
        # deltas all the way up the stack.
        chunks = base_obj
        for prev_offset, delta_type, delta in reversed(delta_stack):
            chunks = apply_delta(chunks, delta)
            # TODO(dborowitz): This can result in poor performance if
            # large base objects are separated from deltas in the pack.
            # We should reorganize so that we apply deltas to all
            # objects in a chain one after the other to optimize cache
            # performance.
            if prev_offset is not None:
                self._offset_cache[prev_offset] = base_type, chunks
        return base_type, chunks

    def iterobjects(self, progress=None, compute_crc32=True):
        self._file.seek(self._header_size)
        for i in range(1, self._num_objects + 1):
            offset = self._file.tell()
            unpacked, unused = unpack_object(
              self._file.read, compute_crc32=compute_crc32)
            if progress is not None:
                progress(i, self._num_objects)
            yield (offset, unpacked.pack_type_num, unpacked._obj(),
                   unpacked.crc32)
            self._file.seek(-len(unused), SEEK_CUR)  # Back up over unused data.

    def _iter_unpacked(self):
        # TODO(dborowitz): Merge this with iterobjects, if we can change its
        # return type.
        self._file.seek(self._header_size)

        if self._num_objects is None:
            return

        for _ in range(self._num_objects):
            offset = self._file.tell()
            unpacked, unused = unpack_object(
              self._file.read, compute_crc32=False)
            unpacked.offset = offset
            yield unpacked
            self._file.seek(-len(unused), SEEK_CUR)  # Back up over unused data.

    def iterentries(self, progress=None):
        """Yield entries summarizing the contents of this pack.

        :param progress: Progress function, called with current and total
            object count.
        :return: iterator of tuples with (sha, offset, crc32)
        """
        num_objects = self._num_objects
        resolve_ext_ref = (
            self.pack.resolve_ext_ref if self.pack is not None else None)
        indexer = PackIndexer.for_pack_data(
            self, resolve_ext_ref=resolve_ext_ref)
        for i, result in enumerate(indexer):
            if progress is not None:
                progress(i, num_objects)
            yield result

    def sorted_entries(self, progress=None):
        """Return entries in this pack, sorted by SHA.

        :param progress: Progress function, called with current and total
            object count
        :return: List of tuples with (sha, offset, crc32)
        """
        ret = list(self.iterentries(progress=progress))
        ret.sort()
        return ret

    def create_index_v1(self, filename, progress=None):
        """Create a version 1 index file for this data file.

        :param filename: Index filename.
        :param progress: Progress report function
        :return: Checksum of index file
        """
        entries = self.sorted_entries(progress=progress)
        with GitFile(filename, 'wb') as f:
            return write_pack_index_v1(f, entries, self.calculate_checksum())

    def create_index_v2(self, filename, progress=None):
        """Create a version 2 index file for this data file.

        :param filename: Index filename.
        :param progress: Progress report function
        :return: Checksum of index file
        """
        entries = self.sorted_entries(progress=progress)
        with GitFile(filename, 'wb') as f:
            return write_pack_index_v2(f, entries, self.calculate_checksum())

    def create_index(self, filename, progress=None,
                     version=2):
        """Create an index file for this data file.

        :param filename: Index filename.
        :param progress: Progress report function
        :param version: Index format version to write (1 or 2)
        :return: Checksum of index file
        """
        if version == 1:
            return self.create_index_v1(filename, progress)
        elif version == 2:
            return self.create_index_v2(filename, progress)
        else:
            raise ValueError('unknown index format %d' % version)

    def get_stored_checksum(self):
        """Return the expected checksum stored in this pack."""
        self._file.seek(-20, SEEK_END)
        return self._file.read(20)

    def check(self):
        """Check the consistency of this pack."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        if actual != stored:
            raise ChecksumMismatch(stored, actual)

    def get_object_at(self, offset):
        """Given an offset into the packfile return the object that is there.

        Using the associated index the location of an object can be looked up,
        and then the packfile can be asked directly for that object using this
        function.
        """
        try:
            return self._offset_cache[offset]
        except KeyError:
            pass
        assert offset >= self._header_size
        self._file.seek(offset)
        unpacked, _ = unpack_object(self._file.read)
        return (unpacked.pack_type_num, unpacked._obj())


class DeltaChainIterator(object):
    """Abstract iterator over pack data based on delta chains.

    Each object in the pack is guaranteed to be inflated exactly once,
    regardless of how many objects reference it as a delta base. As a result,
    memory usage is proportional to the length of the longest delta chain.

    Subclasses can override _result to define the result type of the iterator.
    By default, results are UnpackedObjects with the following members set:

    * offset
    * obj_type_num
    * obj_chunks
    * pack_type_num
    * delta_base     (for delta types)
    * comp_chunks    (if _include_comp is True)
    * decomp_chunks
    * decomp_len
    * crc32          (if _compute_crc32 is True)
    """

    _compute_crc32 = False
    _include_comp = False

    def __init__(self, file_obj, resolve_ext_ref=None):
        self._file = file_obj
        self._resolve_ext_ref = resolve_ext_ref
        self._pending_ofs = defaultdict(list)
        self._pending_ref = defaultdict(list)
        self._full_ofs = []
        self._shas = {}
        self._ext_refs = []

    @classmethod
    def for_pack_data(cls, pack_data, resolve_ext_ref=None):
        walker = cls(None, resolve_ext_ref=resolve_ext_ref)
        walker.set_pack_data(pack_data)
        for unpacked in pack_data._iter_unpacked():
            walker.record(unpacked)
        return walker

    def record(self, unpacked):
        type_num = unpacked.pack_type_num
        offset = unpacked.offset
        if type_num == OFS_DELTA:
            base_offset = offset - unpacked.delta_base
            self._pending_ofs[base_offset].append(offset)
        elif type_num == REF_DELTA:
            self._pending_ref[unpacked.delta_base].append(offset)
        else:
            self._full_ofs.append((offset, type_num))

    def set_pack_data(self, pack_data):
        self._file = pack_data._file

    def _walk_all_chains(self):
        for offset, type_num in self._full_ofs:
            for result in self._follow_chain(offset, type_num, None):
                yield result
        for result in self._walk_ref_chains():
            yield result
        assert not self._pending_ofs

    def _ensure_no_pending(self):
        if self._pending_ref:
            raise KeyError([sha_to_hex(s) for s in self._pending_ref])

    def _walk_ref_chains(self):
        if not self._resolve_ext_ref:
            self._ensure_no_pending()
            return

        for base_sha, pending in sorted(self._pending_ref.items()):
            if base_sha not in self._pending_ref:
                continue
            try:
                type_num, chunks = self._resolve_ext_ref(base_sha)
            except KeyError:
                # Not an external ref, but may depend on one. Either it will
                # get popped via a _follow_chain call, or we will raise an
                # error below.
                continue
            self._ext_refs.append(base_sha)
            self._pending_ref.pop(base_sha)
            for new_offset in pending:
                for result in self._follow_chain(new_offset, type_num, chunks):
                    yield result

        self._ensure_no_pending()

    def _result(self, unpacked):
        return unpacked

    def _resolve_object(self, offset, obj_type_num, base_chunks):
        self._file.seek(offset)
        unpacked, _ = unpack_object(
          self._file.read, include_comp=self._include_comp,
          compute_crc32=self._compute_crc32)
        unpacked.offset = offset
        if base_chunks is None:
            assert unpacked.pack_type_num == obj_type_num
        else:
            assert unpacked.pack_type_num in DELTA_TYPES
            unpacked.obj_type_num = obj_type_num
            unpacked.obj_chunks = apply_delta(base_chunks,
                                              unpacked.decomp_chunks)
        return unpacked

    def _follow_chain(self, offset, obj_type_num, base_chunks):
        # Unlike PackData.get_object_at, there is no need to cache offsets as
        # this approach by design inflates each object exactly once.
        todo = [(offset, obj_type_num, base_chunks)]
        for offset, obj_type_num, base_chunks in todo:
            unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
            yield self._result(unpacked)

            unblocked = chain(self._pending_ofs.pop(unpacked.offset, []),
                              self._pending_ref.pop(unpacked.sha(), []))
            todo.extend(
                (new_offset, unpacked.obj_type_num, unpacked.obj_chunks)
                for new_offset in unblocked)

    def __iter__(self):
        return self._walk_all_chains()

    def ext_refs(self):
        return self._ext_refs


class PackIndexer(DeltaChainIterator):
    """Delta chain iterator that yields index entries."""

    _compute_crc32 = True

    def _result(self, unpacked):
        return unpacked.sha(), unpacked.offset, unpacked.crc32


class PackInflater(DeltaChainIterator):
    """Delta chain iterator that yields ShaFile objects."""

    def _result(self, unpacked):
        return unpacked.sha_file()
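
# Usage sketch (illustrative): walk every object in a pack, resolving
# deltas along the way, with each object inflated exactly once.
#
#     with PackData('pack-1234.pack') as data:
#         for sha_file in PackInflater.for_pack_data(data):
#             ...  # each result is a ShaFile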


class SHA1Reader(object):
    """Wrapper around a file-like object that remembers the SHA1 of its data."""

    def __init__(self, f):
        self.f = f
        self.sha1 = sha1(b'')

    def read(self, num=None):
        data = self.f.read(num)
        self.sha1.update(data)
        return data

    def check_sha(self):
        stored = self.f.read(20)
        if stored != self.sha1.digest():
            raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))

    def close(self):
        return self.f.close()

    def tell(self):
        return self.f.tell()


class SHA1Writer(object):
    """Wrapper around a file-like object that remembers the SHA1 of its data."""

    def __init__(self, f):
        self.f = f
        self.length = 0
        self.sha1 = sha1(b'')

    def write(self, data):
        self.sha1.update(data)
        self.f.write(data)
        self.length += len(data)

    def write_sha(self):
        sha = self.sha1.digest()
        assert len(sha) == 20
        self.f.write(sha)
        self.length += len(sha)
        return sha

    def close(self):
        sha = self.write_sha()
        self.f.close()
        return sha

    def offset(self):
        return self.length

    def tell(self):
        return self.f.tell()


def pack_object_header(type_num, delta_base, size):
    """Create a pack object header for the given object info.

    :param type_num: Numeric type of the object.
    :param delta_base: Delta base offset or ref, or None for whole objects.
    :param size: Uncompressed object size.
    :return: A header for a packed object.
    """
    header = []
    c = (type_num << 4) | (size & 15)
    size >>= 4
    while size:
        header.append(c | 0x80)
        c = size & 0x7f
        size >>= 7
    header.append(c)
    if type_num == OFS_DELTA:
        ret = [delta_base & 0x7f]
        delta_base >>= 7
        while delta_base:
            delta_base -= 1
            ret.insert(0, 0x80 | (delta_base & 0x7f))
            delta_base >>= 7
        header.extend(ret)
    elif type_num == REF_DELTA:
        assert len(delta_base) == 20
        header += delta_base
    return bytearray(header)
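
# Illustrative (matches the worked example in PackData's docstring): a
# 181-byte commit encodes as two header bytes.
#
#     pack_object_header(1, None, 181)
#     # -> bytearray(b'\x95\x0b')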


def write_pack_object(f, type, object, sha=None):
    """Write pack object to a file.

    :param f: File to write to
    :param type: Numeric type of the object
    :param object: Object to write
    :param sha: Optional SHA-1 hash object to update with the written data
    :return: CRC32 checksum of the written data
    """
    if type in DELTA_TYPES:
        delta_base, object = object
    else:
        delta_base = None
    header = bytes(pack_object_header(type, delta_base, len(object)))
    comp_data = zlib.compress(object)
    crc32 = 0
    for data in (header, comp_data):
        f.write(data)
        if sha is not None:
            sha.update(data)
        crc32 = binascii.crc32(data, crc32)
    return crc32 & 0xffffffff
1483

    
1484

    
1485
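# Usage sketch (hedged; 3 is the blob type number): write one whole object and
# keep its crc32 for a later index entry:
#
#   >>> buf = BytesIO()
#   >>> offset = buf.tell()
#   >>> crc = write_pack_object(buf, 3, b'hello')

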
def write_pack(filename, objects, deltify=None, delta_window_size=None):
    """Write a new pack data file.

    :param filename: Path to the new pack file (without .pack extension)
    :param objects: Iterable of (object, path) tuples to write.
        Should provide __len__
    :param deltify: Whether to deltify pack objects
    :param delta_window_size: Delta window size
    :return: Tuple with checksum of pack file and index file
    """
    with GitFile(filename + '.pack', 'wb') as f:
        entries, data_sum = write_pack_objects(f, objects,
            delta_window_size=delta_window_size, deltify=deltify)
    entries = [(k, v[0], v[1]) for (k, v) in entries.items()]
    entries.sort()
    with GitFile(filename + '.idx', 'wb') as f:
        return data_sum, write_pack_index_v2(f, entries, data_sum)


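# Usage sketch (hypothetical path; assumes dulwich.objects.Blob, which this
# module does not import itself):
#
#   >>> from dulwich.objects import Blob
#   >>> blob = Blob.from_string(b'hello world')
#   >>> data_sum, idx_sum = write_pack('/tmp/mypack', [(blob, None)])
#
# This produces /tmp/mypack.pack and /tmp/mypack.idx as a matching pair.

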
def write_pack_header(f, num_objects):
    """Write a pack header for the given number of objects."""
    f.write(b'PACK')                          # Pack header
    f.write(struct.pack(b'>L', 2))            # Pack version
    f.write(struct.pack(b'>L', num_objects))  # Number of objects in pack


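# Sketch (added for illustration): the header is always 12 bytes -- the magic
# b'PACK', format version 2, then the object count, all big-endian:
#
#   >>> buf = BytesIO()
#   >>> write_pack_header(buf, 3)
#   >>> buf.getvalue()
#   b'PACK\x00\x00\x00\x02\x00\x00\x00\x03'

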
def deltify_pack_objects(objects, window_size=None):
    """Generate deltas for pack objects.

    :param objects: An iterable of (object, path) tuples to deltify.
    :param window_size: Window size; None for default
    :return: Iterator over type_num, object id, delta_base, content
        delta_base is None for full text entries
    """
    if window_size is None:
        window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
    # Build a list of objects ordered by the magic Linus heuristic.
    # This helps us find good objects to diff against one another.
    magic = []
    for obj, path in objects:
        magic.append((obj.type_num, path, -obj.raw_length(), obj))
    magic.sort()

    possible_bases = deque()

    for type_num, path, neg_length, o in magic:
        raw = o.as_raw_string()
        winner = raw
        winner_base = None
        for base in possible_bases:
            if base.type_num != type_num:
                continue
            delta = create_delta(base.as_raw_string(), raw)
            if len(delta) < len(winner):
                winner_base = base.sha().digest()
                winner = delta
        yield type_num, o.sha().digest(), winner_base, winner
        possible_bases.appendleft(o)
        while len(possible_bases) > window_size:
            possible_bases.pop()


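# Sketch (hedged; Blob is from dulwich.objects, not imported here): the sliding
# window bounds how many recent objects each candidate is diffed against, so
# the first object in sort order is always stored whole:
#
#   >>> from dulwich.objects import Blob
#   >>> blobs = [(Blob.from_string(b'x' * n), None) for n in (300, 301, 302)]
#   >>> first = next(deltify_pack_objects(blobs, window_size=10))
#   >>> first[2] is None    # no delta base for the first entry
#   True

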
def write_pack_objects(f, objects, delta_window_size=None, deltify=False):
    """Write a new pack data file.

    :param f: File to write to
    :param objects: Iterable of (object, path) tuples to write.
        Should provide __len__
    :param delta_window_size: Sliding window size for searching for deltas;
                              Set to None for default window size.
    :param deltify: Whether to deltify objects
    :return: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    if deltify:
        pack_contents = deltify_pack_objects(objects, delta_window_size)
    else:
        pack_contents = (
            (o.type_num, o.sha().digest(), None, o.as_raw_string())
            for (o, path) in objects)

    return write_pack_data(f, len(objects), pack_contents)


def write_pack_data(f, num_records, records):
    """Write a new pack data file.

    :param f: File to write to
    :param num_records: Number of records
    :param records: Iterator over type_num, object_id, delta_base, raw
    :return: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    # Write the pack
    entries = {}
    f = SHA1Writer(f)
    write_pack_header(f, num_records)
    for type_num, object_id, delta_base, raw in records:
        offset = f.offset()
        if delta_base is not None:
            try:
                base_offset, base_crc32 = entries[delta_base]
            except KeyError:
                type_num = REF_DELTA
                raw = (delta_base, raw)
            else:
                type_num = OFS_DELTA
                raw = (offset - base_offset, raw)
        crc32 = write_pack_object(f, type_num, raw)
        entries[object_id] = (offset, crc32)
    return entries, f.write_sha()


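# Sketch (hedged): a delta whose base was already written in this pack becomes
# an OFS_DELTA (relative offset); an unseen base falls back to REF_DELTA.
# Writing two whole records (type 3 == blob, ids are arbitrary 20-byte keys):
#
#   >>> buf = BytesIO()
#   >>> recs = [(3, b'\x01' * 20, None, b'hello'),
#   ...         (3, b'\x02' * 20, None, b'world')]
#   >>> entries, pack_sha = write_pack_data(buf, len(recs), recs)

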
def write_pack_index_v1(f, entries, pack_checksum):
    """Write a new pack index file.

    :param f: A file-like object to write to
    :param entries: List of tuples with object name (sha), offset_in_pack,
        and crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    :return: The SHA of the written index file
    """
    f = SHA1Writer(f)
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[:1])] += 1
    # Fan-out table
    for i in range(0x100):
        f.write(struct.pack('>L', fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        if offset > 0xffffffff:
            raise TypeError("pack format 1 only supports 4-byte offsets")
        f.write(struct.pack('>L20s', offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()


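# Sketch (added for illustration): the 256-entry fan-out table is cumulative --
# slot i holds the number of objects whose first sha byte is <= i, letting
# readers binary-search a narrow slice.  Entries must be sorted by sha:
#
#   >>> entries = sorted([(b'\x00' * 20, 0, 0), (b'\x02' + b'\x00' * 19, 42, 0)])
#   >>> idx_sha = write_pack_index_v1(BytesIO(), entries, b'\xee' * 20)

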
def _delta_encode_size(size):
    ret = bytearray()
    c = size & 0x7f
    size >>= 7
    while size:
        ret.append(c | 0x80)
        c = size & 0x7f
        size >>= 7
    ret.append(c)
    return ret


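# Sketch: delta header sizes use the same 7-bits-per-byte little-endian varint
# scheme, continuation flag in the high bit:
#
#   >>> bytes(_delta_encode_size(200))
#   b'\xc8\x01'

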
# The length of delta compression copy operations in version 2 packs is limited
# to 64K.  To copy more, we use several copy operations.  Version 3 packs allow
# 24-bit lengths in copy operations, but we always make version 2 packs.
_MAX_COPY_LEN = 0xffff


def _encode_copy_operation(start, length):
    scratch = []
    op = 0x80
    for i in range(4):
        if start & 0xff << i*8:
            scratch.append((start >> i*8) & 0xff)
            op |= 1 << i
    for i in range(2):
        if length & 0xff << i*8:
            scratch.append((length >> i*8) & 0xff)
            op |= 1 << (4+i)
    return bytearray([op] + scratch)


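# Sketch: a copy opcode sets the high bit, then flags which offset/length bytes
# follow; zero bytes are simply omitted:
#
#   >>> list(_encode_copy_operation(0x10, 0xff))
#   [145, 16, 255]

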
def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf.

    :param base_buf: Base buffer
    :param target_buf: Target buffer
    :return: Delta instructions as bytes
    """
    assert isinstance(base_buf, bytes)
    assert isinstance(target_buf, bytes)
    out_buf = bytearray()
    # write delta header
    out_buf += _delta_encode_size(len(base_buf))
    out_buf += _delta_encode_size(len(target_buf))
    # write out delta opcodes
    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        # Git patch opcodes don't care about deletes!
        #if opcode == 'replace' or opcode == 'delete':
        #    pass
        if opcode == 'equal':
            # If they are equal, unpacker will use data from base_buf
            # Write out an opcode that says what range to use
            copy_start = i1
            copy_len = i2 - i1
            while copy_len > 0:
                to_copy = min(copy_len, _MAX_COPY_LEN)
                out_buf += _encode_copy_operation(copy_start, to_copy)
                copy_start += to_copy
                copy_len -= to_copy
        if opcode == 'replace' or opcode == 'insert':
            # If we are replacing a range or adding one, then we just
            # output it to the stream (prefixed by its size)
            s = j2 - j1
            o = j1
            while s > 127:
                out_buf.append(127)
                out_buf += bytearray(target_buf[o:o+127])
                s -= 127
                o += 127
            out_buf.append(s)
            out_buf += bytearray(target_buf[o:o+s])
    return bytes(out_buf)


def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c.

    :param src_buf: Source buffer
    :param delta: Delta instructions
    :return: Target buffer as a list of chunks
    """
    if not isinstance(src_buf, bytes):
        src_buf = b''.join(src_buf)
    if not isinstance(delta, bytes):
        delta = b''.join(delta)
    out = []
    index = 0
    delta_length = len(delta)

    def get_delta_header_size(delta, index):
        size = 0
        i = 0
        while delta:
            cmd = ord(delta[index:index+1])
            index += 1
            size |= (cmd & ~0x80) << i
            i += 7
            if not cmd & 0x80:
                break
        return size, index

    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    assert src_size == len(src_buf), '%d vs %d' % (src_size, len(src_buf))
    while index < delta_length:
        cmd = ord(delta[index:index+1])
        index += 1
        if cmd & 0x80:
            cp_off = 0
            for i in range(4):
                if cmd & (1 << i):
                    x = ord(delta[index:index+1])
                    index += 1
                    cp_off |= x << (i * 8)
            cp_size = 0
            # Version 3 packs can contain copy sizes larger than 64K.
            for i in range(3):
                if cmd & (1 << (4+i)):
                    x = ord(delta[index:index+1])
                    index += 1
                    cp_size |= x << (i * 8)
            if cp_size == 0:
                cp_size = 0x10000
            if (cp_off + cp_size < cp_size or
                cp_off + cp_size > src_size or
                cp_size > dest_size):
                break
            out.append(src_buf[cp_off:cp_off+cp_size])
        elif cmd != 0:
            out.append(delta[index:index+cmd])
            index += cmd
        else:
            raise ApplyDeltaError('Invalid opcode 0')

    if index != delta_length:
        raise ApplyDeltaError('delta not empty: %r' % delta[index:])

    if dest_size != chunks_length(out):
        raise ApplyDeltaError('dest size incorrect')

    return out


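# Round-trip sketch (added for illustration): a delta from create_delta can be
# re-applied with apply_delta, which returns the target as a list of chunks:
#
#   >>> base, target = b'the quick brown fox', b'the quick red fox'
#   >>> delta = create_delta(base, target)
#   >>> b''.join(apply_delta(base, delta)) == target
#   True

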
def write_pack_index_v2(f, entries, pack_checksum):
    """Write a new pack index file.

    :param f: File-like object to write to
    :param entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    :return: The SHA of the index file written
    """
    f = SHA1Writer(f)
    f.write(b'\377tOc')  # Magic!
    f.write(struct.pack('>L', 2))
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[:1])] += 1
    # Fan-out table
    largetable = []
    for i in range(0x100):
        f.write(struct.pack(b'>L', fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        f.write(name)
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(b'>L', entry_checksum))
    for (name, offset, entry_checksum) in entries:
        if offset < 2**31:
            f.write(struct.pack(b'>L', offset))
        else:
            f.write(struct.pack(b'>L', 2**31 + len(largetable)))
            largetable.append(offset)
    for offset in largetable:
        f.write(struct.pack(b'>Q', offset))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()


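# Sketch (hedged): v2 indexes store 4-byte offsets; anything >= 2**31 is
# redirected through a separate 8-byte "large offset" table, so packs beyond
# 2 GiB still index correctly:
#
#   >>> entries = sorted([(b'\x01' * 20, 12, 0), (b'\x02' * 20, 2**33, 0)])
#   >>> idx_sha = write_pack_index_v2(BytesIO(), entries, b'\xee' * 20)

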
write_pack_index = write_pack_index_v2


class Pack(object):
    """A Git pack object."""

    def __init__(self, basename, resolve_ext_ref=None):
        self._basename = basename
        self._data = None
        self._idx = None
        self._idx_path = self._basename + '.idx'
        self._data_path = self._basename + '.pack'
        self._data_load = lambda: PackData(self._data_path)
        self._idx_load = lambda: load_pack_index(self._idx_path)
        self.resolve_ext_ref = resolve_ext_ref

    @classmethod
    def from_lazy_objects(cls, data_fn, idx_fn):
        """Create a new pack object from callables to load pack data and
        index objects."""
        ret = Pack('')
        ret._data_load = data_fn
        ret._idx_load = idx_fn
        return ret

    @classmethod
    def from_objects(cls, data, idx):
        """Create a new pack object from pack data and index objects."""
        ret = Pack('')
        ret._data_load = lambda: data
        ret._idx_load = lambda: idx
        return ret

    def name(self):
        """The SHA over the SHAs of the objects in this pack."""
        return self.index.objects_sha1()

    @property
    def data(self):
        """The pack data object being used."""
        if self._data is None:
            self._data = self._data_load()
            self._data.pack = self
            self.check_length_and_checksum()
        return self._data

    @property
    def index(self):
        """The index being used.

        :note: This may be an in-memory index
        """
        if self._idx is None:
            self._idx = self._idx_load()
        return self._idx

    def close(self):
        if self._data is not None:
            self._data.close()
        if self._idx is not None:
            self._idx.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __eq__(self, other):
        return isinstance(self, type(other)) and self.index == other.index

    def __len__(self):
        """Number of entries in this pack."""
        return len(self.index)

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self._basename)

    def __iter__(self):
        """Iterate over all the sha1s of the objects in this pack."""
        return iter(self.index)

    def check_length_and_checksum(self):
        """Sanity check the length and checksum of the pack index and data."""
        assert len(self.index) == len(self.data)
        idx_stored_checksum = self.index.get_pack_checksum()
        data_stored_checksum = self.data.get_stored_checksum()
        if idx_stored_checksum != data_stored_checksum:
            raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
                                   sha_to_hex(data_stored_checksum))

    def check(self):
        """Check the integrity of this pack.

        :raise ChecksumMismatch: if a checksum for the index or data is wrong
        """
        self.index.check()
        self.data.check()
        for obj in self.iterobjects():
            obj.check()
        # TODO: object connectivity checks

    def get_stored_checksum(self):
        return self.data.get_stored_checksum()

    def __contains__(self, sha1):
        """Check whether this pack contains a particular SHA1."""
        try:
            self.index.object_index(sha1)
            return True
        except KeyError:
            return False

    def get_raw(self, sha1):
        offset = self.index.object_index(sha1)
        obj_type, obj = self.data.get_object_at(offset)
        type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
        return type_num, b''.join(chunks)

    def __getitem__(self, sha1):
        """Retrieve the specified SHA1."""
        type, uncomp = self.get_raw(sha1)
        return ShaFile.from_raw_string(type, uncomp, sha=sha1)

    def iterobjects(self):
        """Iterate over the objects in this pack."""
        return iter(PackInflater.for_pack_data(
            self.data, resolve_ext_ref=self.resolve_ext_ref))

    def pack_tuples(self):
        """Provide an iterable for use with write_pack_objects.

        :return: Object that can iterate over (object, path) tuples
            and provides __len__
        """
        class PackTupleIterable(object):

            def __init__(self, pack):
                self.pack = pack

            def __len__(self):
                return len(self.pack)

            def __iter__(self):
                return ((o, None) for o in self.pack.iterobjects())

        return PackTupleIterable(self)

    def keep(self, msg=None):
        """Add a .keep file for the pack, preventing git from garbage collecting it.

        :param msg: A message written inside the .keep file; can be used later to
                    determine whether or not a .keep file is obsolete.
        :return: The path of the .keep file, as a string.
        """
        keepfile_name = '%s.keep' % self._basename
        with GitFile(keepfile_name, 'wb') as keepfile:
            if msg:
                keepfile.write(msg)
                keepfile.write(b'\n')
        return keepfile_name


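# Usage sketch (hypothetical basename; Pack expects a matching .pack/.idx pair
# on disk):
#
#   >>> with Pack('/tmp/mypack') as p:
#   ...     for hexsha in p:            # iterates index entries
#   ...         obj = p[hexsha]         # inflates the object
#
# Finally, prefer the C implementations of apply_delta and bisect_find_sha
# when the extension module is available.
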
try:
    from dulwich._pack import apply_delta, bisect_find_sha
except ImportError:
    pass