gvsig-scripting / org.gvsig.scripting / trunk / org.gvsig.scripting / org.gvsig.scripting.app / org.gvsig.scripting.app.mainplugin / src / main / resources-plugin / scripting / lib / dulwich / pack.py @ 959
History | View | Annotate | Download (64.8 KB)
1 |
# pack.py -- For dealing with packed git objects.
|
---|---|
2 |
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
|
3 |
# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@samba.org>
|
4 |
#
|
5 |
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
|
6 |
# General Public License as public by the Free Software Foundation; version 2.0
|
7 |
# or (at your option) any later version. You can redistribute it and/or
|
8 |
# modify it under the terms of either of these two licenses.
|
9 |
#
|
10 |
# Unless required by applicable law or agreed to in writing, software
|
11 |
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License.
|
15 |
#
|
16 |
# You should have received a copy of the licenses; if not, see
|
17 |
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
|
18 |
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
|
19 |
# License, Version 2.0.
|
20 |
#
|
21 |
|
22 |
"""Classes for dealing with packed git objects.
|
23 |
|
24 |
A pack is a compact representation of a bunch of objects, stored
|
25 |
using deltas where possible.
|
26 |
|
27 |
They have two parts, the pack file, which stores the data, and an index
|
28 |
that tells you where the data is.
|
29 |
|
30 |
To find an object you look in all of the index files 'til you find a
|
31 |
match for the object name. You then use the pointer got from this as
|
32 |
a pointer in to the corresponding packfile.
|
33 |
"""
|
34 |
|
35 |
from collections import defaultdict |
36 |
|
37 |
import binascii |
38 |
from io import BytesIO, UnsupportedOperation |
39 |
from collections import ( |
40 |
deque, |
41 |
) |
42 |
import difflib |
43 |
import struct |
44 |
|
45 |
from itertools import chain |
46 |
try:
|
47 |
from itertools import imap, izip |
48 |
except ImportError: |
49 |
# Python3
|
50 |
imap = map
|
51 |
izip = zip
|
52 |
|
53 |
import os |
54 |
import sys |
55 |
|
56 |
try:
|
57 |
import mmap |
58 |
except ImportError: |
59 |
has_mmap = False
|
60 |
else:
|
61 |
has_mmap = True
|
62 |
|
63 |
# For some reason the above try, except fails to set has_mmap = False for plan9
|
64 |
if sys.platform == 'Plan9': |
65 |
has_mmap = False
|
66 |
|
67 |
from hashlib import sha1 |
68 |
from os import ( |
69 |
SEEK_CUR, |
70 |
SEEK_END, |
71 |
) |
72 |
from struct import unpack_from |
73 |
import zlib |
74 |
|
75 |
from dulwich.errors import ( |
76 |
ApplyDeltaError, |
77 |
ChecksumMismatch, |
78 |
) |
79 |
from dulwich.file import GitFile |
80 |
from dulwich.lru_cache import ( |
81 |
LRUSizeCache, |
82 |
) |
83 |
from dulwich.objects import ( |
84 |
ShaFile, |
85 |
hex_to_sha, |
86 |
sha_to_hex, |
87 |
object_header, |
88 |
) |
89 |
|
90 |
|
91 |
# Pack object type numbers for the two delta representations.  (The
# non-delta object types 1-4 are the usual commit/tree/blob/tag numbers.)
OFS_DELTA = 6  # delta whose base is located by byte offset within this pack
REF_DELTA = 7  # delta whose base is identified by a 20-byte binary SHA

DELTA_TYPES = (OFS_DELTA, REF_DELTA)


# Default number of objects considered as delta-base candidates when
# writing a pack (the "delta window"); matches git's default of 10.
DEFAULT_PACK_DELTA_WINDOW_SIZE = 10
|
98 |
|
99 |
|
100 |
def take_msb_bytes(read, crc32=None):
    """Read bytes marked with most significant bit.

    :param read: Read function
    :param crc32: Optional starting CRC32; when given, it is folded over
        every byte read and the updated value is returned.
    :return: Tuple of (list of byte values, updated crc32 or None)
    """
    result = []
    # Keep reading single bytes until one arrives with the MSB clear,
    # which terminates the variable-length quantity.
    while not result or result[-1] & 0x80:
        data = read(1)
        if crc32 is not None:
            crc32 = binascii.crc32(data, crc32)
        result.append(ord(data[:1]))
    return result, crc32
|
112 |
|
113 |
|
114 |
class UnpackedObject(object):
    """Class encapsulating an object unpacked from a pack file.

    These objects should only be created from within unpack_object. Most
    members start out as empty and are filled in at various points by
    read_zlib_chunks, unpack_object, DeltaChainIterator, etc.

    End users of this object should take care that the function they're getting
    this object from is guaranteed to set the members they need.
    """

    __slots__ = [
      'offset',         # Offset in its pack.
      '_sha',           # Cached binary SHA.
      'obj_type_num',   # Type of this object.
      'obj_chunks',     # Decompressed and delta-resolved chunks.
      'pack_type_num',  # Type of this object in the pack (may be a delta).
      'delta_base',     # Delta base offset or SHA.
      'comp_chunks',    # Compressed object chunks.
      'decomp_chunks',  # Decompressed object chunks.
      'decomp_len',     # Decompressed length of this object.
      'crc32',          # CRC32.
      ]

    # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
    # methods of this object.
    def __init__(self, pack_type_num, delta_base, decomp_len, crc32):
        self.offset = None
        self._sha = None
        self.pack_type_num = pack_type_num
        self.delta_base = delta_base
        self.comp_chunks = None
        self.decomp_chunks = []
        self.decomp_len = decomp_len
        self.crc32 = crc32

        if pack_type_num in DELTA_TYPES:
            # Resolved type and chunks are filled in later, once the delta
            # chain has been applied.
            self.obj_type_num = None
            self.obj_chunks = None
        else:
            # Non-delta objects are already fully resolved: the decompressed
            # chunks ARE the object chunks (shared list, not a copy).
            # (A redundant re-assignment of self.delta_base was removed here;
            # it is already set unconditionally above.)
            self.obj_type_num = pack_type_num
            self.obj_chunks = self.decomp_chunks

    def sha(self):
        """Return the binary SHA of this object, computing and caching it."""
        if self._sha is None:
            self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
        return self._sha

    def sha_file(self):
        """Return a ShaFile from this object."""
        return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)

    # Only provided for backwards compatibility with code that expects either
    # chunks or a delta tuple.
    def _obj(self):
        """Return the decompressed chunks, or (delta base, delta chunks)."""
        if self.pack_type_num in DELTA_TYPES:
            return (self.delta_base, self.decomp_chunks)
        else:
            return self.decomp_chunks

    def __eq__(self, other):
        # Slot-by-slot comparison; includes cached state such as _sha.
        if not isinstance(other, UnpackedObject):
            return False
        for slot in self.__slots__:
            if getattr(self, slot) != getattr(other, slot):
                return False
        return True

    def __ne__(self, other):
        return not (self == other)

    def __repr__(self):
        data = ['%s=%r' % (s, getattr(self, s)) for s in self.__slots__]
        return '%s(%s)' % (self.__class__.__name__, ', '.join(data))
191 |
|
192 |
|
193 |
_ZLIB_BUFSIZE = 4096
|
194 |
|
195 |
|
196 |
def read_zlib_chunks(read_some, unpacked, include_comp=False,
                     buffer_size=_ZLIB_BUFSIZE):
    """Read zlib data from a buffer.

    This function requires that the buffer have additional data following the
    compressed data, which is guaranteed to be the case for git pack files.

    :param read_some: Read function that returns at least one byte, but may
        return less than the requested size.
    :param unpacked: An UnpackedObject to write result data to. If its crc32
        attr is not None, the CRC32 of the compressed bytes will be computed
        using this starting CRC32.
        After this function, will have the following attrs set:
        * comp_chunks    (if include_comp is True)
        * decomp_chunks
        * decomp_len
        * crc32
    :param include_comp: If True, include compressed data in the result.
    :param buffer_size: Size of the read buffer.
    :return: Leftover unused data from the decompression.
    :raise zlib.error: if a decompression error occurred.
    """
    if unpacked.decomp_len <= -1:
        raise ValueError('non-negative zlib data stream size expected')
    decomp_obj = zlib.decompressobj()

    comp_chunks = []
    # Appends go directly into the UnpackedObject's chunk list.
    decomp_chunks = unpacked.decomp_chunks
    decomp_len = 0
    crc32 = unpacked.crc32

    while True:
        add = read_some(buffer_size)
        if not add:
            raise zlib.error('EOF before end of zlib stream')
        comp_chunks.append(add)
        decomp = decomp_obj.decompress(add)
        decomp_len += len(decomp)
        decomp_chunks.append(decomp)
        unused = decomp_obj.unused_data
        if unused:
            # The zlib stream ended inside this read; the tail of `add`
            # belongs to the next pack entry, so exclude it from the CRC
            # and from the stored compressed chunks.
            left = len(unused)
            if crc32 is not None:
                crc32 = binascii.crc32(add[:-left], crc32)
            if include_comp:
                comp_chunks[-1] = add[:-left]
            break
        elif crc32 is not None:
            crc32 = binascii.crc32(add, crc32)
    if crc32 is not None:
        # Normalize to an unsigned 32-bit value (py2 crc32 can be negative).
        crc32 &= 0xffffffff

    if decomp_len != unpacked.decomp_len:
        raise zlib.error('decompressed data does not match expected size')

    unpacked.crc32 = crc32
    if include_comp:
        unpacked.comp_chunks = comp_chunks
    return unused
|
255 |
|
256 |
|
257 |
def iter_sha1(iter):
    """Return the hexdigest of the SHA1 over a set of names.

    :param iter: Iterator over string objects
    :return: 40-byte hex sha1 digest
    """
    digest = sha1()
    for name in iter:
        digest.update(name)
    return digest.hexdigest().encode('ascii')
267 |
|
268 |
|
269 |
def load_pack_index(path):
    """Load an index file by path.

    :param path: Path to the index file
    :return: A PackIndex loaded from the given path
    """
    # GitFile is used (rather than plain open) for the project's file
    # handling conventions; the context manager closes it on return.
    with GitFile(path, 'rb') as f:
        return load_pack_index_file(path, f)
|
277 |
|
278 |
|
279 |
def _load_file_contents(f, size=None): |
280 |
try:
|
281 |
fd = f.fileno() |
282 |
except (UnsupportedOperation, AttributeError): |
283 |
fd = None
|
284 |
# Attempt to use mmap if possible
|
285 |
if fd is not None: |
286 |
if size is None: |
287 |
size = os.fstat(fd).st_size |
288 |
if has_mmap:
|
289 |
try:
|
290 |
contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ) |
291 |
except mmap.error:
|
292 |
# Perhaps a socket?
|
293 |
pass
|
294 |
else:
|
295 |
return contents, size
|
296 |
contents = f.read() |
297 |
size = len(contents)
|
298 |
return contents, size
|
299 |
|
300 |
|
301 |
def load_pack_index_file(path, f):
    """Load an index file from a file-like object.

    :param path: Path for the index file
    :param f: File-like object
    :return: A PackIndex loaded from the given file
    """
    contents, size = _load_file_contents(f)
    if contents[:4] != b'\377tOc':
        # No v2+ magic marker: this is a version 1 index.
        return PackIndex1(path, file=f, contents=contents, size=size)
    (version,) = struct.unpack(b'>L', contents[4:8])
    if version != 2:
        raise KeyError('Unknown pack index format %d' % version)
    return PackIndex2(path, file=f, contents=contents, size=size)
|
318 |
|
319 |
|
320 |
def bisect_find_sha(start, end, sha, unpack_name):
    """Find a SHA in a data blob with sorted SHAs.

    :param start: Start index of range to search
    :param end: End index of range to search
    :param sha: Sha to find
    :param unpack_name: Callback to retrieve SHA by index
    :return: Index of the SHA, or None if it wasn't found
    """
    assert start <= end
    lo, hi = start, end
    while lo <= hi:
        mid = (lo + hi) // 2
        candidate = unpack_name(mid)
        if candidate == sha:
            return mid
        if candidate < sha:
            lo = mid + 1
        else:
            hi = mid - 1
    return None
340 |
|
341 |
|
342 |
class PackIndex(object):
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.
    """

    def __eq__(self, other):
        # Two indexes are equal when they list the same object names in the
        # same order; offsets and CRC32s are deliberately not compared.
        if not isinstance(other, PackIndex):
            return False

        # NOTE(review): izip stops at the shorter iterator, so an index whose
        # entries are a strict prefix of another's compares equal here —
        # confirm callers never rely on length differences.
        for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
                                                 other.iterentries()):
            if name1 != name2:
                return False
        return True

    def __ne__(self, other):
        return not self.__eq__(other)

    def __len__(self):
        """Return the number of entries in this pack index."""
        raise NotImplementedError(self.__len__)

    def __iter__(self):
        """Iterate over the SHAs in this pack, as hex strings."""
        return imap(sha_to_hex, self._itersha())

    def iterentries(self):
        """Iterate over the entries in this pack index.

        :return: iterator over tuples with object name, offset in packfile and
            crc32 checksum.
        """
        raise NotImplementedError(self.iterentries)

    def get_pack_checksum(self):
        """Return the SHA1 checksum stored for the corresponding packfile.

        :return: 20-byte binary digest
        """
        raise NotImplementedError(self.get_pack_checksum)

    def object_index(self, sha):
        """Return the index in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file. If the pack file doesn't
        have the object then None will be returned.
        """
        # Accept either a 40-char hex SHA or a 20-byte binary SHA.
        if len(sha) == 40:
            sha = hex_to_sha(sha)
        return self._object_index(sha)

    def _object_index(self, sha):
        """See object_index.

        :param sha: A *binary* SHA string. (20 characters long)_
        """
        raise NotImplementedError(self._object_index)

    def objects_sha1(self):
        """Return the hex SHA1 over all the shas of all objects in this pack.

        :note: This is used for the filename of the pack.
        """
        return iter_sha1(self._itersha())

    def _itersha(self):
        """Yield all the SHA1's of the objects in the index, sorted."""
        raise NotImplementedError(self._itersha)
413 |
|
414 |
|
415 |
class MemoryPackIndex(PackIndex):
    """Pack index that is stored entirely in memory."""

    def __init__(self, entries, pack_checksum=None):
        """Create a new MemoryPackIndex.

        :param entries: Sequence of name, idx, crc32 (sorted)
        :param pack_checksum: Optional pack checksum
        """
        # Map each object name directly to its pack offset for O(1) lookup.
        self._by_sha = {}
        for name, idx, crc32 in entries:
            self._by_sha[name] = idx
        self._entries = entries
        self._pack_checksum = pack_checksum

    def get_pack_checksum(self):
        """Return the pack checksum supplied at construction (may be None)."""
        return self._pack_checksum

    def __len__(self):
        return len(self._entries)

    def _object_index(self, sha):
        # BUG FIX: _by_sha maps name -> offset (a plain int), so the previous
        # ``self._by_sha[sha][0]`` subscript raised TypeError on every lookup.
        return self._by_sha[sha]

    def _itersha(self):
        return iter(self._by_sha)

    def iterentries(self):
        return iter(self._entries)
444 |
|
445 |
|
446 |
class FilePackIndex(PackIndex):
    """Pack index that is based on a file.

    To do the loop it opens the file, and indexes first 256 4 byte groups
    with the first byte of the sha id. The value in the four byte group indexed
    is the end of the group that shares the same starting byte. Subtract one
    from the starting byte and index again to find the start of the group.
    The values are sorted by sha id within the group, so do the math to find
    the start and end offset and then bisect in to find if the value is present.
    """

    def __init__(self, filename, file=None, contents=None, size=None):
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will map
        it whenever required.
        """
        self._filename = filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        if file is None:
            self._file = GitFile(filename, 'rb')
        else:
            self._file = file
        if contents is None:
            # May be an mmap or a bytes object, depending on the platform.
            self._contents, self._size = _load_file_contents(self._file, size)
        else:
            self._contents, self._size = (contents, size)

    def __eq__(self, other):
        # Quick optimization: differing fan-out tables mean differing
        # object sets, so skip the per-entry comparison.
        if (isinstance(other, FilePackIndex) and
                self._fan_out_table != other._fan_out_table):
            return False

        return super(FilePackIndex, self).__eq__(other)

    def close(self):
        self._file.close()
        # mmap objects need an explicit close; plain bytes have no close().
        if getattr(self._contents, "close", None) is not None:
            self._contents.close()

    def __len__(self):
        """Return the number of entries in this pack index."""
        # The last fan-out slot is the cumulative count of all objects.
        return self._fan_out_table[-1]

    def _unpack_entry(self, i):
        """Unpack the i-th entry in the index file.

        :return: Tuple with object name (SHA), offset in pack file and CRC32
            checksum (if known).
        """
        raise NotImplementedError(self._unpack_entry)

    def _unpack_name(self, i):
        """Unpack the i-th name from the index file."""
        raise NotImplementedError(self._unpack_name)

    def _unpack_offset(self, i):
        """Unpack the i-th object offset from the index file."""
        raise NotImplementedError(self._unpack_offset)

    def _unpack_crc32_checksum(self, i):
        """Unpack the crc32 checksum for the i-th object from the index file."""
        raise NotImplementedError(self._unpack_crc32_checksum)

    def _itersha(self):
        for i in range(len(self)):
            yield self._unpack_name(i)

    def iterentries(self):
        """Iterate over the entries in this pack index.

        :return: iterator over tuples with object name, offset in packfile and
            crc32 checksum.
        """
        for i in range(len(self)):
            yield self._unpack_entry(i)

    def _read_fan_out_table(self, start_offset):
        # 256 big-endian 4-byte cumulative counts, one per leading SHA byte.
        ret = []
        for i in range(0x100):
            fanout_entry = self._contents[start_offset+i*4:start_offset+(i+1)*4]
            ret.append(struct.unpack('>L', fanout_entry)[0])
        return ret

    def check(self):
        """Check that the stored checksum matches the actual checksum."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        if actual != stored:
            raise ChecksumMismatch(stored, actual)

    def calculate_checksum(self):
        """Calculate the SHA1 checksum over this pack index.

        :return: This is a 20-byte binary digest
        """
        # Everything except the trailing 20-byte stored checksum is hashed.
        return sha1(self._contents[:-20]).digest()

    def get_pack_checksum(self):
        """Return the SHA1 checksum stored for the corresponding packfile.

        :return: 20-byte binary digest
        """
        # The pack checksum sits immediately before the index's own checksum.
        return bytes(self._contents[-40:-20])

    def get_stored_checksum(self):
        """Return the SHA1 checksum stored for this index.

        :return: 20-byte binary digest
        """
        return bytes(self._contents[-20:])

    def _object_index(self, sha):
        """See object_index.

        :param sha: A *binary* SHA string. (20 characters long)_
        """
        assert len(sha) == 20
        # Use the fan-out table to narrow the bisect range to entries whose
        # first byte matches.
        idx = ord(sha[:1])
        if idx == 0:
            start = 0
        else:
            start = self._fan_out_table[idx-1]
        end = self._fan_out_table[idx]
        i = bisect_find_sha(start, end, sha, self._unpack_name)
        if i is None:
            raise KeyError(sha)
        return self._unpack_offset(i)
576 |
|
577 |
|
578 |
class PackIndex1(FilePackIndex):
    """Version 1 Pack Index file."""

    def __init__(self, filename, file=None, contents=None, size=None):
        super(PackIndex1, self).__init__(filename, file, contents, size)
        self.version = 1
        self._fan_out_table = self._read_fan_out_table(0)

    def _unpack_entry(self, i):
        # Entries follow the 0x400-byte fan-out table, 24 bytes each:
        # 4-byte big-endian offset followed by a 20-byte SHA.
        base = 0x400 + i * 24
        offset, name = unpack_from('>L20s', self._contents, base)
        return (name, offset, None)

    def _unpack_name(self, i):
        # Skip the 4-byte offset to reach the SHA within the entry.
        base = 0x400 + i * 24 + 4
        return self._contents[base:base + 20]

    def _unpack_offset(self, i):
        return unpack_from('>L', self._contents, 0x400 + i * 24)[0]

    def _unpack_crc32_checksum(self, i):
        # v1 index files carry no per-object CRC32.
        return None
602 |
|
603 |
|
604 |
class PackIndex2(FilePackIndex):
    """Version 2 Pack Index file."""

    def __init__(self, filename, file=None, contents=None, size=None):
        super(PackIndex2, self).__init__(filename, file, contents, size)
        if self._contents[:4] != b'\377tOc':
            raise AssertionError('Not a v2 pack index file')
        (self.version, ) = unpack_from(b'>L', self._contents, 4)
        if self.version != 2:
            raise AssertionError('Version was %d' % self.version)
        self._fan_out_table = self._read_fan_out_table(8)
        # v2 layout after the 8-byte header and 1024-byte fan-out table:
        # name table (20B each), CRC32 table (4B each), 4-byte offset table,
        # then the 8-byte large-offset table for packs > 2 GiB.
        self._name_table_offset = 8 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + 20 * len(self)
        self._pack_offset_table_offset = (self._crc32_table_offset +
                                          4 * len(self))
        self._pack_offset_largetable_offset = (self._pack_offset_table_offset +
                                               4 * len(self))

    def _unpack_entry(self, i):
        return (self._unpack_name(i), self._unpack_offset(i),
                self._unpack_crc32_checksum(i))

    def _unpack_name(self, i):
        offset = self._name_table_offset + i * 20
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = self._pack_offset_table_offset + i * 4
        offset = unpack_from('>L', self._contents, offset)[0]
        # MSB set means the low 31 bits index into the 64-bit large-offset
        # table instead of being the offset itself.
        if offset & (2**31):
            offset = self._pack_offset_largetable_offset + (offset&(2**31-1)) * 8
            offset = unpack_from('>Q', self._contents, offset)[0]
        return offset

    def _unpack_crc32_checksum(self, i):
        return unpack_from('>L', self._contents,
                           self._crc32_table_offset + i * 4)[0]
641 |
|
642 |
|
643 |
def read_pack_header(read):
    """Read the 12-byte header of a pack file.

    :param read: Read function
    :return: Tuple of (pack version, number of objects). If no data is
        available to read, returns (None, None).
    """
    data = read(12)
    if not data:
        return None, None
    if data[:4] != b'PACK':
        raise AssertionError('Invalid pack header %r' % data)
    (version,) = unpack_from(b'>L', data, 4)
    if version not in (2, 3):
        raise AssertionError('Version was %d' % version)
    (count,) = unpack_from(b'>L', data, 8)
    return (version, count)
|
660 |
|
661 |
|
662 |
def chunks_length(chunks):
    """Return the total byte length of a bytes object or iterable of chunks."""
    if isinstance(chunks, bytes):
        return len(chunks)
    return sum(len(chunk) for chunk in chunks)
667 |
|
668 |
|
669 |
def unpack_object(read_all, read_some=None, compute_crc32=False,
                  include_comp=False, zlib_bufsize=_ZLIB_BUFSIZE):
    """Unpack a Git object.

    :param read_all: Read function that blocks until the number of requested
        bytes are read.
    :param read_some: Read function that returns at least one byte, but may not
        return the number of bytes requested.
    :param compute_crc32: If True, compute the CRC32 of the compressed data. If
        False, the returned CRC32 will be None.
    :param include_comp: If True, include compressed data in the result.
    :param zlib_bufsize: An optional buffer size for zlib operations.
    :return: A tuple of (unpacked, unused), where unused is the unused data
        leftover from decompression, and unpacked in an UnpackedObject with
        the following attrs set:

        * obj_chunks     (for non-delta types)
        * pack_type_num
        * delta_base     (for delta types)
        * comp_chunks    (if include_comp is True)
        * decomp_chunks
        * decomp_len
        * crc32          (if compute_crc32 is True)
    """
    if read_some is None:
        read_some = read_all
    if compute_crc32:
        crc32 = 0
    else:
        crc32 = None

    # Object header: a varint where the first byte holds the 3-bit type and
    # the low 4 bits of the size; subsequent bytes add 7 size bits each.
    # NOTE: 'bytes' shadows the builtin here (kept for byte-compatibility).
    bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
    type_num = (bytes[0] >> 4) & 0x07
    size = bytes[0] & 0x0f
    for i, byte in enumerate(bytes[1:]):
        size += (byte & 0x7f) << ((i * 7) + 4)

    raw_base = len(bytes)
    if type_num == OFS_DELTA:
        # Negative offset to the delta base, encoded big-endian with a +1
        # bias applied per continuation byte (matches git's pack format).
        bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
        raw_base += len(bytes)
        if bytes[-1] & 0x80:
            raise AssertionError
        delta_base_offset = bytes[0] & 0x7f
        for byte in bytes[1:]:
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += (byte & 0x7f)
        delta_base = delta_base_offset
    elif type_num == REF_DELTA:
        # Delta base identified by its raw 20-byte SHA.
        delta_base = read_all(20)
        if compute_crc32:
            crc32 = binascii.crc32(delta_base, crc32)
        raw_base += 20
    else:
        delta_base = None

    unpacked = UnpackedObject(type_num, delta_base, size, crc32)
    unused = read_zlib_chunks(read_some, unpacked, buffer_size=zlib_bufsize,
                              include_comp=include_comp)
    return unpacked, unused
|
730 |
|
731 |
|
732 |
def _compute_object_size(value):
    """Compute the size of a unresolved object for use with LRUSizeCache."""
    num, obj = value
    # For delta types, obj is a (base, chunks) pair: only the chunks count.
    payload = obj[1] if num in DELTA_TYPES else obj
    return chunks_length(payload)
|
738 |
|
739 |
|
740 |
class PackStreamReader(object):
    """Class to read a pack stream.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate.
    """

    def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE):
        self.read_all = read_all
        if read_some is None:
            self.read_some = read_all
        else:
            self.read_some = read_some
        # Running SHA-1 of everything read except the 20-byte trailer.
        self.sha = sha1()
        self._offset = 0
        self._rbuf = BytesIO()
        # trailer is a deque to avoid memory allocation on small reads
        self._trailer = deque()
        self._zlib_bufsize = zlib_bufsize

    def _read(self, read, size):
        """Read up to size bytes using the given callback.

        As a side effect, update the verifier's hash (excluding the last 20
        bytes read).

        :param read: The read callback to read from.
        :param size: The maximum number of bytes to read; the particular
            behavior is callback-specific.
        """
        data = read(size)

        # maintain a trailer of the last 20 bytes we've read
        n = len(data)
        self._offset += n
        tn = len(self._trailer)
        if n >= 20:
            # The new data alone fills the trailer; flush the old one.
            to_pop = tn
            to_add = 20
        else:
            # Keep the combined tail at 20 bytes, popping any overflow.
            to_pop = max(n + tn - 20, 0)
            to_add = n
        self.sha.update(bytes(bytearray([self._trailer.popleft() for _ in range(to_pop)])))
        self._trailer.extend(data[-to_add:])

        # hash everything but the trailer
        self.sha.update(data[:-to_add])
        return data

    def _buf_len(self):
        # Number of unread bytes left in the internal buffer, preserving
        # the current read position.
        buf = self._rbuf
        start = buf.tell()
        buf.seek(0, SEEK_END)
        end = buf.tell()
        buf.seek(start)
        return end - start

    @property
    def offset(self):
        # Logical stream offset: bytes consumed minus bytes still buffered.
        return self._offset - self._buf_len()

    def read(self, size):
        """Read, blocking until size bytes are read."""
        buf_len = self._buf_len()
        if buf_len >= size:
            return self._rbuf.read(size)
        buf_data = self._rbuf.read()
        self._rbuf = BytesIO()
        return buf_data + self._read(self.read_all, size - buf_len)

    def recv(self, size):
        """Read up to size bytes, blocking until one byte is read."""
        buf_len = self._buf_len()
        if buf_len:
            data = self._rbuf.read(size)
            if size >= buf_len:
                # Buffer fully drained; reset it.
                self._rbuf = BytesIO()
            return data
        return self._read(self.read_some, size)

    def __len__(self):
        # Only valid after read_objects() has read the pack header.
        return self._num_objects

    def read_objects(self, compute_crc32=False):
        """Read the objects in this pack file.

        :param compute_crc32: If True, compute the CRC32 of the compressed
            data. If False, the returned CRC32 will be None.
        :return: Iterator over UnpackedObjects with the following members set:
            offset
            obj_type_num
            obj_chunks (for non-delta types)
            delta_base (for delta types)
            decomp_chunks
            decomp_len
            crc32 (if compute_crc32 is True)
        :raise ChecksumMismatch: if the checksum of the pack contents does not
            match the checksum in the pack trailer.
        :raise zlib.error: if an error occurred during zlib decompression.
        :raise IOError: if an error occurred writing to the output file.
        """
        pack_version, self._num_objects = read_pack_header(self.read)
        if pack_version is None:
            return

        for i in range(self._num_objects):
            offset = self.offset
            unpacked, unused = unpack_object(
              self.read, read_some=self.recv, compute_crc32=compute_crc32,
              zlib_bufsize=self._zlib_bufsize)
            unpacked.offset = offset

            # prepend any unused data to current read buffer
            buf = BytesIO()
            buf.write(unused)
            buf.write(self._rbuf.read())
            buf.seek(0)
            self._rbuf = buf

            yield unpacked

        if self._buf_len() < 20:
            # If the read buffer is full, then the last read() got the whole
            # trailer off the wire. If not, it means there is still some of the
            # trailer to read. We need to read() all 20 bytes; N come from the
            # read buffer and (20 - N) come from the wire.
            self.read(20)

        pack_sha = bytearray(self._trailer)
        if pack_sha != self.sha.digest():
            raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())
871 |
|
872 |
|
873 |
class PackStreamCopier(PackStreamReader):
    """Class to verify a pack stream as it is being read.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate and written out to the given file-like object.
    """

    def __init__(self, read_all, read_some, outfile, delta_iter=None):
        """Initialize the copier.

        :param read_all: Read function that blocks until the number of
            requested bytes are read.
        :param read_some: Read function that returns at least one byte, but
            may not return the number of bytes requested.
        :param outfile: File-like object to write output through.
        :param delta_iter: Optional DeltaChainIterator to record deltas as we
            read them.
        """
        super(PackStreamCopier, self).__init__(read_all, read_some=read_some)
        self.outfile = outfile
        self._delta_iter = delta_iter

    def _read(self, read, size):
        """Read via the parent class, mirroring each chunk to the outfile."""
        chunk = super(PackStreamCopier, self)._read(read, size)
        self.outfile.write(chunk)
        return chunk

    def verify(self):
        """Verify a pack stream and write it to the output file.

        See PackStreamReader.iterobjects for a list of exceptions this may
        throw.
        """
        record = self._delta_iter.record if self._delta_iter else None
        for unpacked in self.read_objects():
            if record is not None:
                record(unpacked)
|
913 |
|
914 |
|
915 |
def obj_sha(type, chunks):
    """Compute the SHA for a numeric type and object chunks."""
    digest = sha1()
    digest.update(object_header(type, chunks_length(chunks)))
    # Accept either a single bytes object or an iterable of chunks.
    if isinstance(chunks, bytes):
        digest.update(chunks)
    else:
        for chunk in chunks:
            digest.update(chunk)
    return digest.digest()
|
925 |
|
926 |
|
927 |
def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1 << 16):
    """Hash a portion of a file into a new SHA.

    :param f: A file-like object to read from that supports seek().
    :param start_ofs: The offset in the file to start reading at.
    :param end_ofs: The offset in the file to end reading at, relative to the
        end of the file.
    :param buffer_size: A buffer size for reading.
    :return: A new SHA object updated with data read from the file.
    """
    digest = sha1()
    # Determine the file length so the requested window can be validated.
    f.seek(0, SEEK_END)
    length = f.tell()
    if (end_ofs < 0 and length + end_ofs < start_ofs) or end_ofs > length:
        raise AssertionError(
            "Attempt to read beyond file length. "
            "start_ofs: %d, end_ofs: %d, file length: %d" % (
                start_ofs, end_ofs, length))
    remaining = length + end_ofs - start_ofs
    f.seek(start_ofs)
    while remaining:
        data = f.read(min(remaining, buffer_size))
        digest.update(data)
        remaining -= len(data)
    return digest
|
952 |
|
953 |
|
954 |
class PackData(object):
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next MS bits are the type, which tells you the type
    of object, and whether it is a delta. The LS byte is the lowest bits of
    the size. For each subsequent byte the LS 7 bits are the next MS bits of
    the size, i.e. the last byte of the header contains the MS bits of the
    size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to
    try and detect the delta case, or a request for an object at the wrong
    position. It will all just throw a zlib or KeyError.
    """

    def __init__(self, filename, file=None, size=None):
        """Create a PackData object representing the pack in the given
        filename.

        The file must exist and stay readable until the object is disposed
        of. It must also stay the same size. It will be mapped whenever
        needed.

        Currently there is a restriction on the size of the pack as the
        python mmap implementation is flawed.
        """
        self._filename = filename
        self._size = size
        self._header_size = 12
        if file is None:
            self._file = GitFile(self._filename, 'rb')
        else:
            self._file = file
        (version, self._num_objects) = read_pack_header(self._file.read)
        self._offset_cache = LRUSizeCache(
            1024*1024*20, compute_size=_compute_object_size)
        self.pack = None

    @property
    def filename(self):
        return os.path.basename(self._filename)

    @classmethod
    def from_file(cls, file, size):
        return cls(str(file), file=file, size=size)

    @classmethod
    def from_path(cls, path):
        return cls(filename=path)

    def close(self):
        self._file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _get_size(self):
        # Lazily determine and cache the on-disk size of the pack file.
        if self._size is not None:
            return self._size
        self._size = os.path.getsize(self._filename)
        if self._size < self._header_size:
            errmsg = ('%s is too small for a packfile (%d < %d)' %
                      (self._filename, self._size, self._header_size))
            raise AssertionError(errmsg)
        return self._size

    def __len__(self):
        """Returns the number of objects in this pack."""
        return self._num_objects

    def calculate_checksum(self):
        """Calculate the checksum for this pack.

        :return: 20-byte binary SHA1 digest
        """
        # The last 20 bytes of the file are the stored checksum itself.
        return compute_file_sha(self._file, end_ofs=-20).digest()

    def get_ref(self, sha):
        """Get the object for a ref SHA, only looking in this pack."""
        # TODO: cache these results
        if self.pack is None:
            raise KeyError(sha)
        try:
            offset = self.pack.index.object_index(sha)
        except KeyError:
            offset = None
        if offset:
            type, obj = self.get_object_at(offset)
        elif self.pack is not None and self.pack.resolve_ext_ref:
            type, obj = self.pack.resolve_ext_ref(sha)
        else:
            raise KeyError(sha)
        return offset, type, obj

    def resolve_object(self, offset, type, obj, get_ref=None):
        """Resolve an object, possibly resolving deltas when necessary.

        :return: Tuple with object type and contents.
        """
        # Walk down the delta chain, building a stack of deltas to reach
        # the requested object.
        base_offset = offset
        base_type = type
        base_obj = obj
        delta_stack = []
        while base_type in DELTA_TYPES:
            prev_offset = base_offset
            if get_ref is None:
                get_ref = self.get_ref
            if base_type == OFS_DELTA:
                (delta_offset, delta) = base_obj
                # TODO: clean up asserts and replace with nicer error messages
                # NOTE: 'long' exists only on Python 2; on Python 3 the second
                # operand is only evaluated if the int check fails.
                assert (
                    isinstance(base_offset, int)
                    or isinstance(base_offset, long))
                # BUGFIX: this assert previously re-checked base_offset
                # instead of delta_offset (copy-paste error).
                assert (
                    isinstance(delta_offset, int)
                    or isinstance(delta_offset, long))
                base_offset = base_offset - delta_offset
                base_type, base_obj = self.get_object_at(base_offset)
                assert isinstance(base_type, int)
            elif base_type == REF_DELTA:
                (basename, delta) = base_obj
                assert isinstance(basename, bytes) and len(basename) == 20
                base_offset, base_type, base_obj = get_ref(basename)
                assert isinstance(base_type, int)
            delta_stack.append((prev_offset, base_type, delta))

        # Now grab the base object (mustn't be a delta) and apply the
        # deltas all the way up the stack.
        chunks = base_obj
        for prev_offset, delta_type, delta in reversed(delta_stack):
            chunks = apply_delta(chunks, delta)
            # TODO(dborowitz): This can result in poor performance if
            # large base objects are separated from deltas in the pack.
            # We should reorganize so that we apply deltas to all
            # objects in a chain one after the other to optimize cache
            # performance.
            if prev_offset is not None:
                self._offset_cache[prev_offset] = base_type, chunks
        return base_type, chunks

    def iterobjects(self, progress=None, compute_crc32=True):
        # Sequentially unpack every object, yielding
        # (offset, pack_type_num, obj, crc32) for each.
        self._file.seek(self._header_size)
        for i in range(1, self._num_objects + 1):
            offset = self._file.tell()
            unpacked, unused = unpack_object(
                self._file.read, compute_crc32=compute_crc32)
            if progress is not None:
                progress(i, self._num_objects)
            yield (offset, unpacked.pack_type_num, unpacked._obj(),
                   unpacked.crc32)
            # Back up over unused data.
            self._file.seek(-len(unused), SEEK_CUR)

    def _iter_unpacked(self):
        # TODO(dborowitz): Merge this with iterobjects, if we can change its
        # return type.
        self._file.seek(self._header_size)

        if self._num_objects is None:
            return

        for _ in range(self._num_objects):
            offset = self._file.tell()
            unpacked, unused = unpack_object(
                self._file.read, compute_crc32=False)
            unpacked.offset = offset
            yield unpacked
            # Back up over unused data.
            self._file.seek(-len(unused), SEEK_CUR)

    def iterentries(self, progress=None):
        """Yield entries summarizing the contents of this pack.

        :param progress: Progress function, called with current and total
            object count.
        :return: iterator of tuples with (sha, offset, crc32)
        """
        num_objects = self._num_objects
        resolve_ext_ref = (
            self.pack.resolve_ext_ref if self.pack is not None else None)
        indexer = PackIndexer.for_pack_data(
            self, resolve_ext_ref=resolve_ext_ref)
        for i, result in enumerate(indexer):
            if progress is not None:
                progress(i, num_objects)
            yield result

    def sorted_entries(self, progress=None):
        """Return entries in this pack, sorted by SHA.

        :param progress: Progress function, called with current and total
            object count
        :return: List of tuples with (sha, offset, crc32)
        """
        ret = list(self.iterentries(progress=progress))
        ret.sort()
        return ret

    def create_index_v1(self, filename, progress=None):
        """Create a version 1 file for this data file.

        :param filename: Index filename.
        :param progress: Progress report function
        :return: Checksum of index file
        """
        entries = self.sorted_entries(progress=progress)
        with GitFile(filename, 'wb') as f:
            return write_pack_index_v1(f, entries, self.calculate_checksum())

    def create_index_v2(self, filename, progress=None):
        """Create a version 2 index file for this data file.

        :param filename: Index filename.
        :param progress: Progress report function
        :return: Checksum of index file
        """
        entries = self.sorted_entries(progress=progress)
        with GitFile(filename, 'wb') as f:
            return write_pack_index_v2(f, entries, self.calculate_checksum())

    def create_index(self, filename, progress=None, version=2):
        """Create an index file for this data file.

        :param filename: Index filename.
        :param progress: Progress report function
        :param version: Index format version (1 or 2)
        :return: Checksum of index file
        :raise ValueError: if version is not 1 or 2
        """
        if version == 1:
            return self.create_index_v1(filename, progress)
        elif version == 2:
            return self.create_index_v2(filename, progress)
        else:
            raise ValueError('unknown index format %d' % version)

    def get_stored_checksum(self):
        """Return the expected checksum stored in this pack."""
        self._file.seek(-20, SEEK_END)
        return self._file.read(20)

    def check(self):
        """Check the consistency of this pack."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        if actual != stored:
            raise ChecksumMismatch(stored, actual)

    def get_object_at(self, offset):
        """Given an offset in to the packfile return the object that is there.

        Using the associated index the location of an object can be looked
        up, and then the packfile can be asked directly for that object using
        this function.
        """
        try:
            return self._offset_cache[offset]
        except KeyError:
            pass
        assert offset >= self._header_size
        self._file.seek(offset)
        unpacked, _ = unpack_object(self._file.read)
        return (unpacked.pack_type_num, unpacked._obj())
|
1229 |
|
1230 |
|
1231 |
class DeltaChainIterator(object):
    """Abstract iterator over pack data based on delta chains.

    Each object in the pack is guaranteed to be inflated exactly once,
    regardless of how many objects reference it as a delta base. As a result,
    memory usage is proportional to the length of the longest delta chain.

    Subclasses can override _result to define the result type of the
    iterator. By default, results are UnpackedObjects with the following
    members set:

    * offset
    * obj_type_num
    * obj_chunks
    * pack_type_num
    * delta_base     (for delta types)
    * comp_chunks    (if _include_comp is True)
    * decomp_chunks
    * decomp_len
    * crc32          (if _compute_crc32 is True)
    """

    _compute_crc32 = False
    _include_comp = False

    def __init__(self, file_obj, resolve_ext_ref=None):
        self._file = file_obj
        self._resolve_ext_ref = resolve_ext_ref
        # Offsets of deltas keyed by the offset / ref of their base.
        self._pending_ofs = defaultdict(list)
        self._pending_ref = defaultdict(list)
        # Non-delta objects: (offset, type_num) pairs.
        self._full_ofs = []
        self._shas = {}
        self._ext_refs = []

    @classmethod
    def for_pack_data(cls, pack_data, resolve_ext_ref=None):
        walker = cls(None, resolve_ext_ref=resolve_ext_ref)
        walker.set_pack_data(pack_data)
        for unpacked in pack_data._iter_unpacked():
            walker.record(unpacked)
        return walker

    def record(self, unpacked):
        """File an unpacked object under its base (or as a chain root)."""
        type_num = unpacked.pack_type_num
        offset = unpacked.offset
        if type_num == OFS_DELTA:
            base_offset = offset - unpacked.delta_base
            self._pending_ofs[base_offset].append(offset)
        elif type_num == REF_DELTA:
            self._pending_ref[unpacked.delta_base].append(offset)
        else:
            self._full_ofs.append((offset, type_num))

    def set_pack_data(self, pack_data):
        self._file = pack_data._file

    def _walk_all_chains(self):
        # First resolve every chain rooted at a full object, then chains
        # rooted at external refs.
        for offset, type_num in self._full_ofs:
            for result in self._follow_chain(offset, type_num, None):
                yield result
        for result in self._walk_ref_chains():
            yield result
        assert not self._pending_ofs

    def _ensure_no_pending(self):
        if self._pending_ref:
            raise KeyError([sha_to_hex(s) for s in self._pending_ref])

    def _walk_ref_chains(self):
        if not self._resolve_ext_ref:
            self._ensure_no_pending()
            return

        for base_sha, pending in sorted(self._pending_ref.items()):
            # Entries may have been popped while following earlier chains.
            if base_sha not in self._pending_ref:
                continue
            try:
                type_num, chunks = self._resolve_ext_ref(base_sha)
            except KeyError:
                # Not an external ref, but may depend on one. Either it will
                # get popped via a _follow_chain call, or we will raise an
                # error below.
                continue
            self._ext_refs.append(base_sha)
            self._pending_ref.pop(base_sha)
            for new_offset in pending:
                for result in self._follow_chain(new_offset, type_num,
                                                 chunks):
                    yield result

        self._ensure_no_pending()

    def _result(self, unpacked):
        return unpacked

    def _resolve_object(self, offset, obj_type_num, base_chunks):
        # Inflate the object at offset; apply the delta if a base was given.
        self._file.seek(offset)
        unpacked, _ = unpack_object(
            self._file.read, include_comp=self._include_comp,
            compute_crc32=self._compute_crc32)
        unpacked.offset = offset
        if base_chunks is None:
            assert unpacked.pack_type_num == obj_type_num
        else:
            assert unpacked.pack_type_num in DELTA_TYPES
            unpacked.obj_type_num = obj_type_num
            unpacked.obj_chunks = apply_delta(base_chunks,
                                              unpacked.decomp_chunks)
        return unpacked

    def _follow_chain(self, offset, obj_type_num, base_chunks):
        # Unlike PackData.get_object_at, there is no need to cache offsets as
        # this approach by design inflates each object exactly once.
        todo = [(offset, obj_type_num, base_chunks)]
        for offset, obj_type_num, base_chunks in todo:
            unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
            yield self._result(unpacked)

            # Any deltas that were waiting on this object are now unblocked.
            unblocked = chain(self._pending_ofs.pop(unpacked.offset, []),
                              self._pending_ref.pop(unpacked.sha(), []))
            todo.extend(
                (new_offset, unpacked.obj_type_num, unpacked.obj_chunks)
                for new_offset in unblocked)

    def __iter__(self):
        return self._walk_all_chains()

    def ext_refs(self):
        return self._ext_refs
1358 |
|
1359 |
|
1360 |
class PackIndexer(DeltaChainIterator):
    """Delta chain iterator that yields index entries."""

    # Index entries need CRC32s of the compressed object data.
    _compute_crc32 = True

    def _result(self, unpacked):
        return unpacked.sha(), unpacked.offset, unpacked.crc32
|
1367 |
|
1368 |
|
1369 |
class PackInflater(DeltaChainIterator):
    """Delta chain iterator that yields ShaFile objects."""

    def _result(self, unpacked):
        return unpacked.sha_file()
|
1374 |
|
1375 |
|
1376 |
class SHA1Reader(object):
    """Wrapper around a file-like object that remembers the SHA1 of its
    data."""

    def __init__(self, f):
        self.f = f
        self.sha1 = sha1(b'')

    def read(self, num=None):
        """Read from the wrapped file, folding the bytes into the digest."""
        data = self.f.read(num)
        self.sha1.update(data)
        return data

    def check_sha(self):
        """Compare the next 20 bytes against the running digest.

        :raise ChecksumMismatch: if the stored checksum differs.
        """
        stored = self.f.read(20)
        if stored != self.sha1.digest():
            raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))

    def close(self):
        return self.f.close()

    def tell(self):
        return self.f.tell()
1398 |
|
1399 |
|
1400 |
class SHA1Writer(object):
    """Wrapper around a file-like object that remembers the SHA1 of its
    data."""

    def __init__(self, f):
        self.f = f
        self.length = 0  # total bytes written, including the trailing SHA
        self.sha1 = sha1(b'')

    def write(self, data):
        """Write data, updating both the digest and the byte count."""
        self.sha1.update(data)
        self.f.write(data)
        self.length += len(data)

    def write_sha(self):
        """Append the 20-byte digest of everything written so far.

        :return: The digest that was written.
        """
        sha = self.sha1.digest()
        assert len(sha) == 20
        self.f.write(sha)
        self.length += len(sha)
        return sha

    def close(self):
        """Write the trailing SHA and close the underlying file."""
        sha = self.write_sha()
        self.f.close()
        return sha

    def offset(self):
        return self.length

    def tell(self):
        return self.f.tell()
1430 |
|
1431 |
|
1432 |
def pack_object_header(type_num, delta_base, size):
    """Create a pack object header for the given object info.

    :param type_num: Numeric type of the object.
    :param delta_base: Delta base offset or ref, or None for whole objects.
    :param size: Uncompressed object size.
    :return: A header for a packed object.
    """
    header = []
    # First byte: 3-bit type plus the low 4 bits of the size; the MSB marks
    # continuation.
    c = (type_num << 4) | (size & 15)
    size >>= 4
    while size:
        header.append(c | 0x80)
        c = size & 0x7f
        size >>= 7
    header.append(c)
    if type_num == OFS_DELTA:
        # Negative offset, big-endian 7-bit groups with the "minus one"
        # bias on all but the least-significant group.
        ret = [delta_base & 0x7f]
        delta_base >>= 7
        while delta_base:
            delta_base -= 1
            ret.insert(0, 0x80 | (delta_base & 0x7f))
            delta_base >>= 7
        header.extend(ret)
    elif type_num == REF_DELTA:
        assert len(delta_base) == 20
        header += delta_base
    return bytearray(header)
1460 |
|
1461 |
|
1462 |
def write_pack_object(f, type, object, sha=None):
    """Write pack object to a file.

    :param f: File to write to
    :param type: Numeric type of the object
    :param object: Object to write; for delta types, a tuple of
        (delta base, delta data)
    :param sha: Optional SHA-1 object to update with the bytes written
    :return: CRC32 checksum of the data written
    """
    # Docstring fixed: this function returns only the CRC32, not a
    # (offset, crc32) tuple as previously documented.
    if type in DELTA_TYPES:
        delta_base, object = object
    else:
        delta_base = None
    header = bytes(pack_object_header(type, delta_base, len(object)))
    comp_data = zlib.compress(object)
    crc32 = 0
    for data in (header, comp_data):
        f.write(data)
        if sha is not None:
            sha.update(data)
        crc32 = binascii.crc32(data, crc32)
    # Mask to keep the CRC unsigned and consistent across Python versions.
    return crc32 & 0xffffffff
1483 |
|
1484 |
|
1485 |
def write_pack(filename, objects, deltify=None, delta_window_size=None):
    """Write a new pack data file.

    :param filename: Path to the new pack file (without .pack extension)
    :param objects: Iterable of (object, path) tuples to write.
        Should provide __len__
    :param deltify: Whether to deltify pack objects
    :param delta_window_size: Delta window size
    :return: Tuple with checksum of pack file and index file
    """
    # Docstring fixed: parameter is delta_window_size, not window_size.
    with GitFile(filename + '.pack', 'wb') as f:
        entries, data_sum = write_pack_objects(
            f, objects, delta_window_size=delta_window_size, deltify=deltify)
    entries = [(k, v[0], v[1]) for (k, v) in entries.items()]
    entries.sort()
    with GitFile(filename + '.idx', 'wb') as f:
        return data_sum, write_pack_index_v2(f, entries, data_sum)
|
1502 |
|
1503 |
|
1504 |
def write_pack_header(f, num_objects):
    """Write a pack header for the given number of objects."""
    f.write(b'PACK')                          # Pack signature
    f.write(struct.pack(b'>L', 2))            # Pack version (always 2)
    f.write(struct.pack(b'>L', num_objects))  # Number of objects in pack
1509 |
|
1510 |
|
1511 |
def deltify_pack_objects(objects, window_size=None):
    """Generate deltas for pack objects.

    :param objects: An iterable of (object, path) tuples to deltify.
    :param window_size: Window size; None for default
    :return: Iterator over type_num, object id, delta_base, content
        delta_base is None for full text entries
    """
    if window_size is None:
        window_size = DEFAULT_PACK_DELTA_WINDOW_SIZE
    # Build a list of objects ordered by the magic Linus heuristic
    # (type, path, descending size). This helps us find good objects to
    # diff against us.
    magic = []
    for obj, path in objects:
        magic.append((obj.type_num, path, -obj.raw_length(), obj))
    magic.sort()

    possible_bases = deque()

    for type_num, path, neg_length, o in magic:
        raw = o.as_raw_string()
        # The full text is the fallback "winner" if no delta is smaller.
        winner = raw
        winner_base = None
        for base in possible_bases:
            if base.type_num != type_num:
                continue
            delta = create_delta(base.as_raw_string(), raw)
            if len(delta) < len(winner):
                winner_base = base.sha().digest()
                winner = delta
        yield type_num, o.sha().digest(), winner_base, winner
        # Slide the window: newest candidate in front, oldest dropped.
        possible_bases.appendleft(o)
        while len(possible_bases) > window_size:
            possible_bases.pop()
1545 |
|
1546 |
|
1547 |
def write_pack_objects(f, objects, delta_window_size=None, deltify=False):
    """Write a new pack data file.

    :param f: File to write to
    :param objects: Iterable of (object, path) tuples to write.
        Should provide __len__
    :param delta_window_size: Sliding window size for searching for deltas;
        Set to None for default window size.
    :param deltify: Whether to deltify objects
    :return: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    # Docstring fixed: parameter is delta_window_size, not window_size.
    if deltify:
        pack_contents = deltify_pack_objects(objects, delta_window_size)
    else:
        # Emit each object whole, with no delta base.
        pack_contents = (
            (o.type_num, o.sha().digest(), None, o.as_raw_string())
            for (o, path) in objects)

    return write_pack_data(f, len(objects), pack_contents)
1566 |
|
1567 |
|
1568 |
def write_pack_data(f, num_records, records):
    """Write a new pack data file.

    :param f: File to write to
    :param num_records: Number of records
    :param records: Iterator over type_num, object_id, delta_base, raw
    :return: Dict mapping id -> (offset, crc32 checksum), pack checksum
    """
    # Write the pack
    entries = {}
    f = SHA1Writer(f)
    write_pack_header(f, num_records)
    for type_num, object_id, delta_base, raw in records:
        offset = f.offset()
        if delta_base is not None:
            try:
                base_offset, base_crc32 = entries[delta_base]
            except KeyError:
                # Base not written yet in this pack: emit a ref delta.
                type_num = REF_DELTA
                raw = (delta_base, raw)
            else:
                # Base already written: emit an offset delta.
                type_num = OFS_DELTA
                raw = (offset - base_offset, raw)
        crc32 = write_pack_object(f, type_num, raw)
        entries[object_id] = (offset, crc32)
    return entries, f.write_sha()
|
1594 |
|
1595 |
|
1596 |
def write_pack_index_v1(f, entries, pack_checksum):
    """Write a new pack index file.

    :param f: A file-like object to write to
    :param entries: List of tuples with object name (sha), offset_in_pack,
        and crc32_checksum (the crc32 is unused in the v1 format).
    :param pack_checksum: Checksum of the pack file.
    :return: The SHA of the written index file
    """
    f = SHA1Writer(f)
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[:1])] += 1
    # Fan-out table: cumulative counts of SHAs per first byte.
    for i in range(0x100):
        f.write(struct.pack('>L', fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        if not (offset <= 0xffffffff):
            raise TypeError("pack format 1 only supports offsets < 2Gb")
        f.write(struct.pack('>L20s', offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()
|
1620 |
|
1621 |
|
1622 |
def _delta_encode_size(size): |
1623 |
ret = bytearray()
|
1624 |
c = size & 0x7f
|
1625 |
size >>= 7
|
1626 |
while size:
|
1627 |
ret.append(c | 0x80)
|
1628 |
c = size & 0x7f
|
1629 |
size >>= 7
|
1630 |
ret.append(c) |
1631 |
return ret
|
1632 |
|
1633 |
|
1634 |
# The length of delta compression copy operations in version 2 packs is limited
|
1635 |
# to 64K. To copy more, we use several copy operations. Version 3 packs allow
|
1636 |
# 24-bit lengths in copy operations, but we always make version 2 packs.
|
1637 |
_MAX_COPY_LEN = 0xffff
|
1638 |
|
1639 |
def _encode_copy_operation(start, length): |
1640 |
scratch = [] |
1641 |
op = 0x80
|
1642 |
for i in range(4): |
1643 |
if start & 0xff << i*8: |
1644 |
scratch.append((start >> i*8) & 0xff) |
1645 |
op |= 1 << i
|
1646 |
for i in range(2): |
1647 |
if length & 0xff << i*8: |
1648 |
scratch.append((length >> i*8) & 0xff) |
1649 |
op |= 1 << (4+i) |
1650 |
return bytearray([op] + scratch) |
1651 |
|
1652 |
|
1653 |
def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf.

    :param base_buf: Base buffer
    :param target_buf: Target buffer
    :return: Delta as bytes
    """
    assert isinstance(base_buf, bytes)
    assert isinstance(target_buf, bytes)
    out_buf = bytearray()
    # Delta header: source size then target size, varint-encoded.
    out_buf += _delta_encode_size(len(base_buf))
    out_buf += _delta_encode_size(len(target_buf))
    # Emit delta opcodes from the diff. Git patch opcodes don't care about
    # deletes, so only 'equal', 'replace' and 'insert' matter.
    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        if opcode == 'equal':
            # Equal ranges become copy operations referencing base_buf,
            # split to respect the per-op copy length limit.
            copy_start = i1
            copy_len = i2 - i1
            while copy_len > 0:
                to_copy = min(copy_len, _MAX_COPY_LEN)
                out_buf += _encode_copy_operation(copy_start, to_copy)
                copy_start += to_copy
                copy_len -= to_copy
        if opcode == 'replace' or opcode == 'insert':
            # Replaced or inserted ranges are emitted literally, in chunks
            # of at most 127 bytes (the insert-op size limit).
            s = j2 - j1
            o = j1
            while s > 127:
                out_buf.append(127)
                out_buf += bytearray(target_buf[o:o+127])
                s -= 127
                o += 127
            out_buf.append(s)
            out_buf += bytearray(target_buf[o:o+s])
    return bytes(out_buf)
1694 |
|
1695 |
|
1696 |
def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c.

    :param src_buf: Source buffer
    :param delta: Delta instructions
    :return: List of chunks making up the reconstructed target
    """
    if not isinstance(src_buf, bytes):
        src_buf = b''.join(src_buf)
    if not isinstance(delta, bytes):
        delta = b''.join(delta)
    out = []
    index = 0
    delta_length = len(delta)

    def get_delta_header_size(delta, index):
        # Decode a varint size (7 bits per byte, LSB group first).
        size = 0
        i = 0
        while delta:
            cmd = ord(delta[index:index+1])
            index += 1
            size |= (cmd & ~0x80) << i
            i += 7
            if not cmd & 0x80:
                break
        return size, index

    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    assert src_size == len(src_buf), '%d vs %d' % (src_size, len(src_buf))
    while index < delta_length:
        cmd = ord(delta[index:index+1])
        index += 1
        if cmd & 0x80:
            # Copy operation: decode optional offset and size bytes, flagged
            # by the low bits of the opcode.
            cp_off = 0
            for i in range(4):
                if cmd & (1 << i):
                    x = ord(delta[index:index+1])
                    index += 1
                    cp_off |= x << (i * 8)
            cp_size = 0
            # Version 3 packs can contain copy sizes larger than 64K.
            for i in range(3):
                if cmd & (1 << (4+i)):
                    x = ord(delta[index:index+1])
                    index += 1
                    cp_size |= x << (i * 8)
            if cp_size == 0:
                cp_size = 0x10000
            # Guard against overflow / out-of-range copies, as in
            # patch-delta.c.
            if (cp_off + cp_size < cp_size or
                    cp_off + cp_size > src_size or
                    cp_size > dest_size):
                break
            out.append(src_buf[cp_off:cp_off+cp_size])
        elif cmd != 0:
            # Insert operation: the next cmd bytes are literal data.
            out.append(delta[index:index+cmd])
            index += cmd
        else:
            raise ApplyDeltaError('Invalid opcode 0')

    if index != delta_length:
        raise ApplyDeltaError('delta not empty: %r' % delta[index:])

    if dest_size != chunks_length(out):
        raise ApplyDeltaError('dest size incorrect')

    return out
|
1760 |
|
1761 |
|
1762 |
def write_pack_index_v2(f, entries, pack_checksum):
    """Write a new pack index file.

    :param f: File-like object to write to
    :param entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    :return: The SHA of the index file written
    """
    writer = SHA1Writer(f)
    writer.write(b'\377tOc')  # Magic!
    writer.write(struct.pack('>L', 2))
    # Count how many entries start with each possible first byte of the SHA.
    fan_out_table = defaultdict(lambda: 0)
    for (sha, _, _) in entries:
        fan_out_table[ord(sha[:1])] += 1
    # Fan-out table: emit cumulative counts for all 256 buckets.
    for bucket in range(0x100):
        writer.write(struct.pack(b'>L', fan_out_table[bucket]))
        fan_out_table[bucket + 1] += fan_out_table[bucket]
    # SHA column.
    for (sha, _, _) in entries:
        writer.write(sha)
    # CRC32 column.
    for (_, _, crc32) in entries:
        writer.write(struct.pack(b'>L', crc32))
    # Offset column; offsets that do not fit in 31 bits are redirected
    # into a trailing 64-bit table via a high-bit-tagged index.
    largetable = []
    for (_, offset, _) in entries:
        if offset < 2**31:
            writer.write(struct.pack(b'>L', offset))
        else:
            writer.write(struct.pack(b'>L', 2**31 + len(largetable)))
            largetable.append(offset)
    for offset in largetable:
        writer.write(struct.pack(b'>Q', offset))
    assert len(pack_checksum) == 20
    writer.write(pack_checksum)
    return writer.write_sha()
|
1797 |
|
1798 |
|
1799 |
# Version 2 is the default index format written by this module.
write_pack_index = write_pack_index_v2
class Pack(object):
    """A Git pack object.

    Wraps a pack data file (``.pack``) and its index (``.idx``), both
    loaded lazily on first access.
    """

    def __init__(self, basename, resolve_ext_ref=None):
        # Path prefix shared by the .pack and .idx files.
        self._basename = basename
        # Lazily-populated caches for the data and index objects.
        self._data = None
        self._idx = None
        self._idx_path = self._basename + '.idx'
        self._data_path = self._basename + '.pack'
        # Loader callables; replaced by the alternate constructors below.
        self._data_load = lambda: PackData(self._data_path)
        self._idx_load = lambda: load_pack_index(self._idx_path)
        self.resolve_ext_ref = resolve_ext_ref

    @classmethod
    def from_lazy_objects(self, data_fn, idx_fn):
        """Create a new pack object from callables to load pack data and
        index objects."""
        # NOTE(review): first parameter of this classmethod is named 'self'
        # (conventionally 'cls'), and 'Pack' is hard-coded, so subclasses
        # get a plain Pack instance back — confirm whether intended.
        ret = Pack('')
        ret._data_load = data_fn
        ret._idx_load = idx_fn
        return ret

    @classmethod
    def from_objects(self, data, idx):
        """Create a new pack object from pack data and index objects."""
        # NOTE(review): same 'self'/'Pack' pattern as from_lazy_objects above.
        ret = Pack('')
        ret._data_load = lambda: data
        ret._idx_load = lambda: idx
        return ret

    def name(self):
        """The SHA over the SHAs of the objects in this pack."""
        return self.index.objects_sha1()

    @property
    def data(self):
        """The pack data object being used."""
        if self._data is None:
            # First access: load, link back to this pack, and cross-check
            # against the index before handing the data out.
            self._data = self._data_load()
            self._data.pack = self
            self.check_length_and_checksum()
        return self._data

    @property
    def index(self):
        """The index being used.

        :note: This may be an in-memory index
        """
        if self._idx is None:
            self._idx = self._idx_load()
        return self._idx

    def close(self):
        # Only close what was actually loaded; never trigger a lazy load here.
        if self._data is not None:
            self._data.close()
        if self._idx is not None:
            self._idx.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __eq__(self, other):
        # NOTE(review): isinstance(self, type(other)) accepts any 'other'
        # whose type self is an instance of — direction looks inverted
        # relative to the usual isinstance(other, type(self)); confirm.
        return isinstance(self, type(other)) and self.index == other.index

    def __len__(self):
        """Number of entries in this pack."""
        return len(self.index)

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self._basename)

    def __iter__(self):
        """Iterate over all the sha1s of the objects in this pack."""
        return iter(self.index)

    def check_length_and_checksum(self):
        """Sanity check the length and checksum of the pack index and data."""
        assert len(self.index) == len(self.data)
        idx_stored_checksum = self.index.get_pack_checksum()
        data_stored_checksum = self.data.get_stored_checksum()
        if idx_stored_checksum != data_stored_checksum:
            raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
                                   sha_to_hex(data_stored_checksum))

    def check(self):
        """Check the integrity of this pack.

        :raise ChecksumMismatch: if a checksum for the index or data is wrong
        """
        self.index.check()
        self.data.check()
        # Also verify each contained object individually.
        for obj in self.iterobjects():
            obj.check()
        # TODO: object connectivity checks

    def get_stored_checksum(self):
        """Return the checksum stored at the end of the pack data file."""
        return self.data.get_stored_checksum()

    def __contains__(self, sha1):
        """Check whether this pack contains a particular SHA1."""
        try:
            self.index.object_index(sha1)
            return True
        except KeyError:
            return False

    def get_raw(self, sha1):
        """Return (type_num, raw bytes) for the object with the given SHA1,
        resolving any deltas against their bases."""
        offset = self.index.object_index(sha1)
        obj_type, obj = self.data.get_object_at(offset)
        type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
        return type_num, b''.join(chunks)

    def __getitem__(self, sha1):
        """Retrieve the specified SHA1."""
        type, uncomp = self.get_raw(sha1)
        return ShaFile.from_raw_string(type, uncomp, sha=sha1)

    def iterobjects(self):
        """Iterate over the objects in this pack."""
        return iter(PackInflater.for_pack_data(
            self.data, resolve_ext_ref=self.resolve_ext_ref))

    def pack_tuples(self):
        """Provide an iterable for use with write_pack_objects.

        :return: Object that can iterate over (object, path) tuples
            and provides __len__
        """
        # Thin adapter: pairs every object with a None path while keeping
        # the pack's length available up front.
        class PackTupleIterable(object):

            def __init__(self, pack):
                self.pack = pack

            def __len__(self):
                return len(self.pack)

            def __iter__(self):
                return ((o, None) for o in self.pack.iterobjects())

        return PackTupleIterable(self)

    def keep(self, msg=None):
        """Add a .keep file for the pack, preventing git from garbage collecting it.

        :param msg: A message written inside the .keep file; can be used later to
            determine whether or not a .keep file is obsolete.
        :return: The path of the .keep file, as a string.
        """
        keepfile_name = '%s.keep' % self._basename
        with GitFile(keepfile_name, 'wb') as keepfile:
            if msg:
                keepfile.write(msg)
                keepfile.write(b'\n')
        return keepfile_name
# Prefer the optional C extension's implementations of apply_delta() and
# bisect_find_sha() when it has been built; otherwise silently keep the
# pure-Python versions defined earlier in this module.
try:
    from dulwich._pack import apply_delta, bisect_find_sha
except ImportError:
    pass