Statistics
| Revision:

gvsig-scripting / org.gvsig.scripting / trunk / org.gvsig.scripting / org.gvsig.scripting.app / org.gvsig.scripting.app.mainplugin / src / main / resources-plugin / scripting / lib / dulwich / objects.py @ 959

History | View | Annotate | Download (42.9 KB)

1
# objects.py -- Access to base git objects
2
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3
# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@samba.org>
4
#
5
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
6
# General Public License as published by the Free Software Foundation; version 2.0
7
# or (at your option) any later version. You can redistribute it and/or
8
# modify it under the terms of either of these two licenses.
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
#
16
# You should have received a copy of the licenses; if not, see
17
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
18
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
19
# License, Version 2.0.
20
#
21

    
22
"""Access to base git objects."""
23

    
24
import binascii
25
from io import BytesIO
26
from collections import namedtuple
27
import os
28
import posixpath
29
import stat
30
import warnings
31
import zlib
32
from hashlib import sha1
33

    
34
from dulwich.errors import (
35
    ChecksumMismatch,
36
    NotBlobError,
37
    NotCommitError,
38
    NotTagError,
39
    NotTreeError,
40
    ObjectFormatException,
41
    )
42
from dulwich.file import GitFile
43

    
44

    
45
# Hex SHA made of forty ASCII zero digits.
ZERO_SHA = b"0" * 40

# Names of the header fields found in commit objects
_TREE_HEADER = b"tree"
_PARENT_HEADER = b"parent"
_AUTHOR_HEADER = b"author"
_COMMITTER_HEADER = b"committer"
_ENCODING_HEADER = b"encoding"
_MERGETAG_HEADER = b"mergetag"
_GPGSIG_HEADER = b"gpgsig"

# Names of the header fields read and written by the Tag class
_OBJECT_HEADER = b"object"
_TYPE_HEADER = b"type"
_TAG_HEADER = b"tag"
_TAGGER_HEADER = b"tagger"
61

    
62

    
63
# File-type bits that mark a tree entry as a submodule (gitlink).
S_IFGITLINK = 0o160000


def S_ISGITLINK(m):
    """Tell whether a mode describes a submodule.

    :param m: Mode to check
    :return: a ``boolean``
    """
    # Only the file-type bits take part in the comparison; permission bits
    # are masked off by S_IFMT.
    file_type = stat.S_IFMT(m)
    return file_type == S_IFGITLINK
73

    
74

    
75
def _decompress(string):
    """Inflate a zlib-compressed bytestring and return the decompressed data."""
    inflater = zlib.decompressobj()
    # The flush() output must follow the decompress() output.
    pieces = [inflater.decompress(string), inflater.flush()]
    return b''.join(pieces)
80

    
81

    
82
def sha_to_hex(sha):
    """Convert a binary SHA-1 digest into its hexadecimal form.

    :param sha: Binary SHA-1 digest (20 bytes)
    :return: The 40-character hex digest, as a bytestring
    """
    hexsha = binascii.hexlify(sha)
    # %r instead of %d: hexsha is a bytestring, so %d raised a TypeError
    # while building the message and hid the actual assertion failure.
    assert len(hexsha) == 40, "Incorrect length of sha1 string: %r" % hexsha
    return hexsha
87

    
88

    
89
def hex_to_sha(hex):
    """Convert a 40-character hex SHA into its binary form.

    :param hex: Hex SHA to convert
    :return: The binary SHA
    :raise ValueError: if a bytestring argument is rejected by
        ``binascii.unhexlify`` with a TypeError
    """
    assert len(hex) == 40, "Incorrect length of hexsha: %s" % hex
    try:
        binsha = binascii.unhexlify(hex)
    except TypeError as exc:
        # A TypeError is translated to ValueError only for bytestring input;
        # for any other argument type it propagates unchanged.
        if isinstance(hex, bytes):
            raise ValueError(exc.args[0])
        raise
    return binsha
98

    
99

    
100
def valid_hexsha(hex):
    """Report whether ``hex`` is 40 characters long and decodes as hex.

    :param hex: Candidate hex SHA
    :return: True if the value is valid, False otherwise
    """
    if len(hex) == 40:
        try:
            binascii.unhexlify(hex)
            return True
        except (TypeError, binascii.Error):
            return False
    return False
109

    
110

    
111
def hex_to_filename(path, hex):
    """Build the filename of a hex sha below the directory ``path``.

    The first two hex digits name the subdirectory and the remaining digits
    name the file inside it.
    """
    # os.path.join needs all arguments to be of one type. hex is expected to
    # be bytes, so it is decoded whenever path exposes an ``encode``
    # attribute.
    path_has_encode = getattr(path, 'encode', None) is not None
    if path_has_encode:
        hex = hex.decode('ascii')
    return os.path.join(path, hex[:2], hex[2:])
122

    
123

    
124
def filename_to_hex(filename):
    """Recover the hex sha encoded in an object filename.

    :param filename: Path whose last two components are the 2-character
        directory name and the 38-character file name of the object
    :return: The 40-character hex sha, as a bytestring
    """
    # Only the last (up to) two path components are of interest.
    components = filename.rsplit(os.path.sep, 2)[-2:]
    errmsg = "Invalid object filename: %s" % filename
    assert len(components) == 2, errmsg
    prefix, suffix = components
    assert len(prefix) == 2 and len(suffix) == 38, errmsg
    hexsha = (prefix + suffix).encode('ascii')
    # Called for validation only; the binary result is discarded.
    hex_to_sha(hexsha)
    return hexsha
135

    
136

    
137
def object_header(num_type, length):
    """Build the header for an object of the given numeric type and length.

    The header is the type name, a space, the decimal length and a NUL byte.
    """
    type_name = object_class(num_type).type_name
    size_text = str(length).encode('ascii')
    return type_name + b' ' + size_text + b'\0'
140

    
141

    
142
def serializable_property(name, docstring=None):
    """Create a property that records when serialization becomes necessary.

    The value is stored in the attribute ``_<name>``; every assignment also
    sets ``_needs_serialization`` to True on the object.
    """
    attr = "_" + name

    def _getter(obj):
        return getattr(obj, attr)

    def _setter(obj, value):
        setattr(obj, attr, value)
        obj._needs_serialization = True

    return property(_getter, _setter, doc=docstring)
151

    
152

    
153
def object_class(type):
    """Look up the object class registered for a type.

    :param type: Either a type name string or a numeric type.
    :return: The ShaFile subclass corresponding to the given type, or None if
        type is not a valid type name/number.
    """
    # dict.get already yields None for a key that is not present.
    return _TYPE_MAP.get(type)
161

    
162

    
163
def check_hexsha(hex, error_msg):
    """Ensure that a string is a valid hex sha.

    :param hex: Hex string to check
    :param error_msg: Error message to use in exception
    :raise ObjectFormatException: Raised when the string is not valid
    """
    if valid_hexsha(hex):
        return
    raise ObjectFormatException("%s %s" % (error_msg, hex))
172

    
173

    
174
def check_identity(identity, error_msg):
    """Validate the ``<...>`` part of an identity string.

    The identity must contain exactly one ``<``, followed later by exactly
    one ``>``, and that ``>`` must be the last character.

    :param identity: Identity string
    :param error_msg: Error message to use in exception
    :raise ObjectFormatException: if the identity is not valid
    """
    lt = identity.find(b'<')
    gt = identity.find(b'>')
    well_formed = (
        lt >= 0 and gt >= 0 and gt > lt
        and identity.find(b'<', lt + 1) < 0
        and identity.find(b'>', gt + 1) < 0
        and identity.endswith(b'>'))
    if not well_formed:
        raise ObjectFormatException(error_msg)
189

    
190

    
191
def git_line(*items):
    """Format the items as a space-separated line ending in a newline."""
    joined = b' '.join(items)
    return joined + b'\n'
194

    
195

    
196
class FixedSha(object):
    """Stand-in for a hashlib SHA object whose value is fixed up front."""

    __slots__ = ('_hexsha', '_sha')

    def __init__(self, hexsha):
        # Anything exposing ``encode`` is converted to ASCII before the
        # type check below.
        has_encode = getattr(hexsha, 'encode', None) is not None
        if has_encode:
            hexsha = hexsha.encode('ascii')
        if not isinstance(hexsha, bytes):
            raise TypeError('Expected bytes for hexsha, got %r' % hexsha)
        self._hexsha = hexsha
        self._sha = hex_to_sha(hexsha)

    def digest(self):
        """Return the raw SHA digest."""
        return self._sha

    def hexdigest(self):
        """Return the hex SHA digest."""
        hexsha = self._hexsha
        return hexsha.decode('ascii')
216

    
217

    
218
class ShaFile(object):
    """A git SHA file.

    Base class of the git object types. Subclasses supply ``type_name``,
    ``type_num``, ``_serialize`` and ``_deserialize``; this class handles the
    on-disk encodings, caching of the serialized chunks and the SHA.
    """

    __slots__ = ('_chunked_text', '_sha', '_needs_serialization')

    @staticmethod
    def _parse_legacy_object_header(magic, f):
        """Parse a legacy object, creating it but not reading the file.

        :param magic: Compressed bytes that were already read
        :param f: File-like object from which more compressed data is read
        :return: A new, empty instance of the class named in the header
        :raise ObjectFormatException: if the data ends before the NUL byte
            that terminates the header, or if the type name is unknown
        """
        bufsize = 1024
        decomp = zlib.decompressobj()
        header = decomp.decompress(magic)
        start = 0
        end = -1
        while end < 0:
            extra = f.read(bufsize)
            header += decomp.decompress(extra)
            end = header.find(b'\0', start)
            start = len(header)
            if end < 0 and not extra:
                # The file is exhausted and no terminator was found. Without
                # this check a truncated object made the loop spin forever.
                raise ObjectFormatException("Invalid object header, no \\0")
        header = header[:end]
        type_name, size = header.split(b' ', 1)
        int(size)  # sanity check: raises ValueError if not a number
        obj_class = object_class(type_name)
        if not obj_class:
            raise ObjectFormatException("Not a known type: %s" % type_name)
        return obj_class()

    def _parse_legacy_object(self, map):
        """Parse a legacy object, setting the raw string.

        :param map: The complete compressed object data
        :raise ObjectFormatException: if the decompressed data has no NUL
            byte terminating the header
        """
        text = _decompress(map)
        header_end = text.find(b'\0')
        if header_end < 0:
            raise ObjectFormatException("Invalid object header, no \\0")
        self.set_raw_string(text[header_end+1:])

    def as_legacy_object_chunks(self):
        """Return chunks representing the object in the legacy format.

        The header and then every raw chunk are fed through one zlib
        compressor.

        :return: Iterator over compressed bytestrings
        """
        compobj = zlib.compressobj()
        yield compobj.compress(self._header())
        for chunk in self.as_raw_chunks():
            yield compobj.compress(chunk)
        yield compobj.flush()

    def as_legacy_object(self):
        """Return a bytestring representing the object in the legacy format.
        """
        return b''.join(self.as_legacy_object_chunks())

    def as_raw_chunks(self):
        """Return chunks with serialization of the object.

        The object is re-serialized first if it was modified since the last
        serialization; this also discards the cached SHA.

        :return: List of strings, not necessarily one per line
        """
        if self._needs_serialization:
            self._sha = None
            self._chunked_text = self._serialize()
            self._needs_serialization = False
        return self._chunked_text

    def as_raw_string(self):
        """Return raw string with serialization of the object.

        :return: String object
        """
        return b''.join(self.as_raw_chunks())

    def __str__(self):
        """Return raw string serialization of this object."""
        return self.as_raw_string()

    def __hash__(self):
        """Return unique hash for this object."""
        return hash(self.id)

    def as_pretty_string(self):
        """Return a string representing this object, fit for display."""
        return self.as_raw_string()

    def set_raw_string(self, text, sha=None):
        """Set the contents of this object from a serialized string.

        :param text: The serialized contents, as bytes
        :param sha: Optional known hex sha of the object
        :raise TypeError: if text is not bytes
        """
        if not isinstance(text, bytes):
            # Wrapped in a tuple so that a tuple value is formatted as one
            # value instead of being unpacked by the % operator.
            raise TypeError('Expected bytes for text, got %r' % (text,))
        self.set_raw_chunks([text], sha)

    def set_raw_chunks(self, chunks, sha=None):
        """Set the contents of this object from a list of chunks.

        :param chunks: The serialized contents, as a list of bytestrings
        :param sha: Optional known hex sha of the object
        """
        self._chunked_text = chunks
        self._deserialize(chunks)
        if sha is None:
            self._sha = None
        else:
            self._sha = FixedSha(sha)
        self._needs_serialization = False

    @staticmethod
    def _parse_object_header(magic, f):
        """Parse a new style object, creating it but not reading the file.

        The type number is taken from bits 4-6 of the first byte.
        """
        num_type = (ord(magic[0:1]) >> 4) & 7
        obj_class = object_class(num_type)
        if not obj_class:
            raise ObjectFormatException("Not a known type %d" % num_type)
        return obj_class()

    def _parse_object(self, map):
        """Parse a new style object, setting the raw string."""
        # skip type and size; type must have already been determined, and
        # we trust zlib to fail if it's otherwise corrupted
        byte = ord(map[0:1])
        used = 1
        # The header continues for as long as the high bit is set.
        while (byte & 0x80) != 0:
            byte = ord(map[used:used+1])
            used += 1
        raw = map[used:]
        self.set_raw_string(_decompress(raw))

    @classmethod
    def _is_legacy_object(cls, magic):
        """Tell from the first two bytes whether data is a legacy object.

        The check accepts data whose first byte has ``0x08`` in the bits
        selected by ``0x8F`` and whose first two bytes, read as a big-endian
        number, are a multiple of 31.
        """
        b0 = ord(magic[0:1])
        b1 = ord(magic[1:2])
        word = (b0 << 8) + b1
        return (b0 & 0x8F) == 0x08 and (word % 31) == 0

    @classmethod
    def _parse_file(cls, f):
        """Read the whole file and parse it as a legacy or new style object.

        :raise ObjectFormatException: if fewer than two bytes are available
        """
        data = f.read()
        if len(data) < 2:
            # Format detection needs two bytes; ord() on an empty slice
            # would otherwise escape as a bare TypeError.
            raise ObjectFormatException("invalid object header")
        if cls._is_legacy_object(data):
            obj = cls._parse_legacy_object_header(data, f)
            obj._parse_legacy_object(data)
        else:
            obj = cls._parse_object_header(data, f)
            obj._parse_object(data)
        return obj

    def __init__(self):
        """Don't call this directly"""
        self._sha = None
        self._chunked_text = []
        self._needs_serialization = True

    def _deserialize(self, chunks):
        raise NotImplementedError(self._deserialize)

    def _serialize(self):
        raise NotImplementedError(self._serialize)

    @classmethod
    def from_path(cls, path):
        """Open a SHA file from disk."""
        with GitFile(path, 'rb') as f:
            return cls.from_file(f)

    @classmethod
    def from_file(cls, f):
        """Get the contents of a SHA file on disk.

        :raise ObjectFormatException: if the object header is invalid
        """
        try:
            obj = cls._parse_file(f)
            obj._sha = None
            return obj
        except (IndexError, ValueError):
            raise ObjectFormatException("invalid object header")

    @staticmethod
    def from_raw_string(type_num, string, sha=None):
        """Creates an object of the indicated type from the raw string given.

        :param type_num: The numeric type of the object.
        :param string: The raw uncompressed contents.
        :param sha: Optional known sha for the object
        """
        obj = object_class(type_num)()
        obj.set_raw_string(string, sha)
        return obj

    @staticmethod
    def from_raw_chunks(type_num, chunks, sha=None):
        """Creates an object of the indicated type from the raw chunks given.

        :param type_num: The numeric type of the object.
        :param chunks: An iterable of the raw uncompressed contents.
        :param sha: Optional known sha for the object
        """
        obj = object_class(type_num)()
        obj.set_raw_chunks(chunks, sha)
        return obj

    @classmethod
    def from_string(cls, string):
        """Create a ShaFile from a string."""
        obj = cls()
        obj.set_raw_string(string)
        return obj

    def _check_has_member(self, member, error_msg):
        """Check that the object has a given member variable.

        :param member: the member variable to check for
        :param error_msg: the message for an error if the member is missing
        :raise ObjectFormatException: with the given error_msg if member is
            missing or is None
        """
        if getattr(self, member, None) is None:
            raise ObjectFormatException(error_msg)

    def check(self):
        """Check this object for internal consistency.

        :raise ObjectFormatException: if the object is malformed in some way
        :raise ChecksumMismatch: if the object was created with a SHA that does
            not match its contents
        """
        # TODO: if we find that error-checking during object parsing is a
        # performance bottleneck, those checks should be moved to the class's
        # check() method during optimization so we can still check the object
        # when necessary.
        old_sha = self.id
        try:
            self._deserialize(self.as_raw_chunks())
            # Drop the cached SHA so that it is recomputed from the contents.
            self._sha = None
            new_sha = self.id
        except Exception as e:
            raise ObjectFormatException(e)
        if old_sha != new_sha:
            raise ChecksumMismatch(new_sha, old_sha)

    def _header(self):
        return object_header(self.type, self.raw_length())

    def raw_length(self):
        """Returns the length of the raw string of this object."""
        return sum(len(chunk) for chunk in self.as_raw_chunks())

    def sha(self):
        """The SHA1 object that is the name of this object."""
        if self._sha is None or self._needs_serialization:
            # this is a local because as_raw_chunks() overwrites self._sha
            new_sha = sha1()
            new_sha.update(self._header())
            for chunk in self.as_raw_chunks():
                new_sha.update(chunk)
            self._sha = new_sha
        return self._sha

    def copy(self):
        """Create a new copy of this SHA1 object from its raw string"""
        obj_class = object_class(self.get_type())
        return obj_class.from_raw_string(
            self.get_type(),
            self.as_raw_string(),
            self.id)

    @property
    def id(self):
        """The hex SHA of this object."""
        return self.sha().hexdigest().encode('ascii')

    def get_type(self):
        """Return the type number for this object class."""
        return self.type_num

    def set_type(self, type):
        """Set the type number for this object class."""
        self.type_num = type

    # DEPRECATED: use type_num or type_name as needed.
    type = property(get_type, set_type)

    def __repr__(self):
        return "<%s %s>" % (self.__class__.__name__, self.id)

    def __ne__(self, other):
        return not isinstance(other, ShaFile) or self.id != other.id

    def __eq__(self, other):
        """Return True if the SHAs of the two objects match."""
        return isinstance(other, ShaFile) and self.id == other.id

    def __lt__(self, other):
        """Order ShaFiles by their hex SHA."""
        if not isinstance(other, ShaFile):
            raise TypeError
        return self.id < other.id

    def __le__(self, other):
        """Order ShaFiles by their hex SHA."""
        if not isinstance(other, ShaFile):
            raise TypeError
        return self.id <= other.id

    def __cmp__(self, other):
        # Relies on the ``cmp`` builtin, which only Python 2 provides.
        if not isinstance(other, ShaFile):
            raise TypeError
        return cmp(self.id, other.id)
518

    
519

    
520
class Blob(ShaFile):
    """A Git Blob object.

    The raw chunks of a blob are its content, so serializing and
    deserializing simply pass the chunk list through.
    """

    __slots__ = ()

    type_name = b'blob'
    type_num = 3

    def __init__(self):
        super(Blob, self).__init__()
        self._chunked_text = []
        # An empty chunk list is already a valid serialization.
        self._needs_serialization = False

    def _get_data(self):
        return self.as_raw_string()

    def _set_data(self, data):
        self.set_raw_string(data)

    data = property(_get_data, _set_data,
                    "The text contained within the blob object.")

    def _get_chunked(self):
        return self._chunked_text

    def _set_chunked(self, chunks):
        self._chunked_text = chunks
        # The content changed, so a SHA cached by an earlier call to sha()
        # no longer matches; drop it so that it is recomputed on demand.
        self._sha = None

    def _serialize(self):
        return self._chunked_text

    def _deserialize(self, chunks):
        self._chunked_text = chunks

    chunked = property(_get_chunked, _set_chunked,
        "The text within the blob object, as chunks (not necessarily lines).")

    @classmethod
    def from_path(cls, path):
        """Open a blob from disk.

        :raise NotBlobError: if the object at path is not a blob
        """
        blob = ShaFile.from_path(path)
        if not isinstance(blob, cls):
            raise NotBlobError(path)
        return blob

    def check(self):
        """Check this object for internal consistency.

        :raise ObjectFormatException: if the object is malformed in some way
        """
        super(Blob, self).check()

    def splitlines(self):
        """Return list of lines in this blob.

        This preserves the original line endings. The result is the same as
        splitting the joined content, wherever the chunk boundaries fall.
        """
        chunks = self.chunked
        if not chunks:
            return []
        # Splitting chunk by chunk glued a complete line that ended one
        # chunk onto the first line of the next chunk; joining first gives
        # the right result for every chunk layout.
        return b''.join(chunks).splitlines(True)
597

    
598

    
599
def _parse_message(chunks):
    """Parse a message with a list of fields and a body.

    :param chunks: the raw chunks of the tag or commit object.
    :return: iterator of tuples of (field, value), one per header line, in the
        order read from the text, possibly including duplicates. Includes a
        field named None for the freeform tag/commit text; its value is None
        when the text ends before the blank line that closes the headers.
    """
    f = BytesIO(b''.join(chunks))
    k = None
    # Must be bytes: continuation lines are appended to it, and appending
    # bytes to a text string raises TypeError on Python 3 when the very first
    # line is a continuation.
    v = b""
    eof = False

    # Parse the headers
    #
    # Headers can contain newlines. The next line is indented with a space.
    # We store the latest key as 'k', and the accumulated value as 'v'.
    for line in f:
        if line.startswith(b' '):
            # Indented continuation of the previous line
            v += line[1:]
        else:
            if k is not None:
                # We parsed a new header, return its value
                yield (k, v.rstrip(b'\n'))
            if line == b'\n':
                # Empty line indicates end of headers
                break
            (k, v) = line.split(b' ', 1)

    else:
        # We reached end of file before the headers ended. We still need to
        # return the previous header, then we need to return a None field for
        # the text.
        eof = True
        if k is not None:
            yield (k, v.rstrip(b'\n'))
        yield (None, None)

    if not eof:
        # We didn't reach the end of file while parsing headers. We can return
        # the rest of the file as a message.
        yield (None, f.read())

    f.close()
644

    
645

    
646
class Tag(ShaFile):
    """A Git Tag object."""

    type_name = b'tag'
    type_num = 4

    __slots__ = ('_tag_timezone_neg_utc', '_name', '_object_sha',
                 '_object_class', '_tag_time', '_tag_timezone',
                 '_tagger', '_message')

    def __init__(self):
        super(Tag, self).__init__()
        # Slots have no default value. Without these assignments,
        # serializing a new tag whose tagger was never assigned, or reading
        # one of these properties, raises AttributeError.
        self._tagger = None
        self._tag_time = None
        self._tag_timezone = None
        self._tag_timezone_neg_utc = False

    @classmethod
    def from_path(cls, filename):
        """Open a tag from disk.

        :raise NotTagError: if the object at filename is not a tag
        """
        tag = ShaFile.from_path(filename)
        if not isinstance(tag, cls):
            raise NotTagError(filename)
        return tag

    def check(self):
        """Check this object for internal consistency.

        :raise ObjectFormatException: if the object is malformed in some way
        """
        super(Tag, self).check()
        self._check_has_member("_object_sha", "missing object sha")
        self._check_has_member("_object_class", "missing object type")
        self._check_has_member("_name", "missing tag name")

        if not self._name:
            raise ObjectFormatException("empty tag name")

        check_hexsha(self._object_sha, "invalid object sha")

        if getattr(self, "_tagger", None):
            check_identity(self._tagger, "invalid tagger")

        # The headers have to appear in the order object, type, tag, tagger;
        # each one is only accepted directly after its predecessor.
        last = None
        for field, _ in _parse_message(self._chunked_text):
            if field == _OBJECT_HEADER and last is not None:
                raise ObjectFormatException("unexpected object")
            elif field == _TYPE_HEADER and last != _OBJECT_HEADER:
                raise ObjectFormatException("unexpected type")
            elif field == _TAG_HEADER and last != _TYPE_HEADER:
                raise ObjectFormatException("unexpected tag name")
            elif field == _TAGGER_HEADER and last != _TAG_HEADER:
                raise ObjectFormatException("unexpected tagger")
            last = field

    def _serialize(self):
        """Build the raw chunks: the header lines, then the message."""
        chunks = []
        chunks.append(git_line(_OBJECT_HEADER, self._object_sha))
        chunks.append(git_line(_TYPE_HEADER, self._object_class.type_name))
        chunks.append(git_line(_TAG_HEADER, self._name))
        if self._tagger:
            if self._tag_time is None:
                chunks.append(git_line(_TAGGER_HEADER, self._tagger))
            else:
                chunks.append(git_line(
                    _TAGGER_HEADER, self._tagger,
                    str(self._tag_time).encode('ascii'),
                    format_timezone(
                        self._tag_timezone, self._tag_timezone_neg_utc)))
        if self._message is not None:
            chunks.append(b'\n')  # To close headers
            chunks.append(self._message)
        return chunks

    def _deserialize(self, chunks):
        """Grab the metadata attached to the tag"""
        self._tagger = None
        for field, value in _parse_message(chunks):
            if field == _OBJECT_HEADER:
                self._object_sha = value
            elif field == _TYPE_HEADER:
                obj_class = object_class(value)
                if not obj_class:
                    raise ObjectFormatException("Not a known type: %s" % value)
                self._object_class = obj_class
            elif field == _TAG_HEADER:
                self._name = value
            elif field == _TAGGER_HEADER:
                try:
                    sep = value.index(b'> ')
                except ValueError:
                    # Nothing follows the identity: no time, no timezone.
                    self._tagger = value
                    self._tag_time = None
                    self._tag_timezone = None
                    self._tag_timezone_neg_utc = False
                else:
                    self._tagger = value[0:sep+1]
                    try:
                        (timetext, timezonetext) = \
                            value[sep+2:].rsplit(b' ', 1)
                        self._tag_time = int(timetext)
                        self._tag_timezone, self._tag_timezone_neg_utc = \
                            parse_timezone(timezonetext)
                    except ValueError as e:
                        raise ObjectFormatException(e)
            elif field is None:
                self._message = value
            else:
                raise ObjectFormatException("Unknown field %s" % field)

    def _get_object(self):
        """Get the object pointed to by this tag.

        :return: tuple of (object class, sha).
        """
        return (self._object_class, self._object_sha)

    def _set_object(self, value):
        (self._object_class, self._object_sha) = value
        self._needs_serialization = True

    object = property(_get_object, _set_object)

    name = serializable_property("name", "The name of this tag")
    tagger = serializable_property("tagger",
        "Returns the name of the person who created this tag")
    tag_time = serializable_property("tag_time",
        "The creation timestamp of the tag.  As the number of seconds "
        "since the epoch")
    tag_timezone = serializable_property("tag_timezone",
        "The timezone that tag_time is in.")
    message = serializable_property(
        "message", "The message attached to this tag")
772

    
773

    
774
class TreeEntry(namedtuple('TreeEntry', ['path', 'mode', 'sha'])):
    """Named tuple encapsulating a single tree entry."""

    def in_path(self, path):
        """Return a copy of this entry with the given path prepended.

        :param path: Directory path to put in front of this entry's path
        :return: A new TreeEntry with the joined path and the same mode and
            sha
        :raise TypeError: if this entry's own path is not bytes
        """
        if not isinstance(self.path, bytes):
            # Report the value that failed the check (this entry's path, not
            # the argument); the tuple keeps tuple values printable.
            raise TypeError(
                'Expected bytes for path, got %r' % (self.path,))
        return TreeEntry(posixpath.join(path, self.path), self.mode, self.sha)
782

    
783

    
784
def parse_tree(text, strict=False):
    """Parse the serialized text of a tree.

    Each entry consists of an octal mode, a space, the name, a NUL byte and
    a 20-byte binary sha.

    :param text: Serialized text to parse
    :param strict: If True, reject modes that start with a zero
    :return: iterator of tuples of (name, mode, sha)
    :raise ObjectFormatException: if the object was malformed in some way
    """
    pos = 0
    total = len(text)
    while pos < total:
        space = text.index(b' ', pos)
        mode_text = text[pos:space]
        if strict and mode_text.startswith(b'0'):
            raise ObjectFormatException("Invalid mode '%s'" % mode_text)
        try:
            mode = int(mode_text, 8)
        except ValueError:
            raise ObjectFormatException("Invalid mode '%s'" % mode_text)
        nul = text.index(b'\0', space)
        name = text[space + 1:nul]
        # The binary sha takes the 20 bytes that follow the NUL.
        pos = nul + 21
        sha = text[nul + 1:pos]
        if len(sha) != 20:
            raise ObjectFormatException("Sha has invalid length")
        yield (name, mode, sha_to_hex(sha))
810

    
811

    
812
def serialize_tree(items):
    """Serialize the items of a tree.

    :param items: Sorted iterable over (name, mode, sha) tuples
    :return: Serialized tree text as chunks, one chunk per item
    """
    for name, mode, hexsha in items:
        # The mode is written in octal, padded to at least four digits.
        mode_text = ("%04o" % mode).encode('ascii')
        yield mode_text + b' ' + name + b'\0' + hex_to_sha(hexsha)
820

    
821

    
822
def sorted_tree_items(entries, name_order):
    """Iterate over the entries of a tree dictionary in sorted order.

    :param name_order: If True, iterate entries in order of their name. If
        False, iterate entries in tree order, that is, treat subtree entries as
        having '/' appended.
    :param entries: Dictionary mapping names to (mode, sha) tuples
    :return: Iterator over (name, mode, hexsha)
    """
    if name_order:
        key_func = key_entry_name_order
    else:
        key_func = key_entry
    for name, (mode, hexsha) in sorted(entries.items(), key=key_func):
        # Stricter type checks than normal to mirror checks in the C version.
        mode = int(mode)
        if not isinstance(hexsha, bytes):
            raise TypeError('Expected bytes for SHA, got %r' % hexsha)
        yield TreeEntry(name, mode, hexsha)
839

    
840

    
841
def key_entry(entry):
    """Compute the sort key of a tree entry in tree order.

    Names of directory entries sort as if they ended in ``/``.

    :param entry: (name, value) tuple; the first item of value is the mode
    """
    name, value = entry
    mode = value[0]
    return name + b'/' if stat.S_ISDIR(mode) else name
850

    
851

    
852
def key_entry_name_order(entry):
    """Sort key that orders tree entries purely by name.

    :param entry: (name, value) tuple
    :return: The first element of the entry, i.e. its name
    """
    name = entry[0]
    return name
855

    
856

    
857
def pretty_format_tree_entry(name, mode, hexsha, encoding="utf-8"):
    """Pretty format tree entry.

    :param name: Name of the directory entry (bytes)
    :param mode: Mode of entry
    :param hexsha: Hexsha of the referenced object (ASCII bytes)
    :param encoding: Encoding used to decode the name; bytes that cannot be
        decoded are replaced instead of raising
    :return: string describing the tree entry, in the form
        "<octal mode> <kind> <hexsha><TAB><name><NEWLINE>"
    """
    # Compare the file-type field as a whole. A plain bit test against
    # S_IFDIR also matches the gitlink mode, which shares that bit, and
    # would mislabel submodule entries as trees.
    file_type = stat.S_IFMT(mode)
    if file_type == stat.S_IFDIR:
        kind = "tree"
    elif file_type == 0o160000:
        # 0o160000 is the gitlink (submodule) mode; it points at a commit.
        kind = "commit"
    else:
        kind = "blob"
    return "%04o %s %s\t%s\n" % (
            mode, kind, hexsha.decode('ascii'),
            name.decode(encoding, 'replace'))
872

    
873

    
874
class Tree(ShaFile):
    """A Git tree object.

    Entries are kept in a dictionary mapping each entry name to a
    (mode, hexsha) tuple.
    """

    type_name = b'tree'
    type_num = 2

    # The trailing comma is what makes this a one-element tuple.
    __slots__ = ('_entries',)

    def __init__(self):
        super(Tree, self).__init__()
        self._entries = {}

    @classmethod
    def from_path(cls, filename):
        """Load a tree through ShaFile.from_path.

        :param filename: Path handed to ShaFile.from_path
        :return: The loaded object
        :raise NotTreeError: if the loaded object is not an instance of cls
        """
        tree = ShaFile.from_path(filename)
        if not isinstance(tree, cls):
            raise NotTreeError(filename)
        return tree

    def __contains__(self, name):
        return name in self._entries

    def __getitem__(self, name):
        return self._entries[name]

    def __setitem__(self, name, value):
        """Set a tree entry by name.

        :param name: The name of the entry, as a string.
        :param value: A tuple of (mode, hexsha), where mode is the mode of the
            entry as an integral type and hexsha is the hex SHA of the entry as
            a string.
        """
        # Unpacking rejects values that are not exactly a pair.
        mode, hexsha = value
        self._entries[name] = (mode, hexsha)
        self._needs_serialization = True

    def __delitem__(self, name):
        del self._entries[name]
        self._needs_serialization = True

    def __len__(self):
        return len(self._entries)

    def __iter__(self):
        return iter(self._entries)

    def add(self, name, mode, hexsha):
        """Add an entry to the tree.

        :param mode: The mode of the entry as an integral type. Not all
            possible modes are supported by git; see check() for details.
        :param name: The name of the entry, as a string.
        :param hexsha: The hex SHA of the entry as a string.
        """
        # Legacy call order add(mode, name, hexsha): swap and warn.
        if isinstance(name, int) and isinstance(mode, bytes):
            (name, mode) = (mode, name)
            warnings.warn(
                "Please use Tree.add(name, mode, hexsha)",
                category=DeprecationWarning, stacklevel=2)
        self._entries[name] = mode, hexsha
        self._needs_serialization = True

    def iteritems(self, name_order=False):
        """Iterate over entries.

        :param name_order: If True, iterate in name order instead of tree
            order.
        :return: Iterator over (name, mode, sha) tuples
        """
        return sorted_tree_items(self._entries, name_order)

    def items(self):
        """Return the sorted entries in this tree.

        :return: List with (name, mode, sha) tuples
        """
        return list(self.iteritems())

    def _deserialize(self, chunks):
        """Grab the entries in the tree.

        :param chunks: Chunks making up the serialized tree
        :raise ObjectFormatException: if the tree text cannot be parsed
        """
        try:
            parsed_entries = parse_tree(b''.join(chunks))
            # Consume the result inside the try block: the pure-Python
            # parse_tree is a generator, so its parse errors only surface
            # while it is being iterated.
            entries = {name: (mode, sha)
                       for name, mode, sha in parsed_entries}
        except ValueError as e:
            raise ObjectFormatException(e)
        self._entries = entries

    def check(self):
        """Check this object for internal consistency.

        Verifies each entry's SHA, name and mode, and that entries are in
        tree order without duplicates.

        :raise ObjectFormatException: if the object is malformed in some way
        """
        super(Tree, self).check()
        last = None
        allowed_modes = (stat.S_IFREG | 0o755, stat.S_IFREG | 0o644,
                         stat.S_IFLNK, stat.S_IFDIR, S_IFGITLINK,
                         # TODO: optionally exclude as in git fsck --strict
                         stat.S_IFREG | 0o664)
        for name, mode, sha in parse_tree(b''.join(self._chunked_text),
                                          True):
            check_hexsha(sha, 'invalid sha %s' % sha)
            if b'/' in name or name in (b'', b'.', b'..'):
                raise ObjectFormatException('invalid name %s' % name)

            if mode not in allowed_modes:
                raise ObjectFormatException('invalid mode %06o' % mode)

            # Same (name, (mode, sha)) shape that key_entry expects.
            entry = (name, (mode, sha))
            if last:
                if key_entry(last) > key_entry(entry):
                    raise ObjectFormatException('entries not sorted')
                if name == last[0]:
                    raise ObjectFormatException('duplicate entry %s' % name)
            last = entry

    def _serialize(self):
        return list(serialize_tree(self.iteritems()))

    def as_pretty_string(self):
        """Return a human-readable listing with one line per entry.

        :return: The concatenated output of pretty_format_tree_entry for
            every entry, in tree order
        """
        return "".join(
            pretty_format_tree_entry(name, mode, hexsha)
            for name, mode, hexsha in self.iteritems())

    def lookup_path(self, lookup_obj, path):
        """Look up an object in a Git tree.

        :param lookup_obj: Callback for retrieving object by SHA1
        :param path: Path to lookup, with components separated by b'/'
        :return: A tuple of (mode, SHA) of the resulting path.
        :raise NotTreeError: if a component that still has to be descended
            into does not resolve to a Tree
        """
        parts = path.split(b'/')
        sha = self.id
        mode = None
        for p in parts:
            # Skip empty components (leading, trailing or doubled slashes).
            if not p:
                continue
            obj = lookup_obj(sha)
            if not isinstance(obj, Tree):
                raise NotTreeError(sha)
            mode, sha = obj[p]
        return mode, sha
1018

    
1019

    
1020
def parse_timezone(text):
    """Parse a timezone text fragment (e.g. '+0100').

    :param text: Text to parse.
    :return: Tuple with timezone as seconds difference to UTC
        and a boolean indicating whether this was a UTC timezone
        prefixed with a negative sign (-0000).
    :raise ValueError: if text is empty, does not start with '+' or '-',
        or is not followed by an integer
    """
    # cgit parses the first character as the sign, and the rest
    #  as an integer (using strtol), which could also be negative.
    #  We do the same for compatibility. See #697828.
    if not text or text[0] not in b'+-':
        raise ValueError(
            "Timezone must start with + or - (%(text)s)" % {'text': text})
    sign = text[:1]
    offset = int(text[1:])
    if sign == b'-':
        offset = -offset
    # True for inputs such as -0000 or --700: a minus sign in front of a
    # value that is not actually negative.
    unnecessary_negative_timezone = (offset >= 0 and sign == b'-')
    signum = -1 if offset < 0 else 1
    # The digits are HHMM: split with integer arithmetic only.
    hours, minutes = divmod(abs(offset), 100)
    return (signum * (hours * 3600 + minutes * 60),
            unnecessary_negative_timezone)
1044

    
1045

    
1046
def format_timezone(offset, unnecessary_negative_timezone=False):
    """Format a timezone for Git serialization.

    :param offset: Timezone offset as seconds difference to UTC
    :param unnecessary_negative_timezone: Whether to use a minus sign for
        UTC or positive timezones (-0000 and --700 rather than +0000 / +0700).
    :return: The formatted timezone as ASCII bytes
    :raise ValueError: if offset is not a whole number of minutes
    """
    if offset % 60 != 0:
        raise ValueError("Unable to handle non-minute offset.")
    if offset < 0 or unnecessary_negative_timezone:
        sign = '-'
        offset = -offset
    else:
        sign = '+'
    # Split the magnitude so hours and minutes are always consistent, then
    # restore the sign on the hours. offset is only still negative here for
    # a positive timezone written with an unnecessary minus sign, which is
    # rendered with a doubled sign such as --700.
    hours, minutes = divmod(abs(offset) // 60, 60)
    if offset < 0:
        hours = -hours
    return ('%c%02d%02d' % (sign, hours, minutes)).encode('ascii')
1061

    
1062

    
1063
def parse_commit(chunks):
    """Parse a commit object from chunks.

    Header fields that are not recognised are collected, in order, as
    (field, value) pairs in the extra list.

    :param chunks: Chunks to parse
    :return: Tuple of (tree, parents, author_info, commit_info,
        encoding, mergetag, gpgsig, message, extra)
    """
    def split_identity(raw):
        # The last two space-separated fields are the timestamp and the
        # timezone; everything before them is the identity itself.
        who, when, zone = raw.rsplit(b' ', 2)
        return (who, int(when), parse_timezone(zone))

    tree = encoding = message = gpgsig = None
    author_info = (None, None, (None, None))
    commit_info = (None, None, (None, None))
    parents, mergetag, extra = [], [], []

    for field, value in _parse_message(chunks):
        # TODO(jelmer): Enforce ordering
        if field is None:
            message = value
        elif field == _TREE_HEADER:
            tree = value
        elif field == _PARENT_HEADER:
            parents.append(value)
        elif field == _AUTHOR_HEADER:
            author_info = split_identity(value)
        elif field == _COMMITTER_HEADER:
            commit_info = split_identity(value)
        elif field == _ENCODING_HEADER:
            encoding = value
        elif field == _MERGETAG_HEADER:
            mergetag.append(Tag.from_string(value + b'\n'))
        elif field == _GPGSIG_HEADER:
            gpgsig = value
        else:
            extra.append((field, value))

    return (tree, parents, author_info, commit_info, encoding, mergetag,
            gpgsig, message, extra)
1106

    
1107

    
1108
class Commit(ShaFile):
    """A git commit object"""

    type_name = b'commit'
    type_num = 1

    # Each attribute is listed exactly once.
    __slots__ = ('_parents', '_encoding', '_extra', '_author_timezone_neg_utc',
                 '_commit_timezone_neg_utc', '_commit_time',
                 '_author_time', '_author_timezone', '_commit_timezone',
                 '_author', '_committer', '_tree', '_message', '_mergetag',
                 '_gpgsig')

    def __init__(self):
        super(Commit, self).__init__()
        self._parents = []
        self._encoding = None
        self._mergetag = []
        self._gpgsig = None
        self._extra = []
        self._author_timezone_neg_utc = False
        self._commit_timezone_neg_utc = False

    @classmethod
    def from_path(cls, path):
        """Load a commit through ShaFile.from_path.

        :param path: Path handed to ShaFile.from_path
        :return: The loaded object
        :raise NotCommitError: if the loaded object is not an instance of cls
        """
        commit = ShaFile.from_path(path)
        if not isinstance(commit, cls):
            raise NotCommitError(path)
        return commit

    def _deserialize(self, chunks):
        """Populate this commit's fields from its serialized chunks.

        :param chunks: Chunks handed to parse_commit
        """
        (self._tree, self._parents, author_info, commit_info, self._encoding,
                self._mergetag, self._gpgsig, self._message, self._extra) = (
                        parse_commit(chunks))
        # Each info tuple is (identity, time, (timezone, neg_utc flag)).
        (self._author, self._author_time, (self._author_timezone,
             self._author_timezone_neg_utc)) = author_info
        (self._committer, self._commit_time, (self._commit_timezone,
             self._commit_timezone_neg_utc)) = commit_info

    def check(self):
        """Check this object for internal consistency.

        :raise ObjectFormatException: if the object is malformed in some way
        """
        super(Commit, self).check()
        self._check_has_member("_tree", "missing tree")
        self._check_has_member("_author", "missing author")
        self._check_has_member("_committer", "missing committer")
        # times are currently checked when set

        for parent in self._parents:
            check_hexsha(parent, "invalid parent sha")
        check_hexsha(self._tree, "invalid tree sha")

        check_identity(self._author, "invalid author")
        check_identity(self._committer, "invalid committer")

        # Walk the raw headers and reject any that appear out of order:
        # tree, then parents, then author, then committer, then encoding.
        last = None
        for field, _ in _parse_message(self._chunked_text):
            if field == _TREE_HEADER and last is not None:
                raise ObjectFormatException("unexpected tree")
            elif field == _PARENT_HEADER and last not in (_PARENT_HEADER,
                                                          _TREE_HEADER):
                raise ObjectFormatException("unexpected parent")
            elif field == _AUTHOR_HEADER and last not in (_TREE_HEADER,
                                                          _PARENT_HEADER):
                raise ObjectFormatException("unexpected author")
            elif field == _COMMITTER_HEADER and last != _AUTHOR_HEADER:
                raise ObjectFormatException("unexpected committer")
            elif field == _ENCODING_HEADER and last != _COMMITTER_HEADER:
                raise ObjectFormatException("unexpected encoding")
            last = field

        # TODO: optionally check for duplicate parents

    def _serialize(self):
        """Build the list of chunks making up the serialized commit.

        :return: List of chunks: the headers, a blank line, then the message
        :raise AssertionError: if an extra header key or value contains a
            newline
        """
        chunks = []
        # The tree may be held either as a Tree instance or as its id.
        tree_bytes = self._tree.id if isinstance(self._tree, Tree) else self._tree
        chunks.append(git_line(_TREE_HEADER, tree_bytes))
        for p in self._parents:
            chunks.append(git_line(_PARENT_HEADER, p))
        chunks.append(git_line(
            _AUTHOR_HEADER, self._author, str(self._author_time).encode('ascii'),
            format_timezone(self._author_timezone,
                            self._author_timezone_neg_utc)))
        chunks.append(git_line(
            _COMMITTER_HEADER, self._committer, str(self._commit_time).encode('ascii'),
            format_timezone(self._commit_timezone,
                            self._commit_timezone_neg_utc)))
        if self.encoding:
            chunks.append(git_line(_ENCODING_HEADER, self.encoding))
        for mergetag in self.mergetag:
            mergetag_chunks = mergetag.as_raw_string().split(b'\n')

            chunks.append(git_line(_MERGETAG_HEADER, mergetag_chunks[0]))
            # Embedded extra header needs leading space
            for chunk in mergetag_chunks[1:]:
                chunks.append(b' ' + chunk + b'\n')

            # No trailing empty line
            chunks[-1] = chunks[-1].rstrip(b' \n')
        for k, v in self.extra:
            if b'\n' in k or b'\n' in v:
                raise AssertionError(
                    "newline in extra data: %r -> %r" % (k, v))
            chunks.append(git_line(k, v))
        if self.gpgsig:
            sig_chunks = self.gpgsig.split(b'\n')
            chunks.append(git_line(_GPGSIG_HEADER, sig_chunks[0]))
            # Remaining signature lines are emitted under an empty key.
            for chunk in sig_chunks[1:]:
                chunks.append(git_line(b'',  chunk))
        chunks.append(b'\n')  # There must be a new line after the headers
        chunks.append(self._message)
        return chunks

    tree = serializable_property(
        "tree", "Tree that is the state of this commit")

    def _get_parents(self):
        """Return a list of parents of this commit."""
        return self._parents

    def _set_parents(self, value):
        """Set a list of parents of this commit."""
        self._needs_serialization = True
        self._parents = value

    parents = property(_get_parents, _set_parents,
                       doc="Parents of this commit, by their SHA1.")

    def _get_extra(self):
        """Return extra settings of this commit."""
        return self._extra

    # Read-only property: no setter is supplied.
    extra = property(_get_extra,
        doc="Extra header fields not understood (presumably added in a "
            "newer version of git). Kept verbatim so the object can "
            "be correctly reserialized. For private commit metadata, use "
            "pseudo-headers in Commit.message, rather than this field.")

    author = serializable_property("author",
        "The name of the author of the commit")

    committer = serializable_property("committer",
        "The name of the committer of the commit")

    message = serializable_property(
        "message", "The commit message")

    commit_time = serializable_property("commit_time",
        "The timestamp of the commit. As the number of seconds since the epoch.")

    commit_timezone = serializable_property("commit_timezone",
        "The zone the commit time is in")

    author_time = serializable_property("author_time",
        "The timestamp the commit was written. As the number of "
        "seconds since the epoch.")

    author_timezone = serializable_property(
        "author_timezone", "Returns the zone the author time is in.")

    encoding = serializable_property(
        "encoding", "Encoding of the commit message.")

    mergetag = serializable_property(
        "mergetag", "Associated signed tag.")

    gpgsig = serializable_property(
        "gpgsig", "GPG Signature.")
1277

    
1278

    
1279
# All concrete object classes defined in this module.
OBJECT_CLASSES = (Commit, Tree, Blob, Tag)

# Lookup table from both the type name (e.g. b'commit') and the numeric type
# code to the class implementing that object type.
_TYPE_MAP = {}

for cls in OBJECT_CLASSES:
    _TYPE_MAP.update({cls.type_name: cls, cls.type_num: cls})


# Hold on to the pure-python implementations for testing
_sorted_tree_items_py = sorted_tree_items
_parse_tree_py = parse_tree
try:
    # Rebind the public names to the C versions when they can be imported
    from dulwich._objects import parse_tree, sorted_tree_items
except ImportError:
    # No C extension available: the pure-python versions stay in place
    pass