Statistics
| Revision:

gvsig-scripting / org.gvsig.scripting / trunk / org.gvsig.scripting / org.gvsig.scripting.app / org.gvsig.scripting.app.mainplugin / src / main / resources-plugin / scripting / lib / dulwich / diff_tree.py @ 959

History | View | Annotate | Download (21.8 KB)

1
# diff_tree.py -- Utilities for diffing files and trees.
2
# Copyright (C) 2010 Google, Inc.
3
#
4
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
5
# General Public License as public by the Free Software Foundation; version 2.0
6
# or (at your option) any later version. You can redistribute it and/or
7
# modify it under the terms of either of these two licenses.
8
#
9
# Unless required by applicable law or agreed to in writing, software
10
# distributed under the License is distributed on an "AS IS" BASIS,
11
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
# See the License for the specific language governing permissions and
13
# limitations under the License.
14
#
15
# You should have received a copy of the licenses; if not, see
16
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
17
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
18
# License, Version 2.0.
19
#
20

    
21
"""Utilities for diffing files and trees."""
22
import sys
23
from collections import (
24
    defaultdict,
25
    namedtuple,
26
    )
27

    
28
from io import BytesIO
29
from itertools import chain
30
import stat
31

    
32
from dulwich.objects import (
33
    S_ISGITLINK,
34
    TreeEntry,
35
    )
36

    
37

    
38
# TreeChange type constants.
39
CHANGE_ADD = 'add'
40
CHANGE_MODIFY = 'modify'
41
CHANGE_DELETE = 'delete'
42
CHANGE_RENAME = 'rename'
43
CHANGE_COPY = 'copy'
44
CHANGE_UNCHANGED = 'unchanged'
45

    
46
RENAME_CHANGE_TYPES = (CHANGE_RENAME, CHANGE_COPY)
47

    
48
_NULL_ENTRY = TreeEntry(None, None, None)
49

    
50
_MAX_SCORE = 100
51
RENAME_THRESHOLD = 60
52
MAX_FILES = 200
53
REWRITE_THRESHOLD = None
54

    
55

    
56
class TreeChange(namedtuple('TreeChange', ['type', 'old', 'new'])):
57
    """Named tuple a single change between two trees."""
58

    
59
    @classmethod
60
    def add(cls, new):
61
        return cls(CHANGE_ADD, _NULL_ENTRY, new)
62

    
63
    @classmethod
64
    def delete(cls, old):
65
        return cls(CHANGE_DELETE, old, _NULL_ENTRY)
66

    
67

    
68
def _tree_entries(path, tree):
69
    result = []
70
    if not tree:
71
        return result
72
    for entry in tree.iteritems(name_order=True):
73
        result.append(entry.in_path(path))
74
    return result
75

    
76

    
77
def _merge_entries(path, tree1, tree2):
78
    """Merge the entries of two trees.
79

80
    :param path: A path to prepend to all tree entry names.
81
    :param tree1: The first Tree object to iterate, or None.
82
    :param tree2: The second Tree object to iterate, or None.
83
    :return: A list of pairs of TreeEntry objects for each pair of entries in
84
        the trees. If an entry exists in one tree but not the other, the other
85
        entry will have all attributes set to None. If neither entry's path is
86
        None, they are guaranteed to match.
87
    """
88
    entries1 = _tree_entries(path, tree1)
89
    entries2 = _tree_entries(path, tree2)
90
    i1 = i2 = 0
91
    len1 = len(entries1)
92
    len2 = len(entries2)
93

    
94
    result = []
95
    while i1 < len1 and i2 < len2:
96
        entry1 = entries1[i1]
97
        entry2 = entries2[i2]
98
        if entry1.path < entry2.path:
99
            result.append((entry1, _NULL_ENTRY))
100
            i1 += 1
101
        elif entry1.path > entry2.path:
102
            result.append((_NULL_ENTRY, entry2))
103
            i2 += 1
104
        else:
105
            result.append((entry1, entry2))
106
            i1 += 1
107
            i2 += 1
108
    for i in range(i1, len1):
109
        result.append((entries1[i], _NULL_ENTRY))
110
    for i in range(i2, len2):
111
        result.append((_NULL_ENTRY, entries2[i]))
112
    return result
113

    
114

    
115
def _is_tree(entry):
116
    mode = entry.mode
117
    if mode is None:
118
        return False
119
    return stat.S_ISDIR(mode)
120

    
121

    
122
def walk_trees(store, tree1_id, tree2_id, prune_identical=False):
123
    """Recursively walk all the entries of two trees.
124

125
    Iteration is depth-first pre-order, as in e.g. os.walk.
126

127
    :param store: An ObjectStore for looking up objects.
128
    :param tree1_id: The SHA of the first Tree object to iterate, or None.
129
    :param tree2_id: The SHA of the second Tree object to iterate, or None.
130
    :param prune_identical: If True, identical subtrees will not be walked.
131
    :return: Iterator over Pairs of TreeEntry objects for each pair of entries
132
        in the trees and their subtrees recursively. If an entry exists in one
133
        tree but not the other, the other entry will have all attributes set
134
        to None. If neither entry's path is None, they are guaranteed to
135
        match.
136
    """
137
    # This could be fairly easily generalized to >2 trees if we find a use
138
    # case.
139
    mode1 = tree1_id and stat.S_IFDIR or None
140
    mode2 = tree2_id and stat.S_IFDIR or None
141
    todo = [(TreeEntry(b'', mode1, tree1_id), TreeEntry(b'', mode2, tree2_id))]
142
    while todo:
143
        entry1, entry2 = todo.pop()
144
        is_tree1 = _is_tree(entry1)
145
        is_tree2 = _is_tree(entry2)
146
        if prune_identical and is_tree1 and is_tree2 and entry1 == entry2:
147
            continue
148

    
149
        tree1 = is_tree1 and store[entry1.sha] or None
150
        tree2 = is_tree2 and store[entry2.sha] or None
151
        path = entry1.path or entry2.path
152
        todo.extend(reversed(_merge_entries(path, tree1, tree2)))
153
        yield entry1, entry2
154

    
155

    
156
def _skip_tree(entry):
157
    if entry.mode is None or stat.S_ISDIR(entry.mode):
158
        return _NULL_ENTRY
159
    return entry
160

    
161

    
162
def tree_changes(store, tree1_id, tree2_id, want_unchanged=False,
163
                 rename_detector=None):
164
    """Find the differences between the contents of two trees.
165

166
    :param store: An ObjectStore for looking up objects.
167
    :param tree1_id: The SHA of the source tree.
168
    :param tree2_id: The SHA of the target tree.
169
    :param want_unchanged: If True, include TreeChanges for unmodified entries
170
        as well.
171
    :param rename_detector: RenameDetector object for detecting renames.
172
    :return: Iterator over TreeChange instances for each change between the
173
        source and target tree.
174
    """
175
    if (rename_detector is not None and tree1_id is not None and
176
        tree2_id is not None):
177
        for change in rename_detector.changes_with_renames(
178
            tree1_id, tree2_id, want_unchanged=want_unchanged):
179
                yield change
180
        return
181

    
182
    entries = walk_trees(store, tree1_id, tree2_id,
183
                         prune_identical=(not want_unchanged))
184
    for entry1, entry2 in entries:
185
        if entry1 == entry2 and not want_unchanged:
186
            continue
187

    
188
        # Treat entries for trees as missing.
189
        entry1 = _skip_tree(entry1)
190
        entry2 = _skip_tree(entry2)
191

    
192
        if entry1 != _NULL_ENTRY and entry2 != _NULL_ENTRY:
193
            if stat.S_IFMT(entry1.mode) != stat.S_IFMT(entry2.mode):
194
                # File type changed: report as delete/add.
195
                yield TreeChange.delete(entry1)
196
                entry1 = _NULL_ENTRY
197
                change_type = CHANGE_ADD
198
            elif entry1 == entry2:
199
                change_type = CHANGE_UNCHANGED
200
            else:
201
                change_type = CHANGE_MODIFY
202
        elif entry1 != _NULL_ENTRY:
203
            change_type = CHANGE_DELETE
204
        elif entry2 != _NULL_ENTRY:
205
            change_type = CHANGE_ADD
206
        else:
207
            # Both were None because at least one was a tree.
208
            continue
209
        yield TreeChange(change_type, entry1, entry2)
210

    
211

    
212
def _all_eq(seq, key, value):
213
    for e in seq:
214
        if key(e) != value:
215
            return False
216
    return True
217

    
218

    
219
def _all_same(seq, key):
220
    return _all_eq(seq[1:], key, key(seq[0]))
221

    
222

    
223
def tree_changes_for_merge(store, parent_tree_ids, tree_id,
224
                           rename_detector=None):
225
    """Get the tree changes for a merge tree relative to all its parents.
226

227
    :param store: An ObjectStore for looking up objects.
228
    :param parent_tree_ids: An iterable of the SHAs of the parent trees.
229
    :param tree_id: The SHA of the merge tree.
230
    :param rename_detector: RenameDetector object for detecting renames.
231

232
    :return: Iterator over lists of TreeChange objects, one per conflicted path
233
        in the merge.
234

235
        Each list contains one element per parent, with the TreeChange for that
236
        path relative to that parent. An element may be None if it never
237
        existed in one parent and was deleted in two others.
238

239
        A path is only included in the output if it is a conflict, i.e. its SHA
240
        in the merge tree is not found in any of the parents, or in the case of
241
        deletes, if not all of the old SHAs match.
242
    """
243
    all_parent_changes = [tree_changes(store, t, tree_id,
244
                                       rename_detector=rename_detector)
245
                          for t in parent_tree_ids]
246
    num_parents = len(parent_tree_ids)
247
    changes_by_path = defaultdict(lambda: [None] * num_parents)
248

    
249
    # Organize by path.
250
    for i, parent_changes in enumerate(all_parent_changes):
251
        for change in parent_changes:
252
            if change.type == CHANGE_DELETE:
253
                path = change.old.path
254
            else:
255
                path = change.new.path
256
            changes_by_path[path][i] = change
257

    
258
    old_sha = lambda c: c.old.sha
259
    change_type = lambda c: c.type
260

    
261
    # Yield only conflicting changes.
262
    for _, changes in sorted(changes_by_path.items()):
263
        assert len(changes) == num_parents
264
        have = [c for c in changes if c is not None]
265
        if _all_eq(have, change_type, CHANGE_DELETE):
266
            if not _all_same(have, old_sha):
267
                yield changes
268
        elif not _all_same(have, change_type):
269
            yield changes
270
        elif None not in changes:
271
            # If no change was found relative to one parent, that means the SHA
272
            # must have matched the SHA in that parent, so it is not a
273
            # conflict.
274
            yield changes
275

    
276

    
277
_BLOCK_SIZE = 64
278

    
279

    
280
def _count_blocks(obj):
281
    """Count the blocks in an object.
282

283
    Splits the data into blocks either on lines or <=64-byte chunks of lines.
284

285
    :param obj: The object to count blocks for.
286
    :return: A dict of block hashcode -> total bytes occurring.
287
    """
288
    block_counts = defaultdict(int)
289
    block = BytesIO()
290
    n = 0
291

    
292
    # Cache attrs as locals to avoid expensive lookups in the inner loop.
293
    block_write = block.write
294
    block_seek = block.seek
295
    block_truncate = block.truncate
296
    block_getvalue = block.getvalue
297

    
298
    for c in chain(*obj.as_raw_chunks()):
299
        if sys.version_info[0] == 3:
300
            c = c.to_bytes(1, 'big')
301
        block_write(c)
302
        n += 1
303
        if c == b'\n' or n == _BLOCK_SIZE:
304
            value = block_getvalue()
305
            block_counts[hash(value)] += len(value)
306
            block_seek(0)
307
            block_truncate()
308
            n = 0
309
    if n > 0:
310
        last_block = block_getvalue()
311
        block_counts[hash(last_block)] += len(last_block)
312
    return block_counts
313

    
314

    
315
def _common_bytes(blocks1, blocks2):
316
    """Count the number of common bytes in two block count dicts.
317

318
    :param block1: The first dict of block hashcode -> total bytes.
319
    :param block2: The second dict of block hashcode -> total bytes.
320
    :return: The number of bytes in common between blocks1 and blocks2. This is
321
        only approximate due to possible hash collisions.
322
    """
323
    # Iterate over the smaller of the two dicts, since this is symmetrical.
324
    if len(blocks1) > len(blocks2):
325
        blocks1, blocks2 = blocks2, blocks1
326
    score = 0
327
    for block, count1 in blocks1.items():
328
        count2 = blocks2.get(block)
329
        if count2:
330
            score += min(count1, count2)
331
    return score
332

    
333

    
334
def _similarity_score(obj1, obj2, block_cache=None):
335
    """Compute a similarity score for two objects.
336

337
    :param obj1: The first object to score.
338
    :param obj2: The second object to score.
339
    :param block_cache: An optional dict of SHA to block counts to cache
340
        results between calls.
341
    :return: The similarity score between the two objects, defined as the
342
        number of bytes in common between the two objects divided by the
343
        maximum size, scaled to the range 0-100.
344
    """
345
    if block_cache is None:
346
        block_cache = {}
347
    if obj1.id not in block_cache:
348
        block_cache[obj1.id] = _count_blocks(obj1)
349
    if obj2.id not in block_cache:
350
        block_cache[obj2.id] = _count_blocks(obj2)
351

    
352
    common_bytes = _common_bytes(block_cache[obj1.id], block_cache[obj2.id])
353
    max_size = max(obj1.raw_length(), obj2.raw_length())
354
    if not max_size:
355
        return _MAX_SCORE
356
    return int(float(common_bytes) * _MAX_SCORE / max_size)
357

    
358

    
359
def _tree_change_key(entry):
360
    # Sort by old path then new path. If only one exists, use it for both keys.
361
    path1 = entry.old.path
362
    path2 = entry.new.path
363
    if path1 is None:
364
        path1 = path2
365
    if path2 is None:
366
        path2 = path1
367
    return (path1, path2)
368

    
369

    
370
class RenameDetector(object):
371
    """Object for handling rename detection between two trees."""
372

    
373
    def __init__(self, store, rename_threshold=RENAME_THRESHOLD,
374
                 max_files=MAX_FILES,
375
                 rewrite_threshold=REWRITE_THRESHOLD,
376
                 find_copies_harder=False):
377
        """Initialize the rename detector.
378

379
        :param store: An ObjectStore for looking up objects.
380
        :param rename_threshold: The threshold similarity score for considering
381
            an add/delete pair to be a rename/copy; see _similarity_score.
382
        :param max_files: The maximum number of adds and deletes to consider,
383
            or None for no limit. The detector is guaranteed to compare no more
384
            than max_files ** 2 add/delete pairs. This limit is provided because
385
            rename detection can be quadratic in the project size. If the limit
386
            is exceeded, no content rename detection is attempted.
387
        :param rewrite_threshold: The threshold similarity score below which a
388
            modify should be considered a delete/add, or None to not break
389
            modifies; see _similarity_score.
390
        :param find_copies_harder: If True, consider unmodified files when
391
            detecting copies.
392
        """
393
        self._store = store
394
        self._rename_threshold = rename_threshold
395
        self._rewrite_threshold = rewrite_threshold
396
        self._max_files = max_files
397
        self._find_copies_harder = find_copies_harder
398
        self._want_unchanged = False
399

    
400
    def _reset(self):
401
        self._adds = []
402
        self._deletes = []
403
        self._changes = []
404

    
405
    def _should_split(self, change):
406
        if (self._rewrite_threshold is None or change.type != CHANGE_MODIFY or
407
            change.old.sha == change.new.sha):
408
            return False
409
        old_obj = self._store[change.old.sha]
410
        new_obj = self._store[change.new.sha]
411
        return _similarity_score(old_obj, new_obj) < self._rewrite_threshold
412

    
413
    def _add_change(self, change):
414
        if change.type == CHANGE_ADD:
415
            self._adds.append(change)
416
        elif change.type == CHANGE_DELETE:
417
            self._deletes.append(change)
418
        elif self._should_split(change):
419
            self._deletes.append(TreeChange.delete(change.old))
420
            self._adds.append(TreeChange.add(change.new))
421
        elif ((self._find_copies_harder and change.type == CHANGE_UNCHANGED)
422
              or change.type == CHANGE_MODIFY):
423
            # Treat all modifies as potential deletes for rename detection,
424
            # but don't split them (to avoid spurious renames). Setting
425
            # find_copies_harder means we treat unchanged the same as
426
            # modified.
427
            self._deletes.append(change)
428
        else:
429
            self._changes.append(change)
430

    
431
    def _collect_changes(self, tree1_id, tree2_id):
432
        want_unchanged = self._find_copies_harder or self._want_unchanged
433
        for change in tree_changes(self._store, tree1_id, tree2_id,
434
                                   want_unchanged=want_unchanged):
435
            self._add_change(change)
436

    
437
    def _prune(self, add_paths, delete_paths):
438
        self._adds = [a for a in self._adds if a.new.path not in add_paths]
439
        self._deletes = [d for d in self._deletes
440
                         if d.old.path not in delete_paths]
441

    
442
    def _find_exact_renames(self):
443
        add_map = defaultdict(list)
444
        for add in self._adds:
445
            add_map[add.new.sha].append(add.new)
446
        delete_map = defaultdict(list)
447
        for delete in self._deletes:
448
            # Keep track of whether the delete was actually marked as a delete.
449
            # If not, it needs to be marked as a copy.
450
            is_delete = delete.type == CHANGE_DELETE
451
            delete_map[delete.old.sha].append((delete.old, is_delete))
452

    
453
        add_paths = set()
454
        delete_paths = set()
455
        for sha, sha_deletes in delete_map.items():
456
            sha_adds = add_map[sha]
457
            for (old, is_delete), new in zip(sha_deletes, sha_adds):
458
                if stat.S_IFMT(old.mode) != stat.S_IFMT(new.mode):
459
                    continue
460
                if is_delete:
461
                    delete_paths.add(old.path)
462
                add_paths.add(new.path)
463
                new_type = is_delete and CHANGE_RENAME or CHANGE_COPY
464
                self._changes.append(TreeChange(new_type, old, new))
465

    
466
            num_extra_adds = len(sha_adds) - len(sha_deletes)
467
            # TODO(dborowitz): Less arbitrary way of dealing with extra copies.
468
            old = sha_deletes[0][0]
469
            if num_extra_adds > 0:
470
                for new in sha_adds[-num_extra_adds:]:
471
                    add_paths.add(new.path)
472
                    self._changes.append(TreeChange(CHANGE_COPY, old, new))
473
        self._prune(add_paths, delete_paths)
474

    
475
    def _should_find_content_renames(self):
476
        return len(self._adds) * len(self._deletes) <= self._max_files ** 2
477

    
478
    def _rename_type(self, check_paths, delete, add):
479
        if check_paths and delete.old.path == add.new.path:
480
            # If the paths match, this must be a split modify, so make sure it
481
            # comes out as a modify.
482
            return CHANGE_MODIFY
483
        elif delete.type != CHANGE_DELETE:
484
            # If it's in deletes but not marked as a delete, it must have been
485
            # added due to find_copies_harder, and needs to be marked as a
486
            # copy.
487
            return CHANGE_COPY
488
        return CHANGE_RENAME
489

    
490
    def _find_content_rename_candidates(self):
491
        candidates = self._candidates = []
492
        # TODO: Optimizations:
493
        #  - Compare object sizes before counting blocks.
494
        #  - Skip if delete's S_IFMT differs from all adds.
495
        #  - Skip if adds or deletes is empty.
496
        # Match C git's behavior of not attempting to find content renames if
497
        # the matrix size exceeds the threshold.
498
        if not self._should_find_content_renames():
499
            return
500

    
501
        block_cache = {}
502
        check_paths = self._rename_threshold is not None
503
        for delete in self._deletes:
504
            if S_ISGITLINK(delete.old.mode):
505
                continue  # Git links don't exist in this repo.
506
            old_sha = delete.old.sha
507
            old_obj = self._store[old_sha]
508
            block_cache[old_sha] = _count_blocks(old_obj)
509
            for add in self._adds:
510
                if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode):
511
                    continue
512
                new_obj = self._store[add.new.sha]
513
                score = _similarity_score(old_obj, new_obj,
514
                                          block_cache=block_cache)
515
                if score > self._rename_threshold:
516
                    new_type = self._rename_type(check_paths, delete, add)
517
                    rename = TreeChange(new_type, delete.old, add.new)
518
                    candidates.append((-score, rename))
519

    
520
    def _choose_content_renames(self):
521
        # Sort scores from highest to lowest, but keep names in ascending
522
        # order.
523
        self._candidates.sort()
524

    
525
        delete_paths = set()
526
        add_paths = set()
527
        for _, change in self._candidates:
528
            new_path = change.new.path
529
            if new_path in add_paths:
530
                continue
531
            old_path = change.old.path
532
            orig_type = change.type
533
            if old_path in delete_paths:
534
                change = TreeChange(CHANGE_COPY, change.old, change.new)
535

    
536
            # If the candidate was originally a copy, that means it came from a
537
            # modified or unchanged path, so we don't want to prune it.
538
            if orig_type != CHANGE_COPY:
539
                delete_paths.add(old_path)
540
            add_paths.add(new_path)
541
            self._changes.append(change)
542
        self._prune(add_paths, delete_paths)
543

    
544
    def _join_modifies(self):
545
        if self._rewrite_threshold is None:
546
            return
547

    
548
        modifies = {}
549
        delete_map = dict((d.old.path, d) for d in self._deletes)
550
        for add in self._adds:
551
            path = add.new.path
552
            delete = delete_map.get(path)
553
            if (delete is not None and
554
                stat.S_IFMT(delete.old.mode) == stat.S_IFMT(add.new.mode)):
555
                modifies[path] = TreeChange(CHANGE_MODIFY, delete.old, add.new)
556

    
557
        self._adds = [a for a in self._adds if a.new.path not in modifies]
558
        self._deletes = [a for a in self._deletes if a.new.path not in
559
                         modifies]
560
        self._changes += modifies.values()
561

    
562
    def _sorted_changes(self):
563
        result = []
564
        result.extend(self._adds)
565
        result.extend(self._deletes)
566
        result.extend(self._changes)
567
        result.sort(key=_tree_change_key)
568
        return result
569

    
570
    def _prune_unchanged(self):
571
        if self._want_unchanged:
572
            return
573
        self._deletes = [d for d in self._deletes if d.type != CHANGE_UNCHANGED]
574

    
575
    def changes_with_renames(self, tree1_id, tree2_id, want_unchanged=False):
576
        """Iterate TreeChanges between two tree SHAs, with rename detection."""
577
        self._reset()
578
        self._want_unchanged = want_unchanged
579
        self._collect_changes(tree1_id, tree2_id)
580
        self._find_exact_renames()
581
        self._find_content_rename_candidates()
582
        self._choose_content_renames()
583
        self._join_modifies()
584
        self._prune_unchanged()
585
        return self._sorted_changes()
586

    
587

    
588
# Hold on to the pure-python implementations for testing.
589
_is_tree_py = _is_tree
590
_merge_entries_py = _merge_entries
591
_count_blocks_py = _count_blocks
592
try:
593
    # Try to import C versions
594
    from dulwich._diff_tree import _is_tree, _merge_entries, _count_blocks
595
except ImportError:
596
    pass