Changeset - 9cdb81c7b855
[Not reviewed]
default
0 1 0
Mads Kiilerich - 8 years ago 2017-10-03 00:14:40
mads@kiilerich.com
diffs: extract _parse_lines as a pure function
1 file changed with 102 insertions and 100 deletions:
0 comments (0 inline, 0 general)
kallithea/lib/diffs.py
Show inline comments
 
@@ -247,50 +247,48 @@ def get_gitdiff(filenode_old, filenode_n
 
    repo = filenode_new.changeset.repository
 
    old_raw_id = getattr(filenode_old.changeset, 'raw_id', repo.EMPTY_CHANGESET)
 
    new_raw_id = getattr(filenode_new.changeset, 'raw_id', repo.EMPTY_CHANGESET)
 

	
 
    vcs_gitdiff = repo.get_diff(old_raw_id, new_raw_id, filenode_new.path,
 
                                ignore_whitespace, context)
 
    return vcs_gitdiff
 

	
 
NEW_FILENODE = 1
 
DEL_FILENODE = 2
 
MOD_FILENODE = 3
 
RENAMED_FILENODE = 4
 
COPIED_FILENODE = 5
 
CHMOD_FILENODE = 6
 
BIN_FILENODE = 7
 

	
 

	
 
class DiffProcessor(object):
 
    """
 
    Give it a unified or git diff and it returns a list of the files that were
 
    mentioned in the diff together with a dict of meta information that
 
    can be used to render it in a HTML template.
 
    """
 
    _diff_git_re = re.compile('^diff --git', re.MULTILINE)
 
    _chunk_re = re.compile(r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)')
 
    _newline_marker = re.compile(r'^\\ No newline at end of file')
 

	
 
    def __init__(self, diff, vcs='hg', diff_limit=None, inline_diff=True):
 
        """
 
        :param diff:   a text in diff format
 
        :param vcs: type of version control hg or git
 
        :param diff_limit: define the size of diff that is considered "big"
 
            based on that parameter cut off will be triggered, set to None
 
            to show full diff
 
        """
 
        if not isinstance(diff, basestring):
 
            raise Exception('Diff must be a basestring got %s instead' % type(diff))
 

	
 
        self._diff = diff
 
        self.adds = 0
 
        self.removes = 0
 
        self.diff_limit = diff_limit
 
        self.limited_diff = False
 
        self.vcs = vcs
 
        self.parsed = self._parse_gitdiff(inline_diff=inline_diff)
 

	
 
    def _parse_gitdiff(self, inline_diff):
 
        """Parse self._diff and return a list of dicts with meta info and chunks for each file.
 
        Might set limited_diff.
 
        Optionally, do an extra pass and to extra markup of one-liner changes.
 
@@ -343,49 +341,49 @@ class DiffProcessor(object):
 
                    op = 'M'
 
                    stats['binary'] = True
 
                    stats['ops'][COPIED_FILENODE] = ('file copied from %s to %s'
 
                                        % (head['copy_from'], head['copy_to']))
 
                # FALL BACK: detect missed old style add or remove
 
                if op is None:
 
                    if not head['a_file'] and head['b_file']:
 
                        op = 'A'
 
                        stats['binary'] = True
 
                        stats['ops'][NEW_FILENODE] = 'new file'
 

	
 
                    elif head['a_file'] and not head['b_file']:
 
                        op = 'D'
 
                        stats['binary'] = True
 
                        stats['ops'][DEL_FILENODE] = 'deleted file'
 

	
 
                # it's not ADD not DELETE
 
                if op is None:
 
                    op = 'M'
 
                    stats['binary'] = True
 
                    stats['ops'][MOD_FILENODE] = 'modified file'
 

	
 
            # a real non-binary diff
 
            if head['a_file'] or head['b_file']:
 
                chunks, added, deleted = self._parse_lines(diff_lines)
 
                chunks, added, deleted = _parse_lines(diff_lines)
 
                stats['binary'] = False
 
                stats['added'] = added
 
                stats['deleted'] = deleted
 
                # explicit mark that it's a modified file
 
                if op == 'M':
 
                    stats['ops'][MOD_FILENODE] = 'modified file'
 
            else:  # Git binary patch (or empty diff)
 
                # Git binary patch
 
                if head['bin_patch']:
 
                    stats['ops'][BIN_FILENODE] = 'binary diff not shown'
 
                chunks = []
 

	
 
            if op == 'D' and chunks:
 
                # a way of seeing deleted content could perhaps be nice - but
 
                # not with the current UI
 
                chunks = []
 

	
 
            chunks.insert(0, [{
 
                'old_lineno': '',
 
                'new_lineno': '',
 
                'action':     'context',
 
                'line':       msg,
 
                } for _op, msg in stats['ops'].iteritems()
 
                  if _op not in [MOD_FILENODE]])
 
@@ -415,145 +413,48 @@ class DiffProcessor(object):
 
                            peekline = lineiter.next()
 
                        delline = peekline
 
                        peekline = lineiter.next()
 
                        # if not followed by add, eat all following del lines
 
                        if peekline['action'] != 'add':
 
                            while peekline['action'] == 'del':
 
                                peekline = lineiter.next()
 
                            continue
 
                        # found an add - make sure it is the only one
 
                        addline = peekline
 
                        try:
 
                            peekline = lineiter.next()
 
                        except StopIteration:
 
                            # add was last line - ok
 
                            _highlight_inline_diff(delline, addline)
 
                            raise
 
                        if peekline['action'] != 'add':
 
                            # there was only one add line - ok
 
                            _highlight_inline_diff(delline, addline)
 
                except StopIteration:
 
                    pass
 

	
 
        return _files
 

	
 
    def _parse_lines(self, diff_lines):
 
        """
 
        Given an iterator of diff body lines, parse them and return a dict per
 
        line and added/removed totals.
 
        """
 
        added = deleted = 0
 
        old_line = old_end = new_line = new_end = None
 

	
 
        try:
 
            chunks = []
 
            line = diff_lines.next()
 

	
 
            while True:
 
                lines = []
 
                chunks.append(lines)
 

	
 
                match = self._chunk_re.match(line)
 

	
 
                if not match:
 
                    raise Exception('error parsing diff @@ line %r' % line)
 

	
 
                gr = match.groups()
 
                (old_line, old_end,
 
                 new_line, new_end) = [int(x or 1) for x in gr[:-1]]
 
                old_line -= 1
 
                new_line -= 1
 

	
 
                context = len(gr) == 5
 
                old_end += old_line
 
                new_end += new_line
 

	
 
                if context:
 
                    # skip context only if it's first line
 
                    if int(gr[0]) > 1:
 
                        lines.append({
 
                            'old_lineno': '...',
 
                            'new_lineno': '...',
 
                            'action':     'context',
 
                            'line':       line,
 
                        })
 

	
 
                line = diff_lines.next()
 

	
 
                while old_line < old_end or new_line < new_end:
 
                    if not line:
 
                        raise Exception('error parsing diff - empty line at -%s+%s' % (old_line, new_line))
 

	
 
                    affects_old = affects_new = False
 

	
 
                    command = line[0]
 
                    if command == '+':
 
                        affects_new = True
 
                        action = 'add'
 
                        added += 1
 
                    elif command == '-':
 
                        affects_old = True
 
                        action = 'del'
 
                        deleted += 1
 
                    elif command == ' ':
 
                        affects_old = affects_new = True
 
                        action = 'unmod'
 
                    else:
 
                        raise Exception('error parsing diff - unknown command in line %r at -%s+%s' % (line, old_line, new_line))
 

	
 
                    if not self._newline_marker.match(line):
 
                        old_line += affects_old
 
                        new_line += affects_new
 
                        lines.append({
 
                            'old_lineno':   affects_old and old_line or '',
 
                            'new_lineno':   affects_new and new_line or '',
 
                            'action':       action,
 
                            'line':         line[1:],
 
                        })
 

	
 
                    line = diff_lines.next()
 

	
 
                    if self._newline_marker.match(line):
 
                        # we need to append to lines, since this is not
 
                        # counted in the line specs of diff
 
                        lines.append({
 
                            'old_lineno':   '...',
 
                            'new_lineno':   '...',
 
                            'action':       'context',
 
                            'line':         line,
 
                        })
 
                        line = diff_lines.next()
 
                if old_line > old_end:
 
                    raise Exception('error parsing diff - more than %s "-" lines at -%s+%s' % (old_end, old_line, new_line))
 
                if new_line > new_end:
 
                    raise Exception('error parsing diff - more than %s "+" lines at -%s+%s' % (new_end, old_line, new_line))
 
        except StopIteration:
 
            pass
 
        if old_line != old_end or new_line != new_end:
 
            raise Exception('diff processing broken when old %s<>%s or new %s<>%s line %r' % (old_line, old_end, new_line, new_end, line))
 

	
 
        return chunks, added, deleted
 

	
 
    def stat(self):
 
        """
 
        Returns tuple of added, and removed lines for this instance
 
        """
 
        return self.adds, self.removes
 

	
 

	
 
_escape_re = re.compile(r'(&)|(<)|(>)|(\t)|(\r)|(?<=.)( \n| $)')
 

	
 

	
 
def _escaper(string):
 
    """
 
    Do HTML escaping/markup
 
    """
 

	
 
    def substitute(m):
 
        groups = m.groups()
 
        if groups[0]:
 
            return '&amp;'
 
        if groups[1]:
 
            return '&lt;'
 
        if groups[2]:
 
            return '&gt;'
 
        if groups[3]:
 
@@ -609,48 +510,149 @@ def _get_header(vcs, diff_chunk):
 

	
 
    1. A dict with meta info:
 

	
 
        a_path, b_path, similarity_index, rename_from, rename_to,
 
        old_mode, new_mode, new_file_mode, deleted_file_mode,
 
        a_blob_id, b_blob_id, b_mode, a_file, b_file
 

	
 
    2. An iterator yielding lines with simple HTML markup.
 
    """
 
    match = None
 
    if vcs == 'git':
 
        match = _git_header_re.match(diff_chunk)
 
    elif vcs == 'hg':
 
        match = _hg_header_re.match(diff_chunk)
 
    if match is None:
 
        raise Exception('diff not recognized as valid %s diff' % vcs)
 
    meta_info = match.groupdict()
 
    rest = diff_chunk[match.end():]
 
    if rest and not rest.startswith('@') and not rest.startswith('literal ') and not rest.startswith('delta '):
 
        raise Exception('cannot parse %s diff header: %r followed by %r' % (vcs, diff_chunk[:match.end()], rest[:1000]))
 
    diff_lines = (_escaper(m.group(0)) for m in re.finditer(r'.*\n|.+$', rest)) # don't split on \r as str.splitlines do
 
    return meta_info, diff_lines
 

	
 

	
 
_chunk_re = re.compile(r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)')
 
_newline_marker = re.compile(r'^\\ No newline at end of file')
 

	
 

	
 
def _parse_lines(diff_lines):
 
    """
 
    Given an iterator of diff body lines, parse them and return a dict per
 
    line and added/removed totals.
 
    """
 
    added = deleted = 0
 
    old_line = old_end = new_line = new_end = None
 

	
 
    try:
 
        chunks = []
 
        line = diff_lines.next()
 

	
 
        while True:
 
            lines = []
 
            chunks.append(lines)
 

	
 
            match = _chunk_re.match(line)
 

	
 
            if not match:
 
                raise Exception('error parsing diff @@ line %r' % line)
 

	
 
            gr = match.groups()
 
            (old_line, old_end,
 
             new_line, new_end) = [int(x or 1) for x in gr[:-1]]
 
            old_line -= 1
 
            new_line -= 1
 

	
 
            context = len(gr) == 5
 
            old_end += old_line
 
            new_end += new_line
 

	
 
            if context:
 
                # skip context only if it's first line
 
                if int(gr[0]) > 1:
 
                    lines.append({
 
                        'old_lineno': '...',
 
                        'new_lineno': '...',
 
                        'action':     'context',
 
                        'line':       line,
 
                    })
 

	
 
            line = diff_lines.next()
 

	
 
            while old_line < old_end or new_line < new_end:
 
                if not line:
 
                    raise Exception('error parsing diff - empty line at -%s+%s' % (old_line, new_line))
 

	
 
                affects_old = affects_new = False
 

	
 
                command = line[0]
 
                if command == '+':
 
                    affects_new = True
 
                    action = 'add'
 
                    added += 1
 
                elif command == '-':
 
                    affects_old = True
 
                    action = 'del'
 
                    deleted += 1
 
                elif command == ' ':
 
                    affects_old = affects_new = True
 
                    action = 'unmod'
 
                else:
 
                    raise Exception('error parsing diff - unknown command in line %r at -%s+%s' % (line, old_line, new_line))
 

	
 
                if not _newline_marker.match(line):
 
                    old_line += affects_old
 
                    new_line += affects_new
 
                    lines.append({
 
                        'old_lineno':   affects_old and old_line or '',
 
                        'new_lineno':   affects_new and new_line or '',
 
                        'action':       action,
 
                        'line':         line[1:],
 
                    })
 

	
 
                line = diff_lines.next()
 

	
 
                if _newline_marker.match(line):
 
                    # we need to append to lines, since this is not
 
                    # counted in the line specs of diff
 
                    lines.append({
 
                        'old_lineno':   '...',
 
                        'new_lineno':   '...',
 
                        'action':       'context',
 
                        'line':         line,
 
                    })
 
                    line = diff_lines.next()
 
            if old_line > old_end:
 
                raise Exception('error parsing diff - more than %s "-" lines at -%s+%s' % (old_end, old_line, new_line))
 
            if new_line > new_end:
 
                raise Exception('error parsing diff - more than %s "+" lines at -%s+%s' % (new_end, old_line, new_line))
 
    except StopIteration:
 
        pass
 
    if old_line != old_end or new_line != new_end:
 
        raise Exception('diff processing broken when old %s<>%s or new %s<>%s line %r' % (old_line, old_end, new_line, new_end, line))
 

	
 
    return chunks, added, deleted
 

	
 
# Used for inline highlighter word split, must match the substitutions in _escaper
 
_token_re = re.compile(r'()(&amp;|&lt;|&gt;|<u>\t</u>|<u class="cr"></u>| <i></i>|\W+?)')
 

	
 

	
 
def _highlight_inline_diff(old, new):
 
    """
 
    Highlight simple add/remove in two lines given as info dicts. They are
 
    modified in place and given markup with <del>/<ins>.
 
    """
 
    assert old['action'] == 'del'
 
    assert new['action'] == 'add'
 

	
 
    oldwords = _token_re.split(old['line'])
 
    newwords = _token_re.split(new['line'])
 
    sequence = difflib.SequenceMatcher(None, oldwords, newwords)
 

	
 
    oldfragments, newfragments = [], []
 
    for tag, i1, i2, j1, j2 in sequence.get_opcodes():
 
        oldfrag = ''.join(oldwords[i1:i2])
 
        newfrag = ''.join(newwords[j1:j2])
 
        if tag != 'equal':
 
            if oldfrag:
 
                oldfrag = '<del>%s</del>' % oldfrag
 
            if newfrag:
0 comments (0 inline, 0 general)