Skip to content
This repository was archived by the owner on Sep 11, 2020. It is now read-only.

Commit d643cea

Browse files
committed
Blame support for files
This also includes a diff package and revlist package (needed by blame) Some extra packfiles (<1MB) are also included, to be used as fixtures in the tests.
1 parent caab43e commit d643cea

File tree

12 files changed

+2026
-1
lines changed

12 files changed

+2026
-1
lines changed

blame/blame.go

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
// Package blame contains blaming functionality for files in the repo.
2+
//
3+
// Blaming a file is finding what commit was the last to modify each of
4+
// the lines in the file, therefore the output of a blaming operation is
5+
// usually a slice of commits, one commit per line in the file.
6+
//
7+
// This package also provides a pretty print function to output the
8+
// results of a blame in a similar format to the git-blame command.
9+
package blame
10+
11+
import (
12+
"bytes"
13+
"fmt"
14+
"sort"
15+
"strconv"
16+
"strings"
17+
"unicode/utf8"
18+
19+
"gopkg.in/src-d/go-git.v2"
20+
"gopkg.in/src-d/go-git.v2/core"
21+
"gopkg.in/src-d/go-git.v2/diff"
22+
"gopkg.in/src-d/go-git.v2/revlist"
23+
)
24+
25+
// Blame returns the last commit that modified each line of a file in
26+
// a repository.
27+
//
28+
// The file to blame is identified by the input arguments: repo, commit and path.
29+
// The output is a slice of commits, one for each line in the file.
30+
//
31+
// Blaming a file is a two step process:
32+
//
33+
// 1. Create a linear history of the commits affecting a file. We use
34+
// revlist.New for that.
35+
//
36+
// 2. Then build a graph with a node for every line in every file in
37+
// the history of the file.
38+
//
39+
// Each node (line) holds the commit where it was introduced or
40+
// last modified. To achieve that we use the FORWARD algorithm
41+
// described in Zimmermann, et al. "Mining Version Archives for
42+
// Co-changed Lines", in proceedings of the Mining Software
43+
// Repositories workshop, Shanghai, May 22-23, 2006.
44+
//
45+
// Each node is asigned a commit: Start by the nodes in the first
46+
// commit. Assign that commit as the creator of all its lines.
47+
//
48+
// Then jump to the nodes in the next commit, and calculate the diff
49+
// between the two files. Newly created lines get
50+
// assigned the new commit as its origin. Modified lines also get
51+
// this new commit. Untouched lines retain the old commit.
52+
//
53+
// All this work is done in the assignOrigin function.
54+
//
55+
// This function holds all the internal relevant data in a blame
56+
// struct, that is not exported.
57+
//
58+
// TODO: ways to improve the efficiency of this function:
59+
//
60+
// 1. Improve revlist
61+
//
62+
// 2. Improve how to traverse the history (example a backward
63+
// traversal will be much more efficient)
64+
//
65+
// TODO: ways to improve the functrion in general
66+
//
67+
// 1. Add memoization betweenn revlist and assign.
68+
//
69+
// 2. It is using much more memmory than needed, see the TODOs below.
70+
func Blame(repo *git.Repository, commit *git.Commit, path string) ([]*git.Commit, error) {
71+
// init the internal blame struct
72+
b := new(blame)
73+
b.repo = repo
74+
b.fRev = commit
75+
b.path = path
76+
77+
// calculte the history of the file and store it in the
78+
// internal blame struct.
79+
var err error
80+
b.revs, err = revlist.New(b.repo, b.fRev, b.path)
81+
if err != nil {
82+
return nil, err
83+
}
84+
sort.Sort(b.revs) // for forward blame, we need the history sorted by commit date
85+
86+
// allocate space for the data in all the revisions of the file
87+
b.data = make([]string, len(b.revs))
88+
89+
// init the graph
90+
b.graph = make([][]vertex, len(b.revs))
91+
92+
// for all every revision of the file, starting with the first
93+
// one...
94+
var found bool
95+
for i, rev := range b.revs {
96+
// get the contents of the file
97+
b.data[i], found = git.Data(b.path, rev)
98+
if !found {
99+
continue
100+
}
101+
// count its lines
102+
nLines := git.CountLines(b.data[i])
103+
// create a node for each line
104+
b.graph[i] = make([]vertex, nLines)
105+
// assign a commit to each node
106+
// if this is the first revision, then the node is assigned to
107+
// this first commit.
108+
if i == 0 {
109+
for j := 0; j < nLines; j++ {
110+
b.graph[i][j] = vertex(b.revs[i])
111+
}
112+
} else {
113+
// if this is not the first commit, then assign to the old
114+
// commit or to the new one, depending on what the diff
115+
// says.
116+
b.assignOrigin(i, i-1)
117+
}
118+
}
119+
120+
// fill in the output results: copy the nodes of the last revision
121+
// into the result.
122+
fVs := b.graph[len(b.graph)-1]
123+
result := make([]*git.Commit, 0, len(fVs))
124+
for _, v := range fVs {
125+
c := git.Commit(*v)
126+
result = append(result, &c)
127+
}
128+
return result, nil
129+
}
130+
131+
// this struct is internally used by the blame function to hold its
132+
// intputs, outputs and state.
133+
type blame struct {
134+
repo *git.Repository // the repo holding the history of the file to blame
135+
path string // the path of the file to blame
136+
fRev *git.Commit // the commit of the final revision of the file to blame
137+
revs revlist.Revs // the chain of revisions affecting the the file to blame
138+
data []string // the contents on the file in all the revisions TODO: not all data is needed, only the current rev and the prev
139+
graph [][]vertex // the graph of the lines in the file across all the revisions TODO: not all vertexes are needed, only the current rev and the prev
140+
}
141+
142+
// vertex is a node in the blame graph (one per line per revision).
// A vertex only needs to store the original commit it came from.
type vertex *git.Commit
143+
144+
// Assigns origin to vertexes in current (c) rev from data in its previous (p)
145+
// revision
146+
func (b *blame) assignOrigin(c, p int) {
147+
// assign origin based on diff info
148+
hunks := diff.Do(b.data[p], b.data[c])
149+
sl := -1 // source line
150+
dl := -1 // destination line
151+
for h := range hunks {
152+
hLines := git.CountLines(hunks[h].Text)
153+
for hl := 0; hl < hLines; hl++ {
154+
// fmt.Printf("file=%q, rev=%d, r=%d, h=%d, hunk=%v, hunkLine=%d\n", file, rev, r, h, hunks[h], hl)
155+
switch {
156+
case hunks[h].Type == 0:
157+
sl++
158+
dl++
159+
b.graph[c][dl] = b.graph[p][sl]
160+
case hunks[h].Type == 1:
161+
dl++
162+
b.graph[c][dl] = vertex(b.revs[c])
163+
case hunks[h].Type == -1:
164+
sl++
165+
default:
166+
panic("unreachable")
167+
}
168+
}
169+
}
170+
}
171+
172+
// This will print the results of a Blame as in git-blame.
173+
func (b *blame) PrettyPrint() string {
174+
var buf bytes.Buffer
175+
176+
contents, found := git.Data(b.path, b.fRev)
177+
if !found {
178+
panic("PrettyPrint: internal error in repo.Data")
179+
}
180+
181+
lines := strings.Split(contents, "\n")
182+
// max line number length
183+
mlnl := len(fmt.Sprintf("%s", strconv.Itoa(len(lines))))
184+
// max author length
185+
mal := b.maxAuthorLength()
186+
format := fmt.Sprintf("%%s (%%-%ds %%%dd) %%s\n",
187+
mal, mlnl)
188+
189+
fVs := b.graph[len(b.graph)-1]
190+
for ln, v := range fVs {
191+
fmt.Fprintf(&buf, format, v.Hash.String()[:8],
192+
prettyPrintAuthor(fVs[ln]), ln+1, lines[ln])
193+
}
194+
return buf.String()
195+
}
196+
197+
// utility function to pretty print the author.
198+
func prettyPrintAuthor(c *git.Commit) string {
199+
return fmt.Sprintf("%s %s", c.Author.Name, c.Author.When.Format("2006-01-02"))
200+
}
201+
202+
// utility function to calculate the number of runes needed
203+
// to print the longest author name in the blame of a file.
204+
func (b *blame) maxAuthorLength() int {
205+
memo := make(map[core.Hash]struct{}, len(b.graph)-1)
206+
fVs := b.graph[len(b.graph)-1]
207+
m := 0
208+
for ln := range fVs {
209+
if _, ok := memo[fVs[ln].Hash]; ok {
210+
continue
211+
}
212+
memo[fVs[ln].Hash] = struct{}{}
213+
m = max(m, utf8.RuneCountInString(prettyPrintAuthor(fVs[ln])))
214+
}
215+
return m
216+
}
217+
218+
// max returns the larger of the two integers a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}

blame/blame2humantest.bash

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/bin/bash
#
# Prints, on stdout, a Go test fixture entry with the blame of a file of
# the repository in the current directory, as reported by git-blame.
# Consecutive lines blamed on the same commit are collapsed into a
# single repeat("<hash>", <count>) call.
#
# Usage: blame2humantest.bash [commit] path
#
#   commit  revision to blame the file at (defaults to HEAD)
#   path    path of the file to blame

set -e

repo=$(git remote show origin | grep Fetch | cut -d' ' -f5)
branch="master"
if [ "$#" -eq 1 ] ; then
    commit=$(git rev-parse HEAD)
    path=$1
elif [ "$#" -eq 2 ] ; then
    commit=$1
    path=$2
else
    echo "bad number of parameters" >&2
    echo >&2
    echo " try with: [commit] path" >&2
    # a usage error must not look like a success to the caller
    exit 1
fi

# Blame the file as of ${commit}, so the hashes match the commit written
# in the fixture below. -l makes git-blame print the full hashes, so no
# further lookup per line is needed.
blames=$(git blame --root -l "${commit}" -- "${path}" | cut -d' ' -f1)
declare -a blame
for fullBlame in $blames ; do
    blame+=("${fullBlame}")
done

# some remotes have the .git, other don't,
# repoDot makes sure all have
repoDot="${repo%.git}.git"

echo -e "\t{\"${repoDot}\", \"${branch}\", \"${commit}\", \"${path}\", concat(&[]string{},"
prev=""
count=1
for hash in "${blame[@]}" ; do
    if [ -z "${prev}" ] ; then
        prev=$hash
    elif [ "${prev}" == "${hash}" ] ; then
        count=$((count + 1))
    else
        echo -e "\t\trepeat(\"${prev}\", ${count}),"
        count=1
        prev=$hash
    fi
done
# flush the last run of lines, if the file had any line at all
if [ -n "${prev}" ] ; then
    echo -e "\t\trepeat(\"${prev}\", ${count}),"
fi
echo -e "\t)},"

0 commit comments

Comments
 (0)