Skip to content
This repository was archived by the owner on Sep 30, 2024. It is now read-only.

Commit 5d8286b

Browse files
authored
Backend: add line index (#63726)
This adds a line index utility. Frequently, I want to be able to efficiently index a file to extract a specific line or range of lines, but it's surprisingly tricky to get exactly right given weird definitions of "what even is a line" and edge conditions around out-of-bounds and such. So this adds a general-purpose utility to pre-calculate the locations of lines in the file, making extracting a line range a zero-allocation, `O(1)` operation. Not implemented: the same index can also be used to find the line that contains an offset, which I've also needed to do before. But I'll save that for when I actually have an immediate use for it.
1 parent 0fc4d28 commit 5d8286b

File tree

3 files changed

+247
-2
lines changed

3 files changed

+247
-2
lines changed

internal/byteutils/BUILD.bazel

+6-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
44
go_library(
55
name = "byteutils",
66
srcs = [
7+
"lineindex.go",
78
"linereader.go",
89
"nullscanner.go",
910
],
@@ -14,7 +15,10 @@ go_library(
1415

1516
go_test(
1617
name = "byteutils_test",
17-
srcs = ["linereader_test.go"],
18+
srcs = [
19+
"lineindex_test.go",
20+
"linereader_test.go",
21+
],
22+
embed = [":byteutils"],
1823
tags = [TAG_PLATFORM_SOURCE],
19-
deps = [":byteutils"],
2024
)

internal/byteutils/lineindex.go

+108
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
package byteutils
2+
3+
import (
4+
"bytes"
5+
"math"
6+
"strings"
7+
)
8+
9+
// NewLineIndex creates a new LineIndex from some file content.
10+
func NewLineIndex[T ~string | ~[]byte](content T) LineIndex {
11+
if len(content) > math.MaxUint32 {
12+
panic("content too large")
13+
}
14+
15+
// PERF: count the newlines in advance to allocate the index slice exactly
16+
// Explicitly case on the type rather than casting because the generics
17+
// seem to break the optimization that allows the allocation to be elided.
18+
var newlineCount int
19+
switch v := any(content).(type) {
20+
case string:
21+
newlineCount = strings.Count(v, "\n")
22+
case []byte:
23+
newlineCount = bytes.Count(v, []byte("\n"))
24+
}
25+
26+
index := make(LineIndex, 0, newlineCount+2)
27+
index = append(index, 0)
28+
offset := 0
29+
for {
30+
var loc int
31+
switch v := any(content).(type) {
32+
case string:
33+
loc = strings.IndexByte(v[offset:], '\n')
34+
case []byte:
35+
loc = bytes.IndexByte(v[offset:], '\n')
36+
}
37+
if loc == -1 {
38+
break
39+
}
40+
index = append(index, uint32(offset+loc+1))
41+
offset += loc + 1
42+
}
43+
index = append(index, uint32(len(content)))
44+
return index
45+
}
46+
47+
// LineIndex contains the line boundaries of the indexed content.
// Its structure is:
//   - A leading 0
//   - A sorted list of every byte offset _after_ a newline byte
//   - A trailing len(content)
//
// This means:
//   - LineIndex[N] is the offset of the first byte of line N
//   - LineIndex[N+1] is the offset of the first byte after line N
//   - content[LineIndex[N]:LineIndex[N+1]] is the contents of line N
//
// A well-formed index therefore has at least two elements. The methods also
// accept a nil or empty LineIndex and treat it as an index over empty content
// rather than panicking.
type LineIndex []uint32

// LineRange returns a range that can be used to slice the indexed content to
// obtain the line for the given number. The range is guaranteed to be a valid
// slice into the content if the content is unchanged. If the line number
// refers to a line that does not exist, a zero-length range will be returned
// pointing to the beginning (for underflow) or end (for overflow) of the file.
//
// lineNumber is 0-indexed, and the returned range includes the terminating
// newline (if it exists). Equivalent to LinesRange(lineNumber, lineNumber+1).
func (l LineIndex) LineRange(lineNumber int) (int, int) {
	return l.LinesRange(lineNumber, lineNumber+1)
}

// LinesRange returns a range that can be used to slice the indexed content to
// obtain the lines for the given half-open range. The range is guaranteed to
// be a valid slice into the content if the content is unchanged. If the
// requested range of lines does not exist, it will be truncated to return the
// set of lines in that range that does exist.
//
// Line numbers are 0-indexed, and the returned range includes the terminating
// newline (if it exists). An empty index yields (0, 0).
func (l LineIndex) LinesRange(startLine, endLine int) (int, int) {
	if len(l) == 0 {
		return 0, 0
	}
	// The last element is len(content), so clamping to it produces an empty
	// range at the end of the content for out-of-bounds line numbers.
	last := len(l) - 1
	startLine = min(max(0, startLine), last)
	// Clamping against startLine keeps the range from ever being inverted.
	endLine = min(max(startLine, endLine), last)
	return int(l[startLine]), int(l[endLine])
}

// LineCount returns the number of lines in the indexed content.
//
// For the purpose of this package, a line is defined as:
//   - zero or more non-newline bytes terminated by a newline byte
//   - OR one or more non-newline bytes terminated by the end of the file.
//
// Equivalently, the regex `[^\n]*\n|[^\n]+$`
//
// Equivalently, a newline at the last byte of the file does not
// start an empty last line.
//
// Notably, this is at odds with the POSIX standard:
// https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_206
func (l LineIndex) LineCount() int {
	if len(l) < 2 {
		return 0
	}
	contentEnd := l[len(l)-1]
	lastLineStart := l[len(l)-2]
	// If the final candidate line starts at the end of the content, it is
	// empty (the content is empty or ends in a newline) and is not counted.
	if lastLineStart == contentEnd {
		return len(l) - 2
	}
	return len(l) - 1
}

// NewlineCount returns the number of newline bytes in the content: every
// element except the leading 0 and the trailing len(content) marks one.
func (l LineIndex) NewlineCount() int {
	return max(0, len(l)-2)
}

internal/byteutils/lineindex_test.go

+133
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
package byteutils
2+
3+
import (
4+
"bytes"
5+
"strings"
6+
"testing"
7+
"testing/quick"
8+
)
9+
10+
// naiveGetLines is a straightforward reference implementation used to check
// LineIndex: it splits contents after every newline, clamps the half-open
// line range [lineStart, lineEnd) to the lines that exist, and returns the
// concatenation of the selected lines.
func naiveGetLines(contents string, lineStart, lineEnd int) string {
	pieces := strings.SplitAfter(contents, "\n")
	// SplitAfter leaves a trailing empty element when contents is empty or
	// ends with a newline; that element is not a line.
	if n := len(pieces); pieces[n-1] == "" {
		pieces = pieces[:n-1]
	}

	// Clamp the start into [0, len(pieces)].
	lo := lineStart
	if lo < 0 {
		lo = 0
	}
	if lo > len(pieces) {
		lo = len(pieces)
	}

	// Clamp the end into [lo, len(pieces)].
	hi := lineEnd
	if hi < lo {
		hi = lo
	}
	if hi > len(pieces) {
		hi = len(pieces)
	}

	var out strings.Builder
	for _, piece := range pieces[lo:hi] {
		out.WriteString(piece)
	}
	return out.String()
}
19+
20+
// testCases lists content together with a half-open line range to extract.
// It is shared by the table-driven test and used as the fuzz seed corpus.
var testCases = []struct {
	contents  string
	startLine int
	endLine   int
}{
	{contents: "no trailing newline", startLine: 0, endLine: 1},
	{contents: "trailing newline\n", startLine: 0, endLine: 1},
	{contents: "trailing newline\nfollowed by no trailing newline", startLine: 0, endLine: 2},
	{contents: "", startLine: 0, endLine: 0},
	{contents: "\n", startLine: 0, endLine: 1},
	{contents: "\n\n\n", startLine: 0, endLine: 3},

	// Out of bounds
	{contents: "\n\n\n", startLine: -1, endLine: 4},
	{contents: "\n\n\n", startLine: -1, endLine: -1},
	{contents: "\n\n\n", startLine: 4, endLine: 4},
}
36+
37+
func TestNewlineIndex(t *testing.T) {
38+
lineIndexGetLines := func(contents string, startLine, endLine int) string {
39+
index := NewLineIndex(contents)
40+
start, end := index.LinesRange(startLine, endLine)
41+
return contents[start:end]
42+
}
43+
44+
t.Run("cases", func(t *testing.T) {
45+
for _, tc := range testCases {
46+
got := lineIndexGetLines(tc.contents, tc.startLine, tc.endLine)
47+
want := naiveGetLines(tc.contents, tc.startLine, tc.endLine)
48+
if want != got {
49+
t.Log(tc)
50+
t.Fatalf("got: %q, want: %q", got, want)
51+
}
52+
}
53+
})
54+
55+
t.Run("quick", func(t *testing.T) {
56+
quick.CheckEqual(lineIndexGetLines, naiveGetLines, nil)
57+
})
58+
59+
t.Run("line count", func(t *testing.T) {
60+
cases := []struct {
61+
content string
62+
lineCount int
63+
}{
64+
{"", 0},
65+
{"test", 1},
66+
{"test\n", 1},
67+
{"test\ntest", 2},
68+
{"test\ntest\n", 2},
69+
{"\n", 1},
70+
{"\n\n", 2},
71+
}
72+
73+
for _, tc := range cases {
74+
index := NewLineIndex(tc.content)
75+
if index.LineCount() != tc.lineCount {
76+
t.Fatalf("got %q, want %q", index.LineCount(), tc.lineCount)
77+
}
78+
}
79+
})
80+
81+
t.Run("string allocs", func(t *testing.T) {
82+
contents := strings.Repeat("testline\n", 1000)
83+
allocs := testing.AllocsPerRun(10, func() {
84+
_ = NewLineIndex(contents)
85+
})
86+
if allocs != 1 {
87+
t.Fatalf("expected one alloc got %f", allocs)
88+
}
89+
})
90+
91+
t.Run("byte allocs", func(t *testing.T) {
92+
contents := bytes.Repeat([]byte("testline\n"), 1000)
93+
allocs := testing.AllocsPerRun(10, func() {
94+
_ = NewLineIndex(contents)
95+
})
96+
if allocs != 1 {
97+
t.Fatalf("expected one alloc, got %f", allocs)
98+
}
99+
})
100+
}
101+
102+
func FuzzNewlineIndex(f *testing.F) {
103+
for _, tc := range testCases {
104+
f.Add(tc.contents, tc.startLine, tc.endLine)
105+
}
106+
f.Fuzz(func(t *testing.T, contents string, startLine, endLine int) {
107+
index := NewLineIndex(contents)
108+
start, end := index.LinesRange(startLine, endLine)
109+
got := contents[start:end]
110+
want := naiveGetLines(contents, startLine, endLine)
111+
if want != got {
112+
t.Fatalf("got: %q, want: %q", got, want)
113+
}
114+
})
115+
}
116+
117+
func BenchmarkLineIndex(b *testing.B) {
118+
b.Run("construct string", func(b *testing.B) {
119+
contents := strings.Repeat("testline\n", 1000)
120+
b.ResetTimer()
121+
for i := 0; i < b.N; i++ {
122+
_ = NewLineIndex(contents)
123+
}
124+
})
125+
126+
b.Run("construct bytes", func(b *testing.B) {
127+
contents := bytes.Repeat([]byte("testline\n"), 1000)
128+
b.ResetTimer()
129+
for i := 0; i < b.N; i++ {
130+
_ = NewLineIndex(contents)
131+
}
132+
})
133+
}

0 commit comments

Comments
 (0)