Skip to content
This repository was archived by the owner on Sep 30, 2024. It is now read-only.

Commit cd14dd8

Browse files
Gather additional statistics at index time (#64426)
Adds a new `GetCodyContextAlternatives` GraphQL resolver that returns a list of `ContextList`, each of which is a distinct list of results that match the user query, along with a name that describes how that result list was computed. The idea here is to be able to show alternative context fetched with different methods/rankers, which can be displayed by the client to help us iterate on context quality and ordering. The existing context fetching mechanism is preserved and given the `keyword($query)` name. We add a new experimental context list, which modifies the user query to do term expansion using a table of keywords. This keyword table is computed in the `RepositoryReindex` GraphQL endpoint and stored in Redis. We do not worry about clean up, because this feature is experimental. All of this is protected by the feature flag `enhanced-index`. Existing production behavior should not be modified. --------- Co-authored-by: Rishabh Mehrotra <[email protected]>
1 parent 9d45fc6 commit cd14dd8

File tree

12 files changed

+707
-66
lines changed

12 files changed

+707
-66
lines changed

.gitignore

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,6 @@ eb-bundle.zip
7676
.bin/
7777
.env
7878

79-
/vendor/
80-
8179
.gtm/
8280

8381
cmd/gitserver/debug

cmd/frontend/graphqlbackend/BUILD.bazel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,10 +246,12 @@ go_library(
246246
"//cmd/frontend/internal/backend",
247247
"//cmd/frontend/internal/cloneurls",
248248
"//cmd/frontend/internal/cody",
249+
"//cmd/frontend/internal/codycontext",
249250
"//cmd/frontend/internal/conf/validation",
250251
"//cmd/frontend/internal/highlight",
251252
"//cmd/frontend/internal/inventory",
252253
"//cmd/frontend/internal/processrestart",
254+
"//cmd/frontend/internal/search/idf",
253255
"//cmd/frontend/internal/search/logs",
254256
"//cmd/frontend/internal/ssc",
255257
"//cmd/frontend/internal/suspiciousnames",
@@ -374,6 +376,7 @@ go_library(
374376
"@com_github_masterminds_semver//:semver",
375377
"@com_github_prometheus_client_golang//prometheus",
376378
"@com_github_prometheus_client_golang//prometheus/promauto",
379+
"@com_github_sourcegraph_conc//iter",
377380
"@com_github_sourcegraph_conc//pool",
378381
"@com_github_sourcegraph_go_diff//diff",
379382
"@com_github_sourcegraph_go_langserver//pkg/lsp",

cmd/frontend/graphqlbackend/cody_context.go

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,17 @@
11
package graphqlbackend
22

33
import (
4+
"bytes"
45
"context"
56

67
"github.com/graph-gophers/graphql-go"
8+
"github.com/sourcegraph/conc/iter"
9+
10+
"github.com/sourcegraph/sourcegraph/cmd/frontend/internal/codycontext"
11+
"github.com/sourcegraph/sourcegraph/internal/database"
12+
"github.com/sourcegraph/sourcegraph/internal/gitserver"
13+
"github.com/sourcegraph/sourcegraph/internal/trace"
14+
"github.com/sourcegraph/sourcegraph/lib/pointers"
715
)
816

917
type CodyContextResolver interface {
@@ -14,6 +22,7 @@ type CodyContextResolver interface {
1422
UrlMentionContext(ctx context.Context, args UrlMentionContextArgs) (UrlMentionContextResolver, error)
1523
// GetCodyContext is the existing Cody Enterprise context endpoint
1624
GetCodyContext(ctx context.Context, args GetContextArgs) ([]ContextResultResolver, error)
25+
GetCodyContextAlternatives(ctx context.Context, args GetContextArgs) (*ContextAlternativesResolver, error)
1726
}
1827

1928
type GetContextArgs struct {
@@ -33,6 +42,98 @@ type UrlMentionContextResolver interface {
3342
Content() string
3443
}
3544

45+
func NewContextAlternativesResolver(db database.DB, gitserverClient gitserver.Client, contextAlternatives *codycontext.GetCodyContextResult) *ContextAlternativesResolver {
46+
return &ContextAlternativesResolver{
47+
db: db,
48+
gitserverClient: gitserverClient,
49+
ContextAlternatives: contextAlternatives,
50+
}
51+
}
52+
53+
// ContextAlternativesResolver is the GraphQL resolver returned by
// GetCodyContextAlternatives. It exposes every context list held in
// ContextAlternatives through ContextLists.
type ContextAlternativesResolver struct {
	// db and gitserverClient are passed on to each ContextListResolver
	// created by ContextLists.
	db              database.DB
	gitserverClient gitserver.Client
	// ContextAlternatives holds the context lists to resolve.
	ContextAlternatives *codycontext.GetCodyContextResult
}
58+
59+
func (c *ContextAlternativesResolver) ContextLists() []*ContextListResolver {
60+
var res []*ContextListResolver
61+
for _, contextList := range c.ContextAlternatives.ContextLists {
62+
res = append(res, &ContextListResolver{ContextList: &contextList, db: c.db, gitserverClient: c.gitserverClient})
63+
}
64+
return res
65+
}
66+
67+
// ContextListResolver resolves a single named context list: its name and the
// context items built from its file chunks.
type ContextListResolver struct {
	// db and gitserverClient are used to build the repository, commit and
	// tree-entry resolvers for each file chunk.
	db              database.DB
	gitserverClient gitserver.Client
	// ContextList is the underlying list (name plus file chunks) being resolved.
	ContextList *codycontext.ContextList
}
72+
73+
func (r *ContextListResolver) ContextItems(ctx context.Context) (_ []ContextResultResolver, err error) {
74+
tr, ctx := trace.New(ctx, "resolveChunks")
75+
defer tr.EndWithErr(&err)
76+
77+
return iter.MapErr(r.ContextList.FileChunks, func(fileChunk *codycontext.FileChunkContext) (ContextResultResolver, error) {
78+
return r.fileChunkToResolver(ctx, fileChunk)
79+
})
80+
}
81+
82+
func (r *ContextListResolver) Name(ctx context.Context) (string, error) {
83+
return r.ContextList.Name, nil
84+
}
85+
86+
// chunkSizeRunes is the rough size of a file chunk in runes. The value 1024 is due to historical reasons -- Cody
// context was once based on embeddings, and we chunked files into ~1024 characters (aiming for 256 tokens, assuming
// each token takes 4 characters on average).
//
// Ideally, the caller would pass a token 'budget' and we'd use a tokenizer and attempt to exactly match this budget.
const chunkSizeRunes = 1024
92+
93+
func (r *ContextListResolver) fileChunkToResolver(ctx context.Context, chunk *codycontext.FileChunkContext) (ContextResultResolver, error) {
94+
repoResolver := NewMinimalRepositoryResolver(r.db, r.gitserverClient, chunk.RepoID, chunk.RepoName)
95+
96+
commitResolver := NewGitCommitResolver(r.db, r.gitserverClient, repoResolver, chunk.CommitID, nil)
97+
stat, err := r.gitserverClient.Stat(ctx, chunk.RepoName, chunk.CommitID, chunk.Path)
98+
if err != nil {
99+
return nil, err
100+
}
101+
102+
gitTreeEntryResolver := NewGitTreeEntryResolver(r.db, r.gitserverClient, GitTreeEntryResolverOpts{
103+
Commit: commitResolver,
104+
Stat: stat,
105+
})
106+
107+
// Populate content ahead of time so we can do it concurrently
108+
content, err := gitTreeEntryResolver.Content(ctx, &GitTreeContentPageArgs{
109+
StartLine: pointers.Ptr(int32(chunk.StartLine)),
110+
})
111+
if err != nil {
112+
return nil, err
113+
}
114+
115+
numLines := countLines(content, chunkSizeRunes)
116+
endLine := chunk.StartLine + numLines - 1 // subtract 1 because endLine is inclusive
117+
return NewFileChunkContextResolver(gitTreeEntryResolver, chunk.StartLine, endLine), nil
118+
}
119+
120+
// countLines finds the number of lines corresponding to the number of runes. We 'round down'
121+
// to ensure that we don't return more characters than our budget.
122+
func countLines(content string, numRunes int) int {
123+
if len(content) == 0 {
124+
return 0
125+
}
126+
127+
if content[len(content)-1] != '\n' {
128+
content += "\n"
129+
}
130+
131+
runes := []rune(content)
132+
truncated := runes[:min(len(runes), numRunes)]
133+
in := []byte(string(truncated))
134+
return bytes.Count(in, []byte("\n"))
135+
}
136+
36137
type ContextResultResolver interface {
37138
ToFileChunkContext() (*FileChunkContextResolver, bool)
38139
}

cmd/frontend/graphqlbackend/cody_context.graphql

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
extend type Query {
22
"""
3-
Get pieces of context related to the query from a set of repositories.
3+
Get a list of context items related to the query from a set of repositories. This is the first list from getCodyContextAlternatives.
44
"""
55
getCodyContext(
66
"""
@@ -27,6 +27,34 @@ extend type Query {
2727
textResultsCount: Int!
2828
): [CodyContextResult!]!
2929

30+
"""
31+
Get lists of context related to the query from a set of repositories.
32+
"""
33+
getCodyContextAlternatives(
34+
"""
35+
The repositories to search.
36+
"""
37+
repos: [ID!]!
38+
"""
39+
An optional list of file patterns used to filter the results. The
40+
patterns are regex strings. For a file chunk to be returned as a context
41+
result, the path must match at least one of these patterns.
42+
"""
43+
filePatterns: [String!]
44+
"""
45+
A natural language query string.
46+
"""
47+
query: String!
48+
"""
49+
The number of code results to return.
50+
"""
51+
codeResultsCount: Int!
52+
"""
53+
The number of text results to return. Text results contain Markdown files and similar file types primarily used for writing documentation.
54+
"""
55+
textResultsCount: Int!
56+
): CodyContextAlternativesResult!
57+
3058
"""
3159
EXPERIMENTAL: Detect intent for a given Cody chat query.
3260
"""
@@ -132,6 +160,33 @@ or wiki page).
132160
"""
133161
union CodyContextResult = FileChunkContext
134162

163+
"""
164+
EXPERIMENTAL: The result from the Cody context alternatives API, which returns a list
165+
of context lists, each of which represents a distinct context list that could have been
166+
used. The first list is the one that should actually be used.
167+
"""
168+
type CodyContextAlternativesResult {
169+
"""
170+
List of context lists
171+
"""
172+
contextLists: [CodyContextList!]!
173+
}
174+
175+
"""
176+
EXPERIMENTAL: A named list of context items.
177+
"""
178+
type CodyContextList {
179+
"""
180+
Name of the list
181+
"""
182+
Name: String!
183+
184+
"""
185+
List of context items
186+
"""
187+
contextItems: [CodyContextResult!]!
188+
}
189+
135190
"""
136191
A piece of context that represents a chunk of a file in a git repository
137192
"""

cmd/frontend/graphqlbackend/repository_reindex.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55

66
"github.com/graph-gophers/graphql-go"
77

8+
"github.com/sourcegraph/sourcegraph/cmd/frontend/internal/search/idf"
89
"github.com/sourcegraph/sourcegraph/internal/auth"
910
"github.com/sourcegraph/sourcegraph/internal/search/zoekt"
1011
)
@@ -23,6 +24,10 @@ func (r *schemaResolver) ReindexRepository(ctx context.Context, args *struct {
2324
return nil, err
2425
}
2526

27+
if err := idf.Update(ctx, r.logger, repo.RepoName()); err != nil {
28+
return nil, err
29+
}
30+
2631
err = zoekt.Reindex(ctx, repo.RepoName(), repo.IDInt32())
2732
if err != nil {
2833
return nil, err

cmd/frontend/internal/codycontext/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ go_library(
1313
visibility = ["//cmd/frontend:__subpackages__"],
1414
deps = [
1515
"//cmd/frontend/internal/cody",
16+
"//cmd/frontend/internal/search/idf",
1617
"//internal/api",
1718
"//internal/cast",
1819
"//internal/conf",

0 commit comments

Comments
 (0)