
Commit 8839f3b

feat(indexing): enhance search, chunking and file watching
Major improvements to indexing and search functionality:
- Add scoring explanations and custom weights
- Improve document chunking with better overlap handling
- Enhance file watching reliability
- Add debug features and logging
- Improve test coverage and error handling

Co-authored-by: Bob <[email protected]>
1 parent 5868e61 commit 8839f3b

14 files changed: +738 -266 lines changed

.gitignore (+1)

@@ -44,3 +44,4 @@ benchmark_index/
 
 # Test coverage
 .coverage
+benchmark_data

Makefile (+3)

@@ -4,3 +4,6 @@ test:
 # run linting, typechecking, and tests
 check:
 	pre-commit run --all-files
+
+typecheck:
+	pre-commit run mypy --all-files

examples/basic/search.py (+1 -1)

@@ -26,7 +26,7 @@ def main():
     indexer.index_directory(docs_dir, glob_pattern="**/*.md")
 
     # Search
-    documents, distances = indexer.search(query, n_results=3)
+    documents, distances, _ = indexer.search(query, n_results=3)
 
     # Display results
     console.print(f"\nResults for: [cyan]{query}[/cyan]\n")
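The same one-line change appears in all three examples: Indexer.search now returns a third value alongside the documents and distances. A minimal sketch of the updated call pattern, reusing the indexer and query from the example above and assuming the third element carries scoring explanations (populated only when explain=True, as the cli.py changes below suggest) and is otherwise ignored:

# Sketch only: assumes search() now returns (documents, distances, explanations).
documents, distances, _ = indexer.search(query, n_results=3)

# When explanations are wanted (mirrors the cli.py usage further below):
documents, distances, explanations = indexer.search(query, n_results=3, explain=True)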

examples/code-search/search_docs.py (+1 -1)

@@ -80,7 +80,7 @@ def main():
             continue
 
         # Search with chunk grouping
-        documents, distances = indexer.search(
+        documents, distances, _ = indexer.search(
             query,
             n_results=5,
             group_chunks=True,

examples/knowledge-base/search_kb.py (+1 -1)

@@ -84,7 +84,7 @@ def main(query: str | None, index_dir: Path, interactive: bool, show_content: bo
     def do_search(search_query: str):
         """Perform search and display results."""
         # Search with chunk grouping
-        documents, distances = indexer.search(
+        documents, distances, _ = indexer.search(
             search_query,
             n_results=5,
             group_chunks=True,

gptme_rag/benchmark.py (+1 -1)

@@ -169,7 +169,7 @@ def search_operation():
         )
         total_results = 0
         for query in queries:
-            results, _ = indexer.search(query, n_results=n_results)
+            results, _, _ = indexer.search(query, n_results=n_results)
            total_results += len(results)
         return {
             "items_processed": len(queries),

gptme_rag/cli.py (+93 -21)

@@ -35,9 +35,7 @@ def cli(verbose: bool):
 
 
 @cli.command()
-@click.argument(
-    "directory", type=click.Path(exists=True, file_okay=False, path_type=Path)
-)
+@click.argument("paths", nargs=-1, type=click.Path(exists=True, path_type=Path))
 @click.option(
     "--pattern", "-p", default="**/*.*", help="Glob pattern for files to index"
 )
@@ -47,16 +45,29 @@ def cli(verbose: bool):
     default=default_persist_dir,
     help="Directory to persist the index",
 )
-def index(directory: Path, pattern: str, persist_dir: Path):
-    """Index documents in a directory."""
+def index(paths: list[Path], pattern: str, persist_dir: Path):
+    """Index documents in one or more directories."""
+    if not paths:
+        console.print("❌ No paths provided", style="red")
+        return
+
     try:
         indexer = Indexer(persist_directory=persist_dir, enable_persist=True)
-        console.print(f"Indexing files in {directory} with pattern {pattern}")
-
-        # Index the files
-        n_indexed = indexer.index_directory(directory, pattern)
-
-        console.print(f"✅ Successfully indexed {n_indexed} files", style="green")
+        total_indexed = 0
+
+        for path in paths:
+            if path.is_file():
+                console.print(f"Indexing file: {path}")
+                n_indexed = indexer.index_file(path)
+                if n_indexed is not None:
+                    total_indexed += n_indexed
+            else:
+                console.print(f"Indexing files in {path} with pattern {pattern}")
+                n_indexed = indexer.index_directory(path, pattern)
+                if n_indexed is not None:
+                    total_indexed += n_indexed
+
+        console.print(f"✅ Successfully indexed {total_indexed} files", style="green")
     except Exception as e:
         console.print(f"❌ Error indexing directory: {e}", style="red")
 
@@ -74,6 +85,12 @@ def index(directory: Path, pattern: str, persist_dir: Path):
 @click.option("--max-tokens", default=4000, help="Maximum tokens in context window")
 @click.option("--show-context", is_flag=True, help="Show the full context content")
 @click.option("--raw", is_flag=True, help="Skip syntax highlighting")
+@click.option("--explain", is_flag=True, help="Show scoring explanations")
+@click.option(
+    "--weights",
+    type=click.STRING,
+    help="Custom scoring weights as JSON string, e.g. '{\"recency_boost\": 0.3}'",
+)
 def search(
     query: str,
     paths: list[Path],
@@ -82,21 +99,46 @@
     max_tokens: int,
     show_context: bool,
     raw: bool,
+    explain: bool,
+    weights: str | None,
 ):
     """Search the index and assemble context."""
     paths = [path.resolve() for path in paths]
 
     # Hide ChromaDB output during initialization and search
     with console.status("Initializing..."):
+        # Parse custom weights if provided
+        scoring_weights = None
+        if weights:
+            try:
+                import json
+
+                scoring_weights = json.loads(weights)
+            except json.JSONDecodeError as e:
+                console.print(f"❌ Invalid weights JSON: {e}", style="red")
+                return
+            except Exception as e:
+                console.print(f"❌ Error parsing weights: {e}", style="red")
+                return
+
        # Temporarily redirect stdout to suppress ChromaDB output
         stdout = sys.stdout
         sys.stdout = open(os.devnull, "w")
         try:
-            indexer = Indexer(persist_directory=persist_dir, enable_persist=True)
-            assembler = ContextAssembler(max_tokens=max_tokens)
-            documents, distances = indexer.search(
-                query, n_results=n_results, paths=paths
+            indexer = Indexer(
+                persist_directory=persist_dir,
+                enable_persist=True,
+                scoring_weights=scoring_weights,
             )
+            assembler = ContextAssembler(max_tokens=max_tokens)
+            if explain:
+                documents, distances, explanations = indexer.search(
+                    query, n_results=n_results, paths=paths, explain=True
+                )
+            else:
+                documents, distances, _ = indexer.search(
+                    query, n_results=n_results, paths=paths
+                )
         finally:
             sys.stdout.close()
             sys.stdout = stdout
@@ -128,20 +170,50 @@ def search(
     for i, doc in enumerate(documents):
         source = doc.metadata.get("source", "unknown")
         distance = distances[i]
-        relevance = 1 - distance  # Convert distance to similarity score
 
-        # Show document header with relevance score
-        console.print(
-            f"\n[cyan]{i+1}. {source}[/cyan] [yellow](relevance: {relevance:.2f})[/yellow]"
-        )
+        # Show document header
+        console.print(f"\n[cyan]{i+1}. {source}[/cyan]")
+
+        # Show scoring explanation if requested
+        if explain and explanations:  # Make sure explanations is not None
+            explanation = explanations[i]
+            console.print("\n[bold]Scoring Breakdown:[/bold]")
+
+            # Show individual score components
+            scores = explanation.get("scores", {})
+            for factor, score in scores.items():
+                # Color code the scores
+                if score > 0:
+                    score_color = "green"
+                    sign = "+"
+                elif score < 0:
+                    score_color = "red"
+                    sign = ""
+                else:
+                    score_color = "yellow"
+                    sign = " "
+
+                # Print score and explanation
+                console.print(
+                    f" {factor:15} [{score_color}]{sign}{score:>6.3f}[/{score_color}] | {explanation['explanations'][factor]}"
+                )
+
+            # Show total score
+            total = explanation["total_score"]
+            console.print(f"\n {'Total':15} [bold blue]{total:>7.3f}[/bold blue]")
+        else:
+            # Just show the base relevance score
+            relevance = 1 - distance
+            console.print(f"[yellow](relevance: {relevance:.2f})[/yellow]")
 
         # Use file extension as lexer (strip the dot)
         lexer = doc.metadata.get("extension", "").lstrip(".") or "text"
 
         # Extract preview content (first ~200 chars)
         preview = doc.content[:200] + ("..." if len(doc.content) > 200 else "")
 
-        # Display with syntax highlighting
+        # Display preview with syntax highlighting
+        console.print("\n[bold]Preview:[/bold]")
         syntax = Syntax(
             preview,
             lexer,
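Taken together, the cli.py changes wire the new --weights JSON string and --explain flag through to the Indexer. A rough sketch of the equivalent programmatic usage follows; the import path and persist directory are assumptions (not shown in this commit), and the explanation keys ("scores", "explanations", "total_score") are inferred from how cli.py reads them above:

import json
from pathlib import Path

from gptme_rag.indexing.indexer import Indexer  # assumed module path

weights = '{"recency_boost": 0.3}'  # same JSON format the --weights option accepts
indexer = Indexer(
    persist_directory=Path("index"),  # hypothetical location
    enable_persist=True,
    scoring_weights=json.loads(weights),
)

documents, distances, explanations = indexer.search("file watching", n_results=3, explain=True)
for doc, explanation in zip(documents, explanations):
    # Explanation keys inferred from the CLI code above.
    print(doc.metadata.get("source", "unknown"), explanation["total_score"])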

gptme_rag/indexing/document_processor.py (+21 -18)

@@ -72,42 +72,45 @@ def process_text(
                 }
                 return
 
-            # Process in chunks
+            # Process text in chunks based on tokens
             chunk_start = 0
             chunk_count = 0
 
             while chunk_start < len(tokens):
                 # Calculate chunk end
                 chunk_end = min(chunk_start + self.chunk_size, len(tokens))
 
-                # Decode chunk
+                # Get chunk tokens and decode
                 chunk_tokens = tokens[chunk_start:chunk_end]
                 chunk_text = self.encoding.decode(chunk_tokens)
 
                 # Create chunk metadata
-                chunk_metadata = {
-                    **(metadata or {}),
-                    "chunk_index": chunk_count,
-                    "token_count": len(chunk_tokens),
-                    "total_chunks": total_chunks,
-                    "chunk_start": chunk_start,
-                    "chunk_end": chunk_end,
-                }
-
                 yield {
                     "text": chunk_text,
-                    "metadata": chunk_metadata,
+                    "metadata": {
+                        **(metadata or {}),
+                        "chunk_index": chunk_count,
+                        "token_count": len(chunk_tokens),
+                        "total_chunks": total_chunks,
+                        "chunk_start": chunk_start,
+                        "chunk_end": chunk_end,
+                        "is_chunk": True,
+                    },
                 }
 
-                # Move to next chunk
-                chunk_start = chunk_end - self.chunk_overlap
+                # Calculate next chunk start
+                if chunk_end == len(tokens):
+                    # If we've reached the end, we're done
+                    break
+
+                # Move forward by at least one token, considering overlap
+                next_start = chunk_start + max(1, self.chunk_size - self.chunk_overlap)
+                chunk_start = min(next_start, len(tokens) - 1)
                 chunk_count += 1
 
-                # Check stopping conditions
+                # Check max chunks limit
                 if self.max_chunks and chunk_count >= self.max_chunks:
-                    return
-                if len(tokens) - chunk_start <= self.chunk_overlap:
-                    return
+                    break
 
         except Exception as e:
             logger.error(f"Error processing text: {e}")
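The reworked loop replaces the old chunk_start = chunk_end - self.chunk_overlap step (which could stall or move backwards when the overlap was large relative to the chunk size) with a step that always advances by at least one token. A standalone sketch of just the boundary arithmetic, as a hypothetical helper that is not part of the commit:

def chunk_bounds(n_tokens: int, chunk_size: int, chunk_overlap: int) -> list[tuple[int, int]]:
    """Return (start, end) token offsets using the stepping rule from the diff above."""
    bounds = []
    chunk_start = 0
    while chunk_start < n_tokens:
        chunk_end = min(chunk_start + chunk_size, n_tokens)
        bounds.append((chunk_start, chunk_end))
        if chunk_end == n_tokens:
            break
        # Advance by at least one token while keeping chunk_overlap tokens of overlap.
        next_start = chunk_start + max(1, chunk_size - chunk_overlap)
        chunk_start = min(next_start, n_tokens - 1)
    return bounds

# For example, chunk_bounds(10, 4, 2) yields [(0, 4), (2, 6), (4, 8), (6, 10)].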
