From a805c1e9f0ef768b1365a4666b77cf4408818eb1 Mon Sep 17 00:00:00 2001
From: mamba
Date: Wed, 28 Jan 2026 13:40:00 +0000
Subject: [PATCH] Add incremental search index with boolean queries

---
 main.go              |  44 +++++-
 search_index.go      | 311 +++++++++++++++++++++++++++++++++++++++++++
 search_index_test.go | 159 ++++++++++++++++++++++
 search_query.go      | 187 ++++++++++++++++++++++++++
 4 files changed, 700 insertions(+), 1 deletion(-)
 create mode 100644 search_index.go
 create mode 100644 search_index_test.go
 create mode 100644 search_query.go

diff --git a/main.go b/main.go
index e73e2cb..d26da53 100644
--- a/main.go
+++ b/main.go
@@ -38,7 +38,19 @@ func main() {
 		return
 	}
 
-	fmt.Fprintln(os.Stderr, "usage: jot [init|list|patterns]")
+	if len(args) >= 1 && args[0] == "search" {
+		if len(args) < 2 {
+			fmt.Fprintln(os.Stderr, "usage: jot search <query>")
+			os.Exit(1)
+		}
+		if err := jotSearch(os.Stdout, strings.Join(args[1:], " ")); err != nil {
+			fmt.Fprintln(os.Stderr, err)
+			os.Exit(1)
+		}
+		return
+	}
+
+	fmt.Fprintln(os.Stderr, "usage: jot [init|list|patterns|search]")
 	os.Exit(1)
 }
 
@@ -138,6 +150,36 @@ func jotList(w io.Writer) error {
 	return nil
 }
 
+// jotSearch re-indexes the journal, runs the boolean query, and prints
+// each matching entry's original text to w, one per line.
+func jotSearch(w io.Writer, query string) error {
+	journalPath, err := ensureJournal()
+	if err != nil {
+		return err
+	}
+
+	indexPath, err := defaultIndexPath()
+	if err != nil {
+		return err
+	}
+
+	index, _, err := UpdateIndex(journalPath, indexPath)
+	if err != nil {
+		return err
+	}
+
+	results, err := SearchIndex(index, query)
+	if err != nil {
+		return err
+	}
+
+	for _, entry := range results {
+		if _, err := fmt.Fprintln(w, entry.Text); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
 func ensureJournal() (string, error) {
 	home, err := os.UserHomeDir()
 	if err != nil {
diff --git a/search_index.go b/search_index.go
new file mode 100644
index 0000000..b06507c
--- /dev/null
+++ b/search_index.go
@@ -0,0 +1,311 @@
+package main
+
+import (
+	"bufio"
+	"crypto/sha1"
+	"encoding/hex"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"regexp"
+	"sort"
+	"strings"
+)
+
+const indexVersion = 1
+
+var wordPattern = regexp.MustCompile(`[a-z0-9]+`)
+
+type Index struct {
+	Version     int          `json:"version"`
+	JournalPath string       `json:"journal_path"`
+	Entries     []IndexEntry `json:"entries"`
+	Terms       map[string][]int `json:"-"` // derived posting lists; rebuilt by UpdateIndex, never read from disk
+}
+
+type IndexEntry struct {
+	Line       int      `json:"line"`
+	Text       string   `json:"text"`
+	Hash       string   `json:"hash"`
+	Terms      []string `json:"terms"`
+	Normalized string   `json:"normalized"`
+}
+
+type UpdateStats struct {
+	TotalLines     int
+	ReindexedLines int
+	ReusedLines    int
+}
+
+func defaultIndexPath() (string, error) {
+	home, err := os.UserHomeDir()
+	if err != nil {
+		return "", err
+	}
+	return filepath.Join(home, ".jot", "index.json"), nil
+}
+
+// UpdateIndex re-scans the journal, reusing per-line index entries whose
+// content hash is unchanged (matched by line position), and persists the
+// refreshed index to indexPath. A missing journal yields an empty index.
+func UpdateIndex(journalPath, indexPath string) (*Index, UpdateStats, error) {
+	existing, err := loadIndex(indexPath)
+	if err != nil {
+		return nil, UpdateStats{}, err
+	}
+
+	file, err := os.Open(journalPath)
+	if err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			return &Index{Version: indexVersion, JournalPath: journalPath}, UpdateStats{}, nil
+		}
+		return nil, UpdateStats{}, err
+	}
+	defer file.Close()
+
+	scanner := bufio.NewScanner(file)
+	entries := make([]IndexEntry, 0)
+	stats := UpdateStats{}
+	lineNo := 0
+	for scanner.Scan() {
+		lineNo++
+		line := scanner.Text()
+		hash := hashLine(line)
+		stats.TotalLines++
+
+		if existing != nil && lineNo <= len(existing.Entries) {
+			prev := existing.Entries[lineNo-1]
+			if prev.Hash == hash {
+				prev.Line = lineNo
+				entries = append(entries, prev)
+				stats.ReusedLines++
+				continue
+			}
+		}
+
+		normalized := strings.ToLower(line)
+		terms := uniqueTerms(normalized)
+		entries = append(entries, IndexEntry{
+			Line:       lineNo,
+			Text:       line,
+			Hash:       hash,
+			Terms:      terms,
+			Normalized: normalized,
+		})
+		stats.ReindexedLines++
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, UpdateStats{}, err
+	}
+
+	index := &Index{
+		Version:     indexVersion,
+		JournalPath: journalPath,
+		Entries:     entries,
+	}
+	index.Terms = buildTerms(entries)
+
+	if err := writeIndex(indexPath, index); err != nil {
+		return nil, UpdateStats{}, err
+	}
+
+	return index, stats, nil
+}
+
+// SearchIndex evaluates a boolean query (AND/OR/NOT, parentheses, quoted
+// phrases, implicit AND between adjacent terms) and returns matching
+// entries in journal order.
+func SearchIndex(index *Index, query string) ([]IndexEntry, error) {
+	if strings.TrimSpace(query) == "" {
+		return nil, fmt.Errorf("empty query")
+	}
+
+	tokens, err := parseQuery(query)
+	if err != nil {
+		return nil, err
+	}
+
+	rpn, err := toRPN(tokens)
+	if err != nil {
+		return nil, err
+	}
+
+	results, err := evalRPN(rpn, index)
+	if err != nil {
+		return nil, err
+	}
+
+	entries := make([]IndexEntry, 0, len(results))
+	for _, entry := range index.Entries {
+		if _, ok := results[entry.Line-1]; ok {
+			entries = append(entries, entry)
+		}
+	}
+	return entries, nil
+}
+
+// loadIndex returns (nil, nil) when the file is absent or the version is
+// stale, forcing a full reindex.
+func loadIndex(path string) (*Index, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			return nil, nil
+		}
+		return nil, err
+	}
+
+	var idx Index
+	if err := json.Unmarshal(data, &idx); err != nil {
+		return nil, err
+	}
+	if idx.Version != indexVersion {
+		return nil, nil
+	}
+
+	return &idx, nil
+}
+
+func writeIndex(path string, index *Index) error {
+	if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil {
+		return err
+	}
+	data, err := json.Marshal(index)
+	if err != nil {
+		return err
+	}
+	return os.WriteFile(path, data, 0o600)
+}
+
+// hashLine fingerprints a journal line for change detection (non-cryptographic use).
+func hashLine(line string) string {
+	sum := sha1.Sum([]byte(line))
+	return hex.EncodeToString(sum[:])
+}
+
+// uniqueTerms extracts the sorted, de-duplicated word terms from an
+// already-lowercased line; nil when the line has no indexable words.
+func uniqueTerms(line string) []string {
+	matches := wordPattern.FindAllString(line, -1)
+	if len(matches) == 0 {
+		return nil
+	}
+	seen := make(map[string]struct{}, len(matches))
+	terms := make([]string, 0, len(matches))
+	for _, term := range matches {
+		if _, ok := seen[term]; ok {
+			continue
+		}
+		seen[term] = struct{}{}
+		terms = append(terms, term)
+	}
+	sort.Strings(terms)
+	return terms
+}
+
+// buildTerms builds term -> sorted entry-index posting lists.
+func buildTerms(entries []IndexEntry) map[string][]int {
+	terms := make(map[string][]int)
+	for i, entry := range entries {
+		for _, term := range entry.Terms {
+			terms[term] = append(terms[term], i)
+		}
+	}
+	for term, ids := range terms {
+		sort.Ints(ids)
+		terms[term] = ids
+	}
+	return terms
+}
+
+// evalRPN evaluates a postfix token stream over entry-index sets.
+func evalRPN(tokens []token, index *Index) (map[int]struct{}, error) {
+	var stack []map[int]struct{}
+	all := allEntrySet(index)
+
+	for _, tok := range tokens {
+		switch tok.kind {
+		case tokenTerm:
+			stack = append(stack, termSet(index, tok.value))
+		case tokenPhrase:
+			stack = append(stack, phraseSet(index, tok.value))
+		case tokenNot:
+			if len(stack) < 1 {
+				return nil, fmt.Errorf("invalid query")
+			}
+			operand := stack[len(stack)-1]
+			stack = stack[:len(stack)-1]
+			stack = append(stack, difference(all, operand))
+		case tokenAnd, tokenOr:
+			if len(stack) < 2 {
+				return nil, fmt.Errorf("invalid query")
+			}
+			right := stack[len(stack)-1]
+			left := stack[len(stack)-2]
+			stack = stack[:len(stack)-2]
+			if tok.kind == tokenAnd {
+				stack = append(stack, intersection(left, right))
+			} else {
+				stack = append(stack, union(left, right))
+			}
+		default:
+			return nil, fmt.Errorf("invalid query")
+		}
+	}
+
+	if len(stack) != 1 {
+		return nil, fmt.Errorf("invalid query")
+	}
+
+	return stack[0], nil
+}
+
+func allEntrySet(index *Index) map[int]struct{} {
+	all := make(map[int]struct{}, len(index.Entries))
+	for i := range index.Entries {
+		all[i] = struct{}{}
+	}
+	return all
+}
+
+func termSet(index *Index, term string) map[int]struct{} {
+	ids := index.Terms[term]
+	set := make(map[int]struct{}, len(ids))
+	for _, id := range ids {
+		set[id] = struct{}{}
+	}
+	return set
+}
+
+// phraseSet matches a lowercased phrase by linear substring scan.
+func phraseSet(index *Index, phrase string) map[int]struct{} {
+	set := make(map[int]struct{})
+	for i, entry := range index.Entries {
+		if strings.Contains(entry.Normalized, phrase) {
+			set[i] = struct{}{}
+		}
+	}
+	return set
+}
+
+func union(a, b map[int]struct{}) map[int]struct{} {
+	out := make(map[int]struct{}, len(a)+len(b))
+	for k := range a {
+		out[k] = struct{}{}
+	}
+	for k := range b {
+		out[k] = struct{}{}
+	}
+	return out
+}
+
+func intersection(a, b map[int]struct{}) map[int]struct{} {
+	if len(a) > len(b) {
+		a, b = b, a
+	}
+	out := make(map[int]struct{})
+	for k := range a {
+		if _, ok := b[k]; ok {
+			out[k] = struct{}{}
+		}
+	}
+	return out
+}
+
+func difference(all, remove map[int]struct{}) map[int]struct{} {
+	out := make(map[int]struct{}, len(all))
+	for k := range all {
+		if _, ok := remove[k]; !ok {
+			out[k] = struct{}{}
+		}
+	}
+	return out
+}
diff --git a/search_index_test.go b/search_index_test.go
new file mode 100644
index 0000000..6ca25af
--- /dev/null
+++ b/search_index_test.go
@@ -0,0 +1,159 @@
+package main
+
+import (
+	"fmt"
+	"os"
+	"strings"
+	"testing"
+)
+
+func TestUpdateIndexReusesUnchangedLines(t *testing.T) {
+	home := withTempHome(t)
+	journalDir, journalPath := journalPaths(home)
+	if err := os.MkdirAll(journalDir, 0o700); err != nil {
+		t.Fatalf("mkdir failed: %v", err)
+	}
+
+	content := "[2024-01-01 10:00] first line\n[2024-01-01 11:00] second line\n"
+	if err := os.WriteFile(journalPath, []byte(content), 0o600); err != nil {
+		t.Fatalf("write failed: %v", err)
+	}
+
+	indexPath, err := defaultIndexPath()
+	if err != nil {
+		t.Fatalf("defaultIndexPath failed: %v", err)
+	}
+
+	index, stats, err := UpdateIndex(journalPath, indexPath)
+	if err != nil {
+		t.Fatalf("UpdateIndex failed: %v", err)
+	}
+	if stats.ReindexedLines != 2 || stats.ReusedLines != 0 {
+		t.Fatalf("expected 2 reindexed lines, got %+v", stats)
+	}
+	if len(index.Entries) != 2 {
+		t.Fatalf("expected 2 entries, got %d", len(index.Entries))
+	}
+
+	updated := "[2024-01-01 10:00] first line\n[2024-01-01 11:00] changed line\n"
+	if err := os.WriteFile(journalPath, []byte(updated), 0o600); err != nil {
+		t.Fatalf("write failed: %v", err)
+	}
+
+	_, stats, err = UpdateIndex(journalPath, indexPath)
+	if err != nil {
+		t.Fatalf("UpdateIndex failed: %v", err)
+	}
+	if stats.ReusedLines != 1 || stats.ReindexedLines != 1 {
+		t.Fatalf("expected 1 reused and 1 reindexed line, got %+v", stats)
+	}
+}
+
+func TestSearchIndexSupportsBooleanAndPhrase(t *testing.T) {
+	home := withTempHome(t)
+	journalDir, journalPath := journalPaths(home)
+	if err := os.MkdirAll(journalDir, 0o700); err != nil {
+		t.Fatalf("mkdir failed: %v", err)
+	}
+
+	content := strings.Join([]string{
+		"[2024-01-01 10:00] quick brown fox",
+		"[2024-01-01 11:00] lazy dog jumps",
+		"[2024-01-01 12:00] quick blue hare",
+		"",
+	}, "\n")
+	if err := os.WriteFile(journalPath, []byte(content), 0o600); err != nil {
+		t.Fatalf("write failed: %v", err)
+	}
+
+	indexPath, err := defaultIndexPath()
+	if err != nil {
+		t.Fatalf("defaultIndexPath failed: %v", err)
+	}
+
+	index, _, err := UpdateIndex(journalPath, indexPath)
+	if err != nil {
+		t.Fatalf("UpdateIndex failed: %v", err)
+	}
+
+	results, err := SearchIndex(index, "\"quick brown\" OR hare")
+	if err != nil {
+		t.Fatalf("SearchIndex failed: %v", err)
+	}
+	if len(results) != 2 {
+		t.Fatalf("expected 2 results, got %d", len(results))
+	}
+
+	results, err = SearchIndex(index, "quick AND NOT blue")
+	if err != nil {
+		t.Fatalf("SearchIndex failed: %v", err)
+	}
+	if len(results) != 1 {
+		t.Fatalf("expected 1 result, got %d", len(results))
+	}
+}
+
+func BenchmarkIndexing1k(b *testing.B) {
+	home := b.TempDir()
+	b.Setenv("HOME", home)
+	b.Setenv("USERPROFILE", home)
+	journalDir, journalPath := journalPaths(home)
+	if err := os.MkdirAll(journalDir, 0o700); err != nil {
+		b.Fatalf("mkdir failed: %v", err)
+	}
+	lines := make([]string, 0, 1000)
+	for i := 0; i < 1000; i++ {
+		lines = append(lines, fmt.Sprintf("[2024-01-01 10:%02d] note %d", i%60, i))
+	}
+	content := strings.Join(lines, "\n") + "\n"
+	if err := os.WriteFile(journalPath, []byte(content), 0o600); err != nil {
+		b.Fatalf("write failed: %v", err)
+	}
+
+	indexPath, err := defaultIndexPath()
+	if err != nil {
+		b.Fatalf("defaultIndexPath failed: %v", err)
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		if _, _, err := UpdateIndex(journalPath, indexPath); err != nil {
+			b.Fatalf("UpdateIndex failed: %v", err)
+		}
+	}
+}
+
+func BenchmarkSearch1k(b *testing.B) {
+	home := b.TempDir()
+	b.Setenv("HOME", home)
+	b.Setenv("USERPROFILE", home)
+	journalDir, journalPath := journalPaths(home)
+	if err := os.MkdirAll(journalDir, 0o700); err != nil {
+		b.Fatalf("mkdir failed: %v", err)
+	}
+	lines := make([]string, 0, 1000)
+	for i := 0; i < 1000; i++ {
+		lines = append(lines, fmt.Sprintf("[2024-01-01 10:%02d] note %d", i%60, i))
+	}
+	content := strings.Join(lines, "\n") + "\n"
+	if err := os.WriteFile(journalPath, []byte(content), 0o600); err != nil {
+		b.Fatalf("write failed: %v", err)
+	}
+
+	indexPath, err := defaultIndexPath()
+	if err != nil {
+		b.Fatalf("defaultIndexPath failed: %v", err)
+	}
+
+	index, _, err := UpdateIndex(journalPath, indexPath)
+	if err != nil {
+		b.Fatalf("UpdateIndex failed: %v", err)
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		if _, err := SearchIndex(index, "note AND 9"); err != nil {
+			b.Fatalf("SearchIndex failed: %v", err)
+		}
+	}
+}
diff --git a/search_query.go b/search_query.go
new file mode 100644
index 0000000..3fd4057
--- /dev/null
+++ b/search_query.go
@@ -0,0 +1,187 @@
+package main
+
+import (
+	"fmt"
+	"strings"
+	"unicode"
+)
+
+type tokenKind int
+
+const (
+	tokenTerm tokenKind = iota
+	tokenPhrase
+	tokenAnd
+	tokenOr
+	tokenNot
+	tokenLParen
+	tokenRParen
+)
+
+type token struct {
+	kind  tokenKind
+	value string
+}
+
+// parseQuery lexes the query and inserts implicit AND between adjacent
+// operand-like tokens ("foo bar" == "foo AND bar").
+func parseQuery(query string) ([]token, error) {
+	lexerTokens, err := lexQuery(query)
+	if err != nil {
+		return nil, err
+	}
+	if len(lexerTokens) == 0 {
+		return nil, fmt.Errorf("empty query")
+	}
+
+	var tokens []token
+	var prev token // previous token held BY VALUE: &loop-var would alias the (pre-Go 1.22 shared) range variable
+	for i, tok := range lexerTokens {
+		if i > 0 && needsImplicitAnd(prev, tok) {
+			tokens = append(tokens, token{kind: tokenAnd})
+		}
+		tokens = append(tokens, tok)
+		prev = tok
+	}
+
+	return tokens, nil
+}
+
+// needsImplicitAnd reports whether an AND must be inserted between prev and next.
+func needsImplicitAnd(prev, next token) bool {
+	prevIsTerm := prev.kind == tokenTerm || prev.kind == tokenPhrase || prev.kind == tokenRParen
+	nextIsTerm := next.kind == tokenTerm || next.kind == tokenPhrase || next.kind == tokenLParen || next.kind == tokenNot
+	return prevIsTerm && nextIsTerm
+}
+
+// lexQuery tokenizes the raw query: parentheses, quoted phrases, and
+// whitespace-separated words; AND/OR/NOT (any case) become operators.
+func lexQuery(input string) ([]token, error) {
+	var tokens []token
+	runes := []rune(strings.TrimSpace(input))
+	for i := 0; i < len(runes); {
+		ch := runes[i]
+		switch {
+		case unicode.IsSpace(ch):
+			i++
+		case ch == '(':
+			tokens = append(tokens, token{kind: tokenLParen})
+			i++
+		case ch == ')':
+			tokens = append(tokens, token{kind: tokenRParen})
+			i++
+		case ch == '"':
+			phrase, next, err := readPhrase(runes, i+1)
+			if err != nil {
+				return nil, err
+			}
+			if strings.TrimSpace(phrase) == "" {
+				return nil, fmt.Errorf("empty phrase")
+			}
+			tokens = append(tokens, token{kind: tokenPhrase, value: strings.ToLower(phrase)})
+			i = next
+		default:
+			word, next := readWord(runes, i)
+			lower := strings.ToLower(word)
+			switch lower {
+			case "and":
+				tokens = append(tokens, token{kind: tokenAnd})
+			case "or":
+				tokens = append(tokens, token{kind: tokenOr})
+			case "not":
+				tokens = append(tokens, token{kind: tokenNot})
+			default:
+				tokens = append(tokens, token{kind: tokenTerm, value: lower})
+			}
+			i = next
+		}
+	}
+
+	return tokens, nil
+}
+
+func readPhrase(runes []rune, start int) (string, int, error) {
+	var builder strings.Builder
+	for i := start; i < len(runes); i++ {
+		if runes[i] == '"' {
+			return builder.String(), i + 1, nil
+		}
+		builder.WriteRune(runes[i])
+	}
+	return "", len(runes), fmt.Errorf("unterminated phrase")
+}
+
+func readWord(runes []rune, start int) (string, int) {
+	var builder strings.Builder
+	for i := start; i < len(runes); i++ {
+		ch := runes[i]
+		if unicode.IsSpace(ch) || ch == '(' || ch == ')' {
+			return builder.String(), i
+		}
+		builder.WriteRune(ch)
+	}
+	return builder.String(), len(runes)
+}
+
+// toRPN converts infix tokens to postfix via shunting-yard.
+// NOT > AND > OR; AND/OR are left-associative, NOT is right-associative.
+func toRPN(tokens []token) ([]token, error) {
+	var output []token
+	var stack []token
+
+	for _, tok := range tokens {
+		switch tok.kind {
+		case tokenTerm, tokenPhrase:
+			output = append(output, tok)
+		case tokenNot, tokenAnd, tokenOr:
+			for len(stack) > 0 {
+				top := stack[len(stack)-1]
+				if top.kind == tokenLParen {
+					break
+				}
+				if p := precedence(tok.kind); precedence(top.kind) > p || (precedence(top.kind) == p && tok.kind != tokenNot) { // equal precedence pops only for left-assoc AND/OR, so NOT NOT x parses
+					output = append(output, top)
+					stack = stack[:len(stack)-1]
+				} else {
+					break
+				}
+			}
+			stack = append(stack, tok)
+		case tokenLParen:
+			stack = append(stack, tok)
+		case tokenRParen:
+			found := false
+			for len(stack) > 0 {
+				top := stack[len(stack)-1]
+				stack = stack[:len(stack)-1]
+				if top.kind == tokenLParen {
+					found = true
+					break
+				}
+				output = append(output, top)
+			}
+			if !found {
+				return nil, fmt.Errorf("unbalanced parentheses")
+			}
+		default:
+			return nil, fmt.Errorf("unsupported token")
+		}
+	}
+
+	for len(stack) > 0 {
+		top := stack[len(stack)-1]
+		stack = stack[:len(stack)-1]
+		if top.kind == tokenLParen || top.kind == tokenRParen {
+			return nil, fmt.Errorf("unbalanced parentheses")
+		}
+		output = append(output, top)
+	}
+
+	return output, nil
+}
+
+func precedence(kind tokenKind) int {
+	switch kind {
+	case tokenNot:
+		return 3
+	case tokenAnd:
+		return 2
+	case tokenOr:
+		return 1
+	default:
+		return 0
+	}
+}