-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathparquet_dictionary_test.go
More file actions
96 lines (83 loc) · 2.62 KB
/
parquet_dictionary_test.go
File metadata and controls
96 lines (83 loc) · 2.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
package buildkitelogs
import (
"context"
"os"
"testing"
"time"
)
// TestDictionaryEncodingAcrossBatches writes three log entries to a temporary
// Parquet file in two separate WriteBatch calls, where string values repeat
// both within and across the batches, and then reads the file back to confirm
// that every entry round-trips with its content and group intact.
//
// The test does not inspect the on-disk encoding itself; it only checks that
// the output is non-empty and readable.
func TestDictionaryEncodingAcrossBatches(t *testing.T) {
	// Create test data with repeated strings across batches
	entry1 := &LogEntry{
		Timestamp: time.Now(),
		Content:   "repeated content",
		Group:     "common group",
	}
	entry2 := &LogEntry{
		Timestamp: time.Now(),
		Content:   "different content",
		Group:     "common group", // Repeats the group of entry1 within batch 1
	}
	entry3 := &LogEntry{
		Timestamp: time.Now(),
		Content:   "repeated content", // Repeats the content of entry1 from batch 1
		Group:     "another group",
	}

	// Create temporary file
	tmpFile, err := os.CreateTemp("", "dictionary_test_*.parquet")
	if err != nil {
		t.Fatalf("Failed to create temp file: %v", err)
	}
	defer os.Remove(tmpFile.Name())

	// Create writer
	writer, err := NewParquetWriter(tmpFile)
	if err != nil {
		// The writer never took ownership of the file, so release the handle
		// here; otherwise the deferred Remove can fail on platforms that do
		// not allow deleting open files. Best-effort: the test is already
		// failing, so the Close error is intentionally ignored.
		_ = tmpFile.Close()
		t.Fatalf("Failed to create ParquetWriter: %v", err)
	}

	// Write first batch
	batch1 := []*LogEntry{entry1, entry2}
	if err := writer.WriteBatch(batch1); err != nil {
		t.Fatalf("Failed to write batch 1: %v", err)
	}

	// Write second batch with some repeated strings
	batch2 := []*LogEntry{entry3}
	if err := writer.WriteBatch(batch2); err != nil {
		t.Fatalf("Failed to write batch 2: %v", err)
	}

	// Close writer (this will also close the file)
	if err := writer.Close(); err != nil {
		t.Fatalf("Failed to close writer: %v", err)
	}

	// Sanity-check the output size. This only asserts that something was
	// written; the size is logged for manual inspection rather than compared
	// against a non-dictionary baseline.
	stat, err := os.Stat(tmpFile.Name()) //nolint:gosec // path from os.CreateTemp
	if err != nil {
		t.Fatalf("Failed to stat file: %v", err)
	}
	if stat.Size() == 0 {
		t.Error("Output file is empty")
	}
	t.Logf("Dictionary-encoded file size: %d bytes", stat.Size())

	// Test that we can read the file back
	var entries []ParquetLogEntry
	for entry, err := range ReadParquetFileIter(context.Background(), tmpFile.Name()) {
		if err != nil {
			t.Fatalf("Error reading entries: %v", err)
		}
		entries = append(entries, entry)
	}

	// Verify we got all entries back. This must be fatal: the checks below
	// index entries[0..2] and would panic on a short result.
	if len(entries) != 3 {
		t.Fatalf("Expected 3 entries, got %d", len(entries))
	}

	// Verify content was preserved
	if entries[0].Content != "repeated content" {
		t.Errorf("Entry 0 content mismatch: got %s, want %s", entries[0].Content, "repeated content")
	}
	if entries[1].Group != "common group" {
		t.Errorf("Entry 1 group mismatch: got %s, want %s", entries[1].Group, "common group")
	}
	if entries[2].Content != "repeated content" {
		t.Errorf("Entry 2 content mismatch: got %s, want %s", entries[2].Content, "repeated content")
	}
}