-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscoring_integration_test.go
More file actions
128 lines (108 loc) · 4.07 KB
/
scoring_integration_test.go
File metadata and controls
128 lines (108 loc) · 4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
package defuddle
import (
"context"
"fmt"
"strings"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// buildScoringPage returns a bare HTML document with title placed in
// <head><title> and bodyHTML placed verbatim inside <body>. The wrapper itself
// adds no article/main element; the intent is that pages built here exercise
// the scoring path in findMainContent.
func buildScoringPage(title, bodyHTML string) string {
	// Both values are inserted as-is (no escaping), so callers control the
	// exact markup that ends up in the page.
	const pageTemplate = `<html><head><title>%s</title></head><body>%s</body></html>`
	return fmt.Sprintf(pageTemplate, title, bodyHTML)
}
func TestScoringIntegration(t *testing.T) {
t.Parallel()
t.Run("deeply nested div found by scoring", func(t *testing.T) {
t.Parallel()
// Build a deeply nested div tree with real content at the leaf.
// No article/main, so scoring is the only path.
paragraphs := strings.Repeat(
"<p>The quick brown fox jumps over the lazy dog. "+
"Pack my box with five dozen liquor jugs. "+
"How vexingly quick daft zebras jump.</p>\n",
8,
)
bodyHTML := fmt.Sprintf(
`<div class="outer"><div class="middle"><div class="inner">%s</div></div></div>`,
paragraphs,
)
html := buildScoringPage("Nested Div Test", bodyHTML)
d, err := NewDefuddle(html, nil)
require.NoError(t, err)
result, err := d.Parse(context.Background())
require.NoError(t, err)
require.NotNil(t, result)
assert.Greater(t, result.WordCount, 0,
"scoring should find content inside nested divs")
assert.Contains(t, result.Content, "quick brown fox",
"extracted content should include paragraph text")
})
t.Run("div with most text wins over sparse siblings", func(t *testing.T) {
t.Parallel()
// One content-rich div alongside several low-text divs.
richContent := strings.Repeat(
"<p>Scoring algorithms evaluate text density and paragraph count "+
"to identify the most relevant content block on a page.</p>\n",
10,
)
bodyHTML := fmt.Sprintf(`
<div class="nav">Home About Contact</div>
<div class="sidebar">Tags: foo bar baz</div>
<div class="content">%s</div>
<div class="footer">Copyright 2024</div>
`, richContent)
html := buildScoringPage("Multi-Div Test", bodyHTML)
d, err := NewDefuddle(html, nil)
require.NoError(t, err)
result, err := d.Parse(context.Background())
require.NoError(t, err)
require.NotNil(t, result)
assert.Greater(t, result.WordCount, 30,
"word count should reflect the rich content div")
assert.Contains(t, result.Content, "text density",
"content from the rich div should be present")
})
t.Run("page with only nav header footer still returns result", func(t *testing.T) {
t.Parallel()
// All blocks are clutter. Parse must not return nil — it falls back
// gracefully rather than crashing or returning an error.
bodyHTML := `
<nav>Home | About | Contact | Blog | Portfolio</nav>
<header>My Website — Established 2010</header>
<footer>Privacy Policy | Terms of Service | Cookie Settings</footer>
`
html := buildScoringPage("Clutter Only", bodyHTML)
d, err := NewDefuddle(html, nil)
require.NoError(t, err)
result, err := d.Parse(context.Background())
require.NoError(t, err)
require.NotNil(t, result, "Parse must never return nil result for valid HTML")
})
t.Run("single high-scoring div selected over many low-scoring divs", func(t *testing.T) {
t.Parallel()
// Build many near-empty divs and one substantive one.
var sb strings.Builder
for i := range 15 {
fmt.Fprintf(&sb, `<div class="filler-%d">word%d</div>`, i, i)
}
winnerContent := strings.Repeat(
"<p>Content scoring selects the element whose text density, "+
"paragraph structure, and link ratio best match article content. "+
"This paragraph provides enough signal to rank above the fillers.</p>\n",
6,
)
fmt.Fprintf(&sb, `<div class="winner">%s</div>`, winnerContent)
html := buildScoringPage("Winner Div Test", sb.String())
d, err := NewDefuddle(html, nil)
require.NoError(t, err)
result, err := d.Parse(context.Background())
require.NoError(t, err)
require.NotNil(t, result)
assert.Greater(t, result.WordCount, 20,
"the high-scoring div should dominate the word count")
assert.Contains(t, result.Content, "text density",
"winner div content should be present in extraction")
})
}