-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextractor_test.go
More file actions
124 lines (105 loc) · 3.41 KB
/
extractor_test.go
File metadata and controls
124 lines (105 loc) · 3.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
package defuddle
import (
"context"
"testing"
"github.com/dotcommander/defuddle/extractors"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestExtractors drives the site-specific extractors end to end through
// NewDefuddle and Parse, using small inline HTML fixtures.
//
// Subtests:
//   - GitHub: requires that the "github" extractor is reported and that the
//     issue body text appears in the extracted content.
//   - YouTube: a watch URL with an empty "v" parameter must not produce an
//     embed iframe whose src has no video ID.
//   - Twitter: a minimal tweet document must parse without error.
//
// The YouTube and Twitter content assertions only run when
// result.ExtractorType reports the matching extractor; otherwise those
// subtests only verify that construction and parsing return no error.
func TestExtractors(t *testing.T) {
	// Initialize extractors once, before t.Parallel(): code ahead of
	// t.Parallel() runs in the serial phase, so this call completes before
	// any parallel test or subtest resumes. Doing it inside a parallel
	// subtest would let it run concurrently with sibling subtests that are
	// already parsing, making them order-dependent and potentially racy.
	extractors.InitializeBuiltins()

	t.Parallel()

	t.Run("GitHub extractor registration and detection", func(t *testing.T) {
		t.Parallel()

		// Test GitHub URL detection
		githubHTML := `<html>
<head>
<meta name="expected-hostname" content="github.com">
<meta name="github-keyboard-shortcuts" content="">
<title>Test Issue · kepano/defuddle</title>
</head>
<body>
<div data-testid="issue-metadata-sticky">Issue metadata</div>
<div data-testid="issue-title">Test Issue</div>
<div data-testid="issue-viewer-issue-container">
<div data-testid="issue-body-viewer">
<div class="markdown-body">
<p>This is a test issue body.</p>
</div>
</div>
</div>
</body>
</html>`

		defuddleInstance, err := NewDefuddle(githubHTML, &Options{
			URL: "https://github.com/kepano/defuddle/issues/123",
		})
		require.NoError(t, err)

		result, err := defuddleInstance.Parse(context.Background())
		require.NoError(t, err)

		t.Logf("GitHub extraction result: %+v", result)

		// Check if GitHub extractor was used. require (not assert) guards the
		// pointer dereference on the next line.
		require.NotNil(t, result.ExtractorType)
		assert.Equal(t, "github", *result.ExtractorType)

		// Check content extraction
		assert.Contains(t, result.Content, "This is a test issue body")
	})

	t.Run("YouTube extractor with empty videoId", func(t *testing.T) {
		t.Parallel()

		// Test YouTube URL that might have empty videoId
		youtubeHTML := `<html>
<head>
<title>YouTube</title>
<script type="application/ld+json">
{
"@type": "VideoObject",
"name": "Test Video",
"description": "Test video description",
"author": "Test Author",
"uploadDate": "2024-01-01T00:00:00Z"
}
</script>
</head>
<body>
<h1>Test Video</h1>
<p>Test video description</p>
</body>
</html>`

		defuddleInstance, err := NewDefuddle(youtubeHTML, &Options{
			URL: "https://youtube.com/watch?v=", // Empty video ID
		})
		require.NoError(t, err)

		result, err := defuddleInstance.Parse(context.Background())
		require.NoError(t, err)

		t.Logf("YouTube extraction result: %+v", result)

		// Should handle empty videoId gracefully
		if result.ExtractorType != nil && *result.ExtractorType == "youtube" {
			// If YouTube extractor was used, check content doesn't have empty iframe
			assert.NotContains(t, result.Content, `src="https://www.youtube.com/embed/"`, "Found empty iframe src, should be handled gracefully")
		}
	})

	t.Run("Twitter extractor safety", func(t *testing.T) {
		t.Parallel()

		twitterHTML := `<html>
<head><title>Twitter</title></head>
<body>
<article data-testid="tweet">
<div data-testid="tweetText">
<span>This is a test tweet</span>
</div>
</article>
</body>
</html>`

		defuddleInstance, err := NewDefuddle(twitterHTML, &Options{
			URL: "https://twitter.com/user/status/123",
		})
		require.NoError(t, err)

		result, err := defuddleInstance.Parse(context.Background())
		require.NoError(t, err)

		t.Logf("Twitter extraction result: %+v", result)

		// Should not crash with document undefined issues
		if result.ExtractorType != nil && *result.ExtractorType == "twitter" {
			assert.Contains(t, result.Content, "test tweet", "Expected tweet content to be extracted")
		}
	})
}