// Source: defuddle/extractor_test.go (github.com/dotcommander/defuddle, branch main).

package defuddle

import (
	"context"
	"testing"

	"github.com/dotcommander/defuddle/extractors"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// TestExtractors exercises the site-specific extractors end to end. Each
// subtest builds a small HTML fixture, hands it to NewDefuddle together with
// a URL for the site under test, runs Parse, and then inspects the reported
// extractor type and the extracted content.
//
// The subtests run in parallel with each other, so any shared setup has to be
// finished in this function body before the first t.Run call.
func TestExtractors(t *testing.T) {
	t.Parallel()

	// Register the built-in extractors once, up front. A subtest that calls
	// t.Parallel is paused until this function returns, so doing the
	// registration here guarantees it has completed before any subtest parses
	// a document. Previously the call lived inside the first subtest only,
	// which meant it ran concurrently with the other subtests.
	// NOTE(review): whether InitializeBuiltins is itself safe to call
	// concurrently is not visible from this file; hoisting it avoids relying
	// on that.
	extractors.InitializeBuiltins()

	t.Run("GitHub extractor registration and detection", func(t *testing.T) {
		t.Parallel()

		// Fixture shaped like a GitHub issue page: GitHub-specific meta tags in
		// the head and data-testid markers around the issue body.
		githubHTML := `<html>
			<head>
				<meta name="expected-hostname" content="github.com">
				<meta name="github-keyboard-shortcuts" content="">
				<title>Test Issue · kepano/defuddle</title>
			</head>
			<body>
				<div data-testid="issue-metadata-sticky">Issue metadata</div>
				<div data-testid="issue-title">Test Issue</div>
				<div data-testid="issue-viewer-issue-container">
					<div data-testid="issue-body-viewer">
						<div class="markdown-body">
							<p>This is a test issue body.</p>
						</div>
					</div>
				</div>
			</body>
		</html>`

		defuddleInstance, err := NewDefuddle(githubHTML, &Options{
			URL: "https://github.com/kepano/defuddle/issues/123",
		})
		require.NoError(t, err)

		result, err := defuddleInstance.Parse(context.Background())
		require.NoError(t, err)

		t.Logf("GitHub extraction result: %+v", result)

		// The GitHub extractor must have been selected. require (not assert)
		// guards the pointer dereference on the next line.
		require.NotNil(t, result.ExtractorType)
		assert.Equal(t, "github", *result.ExtractorType)

		// The issue body text must survive extraction.
		assert.Contains(t, result.Content, "This is a test issue body")
	})

	t.Run("YouTube extractor with empty videoId", func(t *testing.T) {
		t.Parallel()

		// Fixture with VideoObject JSON-LD metadata; the URL below carries a
		// "v" query parameter with no value.
		youtubeHTML := `<html>
			<head>
				<title>YouTube</title>
				<script type="application/ld+json">
				{
					"@type": "VideoObject",
					"name": "Test Video",
					"description": "Test video description",
					"author": "Test Author",
					"uploadDate": "2024-01-01T00:00:00Z"
				}
				</script>
			</head>
			<body>
				<h1>Test Video</h1>
				<p>Test video description</p>
			</body>
		</html>`

		defuddleInstance, err := NewDefuddle(youtubeHTML, &Options{
			URL: "https://youtube.com/watch?v=", // Empty video ID
		})
		require.NoError(t, err)

		result, err := defuddleInstance.Parse(context.Background())
		require.NoError(t, err)

		t.Logf("YouTube extraction result: %+v", result)

		// Only checked when the YouTube extractor was actually chosen; if a
		// different extractor (or none) handled the page, this subtest passes
		// without asserting anything about the content.
		if result.ExtractorType != nil && *result.ExtractorType == "youtube" {
			// An embed URL with nothing after "/embed/" would mean the empty
			// video ID leaked into the output.
			assert.NotContains(t, result.Content, `src="https://www.youtube.com/embed/"`, "Found empty iframe src, should be handled gracefully")
		}
	})

	t.Run("Twitter extractor safety", func(t *testing.T) {
		t.Parallel()

		// Minimal tweet markup: a single article containing the tweet text.
		twitterHTML := `<html>
			<head><title>Twitter</title></head>
			<body>
				<article data-testid="tweet">
					<div data-testid="tweetText">
						<span>This is a test tweet</span>
					</div>
				</article>
			</body>
		</html>`

		defuddleInstance, err := NewDefuddle(twitterHTML, &Options{
			URL: "https://twitter.com/user/status/123",
		})
		require.NoError(t, err)

		// Parse returning without error is the primary check here.
		result, err := defuddleInstance.Parse(context.Background())
		require.NoError(t, err)

		t.Logf("Twitter extraction result: %+v", result)

		// As with the YouTube case, the content assertion is conditional on the
		// Twitter extractor having been selected.
		if result.ExtractorType != nil && *result.ExtractorType == "twitter" {
			assert.Contains(t, result.Content, "test tweet", "Expected tweet content to be extracted")
		}
	})
}