-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcustom_extractor.go
More file actions
114 lines (95 loc) · 2.84 KB
/
custom_extractor.go
File metadata and controls
114 lines (95 loc) · 2.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// Package main demonstrates custom extractor usage.
package main
import (
"context"
"fmt"
"log"
"github.com/PuerkitoBio/goquery"
"github.com/dotcommander/defuddle"
"github.com/dotcommander/defuddle/extractors"
)
// CustomBlogExtractor is an example extractor for blog-style pages.
//
// It embeds *extractors.ExtractorBase, so the base's methods (for example
// GetDocument, which the methods of this type call) are promoted onto it.
type CustomBlogExtractor struct {
*extractors.ExtractorBase
}
// NewCustomBlogExtractor builds a CustomBlogExtractor around a fresh
// extractor base created from doc, url and schemaOrgData, and returns it as an
// extractors.BaseExtractor.
func NewCustomBlogExtractor(doc *goquery.Document, url string, schemaOrgData any) extractors.BaseExtractor {
	base := extractors.NewExtractorBase(doc, url, schemaOrgData)
	return &CustomBlogExtractor{ExtractorBase: base}
}
// CanExtract reports whether the document contains at least one element
// matching the ".blog-post" or ".post-content" selectors.
func (e *CustomBlogExtractor) CanExtract() bool {
	matches := e.GetDocument().Find(".blog-post, .post-content")
	return matches.Length() > 0
}
// Name returns the fixed identifier of this extractor, "CustomBlogExtractor".
func (e *CustomBlogExtractor) Name() string {
return "CustomBlogExtractor"
}
// Extract pulls the title and main content out of the document.
//
// The title is the text of the first element matching ".post-title, h1"; the
// content is the inner HTML of the first ".post-content" element. Either value
// stays empty when no matching element exists (or, for the content, when
// rendering the HTML returns an error). The result's Variables map carries the
// title under "title" and the constant "Custom Blog" under "site".
func (e *CustomBlogExtractor) Extract() *extractors.ExtractorResult {
	document := e.GetDocument()

	// Title: text of the first matching heading, if any.
	var title string
	heading := document.Find(".post-title, h1").First()
	if heading.Length() > 0 {
		title = heading.Text()
	}

	// Main content: inner HTML of the first content container, if any.
	var contentHTML string
	container := document.Find(".post-content").First()
	if container.Length() > 0 {
		inner, err := container.Html()
		if err == nil {
			contentHTML = inner
		}
	}

	return &extractors.ExtractorResult{
		ContentHTML: contentHTML,
		Variables: map[string]string{
			"title": title,
			"site":  "Custom Blog",
		},
	}
}
// main registers the custom blog extractor for the "blog.example.com" pattern,
// parses a small in-memory HTML page whose URL matches that pattern, and
// prints the extracted title, site, word count and content.
func main() {
	// Associate the custom extractor with blog.example.com.
	mapping := extractors.ExtractorMapping{
		Patterns:  []any{"blog.example.com"},
		Extractor: NewCustomBlogExtractor,
	}
	extractors.Register(mapping)

	// Sample page using the blog structure the extractor looks for.
	page := `
<html>
<head>
<title>My Blog Post</title>
</head>
<body>
<h1 class="post-title">Custom Extractor Demo</h1>
<div class="post-content">
<p>This content will be extracted by our custom blog extractor.</p>
<p>The extractor looks for specific CSS classes like .post-content.</p>
</div>
</body>
</html>
`

	// The URL matches the pattern registered above.
	opts := &defuddle.Options{
		URL:   "https://blog.example.com/post/123",
		Debug: true,
	}

	parser, err := defuddle.NewDefuddle(page, opts)
	if err != nil {
		log.Fatalf("Error: %v", err)
	}

	parsed, err := parser.Parse(context.Background())
	if err != nil {
		log.Fatalf("Error: %v", err)
	}

	fmt.Println("=== Custom Extractor Demo ===")
	fmt.Printf("URL: %s\n", opts.URL)
	fmt.Printf("Title: %s\n", parsed.Title)
	fmt.Printf("Site: %s\n", parsed.Site)
	fmt.Printf("Word Count: %d\n", parsed.WordCount)

	fmt.Println("\n=== Extracted Content ===")
	fmt.Println(parsed.Content)

	fmt.Println("\n✅ Custom extractor successfully used!")
}