-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtypes.go
More file actions
153 lines (130 loc) · 5.77 KB
/
types.go
File metadata and controls
153 lines (130 loc) · 5.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
package defuddle
import (
"github.com/dotcommander/defuddle/internal/debug"
"github.com/dotcommander/defuddle/internal/elements"
"github.com/dotcommander/defuddle/internal/metadata"
"github.com/kaptinlin/requests"
)
// MetaTag represents a meta tag item from HTML.
//
// It is declared as a type alias (note the "=") for the internal
// metadata.MetaTag type, so both names denote the same type and values can be
// used interchangeably without conversion.
type MetaTag = metadata.MetaTag
// Options represents configuration options for Defuddle parsing.
//
// The struct starts from the JavaScript DefuddleOptions interface reproduced
// below and adds further Go-specific fields (removal toggles, element
// processing settings, HTTP client, concurrency limit). Every field except
// Client carries a JSON tag with omitempty; Client is tagged "-" and is
// therefore never serialized.
//
// Several toggles are *bool rather than bool so that "not set" (nil) can be
// told apart from an explicit false; see PtrBool and BoolDefault.
//
// JavaScript original code:
//
//	export interface DefuddleOptions {
//	  debug?: boolean;
//	  url?: string;
//	  markdown?: boolean;
//	  separateMarkdown?: boolean;
//	  removeExactSelectors?: boolean;
//	  removePartialSelectors?: boolean;
//	}
type Options struct {
// Debug enables debug logging.
Debug bool `json:"debug,omitempty"`
// URL is the URL of the page being parsed.
URL string `json:"url,omitempty"`
// Markdown converts the output to Markdown.
// Setting either Markdown or SeparateMarkdown makes wantsMarkdown report true.
Markdown bool `json:"markdown,omitempty"`
// SeparateMarkdown includes Markdown in the response.
// NOTE(review): presumably the Markdown is returned alongside the regular
// content (see Result.ContentMarkdown) rather than replacing it — confirm.
SeparateMarkdown bool `json:"separateMarkdown,omitempty"`
// RemoveExactSelectors controls whether to remove elements matching exact
// selectors like ads, social buttons, etc.
// nil = true (default). Use PtrBool(false) to disable.
RemoveExactSelectors *bool `json:"removeExactSelectors,omitempty"`
// RemovePartialSelectors controls whether to remove elements matching partial
// selectors like ads, social buttons, etc.
// nil = true (default). Use PtrBool(false) to disable.
RemovePartialSelectors *bool `json:"removePartialSelectors,omitempty"`
// RemoveImages removes images from the extracted content.
// Defaults to false.
RemoveImages bool `json:"removeImages,omitempty"`
// RemoveHiddenElements controls whether to remove hidden elements
// (display:none, Tailwind hidden classes).
// nil = true (default). Use PtrBool(false) to disable.
RemoveHiddenElements *bool `json:"removeHiddenElements,omitempty"`
// RemoveLowScoring controls whether to remove low-scoring non-content blocks.
// nil = true (default). Use PtrBool(false) to disable.
RemoveLowScoring *bool `json:"removeLowScoring,omitempty"`
// RemoveContentPatterns controls whether to remove content patterns
// (boilerplate, breadcrumbs, etc.).
// nil = true (default). Use PtrBool(false) to disable.
RemoveContentPatterns *bool `json:"removeContentPatterns,omitempty"`
// ContentSelector is a CSS selector to use for content extraction instead of
// auto-detection.
ContentSelector string `json:"contentSelector,omitempty"`
// Element processing options.
// NOTE(review): presumably each Process* flag switches on the matching
// element processor from the internal elements package — confirm against
// the code that consumes Options; the zero value of every flag is false.
ProcessCode bool `json:"processCode,omitempty"`
ProcessImages bool `json:"processImages,omitempty"`
ProcessHeadings bool `json:"processHeadings,omitempty"`
ProcessMath bool `json:"processMath,omitempty"`
ProcessFootnotes bool `json:"processFootnotes,omitempty"`
ProcessRoles bool `json:"processRoles,omitempty"`
// Per-processor settings, one optional pointer per element kind.
// NOTE(review): how a nil pointer is treated (defaults vs. disabled) is not
// visible in this file — confirm in the elements package.
CodeOptions *elements.CodeBlockProcessingOptions `json:"codeOptions,omitempty"`
ImageOptions *elements.ImageProcessingOptions `json:"imageOptions,omitempty"`
HeadingOptions *elements.HeadingProcessingOptions `json:"headingOptions,omitempty"`
MathOptions *elements.MathProcessingOptions `json:"mathOptions,omitempty"`
FootnoteOptions *elements.FootnoteProcessingOptions `json:"footnoteOptions,omitempty"`
RoleOptions *elements.RoleProcessingOptions `json:"roleOptions,omitempty"`
// Client is a custom HTTP client for fetching URLs.
// If nil, a default client with standard User-Agent and 30s timeout is created.
// The "-" JSON tag keeps it out of serialized options.
Client *requests.Client `json:"-"`
// MaxConcurrency limits parallel URL fetches in ParseFromURLs.
// Defaults to 5 if zero.
MaxConcurrency int `json:"maxConcurrency,omitempty"`
}
// Metadata represents extracted metadata from a document.
//
// It is declared as a type alias (note the "=") for the internal
// metadata.Metadata type, so both names denote the same type and values can be
// used interchangeably without conversion.
type Metadata = metadata.Metadata
// Result represents the complete response from Defuddle parsing.
//
// It embeds Metadata, mirroring the "extends DefuddleMetadata" clause of the
// JavaScript interface below, and adds the extracted content plus optional
// extras. Variables and DebugInfo have no counterpart in the JavaScript
// interface shown here.
//
// JavaScript original code:
//
//	export interface DefuddleResponse extends DefuddleMetadata {
//	  content: string;
//	  contentMarkdown?: string;
//	  extractorType?: string;
//	  metaTags?: MetaTagItem[];
//	}
type Result struct {
// Metadata is embedded, so its fields are promoted onto Result.
// NOTE(review): assuming metadata.Metadata is a struct, encoding/json
// flattens its fields into the same JSON object — confirm.
Metadata
// Content is the extracted content; it has no omitempty, so the key is
// always present in JSON output.
Content string `json:"content"`
// ContentMarkdown is the optional Markdown rendering; omitted from JSON when nil.
ContentMarkdown *string `json:"contentMarkdown,omitempty"`
// ExtractorType optionally names the extractor; omitted from JSON when nil.
// NOTE(review): presumably set only when a site-specific extractor was used — confirm.
ExtractorType *string `json:"extractorType,omitempty"`
// Variables holds string key/value pairs; omitted from JSON when empty.
// NOTE(review): presumably the variables reported by a site-specific
// extractor (see ExtractorVariables) — confirm.
Variables map[string]string `json:"variables,omitempty"`
// MetaTags lists meta tag items from the document; omitted from JSON when empty.
MetaTags []MetaTag `json:"metaTags,omitempty"`
// DebugInfo carries debug details; omitted from JSON when nil.
// NOTE(review): presumably populated only when Options.Debug is set — confirm.
DebugInfo *debug.Info `json:"debugInfo,omitempty"`
}
// wantsMarkdown reports whether Markdown output was requested, which is the
// case when either the Markdown or the SeparateMarkdown option is set.
// The receiver must be non-nil.
func (o *Options) wantsMarkdown() bool {
	if o.Markdown {
		return true
	}
	return o.SeparateMarkdown
}
// PtrBool returns a pointer to a freshly allocated bool holding v.
// It lets callers populate the *bool fields of Options inline, for example
// PtrBool(false) to switch off an option whose nil value means "enabled".
// Each call yields a distinct pointer; the caller's variable is never aliased.
func PtrBool(v bool) *bool {
	p := new(bool)
	*p = v
	return p
}
// BoolDefault resolves an optional bool: when b is non-nil its pointee is
// returned, otherwise defaultVal is returned.
func BoolDefault(b *bool, defaultVal bool) bool {
	if b != nil {
		return *b
	}
	return defaultVal
}
// ExtractorVariables represents variables extracted by site-specific extractors.
//
// It is a distinct named type whose underlying type is map[string]string,
// matching the string-to-string index signature of the JavaScript interface
// below. As with any Go map, the zero value is nil and must be initialized
// (e.g. with make) before entries are written.
//
// JavaScript original code:
//
//	export interface ExtractorVariables {
//	  [key: string]: string;
//	}
type ExtractorVariables map[string]string
// ExtractedContent represents content extracted by site-specific extractors.
//
// Every field is a pointer tagged with omitempty, so a nil field is left out
// of the JSON encoding entirely; this mirrors the optional ("?") members of
// the JavaScript interface below.
//
// JavaScript original code:
//
//	export interface ExtractedContent {
//	  title?: string;
//	  author?: string;
//	  published?: string;
//	  content?: string;
//	  contentHtml?: string;
//	  variables?: ExtractorVariables;
//	}
type ExtractedContent struct {
// Title is the optional extracted title.
Title *string `json:"title,omitempty"`
// Author is the optional extracted author.
Author *string `json:"author,omitempty"`
// Published is the optional publication value, kept as a string.
// NOTE(review): the expected date format is not visible here — confirm.
Published *string `json:"published,omitempty"`
// Content is the optional extracted content.
// NOTE(review): how this differs from ContentHTML (e.g. text vs. markup)
// is not visible in this file — confirm with the extractors.
Content *string `json:"content,omitempty"`
// ContentHTML is the optional extracted content as HTML (JSON key "contentHtml").
ContentHTML *string `json:"contentHtml,omitempty"`
// Variables optionally points to the extractor's variables map.
// Note this is a pointer to a map type, so both the pointer and the map
// it points to may be nil.
Variables *ExtractorVariables `json:"variables,omitempty"`
}