-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathextractlinks.go
105 lines (90 loc) · 2.07 KB
/
extractlinks.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
package extractlinks
import (
"golang.org/x/net/html"
"io"
"strconv"
"strings"
)
// Link describes a single anchor element extracted from an HTML document.
type Link struct {
	// Href is the value of the anchor's href attribute as stored by buildLink:
	// cut at the first "#" and stripped of trailing slashes. It is empty when
	// the anchor carries no href attribute.
	Href string
	// Text is the text content found beneath the anchor, with runs of
	// whitespace collapsed to single spaces.
	Text string
}
// All parses the HTML read from htmlBody (for example the Body of a response
// returned by http.Get) and returns the anchor links it contains.
//
// Each anchor element is converted to a Link; when several anchors share the
// same Href only the first one encountered is kept. If the document cannot be
// parsed, All returns a nil slice together with the parser's error.
//
// All never closes htmlBody; that remains the caller's responsibility.
func All(htmlBody io.Reader) ([]Link, error) {
	root, err := html.Parse(htmlBody)
	if err != nil {
		return nil, err
	}

	anchors := buildNodes(root)

	// One Link per anchor, filled by index; duplicates are dropped afterwards.
	found := make([]Link, len(anchors))
	for i, anchor := range anchors {
		found[i] = buildLink(anchor)
	}

	return removeDuplicateLinks(found), nil
}
// removeDuplicateLinks returns the given links with repeated Href values
// filtered out. The first Link seen for each Href is kept and the relative
// order of the survivors is preserved. The input slice is not modified; the
// result is nil when no links are supplied.
func removeDuplicateLinks(links []Link) []Link {
	seen := make(map[string]bool)

	var unique []Link
	for _, candidate := range links {
		if seen[candidate.Href] {
			continue
		}
		seen[candidate.Href] = true
		unique = append(unique, candidate)
	}
	return unique
}
// buildNodes collects every <a> element in the tree rooted at n, in document
// (pre-order) order. The search does not descend into an anchor once it has
// been matched, so the subtree beneath an <a> is never examined. The result is
// nil when the tree holds no anchors.
func buildNodes(n *html.Node) []*html.Node {
	var anchors []*html.Node

	var walk func(cur *html.Node)
	walk = func(cur *html.Node) {
		if cur.Type == html.ElementNode && cur.Data == "a" {
			anchors = append(anchors, cur)
			return
		}
		for child := cur.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	walk(n)
	return anchors
}
// buildLink converts the node n into a Link.
//
// Text is the node's text content as produced by buildText. Href is taken from
// the node's href attribute, cut at the first "#" and stripped of trailing
// slashes; it stays empty when there is no href attribute, and the last href
// wins should the node carry more than one.
func buildLink(n *html.Node) (link Link) {
	link.Text = buildText(n)

	for i := range n.Attr {
		attr := n.Attr[i]
		if attr.Key != "href" {
			continue
		}
		link.Href = removeTrailingSlash(trimHash(attr.Val))
	}

	return link
}
// buildText returns the text content of the tree rooted at n.
//
// When n is itself a text node its Data is returned verbatim. Otherwise the
// Data of every text node beneath n is concatenated in document order and the
// combined string is normalized once: leading and trailing whitespace is
// removed and each internal run of whitespace becomes a single space.
//
// Normalization is deliberately applied only to the final string. Doing it at
// every nesting level trims the whitespace at element boundaries, so markup
// such as <a><span>foo </span><span>bar</span></a> would collapse to "foobar"
// rather than "foo bar".
func buildText(n *html.Node) string {
	if n.Type == html.TextNode {
		return n.Data
	}

	var raw strings.Builder

	// collect appends the untouched Data of every text node under cur.
	var collect func(cur *html.Node)
	collect = func(cur *html.Node) {
		if cur.Type == html.TextNode {
			raw.WriteString(cur.Data)
			return
		}
		for c := cur.FirstChild; c != nil; c = c.NextSibling {
			collect(c)
		}
	}
	collect(n)

	return strings.Join(strings.Fields(raw.String()), " ")
}
// trimHash returns l cut off immediately before its first "#", dropping the
// hash and everything after it. A string without a "#" is returned unchanged.
func trimHash(l string) string {
	// strconv.QuoteRune renders the rune '#' as the string "'#'", so this
	// predicate is true for a hash character and nothing else.
	isHash := func(r rune) bool {
		return strconv.QuoteRune(r) == "'#'"
	}

	if cut := strings.IndexFunc(l, isHash); cut >= 0 {
		return l[:cut]
	}
	return l
}
// removeTrailingSlash strips every "/" from the end of path. All consecutive
// trailing slashes are removed, not just one, so a path made up solely of
// slashes (such as "/") becomes the empty string.
func removeTrailingSlash(path string) string {
	for strings.HasSuffix(path, "/") {
		path = strings.TrimSuffix(path, "/")
	}
	return path
}