-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathurl_parser.go
147 lines (140 loc) · 3.75 KB
/
url_parser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
package unfurlist
import (
"net/url"
"regexp"
"strings"
"github.com/gomarkdown/markdown/ast"
"github.com/gomarkdown/markdown/parser"
)
// reUrls matches a run of characters that starts with an http:// or https://
// prefix (the scheme is matched case-insensitively) and continues with one or
// more characters from a class that is a superset of what RFC 3986 permits:
// the RFC's reserved and unreserved punctuation, the percent sign, any Unicode
// letter (\pL) or number (\pN), plus the commonly seen curly braces {}.
//
// Angle brackets and whitespace are not part of the class, so a match always
// ends at the first such character.
var reUrls = regexp.MustCompile(`(?i:https?)://[%:/?#\[\]@!$&'\(\){}*+,;=\pL\pN._~-]+`)
// ParseURLs extracts unique url-like substrings (http/https scheme only) from
// the given text, in order of first appearance. The results are not guaranteed
// to be well-formed urls: the function only looks for runs of characters that
// may appear in a url.
//
// It is tuned for plain text where urls are mixed with punctuation: trailing
// characters from the set []()<>{},;.*_ are stripped, except that a trailing
// closing bracket is kept when its opening counterpart occurs earlier in the
// same match.
func ParseURLs(content string) []string {
	// A negative limit asks parseURLsMax for every match.
	const noLimit = -1
	return parseURLsMax(content, noLimit)
}
// parseURLsMax finds up to maxItems matches of reUrls in content (all matches
// when maxItems is negative), strips trailing punctuation from each of them
// and returns the survivors with duplicates removed, keeping the order of
// first appearance.
//
// Because the limit is applied to the raw matches before de-duplication, the
// result may hold fewer than maxItems entries even when content has more
// distinct urls.
func parseURLsMax(content string, maxItems int) []string {
	const punct = `[]()<>{},;.*_`
	matches := reUrls.FindAllString(content, maxItems)
	for i, candidate := range matches {
		// Nothing to strip when the match holds no punctuation at all.
		if !strings.ContainsAny(candidate, punct) {
			continue
		}
		// Peel punctuation off the end one byte at a time; the loop runs
		// only while the very last byte is a punctuation character.
		for strings.LastIndexAny(candidate, punct) == len(candidate)-1 {
			last := len(candidate) - 1
			// A closing bracket stays in place (and stripping stops) when
			// its opening counterpart appears earlier in the match.
			var opener byte
			switch candidate[last] {
			case ')':
				opener = '('
			case ']':
				opener = '['
			case '>':
				opener = '<'
			case '}':
				opener = '{'
			}
			if opener != 0 && strings.IndexByte(candidate, opener) > 0 {
				break
			}
			candidate = candidate[:last]
		}
		matches[i] = candidate
	}
	// De-duplicate in place, reusing the backing array of matches.
	unique := matches[:0]
	seen := make(map[string]struct{})
	for _, candidate := range matches {
		if _, dup := seen[candidate]; !dup {
			seen[candidate] = struct{}{}
			unique = append(unique, candidate)
		}
	}
	return unique
}
// validURL reports whether s is an absolute url with an http or https scheme
// and a non-empty host. On top of requiring that s is non-empty and that
// url.Parse(s) succeeds, it checks that the query part is made up only of
// characters permitted by RFC 3986 section 3.4.
//
// The extra query check exists because url.Parse does not validate the query
// part of the URI.
func validURL(s string) bool {
	if s == "" {
		return false
	}
	parsed, err := url.Parse(s)
	if err != nil || parsed.Host == "" {
		return false
	}
	if parsed.Scheme != "http" && parsed.Scheme != "https" {
		return false
	}
	// queryRune reports whether c may appear in a query component.
	// https://tools.ietf.org/html/rfc3986#section-3.4 defines:
	//
	//	query       = *( pchar / "/" / "?" )
	//	pchar       = unreserved / pct-encoded / sub-delims / ":" / "@"
	//	unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
	//	pct-encoded = "%" HEXDIG HEXDIG
	//	sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
	//	            / "*" / "+" / "," / ";" / "="
	//
	// Only membership in the character set is checked; the hex digits after
	// a "%" are not verified.
	queryRune := func(c rune) bool {
		if ('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') {
			return true
		}
		switch c {
		case '-', '.', '_', '~', // remaining unreserved characters
			'!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', // sub-delims
			'%', ':', '@', '/', '?':
			return true
		}
		return false
	}
	for _, c := range parsed.RawQuery {
		if !queryRune(c) {
			return false
		}
	}
	return true
}
func parseMarkdownURLs(content string, maxItems int) []string {
doc := parser.New().Parse([]byte(content))
var urls []string
walkFn := func(node ast.Node, entering bool) ast.WalkStatus {
if maxItems >= 0 && len(urls) == maxItems {
return ast.Terminate
}
if !entering {
return ast.GoToNext
}
switch n := node.(type) {
case *ast.Link:
if s := string(n.Destination); validURL(s) {
urls = append(urls, s)
}
case *ast.Code, *ast.CodeBlock:
return ast.SkipChildren
}
return ast.GoToNext
}
_ = ast.Walk(doc, ast.NodeVisitorFunc(walkFn))
return urls
}