-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.go
88 lines (74 loc) · 1.86 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
package main
import (
"fmt"
"os"
"regexp"
"github.com/gocolly/colly/v2"
)
// matchAny reports whether urlPath matches at least one of the regular
// expressions in patterns.
//
// Patterns are tried in order and the first match ends the search. A pattern
// that fails to compile is printed to standard output and treated as a
// non-match; the remaining patterns are still tried.
func matchAny(urlPath string, patterns []string) bool {
	for i := range patterns {
		ok, compileErr := regexp.MatchString(patterns[i], urlPath)
		switch {
		case compileErr != nil:
			// MatchString only fails when the pattern is invalid, in which
			// case it never reports a match.
			fmt.Println(compileErr)
		case ok:
			return true
		}
	}
	return false
}
// main crawls https://ems.press/ and reports links that fail to load.
//
// Links are only followed from pages whose host is ems.press; pages on other
// hosts are requested but their links are not followed. Each failed request
// (other than the ignored status codes below) is printed together with the
// page it was found on, and the process then exits with status 1. If nothing
// is reported, the exit status is 0.
func main() {
	exitCode := 0
	// Deferred so every return path below exits with the recorded code.
	defer func() {
		os.Exit(exitCode)
	}()

	// A request is skipped when its path matches an exclude pattern and does
	// not match any include pattern. Declared once here rather than on every
	// request.
	exclude := []string{
		"^\\/journals\\/.*\\/articles.*",
		"^\\/journals\\/.*\\/issues.*",
		"^\\/books\\/.*\\/.*",
	}
	include := []string{
		"^\\/journals\\/msl\\/articles.*",
		"^\\/journals\\/msl\\/issues.*",
		"^\\/books\\/esiam.*",
	}

	collector := colly.NewCollector(colly.Async())
	if err := collector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 8}); err != nil {
		fmt.Println(err)
		exitCode = 1
		return
	}

	// Find and visit all links on ems.press pages
	collector.OnHTML("a[href]", func(element *colly.HTMLElement) {
		pageURL := element.Request.URL
		if pageURL.Host != "ems.press" {
			return
		}
		// Resolve the href against the current page so the context key is the
		// absolute URL. OnError looks the entry up by the absolute request
		// URL, so keying by the raw (often relative) href would never match.
		link := element.Request.AbsoluteURL(element.Attr("href"))
		if link == "" {
			// Fragment-only or unparsable href: nothing to request.
			return
		}
		element.Request.Ctx.Put(link, pageURL.String())
		// Best effort: the error returned by Visit is deliberately dropped,
		// as before; failed fetches are expected to surface via OnError.
		_ = element.Request.Visit(link)
	})

	collector.OnRequest(func(request *colly.Request) {
		if request.URL.Scheme != "https" && request.URL.Scheme != "http" {
			request.Abort()
			// Already aborted; no need to evaluate the path filters.
			return
		}
		urlPath := request.URL.Path
		if matchAny(urlPath, exclude) && !matchAny(urlPath, include) {
			request.Abort()
		}
	})

	collector.OnError(func(response *colly.Response, err error) {
		switch response.StatusCode {
		case 503, 999, 0:
			// ignore 503 and 999 and 0 status code to avoid flaky errors
			return
		}
		exitCode = 1
		fmt.Println(
			"Error Visiting:\n",
			response.Request.URL,
			"\n",
			err,
			response.StatusCode,
			"\n Found on:",
			response.Ctx.Get(response.Request.URL.String()),
			"\n ",
		)
	})

	if err := collector.Visit("https://ems.press/"); err != nil {
		// The crawl could not even be started; report it instead of
		// exiting successfully without having checked anything.
		fmt.Println(err)
		exitCode = 1
		return
	}
	collector.Wait()
}