crawler.go

package main

import (
	"log"
	"net/http"
	"sync"
	"time"
)

type Crawler struct {
	Domain    string
	MaxDepth  int
	wg        sync.WaitGroup
	DocParser *Parser
	Site      *SiteMap
}
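
// Note: Parser and SiteMap are defined elsewhere in this package; this file only
// relies on NewParser and Parser.ParseBody, plus NewSiteMap and SiteMap's
// AddDocument, Visit, HasBeenVisited, AddAsset, AddLink and PrintSiteMap methods.
// Their exact signatures are not shown here.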

// NewCrawler creates a Crawler for the given domain, with a fresh DocParser and an empty SiteMap.
func NewCrawler(domain string, maxDepth int) *Crawler {
	crawler := new(Crawler)
	crawler.Domain = domain
	crawler.MaxDepth = maxDepth
	crawler.DocParser = NewParser()
	crawler.Site = NewSiteMap(domain)
	return crawler
}

// Begin starts the crawl from "/", the root of the domain, and waits
// until every crawl goroutine has finished before returning.
func (crawler *Crawler) Begin() {
	crawler.wg.Add(1)
	go crawler.Crawl("/", 0)
	crawler.wg.Wait()
}

// Crawl fetches a page, parses its body, and extracts the URIs of its links and assets.
// It then calls itself recursively, in a new goroutine, for every unvisited child page,
// stopping once a child's depth would exceed MaxDepth.
func (crawler *Crawler) Crawl(uri string, depth int) {
	defer crawler.wg.Done()
	domain := crawler.Domain
	log.Printf("Depth: %v - %s%s Crawling...", depth, domain, uri)
	client := http.Client{
		Timeout: 5 * time.Second,
	}
	resp, err := client.Get("http://" + domain + uri)
	if err != nil {
		log.Printf("Couldn't get %s%s: %v", domain, uri, err)
		return
	}
	defer resp.Body.Close()
	log.Printf("Depth: %v - %s%s Parsing...", depth, domain, uri)
	links, assets := crawler.DocParser.ParseBody(crawler.Domain, resp.Body)
	log.Printf("Depth: %v - %s%s Extracted (%v links) and (%v assets)", depth, domain, uri, len(links), len(assets))
	crawler.Site.AddDocument(uri, depth)
	crawler.Site.Visit(uri)
	for asset := range assets {
		crawler.Site.AddAsset(uri, asset)
	}
	// Child pages live one level deeper than the current page.
	depth++
	for _, parsedLink := range links {
		if parsedLink.URI != "" {
			crawler.Site.AddLink(uri, parsedLink.URI, depth)
			if !crawler.Site.HasBeenVisited(parsedLink.URI) && depth <= crawler.MaxDepth {
				crawler.wg.Add(1)
				go crawler.Crawl(parsedLink.URI, depth)
			}
		}
	}
}

// PrintSiteMap prints the crawled site map.
func (crawler *Crawler) PrintSiteMap() {
	crawler.Site.PrintSiteMap()
}
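
// A minimal usage sketch. The real entry point is assumed to live in another file of
// this package; the domain and depth below are made-up example values, not part of
// the original code.
func exampleRun() {
	// Crawl example.com up to three levels deep, then print what was found.
	crawler := NewCrawler("example.com", 3)
	crawler.Begin()
	crawler.PrintSiteMap()
}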