1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
package main
import (
"bytes"
"context"
"fmt"
"log"
"net/url"
"os"
"sync"
"golang.org/x/net/html"
)
// classic crawls outward from startURL using a worklist: the calling
// goroutine owns the seen map and receives batches of packets over an
// unbuffered channel, while one goroutine per newly seen URL fetches that
// URL's links and sends them back as the next batch.
//
// maxConcurrency bounds how many goroutines may run getBatch at the same
// time; values below 1 are treated as 1, because a zero-capacity semaphore
// could never be acquired and the crawl would deadlock. The crawl stops as
// soon as len(seen) equals maxURLs. When maxDepth is greater than zero,
// packets at depth maxDepth are recorded but not expanded any further.
//
// Once the crawl ends, the seen map is passed to toSitemap and the result is
// written to "<startURL.Host>.xml" in the current directory. A failure in
// either step terminates the program via log.Fatal.
func classic(startURL url.URL, maxConcurrency, maxURLs, maxDepth int) {
	worklist := make(chan []packet)

	// numPendingSends counts the batches that have been promised to
	// worklist but not yet received. It starts at one for the seed batch.
	var numPendingSends int
	numPendingSends++
	go func() {
		startPacket := packet{startURL, 0}
		worklist <- []packet{startPacket}
	}()

	// Crawl the web concurrently. Map URLs to their depth (i.e.,
	// how many links we have to work through to find the URL).
	seen := make(map[url.URL]int)
	count := 1

	ctx, cancel := context.WithCancel(context.Background())
	var wg sync.WaitGroup

	// sema is a counting semaphore limiting concurrent fetches.
	sema := make(chan struct{}, max(maxConcurrency, 1))

loop:
	for ; numPendingSends > 0; numPendingSends-- {
		batch := <-worklist
		for _, p := range batch {
			if _, ok := seen[p.url]; ok {
				continue
			}
			fmt.Printf("%d. %s\n", count, &p)
			count++
			seen[p.url] = p.depth
			if len(seen) == maxURLs {
				break loop
			}

			// Track maxDepth here. A maxDepth greater than zero
			// means a finite maxDepth value. If the packets we're
			// seeing reach that depth, don't use their URLs to
			// spawn new fetches.
			if maxDepth > 0 && p.depth == maxDepth {
				continue
			}

			numPendingSends++
			wg.Go(func() {
				// Give up the wait for a semaphore slot as soon as
				// the crawl is cancelled, so queued goroutines do
				// not fetch pages nobody will read.
				select {
				case <-ctx.Done():
					return
				case sema <- struct{}{}:
				}
				defer func() { <-sema }()

				// select chooses at random when both cases are
				// ready, so re-check before doing the fetch.
				if ctx.Err() != nil {
					return
				}

				batch := getBatch(p.url)
				ps := convertToPackets(batch, p.depth+1)

				// The main loop stops receiving once it breaks
				// out, so the send must also be abandonable.
				select {
				case <-ctx.Done():
					return
				case worklist <- ps:
				}
			})
		}
	}

	// Release every goroutine still waiting on the semaphore or on the
	// worklist send, then wait for all of them to finish.
	cancel()
	wg.Wait()

	// FIXME: eventually, when all crawlers terminate properly, we
	// can move this out to main: that is, all crawlers will
	// return the same seen map that will be processed by this
	// code.
	fmt.Println("Generating sitemap...")
	sitemap, err := toSitemap(seen, maxDepth, maxURLs)
	if err != nil {
		log.Fatal(err)
	}
	xmlFilename := fmt.Sprintf("%s.xml", startURL.Host)
	if err := os.WriteFile(xmlFilename, []byte(sitemap), 0666); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("Wrote sitemap to %s\n", xmlFilename)
}
// getBatch fetches the page at u, parses the response as HTML and returns
// whatever findURLs extracts from the parsed document. If either the fetch
// or the parse fails, the error is logged and nil is returned, so callers
// simply see an empty batch.
func getBatch(u url.URL) []url.URL {
	body, fetchErr := fetch(u)
	if fetchErr != nil {
		log.Print(fetchErr)
		return nil
	}

	doc, parseErr := html.Parse(bytes.NewReader(body))
	if parseErr != nil {
		log.Print(parseErr)
		return nil
	}

	return findURLs(u, doc)
}
|