blob: 01c6888663b3898d2a0f0cf0ecf3763d731fa56d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
package main
import (
"context"
"fmt"
"net/url"
"sync"
)
/*
main goroutine:
- manages urls channel.
*/
func workers(startURL url.URL, maxConcurrency, maxURLs int) {
worklist := make(chan []url.URL)
// Unseen URLs.
urls := make(chan url.URL)
go func() {
worklist <- []url.URL{startURL}
}()
var wg sync.WaitGroup
ctx, cancel := context.WithCancel(context.Background())
// Create maxConcurrency worker goroutines to demultiplex from
// the urls channel (unseen links.)
for i := range maxConcurrency {
wg.Go(func() {
loop:
for u := range urls {
batch := getBatch(u)
select {
case <-ctx.Done():
break loop
default:
go func() {
select {
case <-ctx.Done():
return
case worklist <- batch:
}
}()
}
}
fmt.Printf("terminating %d\n", i+1)
})
}
// The main goroutine deduplicates worklist items and sends
// unseen ones to the crawlers in a fan-out fashion.
seen := make(map[url.URL]bool)
count := 1
loop:
for batch := range worklist {
for _, u := range batch {
if !seen[u] {
fmt.Printf("%d. %s\n", count, &u)
count++
seen[u] = true
if len(seen) == maxURLs {
break loop
}
urls <- u
}
}
}
close(urls)
cancel()
wg.Wait()
}
|