1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
// URLs implements a breadth-first web crawler based on the
// example given in section 8.6 of The Go Programming Language.
package main
import (
"flag"
"fmt"
"log"
"net/url"
"sync"
)
// main parses and validates the command-line flags, then starts the crawl.
//
// Flags:
//
//	-c    number of concurrent workers (required, must be at least 1)
//	-url  entry-point URL (required)
//	-max  cap on the number of URLs to collect (0 or omitted means no limit)
//
// Any invalid flag or an unparsable URL terminates the program via log.Fatal.
func main() {
	workers := flag.Int("c", 0, "Maximum number of concurrent queue pushes")
	rawURL := flag.String("url", "", "Entry-point URL")
	limit := flag.Int("max", 0, "Maximum number of URLs to collect (omitted or 0 means no limit)")
	flag.Parse()

	// The cases are evaluated top to bottom, so the first violated
	// constraint is the one that gets reported.
	switch {
	case *workers == 0:
		log.Fatal("Missing -c argument")
	case *workers < 1:
		log.Fatalf("Invalid -c argument: %d", *workers)
	case *rawURL == "":
		log.Fatal("Missing -url argument")
	case *limit < 0:
		log.Fatalf("Invalid -max argument: %d", *limit)
	}

	entry, err := url.Parse(*rawURL)
	if err != nil {
		log.Fatal(err)
	}

	pool(*entry, *workers, *limit)
}
func pool(startURL url.URL, maxConcurrency, maxURLs int) {
var wg sync.WaitGroup
urls := make(chan url.URL)
outlets := make([]<-chan []url.URL, maxConcurrency)
for i := range maxConcurrency {
outlets[i] = worker(&wg, i+1, urls)
}
out := fanIn(outlets...)
go func() {
urls <- startURL
}()
seen := make(map[url.URL]bool)
for batch := range out {
for _, u := range batch {
if !seen[u] {
seen[u] = true
go func() {
urls <- u
}()
fmt.Printf("(%d) %s\n", len(seen), &u)
}
}
}
wg.Wait()
}
// worker starts a goroutine, tracked by wg, that processes every URL received
// on urls and returns the channel on which it reports its results.
//
// For each URL the worker sends exactly one batch on the returned channel:
// the result of findURLs for a successfully fetched document, or a nil batch
// when fetch fails (the error is logged). The returned channel is closed once
// urls has been closed and drained.
//
// id identifies the worker; it is only referenced by the commented-out trace
// output below.
func worker(wg *sync.WaitGroup, id int, urls <-chan url.URL) <-chan []url.URL {
	out := make(chan []url.URL)
	wg.Go(func() {
		defer close(out)
		for u := range urls {
			// fmt.Printf("(%d) starting %s\n", id, &u)
			doc, err := fetch(u)
			if err != nil {
				// doc is not meaningful after a failed fetch, so do not
				// parse it. Still emit an (empty) batch so the consumer
				// sees one result per URL.
				log.Print(err)
				out <- nil
				continue
			}
			batch := findURLs(u, doc)
			// fmt.Printf("(%d) finished %s\n", id, &u)
			out <- batch
		}
	})
	return out
}
// fanIn merges any number of batch channels into a single channel.
//
// One forwarding goroutine per input copies batches to the merged channel as
// they arrive, so batches from different inputs may interleave in any order.
// The merged channel is closed after every input has been closed and fully
// forwarded; with no inputs it is closed right away.
func fanIn(chans ...<-chan []url.URL) <-chan []url.URL {
	merged := make(chan []url.URL)

	var forwarders sync.WaitGroup
	forwarders.Add(len(chans))
	for _, src := range chans {
		go func(src <-chan []url.URL) {
			defer forwarders.Done()
			for batch := range src {
				merged <- batch
			}
		}(src)
	}

	// Closing happens in the background so the caller can begin receiving
	// immediately.
	go func() {
		forwarders.Wait()
		close(merged)
	}()

	return merged
}
|