summaryrefslogtreecommitdiff
path: root/main.go
blob: 27d3e27a416f127df9bd8388590186d4a98c3673 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
// URLs implements a breadth-first web crawler based on the
// example given in section 8.6 of The Go Programming Language.
package main

import (
	"flag"
	"fmt"
	"log"
	"net/url"
	"sync"
)

// main reads the -c, -url and -max flags, rejects missing or out-of-range
// values with a fatal log message, parses the entry-point URL and hands
// everything to pool.
func main() {
	var (
		concurrency = flag.Int("c", 0, "Maximum number of concurrent queue pushes")
		rawStart    = flag.String("url", "", "Entry-point URL")
		limit       = flag.Int("max", 0, "Maximum number of URLs to collect (omitted or 0 means no limit)")
	)
	flag.Parse()

	// Cases are evaluated top to bottom, so the first failing check is the
	// one that gets reported.
	switch {
	case *concurrency == 0:
		log.Fatal("Missing -c argument")
	case *concurrency < 1:
		log.Fatalf("Invalid -c argument: %d", *concurrency)
	case *rawStart == "":
		log.Fatal("Missing -url argument")
	case *limit < 0:
		log.Fatalf("Invalid -max argument: %d", *limit)
	}

	entry, err := url.Parse(*rawStart)
	if err != nil {
		log.Fatal(err)
	}

	pool(*entry, *concurrency, *limit)
}

func pool(startURL url.URL, maxConcurrency, maxURLs int) {
	var wg sync.WaitGroup
	urls := make(chan url.URL)
	outlets := make([]<-chan []url.URL, maxConcurrency)

	for i := range maxConcurrency {
		outlets[i] = worker(&wg, i+1, urls)
	}

	out := fanIn(outlets...)

	go func() {
		urls <- startURL
	}()

	seen := make(map[url.URL]bool)
	for batch := range out {
		for _, u := range batch {
			if !seen[u] {
				seen[u] = true

				go func() {
					urls <- u
				}()

				fmt.Printf("(%d) %s\n", len(seen), &u)
			}
		}
	}

	wg.Wait()
}

// worker starts a goroutine, tracked by wg, that processes every URL received
// on urls and returns the channel on which the results are delivered.
//
// For each URL the goroutine calls fetch and passes the document to findURLs,
// then sends the resulting batch on the returned channel. Exactly one batch is
// sent per URL received; when fetch fails the error is logged and the batch is
// nil. The returned channel is closed after urls has been closed and drained.
//
// id identifies the worker; it is only referenced by the disabled trace
// output below.
func worker(wg *sync.WaitGroup, id int, urls <-chan url.URL) <-chan []url.URL {
	out := make(chan []url.URL)

	wg.Go(func() {
		defer close(out)

		for u := range urls {
			//fmt.Printf("(%d) starting %s\n", id, &u)

			doc, err := fetch(u)
			if err != nil {
				log.Print(err)

				// doc is not meaningful after a failed fetch, so skip
				// parsing. An empty batch is still sent so the consumer
				// sees one result for every URL it handed out.
				out <- nil
				continue
			}

			batch := findURLs(u, doc)
			//fmt.Printf("(%d) finished %s\n", id, &u)

			out <- batch
		}
	})

	return out
}

// fanIn merges any number of batch channels into one. Every batch received
// from any input is forwarded unchanged to the returned channel, which is
// closed once all inputs have been closed and drained. With no inputs the
// returned channel is closed straight away.
func fanIn(chans ...<-chan []url.URL) <-chan []url.URL {
	merged := make(chan []url.URL)

	var forwarders sync.WaitGroup
	forwarders.Add(len(chans))

	// forward relays everything from src until src is closed.
	forward := func(src <-chan []url.URL) {
		defer forwarders.Done()

		for batch := range src {
			merged <- batch
		}
	}

	for _, src := range chans {
		go forward(src)
	}

	// Close the merged channel only after the last forwarder has finished,
	// so no forwarder can send on a closed channel.
	go func() {
		forwarders.Wait()
		close(merged)
	}()

	return merged
}