summaryrefslogtreecommitdiff
path: root/workers.go
blob: ab99da81816c859aee00bf88b1c9ba86ffd885a9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
package main

import (
	"fmt"
	"net/url"
	"sync"
)

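/*
   Goroutine layout:

   main goroutine:
   - receives batches from the worklist channel, deduplicates them,
     and sends unseen URLs on the urls channel.

   worker goroutines:
   - receive URLs from the urls channel, call getBatch on each one,
     and send the resulting batch back on the worklist channel.
*/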

func workers(startURL url.URL, maxConcurrency, maxURLs int) {
	worklist := make(chan []url.URL)

	// Unseen URLs.
	urls := make(chan url.URL)

	go func() {
		worklist <- []url.URL{startURL}
	}()

	var wg sync.WaitGroup
	// Create maxConcurrency worker goroutines to demultiplex from
	// the urls channel (unseen links.)
	for range maxConcurrency {
		wg.Go(func() {
			for u := range urls {
				batch := getBatch(u)
				go func() { worklist <- batch }()
			}
		})
	}

	// The main goroutine deduplicates worklist items and sends
	// unseen ones to the crawlers in a fan-out fashion.
	seen := make(map[url.URL]bool)
	count := 1

	for batch := range worklist {
		for _, u := range batch {
			if !seen[u] {
				fmt.Printf("%d. %s\n", count, &u)
				count++

				seen[u] = true
				urls <- u
			}
		}
	}

	wg.Wait()
}