summaryrefslogtreecommitdiff
path: root/workers.go
diff options
context:
space:
mode:
Diffstat (limited to 'workers.go')
-rw-r--r--workers.go56
1 files changed, 56 insertions, 0 deletions
diff --git a/workers.go b/workers.go
new file mode 100644
index 0000000..ab99da8
--- /dev/null
+++ b/workers.go
@@ -0,0 +1,56 @@
+package main
+
+import (
+ "fmt"
+ "net/url"
+ "sync"
+)
+
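+/*
+	Crawl pipeline overview:
+
+	main goroutine:
+	- receives batches of URLs from the worklist channel,
+	- drops URLs it has already seen,
+	- sends each unseen URL on the urls channel.
+
+	worker goroutines (maxConcurrency of them):
+	- receive URLs from the urls channel,
+	- call getBatch on each one,
+	- send the resulting batch back on the worklist channel.
+*/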
+
+// workers crawls outward from startURL using a fixed pool of worker
+// goroutines and prints every distinct URL it dispatches, numbered from 1.
+//
+// The main goroutine owns the seen set: it receives batches from worklist,
+// drops URLs it has already dispatched, and fans the rest out over urls.
+// Each worker receives a URL, calls getBatch on it, and hands the resulting
+// batch back on worklist.
+//
+// maxConcurrency is the number of worker goroutines; values below 1 are
+// treated as 1 so that the crawl cannot stall with nobody reading urls.
+//
+// maxURLs caps how many distinct URLs are printed and dispatched.
+// NOTE(review): a value <= 0 is treated as "no limit" — confirm this
+// matches what callers pass.
+//
+// workers returns once every dispatched URL has been processed and all
+// worker goroutines have exited.
+func workers(startURL url.URL, maxConcurrency, maxURLs int) {
+	worklist := make(chan []url.URL)
+
+	// Unseen URLs.
+	urls := make(chan url.URL)
+
+	// pending counts batches that have been promised to worklist but not
+	// yet received by the main loop. It starts at 1 for the seed batch and
+	// grows by one for every URL handed to a worker, because each worker
+	// sends exactly one batch back per URL. When it reaches zero there is
+	// no in-flight work left and the crawl is complete.
+	pending := 1
+	go func() {
+		worklist <- []url.URL{startURL}
+	}()
+
+	var wg sync.WaitGroup
+	// Create the worker goroutines that demultiplex from the urls
+	// channel (unseen links).
+	for range max(maxConcurrency, 1) {
+		wg.Go(func() {
+			for u := range urls {
+				batch := getBatch(u)
+				// Send from a separate goroutine so the worker is
+				// immediately free to take the next URL; otherwise the
+				// main goroutine (blocked on urls) and the workers
+				// (blocked on worklist) could deadlock each other.
+				go func() { worklist <- batch }()
+			}
+		})
+	}
+
+	// The main goroutine deduplicates worklist items and sends
+	// unseen ones to the crawlers in a fan-out fashion.
+	seen := make(map[url.URL]bool)
+	count := 1
+
+	for ; pending > 0; pending-- {
+		batch := <-worklist
+		for _, u := range batch {
+			if seen[u] {
+				continue
+			}
+			// Once the cap is reached, stop dispatching but keep
+			// draining worklist so in-flight senders can finish.
+			if maxURLs > 0 && count > maxURLs {
+				break
+			}
+
+			fmt.Printf("%d. %s\n", count, &u)
+			count++
+
+			seen[u] = true
+			pending++
+			urls <- u
+		}
+	}
+
+	// No more work will be produced: let the workers' range loops end,
+	// then wait for them to exit.
+	close(urls)
+	wg.Wait()
+}