summaryrefslogtreecommitdiff
path: root/workers.go
blob: 43e398d0617b08a3a9d02946c2291374ba55ae15 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
package main

import (
	"context"
	"fmt"
	"net/url"
	"sync"
)

/*
   CHANNEL/GOROUTINE NOTES

   main goroutine:
   - manages the packets channel: it sends unseen packets to the
     workers and closes the channel once the crawl is over.
*/

// Packet is the unit of data that travels through the concurrent
// pipeline. The web crawler used to pass bare [url.URL] values
// around; wrapping the URL in a compound type lets each one carry
// its depth along with it.
type Packet struct {
	url   url.URL // URL carried by this packet
	depth int     // depth tracked for url
}

// String implements [fmt.Stringer], formatting the packet as
// "[depth] url". A dedicated method is needed because [url.URL]
// declares String on the pointer type, so a url.URL value stored in
// a field does not format itself when printed.
func (p Packet) String() string {
	// p is a local copy, so p.url is addressable and the
	// pointer-receiver method can be called on it directly.
	link := p.url.String()

	return fmt.Sprintf("[%d] %s", p.depth, link)
}

// workers crawls outward from startURL with a pool of maxConcurrency
// worker goroutines and prints every distinct URL it encounters,
// prefixed with a running index and the URL's depth.
//
// The calling goroutine deduplicates the batches of links reported by
// the workers and fans the unseen ones back out to the pool. The crawl
// stops when maxURLs distinct URLs have been listed, or when no batch
// is outstanding any more (every dispatched URL has reported back and
// produced nothing new), whichever happens first. A maxURLs of zero
// or less never matches the limit check, so in that case only the
// second condition ends the crawl.
//
// maxConcurrency must be at least 1: with no workers nothing receives
// from the packets channel, so the first dispatch would block forever.
func workers(startURL url.URL, maxConcurrency, maxURLs int) {
	// Batches of discovered links; may contain duplicates.
	worklist := make(chan []Packet)

	// Unseen URLs waiting to be fetched by a worker.
	packets := make(chan Packet)

	// worklist is unbuffered, so the seed batch has to be sent from
	// another goroutine; the receive loop below always consumes it.
	go func() {
		startPacket := Packet{startURL, 0}
		worklist <- []Packet{startPacket}
	}()

	var wg sync.WaitGroup
	ctx, cancel := context.WithCancel(context.Background())

	// Create maxConcurrency worker goroutines that all receive from
	// the packets channel (unseen links).
	for i := range maxConcurrency {
		wg.Go(func() {
			for p := range packets {
				batch := getBatch(p.url)

				// Convert URLs to Packets. In the
				// process, bump up the depth by 1.
				ps := convertToPackets(batch, p.depth+1)

				select {
				case <-ctx.Done():
					fmt.Printf("exiting early %d\n", i+1)
					return
				default:
				}

				// Hand the batch over from a separate
				// goroutine so this worker can go straight
				// back to receiving packets; sending inline
				// could deadlock against the dispatcher's
				// own blocking send. The sender gives up on
				// cancellation, because nobody reads
				// worklist once the crawl is over. It is
				// registered with wg while this worker
				// still holds a count, so wg.Wait also
				// waits for it.
				wg.Go(func() {
					select {
					case worklist <- ps:
					case <-ctx.Done():
					}
				})
			}

			fmt.Printf("terminating %d\n", i+1)
		})
	}

	// The main goroutine deduplicates worklist items and sends
	// unseen ones to the crawlers in a fan-out fashion. The map
	// value records the depth at which each URL was first seen.
	seen := make(map[url.URL]int)

	// Number of batches still expected on worklist: the seed batch
	// now, plus one for every packet handed to a worker. When it
	// drops to zero there is nothing left to wait for, so the loop
	// ends instead of blocking on a channel nobody will send to.
	pending := 1

loop:
	for pending > 0 {
		batch := <-worklist
		pending--

		for _, p := range batch {
			if _, ok := seen[p.url]; ok {
				continue
			}

			seen[p.url] = p.depth

			// len(seen) doubles as the 1-based running index
			// of the listing.
			fmt.Printf("%d. %s\n", len(seen), p)

			if len(seen) == maxURLs {
				break loop
			}

			packets <- p
			pending++
		}
	}

	// We're done writing to the packets channel, so close it; idle
	// workers fall out of their receive loop.
	close(packets)

	// Workers that are mid-fetch, and senders still holding a batch,
	// are told to stop via the context.
	cancel()
	wg.Wait()
}

// convertToPackets wraps each URL of batch in a Packet carrying the
// given depth. The result has the same length and order as batch; an
// empty batch yields a nil slice.
func convertToPackets(batch []url.URL, depth int) []Packet {
	if len(batch) == 0 {
		return nil
	}

	// The final length is known up front, so allocate once rather
	// than growing the slice through repeated appends.
	ps := make([]Packet, len(batch))
	for i, u := range batch {
		ps[i] = Packet{u, depth}
	}

	return ps
}