summaryrefslogtreecommitdiff
path: root/workers.go
diff options
context:
space:
mode:
authordemo <demo@antix1>2026-05-26 22:32:58 -0400
committerdemo <demo@antix1>2026-05-26 22:46:39 -0400
commit26f5b43a82955c77ea4bc1d7a710895e4b36209a (patch)
tree891f77573bad6eb8dbc2440b1795c8f00cfbf334 /workers.go
parent99111bcfd5b81ca51f102b35efe8e4be0e6d390e (diff)
docs: add extensive comments
Diffstat (limited to 'workers.go')
-rw-r--r--workers.go21
1 files changed, 20 insertions, 1 deletions
diff --git a/workers.go b/workers.go
index 845e947..535e50c 100644
--- a/workers.go
+++ b/workers.go
@@ -8,21 +8,29 @@ import (
)
/*
-
+ CHANNEL/GOROUTINE NOTES
main goroutine:
- manages urls channel.
*/
+// Packet accrues data as it passes through our concurrent
+// pipeline. Formerly the web crawler only transmitted [url.URL]
+// values, but using a compound data type allows us to add URL
+// depth-tracking.
type Packet struct {
// url is the URL this packet carries; it is what gets handed to
// getBatch and what the seen-map is keyed on.
url url.URL
// depth is printed in brackets by String. Presumably it counts link
// hops from the start URL (0 for the start URL itself) -- TODO confirm
// against the code that builds Packets.
depth int
}
+// String implements the Stringer interface. We need this mainly
+// because [url.URL]'s String method is declared on the pointer type
+// (*url.URL), so we must pass the URL's address when formatting it.
func (p Packet) String() string {
	// url.URL declares String on the pointer receiver; p.url is
	// addressable here, so Go takes its address implicitly for the call.
	rendered := p.url.String()
	return fmt.Sprintf("[%d] %s", p.depth, rendered)
}
+// workers launches a worker queue for crawling a given Web domain.
func workers(startURL url.URL, maxConcurrency, maxURLs int) {
worklist := make(chan []Packet)
@@ -43,6 +51,9 @@ func workers(startURL url.URL, maxConcurrency, maxURLs int) {
wg.Go(func() {
for p := range packets {
batch := getBatch(p.url)
+
+ // Convert URLs to Packets. In the
+ // process, bump up the depth by 1.
var ps []Packet
for _, u := range batch {
@@ -66,11 +77,16 @@ func workers(startURL url.URL, maxConcurrency, maxURLs int) {
// The main goroutine deduplicates worklist items and sends
// unseen ones to the crawlers in a fan-out fashion.
seen := make(map[url.URL]int)
+
+ // Used to prettify the running URL listing.
count := 1
loop:
for batch := range worklist {
for _, p := range batch {
+ // We're tracking _depth_ with the seen-map
+ // now, so any unseen URL doesn't have any
+ // depth-entry registered yet.
if _, ok := seen[p.url]; !ok {
fmt.Printf("%d. %s\n", count, p)
count++
@@ -86,8 +102,11 @@ loop:
}
}
+ // We're done writing to the packets channel, so close it.
close(packets)
+ // There are some in-flight workers as of this point, so
+ // signal a cancel to them.
cancel()
wg.Wait()
}