summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordemo <demo@antix1>2026-05-26 21:11:01 -0400
committerdemo <demo@antix1>2026-05-26 21:11:01 -0400
commit1e7d34bb14057c79ea1f7fa75e6ecf29951b69aa (patch)
tree46981d34474dce2b7aef1f8cd648a245810e8e81
parent375efff4cf21834a32e01a1a9d3f470a86a46cd2 (diff)
feat: add the worker-pool-based crawler from TGPL
-rw-r--r--main.go2
-rw-r--r--workers.go56
2 files changed, 57 insertions, 1 deletions
diff --git a/main.go b/main.go
index bd30a4c..900cb1f 100644
--- a/main.go
+++ b/main.go
@@ -41,7 +41,7 @@ func main() {
}
getLeakProfile(func() {
- classic(*startURL, *maxConcurrency, *maxURLs)
+ workers(*startURL, *maxConcurrency, *maxURLs)
})
}
diff --git a/workers.go b/workers.go
new file mode 100644
index 0000000..ab99da8
--- /dev/null
+++ b/workers.go
@@ -0,0 +1,56 @@
+package main
+
+import (
+ "fmt"
+ "net/url"
+ "sync"
+)
+
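+/*
+	worker goroutines:
+	- receive unseen URLs from the urls channel and send each getBatch result to worklist.
+	main goroutine:
+	- deduplicates worklist batches and sends unseen URLs on the urls channel.
+*/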
+
+// workers crawls from startURL with maxConcurrency worker goroutines and
+// returns once every dispatched URL has reported its batch. At most maxURLs
+// URLs are dispatched (<= 0: no limit; NOTE(review): confirm vs classic()).
+func workers(startURL url.URL, maxConcurrency, maxURLs int) {
+	worklist := make(chan []url.URL) // batches of candidate URLs
+	urls := make(chan url.URL)       // unseen URLs awaiting a worker
+	go func() { worklist <- []url.URL{startURL} }()
+
+	// With no workers the first send on urls would block forever.
+	maxConcurrency = max(maxConcurrency, 1)
+	var wg sync.WaitGroup
+	for range maxConcurrency {
+		wg.Go(func() {
+			for u := range urls {
+				batch := getBatch(u)
+				// Send from a new goroutine so the worker never blocks on main.
+				go func() { worklist <- batch }()
+			}
+		})
+	}
+
+	// The main goroutine deduplicates worklist items and fans unseen ones out
+	// to the workers. pending counts batches still owed on worklist: one for
+	// startURL plus one per URL sent on urls, so the loop ends when none remain.
+	seen := make(map[url.URL]bool)
+	count := 1
+	for pending := 1; pending > 0; pending-- {
+		for _, u := range <-worklist {
+			if seen[u] || (maxURLs > 0 && count > maxURLs) {
+				continue
+			}
+			fmt.Printf("%d. %s\n", count, &u)
+			count++
+			seen[u] = true
+			pending++
+			urls <- u
+		}
+	}
+	close(urls) // lets each worker's range loop finish
+	wg.Wait()
+}