From 1e7d34bb14057c79ea1f7fa75e6ecf29951b69aa Mon Sep 17 00:00:00 2001 From: demo Date: Tue, 26 May 2026 21:11:01 -0400 Subject: feat: add the worker-pool-based crawer from TGPL --- workers.go | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 workers.go (limited to 'workers.go') diff --git a/workers.go b/workers.go new file mode 100644 index 0000000..ab99da8 --- /dev/null +++ b/workers.go @@ -0,0 +1,56 @@ +package main + +import ( + "fmt" + "net/url" + "sync" +) + +/* + + + main goroutine: + - manages urls channel. +*/ + +func workers(startURL url.URL, maxConcurrency, maxURLs int) { + worklist := make(chan []url.URL) + + // Unseen URLs. + urls := make(chan url.URL) + + go func() { + worklist <- []url.URL{startURL} + }() + + var wg sync.WaitGroup + // Create maxConcurrency worker goroutines to demultiplex from + // the urls channel (unseen links.) + for range maxConcurrency { + wg.Go(func() { + for u := range urls { + batch := getBatch(u) + go func() { worklist <- batch }() + } + }) + } + + // The main goroutine deduplicates worklist items and sends + // unseen ones to the crawlers in a fan-out fashion. + seen := make(map[url.URL]bool) + count := 1 + + for batch := range worklist { + for _, u := range batch { + if !seen[u] { + fmt.Printf("%d. %s\n", count, &u) + count++ + + seen[u] = true + urls <- u + } + } + } + + wg.Wait() +} -- cgit v1.2.3