diff options
| author | demo <demo@antix1> | 2026-05-26 21:11:01 -0400 |
|---|---|---|
| committer | demo <demo@antix1> | 2026-05-26 21:11:01 -0400 |
| commit | 1e7d34bb14057c79ea1f7fa75e6ecf29951b69aa (patch) | |
| tree | 46981d34474dce2b7aef1f8cd648a245810e8e81 /workers.go | |
| parent | 375efff4cf21834a32e01a1a9d3f470a86a46cd2 (diff) | |
feat: add the worker-pool-based crawler from TGPL
Diffstat (limited to 'workers.go')
| -rw-r--r-- | workers.go | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/workers.go b/workers.go new file mode 100644 index 0000000..ab99da8 --- /dev/null +++ b/workers.go @@ -0,0 +1,56 @@ +package main + +import ( + "fmt" + "net/url" + "sync" +) + +/* + + + main goroutine: + - manages urls channel. +*/ + +func workers(startURL url.URL, maxConcurrency, maxURLs int) { + worklist := make(chan []url.URL) + + // Unseen URLs. + urls := make(chan url.URL) + + go func() { + worklist <- []url.URL{startURL} + }() + + var wg sync.WaitGroup + // Create maxConcurrency worker goroutines to demultiplex from + // the urls channel (unseen links.) + for range maxConcurrency { + wg.Go(func() { + for u := range urls { + batch := getBatch(u) + go func() { worklist <- batch }() + } + }) + } + + // The main goroutine deduplicates worklist items and sends + // unseen ones to the crawlers in a fan-out fashion. + seen := make(map[url.URL]bool) + count := 1 + + for batch := range worklist { + for _, u := range batch { + if !seen[u] { + fmt.Printf("%d. %s\n", count, &u) + count++ + + seen[u] = true + urls <- u + } + } + } + + wg.Wait() +} |
