From 99111bcfd5b81ca51f102b35efe8e4be0e6d390e Mon Sep 17 00:00:00 2001 From: demo Date: Tue, 26 May 2026 22:22:05 -0400 Subject: feat: measure the depth where each URL is found --- workers.go | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/workers.go b/workers.go index 966f330..845e947 100644 --- a/workers.go +++ b/workers.go @@ -14,14 +14,24 @@ import ( - manages urls channel. */ +type Packet struct { + url url.URL + depth int +} + +func (p Packet) String() string { + return fmt.Sprintf("[%d] %s", p.depth, &p.url) +} + func workers(startURL url.URL, maxConcurrency, maxURLs int) { - worklist := make(chan []url.URL) + worklist := make(chan []Packet) // Unseen URLs. - urls := make(chan url.URL) + packets := make(chan Packet) go func() { - worklist <- []url.URL{startURL} + startPacket := Packet{startURL, 0} + worklist <- []Packet{startPacket} }() var wg sync.WaitGroup @@ -31,14 +41,21 @@ func workers(startURL url.URL, maxConcurrency, maxURLs int) { // the urls channel (unseen links.) for i := range maxConcurrency { wg.Go(func() { - for u := range urls { - batch := getBatch(u) + for p := range packets { + batch := getBatch(p.url) + var ps []Packet + + for _, u := range batch { + newPacket := Packet{u, p.depth + 1} + ps = append(ps, newPacket) + } + select { case <-ctx.Done(): fmt.Printf("exiting early %d\n", i+1) return default: - go func() { worklist <- batch }() + go func() { worklist <- ps }() } } @@ -48,28 +65,28 @@ func workers(startURL url.URL, maxConcurrency, maxURLs int) { // The main goroutine deduplicates worklist items and sends // unseen ones to the crawlers in a fan-out fashion. - seen := make(map[url.URL]bool) + seen := make(map[url.URL]int) count := 1 loop: for batch := range worklist { - for _, u := range batch { - if !seen[u] { - fmt.Printf("%d. %s\n", count, &u) + for _, p := range batch { + if _, ok := seen[p.url]; !ok { + fmt.Printf("%d. %s\n", count, p) count++ - seen[u] = true + seen[p.url] = p.depth if len(seen) == maxURLs { break loop } - urls <- u + packets <- p } } } - close(urls) + close(packets) cancel() wg.Wait() -- cgit v1.2.3