diff options
| author | demo <demo@antix1> | 2026-05-26 22:22:05 -0400 |
|---|---|---|
| committer | demo <demo@antix1> | 2026-05-26 22:22:05 -0400 |
| commit | 99111bcfd5b81ca51f102b35efe8e4be0e6d390e (patch) | |
| tree | 4ac4472632578f2611c7d6be68216b3cccf6bc6b | |
| parent | 5898a16cf4b9dc2f0eef2bf81d429b5aca7005fc (diff) | |
feat: measure the depth where each URL is found
| -rw-r--r-- | workers.go | 43 |
1 files changed, 30 insertions, 13 deletions
@@ -14,14 +14,24 @@ import ( - manages urls channel. */ +type Packet struct { + url url.URL + depth int +} + +func (p Packet) String() string { + return fmt.Sprintf("[%d] %s", p.depth, &p.url) +} + func workers(startURL url.URL, maxConcurrency, maxURLs int) { - worklist := make(chan []url.URL) + worklist := make(chan []Packet) // Unseen URLs. - urls := make(chan url.URL) + packets := make(chan Packet) go func() { - worklist <- []url.URL{startURL} + startPacket := Packet{startURL, 0} + worklist <- []Packet{startPacket} }() var wg sync.WaitGroup @@ -31,14 +41,21 @@ func workers(startURL url.URL, maxConcurrency, maxURLs int) { // the urls channel (unseen links.) for i := range maxConcurrency { wg.Go(func() { - for u := range urls { - batch := getBatch(u) + for p := range packets { + batch := getBatch(p.url) + var ps []Packet + + for _, u := range batch { + newPacket := Packet{u, p.depth + 1} + ps = append(ps, newPacket) + } + select { case <-ctx.Done(): fmt.Printf("exiting early %d\n", i+1) return default: - go func() { worklist <- batch }() + go func() { worklist <- ps }() } } @@ -48,28 +65,28 @@ func workers(startURL url.URL, maxConcurrency, maxURLs int) { // The main goroutine deduplicates worklist items and sends // unseen ones to the crawlers in a fan-out fashion. - seen := make(map[url.URL]bool) + seen := make(map[url.URL]int) count := 1 loop: for batch := range worklist { - for _, u := range batch { - if !seen[u] { - fmt.Printf("%d. %s\n", count, &u) + for _, p := range batch { + if _, ok := seen[p.url]; !ok { + fmt.Printf("%d. %s\n", count, p) count++ - seen[u] = true + seen[p.url] = p.depth if len(seen) == maxURLs { break loop } - urls <- u + packets <- p } } } - close(urls) + close(packets) cancel() wg.Wait() |
