summary refs log tree commit diff
diff options
context:
space:
mode:
author demo <demo@antix1> 2026-05-26 22:22:05 -0400
committer demo <demo@antix1> 2026-05-26 22:22:05 -0400
commit 99111bcfd5b81ca51f102b35efe8e4be0e6d390e (patch)
tree 4ac4472632578f2611c7d6be68216b3cccf6bc6b
parent 5898a16cf4b9dc2f0eef2bf81d429b5aca7005fc (diff)
feat: measure the depth where each URL is found
-rw-r--r-- workers.go 43
1 files changed, 30 insertions, 13 deletions
diff --git a/workers.go b/workers.go
index 966f330..845e947 100644
--- a/workers.go
+++ b/workers.go
@@ -14,14 +14,24 @@ import (
- manages urls channel.
*/
+type Packet struct {
+ url url.URL
+ depth int
+}
+
+func (p Packet) String() string {
+ return fmt.Sprintf("[%d] %s", p.depth, &p.url)
+}
+
func workers(startURL url.URL, maxConcurrency, maxURLs int) {
- worklist := make(chan []url.URL)
+ worklist := make(chan []Packet)
// Unseen URLs.
- urls := make(chan url.URL)
+ packets := make(chan Packet)
go func() {
- worklist <- []url.URL{startURL}
+ startPacket := Packet{startURL, 0}
+ worklist <- []Packet{startPacket}
}()
var wg sync.WaitGroup
@@ -31,14 +41,21 @@ func workers(startURL url.URL, maxConcurrency, maxURLs int) {
// the urls channel (unseen links.)
for i := range maxConcurrency {
wg.Go(func() {
- for u := range urls {
- batch := getBatch(u)
+ for p := range packets {
+ batch := getBatch(p.url)
+ var ps []Packet
+
+ for _, u := range batch {
+ newPacket := Packet{u, p.depth + 1}
+ ps = append(ps, newPacket)
+ }
+
select {
case <-ctx.Done():
fmt.Printf("exiting early %d\n", i+1)
return
default:
- go func() { worklist <- batch }()
+ go func() { worklist <- ps }()
}
}
@@ -48,28 +65,28 @@ func workers(startURL url.URL, maxConcurrency, maxURLs int) {
// The main goroutine deduplicates worklist items and sends
// unseen ones to the crawlers in a fan-out fashion.
- seen := make(map[url.URL]bool)
+ seen := make(map[url.URL]int)
count := 1
loop:
for batch := range worklist {
- for _, u := range batch {
- if !seen[u] {
- fmt.Printf("%d. %s\n", count, &u)
+ for _, p := range batch {
+ if _, ok := seen[p.url]; !ok {
+ fmt.Printf("%d. %s\n", count, p)
count++
- seen[u] = true
+ seen[p.url] = p.depth
if len(seen) == maxURLs {
break loop
}
- urls <- u
+ packets <- p
}
}
}
- close(urls)
+ close(packets)
cancel()
wg.Wait()