From 7852eae1e9653d9a62ff3c82b8fac2954ee1944f Mon Sep 17 00:00:00 2001 From: demo Date: Wed, 27 May 2026 12:05:45 -0400 Subject: feat: update the classic crawler to track depth via packets --- classic.go | 23 ++++++++++++++--------- main.go | 2 +- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/classic.go b/classic.go index 0b2bfa8..bd689fc 100644 --- a/classic.go +++ b/classic.go @@ -9,16 +9,18 @@ import ( ) func classic(startURL url.URL, maxConcurrency, maxURLs int) { - worklist := make(chan []url.URL) + worklist := make(chan []packet) var numPendingSends int numPendingSends++ go func() { - worklist <- []url.URL{startURL} + startPacket := packet{startURL, 0} + worklist <- []packet{startPacket} }() - // Crawl the web concurrently. - seen := make(map[url.URL]bool) + // Crawl the web concurrently. Map URLs to their depth (i.e., + // how many links we have to work through to find the URL). + seen := make(map[url.URL]int) count := 1 ctx, cancel := context.WithCancel(context.Background()) @@ -28,12 +30,12 @@ func classic(startURL url.URL, maxConcurrency, maxURLs int) { loop: for ; numPendingSends > 0; numPendingSends-- { batch := <-worklist - for _, u := range batch { - if !seen[u] { - fmt.Printf("%d. %s\n", count, &u) + for _, p := range batch { + if _, ok := seen[p.url]; !ok { + fmt.Printf("%d. %s\n", count, &p) count++ - seen[u] = true + seen[p.url] = p.depth if len(seen) == maxURLs { break loop } @@ -46,7 +48,10 @@ loop: select { case <-ctx.Done(): return - case worklist <- getBatch(u): + default: + batch := getBatch(p.url) + ps := convertToPackets(batch, p.depth+1) + worklist <- ps } }) } diff --git a/main.go b/main.go index 900cb1f..bd30a4c 100644 --- a/main.go +++ b/main.go @@ -41,7 +41,7 @@ func main() { } getLeakProfile(func() { - workers(*startURL, *maxConcurrency, *maxURLs) + classic(*startURL, *maxConcurrency, *maxURLs) }) } -- cgit v1.2.3