summary | refs | log | tree | commit | diff
path: root/classic.go
diff options
context:
space:
mode:
authordemo <demo@antix1>2026-05-27 12:05:45 -0400
committerdemo <demo@antix1>2026-05-27 12:05:45 -0400
commit 7852eae1e9653d9a62ff3c82b8fac2954ee1944f (patch)
tree 212a6f043a33d4662e358c2a3f56c4e484862f7c /classic.go
parent 81ee780aca0aa0d9ce6999a23ef94e986307e060 (diff)
feat: update the classic crawler to track depth via packets
Diffstat (limited to 'classic.go')
-rw-r--r-- classic.go 23
1 files changed, 14 insertions, 9 deletions
diff --git a/classic.go b/classic.go
index 0b2bfa8..bd689fc 100644
--- a/classic.go
+++ b/classic.go
@@ -9,16 +9,18 @@ import (
)
func classic(startURL url.URL, maxConcurrency, maxURLs int) {
- worklist := make(chan []url.URL)
+ worklist := make(chan []packet)
var numPendingSends int
numPendingSends++
go func() {
- worklist <- []url.URL{startURL}
+ startPacket := packet{startURL, 0}
+ worklist <- []packet{startPacket}
}()
- // Crawl the web concurrently.
- seen := make(map[url.URL]bool)
+ // Crawl the web concurrently. Map each URL to its depth, i.e.,
+ // how many links we had to follow from the start URL to reach it.
+ seen := make(map[url.URL]int)
count := 1
ctx, cancel := context.WithCancel(context.Background())
@@ -28,12 +30,12 @@ func classic(startURL url.URL, maxConcurrency, maxURLs int) {
loop:
for ; numPendingSends > 0; numPendingSends-- {
batch := <-worklist
- for _, u := range batch {
- if !seen[u] {
- fmt.Printf("%d. %s\n", count, &u)
+ for _, p := range batch {
+ if _, ok := seen[p.url]; !ok {
+ fmt.Printf("%d. %s\n", count, &p)
count++
- seen[u] = true
+ seen[p.url] = p.depth
if len(seen) == maxURLs {
break loop
}
@@ -46,7 +48,10 @@ loop:
select {
case <-ctx.Done():
return
- case worklist <- getBatch(u):
+ default:
+ batch := getBatch(p.url)
+ ps := convertToPackets(batch, p.depth+1)
+ worklist <- ps
}
})
}