From 8921d502c306fa939fba904479fdbd659a39f6fc Mon Sep 17 00:00:00 2001 From: demo Date: Sat, 23 May 2026 10:12:47 -0400 Subject: feat: implement simple BFS webcrawler --- main.go | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'main.go') diff --git a/main.go b/main.go index 7905807..f4b9a91 100644 --- a/main.go +++ b/main.go @@ -1,5 +1,57 @@ package main +import ( + "flag" + "fmt" + "log" + "net/url" +) + func main() { + maxConcurrency := flag.Int("c", 0, "Maximum number of concurrent queue pushes") + startRawURL := flag.String("url", "", "Entry-point URL") + flag.Parse() + + if *maxConcurrency == 0 { + log.Fatal("Missing -c argument") + } + + if *startRawURL == "" { + log.Fatal("Missing -url argument") + } + + startURL, err := url.Parse(*startRawURL) + if err != nil { + log.Fatal(err) + } + + worklist := make(chan []url.URL) + go func() { + worklist <- []url.URL{*startURL} + }() + + seen := make(map[url.URL]bool) + for list := range worklist { + for _, u := range list { + if !seen[u] { + fmt.Printf("%s\n", &u) + seen[u] = true + + go func() { + more := crawl(u) + worklist <- more + }() + } + } + } +} + +func crawl(u url.URL) []url.URL { + doc, err := fetch(u) + if err != nil { + log.Print(err) + return nil + } + return findURLs(u, doc) } -- cgit v1.2.3