diff options
| author | demo <demo@antix1> | 2026-05-23 10:12:47 -0400 |
|---|---|---|
| committer | demo <demo@antix1> | 2026-05-23 10:12:47 -0400 |
| commit | 8921d502c306fa939fba904479fdbd659a39f6fc (patch) | |
| tree | 2ca77b0a31c5c4c81d6fb9c8025c857f82d8de04 /main.go | |
| parent | ee3b576ba30177f89bcb558fb588b3380caf1f95 (diff) | |
feat: implement simple BFS webcrawler
Diffstat (limited to 'main.go')
| -rw-r--r-- | main.go | 52 |
1 files changed, 52 insertions, 0 deletions
@@ -1,5 +1,57 @@ package main +import ( + "flag" + "fmt" + "log" + "net/url" +) + func main() { + maxConcurrency := flag.Int("c", 0, "Maximum number of concurrent queue pushes") + startRawURL := flag.String("url", "", "Entry-point URL") + flag.Parse() + + if *maxConcurrency == 0 { + log.Fatal("Missing -c argument") + } + + if *startRawURL == "" { + log.Fatal("Missing -url argument") + } + + startURL, err := url.Parse(*startRawURL) + if err != nil { + log.Fatal(err) + } + + worklist := make(chan []url.URL) + go func() { + worklist <- []url.URL{*startURL} + }() + + seen := make(map[url.URL]bool) + for list := range worklist { + for _, u := range list { + if !seen[u] { + fmt.Printf("%s\n", &u) + seen[u] = true + + go func() { + more := crawl(u) + worklist <- more + }() + } + } + } +} + +func crawl(u url.URL) []url.URL { + doc, err := fetch(u) + if err != nil { + log.Print(err) + return nil + } + return findURLs(u, doc) } |
