// URLs implements a breadth-first search webcrawler based on the // example given in section 8.6 of The Go Programming Language. package main import ( "context" "flag" "fmt" "log" "net/url" ) func main() { maxConcurrency := flag.Int("c", 0, "Maximum number of concurrent queue pushes") startRawURL := flag.String("url", "", "Entry-point URL") maxURLs := flag.Int("max", 0, "Maximum number of URLs to collect (omitted or 0 means no limit)") flag.Parse() if *maxConcurrency == 0 { log.Fatal("Missing -c argument") } if *maxConcurrency < 1 { log.Fatalf("Invalid -c argument: %d", *maxConcurrency) } if *startRawURL == "" { log.Fatal("Missing -url argument") } if *maxURLs < 0 { log.Fatalf("Invalid -max argument: %d", *maxURLs) } startURL, err := url.Parse(*startRawURL) if err != nil { log.Fatal(err) } crawler(*startURL, *maxConcurrency, *maxURLs) } func crawler(startURL url.URL, maxConcurrency, maxURLs int) { worklist := make(chan []url.URL) go func() { worklist <- []url.URL{startURL} }() sem := make(chan struct{}, maxConcurrency) ctx, cancel := context.WithCancel(context.Background()) seen := make(map[url.URL]bool) i := 1 // FIXME: unfortunately, this example leaks, but I don't know // how to fix that yet. loop: for list := range worklist { for _, u := range list { if maxURLs > 0 && len(seen) == maxURLs { break loop } if !seen[u] { fmt.Printf("%d. %s\n", i, &u) i++ seen[u] = true go func() { sem <- struct{}{} defer func() { <-sem }() more := getMoreURLs(ctx, u) if len(more) > 0 { worklist <- more } }() } } } // We broke the range loop, meaning there should be no more // pending getMoreURLs jobs anyway. cancel() // For now, print out some diagnostics that prove that there // are still pending sends on the worklist channel. for batch := range worklist { fmt.Printf("%d\n", len(batch)) for _, u := range batch { fmt.Printf("-- %s\n", &u) } } } func getMoreURLs(ctx context.Context, u url.URL) []url.URL { select { case <-ctx.Done(): return nil default: } doc, err := fetch(u) if err != nil { log.Print(err) return nil } return findURLs(u, doc) }