// URLs implements a breadth-first search webcrawler based on the // example given in section 8.6 of The Go Programming Language. package main import ( "flag" "fmt" "log" "net/url" "runtime/pprof" "strings" "time" ) func main() { // Setting shorfile helps especially for when we log errors // without returning them. log.SetFlags(log.LstdFlags | log.Lshortfile) maxConcurrency := flag.Int("c", 0, "Maximum number of concurrent queue pushes") urlArg := flag.String("url", "", "Entry-point URL") maxURLs := flag.Int("max", 0, "Maximum number of URLs to collect (omitted or 0 means no limit)") maxDepth := flag.Int("depth", 0, "Maximum URL depth (omitted or 0 means no limit)") shortcode := flag.String("shortcode", "", "URL shortcode") shortcodeFilename := flag.String("scfile", "urls.csv", "Shortcode CSV file") flag.Parse() // Vet the given CLI arguments for things like negative or // missing values. if *maxConcurrency == 0 { log.Fatal("Missing -c argument") } if *maxConcurrency < 1 { log.Fatalf("Invalid -c argument: %d", *maxConcurrency) } startRawURL, err := chooseFrom(*urlArg, *shortcode, *shortcodeFilename) if err != nil { log.Fatal(err) } if *maxURLs < 0 { log.Fatalf("Invalid -max argument: %d", *maxURLs) } if *maxDepth < 0 { log.Fatalf("Invalid -depth argument: %d", *maxDepth) } startURL, err := convertToURL(startRawURL) if err != nil { log.Fatal(err) } // Our web crawlers use concurrency: check if any goroutines // have leaked. getLeakProfile(func() { classic(startURL, *maxConcurrency, *maxURLs, *maxDepth) }) } // chooseFrom determines whether to use a -url or -shortcode // argument. If both are present or absent, an error is returned. If // exactly one is present, return that one. func chooseFrom(urlArg, shortcode, shortcodeFilename string) (string, error) { urlAbsent := (urlArg == "") shortcodeAbsent := (shortcode == "") if urlAbsent == shortcodeAbsent { messageSlug := "present" if urlAbsent { messageSlug = "missing; need exactly one" } return "", fmt.Errorf("-url and -shortcode flags both %s", messageSlug) } if urlAbsent { return getURLFromShortcode(shortcodeFilename, shortcode) } return urlArg, nil } // convertToURL parses the given rawURL into a [url.URL]. If the // rawURL is missing a scheme, "https://" is prepended before parsing. // // Return the parsed URL, along with any error. func convertToURL(rawURL string) (url.URL, error) { if !strings.HasPrefix(rawURL, "http://") && !strings.HasPrefix(rawURL, "https://") { rawURL = "https://" + rawURL fmt.Printf("start url: %s\n", rawURL) } u, err := url.Parse(rawURL) if err != nil { return url.URL{}, fmt.Errorf("can't parse %s: %w", rawURL, err) } return *u, nil } // getLeakProfile runs a leaky program snippet, extracts the goroutine leak profile, // and writes it to stdout. func getLeakProfile(leakySnippet func()) { prof := pprof.Lookup("goroutineleak") defer func() { time.Sleep(2 * time.Second) var content strings.Builder prof.WriteTo(&content, 2) // Ignore non leaked goroutines leaks := strings.SplitSeq(content.String(), "\n\n") for leak := range leaks { if strings.Contains(leak, "(leaked)") { fmt.Println(leak + "\n") } } }() leakySnippet() }