package main import ( "flag" "fmt" "io" "log" "net/http" "time" "golang.org/x/net/html" "golang.org/x/net/html/atom" ) type Link struct { Href string Text string } func main() { // Logging configuration. log.SetFlags(log.LstdFlags | log.Lshortfile) // CLI flag configuration. rawURL := flag.String("url", "", "Web address of target HTML") timeoutSecs := flag.Int("timeout", 2, "Number of seconds after which to time out") flag.Parse() if *rawURL == "" { log.Fatal("Missing -url") } // Configure the request. timeout := time.Duration(*timeoutSecs) * time.Second client := http.Client{ Timeout: timeout, } req, err := http.NewRequest(http.MethodGet, *rawURL, nil) if err != nil { log.Fatal(err) } // Perform the request. resp, err := client.Do(req) if err != nil { log.Fatal(err) } defer resp.Body.Close() links, err := findLinks(resp.Body) if err != nil { log.Fatal(err) } fmt.Println(links) } // findLinks consumes the given [io.Reader], scraping it of anchor // tags. Each anchor tag is "unmarshalled" into a [Link]. The // resulting slice of Links is returned, along with an error. func findLinks(r io.Reader) ([]Link, error) { doc, err := html.Parse(r) if err != nil { return nil, fmt.Errorf("can't parse html reader: %w", err) } var links []Link for n := range doc.Descendants() { if n.Type == html.ElementNode && n.DataAtom == atom.A { var link Link // Scan the href. for _, a := range n.Attr { if a.Key == "href" { link.Href = a.Val } } // FIXME: for now, only scan for hrefs. links = append(links, link) } } return links, nil }