// Package links extracts the hyperlinks found in an HTML document.
package links

import (
	"fmt"
	"io"
	"log"
	"net/url"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// Parse reads an HTML document from htmlInput and returns the link
// addresses of its <a> elements whose host matches that of baseURL.
// Every returned address is resolved against baseURL, so relative
// links come back as absolute URLs.
//
// Links that do not parse as URLs, or that point to a different host,
// are logged and skipped. A non-nil error is returned only when the
// document itself cannot be parsed. baseURL must not be nil.
func Parse(htmlInput io.Reader, baseURL *url.URL) ([]string, error) {
	hrefs, err := parse(htmlInput)
	if err != nil {
		return nil, fmt.Errorf("can't parse: %w", err)
	}
	return filterByBaseURL(baseURL, hrefs), nil
}

// parse finds the links inside htmlInput and returns them as a slice
// of strings, along with an error.
//
// The only possible error should be the one returned from calling
// [html.Parse].
func parse(htmlInput io.Reader) ([]string, error) {
	doc, err := html.Parse(htmlInput)
	if err != nil {
		return nil, err
	}
	return findHrefs(doc), nil
}

// findHrefs returns all link addresses inside doc. It collects each
// one exactly as it appears in the document, without resolving it
// with respect to some base URL.
func findHrefs(doc *html.Node) []string {
	var hrefs []string
	for node := range doc.Descendants() {
		// Only <a> elements carry the links we are interested in.
		if node.Type != html.ElementNode || node.DataAtom != atom.A {
			continue
		}
		for _, attr := range node.Attr {
			if attr.Key == "href" {
				hrefs = append(hrefs, attr.Val)
			}
		}
	}
	return hrefs
}

// filterByBaseURL returns the web addresses in hrefs that are under
// baseURL, each one resolved with respect to baseURL (so a relative
// href such as "/about" is returned as an absolute URL).
//
// An href is dropped, and the reason logged, when it does not parse
// as a URL or when its hostname differs from that of baseURL.
//
// The motivation is that, when crawling pages to build a sitemap, the
// crawl should never leave the top-level Web domain those pages
// belong to.
func filterByBaseURL(baseURL *url.URL, hrefs []string) []string {
	var neighbors []string
	for _, href := range hrefs {
		// Check that href parses as a URL, and at the same
		// time resolve it with respect to baseURL.
		u, err := baseURL.Parse(href)
		if err != nil {
			// u is nil on failure, so report the raw href instead.
			log.Printf("invalid href %q: %v", href, err)
			continue
		}
		// If href is a valid absolute URL, it will parse
		// successfully, so we need to check hostnames.
		if u.Hostname() != baseURL.Hostname() {
			log.Printf("different hostnames: %s %s", u, baseURL)
			continue
		}
		// Keep the resolved form: callers need an address they can
		// fetch, not a path that only makes sense next to baseURL.
		neighbors = append(neighbors, u.String())
	}
	return neighbors
}