package main

import (
	"log"
	"net/url"
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// findURLs collects the href targets of every anchor (<a>) element found
// beneath htmlDoc.
//
// Each href value is resolved against refURL, so relative links come back
// as absolute URLs. A link is kept only when its hostname equals refURL's
// hostname; the comparison ignores the port and letter case, since
// hostnames are case-insensitive. Links to any other host are dropped so
// the caller does not crawl outside the site refURL belongs to.
//
// href values that cannot be parsed are logged and skipped. The result is
// nil when htmlDoc is nil or when no matching link is found.
func findURLs(refURL url.URL, htmlDoc *html.Node) []url.URL {
	// Descendants dereferences its receiver, so an absent document must be
	// handled before iterating.
	if htmlDoc == nil {
		return nil
	}

	// The reference host never changes; compute it once rather than per link.
	refHost := refURL.Hostname()

	// Used for collecting all the URLs we find.
	var rawURLs []url.URL

	for node := range htmlDoc.Descendants() {
		// Only anchor elements carry the links we are interested in.
		if node.Type != html.ElementNode || node.DataAtom != atom.A {
			continue
		}

		for _, attr := range node.Attr {
			if attr.Key != "href" {
				continue
			}

			subURL, err := refURL.Parse(attr.Val)
			if err != nil {
				log.Printf("bad URL: %s", attr.Val)
				continue
			}

			// Filter external URLs from the final result. url.Parse does
			// not normalise host case, so compare case-insensitively to
			// avoid discarding same-site links such as "Example.COM".
			if !strings.EqualFold(subURL.Hostname(), refHost) {
				continue
			}

			rawURLs = append(rawURLs, *subURL)
		}
	}

	return rawURLs
}