diff options
Diffstat (limited to 'findurls.go')
| -rw-r--r-- | findurls.go | 44 |
1 files changed, 44 insertions, 0 deletions
diff --git a/findurls.go b/findurls.go new file mode 100644 index 0000000..6bc95d7 --- /dev/null +++ b/findurls.go @@ -0,0 +1,44 @@ +package main + +import ( + "log" + "net/url" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +// findURLs combs htmlDoc for anchor-tag URLs. The refURL parameter is +// used to resolve each found URL into an absolute URL. +func findURLs(refURL url.URL, htmlDoc *html.Node) []url.URL { + // Used for collecting all the URLs we find. + var rawURLs []url.URL + + for node := range htmlDoc.Descendants() { + if node.Type == html.ElementNode { + switch node.DataAtom { + case atom.A: + for _, attr := range node.Attr { + if attr.Key == "href" { + subURL, err := refURL.Parse(attr.Val) + if err != nil { + log.Printf("bad URL: %s", attr.Val) + continue + } + + // Filter external + // URLs from the final + // result. + if subURL.Hostname() != refURL.Hostname() { + continue + } + + rawURLs = append(rawURLs, *subURL) + } + } + } + } + } + + return rawURLs +} |
