summaryrefslogtreecommitdiff
path: root/findurls.go
diff options
context:
space:
mode:
Diffstat (limited to 'findurls.go')
-rw-r--r--findurls.go44
1 files changed, 44 insertions, 0 deletions
diff --git a/findurls.go b/findurls.go
new file mode 100644
index 0000000..6bc95d7
--- /dev/null
+++ b/findurls.go
@@ -0,0 +1,44 @@
+package main
+
+import (
+ "log"
+ "net/url"
+
+ "golang.org/x/net/html"
+ "golang.org/x/net/html/atom"
+)
+
+// findURLs walks htmlDoc and returns the href target of every <a> element,
+// resolved against refURL into an absolute URL. Links whose hostname differs
+// from refURL's are dropped, and hrefs that fail to parse are logged and
+// skipped. A nil htmlDoc yields a nil slice instead of panicking.
+func findURLs(refURL url.URL, htmlDoc *html.Node) []url.URL {
+	if htmlDoc == nil {
+		return nil
+	}
+
+	var rawURLs []url.URL
+
+	for node := range htmlDoc.Descendants() {
+		// Only anchor elements carry the links we are after.
+		if node.Type != html.ElementNode || node.DataAtom != atom.A {
+			continue
+		}
+		for _, attr := range node.Attr {
+			if attr.Key != "href" {
+				continue
+			}
+			subURL, err := refURL.Parse(attr.Val)
+			if err != nil {
+				// The parse error names the offending input and the reason.
+				log.Printf("bad URL: %v", err)
+				continue
+			}
+			// Keep same-host links only; this comparison is case-sensitive.
+			if subURL.Hostname() == refURL.Hostname() {
+				rawURLs = append(rawURLs, *subURL)
+			}
+		}
+	}
+	return rawURLs
+}