summary | refs | log | tree | commit | diff
path: root/findurls.go
diff options
context:
space:
mode:
author demo <demo@antix1> 2026-05-21 18:13:18 -0400
committer demo <demo@antix1> 2026-05-21 18:13:18 -0400
commit c43b8afcd63446021334669f5a246cfb0b637fd3 (patch)
tree 161b10faeaa7675d9fd2f558daaec5a16f3de018 /findurls.go
parent c032bd8d32130a2dbc1a7481dd501f2c182707cc (diff)
chore: add existing code to project
I also ran `go mod tidy`, since the new files refer to packages I've recently installed (locally).
Diffstat (limited to 'findurls.go')
-rw-r--r-- findurls.go 44
1 files changed, 44 insertions, 0 deletions
diff --git a/findurls.go b/findurls.go
new file mode 100644
index 0000000..6bc95d7
--- /dev/null
+++ b/findurls.go
@@ -0,0 +1,44 @@
+package main
+
+import (
+	"log"
+	"net/url"
+	"strings"
+
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/atom"
+)
+
+// findURLs walks every descendant of htmlDoc and collects the href targets
+// of anchor (<a>) elements that point at the same host as refURL.
+//
+// Each href is resolved against refURL, so relative links come back as
+// absolute URLs. Values that cannot be parsed are logged and skipped, and
+// links whose hostname differs from refURL's are dropped. Hostnames are
+// compared case-insensitively and without the port. The result is nil when
+// no matching link is found.
+func findURLs(refURL url.URL, htmlDoc *html.Node) []url.URL {
+	// The reference hostname never changes, so look it up once.
+	refHost := refURL.Hostname()
+
+	// Used for collecting all the URLs we find.
+	var rawURLs []url.URL
+
+	for node := range htmlDoc.Descendants() {
+		if node.Type != html.ElementNode || node.DataAtom != atom.A {
+			continue
+		}
+
+		for _, attr := range node.Attr {
+			if attr.Key != "href" {
+				continue
+			}
+
+			subURL, err := refURL.Parse(attr.Val)
+			if err != nil {
+				// Report why the value was rejected, not just the value.
+				log.Printf("bad URL %q: %v", attr.Val, err)
+				continue
+			}
+
+			// Filter external URLs from the final result. Host names
+			// are case-insensitive (RFC 3986, section 3.2.2), so
+			// "Example.com" and "example.com" are the same site.
+			if !strings.EqualFold(subURL.Hostname(), refHost) {
+				continue
+			}
+
+			rawURLs = append(rawURLs, *subURL)
+		}
+	}
+
+	return rawURLs
+}