summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- fetch.go 44
-rw-r--r-- findurls.go 44
-rw-r--r-- go.mod 2
-rw-r--r-- go.sum 2
4 files changed, 92 insertions, 0 deletions
diff --git a/fetch.go b/fetch.go
new file mode 100644
index 0000000..f81f327
--- /dev/null
+++ b/fetch.go
@@ -0,0 +1,44 @@
+package main
+
+import (
+	"fmt"
+	"net/http"
+	"net/url"
+	"time"
+
+	"golang.org/x/net/html"
+)
+
+// fetch makes a GET request to refURL and returns the parsed HTML
+// document found at that address.
+//
+// A non-nil error is returned when the request can't be created or
+// sent, when the response status is anything other than 200 OK, or
+// when the body can't be parsed. The returned node is nil in all of
+// those cases.
+//
+// A [url.URL] type is used for refURL to simplify recursive or else
+// repeated use of this function when crawling webpages to, say, build
+// a sitemap.
+func fetch(refURL url.URL) (*html.Node, error) {
+	// Upper bound on one whole fetch: connecting, following
+	// redirects, and reading the response body.
+	const timeout = 30 * time.Second
+
+	rawURL := refURL.String()
+
+	// A zero-value client never times out, so an unresponsive
+	// server would stall the caller indefinitely.
+	client := http.Client{Timeout: timeout}
+
+	req, err := http.NewRequest(http.MethodGet, rawURL, nil)
+	if err != nil {
+		return nil, fmt.Errorf("can't create request: %w", err)
+	}
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("client failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("status for %s for %s: %s", http.MethodGet, rawURL, resp.Status)
+	}
+
+	// The client timeout keeps running while the body is read, so
+	// a stalled transfer is reported here as a parse failure.
+	htmlDoc, err := html.Parse(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("can't parse response body: %w", err)
+	}
+
+	return htmlDoc, nil
+}
diff --git a/findurls.go b/findurls.go
new file mode 100644
index 0000000..6bc95d7
--- /dev/null
+++ b/findurls.go
@@ -0,0 +1,44 @@
+package main
+
+import (
+ "log"
+ "net/url"
+
+ "golang.org/x/net/html"
+ "golang.org/x/net/html/atom"
+)
+
+// findURLs combs htmlDoc for the href attributes of anchor tags and
+// returns those that point at the same host as refURL. Each href is
+// resolved against refURL before it is compared or returned.
+//
+// Hrefs that can't be parsed are logged and skipped. Hostnames are
+// compared as exact strings, duplicates are not removed, and a nil
+// htmlDoc yields no URLs.
+func findURLs(refURL url.URL, htmlDoc *html.Node) []url.URL {
+	// There is nothing to walk without a document.
+	if htmlDoc == nil {
+		return nil
+	}
+
+	refHost := refURL.Hostname()
+
+	// Used for collecting all the URLs we find.
+	var found []url.URL
+
+	for node := range htmlDoc.Descendants() {
+		// Only anchor elements are of interest.
+		if node.Type != html.ElementNode || node.DataAtom != atom.A {
+			continue
+		}
+
+		for _, attr := range node.Attr {
+			if attr.Key != "href" {
+				continue
+			}
+
+			subURL, err := refURL.Parse(attr.Val)
+			if err != nil {
+				// Keep the cause in the log so a bad
+				// link can be diagnosed later.
+				log.Printf("bad URL %q: %v", attr.Val, err)
+				continue
+			}
+
+			// Filter external URLs from the final result.
+			if subURL.Hostname() != refHost {
+				continue
+			}
+
+			found = append(found, *subURL)
+		}
+	}
+
+	return found
+}
diff --git a/go.mod b/go.mod
index bd0c8b7..2fee455 100644
--- a/go.mod
+++ b/go.mod
@@ -1,3 +1,5 @@
module git.brandonirizarry.xyz/urls
go 1.26.2
+
+require golang.org/x/net v0.54.0
diff --git a/go.sum b/go.sum
index e69de29..4439791 100644
--- a/go.sum
+++ b/go.sum
@@ -0,0 +1,2 @@
+golang.org/x/net v0.54.0 h1:2zJIZAxAHV/OHCDTCOHAYehQzLfSXuf/5SoL/Dv6w/w=
+golang.org/x/net v0.54.0/go.mod h1:Sj4oj8jK6XmHpBZU/zWHw3BV3abl4Kvi+Ut7cQcY+cQ=