diff options
| author | demo <demo@antix1> | 2026-05-21 18:13:18 -0400 |
|---|---|---|
| committer | demo <demo@antix1> | 2026-05-21 18:13:18 -0400 |
| commit | c43b8afcd63446021334669f5a246cfb0b637fd3 (patch) | |
| tree | 161b10faeaa7675d9fd2f558daaec5a16f3de018 | |
| parent | c032bd8d32130a2dbc1a7481dd501f2c182707cc (diff) | |
chore: add existing code to project
I also ran `go mod tidy`, since the new files refer to packages I've
recently installed locally.
| -rw-r--r-- | fetch.go | 44 | ||||
| -rw-r--r-- | findurls.go | 44 | ||||
| -rw-r--r-- | go.mod | 2 | ||||
| -rw-r--r-- | go.sum | 2 |
4 files changed, 92 insertions, 0 deletions
diff --git a/fetch.go b/fetch.go new file mode 100644 index 0000000..f81f327 --- /dev/null +++ b/fetch.go @@ -0,0 +1,44 @@ +package main + +import ( + "fmt" + "net/http" + "net/url" + + "golang.org/x/net/html" +) + +// fetch makes a GET request to refURL, returning the HTML contents of +// that webpage. An error is also returned. +// +// A [url.URL] type is used for refURL to simplify recursive or else +// repeated use of this function when crawling webpages to, say, build +// a sitemap. +func fetch(refURL url.URL) (*html.Node, error) { + rawURL := refURL.String() + + // For now we leave the client unconfigured. + client := http.Client{} + + req, err := http.NewRequest(http.MethodGet, rawURL, nil) + if err != nil { + return nil, fmt.Errorf("can't create request: %w", err) + } + + resp, err := client.Do(req) + if err != nil { + return nil, fmt.Errorf("client failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("status for %s for %s: %s", http.MethodGet, rawURL, resp.Status) + } + + htmlDoc, err := html.Parse(resp.Body) + if err != nil { + return nil, fmt.Errorf("can't parse response body: %w", err) + } + + return htmlDoc, nil +} diff --git a/findurls.go b/findurls.go new file mode 100644 index 0000000..6bc95d7 --- /dev/null +++ b/findurls.go @@ -0,0 +1,44 @@ +package main + +import ( + "log" + "net/url" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +// findURLs combs htmlDoc for anchor-tag URLs. The refURL parameter is +// used to resolve each found URL into an absolute URL. +func findURLs(refURL url.URL, htmlDoc *html.Node) []url.URL { + // Used for collecting all the URLs we find. 
+ var rawURLs []url.URL + + for node := range htmlDoc.Descendants() { + if node.Type == html.ElementNode { + switch node.DataAtom { + case atom.A: + for _, attr := range node.Attr { + if attr.Key == "href" { + subURL, err := refURL.Parse(attr.Val) + if err != nil { + log.Printf("bad URL: %s", attr.Val) + continue + } + + // Filter external + // URLs from the final + // result. + if subURL.Hostname() != refURL.Hostname() { + continue + } + + rawURLs = append(rawURLs, *subURL) + } + } + } + } + } + + return rawURLs +} @@ -1,3 +1,5 @@ module git.brandonirizarry.xyz/urls go 1.26.2 + +require golang.org/x/net v0.54.0 @@ -0,0 +1,2 @@ +golang.org/x/net v0.54.0 h1:2zJIZAxAHV/OHCDTCOHAYehQzLfSXuf/5SoL/Dv6w/w= +golang.org/x/net v0.54.0/go.mod h1:Sj4oj8jK6XmHpBZU/zWHw3BV3abl4Kvi+Ut7cQcY+cQ= |
