diff options
| author | demo <demo@antix1> | 2026-05-10 22:47:08 -0400 |
|---|---|---|
| committer | demo <demo@antix1> | 2026-05-10 22:47:08 -0400 |
| commit | 3bfb19098c69a6b810a2a4e478f4184420bf4200 (patch) | |
| tree | 4955faa3a8f914d52b1281f1d2680d27a7de2c02 | |
| parent | 8c6e2780beb8e295c309eab503a18c9058a8cb8b (diff) | |
feat: resolve hrefs according to a base URL
| -rw-r--r-- | internal/links/find.go | 52 | ||||
| -rw-r--r-- | internal/links/find_count_test.go (renamed from internal/links/find_test.go) | 26 |
2 files changed, 74 insertions, 4 deletions
diff --git a/internal/links/find.go b/internal/links/find.go
index 44ba128..17b6d8f 100644
--- a/internal/links/find.go
+++ b/internal/links/find.go
@@ -1,12 +1,28 @@
 package links
 
 import (
+	"fmt"
 	"io"
+	"log"
+	"net/url"
 
 	"golang.org/x/net/html"
 	"golang.org/x/net/html/atom"
 )
 
+// Parse finds the links inside htmlInput and returns the ones that
+// stay on baseURL's host, each resolved against baseURL.
+func Parse(htmlInput io.Reader, baseURL *url.URL) ([]string, error) {
+	hrefs, err := parse(htmlInput)
+	if err != nil {
+		return nil, fmt.Errorf("can't parse: %w", err)
+	}
+
+	hrefs = filterByBaseURL(baseURL, hrefs)
+
+	return hrefs, nil
+}
+
 // parse finds the links inside htmlInput and returns them as a slice
 // of strings, along with an error.
 //
@@ -23,7 +39,9 @@ func parse(htmlInput io.Reader) ([]string, error) {
 	return hrefs, nil
 }
 
-// findHrefs returns all link addresses inside doc.
+// findHrefs returns all link addresses inside doc. It collects each
+// one exactly as it appears in the document, without resolving it
+// with respect to some base URL.
 func findHrefs(doc *html.Node) []string {
 	var hrefs []string
 	for node := range doc.Descendants() {
@@ -38,3 +56,35 @@ func findHrefs(doc *html.Node) []string {
 
 	return hrefs
 }
+
+// filterByBaseURL returns the slice of all web addresses in hrefs
+// that are under baseURL. In passing, it also resolves these with
+// respect to baseURL.
+//
+// The motivation is that, when crawling pages
+// to build a sitemap, the crawl should never leave the top-level Web
+// domain those pages belong to.
+func filterByBaseURL(baseURL *url.URL, hrefs []string) []string {
+	var neighbors []string
+
+	for _, href := range hrefs {
+		// Check that href parses as a URL, and at the same
+		// time resolve it with respect to baseURL.
+		u, err := baseURL.Parse(href)
+		if err != nil {
+			log.Printf("%q: %v", href, err)
+			continue
+		}
+
+		// If href is a valid absolute URL, it will parse
+		// successfully, so we need to check hostnames.
+		if u.Hostname() != baseURL.Hostname() {
+			log.Printf("different hostnames: %s %s", u, baseURL)
+			continue
+		}
+
+		neighbors = append(neighbors, u.String())
+	}
+
+	return neighbors
+}
diff --git a/internal/links/find_test.go b/internal/links/find_count_test.go
index e0866a5..2932bf9 100644
--- a/internal/links/find_test.go
+++ b/internal/links/find_count_test.go
@@ -2,17 +2,21 @@ package links
 
 import (
 	"fmt"
+	"log"
+	"net/url"
 	"strings"
 	"testing"
 )
 
 type exampleType struct {
 	expectedCount int
+	rawBaseURL    string
 	content       string
 }
 
 var examples = []exampleType{
-	{2, `
+	// Example 1
+	{2, "https://example.com", `
 <html>c
 <head>
 <title>Ex 1</title>
@@ -24,7 +28,8 @@ var examples = []exampleType{
 </html>
 `},
 
-	{4, `<html>
+	// Example 2
+	{4, "https://example.com", `<html>
 <head>
 <title>Ex 2</title>
 <head>
@@ -35,21 +40,36 @@ var examples = []exampleType{
 </body>
 </html>
 `},
+
+	// Example 3
+	{2, "https://example.com", `<html>
+<a href="https://example.com">Main Page</a>
+<a href="/example1">Example 1</a>
+<a href="https://brandonirizarry.xyz">Brandon's Blog</a>
+<a href="https://brandonirizarry.xyz/post1">Post 1</a>
+<a href=":foo">Bad link</a>
+</html>
+`},
 }
 
 func TestFindCountHrefs(t *testing.T) {
 	for i, ex := range examples {
 		name := fmt.Sprintf("Example %d", i+1)
+		baseURL, err := url.Parse(ex.rawBaseURL)
+		if err != nil {
+			t.Fatalf("can't parse %s: %v", ex.rawBaseURL, err)
+		}
 
 		t.Run(name, func(t *testing.T) {
 			r := strings.NewReader(ex.content)
-			hrefs, err := parse(r)
+			hrefs, err := Parse(r, baseURL)
 			if err != nil {
 				t.Error(err)
 			}
 
 			if actualCount := len(hrefs); actualCount != ex.expectedCount {
 				t.Errorf("got %d, want %d", actualCount, ex.expectedCount)
+				log.Print(hrefs)
 			}
 		})
 	}
