From 3bfb19098c69a6b810a2a4e478f4184420bf4200 Mon Sep 17 00:00:00 2001 From: demo Date: Sun, 10 May 2026 22:47:08 -0400 Subject: feat: resolve hrefs according to a base URL --- internal/links/find.go | 52 ++++++++++++++++++++++++++- internal/links/find_count_test.go | 76 +++++++++++++++++++++++++++++++++++++++ internal/links/find_test.go | 56 ----------------------------- 3 files changed, 127 insertions(+), 57 deletions(-) create mode 100644 internal/links/find_count_test.go delete mode 100644 internal/links/find_test.go (limited to 'internal') diff --git a/internal/links/find.go b/internal/links/find.go index 44ba128..17b6d8f 100644 --- a/internal/links/find.go +++ b/internal/links/find.go @@ -1,12 +1,28 @@ package links import ( + "fmt" "io" + "log" + "net/url" "golang.org/x/net/html" "golang.org/x/net/html/atom" ) +func Parse(htmlInput io.Reader, baseURL *url.URL) ([]string, error) { + var err error + + hrefs, err := parse(htmlInput) + if err != nil { + return nil, fmt.Errorf("can't parse: %w", err) + } + + hrefs = filterByBaseURL(baseURL, hrefs) + + return hrefs, nil +} + // parse finds the links inside htmlInput and returns them as a slice // of strings, along with an error. // @@ -23,7 +39,9 @@ func parse(htmlInput io.Reader) ([]string, error) { return hrefs, nil } -// findHrefs returns all link addresses inside doc. +// findHrefs returns all link addresses inside doc. It collects each +// one exactly as it appears in the document, without resolving it +// with respect to some base URL. func findHrefs(doc *html.Node) []string { var hrefs []string for node := range doc.Descendants() { @@ -38,3 +56,35 @@ func findHrefs(doc *html.Node) []string { return hrefs } + +// filterByBaseURL returns the slice of all web addresses in hrefs +// that are under baseURL. In passing, it also resolves these with +// respect to baseURL. 
// filterByBaseURL resolves every entry of hrefs against baseURL and returns
// the resolved, absolute URLs whose hostname equals baseURL's hostname.
//
// Entries that do not parse as a URL, or that resolve to a different
// hostname, are logged and dropped. The result keeps the order of hrefs and
// is nil when nothing is kept. baseURL must be non-nil.
//
// The motivation is that, when crawling pages to build a sitemap, the crawl
// should never leave the top-level Web domain those pages belong to.
func filterByBaseURL(baseURL *url.URL, hrefs []string) []string {
	var neighbors []string

	for _, href := range hrefs {
		// Check that href parses as a URL and, at the same time,
		// resolve it with respect to baseURL.
		u, err := baseURL.Parse(href)
		if err != nil {
			// u is nil on failure, so report the raw href instead.
			log.Printf("%s: %v", href, err)
			continue
		}

		// An absolute href keeps its own host after resolution, so
		// the hostnames still have to be compared.
		if u.Hostname() != baseURL.Hostname() {
			log.Printf("different hostnames: %s %s", u, baseURL)
			continue
		}

		// Keep the resolved form, not the raw attribute value, so
		// relative links come back as absolute URLs.
		neighbors = append(neighbors, u.String())
	}

	return neighbors
}
+ + +`}, + + // Example 3 + {2, "https://example.com", ` +Main Page +Example 1 +Brandon's Blog +Post 1 +Bad link + +`}, +} + +func TestFindCountHrefs(t *testing.T) { + for i, ex := range examples { + name := fmt.Sprintf("Example %d", i+1) + baseURL, err := url.Parse(ex.rawBaseURL) + if err != nil { + t.Fatalf("can't parse %s: %v", ex.rawBaseURL, err) + } + + t.Run(name, func(t *testing.T) { + r := strings.NewReader(ex.content) + hrefs, err := Parse(r, baseURL) + if err != nil { + t.Error(err) + } + + if actualCount := len(hrefs); actualCount != ex.expectedCount { + t.Errorf("got %d, want %d", actualCount, ex.expectedCount) + log.Print(hrefs) + } + }) + } +} diff --git a/internal/links/find_test.go b/internal/links/find_test.go deleted file mode 100644 index e0866a5..0000000 --- a/internal/links/find_test.go +++ /dev/null @@ -1,56 +0,0 @@ -package links - -import ( - "fmt" - "strings" - "testing" -) - -type exampleType struct { - expectedCount int - content string -} - -var examples = []exampleType{ - {2, ` -c - - Ex 1 - - - Example Page - Posts - - -`}, - - {4, ` - - Ex 2 - - - Example Page - Posts - A rouge link! - - -`}, -} - -func TestFindCountHrefs(t *testing.T) { - for i, ex := range examples { - name := fmt.Sprintf("Example %d", i+1) - - t.Run(name, func(t *testing.T) { - r := strings.NewReader(ex.content) - hrefs, err := parse(r) - if err != nil { - t.Error(err) - } - - if actualCount := len(hrefs); actualCount != ex.expectedCount { - t.Errorf("got %d, want %d", actualCount, ex.expectedCount) - } - }) - } -} -- cgit v1.2.3