summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: demo <demo@antix1> 2026-05-10 22:47:08 -0400
committer: demo <demo@antix1> 2026-05-10 22:47:08 -0400
commit: 3bfb19098c69a6b810a2a4e478f4184420bf4200 (patch)
tree: 4955faa3a8f914d52b1281f1d2680d27a7de2c02
parent: 8c6e2780beb8e295c309eab503a18c9058a8cb8b (diff)
feat: resolve hrefs according to a base URL
-rw-r--r-- internal/links/find.go 52
-rw-r--r-- internal/links/find_count_test.go (renamed from internal/links/find_test.go) 26
2 files changed, 74 insertions, 4 deletions
diff --git a/internal/links/find.go b/internal/links/find.go
index 44ba128..17b6d8f 100644
--- a/internal/links/find.go
+++ b/internal/links/find.go
@@ -1,12 +1,28 @@
package links
import (
+ "fmt"
"io"
+ "log"
+ "net/url"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
+func Parse(htmlInput io.Reader, baseURL *url.URL) ([]string, error) {
+ var err error
+
+ hrefs, err := parse(htmlInput)
+ if err != nil {
+ return nil, fmt.Errorf("can't parse: %w", err)
+ }
+
+ hrefs = filterByBaseURL(baseURL, hrefs)
+
+ return hrefs, nil
+}
+
// parse finds the links inside htmlInput and returns them as a slice
// of strings, along with an error.
//
@@ -23,7 +39,9 @@ func parse(htmlInput io.Reader) ([]string, error) {
return hrefs, nil
}
-// findHrefs returns all link addresses inside doc.
+// findHrefs returns all link addresses inside doc. It collects each
+// one exactly as it appears in the document, without resolving it
+// with respect to some base URL.
func findHrefs(doc *html.Node) []string {
var hrefs []string
for node := range doc.Descendants() {
@@ -38,3 +56,35 @@ func findHrefs(doc *html.Node) []string {
return hrefs
}
+
// filterByBaseURL resolves every entry of hrefs against baseURL and
// returns, in their original order, the resolved absolute URLs whose
// hostname equals baseURL's hostname.
//
// The motivation is that, when crawling pages to build a sitemap, the
// crawl should never leave the top-level Web domain those pages belong
// to.
//
// Entries that do not parse as a URL, or that resolve to a different
// hostname, are logged and dropped. The result is nil when no entry
// survives.
func filterByBaseURL(baseURL *url.URL, hrefs []string) []string {
	var neighbors []string

	for _, href := range hrefs {
		// Check that href parses as a URL, and at the same time
		// resolve it with respect to baseURL.
		resolved, err := baseURL.Parse(href)
		if err != nil {
			// resolved is nil on this path, so report the raw href
			// that failed instead.
			log.Printf("%q: %v", href, err)
			continue
		}

		// An absolute href parses successfully no matter which site
		// it points at, so the hostnames still have to be compared.
		if resolved.Hostname() != baseURL.Hostname() {
			log.Printf("different hostnames: %s %s", resolved, baseURL)
			continue
		}

		// Keep the resolved form, so that relative links such as
		// "/page" come back as full URLs the caller can fetch.
		neighbors = append(neighbors, resolved.String())
	}

	return neighbors
}
diff --git a/internal/links/find_test.go b/internal/links/find_count_test.go
index e0866a5..2932bf9 100644
--- a/internal/links/find_test.go
+++ b/internal/links/find_count_test.go
@@ -2,17 +2,21 @@ package links
import (
"fmt"
+ "log"
+ "net/url"
"strings"
"testing"
)
// exampleType is one table-driven test case. The examples table fills
// it with positional literals, so the field order must not change.
type exampleType struct {
// expectedCount is the number of hrefs Parse is expected to return
// for content.
expectedCount int
// rawBaseURL is run through url.Parse by the test and the result is
// handed to Parse as the base URL.
+ rawBaseURL string
// content is the HTML document text fed to Parse.
content string
}
var examples = []exampleType{
- {2, `
+ // Example 1
+ {2, "https://example.com", `
<html>c
<head>
<title>Ex 1</title>
@@ -24,7 +28,8 @@ var examples = []exampleType{
</html>
`},
- {4, `<html>
+ // Example 2
+ {4, "https://example.com", `<html>
<head>
<title>Ex 2</title>
<head>
@@ -35,21 +40,36 @@ var examples = []exampleType{
</body>
</html>
`},
+
+ // Example 3
+ {2, "https://example.com", `<html>
+<a href="https://example.com">Main Page</a>
+<a href="/example1">Example 1</a>
+<a href="https://brandonirizarry.xyz">Brandon's Blog</a>
+<a href="https://brandonirizarry.xyz/post1">Post 1</a>
+<a href=":foo">Bad link</a>
+</html>
+`},
}
func TestFindCountHrefs(t *testing.T) {
for i, ex := range examples {
name := fmt.Sprintf("Example %d", i+1)
+ baseURL, err := url.Parse(ex.rawBaseURL)
+ if err != nil {
+ t.Fatalf("can't parse %s: %v", ex.rawBaseURL, err)
+ }
t.Run(name, func(t *testing.T) {
r := strings.NewReader(ex.content)
- hrefs, err := parse(r)
+ hrefs, err := Parse(r, baseURL)
if err != nil {
t.Error(err)
}
if actualCount := len(hrefs); actualCount != ex.expectedCount {
t.Errorf("got %d, want %d", actualCount, ex.expectedCount)
+ log.Print(hrefs)
}
})
}