// internal/links/find.go

package links

import (
	"fmt"
	"io"
	"log"
	"net/url"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// Parse reads an HTML document from htmlInput and returns the href
// values of its <a> elements that point at the same host as refURL.
//
// Relative hrefs are treated as belonging to refURL's host. The
// returned strings are the href values as found in the document; they
// are not rewritten into absolute URLs.
//
// A non-nil error means the HTML could not be parsed, in which case
// the returned slice is nil.
func Parse(htmlInput io.Reader, refURL *url.URL) ([]string, error) {
	hrefs, err := parse(htmlInput)
	if err != nil {
		return nil, fmt.Errorf("can't parse: %w", err)
	}

	// Drop every link that would take a crawl off refURL's host.
	return filterByBaseURL(refURL, hrefs), nil
}

// parse builds the HTML node tree for htmlInput and returns every
// link address found in it.
//
// Any error comes straight from [html.Parse] and is returned
// unwrapped, with a nil slice.
func parse(htmlInput io.Reader) ([]string, error) {
	root, err := html.Parse(htmlInput)
	if err != nil {
		return nil, err
	}

	return findHrefs(root), nil
}

// findHrefs walks every descendant of doc and gathers the value of
// each href attribute carried by an <a> element. Values are taken as
// the parsed tree reports them; nothing is resolved against a base
// URL here.
func findHrefs(doc *html.Node) []string {
	var links []string

	for n := range doc.Descendants() {
		// Only anchor elements are of interest.
		if n.Type != html.ElementNode || n.DataAtom != atom.A {
			continue
		}

		for i := range n.Attr {
			if a := n.Attr[i]; a.Key == "href" {
				links = append(links, a.Val)
			}
		}
	}

	return links
}

// filterByBaseURL returns the entries of hrefs that point at the same
// hostname as refURL.
//
// Each href is resolved against refURL, so relative links count as
// belonging to refURL's host. The resolved form is used only for the
// hostname comparison (ports are ignored by [url.URL.Hostname]); the
// returned slice holds the original href strings unchanged. Hrefs that
// fail to parse, or that resolve to a different hostname, are logged
// and skipped.
//
// The motivation is that, when crawling pages to build a sitemap, the
// crawl should never leave the Web host those pages belong to.
//
// NOTE(review): callers that need absolute URLs have to resolve the
// returned hrefs against refURL themselves — confirm this is intended.
func filterByBaseURL(refURL *url.URL, hrefs []string) []string {
	var neighbors []string

	for _, href := range hrefs {
		// Check that href parses as a URL, and at the same
		// time resolve it with respect to refURL.
		u, err := refURL.Parse(href)
		if err != nil {
			// u is nil when parsing fails, so report the raw
			// href rather than the (missing) resolved URL.
			log.Printf("%q: %v", href, err)
			continue
		}

		// If href is a valid absolute URL, it will parse
		// successfully, so we need to check hostnames.
		if u.Hostname() != refURL.Hostname() {
			log.Printf("different hostnames: %s %s", u, refURL)
			continue
		}

		neighbors = append(neighbors, href)
	}

	return neighbors
}