1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
package links
import (
"fmt"
"io"
"log"
"net/url"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// Parse reads an HTML document from htmlInput and returns the link
// addresses found in it that filterByBaseURL keeps for refURL, that
// is, those whose hostname matches refURL's.
//
// It returns an error if refURL is nil or if the document cannot be
// parsed; the returned slice is nil in both cases.
func Parse(htmlInput io.Reader, refURL *url.URL) ([]string, error) {
	// Resolving links against a nil URL would panic further down, so
	// reject it up front instead of crashing the caller.
	if refURL == nil {
		return nil, fmt.Errorf("can't parse: nil reference URL")
	}
	hrefs, err := parse(htmlInput)
	if err != nil {
		return nil, fmt.Errorf("can't parse: %w", err)
	}
	return filterByBaseURL(refURL, hrefs), nil
}
// parse reads an HTML document from htmlInput and hands the parsed
// tree to findHrefs, returning whatever link addresses it collects.
//
// A non-nil error can only originate from [html.Parse]; in that case
// the returned slice is nil.
func parse(htmlInput io.Reader) ([]string, error) {
	root, parseErr := html.Parse(htmlInput)
	if parseErr != nil {
		return nil, parseErr
	}
	return findHrefs(root), nil
}
// findHrefs gathers the value of every href attribute carried by an
// <a> element among the descendants of doc. Values are returned
// verbatim, in the order doc.Descendants yields their elements; no
// resolution against a base URL is attempted. The result is nil when
// no such attribute exists.
func findHrefs(doc *html.Node) []string {
	var links []string
	for n := range doc.Descendants() {
		// Only anchor elements are of interest.
		if n.Type != html.ElementNode || n.DataAtom != atom.A {
			continue
		}
		for i := range n.Attr {
			if a := n.Attr[i]; a.Key == "href" {
				links = append(links, a.Val)
			}
		}
	}
	return links
}
// filterByBaseURL returns the web addresses in hrefs that are under
// refURL, each one resolved with respect to refURL and rendered with
// [url.URL.String].
//
// The motivation is that, when crawling pages to build a sitemap, the
// crawl should never leave the top-level Web domain those pages
// belong to.
//
// Entries that fail to parse, or whose hostname differs from
// refURL's, are logged and skipped. The result is nil when no entry
// qualifies.
func filterByBaseURL(refURL *url.URL, hrefs []string) []string {
	var neighbors []string
	for _, href := range hrefs {
		// Check that href parses as a URL, and at the same
		// time resolve it with respect to refURL.
		u, err := refURL.Parse(href)
		if err != nil {
			// u is nil on failure, so report the offending input
			// rather than the (absent) parse result.
			log.Printf("%q: %v", href, err)
			continue
		}
		// If href is a valid absolute URL, it will parse
		// successfully, so we need to check hostnames.
		if u.Hostname() != refURL.Hostname() {
			log.Printf("different hostnames: %s %s", u, refURL)
			continue
		}
		// Keep the resolved form, not the raw attribute value, so
		// relative links come back usable on their own.
		neighbors = append(neighbors, u.String())
	}
	return neighbors
}
|