// Package findlinks extracts anchor tags from HTML documents.
package findlinks

import (
	"fmt"
	"io"
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// A Link encapsulates the data harvested from a link.
type Link struct {
	// Href is the value of the anchor's first href attribute, or the
	// empty string when the anchor has none.
	Href string
	// Text is the anchor's nested text content with every run of
	// whitespace collapsed to a single space and the ends trimmed.
	Text string
}

// FindLinks consumes the given [io.Reader], scraping it of anchor
// tags. Each anchor tag is "unmarshalled" into a [Link]. The
// resulting slice of Links is returned in document order; it is nil
// when the document contains no anchors. A non-nil error is returned
// only when the reader cannot be parsed as HTML, in which case the
// slice is nil.
func FindLinks(r io.Reader) ([]Link, error) {
	doc, err := html.Parse(r)
	if err != nil {
		return nil, fmt.Errorf("can't parse html reader: %w", err)
	}
	return iterHTML(doc, nil), nil
}

// iterHTML walks the tree rooted at n depth-first, appending one Link
// to buffer for every anchor element it meets, and returns the grown
// buffer. The walk does not descend into an anchor's children looking
// for further anchors; the subtree is only scanned for text.
func iterHTML(n *html.Node, buffer []Link) []Link {
	if n.Type == html.ElementNode && n.DataAtom == atom.A {
		text := strings.Join(extractText(n, nil), " ")
		return append(buffer, Link{
			Href: extractHref(n),
			Text: normalizeSpace(text),
		})
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		buffer = iterHTML(c, buffer)
	}
	return buffer
}

// normalizeSpace collapses each run of whitespace in s to one space
// and strips leading and trailing whitespace. Text nodes carry the
// source markup's indentation and newlines, and adjacent chunks are
// joined with a space, so without this step link text would contain
// doubled spaces and stray line breaks.
func normalizeSpace(s string) string {
	return strings.Join(strings.Fields(s), " ")
}

// extractHref returns the value of the first href attribute of
// anchor, or the empty string when there is none.
func extractHref(anchor *html.Node) string {
	key := atom.Href.String() // loop-invariant, so computed once
	for _, a := range anchor.Attr {
		if a.Key == key {
			return a.Val
		}
	}
	return ""
}

// extractText recursively scans anchor to return the various nested
// pieces of text content, in document order. Text nodes contribute
// their raw data; element nodes are descended into; every other node
// type (comments, for instance) is skipped.
func extractText(anchor *html.Node, buffer []string) []string {
	for c := anchor.FirstChild; c != nil; c = c.NextSibling {
		switch c.Type {
		case html.TextNode:
			buffer = append(buffer, c.Data)
		case html.ElementNode:
			buffer = extractText(c, buffer)
		}
	}
	return buffer
}