package findlinks import ( "fmt" "io" "strings" "golang.org/x/net/html" "golang.org/x/net/html/atom" ) // A Link encapsulates the data harvested from a link. type Link struct { Href string Text string } // Format returns a suitable string representation of a link. // // Argument sep (e.g. " ", "\n") customizes how the href should be // displayed alongside the text. func Format(link Link, sep string) string { return fmt.Sprintf("%s%s%s", link.Text, sep, link.Href) } // FindLinks consumes the given [io.Reader], scraping it of anchor // tags. Each anchor tag is "unmarshalled" into a [Link]. The // resulting slice of Links is returned, along with an error. func FindLinks(r io.Reader) ([]Link, error) { doc, err := html.Parse(r) if err != nil { return nil, fmt.Errorf("can't parse html reader: %w", err) } links := iterHTML(doc, nil) return links, nil } // iterHTML recursively scans the HTML tree n for link data. func iterHTML(n *html.Node, buffer []Link) []Link { // Return if n doesn't contain the right kind of data, since // we could potentially iterate twice over things like text // nodes when calling extractText. if n.Type != html.ElementNode && n.Type != html.DocumentNode { return buffer } // If we've hit a link, go for it. if n.Type == html.ElementNode && n.DataAtom == atom.A { var link Link // Href link.Href = extractHref(n) // Text chunks := extractText(n, nil) link.Text = strings.Join(chunks, " ") buffer = append(buffer, link) } else { // If not a link, just dive down the tree looking for // more links. for c := n.FirstChild; c != nil; c = c.NextSibling { buffer = iterHTML(c, buffer) } } return buffer } // extractHref returns the first href attribute of anchor. func extractHref(anchor *html.Node) string { var href string for _, a := range anchor.Attr { if a.Key == atom.Href.String() { href = a.Val break } } return href } // extractText recursively scans anchor to return the various nested // pieces of text content. func extractText(anchor *html.Node, buffer []string) []string { for c := anchor.FirstChild; c != nil; c = c.NextSibling { switch c.Type { case html.TextNode: buffer = append(buffer, c.Data) case html.ElementNode: buffer = extractText(c, buffer) } } return buffer }