package findlinks import ( "fmt" "io" "strings" "golang.org/x/net/html" "golang.org/x/net/html/atom" ) // Parse collects the unmarshalled [Link] data from the HTML document // represented by r. The data is returned as a slice, along with an // error. func Parse(r io.Reader) ([]Link, error) { doc, err := html.Parse(r) if err != nil { return nil, fmt.Errorf("can't parse html reader: %w", err) } return parseLinks(doc), nil } // parseLinks returns a [Link] slice from doc. Each element is an // "unmarshalled" version of an anchor tag element inside doc. func parseLinks(doc *html.Node) []Link { linkNodes := harvestLinkNodes(doc) var links []Link for _, linkNode := range linkNodes { var link Link // Get the link's inner text. link.Text = harvestText(linkNode) // Get the href attribute. for _, a := range linkNode.Attr { if a.Key == "href" { link.Href = a.Val break } } links = append(links, link) } return links } // harvestText returns the harvestText contained inside n. // // Note that the harvestText could be under many layers of HTML // nesting. Hence the [html.ElementNode] case calls harvestText recursively. // // For the current project, harvestText's argument is always an // anchor-tag element. func harvestText(n *html.Node) string { switch n.Type { // The text of an [html.TextNode] is its [html.Node.Data] // field. case html.TextNode: return n.Data // The text of an [html.ElementNode] is the aggregate of the // text of its children. case html.ElementNode: var builder strings.Builder for c := n.FirstChild; c != nil; c = c.NextSibling { fmt.Fprintf(&builder, "%s ", harvestText(c)) } rawResult := builder.String() fields := strings.Fields(rawResult) return strings.Join(fields, " ") // Any other kind of node (e.g. [html.CommentNode]) doesn't // have text. default: return "" } } // harvestLinkNodes harvests all of the link nodes contained inside n. // // For the current project, harvestLinkNodes' argument is always the // top-level document node. func harvestLinkNodes(node *html.Node) []*html.Node { var links []*html.Node for child := range node.Descendants() { if child.Type == html.ElementNode && child.DataAtom == atom.A { links = append(links, child) } } return links }