From 52bb422959147384291dcfbfe5a6142d363862ab Mon Sep 17 00:00:00 2001 From: demo Date: Sat, 9 May 2026 11:43:48 -0400 Subject: feat: implement "v2" This is based on the Gophercises solution. --- internal/findlinks/findlinks_v2.go | 97 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 internal/findlinks/findlinks_v2.go (limited to 'internal/findlinks/findlinks_v2.go') diff --git a/internal/findlinks/findlinks_v2.go b/internal/findlinks/findlinks_v2.go new file mode 100644 index 0000000..e56a961 --- /dev/null +++ b/internal/findlinks/findlinks_v2.go @@ -0,0 +1,97 @@ +package findlinks + +import ( + "fmt" + "io" + "strings" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +// Parse collects the unmarshalled [Link] data from the HTML document +// represented by r. The data is returned as a slice, along with an +// error. +func Parse(r io.Reader) ([]Link, error) { + doc, err := html.Parse(r) + if err != nil { + return nil, fmt.Errorf("can't parse html reader: %w", err) + } + + return parseLinks(doc), nil +} + +// parseLinks returns a [Link] slice from doc. Each element is an +// "unmarshalled" version of an anchor tag element inside doc. +func parseLinks(doc *html.Node) []Link { + linkNodes := harvestLinkNodes(doc) + + var links []Link + for _, linkNode := range linkNodes { + var link Link + + // Get the link's inner text. + link.Text = harvestText(linkNode) + + // Get the href attribute. + for _, a := range linkNode.Attr { + if a.Key == "href" { + link.Href = a.Val + break + } + } + + links = append(links, link) + } + + return links +} + +// harvestText returns the harvestText contained inside n. +// +// Note that the harvestText could be under many layers of HTML +// nesting. Hence the [html.ElementNode] case calls harvestText recursively. +// +// For the current project, harvestText's argument is always an +// anchor-tag element. +func harvestText(n *html.Node) string { + switch n.Type { + // The text of an [html.TextNode] is its [html.Node.Data] + // field. + case html.TextNode: + return n.Data + + // The text of an [html.ElementNode] is the aggregate of the + // text of its children. + case html.ElementNode: + var builder strings.Builder + for c := n.FirstChild; c != nil; c = c.NextSibling { + fmt.Fprintf(&builder, "%s ", harvestText(c)) + } + + rawResult := builder.String() + fields := strings.Fields(rawResult) + return strings.Join(fields, " ") + + // Any other kind of node (e.g. [html.CommentNode]) doesn't + // have text. + default: + return "" + } +} + +// harvestLinkNodes harvests all of the link nodes contained inside n. +// +// For the current project, harvestLinkNodes' argument is always the +// top-level document node. +func harvestLinkNodes(node *html.Node) []*html.Node { + var links []*html.Node + + for child := range node.Descendants() { + if child.Type == html.ElementNode && child.DataAtom == atom.A { + links = append(links, child) + } + } + + return links +} -- cgit v1.2.3