1 files changed, 97 insertions, 0 deletions
diff --git a/internal/findlinks/findlinks_v2.go b/internal/findlinks/findlinks_v2.go
new file mode 100644
index 0000000..e56a961
--- /dev/null
+++ b/internal/findlinks/findlinks_v2.go
@@ -0,0 +1,97 @@
+package findlinks
+
+import (
+	"fmt"
+	"io"
+	"strings"
+
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/atom"
+)
+
+// Parse reads the HTML document supplied by r and returns one [Link]
+// per anchor element found in it. When the document can't be parsed,
+// the returned slice is nil and the error wraps the parser's error.
+func Parse(r io.Reader) ([]Link, error) {
+	root, err := html.Parse(r)
+	if err != nil {
+		err = fmt.Errorf("can't parse html reader: %w", err)
+		return nil, err
+	}
+	return parseLinks(root), nil
+}
+
+// parseLinks converts every anchor element found under doc into a
+// [Link]: Text is the anchor's inner text as reported by harvestText,
+// and Href is the value of the anchor's first href attribute. Href
+// stays empty when the anchor has no such attribute.
+func parseLinks(doc *html.Node) []Link {
+	var result []Link
+
+	for _, anchor := range harvestLinkNodes(doc) {
+		// Only the first href counts; later duplicates are ignored.
+		var href string
+		for i := range anchor.Attr {
+			if attr := anchor.Attr[i]; attr.Key == "href" {
+				href = attr.Val
+				break
+			}
+		}
+
+		result = append(result, Link{
+			Text: harvestText(anchor),
+			Href: href,
+		})
+	}
+
+	return result
+}
+
+// harvestText returns the text contained inside n.
+//
+// Note that the text could be under many layers of HTML nesting.
+// Hence the [html.ElementNode] case calls harvestText recursively.
+//
+// For the current project, harvestText's argument is always an
+// anchor-tag element.
+func harvestText(n *html.Node) string {
+	switch n.Type {
+	// The text of an [html.TextNode] is its [html.Node.Data]
+	// field, returned verbatim.
+	case html.TextNode:
+		return n.Data
+
+	// The text of an [html.ElementNode] is the space-separated
+	// aggregate of the text of its children.
+	case html.ElementNode:
+		var builder strings.Builder
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			builder.WriteString(harvestText(c))
+			builder.WriteByte(' ')
+		}
+
+		// Fields+Join trims the result and collapses whitespace runs.
+		return strings.Join(strings.Fields(builder.String()), " ")
+
+	// Any other kind of node (e.g. [html.CommentNode]) doesn't
+	// have text.
+	default:
+		return ""
+	}
+}
+
+// harvestLinkNodes gathers every anchor element among node's
+// descendants, in the order [html.Node.Descendants] yields them.
+//
+// For the current project, node is always the top-level document node.
+func harvestLinkNodes(node *html.Node) []*html.Node {
+	var anchors []*html.Node
+	for n := range node.Descendants() {
+		isAnchor := n.Type == html.ElementNode && n.DataAtom == atom.A
+		if !isAnchor {
+			continue
+		}
+		anchors = append(anchors, n)
+	}
+	return anchors
+}