feat: include link text in final output

This meant going back to the older recursive node-iteration technique (walking FirstChild/NextSibling) instead of trying to do everything in a single pass over doc.Descendants().
author: demo <demo@antix1> 2026-05-08 12:00:27 -0400
committer: demo <demo@antix1> 2026-05-08 12:00:27 -0400
commit: ac4464a77e2b4c1c1c7a9fa1974e8b1f714b5040 (patch)
tree: d4615db2f572636bd3ee23b4723239abbd54e1d0 /internal/findlinks
parent: 89841980a55e809778e3f1af778b8962cd540aa0 (diff)
1 files changed, 44 insertions, 20 deletions
diff --git a/internal/findlinks/findlinks.go b/internal/findlinks/findlinks.go
index 619d8a4..00635ec 100644
--- a/internal/findlinks/findlinks.go
+++ b/internal/findlinks/findlinks.go
@@ -24,33 +24,57 @@ func FindLinks(r io.Reader) ([]Link, error) {
 		return nil, fmt.Errorf("can't parse html reader: %w", err)
 	}
 
-	var links []Link
+	links := iterHTML(doc, nil)
 
-	for n := range doc.Descendants() {
-		trimmedData := strings.TrimSpace(n.Data)
+	return links, nil
+}
+
+func iterHTML(n *html.Node, buffer []Link) []Link {
+	if n.Type == html.ElementNode && n.DataAtom == atom.A {
+		var link Link
+
+		// Href
+		link.Href = extractHref(n)
 
-		if n.Type == html.TextNode && trimmedData != "" {
-			fmt.Printf("Data: %s\n", trimmedData)
-			for anc := range n.Ancestors() {
-				fmt.Printf("\tAncestor: %v\n", anc.Data)
-			}
-			continue
+		// Text
+		chunks := extractText(n, nil)
+		link.Text = strings.Join(chunks, " ")
+
+		buffer = append(buffer, link)
+	} else {
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			buffer = iterHTML(c, buffer)
 		}
+	}
 
-		if n.Type == html.ElementNode && n.DataAtom == atom.A {
-			var link Link
+	return buffer
+}
 
-			// Scan the href.
-			for _, a := range n.Attr {
-				if a.Key == "href" {
-					link.Href = a.Val
-				}
-			}
+// extractHref returns the first href attribute of anchor.
+func extractHref(anchor *html.Node) string {
+	var href string
 
-			// FIXME: for now, only scan for hrefs.
-			links = append(links, link)
+	for _, a := range anchor.Attr {
+		if a.Key == atom.Href.String() {
+			href = a.Val
+			break
 		}
 	}
 
-	return links, nil
+	return href
+}
+
+// extractText recursively scans anchor to return the various nested
+// pieces of text content.
+func extractText(anchor *html.Node, buffer []string) []string {
+	for c := anchor.FirstChild; c != nil; c = c.NextSibling {
+		switch c.Type {
+		case html.TextNode:
+			buffer = append(buffer, c.Data)
+		case html.ElementNode:
+			buffer = extractText(c, buffer)
+		}
+	}
+
+	return buffer
 }
author	demo <demo@antix1>	2026-05-08 12:00:27 -0400
committer	demo <demo@antix1>	2026-05-08 12:00:27 -0400
commit	ac4464a77e2b4c1c1c7a9fa1974e8b1f714b5040 (patch)
tree	d4615db2f572636bd3ee23b4723239abbd54e1d0 /internal/findlinks
parent	89841980a55e809778e3f1af778b8962cd540aa0 (diff)