summary | refs | log | tree | commit | diff
path: root/internal/findlinks/findlinks_v2.go
diff options
context:
space:
mode:
Diffstat (limited to 'internal/findlinks/findlinks_v2.go')
-rw-r--r-- internal/findlinks/findlinks_v2.go 97
1 files changed, 97 insertions, 0 deletions
diff --git a/internal/findlinks/findlinks_v2.go b/internal/findlinks/findlinks_v2.go
new file mode 100644
index 0000000..e56a961
--- /dev/null
+++ b/internal/findlinks/findlinks_v2.go
@@ -0,0 +1,97 @@
+package findlinks
+
+import (
+ "fmt"
+ "io"
+ "strings"
+
+ "golang.org/x/net/html"
+ "golang.org/x/net/html/atom"
+)
+
+// Parse reads an HTML document from r and returns one [Link] for
+// every anchor element found in it.
+//
+// If the document can't be parsed, Parse returns a nil slice and an
+// error that wraps the parser's error.
+func Parse(r io.Reader) ([]Link, error) {
+	root, parseErr := html.Parse(r)
+	if parseErr != nil {
+		return nil, fmt.Errorf("can't parse html reader: %w", parseErr)
+	}
+
+	found := parseLinks(root)
+	return found, nil
+}
+
+// parseLinks converts every anchor element found under doc into a
+// [Link] and returns them in the order harvestLinkNodes reports them.
+//
+// The returned slice is nil when doc contains no anchor elements.
+func parseLinks(doc *html.Node) []Link {
+	var out []Link
+
+	for _, anchor := range harvestLinkNodes(doc) {
+		// Start from the anchor's inner text; Href stays at its zero
+		// value unless an href attribute is found below.
+		entry := Link{Text: harvestText(anchor)}
+
+		// Only the first href attribute counts, so stop scanning as
+		// soon as one is seen.
+		for _, attr := range anchor.Attr {
+			if attr.Key != "href" {
+				continue
+			}
+			entry.Href = attr.Val
+			break
+		}
+
+		out = append(out, entry)
+	}
+
+	return out
+}
+
+// harvestText returns the text contained inside n.
+//
+// For an [html.TextNode] the node's Data is returned verbatim. For an
+// [html.ElementNode] the text of all children is gathered recursively
+// (the text may sit under many layers of nesting), split into
+// whitespace-separated words, and joined back with single spaces, so
+// the result has no leading, trailing, or repeated whitespace. Any
+// other kind of node (e.g. [html.CommentNode]) yields "".
+//
+// For the current project, harvestText's argument is always an
+// anchor-tag element.
+func harvestText(n *html.Node) string {
+	// A text node carries its text directly in Data.
+	if n.Type == html.TextNode {
+		return n.Data
+	}
+
+	// Anything that is neither text nor an element has no text.
+	if n.Type != html.ElementNode {
+		return ""
+	}
+
+	// An element's text is the words of its children, in order.
+	// Splitting each child's text separately keeps words from adjacent
+	// children apart, exactly as if a space had been placed between
+	// them.
+	var words []string
+	for child := n.FirstChild; child != nil; child = child.NextSibling {
+		words = append(words, strings.Fields(harvestText(child))...)
+	}
+
+	return strings.Join(words, " ")
+}
+
+// harvestLinkNodes returns every anchor ("a") element among the
+// descendants of node, in the order node.Descendants yields them.
+//
+// The returned slice is nil when no anchor element is found.
+//
+// For the current project, harvestLinkNodes' argument is always the
+// top-level document node.
+func harvestLinkNodes(node *html.Node) []*html.Node {
+	var anchors []*html.Node
+
+	for n := range node.Descendants() {
+		// Skip everything that isn't an <a> element.
+		if n.Type != html.ElementNode || n.DataAtom != atom.A {
+			continue
+		}
+		anchors = append(anchors, n)
+	}
+
+	return anchors
+}