summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author demo <demo@antix1> 2026-05-09 11:43:48 -0400
committer demo <demo@antix1> 2026-05-09 11:43:48 -0400
commit 52bb422959147384291dcfbfe5a6142d363862ab (patch)
tree 90a17881bf865d43a0431beefded70e3f227e2e6
parent 7fe62a3f676d810c8df46fa24a7314a2209a9dd2 (diff)
feat: implement "v2"
This is based on the Gophercises solution.
-rw-r--r-- internal/findlinks/findlinks_v2.go 97
-rw-r--r-- internal/test/findlinks_test.go 34
2 files changed, 119 insertions, 12 deletions
diff --git a/internal/findlinks/findlinks_v2.go b/internal/findlinks/findlinks_v2.go
new file mode 100644
index 0000000..e56a961
--- /dev/null
+++ b/internal/findlinks/findlinks_v2.go
@@ -0,0 +1,97 @@
+package findlinks
+
+import (
+ "fmt"
+ "io"
+ "strings"
+
+ "golang.org/x/net/html"
+ "golang.org/x/net/html/atom"
+)
+
+// Parse collects the unmarshalled [Link] data from the HTML document
+// represented by r. The data is returned as a slice; a non-nil error
+// is returned only if [html.Parse] fails on r.
+func Parse(r io.Reader) ([]Link, error) {
+ doc, err := html.Parse(r)
+ if err != nil {
+ return nil, fmt.Errorf("can't parse html reader: %w", err)
+ }
+
+ return parseLinks(doc), nil
+}
+
+// parseLinks returns a [Link] slice from doc. Each element is an
+// "unmarshalled" version of an anchor tag element inside doc.
+func parseLinks(doc *html.Node) []Link {
+ linkNodes := harvestLinkNodes(doc)
+
+ var links []Link
+ for _, linkNode := range linkNodes {
+ var link Link
+
+ // Get the link's inner text.
+ link.Text = harvestText(linkNode)
+
+ // Get the href attribute.
+ for _, a := range linkNode.Attr {
+ if a.Key == "href" {
+ link.Href = a.Val
+ break
+ }
+ }
+
+ links = append(links, link)
+ }
+
+ return links
+}
+
+// harvestText returns the text contained inside n.
+//
+// Note that the text could be under many layers of HTML
+// nesting. Hence the [html.ElementNode] case calls harvestText recursively.
+//
+// For the current project, harvestText's argument is always an
+// anchor-tag element.
+func harvestText(n *html.Node) string {
+ switch n.Type {
+ // The text of an [html.TextNode] is its [html.Node.Data]
+ // field.
+ case html.TextNode:
+ return n.Data
+
+ // The text of an [html.ElementNode] is the aggregate of the
+ // text of its children.
+ case html.ElementNode:
+ var builder strings.Builder
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
+ fmt.Fprintf(&builder, "%s ", harvestText(c))
+ }
+
+ rawResult := builder.String()
+ fields := strings.Fields(rawResult)
+ return strings.Join(fields, " ")
+
+ // Any other kind of node (e.g. [html.CommentNode]) doesn't
+ // have text.
+ default:
+ return ""
+ }
+}
+
+// harvestLinkNodes harvests all of the link nodes contained inside node.
+//
+// For the current project, harvestLinkNodes' argument is always the
+// top-level document node.
+func harvestLinkNodes(node *html.Node) []*html.Node {
+ var links []*html.Node
+
+ for child := range node.Descendants() {
+ if child.Type == html.ElementNode && child.DataAtom == atom.A {
+ links = append(links, child)
+ }
+ }
+
+ return links
+}
diff --git a/internal/test/findlinks_test.go b/internal/test/findlinks_test.go
index 1bc2f20..37c6152 100644
--- a/internal/test/findlinks_test.go
+++ b/internal/test/findlinks_test.go
@@ -1,6 +1,8 @@
package test
import (
+ "fmt"
+ "io"
"os"
"testing"
@@ -8,14 +10,16 @@ import (
"github.com/google/go-cmp/cmp"
)
-func findLinksFile(filename string) ([]findlinks.Link, error) {
+type parserFn func(io.Reader) ([]findlinks.Link, error)
+
+func findLinksFile(filename string, parser parserFn) ([]findlinks.Link, error) {
f, err := os.Open(filename)
if err != nil {
panic("can't open test file")
}
defer f.Close()
- return findlinks.FindLinks(f)
+ return parser(f)
}
func TestFindlinks(t *testing.T) {
@@ -37,16 +41,22 @@ func TestFindlinks(t *testing.T) {
}
for _, test := range tests {
- t.Run(test.filename, func(t *testing.T) {
- links, err := findLinksFile(test.filename)
- if err != nil {
- t.Error(err)
- }
-
- if !cmp.Equal(links, test.links) {
- t.Errorf("got %v, want %v", links, test.links)
- }
- })
+ parsers := []parserFn{findlinks.FindLinks, findlinks.Parse}
+
+ for i, p := range parsers {
+ testName := fmt.Sprintf("Parser %d %s", i+1, test.filename)
+
+ t.Run(testName, func(t *testing.T) {
+ links, err := findLinksFile(test.filename, p)
+ if err != nil {
+ t.Error(err)
+ }
+
+ if !cmp.Equal(links, test.links) {
+ t.Errorf("got %v, want %v", links, test.links)
+ }
+ })
+ }
}
}