diff options
| author | demo <demo@antix1> | 2026-05-09 11:43:48 -0400 |
|---|---|---|
| committer | demo <demo@antix1> | 2026-05-09 11:43:48 -0400 |
| commit | 52bb422959147384291dcfbfe5a6142d363862ab (patch) | |
| tree | 90a17881bf865d43a0431beefded70e3f227e2e6 | |
| parent | 7fe62a3f676d810c8df46fa24a7314a2209a9dd2 (diff) | |
feat: implement "v2"
This is based on the Gophercises solution.
| -rw-r--r-- | internal/findlinks/findlinks_v2.go | 97 | ||||
| -rw-r--r-- | internal/test/findlinks_test.go | 34 |
2 files changed, 119 insertions, 12 deletions
diff --git a/internal/findlinks/findlinks_v2.go b/internal/findlinks/findlinks_v2.go new file mode 100644 index 0000000..e56a961 --- /dev/null +++ b/internal/findlinks/findlinks_v2.go @@ -0,0 +1,97 @@ +package findlinks + +import ( + "fmt" + "io" + "strings" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +// Parse collects the unmarshalled [Link] data from the HTML document +// represented by r. The data is returned as a slice. On failure the slice +// is nil and the returned error wraps the one reported by [html.Parse]. +func Parse(r io.Reader) ([]Link, error) { + doc, err := html.Parse(r) + if err != nil { + return nil, fmt.Errorf("can't parse html reader: %w", err) + } + + return parseLinks(doc), nil +} + +// parseLinks returns a [Link] slice from doc. Each element is an +// "unmarshalled" version of an anchor tag element inside doc. +func parseLinks(doc *html.Node) []Link { + linkNodes := harvestLinkNodes(doc) + + var links []Link + for _, linkNode := range linkNodes { + var link Link + + // Get the link's inner text. + link.Text = harvestText(linkNode) + + // Get the first href attribute; the loop stops at the first match. + for _, a := range linkNode.Attr { + if a.Key == "href" { + link.Href = a.Val + break + } + } + + links = append(links, link) + } + + return links +} + +// harvestText returns the text contained inside n. +// +// Note that the text could be under many layers of HTML +// nesting. Hence the [html.ElementNode] case calls harvestText recursively. +// +// For the current project, harvestText's argument is always an +// anchor-tag element. +func harvestText(n *html.Node) string { + switch n.Type { + // The text of an [html.TextNode] is its [html.Node.Data] + // field. + case html.TextNode: + return n.Data + + // The text of an [html.ElementNode] is the aggregate of the + // text of its children, with whitespace runs collapsed to one space and the ends trimmed. 
+ case html.ElementNode: + var builder strings.Builder + for c := n.FirstChild; c != nil; c = c.NextSibling { + fmt.Fprintf(&builder, "%s ", harvestText(c)) + } + + rawResult := builder.String() + fields := strings.Fields(rawResult) + return strings.Join(fields, " ") + + // Any other kind of node (e.g. [html.CommentNode]) doesn't + // have text. + default: + return "" + } +} + +// harvestLinkNodes returns every anchor (<a>) element found among the descendants of node. +// +// For the current project, harvestLinkNodes' argument is always the +// top-level document node. +func harvestLinkNodes(node *html.Node) []*html.Node { + var links []*html.Node + + for child := range node.Descendants() { + if child.Type == html.ElementNode && child.DataAtom == atom.A { + links = append(links, child) + } + } + + return links +} diff --git a/internal/test/findlinks_test.go b/internal/test/findlinks_test.go index 1bc2f20..37c6152 100644 --- a/internal/test/findlinks_test.go +++ b/internal/test/findlinks_test.go @@ -1,6 +1,8 @@ package test import ( + "fmt" + "io" "os" "testing" @@ -8,14 +10,16 @@ import ( "github.com/google/go-cmp/cmp" ) -func findLinksFile(filename string) ([]findlinks.Link, error) { +type parserFn func(io.Reader) ([]findlinks.Link, error) + +func findLinksFile(filename string, parser parserFn) ([]findlinks.Link, error) { f, err := os.Open(filename) if err != nil { panic("can't open test file") } defer f.Close() - return findlinks.FindLinks(f) + return parser(f) } func TestFindlinks(t *testing.T) { @@ -37,16 +41,22 @@ func TestFindlinks(t *testing.T) { } for _, test := range tests { - t.Run(test.filename, func(t *testing.T) { - links, err := findLinksFile(test.filename) - if err != nil { - t.Error(err) - } - - if !cmp.Equal(links, test.links) { - t.Errorf("got %v, want %v", links, test.links) - } - }) + parsers := []parserFn{findlinks.FindLinks, findlinks.Parse} + + for i, p := range parsers { + testName := fmt.Sprintf("Parser %d %s", i+1, test.filename) + + t.Run(testName, func(t 
*testing.T) { + links, err := findLinksFile(test.filename, p) + if err != nil { + t.Error(err) + } + + if !cmp.Equal(links, test.links) { + t.Errorf("got %v, want %v", links, test.links) + } + }) + } } } |
