diff options
| author | demo <demo@antix1> | 2026-05-08 12:00:27 -0400 |
|---|---|---|
| committer | demo <demo@antix1> | 2026-05-08 12:00:27 -0400 |
| commit | ac4464a77e2b4c1c1c7a9fa1974e8b1f714b5040 (patch) | |
| tree | d4615db2f572636bd3ee23b4723239abbd54e1d0 /internal/findlinks | |
| parent | 89841980a55e809778e3f1af778b8962cd540aa0 (diff) | |
feat: include link text in final output
This meant using the older-style node-iteration technique, instead of
trying to one-shot it with html.Descendants.
Diffstat (limited to 'internal/findlinks')
| -rw-r--r-- | internal/findlinks/findlinks.go | 64 |
1 file changed, 44 insertions, 20 deletions
```diff
diff --git a/internal/findlinks/findlinks.go b/internal/findlinks/findlinks.go
index 619d8a4..00635ec 100644
--- a/internal/findlinks/findlinks.go
+++ b/internal/findlinks/findlinks.go
@@ -24,33 +24,57 @@ func FindLinks(r io.Reader) ([]Link, error) {
 		return nil, fmt.Errorf("can't parse html reader: %w", err)
 	}

-	var links []Link
+	links := iterHTML(doc, nil)

-	for n := range doc.Descendants() {
-		trimmedData := strings.TrimSpace(n.Data)
+	return links, nil
+}
+
+func iterHTML(n *html.Node, buffer []Link) []Link {
+	if n.Type == html.ElementNode && n.DataAtom == atom.A {
+		var link Link
+
+		// Href
+		link.Href = extractHref(n)

-		if n.Type == html.TextNode && trimmedData != "" {
-			fmt.Printf("Data: %s\n", trimmedData)
-			for anc := range n.Ancestors() {
-				fmt.Printf("\tAncestor: %v\n", anc.Data)
-			}
-			continue
+		// Text
+		chunks := extractText(n, nil)
+		link.Text = strings.Join(chunks, " ")
+
+		buffer = append(buffer, link)
+	} else {
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			buffer = iterHTML(c, buffer)
 		}
+	}

-		if n.Type == html.ElementNode && n.DataAtom == atom.A {
-			var link Link
+	return buffer
+}

-			// Scan the href.
-			for _, a := range n.Attr {
-				if a.Key == "href" {
-					link.Href = a.Val
-				}
-			}
+// extractHref returns the first href attribute of anchor.
+func extractHref(anchor *html.Node) string {
+	var href string

-			// FIXME: for now, only scan for hrefs.
-			links = append(links, link)
+	for _, a := range anchor.Attr {
+		if a.Key == atom.Href.String() {
+			href = a.Val
+			break
 		}
 	}

-	return links, nil
+	return href
+}
+
+// extractText recursively scans anchor to return the various nested
+// pieces of text content.
+func extractText(anchor *html.Node, buffer []string) []string {
+	for c := anchor.FirstChild; c != nil; c = c.NextSibling {
+		switch c.Type {
+		case html.TextNode:
+			buffer = append(buffer, c.Data)
+		case html.ElementNode:
+			buffer = extractText(c, buffer)
+		}
+	}
+
+	return buffer
 }
```
