diff options
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/findlinks/findlinks.go | 64 |
1 files changed, 44 insertions, 20 deletions
diff --git a/internal/findlinks/findlinks.go b/internal/findlinks/findlinks.go
index 619d8a4..00635ec 100644
--- a/internal/findlinks/findlinks.go
+++ b/internal/findlinks/findlinks.go
@@ -24,33 +24,57 @@ func FindLinks(r io.Reader) ([]Link, error) {
 		return nil, fmt.Errorf("can't parse html reader: %w", err)
 	}
 
-	var links []Link
+	links := iterHTML(doc, nil)
 
-	for n := range doc.Descendants() {
-		trimmedData := strings.TrimSpace(n.Data)
+	return links, nil
+}
+
+func iterHTML(n *html.Node, buffer []Link) []Link {
+	if n.Type == html.ElementNode && n.DataAtom == atom.A {
+		var link Link
+
+		// Href
+		link.Href = extractHref(n)
 
-		if n.Type == html.TextNode && trimmedData != "" {
-			fmt.Printf("Data: %s\n", trimmedData)
-			for anc := range n.Ancestors() {
-				fmt.Printf("\tAncestor: %v\n", anc.Data)
-			}
-			continue
+		// Text
+		chunks := extractText(n, nil)
+		link.Text = strings.Join(chunks, " ")
+
+		buffer = append(buffer, link)
+	} else {
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			buffer = iterHTML(c, buffer)
 		}
+	}
 
-		if n.Type == html.ElementNode && n.DataAtom == atom.A {
-			var link Link
+	return buffer
+}
 
-			// Scan the href.
-			for _, a := range n.Attr {
-				if a.Key == "href" {
-					link.Href = a.Val
-				}
-			}
+// extractHref returns the first href attribute of anchor.
+func extractHref(anchor *html.Node) string {
+	var href string
 
-			// FIXME: for now, only scan for hrefs.
-			links = append(links, link)
+	for _, a := range anchor.Attr {
+		if a.Key == atom.Href.String() {
+			href = a.Val
+			break
 		}
 	}
 
-	return links, nil
+	return href
+}
+
+// extractText recursively scans anchor to return the various nested
+// pieces of text content.
+func extractText(anchor *html.Node, buffer []string) []string {
+	for c := anchor.FirstChild; c != nil; c = c.NextSibling {
+		switch c.Type {
+		case html.TextNode:
+			buffer = append(buffer, c.Data)
+		case html.ElementNode:
+			buffer = extractText(c, buffer)
+		}
+	}
+
+	return buffer
 }
