summary | refs | log | tree | commit | diff
path: root/internal/findlinks
diff options
context:
space:
mode:
Diffstat (limited to 'internal/findlinks')
-rw-r--r-- internal/findlinks/findlinks.go 64
1 files changed, 44 insertions, 20 deletions
diff --git a/internal/findlinks/findlinks.go b/internal/findlinks/findlinks.go
index 619d8a4..00635ec 100644
--- a/internal/findlinks/findlinks.go
+++ b/internal/findlinks/findlinks.go
@@ -24,33 +24,57 @@ func FindLinks(r io.Reader) ([]Link, error) {
return nil, fmt.Errorf("can't parse html reader: %w", err)
}
- var links []Link
+ links := iterHTML(doc, nil)
- for n := range doc.Descendants() {
- trimmedData := strings.TrimSpace(n.Data)
+ return links, nil
+}
+
+func iterHTML(n *html.Node, buffer []Link) []Link {
+ if n.Type == html.ElementNode && n.DataAtom == atom.A {
+ var link Link
+
+ // Href
+ link.Href = extractHref(n)
- if n.Type == html.TextNode && trimmedData != "" {
- fmt.Printf("Data: %s\n", trimmedData)
- for anc := range n.Ancestors() {
- fmt.Printf("\tAncestor: %v\n", anc.Data)
- }
- continue
+ // Text
+ chunks := extractText(n, nil)
+ link.Text = strings.Join(chunks, " ")
+
+ buffer = append(buffer, link)
+ } else {
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
+ buffer = iterHTML(c, buffer)
}
+ }
- if n.Type == html.ElementNode && n.DataAtom == atom.A {
- var link Link
+ return buffer
+}
- // Scan the href.
- for _, a := range n.Attr {
- if a.Key == "href" {
- link.Href = a.Val
- }
- }
+// extractHref returns the value of anchor's first href attribute, or "" if it has none.
+func extractHref(anchor *html.Node) string {
+ var href string
- // FIXME: for now, only scan for hrefs.
- links = append(links, link)
+ for _, a := range anchor.Attr {
+ if a.Key == atom.Href.String() {
+ href = a.Val
+ break
}
}
- return links, nil
+ return href
+}
+
+// extractText walks anchor's children in document order, descending into element
+// nodes, and appends the data of every text node to buffer; it returns the result.
+func extractText(anchor *html.Node, buffer []string) []string {
+ for c := anchor.FirstChild; c != nil; c = c.NextSibling {
+ switch c.Type {
+ case html.TextNode:
+ buffer = append(buffer, c.Data)
+ case html.ElementNode:
+ buffer = extractText(c, buffer)
+ }
+ }
+
+ return buffer
}