// Package findlinks extracts hyperlinks from HTML documents.
package findlinks

import (
	"fmt"
	"io"
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// A Link encapsulates the data harvested from a link.
type Link struct {
	Href string // value of the anchor's href attribute; empty if the anchor has none
	Text string // text found inside the anchor, with whitespace normalized
}

// FindLinks consumes the given [io.Reader], scraping it of anchor
// tags. Each anchor tag is "unmarshalled" into a [Link]: Href holds the
// value of the anchor's href attribute and Text holds the text nested
// inside the anchor. Links are returned in the order the anchors are
// yielded by a depth-first walk of the parsed document.
//
// If the reader cannot be parsed, FindLinks returns a nil slice and an
// error wrapping the one reported by [html.Parse]. A document without
// anchors yields a nil slice and a nil error.
func FindLinks(r io.Reader) ([]Link, error) {
	doc, err := html.Parse(r)
	if err != nil {
		return nil, fmt.Errorf("can't parse html reader: %w", err)
	}

	var links []Link
	for n := range doc.Descendants() {
		// Only <a> elements are of interest; skip everything else.
		if n.Type != html.ElementNode || n.DataAtom != atom.A {
			continue
		}

		link := Link{Text: anchorText(n)}
		for _, attr := range n.Attr {
			if attr.Key == "href" {
				link.Href = attr.Val
			}
		}
		links = append(links, link)
	}
	return links, nil
}

// anchorText returns the concatenated text nodes found beneath a, with
// leading and trailing whitespace removed and every internal run of
// whitespace collapsed to a single space. Non-text nodes (such as
// comments) contribute nothing.
func anchorText(a *html.Node) string {
	var sb strings.Builder
	for n := range a.Descendants() {
		if n.Type == html.TextNode {
			sb.WriteString(n.Data)
		}
	}
	// Fields splits on any whitespace run, so re-joining with a single
	// space both trims and normalizes the markup's formatting whitespace.
	return strings.Join(strings.Fields(sb.String()), " ")
}