package links

import (
	"fmt"
	"io"
	"log"
	"net/url"
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// Parse reads an HTML document from htmlInput, collects the href
// values of its anchor elements, and returns those that
// filterByBaseURL keeps for baseURL.
//
// If the document cannot be parsed, Parse returns a nil slice and an
// error wrapping the underlying parse failure.
func Parse(htmlInput io.Reader, baseURL *url.URL) ([]string, error) {
	found, err := parse(htmlInput)
	if err != nil {
		return nil, fmt.Errorf("can't parse: %w", err)
	}

	return filterByBaseURL(baseURL, found), nil
}

// parse builds the HTML tree for htmlInput and hands it to findHrefs,
// returning the link addresses found there.
//
// An error from [html.Parse] is passed back unchanged, together with
// a nil slice; no other step here produces an error.
func parse(htmlInput io.Reader) ([]string, error) {
	root, err := html.Parse(htmlInput)
	if err != nil {
		return nil, err
	}

	return findHrefs(root), nil
}

// findHrefs walks the descendants of doc and gathers the value of
// every href attribute found on an <a> element. Values are copied
// verbatim, in document order; nothing is resolved against a base URL.
// The result is nil when no such attribute exists.
func findHrefs(doc *html.Node) []string {
	var links []string

	for n := range doc.Descendants() {
		// Only anchor elements are of interest.
		if n.Type != html.ElementNode || n.DataAtom != atom.A {
			continue
		}

		for i := range n.Attr {
			if n.Attr[i].Key != "href" {
				continue
			}
			links = append(links, n.Attr[i].Val)
		}
	}

	return links
}

// filterByBaseURL returns the addresses in hrefs that are on the same
// host as baseURL, each one resolved against baseURL into its absolute
// form.
//
// The motivation is that, when crawling pages to build a sitemap, the
// crawl should never leave the top-level Web domain those pages belong
// to.
//
// An href that does not parse as a URL, or that resolves to a
// different hostname, is logged and left out of the result. Hostnames
// are compared case-insensitively. The result is nil when no href is
// kept.
func filterByBaseURL(baseURL *url.URL, hrefs []string) []string {
	var neighbors []string

	for _, href := range hrefs {
		// Check that href parses as a URL, and at the same
		// time resolve it with respect to baseURL.
		u, err := baseURL.Parse(href)
		if err != nil {
			// u is nil on this path, so report the offending
			// href itself.
			log.Printf("%q: %v", href, err)
			continue
		}

		// An absolute href parses successfully whatever host it
		// points to, so the hostnames must be compared. Hosts are
		// case-insensitive and net/url keeps the case as written,
		// hence EqualFold rather than ==.
		if !strings.EqualFold(u.Hostname(), baseURL.Hostname()) {
			log.Printf("different hostnames: %s %s", u, baseURL)
			continue
		}

		// Keep the resolved form so that relative links come back
		// as absolute URLs.
		neighbors = append(neighbors, u.String())
	}

	return neighbors
}