summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- classic.go 14
-rw-r--r-- fetch.go 11
2 files changed, 17 insertions, 8 deletions
diff --git a/classic.go b/classic.go
index 653f8f8..19450c1 100644
--- a/classic.go
+++ b/classic.go
@@ -1,11 +1,14 @@
package main
import (
+ "bytes"
"context"
"fmt"
"log"
"net/url"
"sync"
+
+ "golang.org/x/net/html"
)
func classic(startURL url.URL, maxConcurrency, maxURLs, maxDepth int) {
@@ -73,12 +76,19 @@ loop:
}
func getBatch(u url.URL) []url.URL {
- doc, err := fetch(u)
+ htmlBytes, err := fetch(u)
+ if err != nil {
+ log.Print(err)
+ return nil
+ }
+
+ htmlDoc, err := html.Parse(bytes.NewReader(htmlBytes))
if err != nil {
log.Print(err)
+ return nil
}
- batch := findURLs(u, doc)
+ batch := findURLs(u, htmlDoc)
return batch
}
diff --git a/fetch.go b/fetch.go
index f81f327..446e3fd 100644
--- a/fetch.go
+++ b/fetch.go
@@ -2,10 +2,9 @@ package main
import (
"fmt"
+ "io"
"net/http"
"net/url"
-
- "golang.org/x/net/html"
)
// fetch makes a GET request to refURL, returning the HTML contents of
@@ -14,7 +13,7 @@ import (
// A [url.URL] type is used for refURL to simplify recursive or else
// repeated use of this function when crawling webpages to, say, build
// a sitemap.
-func fetch(refURL url.URL) (*html.Node, error) {
+func fetch(refURL url.URL) ([]byte, error) {
rawURL := refURL.String()
// For now we leave the client unconfigured.
@@ -35,10 +34,10 @@ func fetch(refURL url.URL) (*html.Node, error) {
return nil, fmt.Errorf("status for %s for %s: %s", http.MethodGet, rawURL, resp.Status)
}
- htmlDoc, err := html.Parse(resp.Body)
+ htmlBytes, err := io.ReadAll(resp.Body)
if err != nil {
- return nil, fmt.Errorf("can't parse response body: %w", err)
+ return nil, fmt.Errorf("can't read response body into byte buffer: %w", err)
}
- return htmlDoc, nil
+ return htmlBytes, nil
}