From 848481fd1a738225664fa398275fcd1afd61c4ff Mon Sep 17 00:00:00 2001 From: demo Date: Thu, 28 May 2026 17:28:59 -0400 Subject: feat: implement shortcode feature --- classic.go | 4 ++++ main.go | 30 ++++++++++++++++++++++++++---- shortcodes.go | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 4 deletions(-) create mode 100644 shortcodes.go diff --git a/classic.go b/classic.go index cdf8606..b7a3d8d 100644 --- a/classic.go +++ b/classic.go @@ -75,6 +75,10 @@ loop: cancel() wg.Wait() + // FIXME: eventually, when all crawlers terminate properly, we + // can move this out to main: that is, all crawlers will + // return the same seen map that will be processed by this + // code. fmt.Println("Generating sitemap...") sitemap, err := toSitemap(seen, maxDepth, maxURLs) if err != nil { diff --git a/main.go b/main.go index 3582ed5..b48f6c9 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,7 @@ package main import ( + "errors" "flag" "fmt" "log" @@ -18,9 +19,11 @@ func main() { log.SetFlags(log.LstdFlags | log.Lshortfile) maxConcurrency := flag.Int("c", 0, "Maximum number of concurrent queue pushes") - startRawURL := flag.String("url", "", "Entry-point URL") + urlArg := flag.String("url", "", "Entry-point URL") maxURLs := flag.Int("max", 0, "Maximum number of URLs to collect (omitted or 0 means no limit)") maxDepth := flag.Int("depth", 0, "Maximum URL depth (omitted or 0 means no limit)") + shortcode := flag.String("shortcode", "", "URL shortcode") + shortcodeFilename := flag.String("scfile", "urls.csv", "Shortcode CSV file") flag.Parse() @@ -34,8 +37,9 @@ func main() { log.Fatalf("Invalid -c argument: %d", *maxConcurrency) } - if *startRawURL == "" { - log.Fatal("Missing -url argument") + startRawURL, err := chooseFrom(*urlArg, *shortcode, *shortcodeFilename) + if err != nil { + log.Fatal(err) } if *maxURLs < 0 { @@ -46,7 +50,7 @@ func main() { log.Fatalf("Invalid -depth argument: %d", *maxDepth) } - startURL, err := 
// chooseFrom decides which entry-point URL the crawl should start from,
// given the values of the -url, -shortcode, and -scfile flags.
//
// Exactly one of urlArg and shortcode must be non-empty:
//   - if only urlArg is set, it is returned unchanged;
//   - if only shortcode is set, it is resolved to a URL by looking it up
//     in the CSV file named by shortcodeFilename;
//   - if both are set or both are empty, an error is returned.
func chooseFrom(urlArg, shortcode, shortcodeFilename string) (string, error) {
	urlAbsent := urlArg == ""
	shortcodeAbsent := shortcode == ""

	// The flags are mutually exclusive and one of them is required, so
	// the only valid states are those where exactly one is absent.
	if urlAbsent == shortcodeAbsent {
		return "", errors.New("-url and -shortcode flags either both missing or both present")
	}

	if urlAbsent {
		return getURLFromShortcode(shortcodeFilename, shortcode)
	}

	return urlArg, nil
}

// getURLFromShortcode opens the CSV file named by filename, looks for a
// row whose first column equals shortcode, and returns that row's second
// column (the URL).
//
// The file must start with a header row whose first two columns are
// "shortcode" and "url". An error is returned if the file cannot be
// opened, the header is missing or invalid, a row cannot be parsed, or no
// row matches shortcode.
//
// The lookup is a linear scan of the file, stopping at the first match.
func getURLFromShortcode(filename, shortcode string) (string, error) {
	f, err := os.Open(filename)
	if err != nil {
		return "", fmt.Errorf("can't open shortcode file: %w", err)
	}
	defer f.Close()

	return findShortcodeURL(f, filename, shortcode)
}

// findShortcodeURL scans CSV data from src for shortcode and returns the
// associated URL; filename is used only to label row-level read errors.
func findShortcodeURL(src io.Reader, filename, shortcode string) (string, error) {
	r := csv.NewReader(src)

	// Read, vet, and discard the column-header row.
	header, err := r.Read()
	if err != nil {
		return "", fmt.Errorf("missing header line: %w", err)
	}

	// Check the length first: a header with a single column would
	// otherwise panic on header[1] instead of reporting a bad file.
	if len(header) < 2 || header[0] != "shortcode" || header[1] != "url" {
		return "", fmt.Errorf("invalid CSV header: %v", header)
	}

	// With the default FieldsPerRecord of 0, csv.Reader requires every
	// later record to have as many fields as the header, so indexing
	// record[0] and record[1] below cannot go out of range.
	for row := 1; ; row++ {
		record, err := r.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return "", fmt.Errorf("error reading %s, row %d: %w", filename, row, err)
		}

		if record[0] == shortcode {
			return record[1], nil
		}
	}

	return "", fmt.Errorf("no URL with shortcode %s", shortcode)
}