diff options
| author | demo <demo@antix1> | 2026-05-28 17:28:59 -0400 |
|---|---|---|
| committer | demo <demo@antix1> | 2026-05-28 17:28:59 -0400 |
| commit | 848481fd1a738225664fa398275fcd1afd61c4ff (patch) | |
| tree | 9883e7f82b358de00e1d7b3b866f71f64d719a43 | |
| parent | 0d7056446009f38ea44249f2f74ab70b05410cfb (diff) | |
feat: implement shortcode feature
| -rw-r--r-- | classic.go | 4 | ||||
| -rw-r--r-- | main.go | 30 | ||||
| -rw-r--r-- | shortcodes.go | 50 |
3 files changed, 80 insertions, 4 deletions
@@ -75,6 +75,10 @@ loop: cancel() wg.Wait() + // FIXME: eventually, when all crawlers terminate properly, we + // can move this out to main: that is, all crawlers will + // return the same seen map that will be processed by this + // code. fmt.Println("Generating sitemap...") sitemap, err := toSitemap(seen, maxDepth, maxURLs) if err != nil { @@ -3,6 +3,7 @@ package main import ( + "errors" "flag" "fmt" "log" @@ -18,9 +19,11 @@ func main() { log.SetFlags(log.LstdFlags | log.Lshortfile) maxConcurrency := flag.Int("c", 0, "Maximum number of concurrent queue pushes") - startRawURL := flag.String("url", "", "Entry-point URL") + urlArg := flag.String("url", "", "Entry-point URL") maxURLs := flag.Int("max", 0, "Maximum number of URLs to collect (omitted or 0 means no limit)") maxDepth := flag.Int("depth", 0, "Maximum URL depth (omitted or 0 means no limit)") + shortcode := flag.String("shortcode", "", "URL shortcode") + shortcodeFilename := flag.String("scfile", "urls.csv", "Shortcode CSV file") flag.Parse() @@ -34,8 +37,9 @@ func main() { log.Fatalf("Invalid -c argument: %d", *maxConcurrency) } - if *startRawURL == "" { - log.Fatal("Missing -url argument") + startRawURL, err := chooseFrom(*urlArg, *shortcode, *shortcodeFilename) + if err != nil { + log.Fatal(err) } if *maxURLs < 0 { @@ -46,7 +50,7 @@ func main() { log.Fatalf("Invalid -depth argument: %d", *maxDepth) } - startURL, err := convertToURL(*startRawURL) + startURL, err := convertToURL(startRawURL) if err != nil { log.Fatal(err) } @@ -58,6 +62,24 @@ func main() { }) } +// chooseFrom determines whether to use a -url or -shortcode +// argument. If both are present or absent, an error is returned. If +// exactly one is present, return that one. +func chooseFrom(urlArg, shortcode, shortcodeFilename string) (string, error) { + urlAbsent := (urlArg == "") + shortcodeAbsent := (shortcode == "") + + if urlAbsent == shortcodeAbsent { + return "", errors.New("-url and -shortcode flags either both missing or both present") + } + + if urlAbsent { + return getURLFromShortcode(shortcodeFilename, shortcode) + } + + return urlArg, nil +} + // convertToURL parses the given rawURL into a [url.URL]. If the // rawURL is missing a scheme, "https://" is prepended before parsing. // diff --git a/shortcodes.go b/shortcodes.go new file mode 100644 index 0000000..978b39b --- /dev/null +++ b/shortcodes.go @@ -0,0 +1,50 @@ +package main + +import ( + "encoding/csv" + "fmt" + "io" + "os" +) + +// getURLFromShortcode looks for the shortcode inside the file +// specified by filename (default "urls.csv" in the top-level project +// directory), and returns the associated URL. +// +// Right now we perform an O(n) search across the entire file just to +// get a single URL, so there's room for improvement here. +func getURLFromShortcode(filename, shortcode string) (string, error) { + f, err := os.Open(filename) + if err != nil { + return "", fmt.Errorf("can't open shortcode file: %w", err) + } + defer f.Close() + + r := csv.NewReader(f) + + // Read, vet, and discard the column-header row. + header, err := r.Read() + if err != nil { + return "", fmt.Errorf("missing header line: %w", err) + } + + if header[0] != "shortcode" || header[1] != "url" { + return "", fmt.Errorf("invalid CSV header: %v", header) + } + + for i := 1; ; i++ { + record, err := r.Read() + if err == io.EOF { + break + } + if err != nil { + return "", fmt.Errorf("error reading %s, row %d: %w", filename, i, err) + } + + if record[0] == shortcode { + return record[1], nil + } + } + + return "", fmt.Errorf("no URL with shortcode %s", shortcode) +} |
