From 172b87e3c552aa9274870ebe88ff99928cee4841 Mon Sep 17 00:00:00 2001 From: "Brandon C. Irizarry" Date: Thu, 16 Apr 2026 16:07:45 -0400 Subject: feat: read N URLs via command-line parameter from CSV file --- main.py | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index b9ce84b..c0a62af 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,40 @@ +import csv +import sys + + +def get_urls(filename: str, limit: int | None = None) -> list[str]: + """Get a list of URLs from FILENAME. + + FILENAME should be that of a CSV file with a field 'Domain' + denoting the site + + The https schema is prefixed to each URL. + + If LIMIT is not None, then stop after LIMIT URLs have been read. + + """ + + urls: list[str] = [] + + with open(filename) as f: + reader = csv.DictReader(f) + for i, row in enumerate(reader): + if limit is not None and i == limit: + break + + urls.append(f"https://{row['Domain']}") + + return urls + + def main(): - print("Hello from crawl-before-you-walk!") + limit: int | None = None + + if len(sys.argv) > 1: + limit = int(sys.argv[1]) + + urls = get_urls("majestic_million.csv", limit) + print(urls) if __name__ == "__main__": -- cgit v1.2.3