diff options
| author | Brandon C. Irizarry <brandon.irizarry@gmail.com> | 2026-04-16 16:07:45 -0400 |
|---|---|---|
| committer | Brandon C. Irizarry <brandon.irizarry@gmail.com> | 2026-04-16 16:07:45 -0400 |
| commit | 172b87e3c552aa9274870ebe88ff99928cee4841 (patch) | |
| tree | 4951b92e0cf26b45d4910fc324479718ce847d1c | |
| parent | 4ba4d981ab9197db9bed3422dec3aa8a91afda8a (diff) | |
feat: read N URLs via command-line parameter from CSV file
| -rw-r--r-- | main.py | 37 |
1 files changed, 36 insertions, 1 deletions
@@ -1,5 +1,40 @@ +import csv +import sys + + +def get_urls(filename: str, limit: int | None = None) -> list[str]: + """Get a list of URLs from FILENAME. + + FILENAME should be that of a CSV file with a field 'Domain' + denoting the site + + The https schema is prefixed to each URL. + + If LIMIT is not None, then stop after LIMIT URLs have been read. + + """ + + urls: list[str] = [] + + with open(filename) as f: + reader = csv.DictReader(f) + for i, row in enumerate(reader): + if limit is not None and i == limit: + break + + urls.append(f"https://{row['Domain']}") + + return urls + + def main(): - print("Hello from crawl-before-you-walk!") + limit: int | None = None + + if len(sys.argv) > 1: + limit = int(sys.argv[1]) + + urls = get_urls("majestic_million.csv", limit) + print(urls) if __name__ == "__main__": |
