diff options
| -rw-r--r-- | main.py | 37 |
1 file changed, 36 insertions, 1 deletion
@@ -1,5 +1,40 @@ +import csv +import sys + + +def get_urls(filename: str, limit: int | None = None) -> list[str]: + """Get a list of URLs from FILENAME. + + FILENAME should be that of a CSV file with a field 'Domain' + denoting the site + + The https schema is prefixed to each URL. + + If LIMIT is not None, then stop after LIMIT URLs have been read. + + """ + + urls: list[str] = [] + + with open(filename) as f: + reader = csv.DictReader(f) + for i, row in enumerate(reader): + if limit is not None and i == limit: + break + + urls.append(f"https://{row['Domain']}") + + return urls + + def main(): - print("Hello from crawl-before-you-walk!") + limit: int | None = None + + if len(sys.argv) > 1: + limit = int(sys.argv[1]) + + urls = get_urls("majestic_million.csv", limit) + print(urls) if __name__ == "__main__": |
