summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- main.py 37
1 files changed, 36 insertions, 1 deletions
diff --git a/main.py b/main.py
index b9ce84b..c0a62af 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,40 @@
+import csv
+import sys
+
+
+def get_urls(filename: str, limit: int | None = None) -> list[str]:
+    """Return the URLs listed in the CSV file FILENAME.
+
+    FILENAME must name a UTF-8 CSV file whose header row has a
+    'Domain' field denoting the site.
+
+    The https scheme is prefixed to each domain.
+
+    If LIMIT is not None, stop after LIMIT URLs have been read.
+    A negative LIMIT never matches a row index, so all rows are read.
+    """
+    urls: list[str] = []
+
+    # newline="" is what the csv module asks for, so that newlines
+    # embedded in quoted fields are parsed correctly.
+    with open(filename, newline="", encoding="utf-8") as f:
+        for i, row in enumerate(csv.DictReader(f)):
+            if limit is not None and i == limit:
+                break
+
+            urls.append(f"https://{row['Domain']}")
+
+    return urls
+
+
def main():
-    print("Hello from crawl-before-you-walk!")
+    """Print the URLs read from majestic_million.csv.
+
+    An optional first command-line argument caps how many are read.
+    """
+    limit = int(sys.argv[1]) if len(sys.argv) > 1 else None
+    urls = get_urls("majestic_million.csv", limit)
+    print(urls)
if __name__ == "__main__":