blob: c0a62af73acabbc2cd2e5a29ee61c0f4491b4363 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
import csv
import sys
def get_urls(filename: str, limit: int | None = None) -> list[str]:
"""Get a list of URLs from FILENAME.
FILENAME should be that of a CSV file with a field 'Domain'
denoting the site
The https schema is prefixed to each URL.
If LIMIT is not None, then stop after LIMIT URLs have been read.
"""
urls: list[str] = []
with open(filename) as f:
reader = csv.DictReader(f)
for i, row in enumerate(reader):
if limit is not None and i == limit:
break
urls.append(f"https://{row['Domain']}")
return urls
def main():
    """Print the URLs read from majestic_million.csv.

    The first command-line argument, when present, is parsed as an
    integer and caps how many URLs are read; with no argument the whole
    file is read.
    """
    cli_args = sys.argv[1:]
    limit: int | None = int(cli_args[0]) if cli_args else None
    print(get_urls("majestic_million.csv", limit))
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|