blob: d874d42cb07b4f511000fdc57fa031cc2ca1ae92 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
import asyncio
import csv
import sys
import time
import aiohttp
def get_urls(filename: str, limit: int | None = None) -> list[str]:
"""Get a list of URLs from FILENAME.
FILENAME should be that of a CSV file with a field 'Domain',
denoting the site URL.
The 'https://' schema is prefixed to each URL.
If LIMIT is not None, then stop after LIMIT URLs have been read.
Return the list of URLs.
"""
urls: list[str] = []
with open(filename) as f:
reader = csv.DictReader(f)
for i, row in enumerate(reader):
if limit is not None and i == limit:
break
urls.append(f"https://{row['Domain']}")
return urls
async def ping(urls: list[str], max_concurrency=None) -> None:
    """Make a GET request to members of URLS.

    If MAX_CONCURRENCY is None, browse every site at once.
    Else, only browse MAX_CONCURRENCY number of sites at a time.

    Print the sites as they get browsed; don't return anything.
    """
    # BUG FIX: the original awaited each GET in sequence and ignored
    # max_concurrency entirely. A semaphore caps in-flight requests;
    # with no cap, size it so every URL can start at once.
    cap = max_concurrency if max_concurrency is not None else max(len(urls), 1)
    semaphore = asyncio.Semaphore(cap)

    async def _fetch(session: aiohttp.ClientSession, url: str) -> None:
        # One bounded GET; failures are reported and swallowed so a
        # single bad site never aborts the whole run.
        async with semaphore:
            try:
                async with session.get(url) as response:
                    print(f"Status: {response.status}")
                    if not response.ok:
                        print(f"Got code {response.status} from URL; skipping")
                        return
                    print(f"Content-Type: {response.headers['content-type']}")
            except asyncio.TimeoutError:
                # BUG FIX: TimeoutError is not a ClientError subclass in
                # older aiohttp; uncaught, it cancelled the whole batch.
                print("Timed out on URL; skipping")
            except aiohttp.ClientError as e:
                print(f"Something bad happened with URL: {e}; skipping")

    # BUG FIX: ClientSession's timeout must be a ClientTimeout object;
    # a bare float (timeout=1.0) is deprecated/rejected by aiohttp 3.x.
    timeout = aiohttp.ClientTimeout(total=1.0)
    async with aiohttp.ClientSession(max_field_size=8190 * 2, timeout=timeout) as session:
        # Launch every fetch concurrently; the semaphore enforces the cap.
        await asyncio.gather(*(_fetch(session, url) for url in urls))
def main():
    """Entry point: read domains from majestic_million.csv, ping them
    all, and report the wall-clock time taken.

    An optional first command-line argument limits how many domains
    are read from the CSV.
    """
    # No argument means no limit — read the whole file.
    limit: int | None = int(sys.argv[1]) if len(sys.argv) > 1 else None
    urls = get_urls("majestic_million.csv", limit)

    start_time = time.perf_counter()
    asyncio.run(ping(urls))
    elapsed_time = time.perf_counter() - start_time
    print(f"Execution time: {elapsed_time:.6f} seconds")


if __name__ == "__main__":
    main()
|