import asyncio
import csv
import sys
import time

import aiohttp


def get_urls(filename: str, limit: int | None = None) -> list[str]:
    """Get a list of URLs from FILENAME.

    FILENAME should be that of a CSV file with a field 'Domain', denoting
    the site URL. The 'https://' schema is prefixed to each URL.

    If LIMIT is not None, then stop after LIMIT URLs have been read.

    Return the list of URLs.
    """
    urls: list[str] = []
    # newline="" is the csv-module-recommended mode for reading CSV files.
    with open(filename, newline="") as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            if limit is not None and i == limit:
                break
            urls.append(f"https://{row['Domain']}")
    return urls


async def ping(urls: list[str], max_concurrency: int | None = None) -> None:
    """Make a GET request to members of URLS.

    If MAX_CONCURRENCY is None, browse every site at once. Else, only
    browse MAX_CONCURRENCY number of sites at a time.

    Print the sites as they get browsed; don't return anything.

    NOTE: the previous implementation awaited each request sequentially,
    so MAX_CONCURRENCY was silently ignored; requests now run
    concurrently under an asyncio.Semaphore bound.
    """
    # With no bound requested, give one permit per URL so every request
    # may be in flight at once (minimum 1 so Semaphore(0) never occurs).
    semaphore = asyncio.Semaphore(max_concurrency or max(len(urls), 1))

    async def _fetch(session: aiohttp.ClientSession, url: str) -> None:
        # One bounded, best-effort GET; errors are reported, not raised.
        async with semaphore:
            try:
                async with session.get(url) as response:
                    print(f"Status: {response.status}")
                    if not response.ok:
                        print(f"Got code {response.status} from URL; skipping")
                        return
                    print(f"Content-Type: {response.headers['content-type']}")
            except aiohttp.ClientError as e:
                print(f"Something bad happened with URL: {e}; skipping")

    # max_field_size is doubled from the default because some sites send
    # oversized headers that would otherwise abort the response.
    async with aiohttp.ClientSession(max_field_size=8190 * 2) as session:
        await asyncio.gather(*(_fetch(session, url) for url in urls))


def main() -> None:
    """Read URLs (optionally limited by argv[1]) and time ping() over them."""
    limit: int | None = None
    if len(sys.argv) > 1:
        limit = int(sys.argv[1])
    urls = get_urls("majestic_million.csv", limit)
    start_time = time.perf_counter()
    asyncio.run(ping(urls))
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    print(f"Execution time: {elapsed_time:.6f} seconds")


if __name__ == "__main__":
    main()