summaryrefslogtreecommitdiff
path: root/main.py
blob: 80dcce7ee1ea1677b648827dfb77916dd38edd7a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import asyncio
import csv
import itertools
import sys
import time

import aiohttp


def get_urls(filename: str, limit: int | None = None) -> list[str]:
    """Get a list of URLs from FILENAME.

    FILENAME should be that of a CSV file with a field 'Domain',
    denoting the site URL.

    The 'https://' schema is prefixed to each URL.

    If LIMIT is not None, then stop after LIMIT URLs have been read.

    Return the list of URLs.

    """

    urls: list[str] = []

    with open(filename) as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            if limit is not None and i == limit:
                break

            urls.append(f"https://{row['Domain']}")

        return urls


async def ping(urls: list[str], max_concurrency: int | None = None) -> None:
    """Make a GET request to members of URLS.

    If MAX_CONCURRENCY is None, browse every site at once.

    Else, only browse MAX_CONCURRENCY number of sites at a time.

    Print the sites as they get browsed; don't return anything.

    """

    # BUG FIX: the original awaited each request sequentially and never used
    # max_concurrency at all. A semaphore caps in-flight requests; with no cap
    # requested, size it to len(urls) so every site is browsed at once.
    semaphore = asyncio.Semaphore(
        max_concurrency if max_concurrency is not None else max(len(urls), 1)
    )

    # max_field_size is doubled from the 8190-byte default because some sites
    # send oversized headers that would otherwise raise a parse error.
    async with aiohttp.ClientSession(max_field_size=8190 * 2) as session:

        async def fetch(url: str) -> None:
            # One bounded request; errors are reported and swallowed so a
            # single bad site never aborts the whole batch.
            async with semaphore:
                try:
                    async with session.get(url) as response:
                        print(f"Status: {response.status}")

                        if not response.ok:
                            print(f"Got code {response.status} from URL; skipping")
                            return

                        print(f"Content-Type: {response.headers['content-type']}")
                except aiohttp.ClientError as e:
                    print(f"Something bad happened with URL: {e}; skipping")

        # Run all fetches concurrently; the semaphore enforces the cap.
        await asyncio.gather(*(fetch(url) for url in urls))


def main():
    """Read URLs from the Majestic Million CSV, ping them, and time the run.

    An optional first CLI argument caps how many URLs are read.
    """

    limit: int | None = int(sys.argv[1]) if len(sys.argv) > 1 else None

    urls = get_urls("majestic_million.csv", limit)

    started = time.perf_counter()
    asyncio.run(ping(urls))
    elapsed_time = time.perf_counter() - started

    print(f"Execution time: {elapsed_time:.6f} seconds")


if __name__ == "__main__":
    main()