summaryrefslogtreecommitdiff
path: root/classic.go
blob: 653f8f88604a8cfee5679cf2ec6257f28c97b000 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
package main

import (
	"context"
	"fmt"
	"log"
	"net/url"
	"sync"
)

// classic crawls the web concurrently starting from startURL and prints
// each newly discovered URL to standard output as a numbered line.
//
// Pages are fetched by worker goroutines, while the calling goroutine alone
// owns the seen map and decides which URLs get fetched next. Workers hand
// their results back over the worklist channel.
//
//   - maxConcurrency is the capacity of the semaphore that workers must
//     acquire before fetching. It must be at least 1: with a capacity of 0
//     no worker can ever acquire a slot.
//   - maxURLs stops the crawl as soon as the number of distinct URLs seen
//     equals it. A value below 1 never matches, so it imposes no limit.
//   - maxDepth, when greater than zero, prevents packets at that depth from
//     spawning further fetches. Zero or less means no depth limit.
//
// classic returns once the crawl has finished and every worker has exited.
func classic(startURL url.URL, maxConcurrency, maxURLs, maxDepth int) {
	worklist := make(chan []packet)

	// numPendingSends counts batches that have been promised to worklist
	// but not yet received. The main loop runs until none are outstanding.
	var numPendingSends int

	// Seed the crawl from a separate goroutine: worklist is unbuffered, so
	// sending from this goroutine would block before the loop below could
	// receive.
	numPendingSends++
	go func() {
		startPacket := packet{startURL, 0}
		worklist <- []packet{startPacket}
	}()

	// Crawl the web concurrently. Map URLs to their depth (i.e.
	// how many links we have to work through to find the URL).
	seen := make(map[url.URL]int)
	count := 1

	ctx, cancel := context.WithCancel(context.Background())
	var wg sync.WaitGroup
	sema := make(chan struct{}, maxConcurrency)

loop:
	for ; numPendingSends > 0; numPendingSends-- {
		batch := <-worklist
		for _, p := range batch {
			if _, ok := seen[p.url]; ok {
				continue
			}

			fmt.Printf("%d. %s\n", count, &p)
			count++

			seen[p.url] = p.depth
			if len(seen) == maxURLs {
				break loop
			}

			// Track maxDepth here. A maxDepth greater than zero
			// means a finite maxDepth value. If the packets we're
			// seeing reach that depth, don't use their URLs to
			// spawn new fetches.
			if maxDepth > 0 && p.depth == maxDepth {
				continue
			}

			numPendingSends++
			wg.Go(func() {
				// Wait for a semaphore slot, but give up as soon as
				// the crawl is cancelled. Otherwise every queued
				// worker would still perform a fetch after the URL
				// limit was hit, only to have its result discarded.
				select {
				case <-ctx.Done():
					return
				case sema <- struct{}{}:
				}
				defer func() { <-sema }()

				// select chooses at random when both cases are
				// ready, so re-check before doing the expensive fetch.
				if ctx.Err() != nil {
					return
				}

				batch := getBatch(p.url)
				ps := convertToPackets(batch, p.depth+1)

				// After cancellation nobody receives from worklist
				// any more, so the send must be abandonable.
				select {
				case <-ctx.Done():
				case worklist <- ps:
				}
			})
		}
	}

	// Release any workers still waiting for a slot or trying to send,
	// then wait for all of them to exit before returning.
	cancel()
	wg.Wait()
}

// getBatch fetches the document at u and returns the URLs that findURLs
// extracts from it. If the fetch fails, the error is logged and a nil batch
// is returned.
func getBatch(u url.URL) []url.URL {
	doc, err := fetch(u)
	if err != nil {
		// The document is not meaningful when fetch reports an error,
		// so do not hand it to findURLs.
		log.Print(err)
		return nil
	}

	return findURLs(u, doc)
}