Advertisement
fenixD3

Exercise: Web Crawler

Jun 17th, 2024
910
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Go 2.71 KB | None | 0 0
  1. package main
  2.  
  3. import (
  4.     "fmt"
  5.     "sync"
  6. )
  7.  
  8. type Fetcher interface {
  9.     // Fetch returns the body of URL and
  10.     // a slice of URLs found on that page.
  11.     Fetch(url string) (body string, urls []string, err error)
  12. }
  13.  
  14. // Crawl uses fetcher to recursively crawl
  15. // pages starting with url, to a maximum of depth.
  16. func Crawl(url string, depth int, fetcher Fetcher) {
  17.     // TODO: Fetch URLs in parallel.
  18.     // TODO: Don't fetch the same URL twice.
  19.     // This implementation doesn't do either:
  20.     type CachedData struct {
  21.         body string
  22.         urls []string
  23.     }
  24.     cached_urls_data := make(map[string]CachedData)
  25.     var mutex sync.Mutex
  26.  
  27.     var walk func(url string, depth int, ch chan string)
  28.     walk = func(url string, depth int, ch chan string) {
  29.         defer close(ch)
  30.         if depth <= 0 {
  31.             return
  32.         }
  33.  
  34.         var body string
  35.         var urls []string
  36.         var err error
  37.  
  38.         mutex.Lock()
  39.         cached_data, was_stored := cached_urls_data[url]
  40.         mutex.Unlock()
  41.  
  42.         if !was_stored {
  43.             body, urls, err = fetcher.Fetch(url)
  44.             if err != nil {
  45.                 ch <- err.Error()
  46.                 return
  47.             }
  48.  
  49.             mutex.Lock()
  50.             cached_urls_data[url] = CachedData{body, urls}
  51.             mutex.Unlock()
  52.         } else {
  53.             urls = cached_data.urls
  54.             body = cached_data.body
  55.         }
  56.  
  57.         ch <- fmt.Sprintf("found: %s %q", url, body)
  58.  
  59.         results := make([]chan string, len(urls))
  60.         for i, u := range urls {
  61.             results[i] = make(chan string)
  62.             go walk(u, depth-1, results[i])
  63.         }
  64.         for _, channel := range results {
  65.             for res := range channel {
  66.                 ch <- res
  67.             }
  68.         }
  69.     }
  70.  
  71.     ch := make(chan string)
  72.     go walk(url, depth, ch)
  73.     for found := range ch {
  74.         fmt.Println(found)
  75.     }
  76. }
  77.  
  78. func main() {
  79.     Crawl("https://golang.org/", 4, fetcher)
  80. }
  81.  
  82. // fakeFetcher is Fetcher that returns canned results.
  83. type fakeFetcher map[string]*fakeResult
  84.  
  85. type fakeResult struct {
  86.     body string
  87.     urls []string
  88. }
  89.  
  90. func (f fakeFetcher) Fetch(url string) (string, []string, error) {
  91.     if res, ok := f[url]; ok {
  92.         return res.body, res.urls, nil
  93.     }
  94.     return "", nil, fmt.Errorf("not found: %s", url)
  95. }
  96.  
  97. // fetcher is a populated fakeFetcher.
  98. var fetcher = fakeFetcher{
  99.     "https://golang.org/": &fakeResult{
  100.         "The Go Programming Language",
  101.         []string{
  102.             "https://golang.org/pkg/",
  103.             "https://golang.org/cmd/",
  104.         },
  105.     },
  106.     "https://golang.org/pkg/": &fakeResult{
  107.         "Packages",
  108.         []string{
  109.             "https://golang.org/",
  110.             "https://golang.org/cmd/",
  111.             "https://golang.org/pkg/fmt/",
  112.             "https://golang.org/pkg/os/",
  113.         },
  114.     },
  115.     "https://golang.org/pkg/fmt/": &fakeResult{
  116.         "Package fmt",
  117.         []string{
  118.             "https://golang.org/",
  119.             "https://golang.org/pkg/",
  120.         },
  121.     },
  122.     "https://golang.org/pkg/os/": &fakeResult{
  123.         "Package os",
  124.         []string{
  125.             "https://golang.org/",
  126.             "https://golang.org/pkg/",
  127.         },
  128.     },
  129. }
  130.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement