package main

import (
    "fmt"
    "io/ioutil"
    "log"
    "net/http"
    "regexp"
    "sync"
)

// Crawler keeps the set of visited URLs and the pattern used to extract
// links from fetched page bodies.
type Crawler struct {
    urls   map[string]bool // URLs already crawled
    mux    sync.Mutex      // guards urls
    wg     sync.WaitGroup  // tracks goroutines spawned by Crawl
    umatch *regexp.Regexp  // matches absolute URLs inside a page body
}

// parse returns every substring of body that matches the URL pattern.
// Pointer receivers are used throughout so the Mutex/WaitGroup are never copied.
func (c *Crawler) parse(body string) []string {
    return c.umatch.FindAllString(body, -1)
}

// fetch downloads url and returns the URLs found in its body.
// Errors are logged and skipped rather than aborting the whole crawl.
func (c *Crawler) fetch(url string) []string {
    res, err := http.Get(url)
    if err != nil {
        log.Printf("error fetching %s: %v", url, err)
        return nil
    }
    defer res.Body.Close()
    body, err := ioutil.ReadAll(res.Body)
    if err != nil {
        log.Printf("error reading %s: %v", url, err)
        return nil
    }
    return c.parse(string(body))
}

// Crawl fetches url, records it as visited, and recursively crawls the
// links it finds, down to the given depth.
func (c *Crawler) Crawl(url string, depth int) {
    if depth <= 0 {
        return
    }

    c.mux.Lock()
    if c.urls[url] { // already visited
        c.mux.Unlock()
        return
    }
    c.urls[url] = true
    c.mux.Unlock()

    log.Printf("fetching %s", url)
    for _, u := range c.fetch(url) {
        c.wg.Add(1)
        go func(u string) {
            defer c.wg.Done()
            c.Crawl(u, depth-1)
        }(u)
    }
}

func main() {
    c := &Crawler{
        urls:   map[string]bool{},
        umatch: regexp.MustCompile(`(http|ftp|https)://([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-.,@?^=%&:/~+#]*[\w\-@?^=%&/~+#])?`),
    }
    c.Crawl("http://www.yahoo.com", 3)
    c.wg.Wait() // let all spawned goroutines finish before reading the results
    for u := range c.urls {
        fmt.Println(u)
    }
}
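
// --- main_test.go: a minimal sketch, not part of the original paste ---
// Assuming the crawler above lives in package main, this exercises it against
// a local httptest server instead of yahoo.com. The test name, the served page
// body, and the depth are my own choices for illustration.
package main

import (
    "fmt"
    "net/http"
    "net/http/httptest"
    "regexp"
    "testing"
)

func TestCrawlLocalServer(t *testing.T) {
    // Serve a tiny page that links back to the server itself, so the crawler
    // has exactly one absolute URL to discover.
    srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        fmt.Fprintf(w, "see http://%s/next for more", r.Host)
    }))
    defer srv.Close()

    c := &Crawler{
        urls:   map[string]bool{},
        umatch: regexp.MustCompile(`(http|ftp|https)://([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-.,@?^=%&:/~+#]*[\w\-@?^=%&/~+#])?`),
    }
    c.Crawl(srv.URL, 2)
    c.wg.Wait() // all spawned goroutines finish before we inspect the results

    if !c.urls[srv.URL] {
        t.Errorf("expected %s to be marked as visited", srv.URL)
    }
    if !c.urls[srv.URL+"/next"] {
        t.Errorf("expected %s/next to be discovered from the page body", srv.URL)
    }
}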
  65.  
  66. re := regexp.MustCompile("href="(.*?)"")
  67. subre := regexp.MustCompile(""/[\w]+")
  68.  
  69. matchLink := re.FindAllStringSubmatch(string(*data), -1)
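
// The fragment above seems to be aiming at pulling links out of href
// attributes instead of matching bare URLs in the page text. Below is a
// minimal sketch of that idea that could stand in for the body of
// Crawler.parse; the hrefParse name and the absolute-URL filter are my own
// additions, and resolving relative links (what the subre pattern hints at)
// is left out.
func hrefParse(body string) []string {
    re := regexp.MustCompile(`href="(.*?)"`)
    abs := regexp.MustCompile(`^(?:https?|ftp)://`)

    var urls []string
    for _, m := range re.FindAllStringSubmatch(body, -1) {
        if abs.MatchString(m[1]) { // m[1] is the captured attribute value
            urls = append(urls, m[1])
        }
    }
    return urls
}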