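Two revisions of a worker-pool web crawler in Go. In the first, a worker that fails to fetch a URL requeues it on its own input channel; the second moves retries onto a dedicated failed channel drained by a dispatcher goroutine.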
package main

import (
	"bytes"
	"io/ioutil"
	"net/http"
)

type SiteData struct {
	// ...
}

func downloadURL(url string) (body []byte, status int) {
	resp, err := http.Get(url)
	if err != nil {
		return
	}
	defer resp.Body.Close()
	status = resp.StatusCode

	body, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		// return a nil body so the caller treats this as a failure
		return nil, status
	}
	body = bytes.Trim(body, "\x00")
	return
}

func processSiteData(resp []byte) (data SiteData) {
	// ...
	return
}

func saveToDB(data SiteData) {
	// ...
}

func loadLinksFromDB() (urlList []string) {
	// ...
	return
}

func worker(input chan string, output chan SiteData) {
	// wait on the channel for links to process
	for url := range input {
		// fetch the http response and status code
		resp, status := downloadURL(url)

		if resp != nil && status == 200 {
			// the fetch succeeded: process the data
			// and send it back
			output <- processSiteData(resp)
		} else {
			// otherwise requeue the url for another attempt
			input <- url
		}
	}
}

func crawl(urlList []string) {
	numWorkers := 4
	input := make(chan string)
	output := make(chan SiteData)

	// spawn workers
	for i := 0; i < numWorkers; i++ {
		go worker(input, output)
	}

	// enqueue urls
	go func() {
		for _, url := range urlList {
			input <- url
		}
	}()

	// wait for the results
	for {
		data := <-output
		saveToDB(data)
	}
}

func main() {
	urlList := loadLinksFromDB()
	crawl(urlList)
}
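As written, the first version can deadlock: a worker that fails a download turns around and sends the URL back into the unbuffered input channel. If every worker ends up blocked on that requeue send at the same time, no goroutine is left to receive from input, and since crawl only ever drains output, the whole pipeline stalls. The revision below routes failures over a dedicated failed channel instead, and a single dispatcher goroutine multiplexes between handing out URLs and collecting failed ones.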
package main

func worker(input chan string, output chan SiteData, failed chan string) {
	for url := range input {
		resp, status := downloadURL(url)
		if resp != nil && status == 200 {
			output <- processSiteData(resp)
		} else {
			// report the failure instead of requeueing it ourselves
			failed <- url
		}
	}
}

func crawl(urlList []string) {
	numWorkers := 4
	input := make(chan string)
	failed := make(chan string)
	output := make(chan SiteData)

	// spawn workers
	for i := 0; i < numWorkers; i++ {
		go worker(input, output, failed)
	}

	// Dispatch URLs to the workers, also receive failures from them.
	// A send on a nil channel blocks forever and is never chosen by
	// select, so the dispatch case is disabled while the list is
	// empty; this avoids indexing urlList[0] on an empty slice.
	go func() {
		for {
			var dispatch chan string
			var next string
			if len(urlList) > 0 {
				dispatch = input
				next = urlList[0]
			}
			select {
			case dispatch <- next:
				urlList = urlList[1:]
			case url := <-failed:
				urlList = append(urlList, url)
			}
		}
	}()

	// wait for the results
	for {
		data := <-output
		saveToDB(data)
	}
}

func main() {
	urlList := loadLinksFromDB()
	crawl(urlList)
}
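One loose end in both versions: crawl never returns, because nothing tracks when every URL has been handled. A minimal sketch of one way to close the loop, assuming the dispatcher and the result loop are folded into a single select and unresolved URLs are counted (the pending counter and this restructured crawl are illustrative, not part of the original paste):

func crawl(urlList []string) {
	numWorkers := 4
	input := make(chan string)
	failed := make(chan string)
	output := make(chan SiteData)

	for i := 0; i < numWorkers; i++ {
		go worker(input, output, failed)
	}

	// every URL counts as unresolved until a worker reports success
	pending := len(urlList)
	for pending > 0 {
		var dispatch chan string
		var next string
		if len(urlList) > 0 {
			dispatch = input
			next = urlList[0]
		}
		select {
		case dispatch <- next:
			urlList = urlList[1:]
		case url := <-failed:
			// a failed URL stays unresolved; put it back in the list
			urlList = append(urlList, url)
		case data := <-output:
			saveToDB(data)
			pending--
		}
	}
	close(input) // the workers' range loops exit once input is closed
}

This keeps the retry-forever semantics of the original, so a URL that never succeeds still keeps crawl alive; capping retries per URL would be the obvious next refinement.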