// Version 1 – workers push failed URLs back onto the shared input channel.
package main

import (
	"bytes"
	"io/ioutil"
	"net/http"
)

type SiteData struct {
	// ...
}

func downloadURL(url string) (body []byte, status int) {
	resp, err := http.Get(url)
	if err != nil {
		return
	}
	status = resp.StatusCode
	defer resp.Body.Close()
	body, err = ioutil.ReadAll(resp.Body)
	body = bytes.Trim(body, "\x00")
	return
}

func processSiteData(resp []byte) SiteData {
	// ...
}

func worker(input chan string, output chan SiteData) {
	// wait on the channel for links to process
	for url := range input {
		// fetch the http response and status code
		resp, status := downloadURL(url)
		if resp != nil && status == 200 {
			// the link was fetched without errors:
			// process the data and send it back
			output <- processSiteData(resp)
		} else {
			// otherwise send the url back to be processed once more;
			// every worker both sends to and receives from input,
			// which is what makes this version deadlock-prone
			input <- url
		}
	}
}

func crawl(urlList []string) {
	numWorkers := 4
	input := make(chan string)
	output := make(chan SiteData)
	// spawn workers
	for i := 0; i < numWorkers; i++ {
		go worker(input, output)
	}
	// enqueue urls
	go func() {
		for _, url := range urlList {
			input <- url
		}
	}()
	// wait for the results
	for {
		select {
		case data := <-output:
			saveToDB(data)
		}
	}
}

func main() {
	urlList := loadLinksFromDB()
	crawl(urlList)
}
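
Why version 1 can lock up: every worker both receives from and sends to the same unbuffered input channel, so once all workers are blocked on input <- url there is no goroutine left to receive, and the dispatcher's own send blocks as well. The toy program below is a minimal sketch of my own (echoWorker is a hypothetical name, not from the paste) that reproduces the stuck state deterministically by making every item take the retry path:

package main

// echoWorker re-enqueues every item on the same unbuffered channel it
// ranges over, mirroring the retry path in the worker above.
func echoWorker(in chan int) {
	for v := range in {
		in <- v // "retry": blocks until some other goroutine receives
	}
}

func main() {
	in := make(chan int)
	for i := 0; i < 4; i++ {
		go echoWorker(in)
	}
	// After four completed sends each worker holds one item and is
	// blocked re-sending it; the fifth send finds no receiver, and the
	// runtime aborts with "all goroutines are asleep - deadlock!".
	for i := 0; i < 5; i++ {
		in <- i
	}
}

With four workers, each can hold at most one in-flight item, so after four sends every worker is stuck in the send and none is back at the receive; the same saturation can happen to the crawler whenever enough fetches fail at once.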
// Version 2 – workers report failures on a dedicated channel instead of
// writing back into input. SiteData, downloadURL, processSiteData,
// saveToDB and loadLinksFromDB are the same as in version 1.
package main

func worker(input chan string, output chan SiteData, failed chan string) {
	for url := range input {
		// fetch the http response and status code, as before
		resp, status := downloadURL(url)
		if resp != nil && status == 200 {
			output <- processSiteData(resp)
		} else {
			failed <- url
		}
	}
}

func crawl(urlList []string) {
	numWorkers := 4
	input := make(chan string)
	failed := make(chan string)
	output := make(chan SiteData)
	// spawn workers
	for i := 0; i < numWorkers; i++ {
		go worker(input, output, failed)
	}
	// Dispatch URLs to the workers, also receive failures from them.
	go func() {
		for {
			// a nil channel disables the send case while the list is
			// empty; without this guard, urlList[0] panics once every
			// URL has been handed out
			var send chan string
			var next string
			if len(urlList) > 0 {
				send = input
				next = urlList[0]
			}
			select {
			case send <- next:
				urlList = urlList[1:]
			case url := <-failed:
				urlList = append(urlList, url)
			}
		}
	}()
	// wait for the results
	for {
		data := <-output
		saveToDB(data)
	}
}

func main() {
	urlList := loadLinksFromDB()
	crawl(urlList)
}
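
One thing version 2 still leaves open is termination: crawl loops on <-output forever. Below is a sketch of one way to stop cleanly (my own assumption, not part of the paste; crawlAndStop and pending are hypothetical names): fold dispatch and collection into a single loop and count the URLs that have not yet produced a saved result.

// Assumes the worker, SiteData and saveToDB definitions from version 2.
func crawlAndStop(urlList []string) {
	numWorkers := 4
	input := make(chan string)
	failed := make(chan string)
	output := make(chan SiteData)
	for i := 0; i < numWorkers; i++ {
		go worker(input, output, failed)
	}
	pending := len(urlList) // URLs that have not been saved yet
	for pending > 0 {
		var send chan string // nil unless there is a URL to hand out
		var next string
		if len(urlList) > 0 {
			send = input
			next = urlList[0]
		}
		select {
		case send <- next:
			urlList = urlList[1:]
		case url := <-failed:
			urlList = append(urlList, url) // retry later; still pending
		case data := <-output:
			saveToDB(data)
			pending--
		}
	}
	close(input) // lets the workers fall out of their range loops
}

This still retries a permanently broken URL forever; in practice a per-URL attempt counter would be needed so such a URL can be dropped and pending decremented.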