Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package main
- import (
- "fmt"
- "sync"
- "net/http"
- "log"
- "regexp"
- "io/ioutil"
- )
// Crawler records the set of URLs visited during a concurrent crawl.
// NOTE(review): because it embeds a sync.Mutex, a Crawler must not be
// copied after first use — methods should take a pointer receiver.
type Crawler struct {
	urls   map[string]bool // URLs already fetched (or claimed); guarded by mux
	mux    sync.Mutex      // protects urls
	umatch *regexp.Regexp  // pattern used to extract candidate URLs from page bodies
}
- func (c Crawler) parse(body string) (urls []string) {
- return c.umatch.FindAllString(body, -1)
- }
- func (c Crawler) fetch(url string) (urls []string) {
- res, err := http.Get(url)
- if err != nil {
- fmt.Println("Error in fetching %s: %s", url, err)
- log.Fatal(err)
- }
- defer res.Body.Close()
- body, err := ioutil.ReadAll(res.Body)
- if err != nil {
- log.Fatal(err)
- }
- urls = c.parse(string(body))
- return
- }
- func (c Crawler) Crawl(url string, depth int) {
- if depth <= 0 {
- return
- }
- c.mux.Lock()
- if c.urls[url] { //Already exists
- c.mux.Unlock()
- return
- }
- c.urls[url] = true
- c.mux.Unlock()
- log.Println("Fetching %s", url)
- fetched := c.fetch(url)
- for _, u := range fetched {
- go c.Crawl(u, depth - 1)
- }
- return
- }
- func main() {
- c := Crawler{urls : map[string]bool{}, umatch : regexp.MustCompile(`(http|ftp|https)://([w-_]+(?:(?:.[w-_]+)+))([w-.,@?^=%&:/~+#]*[w-@?^=%&/~+#])?`)}
- c.Crawl("http://www.yahoo.com", 3)
- for u, _ := range c.urls {
- fmt.Println(u)
- }
- }
- re := regexp.MustCompile("href="(.*?)"")
- subre := regexp.MustCompile(""/[\w]+")
- matchLink := re.FindAllStringSubmatch(string(*data), -1)
Add Comment
Please, Sign In to add comment