Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package main
- import (
- "fmt"
- "log"
- "math"
- "net/http"
- "os"
- "regexp"
- "strconv"
- "strings"
- "github.com/PuerkitoBio/goquery"
- "github.com/tealeg/xlsx"
- )
// Scraper configuration.
const (
	// mainUri is the address of the first results page of the category being
	// scraped. Later pages are requested by appending "p-<n>/" to it.
	mainUri string = "http://www.paginasamarillas.com.ar/buscar/r/motores-electricos-antiexplosivos/"

	// xlsxFileName is the path the generated workbook is saved to.
	xlsxFileName string = "PaginasAmarillas_Categoría_MotoresElectricosAntiexplosivos.xlsx"
)
- func main() {
- finalUri := mainUri
- cn := 1
- var totalPages int
- var file *xlsx.File
- var sheet *xlsx.Sheet
- var row *xlsx.Row
- var cell *xlsx.Cell
- var err error
- file = xlsx.NewFile()
- sheet, err = file.AddSheet("Sheet1")
- if err != nil {
- fmt.Printf(err.Error())
- }
- for {
- res, err := http.Get(finalUri)
- if res.Status == "404 ACT" || (cn > totalPages && totalPages > 0) /*|| cn >= 2 */ {
- break
- }
- if err != nil {
- log.Fatal(err)
- }
- defer res.Body.Close()
- doc, err := goquery.NewDocumentFromReader(res.Body)
- if err != nil {
- log.Fatal(err)
- }
- if cn == 1 {
- totalResults := doc.Find("ul.breadcrumb-links li").Eq(-1).Text()
- re := regexp.MustCompile("[0-9]+")
- totalResultsInt, err := strconv.Atoi(re.FindString(totalResults))
- if err != nil {
- fmt.Println(err)
- os.Exit(2)
- }
- d := float64(totalResultsInt) / float64(25)
- totalPages = int(math.Ceil(d))
- fmt.Println(totalPages)
- }
- doc.Find(".col-center ul.businesses li.business").Each(func(i int, s *goquery.Selection) {
- row = sheet.AddRow()
- itemUri, chkDataHrefBool := s.Attr("data-href")
- name := strings.TrimSpace(s.Find("h2.business-name").Text())
- description := strings.TrimSpace(s.Find("div[itemprop='description']").Text())
- street := strings.TrimSpace(s.Find("p.business-address span").Eq(0).Text())
- address := strings.TrimSpace(s.Find("p.business-address span").Eq(1).Text())
- web := strings.TrimSpace(s.Find("a.business-web").Text())
- mainPhone := strings.TrimSpace(s.Find("div[itemprop='telephone']").Text())
- /*logo, chkDataLogoBool := s.Find("div[itemprop='logo'] .logoad").Attr("src")
- if chkDataLogoBool != false {
- logo = strings.TrimSpace(logo)
- fmt.Println(logo)
- }*/
- name = trimUnnecessaryWhiteSpaces(name)
- addCellWithValue(name, row, true)
- description = trimUnnecessaryWhiteSpaces(description)
- addCellWithValue(description, row, true)
- street = trimUnnecessaryWhiteSpaces(street)
- addCellWithValue(street, row, true)
- address = trimUnnecessaryWhiteSpaces(address)
- addCellWithValue(address, row, true)
- web = trimUnnecessaryWhiteSpaces(web)
- addCellWithValue(web, row, true)
- cell = row.AddCell()
- if chkDataHrefBool != false {
- itemUri = strings.TrimSpace(itemUri)
- fmt.Println(itemUri)
- cell.Value = itemUri
- } else {
- cell.Value = " - "
- }
- mainPhone = trimUnnecessaryWhiteSpaces(mainPhone)
- addCellWithValue(mainPhone, row, true)
- s.Find("a[itemprop='telephone']").Each(func(i int, ss *goquery.Selection) {
- morePhones := trimUnnecessaryWhiteSpaces(ss.Text())
- addCellWithValue(morePhones, row, true)
- })
- fmt.Println("**** ")
- })
- cn = cn + 1
- finalUri = mainUri + "p-" + strconv.Itoa(cn) + "/"
- fmt.Println(" ----------------------------- ")
- }
- err = file.Save(xlsxFileName)
- if err != nil {
- fmt.Printf(err.Error())
- }
- }
// Patterns used by trimUnnecessaryWhiteSpaces. They are compiled once at
// package initialisation instead of on every call.
var (
	// leadingTrailingSpaceRE matches a run of whitespace (regexp \s or any
	// Unicode space separator) at the very start or very end of the input.
	leadingTrailingSpaceRE = regexp.MustCompile(`^[\s\p{Zs}]+|[\s\p{Zs}]+$`)
	// innerSpaceRunRE matches two or more consecutive whitespace characters.
	innerSpaceRunRE = regexp.MustCompile(`[\s\p{Zs}]{2,}`)
)

// trimUnnecessaryWhiteSpaces normalises the whitespace in str.
//
// Leading and trailing whitespace is removed, every run of two or more
// whitespace characters is replaced by a single ASCII space, and the result
// is finally passed through strings.TrimSpace. A lone whitespace character
// between two words (for example a single tab) is left untouched.
func trimUnnecessaryWhiteSpaces(str string) string {
	trimmed := leadingTrailingSpaceRE.ReplaceAllString(str, "")
	collapsed := innerSpaceRunRE.ReplaceAllString(trimmed, " ")
	return strings.TrimSpace(collapsed)
}
- func addCellWithValue(str string, row *xlsx.Row, print bool) {
- cell := row.AddCell()
- if str != "" {
- cell.Value = str
- } else {
- cell.Value = " - "
- }
- if print {
- fmt.Println(str)
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement