Advertisement
Guest User

MUN

a guest
Jul 17th, 2017
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Go 6.93 KB | None | 0 0
  1. package main
  2.  
  3. import (
  4.   "fmt"
  5.   "log"
  6.   "strconv"
  7.   "strings"
  8.   "regexp"
  9.   "os"
  10.   "github.com/PuerkitoBio/goquery"
  11.   "github.com/tealeg/xlsx"
  12. )
  13.  
  14. var xls_file *xlsx.File
  15. var sheet *xlsx.Sheet
  16. var row *xlsx.Row
  17. var cell *xlsx.Cell
  18. var maxPhones = 20
  19. var maxEmails = 20
  20.  
  21. func validateEmail(email string) bool {
  22.  Re := regexp.MustCompile(`^[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}$`)
  23.  return Re.MatchString(email)
  24. }
  25.  
  26. func Scrape() {
  27.  
  28.   emails_processed := make(map[string]string)
  29.  
  30.   phones_temp := map[int]string {
  31.     0: "-",
  32.     1: "-",
  33.     2: "-",
  34.     3: "-",
  35.     4: "-",
  36.     5: "-",
  37.     6: "-",
  38.     7: "-",
  39.     8: "-",
  40.     9: "-",
  41.     10: "-",
  42.     11: "-",
  43.     12: "-",
  44.     13: "-",
  45.     14: "-",
  46.     15: "-",
  47.     16: "-",
  48.     17: "-",
  49.     18: "-",
  50.     19: "-",
  51.   }
  52.  
  53.   emails_temp := map[int]string {
  54.     0: "-",
  55.     1: "-",
  56.     2: "-",
  57.     3: "-",
  58.     4: "-",
  59.     5: "-",
  60.     6: "-",
  61.     7: "-",
  62.     8: "-",
  63.     9: "-",
  64.     10: "-",
  65.     11: "-",
  66.     12: "-",
  67.     13: "-",
  68.     14: "-",
  69.     15: "-",
  70.     16: "-",
  71.     17: "-",
  72.     18: "-",
  73.     19: "-",
  74.   }
  75.  
  76.   doc, err := goquery.NewDocument("http://www.mininterior.gov.ar/municipios/datos-municipio.php")
  77.  
  78.   if err != nil {
  79.     log.Fatal(err)
  80.   }
  81.  
  82.   // open files r and w
  83.   file, err := os.OpenFile("emails.txt", os.O_APPEND|os.O_WRONLY|os.O_CREATE|os.O_TRUNC,0600)
  84.  
  85.   if err != nil {
  86.       panic(err)
  87.   }
  88.   defer file.Close()
  89.  
  90.   xls_file = xlsx.NewFile()
  91.   sheet, err = xls_file.AddSheet("Sheet1")
  92.   if err != nil {
  93.       fmt.Printf(err.Error())
  94.   }
  95.   row = sheet.AddRow()
  96.   cell = row.AddCell()
  97.   cell.Value = "Código"
  98.   cell = row.AddCell()
  99.   cell.Value = "Provincia"
  100.   cell = row.AddCell()
  101.   cell.Value = "Municipio"
  102.   cell = row.AddCell()
  103.   cell.Value = "Sitio Web"
  104.   cell = row.AddCell()
  105.   cell.Value = "Dirección"
  106.   for i:=1; i <= maxPhones; i++ {
  107.     cell = row.AddCell()
  108.     cell.Value = "Teléfono " + strconv.Itoa(i)
  109.   }
  110.   for i:=1; i <= maxEmails; i++ {
  111.     cell = row.AddCell()
  112.     cell.Value = "Email " + strconv.Itoa(i)
  113.   }
  114.  
  115.   doc.Find("#provincia option").Each(func(i int, s *goquery.Selection) {
  116.  
  117.     provPrefix, _ := s.Attr("value")
  118.     provName := s.Text()
  119.  
  120.     if provPrefix != "" {
  121.  
  122.       currItem := 1
  123.  
  124.       for currItem >= 1 {
  125.  
  126.         numToStr := strconv.Itoa(currItem)
  127.  
  128.         if len(numToStr) == 1 {
  129.           numToStr = "00" + numToStr
  130.         } else if len(numToStr) == 2 {
  131.           numToStr = "0" + numToStr
  132.         }
  133.  
  134.         doc, err := goquery.NewDocument("http://www.mininterior.gov.ar/municipios/masinfo.php?municipio=" + provPrefix + numToStr)
  135.  
  136.         if err != nil {
  137.           log.Fatal(err)
  138.         }
  139.  
  140.         muni_name := doc.Find("h1").Eq(1).Text()
  141.         muni_web := doc.Find("table").Eq(2).Find("td").Eq(7).Text()
  142.         muni_addr := doc.Find("table").Eq(2).Find("td").Eq(1).Text()
  143.         phones, _ := doc.Find("table").Eq(2).Find("td").Eq(3).Html()
  144.         emails, _ := doc.Find("table").Eq(2).Find("td").Eq(5).Html()
  145.  
  146.         if muni_name == "" {
  147.  
  148.           currItem = 1
  149.           break
  150.  
  151.         } else {
  152.  
  153.           phones_temp = map[int]string {
  154.             0: "-",
  155.             1: "-",
  156.             2: "-",
  157.             3: "-",
  158.             4: "-",
  159.             5: "-",
  160.             6: "-",
  161.             7: "-",
  162.             8: "-",
  163.             9: "-",
  164.             10: "-",
  165.             11: "-",
  166.             12: "-",
  167.             13: "-",
  168.             14: "-",
  169.             15: "-",
  170.             16: "-",
  171.             17: "-",
  172.             18: "-",
  173.             19: "-",
  174.           }
  175.  
  176.           emails_temp = map[int]string {
  177.             0: "-",
  178.             1: "-",
  179.             2: "-",
  180.             3: "-",
  181.             4: "-",
  182.             5: "-",
  183.             6: "-",
  184.             7: "-",
  185.             8: "-",
  186.             9: "-",
  187.             10: "-",
  188.             11: "-",
  189.             12: "-",
  190.             13: "-",
  191.             14: "-",
  192.             15: "-",
  193.             16: "-",
  194.             17: "-",
  195.             18: "-",
  196.             19: "-",
  197.           }
  198.  
  199.           row = sheet.AddRow()
  200.  
  201.           cell = row.AddCell()
  202.           cell.Value = provPrefix
  203.           cell = row.AddCell()
  204.           cell.Value = provName
  205.           cell = row.AddCell()
  206.           cell.Value = muni_name
  207.           cell = row.AddCell()
  208.           cell.Value = muni_web
  209.           cell = row.AddCell()
  210.           cell.Value = muni_addr
  211.  
  212.           phones_temp_existing := strings.Split(phones, "<br/>")
  213.           emails_temp_existing := strings.Split(emails, "<br/>")
  214.  
  215.           x := 0
  216.  
  217.           for i := range phones_temp_existing {
  218.  
  219.             phones_temp_existing[i] = strings.Replace(phones_temp_existing[i], "<strong>", "", -1)
  220.             phones_temp_existing[i] = strings.Replace(phones_temp_existing[i], "</strong>", "", -1)
  221.             phones_temp_existing[i] = strings.Replace(phones_temp_existing[i], "<p>", "", -1)
  222.             phones_temp_existing[i] = strings.Replace(phones_temp_existing[i], "</p>", "", -1)
  223.             if phones_temp_existing[i] != "" && phones_temp_existing[i] != " " && phones_temp_existing[i] != "-" && phones_temp_existing[i] != "<strong>" && phones_temp_existing[i] != "</strong>" {
  224.               phones_temp[x] = phones_temp_existing[i]
  225.               x++
  226.             }
  227.  
  228.           }
  229.  
  230.           x = 0
  231.  
  232.           for i := range emails_temp_existing {
  233.             emails_temp_existing[i] = strings.Replace(emails_temp_existing[i], "<strong>", "", -1)
  234.             emails_temp_existing[i] = strings.Replace(emails_temp_existing[i], "</strong>", "", -1)
  235.             emails_temp_existing[i] = strings.Replace(emails_temp_existing[i], "<p>", "", -1)
  236.             emails_temp_existing[i] = strings.Replace(emails_temp_existing[i], "</p>", "", -1)
  237.             if emails_temp_existing[i] != "" && emails_temp_existing[i] != " " && emails_temp_existing[i] != "-" && emails_temp_existing[i] != "<strong>" && emails_temp_existing[i] != "</strong>" && validateEmail(emails_temp_existing[i]) {
  238.               emails_temp[x] = emails_temp_existing[i]
  239.               x++
  240.             }
  241.           }
  242.  
  243.           for i := range phones_temp {
  244.  
  245.             cell = row.AddCell()
  246.             cell.Value = phones_temp[i]
  247.  
  248.           }
  249.  
  250.           for i := range emails_temp {
  251.  
  252.             _, ok := emails_processed[emails_temp[i]]
  253.  
  254.             if !ok && emails_temp[i] != "-" {
  255.  
  256.               fmt.Println(provPrefix + numToStr + " => " + emails_temp[i])
  257.  
  258.               emails_processed[emails_temp[i]] = emails_temp[i]
  259.  
  260.               if _, err = file.WriteString(muni_name + "||" + emails_temp[i] + "\r\n"); err != nil {
  261.                panic(err)
  262.               }
  263.  
  264.             }
  265.  
  266.             cell = row.AddCell()
  267.             cell.Value = emails_temp[i]
  268.  
  269.           }
  270.  
  271.         }
  272.  
  273.         currItem = currItem + 1
  274.  
  275.       }
  276.     }
  277.   })
  278.  
  279.   err = xls_file.Save("municipios.xlsx")
  280.   if err != nil {
  281.       fmt.Printf(err.Error())
  282.   }
  283.  
  284. }
  285.  
  286. func main() {
  287.   Scrape()
  288. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement