Guest User

Untitled

a guest
Jan 31st, 2021
1,300
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Go 15.91 KB | None | 0 0
  1. package scrapers
  2.  
  3. import (
  4.     "bytes"
  5.     "fmt"
  6.     "html"
  7.     "math/rand"
  8.     "net/url"
  9.     "regexp"
  10.     "strconv"
  11.     "strings"
  12.     "time"
  13.  
  14.     "github.com/dgrr/cookiejar"
  15.     "github.com/valyala/fasthttp"
  16.  
  17.     "primeprice.com/dal"
  18.     "primeprice.com/pkg/fasthtml"
  19.     "primeprice.com/pkg/logger"
  20.     "primeprice.com/pkg/proxy"
  21.  
  22.     "github.com/hashicorp/go-retryablehttp"
  23. )
  24.  
  // maxRetries is the number of download attempts made for a single page.
  const maxRetries = 4

  // waitMin is the lower bound handed to the retry backoff calculation.
  const waitMin = time.Second

  // waitMax is the upper bound handed to the retry backoff calculation.
  const waitMax = 30 * time.Second
  30.  
  // priceCutter extracts price-like tokens from text: an optional minus sign,
  // one digit, then any run of digits and '.'/',' separators.
  //
  // The previous pattern escaped a closing bracket by accident, which folded
  // two character classes into one and let ']', '?', '[', '{' and '}' count as
  // part of a price.
  var priceCutter = regexp.MustCompile(`-?\d[\d.,]*`)
  32.  
  33. var cj = cookiejar.AcquireCookieJar()
  34.  
  35. var cookieContainer = make(map[string]string)
  36.  
  37. var strPost = []byte("POST")
  38.  
  39. func processProduct(product string, baseURL string, isWareHouse bool) *dal.Product {
  40.  
  41.     defer func() {
  42.         if err := recover(); err != nil {
  43.             fmt.Println(err)
  44.         }
  45.     }()
  46.  
  47.     fp := float64(0)
  48.     price := ""
  49.  
  50.     asin := fasthtml.GetAttr(product, "data-asin")
  51.     if asin == "" {
  52.         return nil
  53.     }
  54.     titleNode := fasthtml.GetTagWithParams(product, "class", "a-size-medium a-color-base a-text-normal")
  55.     titleTxt := fasthtml.GetInner(titleNode)
  56.     logger.Println("Title:", titleTxt)
  57.     title := html.UnescapeString(titleTxt)
  58.     if title == "" {
  59.         logger.Println("return with title nil")
  60.         return nil
  61.     }
  62.  
  63.     if isWareHouse {
  64.         // price = product.Find("div.a-spacing-top-mini span.a-color-base").First().Text()
  65.         price = fasthtml.GetInner(fasthtml.GetTagWithParams(product, "class", "a-color-base"))
  66.     } else {
  67.         if !strings.Contains(product, `aria-label="Amazon Prime"`) {
  68.             logger.Println("not prime")
  69.             return nil
  70.         }
  71.         price = fasthtml.GetInner(fasthtml.GetTagWithParams(product, "class", "a-price-whole"))
  72.     }
  73.  
  74.     submatchall := priceCutter.FindAllString(price, -1)
  75.     fp = parsePrice(strings.Join(submatchall, ""))
  76.     // href := product.Find("a.a-link-normal.a-text-normal").First().Attr("href")
  77.     href := fasthtml.GetAttr(fasthtml.GetTagWithParams(product, "class", "a-link-normal a-text-normal"), "href")
  78.  
  79.     pd := &dal.Product{
  80.         Asin:  asin,
  81.         Title: title,
  82.         Price: fp,
  83.         URL:   baseURL + href,
  84.     }
  85.  
  86.     logger.Println(title[:7], fp, pd.Asin, href[:10])
  87.  
  88.     return pd
  89. }
  90.  
  // parsePrice converts a price string to a float64.
  //
  // Every '.' and ',' is removed and the remaining text is read as a number of
  // hundredths, so "1,299.99" yields 1299.99 and "12" yields 0.12: the input is
  // taken to end in exactly two decimal digits.
  //
  // Text that cannot be parsed, including values too large for a float64,
  // yields 0 rather than an infinite price.
  func parsePrice(s string) float64 {
      digits := strings.ReplaceAll(s, ".", "")
      digits = strings.ReplaceAll(digits, ",", "")

      hundredths, err := strconv.ParseFloat(digits, 64)
      if err != nil {
          return 0
      }
      return hundredths / 100
  }
  97.  
  // GetStringInBetweenTwoString returns the text of str that lies between the
  // first occurrence of startS and the first occurrence of endS after it.
  // The result is empty when either marker is missing.
  func GetStringInBetweenTwoString(str string, startS string, endS string) (result string) {
      begin := strings.Index(str, startS)
      if begin < 0 {
          return ""
      }

      tail := str[begin+len(startS):]
      if stop := strings.Index(tail, endS); stop >= 0 {
          result = tail[:stop]
      }
      return result
  }
  111.  
  112. func ChangeZipCode(purl string, proxies []dal.Proxy, zipCode string) {
  113.     cookieContainer = make(map[string]string)
  114.     u, err := url.Parse(purl)
  115.     if err != nil {
  116.         logger.Println(err)
  117.     } else {
  118.         logger.Println(u)
  119.     }
  120.     formData := "locationType=LOCATION_INPUT&zipCode=" + zipCode + "&storeContext=generic&deviceType=web&pageType=Gateway&actionSource=glow&almBrandId=undefined"
  121.     domainFix := "com"
  122.     searchTxt := "\"Tu dirección de envío actual es:\""
  123.     if strings.Contains(purl, "www.amazon.es") {
  124.         domainFix = "es"
  125.         searchTxt = "\"Tu dirección de envío actual es:\""
  126.     } else if strings.Contains(purl, "www.amazon.de") {
  127.         domainFix = "de"
  128.         searchTxt = "\"Sie kaufen gerade ein für:\""
  129.     } else if strings.Contains(purl, "www.amazon.fr") {
  130.         domainFix = "fr"
  131.         searchTxt = "\"Votre lieu de livraison est désormais:\""
  132.     } else if strings.Contains(purl, "www.amazon.co.uk") {
  133.         domainFix = "co.uk"
  134.         searchTxt = "\"You're now shopping for delivery to:\""
  135.     } else if strings.Contains(purl, "www.amazon.it") {
  136.         domainFix = "it"
  137.         searchTxt = "\"L'indirizzo di consegna selezionato è:\""
  138.     } else {
  139.         domainFix = "com"
  140.         searchTxt = "\"You're now shopping for delivery to:\""
  141.     }
  142.  
  143.     homePage := ""
  144.     homePage, err = getRequest("https://www.amazon."+domainFix, proxies)
  145.  
  146.     //uidCookie := strings.TrimLeft(strings.TrimRight(homePage, "\" })</script>"), "/ah/ajax/counter?ctr=desktop_ajax_atf")
  147.     uidCookie := GetStringInBetweenTwoString(homePage, "/ah/ajax/counter?ctr=desktop_ajax_atf", "\" })</script>")
  148.  
  149.     uidCookieUrl := "https://www.amazon." + domainFix + "/ah/ajax/counter?ctr=desktop_ajax_atf" + uidCookie
  150.     postRequest(uidCookieUrl, proxies, "")
  151.     tokenPage := ""
  152.     tokenPage, err = getRequest("https://www.amazon."+domainFix+"/gp/glow/get-address-selections.html?deviceType=desktop&pageType=Gateway&storeContext=NoStoreName", proxies)
  153.     //crosToken := strings.TrimLeft(strings.TrimRight(tokenPage, "\", IDs:{\"ADDRESS_LIST\":\"GLUXAddressList\""), "\"You're now shopping for delivery to:\", CSRF_TOKEN : \"")
  154.     crosToken := GetStringInBetweenTwoString(tokenPage, searchTxt+", CSRF_TOKEN : \"", "\", IDs:{\"ADDRESS_LIST\":\"GLUXAddressList\"")
  155.     changeZipCodePostRequest("https://www.amazon."+domainFix+"/gp/delivery/ajax/address-change.html", proxies, formData, crosToken)
  156.     postRequest("https://www.amazon."+domainFix+"/gp/glow/get-location-label.html", proxies, "storeContext=hpc&pageType=Landing")
  157.  
  158. }
  159.  
  160. // GetAmazonPrimeProducts get all prime products from search url
  161. func GetAmazonPrimeProducts(purl string, proxies []dal.Proxy, postal string) ([]*dal.Product, error) {
  162.     var ps []*dal.Product
  163.     curPage := 1
  164.     baseURL := ""
  165.     isWareHouse := false
  166.     u, err := url.Parse(purl)
  167.     if err != nil {
  168.         logger.Println(err)
  169.     } else {
  170.         baseURL = u.Host
  171.     }
  172.  
  173.     ChangeZipCode(purl, proxies, postal)
  174.  
  175.     for {
  176.         tt := time.Now()
  177.  
  178.         surl := fmt.Sprintf("%s&page=%d", purl, curPage)
  179.  
  180.         content, err := retryLoadDocument(surl, proxies)
  181.         if strings.Contains(content, "To discuss automated access to Amazon data") {
  182.             logger.Println("Scrapping Detected.Please change cookie")
  183.             ChangeZipCode(purl, proxies, postal)
  184.         }
  185.         if err != nil {
  186.             logger.Println("page:", curPage, err)
  187.             break
  188.         }
  189.  
  190.         logger.Println("request done", time.Since(tt).Seconds(), "sec")
  191.         tt = time.Now()
  192.  
  193.         if strings.Contains(fasthtml.GetTagWithParams(content, "selected", "selected"), "Amazon Warehouse") {
  194.             isWareHouse = true
  195.         }
  196.  
  197.         // Cut upper js part for collision safety
  198.         searchIdx := strings.Index(content, "<div id=\"search\">")
  199.         if searchIdx == -1 {
  200.             logger.Println("page:", curPage, "search block not found")
  201.             break
  202.         }
  203.         content = content[searchIdx:]
  204.         logger.Println("search id div found")
  205.  
  206.         items := fasthtml.FindNodes(content, "data-component-type=\"s-search-result\"")
  207.         logger.Println("find done", isWareHouse, len(items), time.Since(tt).Seconds(), "sec")
  208.         tt = time.Now()
  209.  
  210.         if len(items) == 0 {
  211.             break
  212.         }
  213.  
  214.         for _, it := range items {
  215.             pd := processProduct(it, baseURL, isWareHouse)
  216.             if pd != nil {
  217.                 ps = append(ps, pd)
  218.             }
  219.         }
  220.  
  221.         logger.Println("process items done", time.Since(tt).Seconds(), "sec")
  222.  
  223.         if len(ps) > 200 {
  224.             break
  225.         }
  226.         curPage++
  227.     }
  228.  
  229.     return ps, nil
  230. }
  231.  
  232. func retryLoadDocument(surl string, proxies []dal.Proxy) (string, error) {
  233.     var lastErr error
  234.     for n := 0; n < maxRetries; n++ {
  235.         document, err, shouldRetry := loadDocument(surl, proxies)
  236.         if !shouldRetry {
  237.             return document, err
  238.         }
  239.  
  240.         lastErr = err
  241.         backoff := retryablehttp.DefaultBackoff(waitMin, waitMax, n, nil)
  242.         logger.Println(err, "Retrying in", backoff)
  243.         time.Sleep(backoff)
  244.     }
  245.     return "", lastErr
  246. }
  247.  
  248. func getRequest(surl string, proxies []dal.Proxy) (string, error) {
  249.     logger.Println("processing", surl)
  250.     var client fasthttp.Client
  251.     if len(proxies) > 0 {
  252.         px := getRandomProxy(proxies)
  253.         client = fasthttp.Client{
  254.             Dial: proxy.FastHTTPProxyDialer(px),
  255.         }
  256.  
  257.         logger.Println("with proxy", px)
  258.     }
  259.  
  260.     defer client.CloseIdleConnections()
  261.  
  262.     req := fasthttp.AcquireRequest()
  263.     resp := fasthttp.AcquireResponse()
  264.     defer fasthttp.ReleaseRequest(req)
  265.     defer fasthttp.ReleaseResponse(resp)
  266.     // Acquire cookie jar
  267.     u, errUrl := url.Parse(surl)
  268.     if errUrl == nil {
  269.         cj = cookiejar.AcquireCookieJar()
  270.         for key, value := range cookieContainer {
  271.             if strings.Contains(key, u.Host) {
  272.                 key = strings.Replace(key, u.Host, "", -1)
  273.                 valueArry := strings.Split(value, "=")
  274.                 value = strings.Split(valueArry[1], ";")[0]
  275.                 cj.Set(key, value)
  276.             }
  277.         }
  278.     }
  279.     cj.FillRequest(req)
  280.  
  281.     req.SetRequestURI(surl)
  282.  
  283.     req.Header.Set("Content-Type", "text/html;charset=UTF-8")
  284.     req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
  285.     req.Header.Set("Accept-Encoding", "gzip")
  286.     req.Header.Set("user-agent", getRandomUserAgent())
  287.     req.Header.Set("Upgrade-Insecure-Requests", "1")
  288.     req.Header.Set("Connection", "keep-alive")
  289.     err := client.DoTimeout(req, resp, 30*time.Second)
  290.     if err != nil {
  291.         return "", err
  292.     }
  293.  
  294.     resp.Header.VisitAllCookie(func(key, value []byte) {
  295.         c := fasthttp.AcquireCookie()
  296.         defer fasthttp.ReleaseCookie(c)
  297.  
  298.         c.ParseBytes(value)
  299.         var emptyContent = string(key) + "=-;"
  300.         if !strings.Contains(string(value), emptyContent) {
  301.             var middle = strings.Replace(string(value), "Domain=.amazon", "domain=.www.amazon", -1)
  302.             middle = strings.Replace(middle, "domain=.amazon", "domain=.www.amazon", -1)
  303.             cookieContainer[string(key)+u.Host] = middle
  304.         }
  305.     })
  306.     contentEncoding := resp.Header.Peek("Content-Encoding")
  307.     var body []byte
  308.     if bytes.EqualFold(contentEncoding, []byte("gzip")) {
  309.         fmt.Println("Unzipping...")
  310.         body, _ = resp.BodyGunzip()
  311.     } else {
  312.         body = resp.Body()
  313.     }
  314.     content := string(body)
  315.     return content, nil
  316. }
  317.  
  318. func changeZipCodePostRequest(surl string, proxies []dal.Proxy, formData string, token string) (string, error, bool) {
  319.     logger.Println("processing", surl)
  320.     var client fasthttp.Client
  321.     if len(proxies) > 0 {
  322.         px := getRandomProxy(proxies)
  323.         client = fasthttp.Client{
  324.             Dial: proxy.FastHTTPProxyDialer(px),
  325.         }
  326.  
  327.         logger.Println("with proxy", px)
  328.     }
  329.  
  330.     defer client.CloseIdleConnections()
  331.  
  332.     req := fasthttp.AcquireRequest()
  333.     resp := fasthttp.AcquireResponse()
  334.     defer fasthttp.ReleaseRequest(req)
  335.     defer fasthttp.ReleaseResponse(resp)
  336.     // Acquire cookie jar
  337.     u, errUrl := url.Parse(surl)
  338.     if errUrl == nil {
  339.         cj = cookiejar.AcquireCookieJar()
  340.         for key, value := range cookieContainer {
  341.             if strings.Contains(key, u.Host) {
  342.                 key = strings.Replace(key, u.Host, "", -1)
  343.                 valueArry := strings.Split(value, "=")
  344.                 value = strings.Split(valueArry[1], ";")[0]
  345.                 cj.Set(key, value)
  346.             }
  347.         }
  348.     }
  349.     cj.FillRequest(req)
  350.  
  351.     req.SetRequestURI(surl)
  352.     req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
  353.     req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
  354.     req.Header.Set("User-Agent", getRandomUserAgent())
  355.     req.Header.Set("Accept-Encoding", "gzip")
  356.     req.Header.Set("Upgrade-Insecure-Requests", "1")
  357.     req.Header.Set("anti-csrftoken-a2z", token)
  358.     req.Header.Set("Connection", "keep-alive")
  359.     req.Header.SetMethodBytes(strPost)
  360.     req.SetBodyString(formData)
  361.     err := client.DoTimeout(req, resp, 30*time.Second)
  362.     if err != nil {
  363.         return "", err, true
  364.     }
  365.     resp.Header.VisitAllCookie(func(key, value []byte) {
  366.         c := fasthttp.AcquireCookie()
  367.         defer fasthttp.ReleaseCookie(c)
  368.  
  369.         c.ParseBytes(value)
  370.         var emptyContent = string(key) + "=-;"
  371.         if !strings.Contains(string(value), emptyContent) {
  372.             var middle = strings.Replace(string(value), "Domain=.amazon", "domain=.www.amazon", -1)
  373.             middle = strings.Replace(middle, "domain=.amazon", "domain=.www.amazon", -1)
  374.             cookieContainer[string(key)+u.Host] = middle
  375.         }
  376.     })
  377.     contentEncoding := resp.Header.Peek("Content-Encoding")
  378.     var body []byte
  379.     if bytes.EqualFold(contentEncoding, []byte("gzip")) {
  380.         fmt.Println("Unzipping...")
  381.         body, _ = resp.BodyGunzip()
  382.     } else {
  383.         body = resp.Body()
  384.     }
  385.     content := string(body)
  386.     return content, nil, false
  387. }
  388.  
  389. func postRequest(surl string, proxies []dal.Proxy, formData string) (string, error, bool) {
  390.     logger.Println("processing", surl)
  391.     var client fasthttp.Client
  392.     if len(proxies) > 0 {
  393.         px := getRandomProxy(proxies)
  394.         client = fasthttp.Client{
  395.             Dial: proxy.FastHTTPProxyDialer(px),
  396.         }
  397.  
  398.         logger.Println("with proxy", px)
  399.     }
  400.  
  401.     defer client.CloseIdleConnections()
  402.  
  403.     req := fasthttp.AcquireRequest()
  404.     resp := fasthttp.AcquireResponse()
  405.     defer fasthttp.ReleaseRequest(req)
  406.     defer fasthttp.ReleaseResponse(resp)
  407.     // Acquire cookie jar
  408.     u, errUrl := url.Parse(surl)
  409.     if errUrl == nil {
  410.         cj = cookiejar.AcquireCookieJar()
  411.         for key, value := range cookieContainer {
  412.             if strings.Contains(key, u.Host) {
  413.                 key = strings.Replace(key, u.Host, "", -1)
  414.                 valueArry := strings.Split(value, "=")
  415.                 value = strings.Split(valueArry[1], ";")[0]
  416.                 cj.Set(key, value)
  417.             }
  418.         }
  419.     }
  420.     cj.FillRequest(req)
  421.  
  422.     req.SetRequestURI(surl)
  423.     req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
  424.     req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
  425.     req.Header.Set("User-Agent", getRandomUserAgent())
  426.     req.Header.Set("Accept-Encoding", "gzip")
  427.     req.Header.Set("Upgrade-Insecure-Requests", "1")
  428.     req.Header.Set("Connection", "keep-alive")
  429.     req.Header.SetMethodBytes(strPost)
  430.     req.SetBodyString(formData)
  431.     err := client.DoTimeout(req, resp, 30*time.Second)
  432.     if err != nil {
  433.         return "", err, true
  434.     }
  435.     resp.Header.VisitAllCookie(func(key, value []byte) {
  436.         c := fasthttp.AcquireCookie()
  437.         defer fasthttp.ReleaseCookie(c)
  438.  
  439.         c.ParseBytes(value)
  440.         var emptyContent = string(key) + "=-;"
  441.         if !strings.Contains(string(value), emptyContent) {
  442.             var middle = strings.Replace(string(value), "Domain=.amazon", "domain=.www.amazon", -1)
  443.             middle = strings.Replace(middle, "domain=.amazon", "domain=.www.amazon", -1)
  444.             cookieContainer[string(key)+u.Host] = middle
  445.         }
  446.     })
  447.     contentEncoding := resp.Header.Peek("Content-Encoding")
  448.     var body []byte
  449.     if bytes.EqualFold(contentEncoding, []byte("gzip")) {
  450.         fmt.Println("Unzipping...")
  451.         body, _ = resp.BodyGunzip()
  452.     } else {
  453.         body = resp.Body()
  454.     }
  455.     content := string(body)
  456.     return content, nil, false
  457. }
  458.  
  459. func loadDocument(surl string, proxies []dal.Proxy) (string, error, bool) {
  460.     logger.Println("processing", surl)
  461.     var client fasthttp.Client
  462.     if len(proxies) > 0 {
  463.         px := getRandomProxy(proxies)
  464.         client = fasthttp.Client{
  465.             Dial: proxy.FastHTTPProxyDialer(px),
  466.         }
  467.  
  468.         logger.Println("with proxy", px)
  469.     }
  470.     defer client.CloseIdleConnections()
  471.  
  472.     req := fasthttp.AcquireRequest()
  473.     resp := fasthttp.AcquireResponse()
  474.     defer fasthttp.ReleaseRequest(req)
  475.     defer fasthttp.ReleaseResponse(resp)
  476.     cj.FillRequest(req)
  477.  
  478.     req.SetRequestURI(surl)
  479.     req.Header.Set("Content-Type", "text/html;charset=UTF-8")
  480.     req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
  481.     req.Header.Set("user-agent", getRandomUserAgent())
  482.     req.Header.Set("Upgrade-Insecure-Requests", "1")
  483.     req.Header.Set("Connection", "keep-alive")
  484.  
  485.     err := client.DoTimeout(req, resp, 30*time.Second)
  486.     if err != nil {
  487.         return "", err, true
  488.     }
  489.     u, errUrl := url.Parse(surl)
  490.     if errUrl == nil {
  491.         cj = cookiejar.AcquireCookieJar()
  492.         for key, value := range cookieContainer {
  493.             if strings.Contains(key, u.Host) {
  494.                 key = strings.Replace(key, u.Host, "", -1)
  495.                 valueArry := strings.Split(value, "=")
  496.                 value = strings.Split(valueArry[1], ";")[0]
  497.                 cj.Set(key, value)
  498.             }
  499.         }
  500.     }
  501.     resp.Header.VisitAllCookie(func(key, value []byte) {
  502.         c := fasthttp.AcquireCookie()
  503.         defer fasthttp.ReleaseCookie(c)
  504.  
  505.         c.ParseBytes(value)
  506.         var emptyContent = string(key) + "=-;"
  507.         if !strings.Contains(string(value), emptyContent) {
  508.             var middle = strings.Replace(string(value), "Domain=.amazon", "domain=.www.amazon", -1)
  509.             middle = strings.Replace(middle, "domain=.amazon", "domain=.www.amazon", -1)
  510.             cookieContainer[string(key)+u.Host] = middle
  511.         }
  512.     })
  513.     content := string(resp.Body())
  514.     return content, nil, false
  515. }
  516.  
  517. func getRandomProxy(ps []dal.Proxy) string {
  518.     if len(ps) == 0 {
  519.         return ""
  520.     }
  521.     i := rand.Intn(len(ps))
  522.     return ps[i].Proxy
  523. }
  524.  
Advertisement
Add Comment
Please, Sign In to add comment