Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package scrapers
- import (
- "bytes"
- "fmt"
- "html"
- "math/rand"
- "net/url"
- "regexp"
- "strconv"
- "strings"
- "time"
- "github.com/dgrr/cookiejar"
- "github.com/valyala/fasthttp"
- "primeprice.com/dal"
- "primeprice.com/pkg/fasthtml"
- "primeprice.com/pkg/logger"
- "primeprice.com/pkg/proxy"
- "github.com/hashicorp/go-retryablehttp"
- )
// Retry policy applied by retryLoadDocument when a search page fails to load.
const (
	// maxRetries is the maximum number of attempts made for one page.
	maxRetries = 4
	// waitMin and waitMax are the lower and upper bounds handed to the
	// backoff calculation between two attempts.
	waitMin = time.Second
	waitMax = 30 * time.Second
)

// priceCutter matches price-like fragments: an optional minus sign, a digit,
// and then any run of digits, dots and commas. The matches are joined and
// handed to parsePrice, so the pattern must never capture anything but those
// characters.
var priceCutter = regexp.MustCompile(`-?\d[\d.,]*`)
- var cj = cookiejar.AcquireCookieJar()
- var cookieContainer = make(map[string]string)
- var strPost = []byte("POST")
- func processProduct(product string, baseURL string, isWareHouse bool) *dal.Product {
- defer func() {
- if err := recover(); err != nil {
- fmt.Println(err)
- }
- }()
- fp := float64(0)
- price := ""
- asin := fasthtml.GetAttr(product, "data-asin")
- if asin == "" {
- return nil
- }
- titleNode := fasthtml.GetTagWithParams(product, "class", "a-size-medium a-color-base a-text-normal")
- titleTxt := fasthtml.GetInner(titleNode)
- logger.Println("Title:", titleTxt)
- title := html.UnescapeString(titleTxt)
- if title == "" {
- logger.Println("return with title nil")
- return nil
- }
- if isWareHouse {
- // price = product.Find("div.a-spacing-top-mini span.a-color-base").First().Text()
- price = fasthtml.GetInner(fasthtml.GetTagWithParams(product, "class", "a-color-base"))
- } else {
- if !strings.Contains(product, `aria-label="Amazon Prime"`) {
- logger.Println("not prime")
- return nil
- }
- price = fasthtml.GetInner(fasthtml.GetTagWithParams(product, "class", "a-price-whole"))
- }
- submatchall := priceCutter.FindAllString(price, -1)
- fp = parsePrice(strings.Join(submatchall, ""))
- // href := product.Find("a.a-link-normal.a-text-normal").First().Attr("href")
- href := fasthtml.GetAttr(fasthtml.GetTagWithParams(product, "class", "a-link-normal a-text-normal"), "href")
- pd := &dal.Product{
- Asin: asin,
- Title: title,
- Price: fp,
- URL: baseURL + href,
- }
- logger.Println(title[:7], fp, pd.Asin, href[:10])
- return pd
- }
// parsePrice converts a price string into a float64 amount.
//
// Surrounding whitespace is ignored and every "." and "," is removed; the
// remaining digits are read as a number of hundredths, so "1.234,56" yields
// 1234.56 and "12" yields 0.12. The result is 0 when nothing parseable
// remains or when the number does not fit into a float64.
func parsePrice(s string) float64 {
	digits := strings.NewReplacer(".", "", ",", "").Replace(strings.TrimSpace(s))
	hundredths, err := strconv.ParseFloat(digits, 64)
	if err != nil {
		// ParseFloat reports ±Inf together with a range error; never let
		// that leak out as a price.
		return 0
	}
	return hundredths / 100
}
// GetStringInBetweenTwoString returns the part of str that lies between the
// first occurrence of startS and the first occurrence of endS after it. The
// empty string is returned when either marker cannot be found.
func GetStringInBetweenTwoString(str string, startS string, endS string) (result string) {
	begin := strings.Index(str, startS)
	if begin < 0 {
		return ""
	}
	tail := str[begin+len(startS):]
	if end := strings.Index(tail, endS); end >= 0 {
		result = tail[:end]
	}
	return result
}
- func ChangeZipCode(purl string, proxies []dal.Proxy, zipCode string) {
- cookieContainer = make(map[string]string)
- u, err := url.Parse(purl)
- if err != nil {
- logger.Println(err)
- } else {
- logger.Println(u)
- }
- formData := "locationType=LOCATION_INPUT&zipCode=" + zipCode + "&storeContext=generic&deviceType=web&pageType=Gateway&actionSource=glow&almBrandId=undefined"
- domainFix := "com"
- searchTxt := "\"Tu dirección de envío actual es:\""
- if strings.Contains(purl, "www.amazon.es") {
- domainFix = "es"
- searchTxt = "\"Tu dirección de envío actual es:\""
- } else if strings.Contains(purl, "www.amazon.de") {
- domainFix = "de"
- searchTxt = "\"Sie kaufen gerade ein für:\""
- } else if strings.Contains(purl, "www.amazon.fr") {
- domainFix = "fr"
- searchTxt = "\"Votre lieu de livraison est désormais:\""
- } else if strings.Contains(purl, "www.amazon.co.uk") {
- domainFix = "co.uk"
- searchTxt = "\"You're now shopping for delivery to:\""
- } else if strings.Contains(purl, "www.amazon.it") {
- domainFix = "it"
- searchTxt = "\"L'indirizzo di consegna selezionato è:\""
- } else {
- domainFix = "com"
- searchTxt = "\"You're now shopping for delivery to:\""
- }
- homePage := ""
- homePage, err = getRequest("https://www.amazon."+domainFix, proxies)
- //uidCookie := strings.TrimLeft(strings.TrimRight(homePage, "\" })</script>"), "/ah/ajax/counter?ctr=desktop_ajax_atf")
- uidCookie := GetStringInBetweenTwoString(homePage, "/ah/ajax/counter?ctr=desktop_ajax_atf", "\" })</script>")
- uidCookieUrl := "https://www.amazon." + domainFix + "/ah/ajax/counter?ctr=desktop_ajax_atf" + uidCookie
- postRequest(uidCookieUrl, proxies, "")
- tokenPage := ""
- tokenPage, err = getRequest("https://www.amazon."+domainFix+"/gp/glow/get-address-selections.html?deviceType=desktop&pageType=Gateway&storeContext=NoStoreName", proxies)
- //crosToken := strings.TrimLeft(strings.TrimRight(tokenPage, "\", IDs:{\"ADDRESS_LIST\":\"GLUXAddressList\""), "\"You're now shopping for delivery to:\", CSRF_TOKEN : \"")
- crosToken := GetStringInBetweenTwoString(tokenPage, searchTxt+", CSRF_TOKEN : \"", "\", IDs:{\"ADDRESS_LIST\":\"GLUXAddressList\"")
- changeZipCodePostRequest("https://www.amazon."+domainFix+"/gp/delivery/ajax/address-change.html", proxies, formData, crosToken)
- postRequest("https://www.amazon."+domainFix+"/gp/glow/get-location-label.html", proxies, "storeContext=hpc&pageType=Landing")
- }
- // GetAmazonPrimeProducts get all prime products from search url
- func GetAmazonPrimeProducts(purl string, proxies []dal.Proxy, postal string) ([]*dal.Product, error) {
- var ps []*dal.Product
- curPage := 1
- baseURL := ""
- isWareHouse := false
- u, err := url.Parse(purl)
- if err != nil {
- logger.Println(err)
- } else {
- baseURL = u.Host
- }
- ChangeZipCode(purl, proxies, postal)
- for {
- tt := time.Now()
- surl := fmt.Sprintf("%s&page=%d", purl, curPage)
- content, err := retryLoadDocument(surl, proxies)
- if strings.Contains(content, "To discuss automated access to Amazon data") {
- logger.Println("Scrapping Detected.Please change cookie")
- ChangeZipCode(purl, proxies, postal)
- }
- if err != nil {
- logger.Println("page:", curPage, err)
- break
- }
- logger.Println("request done", time.Since(tt).Seconds(), "sec")
- tt = time.Now()
- if strings.Contains(fasthtml.GetTagWithParams(content, "selected", "selected"), "Amazon Warehouse") {
- isWareHouse = true
- }
- // Cut upper js part for collision safety
- searchIdx := strings.Index(content, "<div id=\"search\">")
- if searchIdx == -1 {
- logger.Println("page:", curPage, "search block not found")
- break
- }
- content = content[searchIdx:]
- logger.Println("search id div found")
- items := fasthtml.FindNodes(content, "data-component-type=\"s-search-result\"")
- logger.Println("find done", isWareHouse, len(items), time.Since(tt).Seconds(), "sec")
- tt = time.Now()
- if len(items) == 0 {
- break
- }
- for _, it := range items {
- pd := processProduct(it, baseURL, isWareHouse)
- if pd != nil {
- ps = append(ps, pd)
- }
- }
- logger.Println("process items done", time.Since(tt).Seconds(), "sec")
- if len(ps) > 200 {
- break
- }
- curPage++
- }
- return ps, nil
- }
- func retryLoadDocument(surl string, proxies []dal.Proxy) (string, error) {
- var lastErr error
- for n := 0; n < maxRetries; n++ {
- document, err, shouldRetry := loadDocument(surl, proxies)
- if !shouldRetry {
- return document, err
- }
- lastErr = err
- backoff := retryablehttp.DefaultBackoff(waitMin, waitMax, n, nil)
- logger.Println(err, "Retrying in", backoff)
- time.Sleep(backoff)
- }
- return "", lastErr
- }
- func getRequest(surl string, proxies []dal.Proxy) (string, error) {
- logger.Println("processing", surl)
- var client fasthttp.Client
- if len(proxies) > 0 {
- px := getRandomProxy(proxies)
- client = fasthttp.Client{
- Dial: proxy.FastHTTPProxyDialer(px),
- }
- logger.Println("with proxy", px)
- }
- defer client.CloseIdleConnections()
- req := fasthttp.AcquireRequest()
- resp := fasthttp.AcquireResponse()
- defer fasthttp.ReleaseRequest(req)
- defer fasthttp.ReleaseResponse(resp)
- // Acquire cookie jar
- u, errUrl := url.Parse(surl)
- if errUrl == nil {
- cj = cookiejar.AcquireCookieJar()
- for key, value := range cookieContainer {
- if strings.Contains(key, u.Host) {
- key = strings.Replace(key, u.Host, "", -1)
- valueArry := strings.Split(value, "=")
- value = strings.Split(valueArry[1], ";")[0]
- cj.Set(key, value)
- }
- }
- }
- cj.FillRequest(req)
- req.SetRequestURI(surl)
- req.Header.Set("Content-Type", "text/html;charset=UTF-8")
- req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
- req.Header.Set("Accept-Encoding", "gzip")
- req.Header.Set("user-agent", getRandomUserAgent())
- req.Header.Set("Upgrade-Insecure-Requests", "1")
- req.Header.Set("Connection", "keep-alive")
- err := client.DoTimeout(req, resp, 30*time.Second)
- if err != nil {
- return "", err
- }
- resp.Header.VisitAllCookie(func(key, value []byte) {
- c := fasthttp.AcquireCookie()
- defer fasthttp.ReleaseCookie(c)
- c.ParseBytes(value)
- var emptyContent = string(key) + "=-;"
- if !strings.Contains(string(value), emptyContent) {
- var middle = strings.Replace(string(value), "Domain=.amazon", "domain=.www.amazon", -1)
- middle = strings.Replace(middle, "domain=.amazon", "domain=.www.amazon", -1)
- cookieContainer[string(key)+u.Host] = middle
- }
- })
- contentEncoding := resp.Header.Peek("Content-Encoding")
- var body []byte
- if bytes.EqualFold(contentEncoding, []byte("gzip")) {
- fmt.Println("Unzipping...")
- body, _ = resp.BodyGunzip()
- } else {
- body = resp.Body()
- }
- content := string(body)
- return content, nil
- }
- func changeZipCodePostRequest(surl string, proxies []dal.Proxy, formData string, token string) (string, error, bool) {
- logger.Println("processing", surl)
- var client fasthttp.Client
- if len(proxies) > 0 {
- px := getRandomProxy(proxies)
- client = fasthttp.Client{
- Dial: proxy.FastHTTPProxyDialer(px),
- }
- logger.Println("with proxy", px)
- }
- defer client.CloseIdleConnections()
- req := fasthttp.AcquireRequest()
- resp := fasthttp.AcquireResponse()
- defer fasthttp.ReleaseRequest(req)
- defer fasthttp.ReleaseResponse(resp)
- // Acquire cookie jar
- u, errUrl := url.Parse(surl)
- if errUrl == nil {
- cj = cookiejar.AcquireCookieJar()
- for key, value := range cookieContainer {
- if strings.Contains(key, u.Host) {
- key = strings.Replace(key, u.Host, "", -1)
- valueArry := strings.Split(value, "=")
- value = strings.Split(valueArry[1], ";")[0]
- cj.Set(key, value)
- }
- }
- }
- cj.FillRequest(req)
- req.SetRequestURI(surl)
- req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
- req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
- req.Header.Set("User-Agent", getRandomUserAgent())
- req.Header.Set("Accept-Encoding", "gzip")
- req.Header.Set("Upgrade-Insecure-Requests", "1")
- req.Header.Set("anti-csrftoken-a2z", token)
- req.Header.Set("Connection", "keep-alive")
- req.Header.SetMethodBytes(strPost)
- req.SetBodyString(formData)
- err := client.DoTimeout(req, resp, 30*time.Second)
- if err != nil {
- return "", err, true
- }
- resp.Header.VisitAllCookie(func(key, value []byte) {
- c := fasthttp.AcquireCookie()
- defer fasthttp.ReleaseCookie(c)
- c.ParseBytes(value)
- var emptyContent = string(key) + "=-;"
- if !strings.Contains(string(value), emptyContent) {
- var middle = strings.Replace(string(value), "Domain=.amazon", "domain=.www.amazon", -1)
- middle = strings.Replace(middle, "domain=.amazon", "domain=.www.amazon", -1)
- cookieContainer[string(key)+u.Host] = middle
- }
- })
- contentEncoding := resp.Header.Peek("Content-Encoding")
- var body []byte
- if bytes.EqualFold(contentEncoding, []byte("gzip")) {
- fmt.Println("Unzipping...")
- body, _ = resp.BodyGunzip()
- } else {
- body = resp.Body()
- }
- content := string(body)
- return content, nil, false
- }
- func postRequest(surl string, proxies []dal.Proxy, formData string) (string, error, bool) {
- logger.Println("processing", surl)
- var client fasthttp.Client
- if len(proxies) > 0 {
- px := getRandomProxy(proxies)
- client = fasthttp.Client{
- Dial: proxy.FastHTTPProxyDialer(px),
- }
- logger.Println("with proxy", px)
- }
- defer client.CloseIdleConnections()
- req := fasthttp.AcquireRequest()
- resp := fasthttp.AcquireResponse()
- defer fasthttp.ReleaseRequest(req)
- defer fasthttp.ReleaseResponse(resp)
- // Acquire cookie jar
- u, errUrl := url.Parse(surl)
- if errUrl == nil {
- cj = cookiejar.AcquireCookieJar()
- for key, value := range cookieContainer {
- if strings.Contains(key, u.Host) {
- key = strings.Replace(key, u.Host, "", -1)
- valueArry := strings.Split(value, "=")
- value = strings.Split(valueArry[1], ";")[0]
- cj.Set(key, value)
- }
- }
- }
- cj.FillRequest(req)
- req.SetRequestURI(surl)
- req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
- req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
- req.Header.Set("User-Agent", getRandomUserAgent())
- req.Header.Set("Accept-Encoding", "gzip")
- req.Header.Set("Upgrade-Insecure-Requests", "1")
- req.Header.Set("Connection", "keep-alive")
- req.Header.SetMethodBytes(strPost)
- req.SetBodyString(formData)
- err := client.DoTimeout(req, resp, 30*time.Second)
- if err != nil {
- return "", err, true
- }
- resp.Header.VisitAllCookie(func(key, value []byte) {
- c := fasthttp.AcquireCookie()
- defer fasthttp.ReleaseCookie(c)
- c.ParseBytes(value)
- var emptyContent = string(key) + "=-;"
- if !strings.Contains(string(value), emptyContent) {
- var middle = strings.Replace(string(value), "Domain=.amazon", "domain=.www.amazon", -1)
- middle = strings.Replace(middle, "domain=.amazon", "domain=.www.amazon", -1)
- cookieContainer[string(key)+u.Host] = middle
- }
- })
- contentEncoding := resp.Header.Peek("Content-Encoding")
- var body []byte
- if bytes.EqualFold(contentEncoding, []byte("gzip")) {
- fmt.Println("Unzipping...")
- body, _ = resp.BodyGunzip()
- } else {
- body = resp.Body()
- }
- content := string(body)
- return content, nil, false
- }
- func loadDocument(surl string, proxies []dal.Proxy) (string, error, bool) {
- logger.Println("processing", surl)
- var client fasthttp.Client
- if len(proxies) > 0 {
- px := getRandomProxy(proxies)
- client = fasthttp.Client{
- Dial: proxy.FastHTTPProxyDialer(px),
- }
- logger.Println("with proxy", px)
- }
- defer client.CloseIdleConnections()
- req := fasthttp.AcquireRequest()
- resp := fasthttp.AcquireResponse()
- defer fasthttp.ReleaseRequest(req)
- defer fasthttp.ReleaseResponse(resp)
- cj.FillRequest(req)
- req.SetRequestURI(surl)
- req.Header.Set("Content-Type", "text/html;charset=UTF-8")
- req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
- req.Header.Set("user-agent", getRandomUserAgent())
- req.Header.Set("Upgrade-Insecure-Requests", "1")
- req.Header.Set("Connection", "keep-alive")
- err := client.DoTimeout(req, resp, 30*time.Second)
- if err != nil {
- return "", err, true
- }
- u, errUrl := url.Parse(surl)
- if errUrl == nil {
- cj = cookiejar.AcquireCookieJar()
- for key, value := range cookieContainer {
- if strings.Contains(key, u.Host) {
- key = strings.Replace(key, u.Host, "", -1)
- valueArry := strings.Split(value, "=")
- value = strings.Split(valueArry[1], ";")[0]
- cj.Set(key, value)
- }
- }
- }
- resp.Header.VisitAllCookie(func(key, value []byte) {
- c := fasthttp.AcquireCookie()
- defer fasthttp.ReleaseCookie(c)
- c.ParseBytes(value)
- var emptyContent = string(key) + "=-;"
- if !strings.Contains(string(value), emptyContent) {
- var middle = strings.Replace(string(value), "Domain=.amazon", "domain=.www.amazon", -1)
- middle = strings.Replace(middle, "domain=.amazon", "domain=.www.amazon", -1)
- cookieContainer[string(key)+u.Host] = middle
- }
- })
- content := string(resp.Body())
- return content, nil, false
- }
- func getRandomProxy(ps []dal.Proxy) string {
- if len(ps) == 0 {
- return ""
- }
- i := rand.Intn(len(ps))
- return ps[i].Proxy
- }
Advertisement
Add Comment
Please, Sign In to add comment