rhat398

Untitled

Jun 29th, 2022
3,317
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.53 KB | None | 0 0
import json
import os
import uuid
from urllib.parse import parse_qs, unquote, urlencode, urlsplit

import pandas as pd
import scrapy
  6.  
  7.  
  8. API_KEY = "645c5d2f619d86d1ae99916c05d6d58b"
  9.  
  10.  
  11. def get_scraperapi_url(url):
  12. payload = {
  13. "api_key": API_KEY,
  14. "url": url,
  15. }
  16. proxy_url = "http://api.scraperapi.com/?" + urlencode(payload)
  17. return proxy_url
  18.  
  19.  
  20. class CarrefourKSA(scrapy.Spider):
  21. name = "carrefour-ksa-1"
  22.  
  23. custom_settings = {
  24. "FEED_FORMAT": "csv",
  25. "FEED_URI": "carrefour-ksa.csv",
  26. "LOG_FILE": "carrefour-ksa.log",
  27. # "IMAGES_STORE": catalouge_id,
  28. }
  29.  
  30. headers = {
  31. "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="100", "Yandex";v="22"',
  32. "tracestate": "3355720@nr=0-1-3355720-1021845705-72a4dc2922710b2a----1656355603002",
  33. "env": "prod",
  34. "newrelic": "eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6IjMzNTU3MjAiLCJhcCI6IjEwMjE4NDU3MDUiLCJpZCI6IjcyYTRkYzI5MjI3MTBiMmEiLCJ0ciI6ImZmZDkzYzdhNTYxMTlkZTk1ZTBlMjMxYjBmMGZkOGJjIiwidGkiOjE2NTYzNTU2MDMwMDJ9fQ==",
  35. "lang": "en",
  36. "userId": "anonymous",
  37. "X-Requested-With": "XMLHttpRequest",
  38. "storeId": "mafsau",
  39. "sec-ch-ua-platform": '"Linux"',
  40. "traceparent": "00-ffd93c7a56119de95e0e231b0f0fd8bc-72a4dc2922710b2a-01",
  41. "sec-ch-ua-mobile": "?0",
  42. "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.143 YaBrowser/22.5.0.1879 (beta) Yowser/2.5 Safari/537.36",
  43. "langCode": "en",
  44. "appId": "Reactweb",
  45. }
  46.  
  47. pd_data = []
  48.  
  49. def start_requests(self):
  50. categories = ["NFKSA1200000"]
  51. languages = ["en", "ar"]
  52.  
  53. for lang in languages:
  54. for category in categories:
  55. yield scrapy.Request(
  56. url=f"https://www.carrefourksa.com/mafsau/{lang}/c/{category}?currentPage=0&filter=&nextPageOffset=0&pageSize=60&sortBy=relevance",
  57. headers=self.headers,
  58. callback=self.parse_links,
  59. )
  60.  
  61. def parse_links(self, response):
  62.  
  63. product_listings = response.css("div.css-1itwyrf ::attr(href)").extract()
  64.  
  65. for product_link in product_listings:
  66. product_url = "https://www.carrefourksa.com/" + product_link
  67.  
  68. yield scrapy.Request(
  69. url=get_scraperapi_url(product_url),
  70. headers=self.headers,
  71. callback=self.parse_product,
  72. )
  73.  
  74. def parse_product(self, response):
  75. data = (
  76. response.css('script[id="__NEXT_DATA__"]')
  77. .get()
  78. .replace('<script id="__NEXT_DATA__" type="application/json">', "")
  79. .replace("</script>", "")
  80. )
  81. json_data = json.loads(data)
  82. link_url = unquote(response.url)
  83. LabebStoreId = "6019"
  84. catalog_uuid = ""
  85. lang = ""
  86. if "/en/" in link_url:
  87. lang = "en"
  88. if "/ar/" in link_url:
  89. lang = "ar"
  90. breadcrumb = response.css("div.css-iamwo8 > a::text").extract()[1:]
  91. for idx, cat in enumerate(breadcrumb):
  92. bc_no = f"cat_{idx}_name"
  93. bc = breadcrumb[idx]
  94. catalogname = response.css("h1.css-106scfp::text").get()
  95. try:
  96. description = ", ".join(response.css("div.css-16lm0vc ::text").getall())
  97. except:
  98. description = ""
  99. try:
  100. keys = response.css("div.css-pi51ey::text").getall()
  101. values = response.css("h3.css-1ps12pz::text").getall()
  102. properties = {keys[i]: values[i] for i in range(len(keys))}
  103. raw_properties = json.dumps(properties, ensure_ascii=False).encode("utf-8")
  104. properties = raw_properties.decode()
  105. except:
  106. properties = ""
  107. try:
  108. price = response.css("h2.css-1i90gmp::text").getall()[2]
  109. except:
  110. price = response.css("h2.css-17ctnp::text").getall()[2]
  111. try:
  112. price_before_discount = response.css("del.css-1bdwabt::text").getall()[2]
  113. except:
  114. price_before_discount = ""
  115. externallink = link_url.split("=")[2]
  116. Rating = ""
  117. delivery = response.css("span.css-u98ylp::text").get()
  118. try:
  119. discount = f'{json_data["props"]["initialProps"]["pageProps"]["initialData"]["products"][0]["offers"][0]["stores"][0]["price"]["discount"]["information"]["amount"]}%'
  120. except:
  121. discount = ""
  122. self.pd_data.append(
  123. {
  124. "LabebStoreId": LabebStoreId,
  125. "catalog_uuid": catalog_uuid,
  126. "lang": lang,
  127. bc_no: bc,
  128. "catalogname": catalogname,
  129. "description": description,
  130. "properties": properties,
  131. "price": price,
  132. "price_before_discount": price_before_discount,
  133. "externallink": externallink,
  134. "Rating": Rating,
  135. "delivery": delivery,
  136. "discount": discount,
  137. }
  138. )
  139.  
  140. df = pd.DataFrame(self.pd_data)
  141. df.to_csv("carrefour-ksa.csv", index=False)
  142.  
  143. try:
  144. df_2 = pd.read_csv("carrefour-ksa.csv")
  145. sorted = df_2.externallink.str[-6:].sort_values()
  146. sort = sorted.index
  147. df_orgnized = df_2.iloc[sort]
  148. df_orgnized.catalog_uuid = df_orgnized.externallink.str[-6:]
  149. df_orgnized.to_excel("carrefour-ksa-final.xlsx", encoding="utf-16")
  150. except:
  151. pass
  152.  
Advertisement
Add Comment
Please, Sign In to add comment