rhat398

Untitled

May 15th, 2023
786
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.19 KB | None | 0 0
import hashlib
import json
import os
from pathlib import Path
from urllib.parse import urlparse

import scrapy
from scrapy.pipelines.images import ImagesPipeline
  7.  
  8.  
  9. def make_path(urls):
  10.     img_path = []
  11.     for url in urls:
  12.         image_url_hash = hashlib.md5(url.encode()).hexdigest()
  13.         img_path.append(
  14.             image_url_hash[:3]
  15.             + "/"
  16.             + image_url_hash[3:6]
  17.             + "/"
  18.             + image_url_hash[6:9]
  19.             + "/"
  20.             + image_url_hash
  21.         )
  22.     return img_path
  23.  
  24.  
class HouzzSimilar(scrapy.Spider):
    """Crawl houzz.com photo "ideas" together with their similar spaces.

    Flow: ideas listing page -> each idea's photo page -> the parent
    project page -> every photo on that project -> per-photo details,
    then the site's "getSimilarSpaces" JSON API is walked one URL at a
    time to append similar ideas (and their image URLs) to the item
    before it is finally yielded.
    """

    name = "houzz_crawler"

    # Per-spider Scrapy settings: log file, image download directory,
    # a JSON feed export, and the custom image pipeline that downloads
    # everything accumulated in item["image_urls"].
    custom_settings = {
        "LOG_FILE": "houzz_spider.log",
        "IMAGES_STORE": "houzz_images",
        "FEEDS": {
            "houzz.json": {
                "format": "json",
            }
        },
        "ITEM_PIPELINES": {
            "houzz_crawler.pipelines.HouzzImagePipeline": 1,
        },
    }

    # Browser-like headers captured from a real session.  NOTE(review):
    # the csrf token, rrid and experiment/session ids are session-bound
    # and will expire -- confirm they are refreshed before each run.
    headers = {
        "authority": "www.houzz.com",
        "accept": "*/*",
        "accept-language": "en,ru;q=0.9",
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "origin": "https://www.houzz.com",
        "referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
        "rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
        "sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
        "x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
        "x-hz-request": "true",
        "x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
        "x-ol-exp-name": "Photo - View",
        "x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
        "x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
        "x-ol-product": "Houzz",
        "x-ol-product-variant": "Houzz US",
        "x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
        "x-requested-with": "XMLHttpRequest",
    }

    # Session cookies captured alongside the headers above; only sent
    # with the POST to the similar-spaces API in parse_idea_details().
    cookies = {
        "v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
        "vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
        "_gcl_au": "1.1.17413922.1683311086",
        "crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
        "_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
        "_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
        "g_state": '{"i_p":1684144918349,"i_l":3}',
        "browseResultSetGridWidth": "554",
        "_gid": "GA1.2.1176067560.1683652076",
        "ln_or": "eyIzODE1NzE2IjoiZCJ9",
        "_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
        "jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
        "documentWidth": "1318",
        "_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
        "_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
        "_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
        "IR_gbd": "houzz.com",
        "IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
        "_ga": "GA1.2.1658927820.1683311086",
        "_dc_gtm_UA-3519678-1": "1",
        "_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
        "hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
    }

    # Landing page listing "home design ideas" photo cards.
    base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"

    # JSON endpoint (POST) that returns spaces similar to a given spaceId.
    similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"

    def start_requests(self):
        """Entry point: fetch the ideas listing page."""
        yield scrapy.Request(
            url=self.base_url, headers=self.headers, callback=self.parse_ideas
        )

    def parse_ideas(self, response):
        """Extract the idea links from the listing page and follow each one."""
        ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()
        # Pagination figures scraped from the "1 - N of M" text fragments.
        # NOTE(review): neither value is used below -- pagination across
        # listing pages appears unimplemented; confirm intent.
        total_photos = int(
            response.css("span.hz-top-pagination__text ::text")
            .extract()[4]
            .replace(",", "")
        )
        photos_per_page = int(
            response.css("span.hz-top-pagination__text ::text").extract()[2]
        )

        for idea in ideas:
            yield scrapy.Request(
                url=idea, headers=self.headers, callback=self.parse_project_url
            )

    def parse_project_url(self, response):
        """Resolve the photo page to its parent project page and follow it."""
        # The page embeds its state as JSON inside <script id="hz-ctx">.
        data = response.css('script[id="hz-ctx"] ::text').get()
        json_data = json.loads(data)
        space_id = json_data["data"]["pageContentData"]["spaceId"]
        space = json_data["data"]["stores"]["data"]["SpaceStore"]["data"][space_id]
        project_id = space["projectId"]
        space_url = space["url"]
        # Derive the project URL from the space URL by swapping the URL
        # type markers (phvw -> pj, vp -> vj) and appending the project id.
        raw_project_url = (
            space_url.split("~")[0].replace("phvw", "pj").replace("vp", "vj")
        )
        project_url = raw_project_url + "~" + str(project_id)

        yield scrapy.Request(
            url=project_url, headers=self.headers, callback=self.parse_project_idea
        )

    def parse_project_idea(self, response):
        """Follow every photo link found in the project's photo grid."""
        idea_board = response.css(
            "div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
        ).extract()

        for idea_link in idea_board:
            yield scrapy.Request(
                url=idea_link,
                headers=self.headers,
                callback=self.parse_idea_details,
            )

    def parse_idea_details(self, response):
        """Scrape one photo page into an item, then query the similar API.

        The partially-filled item is passed along via cb_kwargs so the
        chained callbacks can keep appending similar ideas to it.
        """
        item = {}
        # NOTE(review): "ideadId" (sic) is the key downstream consumers
        # will see in the JSON feed -- kept as-is to avoid breaking them.
        item["ideadId"] = response.url.split("~")[-1]
        item["ideaUrl"] = response.url
        item["Title"] = response.css(
            "h1.hz-view-photo__space-info__title.text-bold::text"
        ).get()
        subtitle = response.css(
            "h1.hz-view-photo__space-info__subtitle.text-m::text"
        ).get()
        item["subTitle"] = subtitle
        item["spaceDescription"] = response.css(
            "div.hz-view-photo__space-info__description.text-m ::text"
        ).get()
        item["uploadedBy"] = response.css("div.vph-owner-info__details ::text").get()
        item["Tags"] = [
            {"tag": t}
            for t in response.css(
                "ul.hz-view-photo__breadcrumb.hz-track-me ::text"
            ).extract()
        ]
        # Star rating is inferred from the count of highlighted star icons.
        item["starRating"] = len(
            response.css(
                "span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
            )
        )
        item["numberOfReviews"] = response.css(
            "span.hz-star-rate__review-string::text"
        ).get()
        # you can use the "imageURL" field for this items images and then
        # use the "image_urls" field to collect all the images for each
        # of the similar items in the chained callbacks.
        item["imageURL"] = response.css(
            "div.view-photo-image-pane > img::attr(src)"
        ).extract()

        item["image_urls"] = item["imageURL"].copy()  # <- make sure to copy()
        item["similarIdeas"] = []
        item["path"] = ""  # <- lambda path function

        spaceId = response.url.split("~")[-1]
        # URL-encoded form body for the similar-spaces API.  NOTE(review):
        # the contentDescriptor still hard-codes id 160668148 (the photo
        # the session was captured from) instead of the current spaceId --
        # confirm whether the API ignores that field.
        body = f"spaceId={spaceId}&fromItem=0&itemsPerPage=10&contentDescriptor=%7B%22t%22%3A1%2C%22et%22%3A3%2C%22id%22%3A160668148%7D"
        yield scrapy.Request(
            url=self.similar_ideas_api_url,
            method="POST",
            cookies=self.cookies,
            headers=self.headers,
            body=body,
            cb_kwargs={"item": item},
            callback=self.get_similar_ideas_urls,
        )

    def get_similar_ideas_urls(self, response, item=None):
        """Collect the similar-space URLs and start crawling them one by one."""
        data = response.json()["spaceData"]["spaces"]
        space_keys = list(data.keys())
        # A set both deduplicates and lets parse_similar_ideas pop URLs
        # off until the chain is exhausted.
        space_urls = set([data[key]["url"] for key in space_keys])
        yield scrapy.Request(
            url=space_urls.pop(),
            headers=self.headers,
            cb_kwargs={"item": item, "space_urls": space_urls},
            callback=self.parse_similar_ideas,
        )

    def parse_similar_ideas(self, response, item=None, space_urls=None):
        """Scrape one similar idea, then chain to the next or yield the item."""
        # add the image urls to the top master list as well as locally.
        image_urls = response.css(
            "div.view-photo-image-pane > img::attr(src)"
        ).extract()
        item["image_urls"] += image_urls

        item["similarIdeas"].append(
            {
                "ideaId": response.url.split("~")[-1],
                "ideaUrl": response.url,
                "Title": response.css(
                    "h1.hz-view-photo__space-info__title.text-bold::text"
                ).get(),
                "subTitle": response.css(
                    "h1.hz-view-photo__space-info__subtitle.text-m::text"
                ).get(),
                "spaceDescription": response.css(
                    "div.hz-view-photo__space-info__description.text-m ::text"
                ).get(),
                "uploadedBy": response.css("div.vph-owner-info__details ::text").get(),
                "Tags": [
                    {"tag": t}
                    for t in response.css(
                        "ul.hz-view-photo__breadcrumb.hz-track-me ::text"
                    ).extract()
                ],
                "starRating": len(
                    response.css(
                        "span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
                    )
                ),
                "numberOfReviews": response.css(
                    "span.hz-star-rate__review-string::text"
                ).get(),
                "image_urls": image_urls,  # <- set image_urls here too
                "path": make_path(image_urls),  # <- calculate paths
            }
        )
        # Recurse through the remaining URLs; only yield the finished item
        # once the set is empty.  dont_filter is required because the same
        # similar URL can appear for several source photos.
        if len(space_urls) > 0:
            yield scrapy.Request(
                url=space_urls.pop(),
                headers=self.headers,
                cb_kwargs={"item": item, "space_urls": space_urls},
                dont_filter=True,
                callback=self.parse_similar_ideas,
            )
        else:
            yield item
  258.  
  259.  
  260. # IMAGE_PIPELINE
  261.  
  262. class HouzzImagePipeline(ImagesPipeline):  # Inherit the ImagePipeline class
  263.     def get_media_requests(self, item, info):
  264.         for image_url in item["image_urls"]:
  265.             yield scrapy.Request(image_url)
  266.  
  267.     def file_path(self, request, response=None, info=None, *, item=None):
  268.         # use the same calculation as in your spider file to determine paths
  269.         image_url_hash = hashlib.md5(request.url.encode()).hexdigest()
  270.         item[
  271.             "path"
  272.         ] = f"{image_url_hash[:3]}/{image_url_hash[3:6]}/{image_url_hash[6:9]}"
  273.         image_filename = f"{image_url_hash}.jpg"
  274.         return item["path"] + image_filename
  275.  
  276.     def item_completed(self, results, item, info):
  277.         # once the item is complete you can delete the master
  278.         # image_urls list and rename the temporary one
  279.         item["image_urls"] = item["imageURL"]
  280.         del item["imageURL"]
  281.         return item
Advertisement
Add Comment
Please, Sign In to add comment