Not a member of Pastebin yet? Sign up — it unlocks many cool features!
import hashlib
import json
import os
from pathlib import Path
from urllib.parse import urlparse

import scrapy
from scrapy.pipelines.images import ImagesPipeline
def make_path(urls):
    """Map each image URL to a fanned-out relative storage path.

    Each path has the shape ``abc/def/ghi/<md5-hex>`` where the three
    directory levels are the first nine hex digits of the URL's MD5
    digest — this spreads files across nested directories instead of
    dumping them all into one folder.
    """

    def fan_out(url):
        digest = hashlib.md5(url.encode()).hexdigest()
        return f"{digest[:3]}/{digest[3:6]}/{digest[6:9]}/{digest}"

    return [fan_out(url) for url in urls]
class HouzzSimilar(scrapy.Spider):
    """Crawl houzz.com photo "ideas".

    Flow: browse page -> each photo page -> its parent project page ->
    every photo in the project -> for each photo, POST to Houzz's internal
    ``getSimilarSpaces`` API and sequentially visit each similar-space URL,
    accumulating them into the item before yielding it.
    """

    name = "houzz_crawler"

    custom_settings = {
        "LOG_FILE": "houzz_spider.log",
        "IMAGES_STORE": "houzz_images",
        "FEEDS": {
            "houzz.json": {
                "format": "json",
            }
        },
        "ITEM_PIPELINES": {
            "houzz_crawler.pipelines.HouzzImagePipeline": 1,
        },
    }

    # NOTE(review): headers and cookies were captured from a real browser
    # session; the CSRF token and session ids are session-specific and will
    # expire — expect the crawl to break until they are refreshed.
    headers = {
        "authority": "www.houzz.com",
        "accept": "*/*",
        "accept-language": "en,ru;q=0.9",
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "origin": "https://www.houzz.com",
        "referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
        "rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
        "sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
        "x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
        "x-hz-request": "true",
        "x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
        "x-ol-exp-name": "Photo - View",
        "x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
        "x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
        "x-ol-product": "Houzz",
        "x-ol-product-variant": "Houzz US",
        "x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
        "x-requested-with": "XMLHttpRequest",
    }

    cookies = {
        "v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
        "vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
        "_gcl_au": "1.1.17413922.1683311086",
        "crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
        "_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
        "_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
        "g_state": '{"i_p":1684144918349,"i_l":3}',
        "browseResultSetGridWidth": "554",
        "_gid": "GA1.2.1176067560.1683652076",
        "ln_or": "eyIzODE1NzE2IjoiZCJ9",
        "_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
        "jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
        "documentWidth": "1318",
        "_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
        "_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
        "_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
        "IR_gbd": "houzz.com",
        "IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
        "_ga": "GA1.2.1658927820.1683311086",
        "_dc_gtm_UA-3519678-1": "1",
        "_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
        "hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
    }

    base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"
    similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"

    def start_requests(self):
        """Entry point: request the first browse page of home-design ideas."""
        yield scrapy.Request(
            url=self.base_url, headers=self.headers, callback=self.parse_ideas
        )

    def parse_ideas(self, response):
        """Follow every photo card on the browse page to its detail page.

        NOTE(review): the original code also scraped the pagination counters
        (total photos / photos-per-page) but never used them; pagination is
        not implemented, so only the first result page is crawled. The unused
        extraction was removed — it could raise IndexError on layout changes
        for no benefit.
        """
        ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()
        for idea in ideas:
            yield scrapy.Request(
                url=idea, headers=self.headers, callback=self.parse_project_url
            )

    def parse_project_url(self, response):
        """Resolve the parent project URL from the photo page's embedded
        server-side state (the ``hz-ctx`` JSON script tag) and follow it."""
        data = response.css('script[id="hz-ctx"] ::text').get()
        json_data = json.loads(data)
        space_id = json_data["data"]["pageContentData"]["spaceId"]
        space = json_data["data"]["stores"]["data"]["SpaceStore"]["data"][space_id]
        project_id = space["projectId"]
        space_url = space["url"]
        # A photo URL ".../phvw-vp~<spaceId>" maps to its project URL
        # ".../pj-vj~<projectId>".
        raw_project_url = (
            space_url.split("~")[0].replace("phvw", "pj").replace("vp", "vj")
        )
        project_url = raw_project_url + "~" + str(project_id)
        yield scrapy.Request(
            url=project_url, headers=self.headers, callback=self.parse_project_idea
        )

    def parse_project_idea(self, response):
        """Follow every photo link found on a project page."""
        idea_board = response.css(
            "div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
        ).extract()
        for idea_link in idea_board:
            yield scrapy.Request(
                url=idea_link,
                headers=self.headers,
                callback=self.parse_idea_details,
            )

    def parse_idea_details(self, response):
        """Scrape one photo page into an item dict, then POST to the
        similar-spaces API to chain-collect related ideas before yielding."""
        item = {}
        # BUG FIX: key was "ideadId" (typo), inconsistent with the "ideaId"
        # key used for each entry in item["similarIdeas"].
        item["ideaId"] = response.url.split("~")[-1]
        item["ideaUrl"] = response.url
        item["Title"] = response.css(
            "h1.hz-view-photo__space-info__title.text-bold::text"
        ).get()
        subtitle = response.css(
            "h1.hz-view-photo__space-info__subtitle.text-m::text"
        ).get()
        item["subTitle"] = subtitle
        item["spaceDescription"] = response.css(
            "div.hz-view-photo__space-info__description.text-m ::text"
        ).get()
        item["uploadedBy"] = response.css("div.vph-owner-info__details ::text").get()
        item["Tags"] = [
            {"tag": t}
            for t in response.css(
                "ul.hz-view-photo__breadcrumb.hz-track-me ::text"
            ).extract()
        ]
        # One highlighted star element per rating point.
        item["starRating"] = len(
            response.css(
                "span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
            )
        )
        item["numberOfReviews"] = response.css(
            "span.hz-star-rate__review-string::text"
        ).get()
        # "imageURL" keeps this item's own images; "image_urls" is the
        # running master list (this item's + all similar items') consumed by
        # the image pipeline.
        item["imageURL"] = response.css(
            "div.view-photo-image-pane > img::attr(src)"
        ).extract()
        item["image_urls"] = item["imageURL"].copy()  # copy: mutated below
        item["similarIdeas"] = []
        item["path"] = ""  # placeholder; set by HouzzImagePipeline.file_path
        space_id = item["ideaId"]
        # NOTE(review): the contentDescriptor id (160668148) is hard-coded to
        # the photo this request body was captured from — confirm whether the
        # API actually requires it to match spaceId.
        body = f"spaceId={space_id}&fromItem=0&itemsPerPage=10&contentDescriptor=%7B%22t%22%3A1%2C%22et%22%3A3%2C%22id%22%3A160668148%7D"
        yield scrapy.Request(
            url=self.similar_ideas_api_url,
            method="POST",
            cookies=self.cookies,
            headers=self.headers,
            body=body,
            cb_kwargs={"item": item},
            callback=self.get_similar_ideas_urls,
        )

    def get_similar_ideas_urls(self, response, item=None):
        """Start the sequential crawl over the similar-space URLs returned
        by the getSimilarSpaces API, threading ``item`` through cb_kwargs."""
        data = response.json()["spaceData"]["spaces"]
        space_urls = {space["url"] for space in data.values()}
        # Robustness fix: the original called space_urls.pop() unconditionally
        # and raised KeyError when the API returned no similar spaces.
        if not space_urls:
            yield item
            return
        yield scrapy.Request(
            url=space_urls.pop(),
            headers=self.headers,
            cb_kwargs={"item": item, "space_urls": space_urls},
            callback=self.parse_similar_ideas,
        )

    def parse_similar_ideas(self, response, item=None, space_urls=None):
        """Scrape one similar-idea page into item["similarIdeas"], then
        either recurse on the next URL or yield the completed item."""
        # Add the image urls to the top master list as well as locally.
        image_urls = response.css(
            "div.view-photo-image-pane > img::attr(src)"
        ).extract()
        item["image_urls"] += image_urls
        item["similarIdeas"].append(
            {
                "ideaId": response.url.split("~")[-1],
                "ideaUrl": response.url,
                "Title": response.css(
                    "h1.hz-view-photo__space-info__title.text-bold::text"
                ).get(),
                "subTitle": response.css(
                    "h1.hz-view-photo__space-info__subtitle.text-m::text"
                ).get(),
                "spaceDescription": response.css(
                    "div.hz-view-photo__space-info__description.text-m ::text"
                ).get(),
                "uploadedBy": response.css("div.vph-owner-info__details ::text").get(),
                "Tags": [
                    {"tag": t}
                    for t in response.css(
                        "ul.hz-view-photo__breadcrumb.hz-track-me ::text"
                    ).extract()
                ],
                "starRating": len(
                    response.css(
                        "span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
                    )
                ),
                "numberOfReviews": response.css(
                    "span.hz-star-rate__review-string::text"
                ).get(),
                "image_urls": image_urls,  # this similar idea's own images
                "path": make_path(image_urls),  # fan-out storage paths
            }
        )
        if len(space_urls) > 0:
            # dont_filter: similar-space URLs can repeat across items and
            # would otherwise be dropped by the dupe filter.
            yield scrapy.Request(
                url=space_urls.pop(),
                headers=self.headers,
                cb_kwargs={"item": item, "space_urls": space_urls},
                dont_filter=True,
                callback=self.parse_similar_ideas,
            )
        else:
            yield item
# IMAGE_PIPELINE
class HouzzImagePipeline(ImagesPipeline):
    """Image pipeline that stores each downloaded image under an MD5-based
    fanned-out path, mirroring the spider's ``make_path`` scheme."""

    def get_media_requests(self, item, info):
        """Request every URL in the item's master image_urls list."""
        for image_url in item["image_urls"]:
            yield scrapy.Request(image_url)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for one image.

        Uses the same calculation as the spider's ``make_path``:
        ``abc/def/ghi/<md5-hex>.jpg``.

        NOTE(review): item["path"] is overwritten once per image, so after
        the crawl it holds only the last image's directory — confirm that is
        intended.
        """
        image_url_hash = hashlib.md5(request.url.encode()).hexdigest()
        item[
            "path"
        ] = f"{image_url_hash[:3]}/{image_url_hash[3:6]}/{image_url_hash[6:9]}"
        image_filename = f"{image_url_hash}.jpg"
        # BUG FIX: the original returned item["path"] + image_filename with no
        # separator, producing "abc/def/ghi<hash>.jpg" instead of the intended
        # "abc/def/ghi/<hash>.jpg" directory layout.
        return item["path"] + "/" + image_filename

    def item_completed(self, results, item, info):
        """Once downloads finish, restore image_urls to this item's own
        images (drop the similar-ideas urls that were appended for download)
        and remove the temporary imageURL field."""
        item["image_urls"] = item["imageURL"]
        del item["imageURL"]
        return item
Advertisement
Add Comment
Please sign in to add a comment.