rhat398

Untitled

May 15th, 2023
786
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.19 KB | None | 0 0
import hashlib
import json
import os
from pathlib import Path
from urllib.parse import urlparse

import scrapy
from scrapy.pipelines.images import ImagesPipeline
  7.  
  8.  
  9. def make_path(urls):
  10.     img_path = []
  11.     for url in urls:
  12.         image_url_hash = hashlib.md5(url.encode()).hexdigest()
  13.         img_path.append(
  14.             image_url_hash[:3]
  15.             + "/"
  16.             + image_url_hash[3:6]
  17.             + "/"
  18.             + image_url_hash[6:9]
  19.             + "/"
  20.             + image_url_hash
  21.         )
  22.     return img_path
  23.  
  24.  
class HouzzSimilar(scrapy.Spider):
    """Crawl houzz.com photo "ideas" together with their similar spaces.

    Flow: ideas listing page -> each idea's photo page -> the parent
    project page -> every photo on that project -> per-photo details,
    then the site's "getSimilarSpaces" JSON API is walked one URL at a
    time to append similar ideas (and their image URLs) to the item
    before it is finally yielded.
    """

    name = "houzz_crawler"

    # Per-spider Scrapy settings: log file, image download directory,
    # a JSON feed export, and the custom image pipeline that downloads
    # everything accumulated in item["image_urls"].
    custom_settings = {
        "LOG_FILE": "houzz_spider.log",
        "IMAGES_STORE": "houzz_images",
        "FEEDS": {
            "houzz.json": {
                "format": "json",
            }
        },
        "ITEM_PIPELINES": {
            "houzz_crawler.pipelines.HouzzImagePipeline": 1,
        },
    }

    # Browser-like headers captured from a real session.  NOTE(review):
    # the csrf token, rrid and experiment/session ids are session-bound
    # and will expire -- confirm they are refreshed before each run.
    headers = {
        "authority": "www.houzz.com",
        "accept": "*/*",
        "accept-language": "en,ru;q=0.9",
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "origin": "https://www.houzz.com",
        "referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
        "rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
        "sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
        "x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
        "x-hz-request": "true",
        "x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
        "x-ol-exp-name": "Photo - View",
        "x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
        "x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
        "x-ol-product": "Houzz",
        "x-ol-product-variant": "Houzz US",
        "x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
        "x-requested-with": "XMLHttpRequest",
    }

    # Session cookies captured alongside the headers above; only sent
    # with the POST to the similar-spaces API in parse_idea_details().
    cookies = {
        "v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
        "vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
        "_gcl_au": "1.1.17413922.1683311086",
        "crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
        "_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
        "_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
        "g_state": '{"i_p":1684144918349,"i_l":3}',
        "browseResultSetGridWidth": "554",
        "_gid": "GA1.2.1176067560.1683652076",
        "ln_or": "eyIzODE1NzE2IjoiZCJ9",
        "_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
        "jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
        "documentWidth": "1318",
        "_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
        "_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
        "_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
        "IR_gbd": "houzz.com",
        "IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
        "_ga": "GA1.2.1658927820.1683311086",
        "_dc_gtm_UA-3519678-1": "1",
        "_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
        "hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
    }

    # Landing page listing "home design ideas" photo cards.
    base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"

    # JSON endpoint (POST) that returns spaces similar to a given spaceId.
    similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"

    def start_requests(self):
        """Entry point: fetch the ideas listing page."""
        yield scrapy.Request(
            url=self.base_url, headers=self.headers, callback=self.parse_ideas
        )

    def parse_ideas(self, response):
        """Extract the idea links from the listing page and follow each one."""
        ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()
        # Pagination figures scraped from the "1 - N of M" text fragments.
        # NOTE(review): neither value is used below -- pagination across
        # listing pages appears unimplemented; confirm intent.
        total_photos = int(
            response.css("span.hz-top-pagination__text ::text")
            .extract()[4]
            .replace(",", "")
        )
        photos_per_page = int(
            response.css("span.hz-top-pagination__text ::text").extract()[2]
        )

        for idea in ideas:
            yield scrapy.Request(
                url=idea, headers=self.headers, callback=self.parse_project_url
            )

    def parse_project_url(self, response):
        """Resolve the photo page to its parent project page and follow it."""
        # The page embeds its state as JSON inside <script id="hz-ctx">.
        data = response.css('script[id="hz-ctx"] ::text').get()
        json_data = json.loads(data)
        space_id = json_data["data"]["pageContentData"]["spaceId"]
        space = json_data["data"]["stores"]["data"]["SpaceStore"]["data"][space_id]
        project_id = space["projectId"]
        space_url = space["url"]
        # Derive the project URL from the space URL by swapping the URL
        # type markers (phvw -> pj, vp -> vj) and appending the project id.
        raw_project_url = (
            space_url.split("~")[0].replace("phvw", "pj").replace("vp", "vj")
        )
        project_url = raw_project_url + "~" + str(project_id)

        yield scrapy.Request(
            url=project_url, headers=self.headers, callback=self.parse_project_idea
        )

    def parse_project_idea(self, response):
        """Follow every photo link found in the project's photo grid."""
        idea_board = response.css(
            "div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
        ).extract()

        for idea_link in idea_board:
            yield scrapy.Request(
                url=idea_link,
                headers=self.headers,
                callback=self.parse_idea_details,
            )

    def parse_idea_details(self, response):
        """Scrape one photo page into an item, then query the similar API.

        The partially-filled item is passed along via cb_kwargs so the
        chained callbacks can keep appending similar ideas to it.
        """
        item = {}
        # NOTE(review): "ideadId" (sic) is the key downstream consumers
        # will see in the JSON feed -- kept as-is to avoid breaking them.
        item["ideadId"] = response.url.split("~")[-1]
        item["ideaUrl"] = response.url
        item["Title"] = response.css(
            "h1.hz-view-photo__space-info__title.text-bold::text"
        ).get()
        subtitle = response.css(
            "h1.hz-view-photo__space-info__subtitle.text-m::text"
        ).get()
        item["subTitle"] = subtitle
        item["spaceDescription"] = response.css(
            "div.hz-view-photo__space-info__description.text-m ::text"
        ).get()
        item["uploadedBy"] = response.css("div.vph-owner-info__details ::text").get()
        item["Tags"] = [
            {"tag": t}
            for t in response.css(
                "ul.hz-view-photo__breadcrumb.hz-track-me ::text"
            ).extract()
        ]
        # Star rating is inferred from the count of highlighted star icons.
        item["starRating"] = len(
            response.css(
                "span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
            )
        )
        item["numberOfReviews"] = response.css(
            "span.hz-star-rate__review-string::text"
        ).get()
        # you can use the "imageURL" field for this items images and then
        # use the "image_urls" field to collect all the images for each
        # of the similar items in the chained callbacks.
        item["imageURL"] = response.css(
            "div.view-photo-image-pane > img::attr(src)"
        ).extract()

        item["image_urls"] = item["imageURL"].copy()  # <- make sure to copy()
        item["similarIdeas"] = []
        item["path"] = ""  # <- lambda path function

        spaceId = response.url.split("~")[-1]
        # URL-encoded form body for the similar-spaces API.  NOTE(review):
        # the contentDescriptor still hard-codes id 160668148 (the photo
        # the session was captured from) instead of the current spaceId --
        # confirm whether the API ignores that field.
        body = f"spaceId={spaceId}&fromItem=0&itemsPerPage=10&contentDescriptor=%7B%22t%22%3A1%2C%22et%22%3A3%2C%22id%22%3A160668148%7D"
        yield scrapy.Request(
            url=self.similar_ideas_api_url,
            method="POST",
            cookies=self.cookies,
            headers=self.headers,
            body=body,
            cb_kwargs={"item": item},
            callback=self.get_similar_ideas_urls,
        )

    def get_similar_ideas_urls(self, response, item=None):
        """Collect the similar-space URLs and start crawling them one by one."""
        data = response.json()["spaceData"]["spaces"]
        space_keys = list(data.keys())
        # A set both deduplicates and lets parse_similar_ideas pop URLs
        # off until the chain is exhausted.
        space_urls = set([data[key]["url"] for key in space_keys])
        yield scrapy.Request(
            url=space_urls.pop(),
            headers=self.headers,
            cb_kwargs={"item": item, "space_urls": space_urls},
            callback=self.parse_similar_ideas,
        )

    def parse_similar_ideas(self, response, item=None, space_urls=None):
        """Scrape one similar idea, then chain to the next or yield the item."""
        # add the image urls to the top master list as well as locally.
        image_urls = response.css(
            "div.view-photo-image-pane > img::attr(src)"
        ).extract()
        item["image_urls"] += image_urls

        item["similarIdeas"].append(
            {
                "ideaId": response.url.split("~")[-1],
                "ideaUrl": response.url,
                "Title": response.css(
                    "h1.hz-view-photo__space-info__title.text-bold::text"
                ).get(),
                "subTitle": response.css(
                    "h1.hz-view-photo__space-info__subtitle.text-m::text"
                ).get(),
                "spaceDescription": response.css(
                    "div.hz-view-photo__space-info__description.text-m ::text"
                ).get(),
                "uploadedBy": response.css("div.vph-owner-info__details ::text").get(),
                "Tags": [
                    {"tag": t}
                    for t in response.css(
                        "ul.hz-view-photo__breadcrumb.hz-track-me ::text"
                    ).extract()
                ],
                "starRating": len(
                    response.css(
                        "span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
                    )
                ),
                "numberOfReviews": response.css(
                    "span.hz-star-rate__review-string::text"
                ).get(),
                "image_urls": image_urls,  # <- set image_urls here too
                "path": make_path(image_urls),  # <- calculate paths
            }
        )
        # Recurse through the remaining URLs; only yield the finished item
        # once the set is empty.  dont_filter is required because the same
        # similar URL can appear for several source photos.
        if len(space_urls) > 0:
            yield scrapy.Request(
                url=space_urls.pop(),
                headers=self.headers,
                cb_kwargs={"item": item, "space_urls": space_urls},
                dont_filter=True,
                callback=self.parse_similar_ideas,
            )
        else:
            yield item
  258.  
  259.  
  260. # IMAGE_PIPELINE
  261.  
  262. class HouzzImagePipeline(ImagesPipeline):  # Inherit the ImagePipeline class
  263.     def get_media_requests(self, item, info):
  264.         for image_url in item["image_urls"]:
  265.             yield scrapy.Request(image_url)
  266.  
  267.     def file_path(self, request, response=None, info=None, *, item=None):
  268.         # use the same calculation as in your spider file to determine paths
  269.         image_url_hash = hashlib.md5(request.url.encode()).hexdigest()
  270.         item[
  271.             "path"
  272.         ] = f"{image_url_hash[:3]}/{image_url_hash[3:6]}/{image_url_hash[6:9]}"
  273.         image_filename = f"{image_url_hash}.jpg"
  274.         return item["path"] + image_filename
  275.  
  276.     def item_completed(self, results, item, info):
  277.         # once the item is complete you can delete the master
  278.         # image_urls list and rename the temporary one
  279.         item["image_urls"] = item["imageURL"]
  280.         del item["imageURL"]
  281.         return item
Advertisement
Add Comment
Please, Sign In to add comment