Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import scrapy


class Mingkh(scrapy.Spider):
    """Crawl the Novokuznetsk house listing on dom.mingkh.ru.

    Starting from page 1 of the paginated ``/houses`` listing, the spider
    follows every link found inside a table cell to a house page, follows
    the company link(s) found on each house page, and keeps following the
    ``rel="next"`` pagination link until there is none.

    Two different item shapes are yielded into the same output stream:

    * house items: ``address``, ``url``
    * company items: ``company_url``, ``company_name``, ``company_phone``,
      ``company_address``
    """

    name = 'houses'
    start_urls = ['https://dom.mingkh.ru/kemerovskaya-oblast/novokuzneck/houses?page=1']

    # Absolute, positional XPaths into the company page's definition list.
    # They are tied to the exact nesting of the page; when nothing matches,
    # the corresponding field is emitted as an empty string.
    # NOTE(review): consider anchoring on the <dt> label text instead --
    # needs a look at the live markup to confirm the labels.
    _COMPANY_PHONE_XPATH = '/html[1]/body[1]/div[1]/div[2]/div[2]/div[3]/div[1]/dl[1]/dd[4]/text()'
    _COMPANY_ADDRESS_XPATH = '/html[1]/body[1]/div[1]/div[2]/div[2]/div[3]/div[1]/dl[1]/dd[3]/text()'

    @staticmethod
    def _first_css(response, query):
        """Return the first match of CSS ``query`` stripped, or '' if none."""
        return response.css(query).get(default='').strip()

    @staticmethod
    def _first_xpath(response, query):
        """Return the first match of XPath ``query`` stripped, or '' if none."""
        return response.xpath(query).get(default='').strip()

    def parse(self, response):
        """Handle a listing page.

        Yields a request for every house link on the page (handled by
        ``parse_house``), then a request for the next listing page, if any
        (handled by this method again).
        """
        house_page_links = response.css('td a::attr(href)')
        yield from response.follow_all(house_page_links, self.parse_house)

        pagination_links = response.css('li a[rel="next"]::attr(href)')
        yield from response.follow_all(pagination_links, self.parse)

    def parse_house(self, response):
        """Handle a house page.

        Yields requests for the company URL(s) stored in the ``data-url``
        attribute of ``dd span.clickable`` elements (handled by
        ``parse_company``), followed by one house item.
        """
        company_urls = response.css('dd span.clickable::attr(data-url)')
        yield from response.follow_all(company_urls, self.parse_company)

        yield {
            'address': self._first_css(response, 'div.block-heading-two h1::text'),
            'url': response.url,
        }

    def parse_company(self, response):
        """Handle a company page and yield one company item."""
        yield {
            'company_url': response.url,
            'company_name': self._first_css(response, 'div.block-heading-two h1::text'),
            'company_phone': self._first_xpath(response, self._COMPANY_PHONE_XPATH),
            'company_address': self._first_xpath(response, self._COMPANY_ADDRESS_XPATH),
        }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement