SHOW:
|
|
- or go back to the newest paste.
| 1 | import scrapy | |
| 2 | - | import jsonhttps://pastebin.com/ |
| 2 | + | import json |
| 3 | ||
class OlxHouses(scrapy.Spider):
    """Scrape ad listings from OLX result pages.

    The spider requests a fixed range of result pages built from
    ``listing_url`` and, for each page, reads the JSON document embedded in
    the ``<script id="__NEXT_DATA__">`` tag.  One item is yielded per entry
    found under ``props -> pageProps -> ads``.

    Class attributes:
        listing_url: URL template; ``{page}`` is replaced by the page number.
        first_page: First page number requested (inclusive).
        last_page: Last page number requested (inclusive).
    """

    name = 'olx'

    custom_settings = {
        'USER_AGENT': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/116.0.0.0 Safari/537.36'
        ),
        'AUTOTHROTTLE_ENABLED': True,
    }

    listing_url = (
        'https://www.olx.com.br/eletronicos-e-celulares/estado-sp?o={page}'
    )
    first_page = 1
    last_page = 100

    def start_requests(self):
        """Yield one request per result page, ``first_page``..``last_page``."""
        # int() so the bounds still work if they are overridden with strings
        # (e.g. values supplied from the command line).
        start = int(self.first_page)
        stop = int(self.last_page) + 1
        for page in range(start, stop):
            yield scrapy.Request(self.listing_url.format(page=page))

    def parse(self, response, **kwargs):
        """Yield ``title``/``price``/``locations`` dicts for a result page.

        Pages without a usable ``__NEXT_DATA__`` payload are logged and
        skipped instead of aborting the callback with an exception.
        """
        raw = response.xpath('//script[@id="__NEXT_DATA__"]/text()').get()
        if not raw:
            # .get() returns None when the script tag is absent (for example
            # on a block/captcha page); json.loads(None) would raise.
            self.logger.warning(
                'No __NEXT_DATA__ script found on %s', response.url
            )
            return

        try:
            payload = json.loads(raw)
        except json.JSONDecodeError:
            self.logger.warning(
                'Could not decode __NEXT_DATA__ JSON on %s', response.url
            )
            return

        # Walk props -> pageProps -> ads, tolerating a missing level.
        node = payload
        for key in ('props', 'pageProps', 'ads'):
            node = node.get(key) if isinstance(node, dict) else None
        ads = node if isinstance(node, list) else []

        for ad in ads:
            if not isinstance(ad, dict):
                continue
            yield {
                'title': ad.get('title'),
                'price': ad.get('price'),
                'locations': ad.get('location'),
            }
| 25 | ||
| 26 | ||
| 27 |