Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
- import csv
def get_html(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole scrape.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.Timeout: The server did not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    # NOTE: the HTTP status is not checked; an error page is returned as-is.
    return response.text
def get_total_pages(html):
    """Return the page count read from the pagination block of *html*.

    The last ``a.pagination-page`` link inside ``div.pagination-pages`` is
    used, and the value of its ``p`` query parameter is taken as the
    number of pages.

    Args:
        html: Markup of a search-results page.

    Returns:
        The page count as an int, or 1 when the markup contains no
        pagination links (treated as a single page of results).

    Raises:
        ValueError: The last pagination link has no numeric ``p``
            parameter.
    """
    from urllib.parse import parse_qs, urlsplit

    soup = BeautifulSoup(html, 'lxml')
    pagination = soup.find('div', class_='pagination-pages')
    if pagination is None:
        return 1
    links = pagination.find_all('a', class_='pagination-page')
    if not links:
        return 1

    href = links[-1].get('href') or ''
    # Parse the query string properly so the result does not depend on
    # ``p`` being the first parameter in the link.
    page_values = parse_qs(urlsplit(href).query).get('p')
    if not page_values:
        raise ValueError(
            'no page number in pagination link: {!r}'.format(href))
    return int(page_values[0])
- # def get_total_pages2(html):
- # soup = BeautifulSoup(html, 'lxml')
- # pages = soup.find('div', class_='pagination-pages').find_all('a', class_='pagination-page')[-1].get('href')
- # total_pages = pages.split('=')[1]
- # return int(total_pages)
def write_csv(data, filename='avito.csv'):
    """Append one advert as a row to a CSV file.

    Args:
        data: Mapping with the keys ``title``, ``short_description``,
            ``price``, ``metro``, ``birthtime`` and ``url``; the values
            are written in that column order.
        filename: Path of the CSV file to append to.

    Raises:
        KeyError: *data* lacks one of the keys listed above.
    """
    # newline='' lets the csv module control line endings itself (without
    # it an extra '\r' is written on Windows); the explicit encoding keeps
    # Cyrillic text from depending on the platform's default locale.
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow((
            data['title'],
            data['short_description'],
            data['price'],
            data['metro'],
            # The 'address' column is currently disabled.
            data['birthtime'],
            data['url'],
        ))
def _field(ad, getter):
    """Return ``getter(ad)``, or '' when the expected markup is missing."""
    try:
        return getter(ad)
    # Only the errors a missing tag/attribute/segment can produce:
    # attribute access on None, an out-of-range split index, or
    # str + None for a link without href.
    except (AttributeError, IndexError, TypeError):
        return ''


def get_page_data(html):
    """Extract every advert from a results page and append it to the CSV.

    For each ``div.item_table`` inside ``div.catalog-list`` the title,
    URL, price, short description, date text and metro are read; a field
    whose markup is missing is stored as an empty string. Each advert is
    passed to ``write_csv`` as a dict.

    Args:
        html: Markup of a search-results page.

    Returns:
        None. A page without a ``catalog-list`` container writes nothing.
    """
    soup = BeautifulSoup(html, 'lxml')
    catalog = soup.find('div', class_='catalog-list')
    if catalog is None:
        return

    for ad in catalog.find_all('div', class_='item_table'):
        data = {
            'title': _field(ad, lambda a: (
                a.find('div', class_='description').find('h3').text.strip())),
            # The text of div.about is split on the rouble sign: the part
            # before it is the price, the part after it the description.
            'short_description': _field(ad, lambda a: (
                a.find('div', class_='about').text.split('₽')[1].strip())),
            'price': _field(ad, lambda a: (
                a.find('div', class_='about').text
                .split('₽')[0].replace(' ', '').strip())),
            # First comma-separated segment of the description paragraph.
            'metro': _field(ad, lambda a: (
                a.find('div', class_='description').find('p').text
                .split(',')[0].strip())),
            # The 'address' field (second comma-separated segment of the
            # same paragraph) is currently disabled.
            'birthtime': _field(ad, lambda a: (
                a.find('div', class_='data').text.strip())),
            'url': _field(ad, lambda a: (
                'https://www.avito.ru'
                + a.find('div', class_='description')
                .find('h3').find('a').get('href'))),
        }
        write_csv(data)
def main():
    """Scrape all result pages of the hard-coded Avito search into the CSV.

    The first page is fetched once to read the page count; every page from
    1 to that count is then downloaded, its URL printed, and its adverts
    handed to ``get_page_data``.
    """
    base_url = 'https://www.avito.ru/moskva/garazhi_i_mashinomesta/prodam/mashinomesto/mnogourovnevyy_parking?'
    page_part = 'p='
    query_part = '&q=машиноместо'
    url = 'https://www.avito.ru/moskva/garazhi_i_mashinomesta/prodam/mashinomesto/mnogourovnevyy_parking?p=1&q=машиноместо'

    # A second pass over the 'sdam' listing (same path with 'sdam' instead
    # of 'prodam', and no q parameter) used to follow but is disabled.
    first_page_html = get_html(url)
    last_page = get_total_pages(first_page_html)

    page_urls = (
        base_url + page_part + str(number) + query_part
        for number in range(1, last_page + 1)
    )
    for page_url in page_urls:
        print(page_url)
        get_page_data(get_html(page_url))
# Run the scraper only when this file is executed as a script, so that
# importing the module does not start network requests and file writes.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement