Advertisement
Guest User

Untitled

a guest
Mar 20th, 2019
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.67 KB | None | 0 0
import csv
import urllib.parse

import requests
from bs4 import BeautifulSoup
  4.  
  5. def get_html(url):
  6. r = requests.get(url)
  7. return r.text
  8.  
  9. def get_total_pages(html):
  10. soup = BeautifulSoup(html, 'lxml')
  11. pages = soup.find('div', class_='pagination-pages').find_all('a', class_='pagination-page')[-1].get('href')
  12. total_pages = pages.split('=')[1].split('&')[0]
  13. return int(total_pages)
  14.  
  15. # def get_total_pages2(html):
  16. # soup = BeautifulSoup(html, 'lxml')
  17. # pages = soup.find('div', class_='pagination-pages').find_all('a', class_='pagination-page')[-1].get('href')
  18. # total_pages = pages.split('=')[1]
  19. # return int(total_pages)
  20.  
  21. def write_csv(data):
  22. with open('avito.csv', 'a') as f:
  23. writer = csv.writer(f)
  24. writer.writerow( (data['title'],
  25. data['short_description'],
  26. data['price'],
  27. data['metro'],
  28. # data['address'],
  29. data['birthtime'],
  30. data['url']) )
  31. return
  32.  
  33. def get_page_data(html):
  34. soup = BeautifulSoup(html, 'lxml')
  35. ads = soup.find('div', class_='catalog-list').find_all('div', class_='item_table')
  36.  
  37. for ad in ads:
  38. try:
  39. title = ad.find('div', class_='description').find('h3').text.strip()
  40. except:
  41. title = ''
  42.  
  43. try:
  44. url = 'https://www.avito.ru' + ad.find('div', class_='description').find('h3').find('a').get('href')
  45. except:
  46. url = ''
  47.  
  48. try:
  49. price = ad.find('div', class_='about').text.split('₽')[0].replace(' ', '').strip()
  50. except:
  51. price = ''
  52.  
  53. try:
  54. short_description = ad.find('div', class_='about').text.split('₽')[1].strip()
  55. except:
  56. short_description = ''
  57.  
  58. try:
  59. birthtime = ad.find('div', class_='data').text.strip()
  60. except:
  61. birthtime = ''
  62.  
  63. try:
  64. metro = ad.find('div', class_='description').find('p').text.split(',')[0].strip()
  65. except:
  66. metro = ''
  67.  
  68. # try:
  69. # address = ad.find('div', class_='description').find('p').text.split(',')[1].strip()
  70. # except:
  71. # address = ''
  72.  
  73. data = {'title': title,
  74. 'short_description': short_description,
  75. 'price': price,
  76. 'metro': metro,
  77. # 'address': address,
  78. 'birthtime': birthtime,
  79. 'url': url}
  80.  
  81. write_csv(data)
  82. return
  83.  
  84. def main():
  85. url = 'https://www.avito.ru/moskva/garazhi_i_mashinomesta/prodam/mashinomesto/mnogourovnevyy_parking?p=1&q=машиноместо'
  86. # url2 = 'https://www.avito.ru/moskva/garazhi_i_mashinomesta/sdam/mashinomesto/mnogourovnevyy_parking?p=1'
  87. base_url = 'https://www.avito.ru/moskva/garazhi_i_mashinomesta/prodam/mashinomesto/mnogourovnevyy_parking?'
  88. # base_url2 = 'https://www.avito.ru/moskva/garazhi_i_mashinomesta/sdam/mashinomesto/mnogourovnevyy_parking?'
  89. page_part = 'p='
  90. query_part = '&q=машиноместо'
  91. total_pages = get_total_pages(get_html(url))
  92. # total_pages2 = get_total_pages2(get_html(url2))
  93.  
  94. for i in range(1, total_pages + 1):
  95. url_gen = base_url + page_part + str(i) + query_part
  96. print(url_gen)
  97. html = get_html((url_gen))
  98. get_page_data(html)
  99. # for j in range(1, total_pages2 + 1):
  100. # url_gen2 = base_url2 + page_part + str(j)
  101. # html2 = get_html((url_gen2))
  102. # get_page_data(html2)
  103. # print(url_gen2)
  104. return
  105.  
  106. #if __name__ == '__main__':
  107. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement