Advertisement
Guest User

Untitled

a guest
Jul 19th, 2018
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.99 KB | None | 0 0
  1. import re
  2. import os
  3. import ast
  4. import time
  5. import datetime
  6.  
  7. import requests
  8. from bs4 import BeautifulSoup
  9. from googletrans import Translator
  10.  
  11.  
  12. URLS = (
  13. 'https://www.wg-gesucht.de/wohnungen-in-Unterfoehring.7308.2.1.0.html?offer_filter=1&sort_column=0&noDeact=1&city_id=7308&category=2&rent_type=0&sMin=25&rMax=1000',
  14. 'https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Unterfoehring.7308.1.1.0.html?offer_filter=1&stadt_key=7308&sort_column=0&sort_order=&noDeact=1&autocompinp=Unterf%C3%B6hring&country_code=&countrymanuel=&city_name=&city_id=7308&category=1&rent_type=0&sMin=25&rMax=1000&dFr=&hidden_dFrDe=&dTo=&hidden_dToDe=&radLat=&radLng=&radAdd=&radDis=0&hidden_wgFla=0&hidden_wgSea=0&hidden_wgSmo=0&hidden_wgAge=&hidden_wgMnF=0&hidden_wgMxT=0&sin=0&exc=0&hidden_rmMin=0&hidden_rmMax=0&pet=0&fur=0',
  15. 'https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Muenchen.90.1.1.0.html?offer_filter=1&sort_column=0&sort_order=&noDeact=1&autocompinp=M%C3%BCnchen+%28Bayern%29&country_code=de&city_name=M%C3%BCnchen&city_id=90&category=1&rent_type=0&sMin=25&rMax=1000&dFr=&hidden_dFrDe=&dTo=&hidden_dToDe=&radLat=&radLng=&radAdd=&radDis=0&wgFla=0&wgSea=0&wgSmo=0&wgAge=&wgMnF=0&wgMxT=0&sin=0&exc=0&rmMin=0&rmMax=0&pet=0&fur=0',
  16. 'https://www.wg-gesucht.de/wohnungen-in-Muenchen.90.2.1.0.html?offer_filter=1&stadt_key=90&sort_column=0&sort_order=&noDeact=1&autocompinp=M%C3%BCnchen&country_code=&countrymanuel=&city_name=&city_id=90&category=2&rent_type=0&sMin=25&rMax=1000&dFr=&hidden_dFrDe=&dTo=&hidden_dToDe=&radLat=&radLng=&radAdd=&radDis=0&hidden_wgFla=0&hidden_wgSea=0&hidden_wgSmo=0&hidden_wgAge=&hidden_wgMnF=0&hidden_wgMxT=0&sin=0&exc=0&rmMin=0&rmMax=0&pet=0&fur=0',
  17. )
  18. TMP_FILE = '/tmp/scraped'
  19. TELEGRAM_TOKEN = 'dfsgfdsgfsd'
  20. CHAT_ID = 'fdsgfdgsf'
  21.  
  22.  
  23. class Scraper:
  24. def __init__(self, urls, tmp_file, telegram_token, chat_id):
  25. self.urls = urls
  26. self.tmp_file = tmp_file
  27. self.telegram_token = telegram_token
  28. self.chat_id = chat_id
  29. self.scraped = set()
  30. self.get_scraped()
  31. self.translator = Translator()
  32.  
  33. def start(self):
  34. while True:
  35. print(f'[{datetime.datetime.now()}] Running task...')
  36. for u in self.urls:
  37. data = self.get_ads(u)
  38. if len(data) > 0:
  39. for ad in data:
  40. if ad['url'] not in self.scraped:
  41. message = self.create_message(ad)
  42. self.set_scraped(ad['url'])
  43. self.send_alert(message)
  44. time.sleep(120)
  45.  
  46. def get_ads(self, url):
  47. r = requests.get(url)
  48. soup = BeautifulSoup(r.content, 'html.parser')
  49. content = []
  50. ads = soup.find_all('div', {'id': lambda x: x and x.startswith('liste-details-ad-')})
  51. if len(ads) > 0:
  52. for ad in ads:
  53. ad_info = {}
  54. size_price = ad.find('div', {'class': 'detail-size-price-wrapper'})
  55. if size_price:
  56. size_price = re.sub(r'\s+', '', size_price.get_text())
  57.  
  58. text_details = ad.find('div', {'class': 'list-details-panel-inner'})
  59. if text_details:
  60. text_details = re.sub(r'\s+', ' ', text_details.get_text())
  61. try:
  62. text_details = self.translator.translate(text_details, src='de').text
  63. except Exception as e:
  64. print(e.message)
  65. pass
  66.  
  67. url = ad.find('a', {'class': 'detailansicht'})
  68. if url:
  69. url = 'https://www.wg-gesucht.de/en/' + url.get('href')
  70.  
  71. ad_info['size_price'] = size_price
  72. ad_info['text_details'] = text_details
  73. ad_info['url'] = url
  74. content.append(ad_info)
  75.  
  76. return content
  77.  
  78. def create_message(self, ad):
  79. details = ad['text_details']
  80. price = ad['size_price']
  81. url = ad['url']
  82. message = '*New post* '
  83. if details: message += f'_{details}_ '
  84. if price: message += f'_{price}_ '
  85. if url: message += f'[Link]({url}) '
  86. return message
  87.  
  88. def get_scraped(self):
  89. if not os.path.exists(self.tmp_file):
  90. with open(self.tmp_file, 'w'): pass
  91. return
  92.  
  93. with open(self.tmp_file, 'r') as f:
  94. for url in f:
  95. self.scraped.add(url.strip())
  96. print(f'[{datetime.datetime.now()}] Loaded {len(self.scraped)} ads')
  97.  
  98. def set_scraped(self, url):
  99. with open(self.tmp_file, 'a') as f:
  100. f.write(url + '\n')
  101. self.scraped.add(url)
  102.  
  103. def send_alert(self, message):
  104. url = f'https://api.telegram.org/bot{self.telegram_token}/sendMessage' \
  105. f'?chat_id={self.chat_id}&parse_mode=Markdown&text={message}'
  106.  
  107. requests.get(url)
  108.  
  109.  
  110. if __name__ == '__main__':
  111. s = Scraper(URLS, TMP_FILE, TELEGRAM_TOKEN, CHAT_ID)
  112. s.start()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement