Advertisement
Guest User

Untitled

a guest
Jul 17th, 2018
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.57 KB | None | 0 0
  1. import re
  2. import os
  3. import ast
  4. import time
  5. import hashlib
  6. import datetime
  7. import threading
  8.  
  9. import requests
  10. from bs4 import BeautifulSoup
  11. from googletrans import Translator
  12.  
  13.  
  14. URLS = (
  15.     'https://www.wg-gesucht.de/wohnungen-in-Unterfoehring.7308.2.1.0.html?offer_filter=1&sort_column=0&noDeact=1&city_id=7308&category=2&rent_type=0&sMin=25&rMax=1000',
  16.     'https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Unterfoehring.7308.1.1.0.html?offer_filter=1&stadt_key=7308&sort_column=0&sort_order=&noDeact=1&autocompinp=Unterf%C3%B6hring&country_code=&countrymanuel=&city_name=&city_id=7308&category=1&rent_type=0&sMin=25&rMax=1000&dFr=&hidden_dFrDe=&dTo=&hidden_dToDe=&radLat=&radLng=&radAdd=&radDis=0&hidden_wgFla=0&hidden_wgSea=0&hidden_wgSmo=0&hidden_wgAge=&hidden_wgMnF=0&hidden_wgMxT=0&sin=0&exc=0&hidden_rmMin=0&hidden_rmMax=0&pet=0&fur=0',
  17.     'https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Muenchen.90.1.1.0.html?offer_filter=1&sort_column=0&sort_order=&noDeact=1&autocompinp=M%C3%BCnchen+%28Bayern%29&country_code=de&city_name=M%C3%BCnchen&city_id=90&category=1&rent_type=0&sMin=25&rMax=1000&dFr=&hidden_dFrDe=&dTo=&hidden_dToDe=&radLat=&radLng=&radAdd=&radDis=0&wgFla=0&wgSea=0&wgSmo=0&wgAge=&wgMnF=0&wgMxT=0&sin=0&exc=0&rmMin=0&rmMax=0&pet=0&fur=0',
  18.     'https://www.wg-gesucht.de/wohnungen-in-Muenchen.90.2.1.0.html?offer_filter=1&stadt_key=90&sort_column=0&sort_order=&noDeact=1&autocompinp=M%C3%BCnchen&country_code=&countrymanuel=&city_name=&city_id=90&category=2&rent_type=0&sMin=25&rMax=1000&dFr=&hidden_dFrDe=&dTo=&hidden_dToDe=&radLat=&radLng=&radAdd=&radDis=0&hidden_wgFla=0&hidden_wgSea=0&hidden_wgSmo=0&hidden_wgAge=&hidden_wgMnF=0&hidden_wgMxT=0&sin=0&exc=0&rmMin=0&rmMax=0&pet=0&fur=0',
  19. )
  20. TMP_FILE = '/tmp/scraped'
  21. TELEGRAM_TOKEN = adsdsad
  22. CHAT_ID = sadsdadsa
  23.  
  24.  
  25. class Scraper:
  26.     def __init__(self, urls, tmp_file, telegram_token, chat_id):
  27.         self.urls = urls
  28.         self.tmp_file = tmp_file
  29.         self.telegram_token = telegram_token
  30.         self.chat_id = chat_id
  31.         self.scraped = set()
  32.         self.get_scraped()
  33.         self.translator = Translator()
  34.  
  35.     def start(self):
  36.         RepeatedTimer(60*2, self.task)
  37.  
  38.     def task(self):
  39.         for u in self.urls:
  40.             data = self.get_ads(u)
  41.             if len(data) > 0:
  42.                 for ad in data:
  43.                     message = self.create_message(ad)
  44.                     hash = self._hash(message)
  45.                     if hash not in self.scraped:
  46.                         self.set_scraped(hash)
  47.                         self.send_alert(message)
  48.  
  49.     def get_ads(self, url):
  50.         r = requests.get(url)
  51.         soup = BeautifulSoup(r.content, 'html.parser')
  52.         content = []
  53.         ads = soup.find_all('div', {'id': lambda x: x and x.startswith('liste-details-ad-')})
  54.         if len(ads) > 0:
  55.             for ad in ads:
  56.                 ad_info = {}
  57.                 size_price = ad.find('div', {'class': 'detail-size-price-wrapper'})
  58.                 if size_price:
  59.                     size_price = re.sub(r'\s+', '', size_price.get_text())
  60.  
  61.                 text_details = ad.find('div', {'class': 'list-details-panel-inner'})
  62.                 if text_details:
  63.                     text_details = re.sub(r'\s+', ' ', text_details.get_text())
  64.                     try:
  65.                         text_details = self.translator.translate(text_details, src='de').text
  66.                     except Exception as e:
  67.                         print(e.message)
  68.                         pass
  69.  
  70.                 url = ad.find('a', {'class': 'detailansicht'})
  71.                 if url:
  72.                     url = 'https://www.wg-gesucht.de/en/' + url.get('href')
  73.  
  74.                 ad_info['size_price'] = size_price
  75.                 ad_info['text_details'] = text_details
  76.                 ad_info['url'] = url
  77.                 content.append(ad_info)
  78.  
  79.         return content
  80.  
  81.     def create_message(self, ad):
  82.         details = ad['text_details']
  83.         price = ad['size_price']
  84.         url = ad['url']
  85.         message = '*New post*  '
  86.         if details: message += f'_{details}_  '
  87.         if price: message += f'_{price}_  '
  88.         if url: message += f'[Link]({url})  '
  89.         return message
  90.  
  91.     def get_scraped(self):
  92.         if not os.path.exists(self.tmp_file):
  93.             with open(self.tmp_file, 'w'): pass
  94.             return
  95.  
  96.         with open(self.tmp_file, 'r') as f:
  97.             for hash in f:
  98.                 self.scraped.add(hash.strip())
  99.  
  100.     def set_scraped(self, hash):
  101.         with open(self.tmp_file, 'w') as f:
  102.             f.write(hash)
  103.         self.scraped.add(hash)
  104.  
  105.     def _hash(self, ad):
  106.         return hashlib.sha256(ad.encode('utf-8')).hexdigest()
  107.  
  108.     def send_alert(self, message):
  109.         url = f'https://api.telegram.org/bot{self.telegram_token}/sendMessage' \
  110.               f'?chat_id={self.chat_id}&parse_mode=Markdown&text={message}'
  111.  
  112.         requests.get(url)
  113.  
  114.  
  115. class RepeatedTimer(object):
  116.     def __init__(self, interval, function):
  117.         self._timer = None
  118.         self.interval = interval
  119.         self.function = function
  120.         self.is_running = False
  121.         self.start()
  122.  
  123.     def _run(self):
  124.         self.is_running = False
  125.         self.start()
  126.         self.function()
  127.  
  128.     def start(self):
  129.         if not self.is_running:
  130.             self._timer = threading.Timer(self.interval, self._run)
  131.             self._timer.start()
  132.             self.is_running = True
  133.  
  134.     def stop(self):
  135.         self._timer.cancel()
  136.         self.is_running = False
  137.  
  138.  
  139. if __name__ == '__main__':
  140.     s = Scraper(URLS, TMP_FILE, TELEGRAM_TOKEN, CHAT_ID)
  141.     s.start()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement