Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import os
- import ast
- import time
- import datetime
- import requests
- from bs4 import BeautifulSoup
- from googletrans import Translator
# wg-gesucht.de search-result pages to poll: flats and 1-room flats in
# Munich and Unterföhring, min. 25 m², max. 1000 EUR rent.
URLS = (
    'https://www.wg-gesucht.de/wohnungen-in-Unterfoehring.7308.2.1.0.html?offer_filter=1&sort_column=0&noDeact=1&city_id=7308&category=2&rent_type=0&sMin=25&rMax=1000',
    'https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Unterfoehring.7308.1.1.0.html?offer_filter=1&stadt_key=7308&sort_column=0&sort_order=&noDeact=1&autocompinp=Unterf%C3%B6hring&country_code=&countrymanuel=&city_name=&city_id=7308&category=1&rent_type=0&sMin=25&rMax=1000&dFr=&hidden_dFrDe=&dTo=&hidden_dToDe=&radLat=&radLng=&radAdd=&radDis=0&hidden_wgFla=0&hidden_wgSea=0&hidden_wgSmo=0&hidden_wgAge=&hidden_wgMnF=0&hidden_wgMxT=0&sin=0&exc=0&hidden_rmMin=0&hidden_rmMax=0&pet=0&fur=0',
    'https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Muenchen.90.1.1.0.html?offer_filter=1&sort_column=0&sort_order=&noDeact=1&autocompinp=M%C3%BCnchen+%28Bayern%29&country_code=de&city_name=M%C3%BCnchen&city_id=90&category=1&rent_type=0&sMin=25&rMax=1000&dFr=&hidden_dFrDe=&dTo=&hidden_dToDe=&radLat=&radLng=&radAdd=&radDis=0&wgFla=0&wgSea=0&wgSmo=0&wgAge=&wgMnF=0&wgMxT=0&sin=0&exc=0&rmMin=0&rmMax=0&pet=0&fur=0',
    'https://www.wg-gesucht.de/wohnungen-in-Muenchen.90.2.1.0.html?offer_filter=1&stadt_key=90&sort_column=0&sort_order=&noDeact=1&autocompinp=M%C3%BCnchen&country_code=&countrymanuel=&city_name=&city_id=90&category=2&rent_type=0&sMin=25&rMax=1000&dFr=&hidden_dFrDe=&dTo=&hidden_dToDe=&radLat=&radLng=&radAdd=&radDis=0&hidden_wgFla=0&hidden_wgSea=0&hidden_wgSmo=0&hidden_wgAge=&hidden_wgMnF=0&hidden_wgMxT=0&sin=0&exc=0&rmMin=0&rmMax=0&pet=0&fur=0',
)

# File persisting URLs of ads already alerted on, one per line, so a
# restart does not re-announce old listings.
TMP_FILE = '/tmp/scraped'

# Telegram bot credentials — placeholder values; real ones must be
# supplied before running.
TELEGRAM_TOKEN = 'dfsgfdsgfsd'
CHAT_ID = 'fdsgfdgsf'
class Scraper:
    """Polls wg-gesucht.de listing pages and pushes new ads to a Telegram chat.

    Already-seen ad URLs are persisted to ``tmp_file`` (one per line) so
    restarts do not re-announce old listings.
    """

    def __init__(self, urls, tmp_file, telegram_token, chat_id):
        """
        :param urls: iterable of wg-gesucht search-result page URLs to poll
        :param tmp_file: path of the file persisting already-seen ad URLs
        :param telegram_token: Telegram bot API token
        :param chat_id: Telegram chat id that receives the alerts
        """
        self.urls = urls
        self.tmp_file = tmp_file
        self.telegram_token = telegram_token
        self.chat_id = chat_id
        self.scraped = set()
        self.get_scraped()
        self.translator = Translator()

    def start(self):
        """Poll every URL forever, alerting on ads not seen before. Blocks."""
        while True:
            print(f'[{datetime.datetime.now()}] Running task...')
            for u in self.urls:
                # FIX: a single network hiccup must not kill the daemon loop.
                try:
                    data = self.get_ads(u)
                except requests.RequestException as e:
                    print(f'[{datetime.datetime.now()}] Failed to fetch {u}: {e}')
                    continue
                for ad in data:
                    if ad['url'] not in self.scraped:
                        message = self.create_message(ad)
                        self.set_scraped(ad['url'])
                        self.send_alert(message)
            # Pause between polling rounds.
            time.sleep(120)

    def get_ads(self, url):
        """Fetch one results page and return a list of ad dicts.

        Each dict has keys ``size_price``, ``text_details`` and ``url``;
        a value is ``None`` when the listing lacks that element.

        :raises requests.RequestException: on network errors or timeout.
        """
        # FIX: timeout so the poll loop can never hang indefinitely.
        r = requests.get(url, timeout=30)
        soup = BeautifulSoup(r.content, 'html.parser')
        content = []
        ads = soup.find_all('div', {'id': lambda x: x and x.startswith('liste-details-ad-')})
        for ad in ads:
            size_price = ad.find('div', {'class': 'detail-size-price-wrapper'})
            if size_price:
                size_price = re.sub(r'\s+', '', size_price.get_text())
            text_details = ad.find('div', {'class': 'list-details-panel-inner'})
            if text_details:
                text_details = re.sub(r'\s+', ' ', text_details.get_text())
                try:
                    text_details = self.translator.translate(text_details, src='de').text
                except Exception as e:
                    # Best effort: keep the untranslated German text.
                    # FIX: Exception has no .message in Python 3 — the old
                    # print(e.message) raised AttributeError here.
                    print(e)
            # FIX: renamed from `url`, which shadowed the parameter.
            link = ad.find('a', {'class': 'detailansicht'})
            if link:
                link = 'https://www.wg-gesucht.de/en/' + link.get('href')
            content.append({
                'size_price': size_price,
                'text_details': text_details,
                'url': link,
            })
        return content

    def create_message(self, ad):
        """Build the Markdown alert text for one ad dict, skipping missing fields."""
        details = ad['text_details']
        price = ad['size_price']
        url = ad['url']
        message = '*New post* '
        if details:
            message += f'_{details}_ '
        if price:
            message += f'_{price}_ '
        if url:
            message += f'[Link]({url}) '
        return message

    def get_scraped(self):
        """Load previously-seen ad URLs from ``tmp_file`` into ``self.scraped``.

        Creates the file (and returns with an empty set) when it is missing.
        """
        if not os.path.exists(self.tmp_file):
            with open(self.tmp_file, 'w'):
                pass
            return
        with open(self.tmp_file, 'r') as f:
            for url in f:
                self.scraped.add(url.strip())
        print(f'[{datetime.datetime.now()}] Loaded {len(self.scraped)} ads')

    def set_scraped(self, url):
        """Append *url* to the persistence file and the in-memory seen-set."""
        with open(self.tmp_file, 'a') as f:
            f.write(url + '\n')
        self.scraped.add(url)

    def send_alert(self, message):
        """Send *message* to the configured Telegram chat (Markdown parse mode)."""
        # FIX: pass the text via params so requests URL-encodes it; the old
        # raw f-string interpolation corrupted the request whenever the
        # message contained '&', '#', '%' or other reserved characters.
        requests.get(
            f'https://api.telegram.org/bot{self.telegram_token}/sendMessage',
            params={
                'chat_id': self.chat_id,
                'parse_mode': 'Markdown',
                'text': message,
            },
            timeout=30,
        )
if __name__ == '__main__':
    # Entry point: build the scraper (loads the seen-ads file) and poll
    # forever — start() never returns.
    s = Scraper(URLS, TMP_FILE, TELEGRAM_TOKEN, CHAT_ID)
    s.start()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement