Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import os
- import ast
- import time
- import datetime
- import requests
- from bs4 import BeautifulSoup
- from googletrans import Translator
# wg-gesucht.de search-result pages to poll: flats and 1-room flats in
# Munich and Unterföhring, min. 25 m², max. 1000 EUR rent.
URLS = (
    'https://www.wg-gesucht.de/wohnungen-in-Unterfoehring.7308.2.1.0.html?offer_filter=1&sort_column=0&noDeact=1&city_id=7308&category=2&rent_type=0&sMin=25&rMax=1000',
    'https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Unterfoehring.7308.1.1.0.html?offer_filter=1&stadt_key=7308&sort_column=0&sort_order=&noDeact=1&autocompinp=Unterf%C3%B6hring&country_code=&countrymanuel=&city_name=&city_id=7308&category=1&rent_type=0&sMin=25&rMax=1000&dFr=&hidden_dFrDe=&dTo=&hidden_dToDe=&radLat=&radLng=&radAdd=&radDis=0&hidden_wgFla=0&hidden_wgSea=0&hidden_wgSmo=0&hidden_wgAge=&hidden_wgMnF=0&hidden_wgMxT=0&sin=0&exc=0&hidden_rmMin=0&hidden_rmMax=0&pet=0&fur=0',
    'https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Muenchen.90.1.1.0.html?offer_filter=1&sort_column=0&sort_order=&noDeact=1&autocompinp=M%C3%BCnchen+%28Bayern%29&country_code=de&city_name=M%C3%BCnchen&city_id=90&category=1&rent_type=0&sMin=25&rMax=1000&dFr=&hidden_dFrDe=&dTo=&hidden_dToDe=&radLat=&radLng=&radAdd=&radDis=0&wgFla=0&wgSea=0&wgSmo=0&wgAge=&wgMnF=0&wgMxT=0&sin=0&exc=0&rmMin=0&rmMax=0&pet=0&fur=0',
    'https://www.wg-gesucht.de/wohnungen-in-Muenchen.90.2.1.0.html?offer_filter=1&stadt_key=90&sort_column=0&sort_order=&noDeact=1&autocompinp=M%C3%BCnchen&country_code=&countrymanuel=&city_name=&city_id=90&category=2&rent_type=0&sMin=25&rMax=1000&dFr=&hidden_dFrDe=&dTo=&hidden_dToDe=&radLat=&radLng=&radAdd=&radDis=0&hidden_wgFla=0&hidden_wgSea=0&hidden_wgSmo=0&hidden_wgAge=&hidden_wgMnF=0&hidden_wgMxT=0&sin=0&exc=0&rmMin=0&rmMax=0&pet=0&fur=0',
)

# File persisting URLs of ads already alerted on, one per line, so a
# restart does not re-announce old listings.
TMP_FILE = '/tmp/scraped'

# Telegram bot credentials — placeholder values; real ones must be
# supplied before running.
TELEGRAM_TOKEN = 'dfsgfdsgfsd'
CHAT_ID = 'fdsgfdgsf'
class Scraper:
    """Polls wg-gesucht.de listing pages and pushes new ads to a Telegram chat.

    Already-seen ad URLs are persisted to ``tmp_file`` (one per line) so
    restarts do not re-announce old listings.
    """

    def __init__(self, urls, tmp_file, telegram_token, chat_id):
        """
        :param urls: iterable of wg-gesucht search-result page URLs to poll
        :param tmp_file: path of the file persisting already-seen ad URLs
        :param telegram_token: Telegram bot API token
        :param chat_id: Telegram chat id that receives the alerts
        """
        self.urls = urls
        self.tmp_file = tmp_file
        self.telegram_token = telegram_token
        self.chat_id = chat_id
        self.scraped = set()
        self.get_scraped()
        self.translator = Translator()

    def start(self):
        """Poll every URL forever, alerting on ads not seen before. Blocks."""
        while True:
            print(f'[{datetime.datetime.now()}] Running task...')
            for u in self.urls:
                # FIX: a single network hiccup must not kill the daemon loop.
                try:
                    data = self.get_ads(u)
                except requests.RequestException as e:
                    print(f'[{datetime.datetime.now()}] Failed to fetch {u}: {e}')
                    continue
                for ad in data:
                    if ad['url'] not in self.scraped:
                        message = self.create_message(ad)
                        self.set_scraped(ad['url'])
                        self.send_alert(message)
            # Pause between polling rounds.
            time.sleep(120)

    def get_ads(self, url):
        """Fetch one results page and return a list of ad dicts.

        Each dict has keys ``size_price``, ``text_details`` and ``url``;
        a value is ``None`` when the listing lacks that element.

        :raises requests.RequestException: on network errors or timeout.
        """
        # FIX: timeout so the poll loop can never hang indefinitely.
        r = requests.get(url, timeout=30)
        soup = BeautifulSoup(r.content, 'html.parser')
        content = []
        ads = soup.find_all('div', {'id': lambda x: x and x.startswith('liste-details-ad-')})
        for ad in ads:
            size_price = ad.find('div', {'class': 'detail-size-price-wrapper'})
            if size_price:
                size_price = re.sub(r'\s+', '', size_price.get_text())
            text_details = ad.find('div', {'class': 'list-details-panel-inner'})
            if text_details:
                text_details = re.sub(r'\s+', ' ', text_details.get_text())
                try:
                    text_details = self.translator.translate(text_details, src='de').text
                except Exception as e:
                    # Best effort: keep the untranslated German text.
                    # FIX: Exception has no .message in Python 3 — the old
                    # print(e.message) raised AttributeError here.
                    print(e)
            # FIX: renamed from `url`, which shadowed the parameter.
            link = ad.find('a', {'class': 'detailansicht'})
            if link:
                link = 'https://www.wg-gesucht.de/en/' + link.get('href')
            content.append({
                'size_price': size_price,
                'text_details': text_details,
                'url': link,
            })
        return content

    def create_message(self, ad):
        """Build the Markdown alert text for one ad dict, skipping missing fields."""
        details = ad['text_details']
        price = ad['size_price']
        url = ad['url']
        message = '*New post* '
        if details:
            message += f'_{details}_ '
        if price:
            message += f'_{price}_ '
        if url:
            message += f'[Link]({url}) '
        return message

    def get_scraped(self):
        """Load previously-seen ad URLs from ``tmp_file`` into ``self.scraped``.

        Creates the file (and returns with an empty set) when it is missing.
        """
        if not os.path.exists(self.tmp_file):
            with open(self.tmp_file, 'w'):
                pass
            return
        with open(self.tmp_file, 'r') as f:
            for url in f:
                self.scraped.add(url.strip())
        print(f'[{datetime.datetime.now()}] Loaded {len(self.scraped)} ads')

    def set_scraped(self, url):
        """Append *url* to the persistence file and the in-memory seen-set."""
        with open(self.tmp_file, 'a') as f:
            f.write(url + '\n')
        self.scraped.add(url)

    def send_alert(self, message):
        """Send *message* to the configured Telegram chat (Markdown parse mode)."""
        # FIX: pass the text via params so requests URL-encodes it; the old
        # raw f-string interpolation corrupted the request whenever the
        # message contained '&', '#', '%' or other reserved characters.
        requests.get(
            f'https://api.telegram.org/bot{self.telegram_token}/sendMessage',
            params={
                'chat_id': self.chat_id,
                'parse_mode': 'Markdown',
                'text': message,
            },
            timeout=30,
        )
if __name__ == '__main__':
    # Entry point: build the scraper (loads the seen-ads file) and poll
    # forever — start() never returns.
    s = Scraper(URLS, TMP_FILE, TELEGRAM_TOKEN, CHAT_ID)
    s.start()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement