Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import platform
import re
import time

import gspread
import gspread_formatting as gsf
import pdfkit
from fake_useragent import UserAgent
from oauth2client.service_account import ServiceAccountCredentials
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Google Sheets: authorize a gspread client with the service-account key
# file "credentials.json", requesting the Google Drive scope.
scope = ["https://www.googleapis.com/auth/drive"]
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    "credentials.json",
    scope,
)
client = gspread.authorize(credentials)
# User agent: draw one random browser user-agent string; it is passed to
# Chrome via the "user-agent=" switch when the driver options are built.
ua = UserAgent()
user_agent = ua.random
# Google Sheets details: open the "PythonTest" spreadsheet once and take
# handles to the two worksheets the script works with.
spreadsheet = client.open('PythonTest')
sheetImmo = spreadsheet.worksheet('Immo')    # scraped listings are appended here
sheetLinks = spreadsheet.worksheet('Links')  # column 10 holds the URLs to visit

# Passed as the second positional argument to sheetImmo.append_row() later on.
Index = 2

# Column 10 of the Links sheet; entry 0 is the first row, which the scrape
# loop skips with [1:].
Url = sheetLinks.col_values(10)
# Diagnostic echo of the first URL after that first row. Guarded so that a
# sheet with no URL rows does not abort the script with an IndexError.
if len(Url) > 1:
    print(Url[1])
# Chrome setup: build the option set, start the driver and open the
# ImmobilienScout24 start page.

# Only referenced by the disabled proxy switch noted below.
PROXY = "95.216.68.90:3128"

chrome_options = webdriver.ChromeOptions()

# Switches that are currently disabled (kept for reference):
#   '--proxy-server=%s' % PROXY
#   '--headless'

# Command-line switches, added in this exact order.
for switch in (
    "window-size=1400,600",
    f'user-agent={user_agent}',
    "start-maximized",
    '--disable-gpu',  # applicable to windows os only
    'disable-infobars',
    "--disable-extensions",
    '--no-sandbox',
    '--user-data-dir=/tmp/user-data',
    '--hide-scrollbars',
    '--enable-logging',
    '--log-level=0',
    '--v=99',
    '--single-process',
    '--data-path=/tmp/data-path',
    '--ignore-certificate-errors',
    '--homedir=/tmp',
    '--disk-cache-dir=/tmp/cache-dir',
):
    chrome_options.add_argument(switch)

# Experimental options are stored separately from the switches above, so
# setting them after the loop yields the same final option set.
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)

# The chromedriver binary is fetched/located by webdriver_manager.
# "driver" and "wd" are two names for the same browser session.
wd = driver = webdriver.Chrome(
    ChromeDriverManager().install(),
    options=chrome_options,
)
wd.get('https://www.immobilienscout24.de/')
# ---------------------------------------------------------------------------
# Scrape loop
#
# For every URL in column 10 of the Links sheet (first row skipped):
#   1. load the page,
#   2. read the links already stored in column 3 of the Immo sheet,
#   3. collect every anchor on the page whose href contains "expose",
#   4. for each link not yet in the sheet: open it, read its fields, append
#      a row to the Immo sheet and save "<link>/print" as a PDF.
#
# NOTE(review): the pasted source had lost its indentation; the nesting used
# here (everything up to the final print runs once per Links-sheet URL) is
# reconstructed from the data flow - confirm against the original script.
#
# The time.sleep() pauses keep the original durations and positions.
# ---------------------------------------------------------------------------

# Extra listing links to process in addition to those found on each page.
AdditionalLinks = []  # Add links here

# Characters that Windows does not allow in file names.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')

PDF_DIR = "C:/Users/abc/Desktop/Jupyter/Immo/"
WKHTMLTOPDF_EXE = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"


def _known_links(sheet):
    """Return the distinct values of column 3 of *sheet*, first row skipped."""
    return set(sheet.col_values(3)[1:])


def _expose_hrefs(driver):
    """Return the href of every anchor on the current page containing "expose".

    Duplicates are kept and the order is the order the driver reports.
    """
    anchors = driver.find_elements(By.XPATH, '//a[contains(@href, "expose")]')
    return [anchor.get_attribute("href") for anchor in anchors]


def _text_by_class(driver, tag, class_fragment):
    """Return the text of the first *tag* whose class contains *class_fragment*.

    Raises selenium's NoSuchElementException when nothing matches. This is
    left to propagate on purpose: writing a row with blank cells would store
    the link in the sheet and the listing would never be retried.
    """
    xpath = '//%s[contains(@class, "%s")]' % (tag, class_fragment)
    return driver.find_element(By.XPATH, xpath).text


def _read_listing(driver):
    """Read the fields of the listing page currently loaded in *driver*.

    Returns a tuple ``(title, scout_id_text, location, price, rooms,
    living_area, plot_area)``; the two areas have a trailing " m²" removed.
    """
    title = driver.find_element(By.ID, 'expose-title').text
    scout_id_text = _text_by_class(
        driver, "div", "is24-scoutid__content padding-top-s")
    location = _text_by_class(driver, "span", "zip-region-and-country")
    price = _text_by_class(
        driver, "div",
        "is24qa-kaufpreis is24-value font-semibold is24-preis-value")
    rooms = _text_by_class(driver, "div", "is24qa-zi is24-value font-semibold")
    living_area = _text_by_class(
        driver, "div", "is24qa-wohnflaeche is24-value font-semibold")
    plot_area = _text_by_class(
        driver, "div", "is24qa-grundstueck is24-value font-semibold")
    # Fields that were disabled in the original (selectors kept for reference):
    #   DetailsHaus               div "criteriagroup criteria-group--two-columns"
    #   Verkäufer                 div "style__truncateChild___2Z9XG font-semibold"
    #   DetailsBausubstanzEnergie div "criteriagroup criteria-group--border criteria-group--two-columns criteria-group--spacing"
    #   Beschreibung              pre "is24qa-objektbeschreibung text-content short-text"
    #   Ausstattung               pre "is24qa-ausstattung text-content short-text"
    #   Lage                      pre "is24qa-lage text-content short-text"
    #   Sonstiges                 pre "is24qa-lage text-content short-text"
    #       (same selector as Lage in the original - looks like a copy/paste
    #        slip; verify before re-enabling)
    return (
        title,
        scout_id_text,
        location,
        price,
        rooms,
        living_area.replace(" m²", ""),
        plot_area.replace(" m²", ""),
    )


def _pdf_path(listing_id, title):
    """Return the PDF target path for a listing.

    The file name is *listing_id* followed by the first six characters of
    *title*. Characters Windows rejects in file names are replaced by "_";
    names without such characters are unchanged.
    """
    stem = _UNSAFE_FILENAME_CHARS.sub("_", listing_id + title[:6])
    return PDF_DIR + stem + ".pdf"


try:
    # [1:] --> ignore the first row
    for search_url in sheetLinks.col_values(10)[1:]:
        time.sleep(15)
        wd.get(search_url)
        time.sleep(13)

        # Links that are already in the Google Sheet (Immo).
        seen_links = _known_links(sheetImmo)
        print("Anzahl Links in Google Docs: ",len(seen_links))
        time.sleep(13)

        # One lookup serves both the "distinct links on the page" count and
        # the new-link detection below.
        page_links = _expose_hrefs(wd)
        print("Anzahl Links auf ImmobilienScout: ",len(set(page_links)))

        # Manually added links come first and are not checked against the
        # sheet, exactly as before.
        new_links = list(AdditionalLinks)
        for href in page_links:
            if href not in seen_links:
                seen_links.add(href)
                new_links.append(href)
        print("Anzahl neue Links: ",len(new_links))
        time.sleep(20)

        for link in new_links:
            time.sleep(11)
            wd.get(link)
            time.sleep(16)

            (title, scout_id_text, location, price,
             rooms, living_area, plot_area) = _read_listing(wd)
            full_row = (
                time.strftime("%d.%m.%Y"),
                time.strftime("%H:%M"),
                link,  # third column: the column _known_links() reads back
                scout_id_text,
                title,
                location,
                price,
                rooms,
                living_area,
                plot_area,
            )
            # NOTE(review): in current gspread the second positional parameter
            # of append_row() is value_input_option, so Index is presumably
            # not acting as a row index here - confirm the intent.
            sheetImmo.append_row(full_row, Index)

            # Everything after the first 40 characters of the link, which is
            # the length of "https://www.immobilienscout24.de/expose/".
            listing_id = link[40:]
            print(listing_id)

            config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_EXE)
            time.sleep(13)
            pdfkit.from_url(
                link + "/print",
                _pdf_path(listing_id, title),
                configuration=config,
            )
            time.sleep(12)
finally:
    # Always shut the browser down, including when a page fails to scrape,
    # so no Chrome/chromedriver process is left running.
    wd.quit()

print("DONE!!!")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement