Guest User

Untitled

a guest
Nov 29th, 2020
328
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from selenium import webdriver
  2. import time
  3. import gspread
  4. import gspread_formatting as gsf
  5. from oauth2client.service_account import ServiceAccountCredentials
  6. import platform
  7. from selenium import webdriver
  8. from webdriver_manager.chrome import ChromeDriverManager
  9. import pdfkit
  10. from fake_useragent import UserAgent
  11.  
  12. #GoogleSheets
  13. scope = ["https://www.googleapis.com/auth/drive"]
  14. credentials = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
  15. client = gspread.authorize(credentials)
  16. #UserAgent
  17. ua = UserAgent()
  18. a = ua.random
  19. user_agent = ua.random
  20. #GoogleSheetsDetails
  21. sheetImmo = client.open('PythonTest').worksheet('Immo')
  22. sheetLinks = client.open('PythonTest').worksheet('Links')
  23. Index = 2
  24. Url = sheetLinks.col_values(10)
  25. Url_1 = Url[1]
  26. print(Url[1])
  27.  
  28. PROXY = "95.216.68.90:3128"
  29. chrome_options = webdriver.ChromeOptions()
  30. #chrome_options.add_argument('--proxy-server=%s' % PROXY)
  31. #chrome_options.add_argument('--headless')
  32. chrome_options.add_argument("window-size=1400,600")
  33. chrome_options.add_argument(f'user-agent={user_agent}')
  34. chrome_options.add_argument("start-maximized")
  35. chrome_options.add_argument('--disable-gpu') # applicable to windows os only
  36. chrome_options.add_argument('disable-infobars')
  37. chrome_options.add_argument("--disable-extensions")
  38. chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
  39. chrome_options.add_experimental_option('useAutomationExtension', False)
  40. chrome_options.add_argument('--no-sandbox')
  41. chrome_options.add_argument('--user-data-dir=/tmp/user-data')
  42. chrome_options.add_argument('--hide-scrollbars')
  43. chrome_options.add_argument('--enable-logging')
  44. chrome_options.add_argument('--log-level=0')
  45. chrome_options.add_argument('--v=99')
  46. chrome_options.add_argument('--single-process')
  47. chrome_options.add_argument('--data-path=/tmp/data-path')
  48. chrome_options.add_argument('--ignore-certificate-errors')
  49. chrome_options.add_argument('--homedir=/tmp')
  50. chrome_options.add_argument('--disk-cache-dir=/tmp/cache-dir')
  51.  
  52.  
  53. driver = webdriver.Chrome(ChromeDriverManager().install(),options=chrome_options)
  54. wd = driver
  55. wd.get('https://www.immobilienscout24.de/')
  56. z = 1
  57. UUrls = sheetLinks.col_values(10)[1:] #"[1:] --> ignore the first row"
  58. for UUrl in UUrls:
  59. z = z+1 #row number
  60. time.sleep(15)
  61. wd.get(UUrl)
  62. time.sleep(13)
  63.  
  64. ####
  65. ####LOOP
  66. ####
  67. AllLinksOnline = [] #Links die bereits in Google Sheet sind (Immo)
  68. i = 1
  69. Urls = sheetImmo.col_values(3)[1:] #"[1:] --> ignore the first row"
  70. for Url in Urls:
  71. i = i+1 #row number
  72. #wd.get(Url)
  73. #print(Url)
  74. if Url not in AllLinksOnline:
  75. AllLinksOnline.append(Url)
  76. print("Anzahl Links in Google Docs: ",len(AllLinksOnline))
  77. time.sleep(13)
  78. AllLinksWebsite = [] #Eindeutige Links auf der Webseite
  79. text = "expose"
  80. elems = wd.find_elements_by_xpath('//a[contains(@href, "%s")]' % text)
  81. for elem in elems:
  82. links = elem.get_attribute("href")
  83. if links not in AllLinksWebsite:
  84. AllLinksWebsite.append(links)
  85. print("Anzahl Links auf ImmobilienScout: ",len(AllLinksWebsite))
  86.  
  87. AdditionalLinks = [] #Add links here
  88. AllLinksWebsite = [] #Eindeutige Links auf der Webseite
  89. AllLinksNew = []
  90. text = "expose"
  91. elems = wd.find_elements_by_xpath('//a[contains(@href, "%s")]' % text)
  92. for ALink in AdditionalLinks:
  93. AllLinksNew.append(ALink)
  94. for elem in elems:
  95. links = elem.get_attribute("href")
  96. #print(links)
  97. if links not in AllLinksOnline:
  98. AllLinksOnline.append(links)
  99. AllLinksNew.append(links)
  100.  
  101. print("Anzahl neue Links: ",len(AllLinksNew))
  102. time.sleep(20)
  103. for links in AllLinksNew:
  104. time.sleep(11)
  105. wd.get(links)
  106. time.sleep(16)
  107. Titel = wd.find_element_by_id('expose-title').text
  108. product_name = wd.find_element_by_xpath('//div[contains(@class, "is24-scoutid__content padding-top-s")]').text
  109. ort = wd.find_element_by_xpath('//span[contains(@class, "zip-region-and-country")]').text
  110. price = wd.find_element_by_xpath('//div[contains(@class, "is24qa-kaufpreis is24-value font-semibold is24-preis-value")]').text
  111. zimmer = wd.find_element_by_xpath('//div[contains(@class, "is24qa-zi is24-value font-semibold")]').text
  112. Wohnqm = wd.find_element_by_xpath('//div[contains(@class, "is24qa-wohnflaeche is24-value font-semibold")]').text
  113. Grundqm = wd.find_element_by_xpath('//div[contains(@class, "is24qa-grundstueck is24-value font-semibold")]').text
  114. Wohnqm = Wohnqm.replace(" m²","")
  115. Grundqm = Grundqm.replace(" m²","")
  116. #DetailsHaus = wd.find_element_by_xpath('//div[contains(@class, "criteriagroup criteria-group--two-columns")]').text
  117. #Verkäufer = wd.find_element_by_xpath('//div[contains(@class, "style__truncateChild___2Z9XG font-semibold")]').text
  118. #DetailsBausubstanzEnergie = wd.find_element_by_xpath('//div[contains(@class, "criteriagroup criteria-group--border criteria-group--two-columns criteria-group--spacing")]').text
  119. #Beschreibung = wd.find_element_by_xpath('//pre[contains(@class, "is24qa-objektbeschreibung text-content short-text")]').text
  120. #Ausstattung = wd.find_element_by_xpath('//pre[contains(@class, "is24qa-ausstattung text-content short-text")]').text
  121. #Lage = wd.find_element_by_xpath('//pre[contains(@class, "is24qa-lage text-content short-text")]').text
  122. #Sonstiges = wd.find_element_by_xpath('//pre[contains(@class, "is24qa-lage text-content short-text")]').text
  123. Date = (time.strftime("%d.%m.%Y"))
  124. Time = (time.strftime("%H:%M"))
  125. FullRow = (Date, Time, links, product_name, Titel, ort, price, zimmer, Wohnqm,Grundqm)#,DetailsHaus,Verkäufer,DetailsBausubstanzEnergie,Beschreibung, Ausstattung, Lage, Sonstiges)
  126. #print(FullRow)
  127. sheetImmo.append_row(FullRow, Index)
  128. ID = links[40:]
  129. print(ID)
  130. path_wkhtmltopdf = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
  131. config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
  132. path = "C:/Users/abc/Desktop/Jupyter/Immo/" + ID + Titel[:6] +".pdf"
  133. pdfurl = links + "/print"
  134. time.sleep(13)
  135. pdfkit.from_url(pdfurl, path, configuration=config)
  136. time.sleep(12)
  137.  
  138. print("DONE!!!")
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound or popup ads; we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×