Advertisement
Guest User

Untitled

a guest
Nov 29th, 2020
387
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.35 KB | None | 0 0
  1. from selenium import webdriver
  2. import time
  3. import gspread
  4. import gspread_formatting as gsf
  5. from oauth2client.service_account import ServiceAccountCredentials
  6. import platform
  7. from selenium import webdriver
  8. from webdriver_manager.chrome import ChromeDriverManager
  9. import pdfkit
  10. from fake_useragent import UserAgent
  11.  
  # Google Sheets: authorise a service account from the local key file.
  scope = ["https://www.googleapis.com/auth/drive"]
  credentials = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
  client = gspread.authorize(credentials)

  # User agent: draw one random browser identity for this run.
  ua = UserAgent()
  user_agent = ua.random

  # Google Sheets details: open the spreadsheet once and reuse the handle for
  # both worksheets instead of looking it up by title twice.
  spreadsheet = client.open('PythonTest')
  sheetImmo = spreadsheet.worksheet('Immo')
  sheetLinks = spreadsheet.worksheet('Links')
  Index = 2
  Url = sheetLinks.col_values(10)

  # Show the first URL below the header as a quick sanity check. Indexing
  # Url[1] unconditionally raised IndexError whenever column 10 held fewer
  # than two values, so guard the lookup.
  if len(Url) > 1:
      print(Url[1])
  else:
      print("No URLs found below the header in column 10 of worksheet 'Links'")
  27.  
  # Proxy endpoint; only used when the proxy switch below is enabled.
  PROXY = "95.216.68.90:3128"

  # Command-line switches handed to Chrome, in the order they are applied.
  # Optional switches that are currently disabled:
  #   '--proxy-server=%s' % PROXY
  #   '--headless'
  _chrome_switches = (
      "window-size=1400,600",
      f'user-agent={user_agent}',
      "start-maximized",
      '--disable-gpu',  # applicable to windows os only
      'disable-infobars',
      "--disable-extensions",
      '--no-sandbox',
      '--user-data-dir=/tmp/user-data',
      '--hide-scrollbars',
      '--enable-logging',
      '--log-level=0',
      '--v=99',
      '--single-process',
      '--data-path=/tmp/data-path',
      '--ignore-certificate-errors',
      '--homedir=/tmp',
      '--disk-cache-dir=/tmp/cache-dir',
  )

  chrome_options = webdriver.ChromeOptions()
  for _switch in _chrome_switches:
      chrome_options.add_argument(_switch)

  # Experimental options are stored separately from the switches above, so
  # setting them after the loop leaves the resulting configuration unchanged.
  chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
  chrome_options.add_experimental_option('useAutomationExtension', False)
  51.  
  52.  
  # NOTE(review): the pasted source lost its indentation; the loop nesting
  # below is reconstructed from the statement order - confirm against the
  # author's original file.


  def _unique_expose_links(browser):
      """Return the distinct hrefs containing "expose" on the current page, in page order."""
      text = "expose"
      anchors = browser.find_elements_by_xpath('//a[contains(@href, "%s")]' % text)
      # dict.fromkeys drops duplicates while keeping first-seen order.
      return list(dict.fromkeys(anchor.get_attribute("href") for anchor in anchors))


  def _scrape_expose_row(browser, link):
      """Read the listing fields from the expose page currently loaded in *browser*.

      Returns a ``(row, title)`` pair: *row* is the tuple written to the sheet
      (date, time, link, scout id text, title, location, price, rooms, living
      area, plot area) and *title* is the listing title on its own.
      Any missing element makes Selenium raise from the ``find_element_*`` call.
      """
      Titel = browser.find_element_by_id('expose-title').text
      product_name = browser.find_element_by_xpath('//div[contains(@class, "is24-scoutid__content padding-top-s")]').text
      ort = browser.find_element_by_xpath('//span[contains(@class, "zip-region-and-country")]').text
      price = browser.find_element_by_xpath('//div[contains(@class, "is24qa-kaufpreis is24-value font-semibold is24-preis-value")]').text
      zimmer = browser.find_element_by_xpath('//div[contains(@class, "is24qa-zi is24-value font-semibold")]').text
      Wohnqm = browser.find_element_by_xpath('//div[contains(@class, "is24qa-wohnflaeche is24-value font-semibold")]').text
      Grundqm = browser.find_element_by_xpath('//div[contains(@class, "is24qa-grundstueck is24-value font-semibold")]').text
      # Strip the unit so only the number text is stored.
      Wohnqm = Wohnqm.replace(" m²","")
      Grundqm = Grundqm.replace(" m²","")
      # Further fields (house details, seller, building/energy details,
      # description, equipment, location, miscellaneous) are currently not
      # scraped.
      Date = time.strftime("%d.%m.%Y")
      Time = time.strftime("%H:%M")
      FullRow = (Date, Time, link, product_name, Titel, ort, price, zimmer, Wohnqm, Grundqm)
      return FullRow, Titel


  def _save_expose_pdf(link, title):
      """Render the "/print" view of *link* to a PDF file via wkhtmltopdf."""
      # Everything after the first 40 characters is used as the file-name id;
      # 40 is the length of "https://www.immobilienscout24.de/expose/".
      # NOTE(review): assumes every link starts with exactly that prefix - confirm.
      ID = link[40:]
      print(ID)
      path_wkhtmltopdf = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
      config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
      path = "C:/Users/abc/Desktop/Jupyter/Immo/" + ID + title[:6] +".pdf"
      pdfurl = link + "/print"
      time.sleep(13)
      pdfkit.from_url(pdfurl, path, configuration=config)


  driver = webdriver.Chrome(ChromeDriverManager().install(),options=chrome_options)
  wd = driver
  try:
      wd.get('https://www.immobilienscout24.de/')
      UUrls = sheetLinks.col_values(10)[1:]  # [1:] ignores the first row
      for UUrl in UUrls:
          time.sleep(15)
          wd.get(UUrl)
          time.sleep(13)

          # Links that are already in the Google Sheet ('Immo', column 3).
          # Re-read for every search URL because rows are appended below.
          # A set gives constant-time membership tests and the same unique count.
          AllLinksOnline = set(sheetImmo.col_values(3)[1:])
          print("Anzahl Links in Google Docs: ",len(AllLinksOnline))
          time.sleep(13)

          # Unique expose links on the search-result page.
          AllLinksWebsite = _unique_expose_links(wd)
          print("Anzahl Links auf ImmobilienScout: ",len(AllLinksWebsite))

          AdditionalLinks = []  # Add links here; they are processed unconditionally.
          AllLinksNew = list(AdditionalLinks)
          AllLinksNew.extend(
              link for link in AllLinksWebsite if link not in AllLinksOnline
          )
          print("Anzahl neue Links: ",len(AllLinksNew))
          time.sleep(20)

          for links in AllLinksNew:
              time.sleep(11)
              wd.get(links)
              time.sleep(16)
              FullRow, Titel = _scrape_expose_row(wd, links)
              # NOTE(review): Index (2) is passed as append_row's second
              # positional argument, which gspread documents as
              # value_input_option rather than a row number - confirm whether
              # insert_row(FullRow, Index) was intended.
              sheetImmo.append_row(FullRow, Index)
              _save_expose_pdf(links, Titel)
              time.sleep(12)
  finally:
      # Always shut the browser down; previously the Chrome process was left
      # running both on success and when any step raised.
      wd.quit()

  print("DONE!!!")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement