Guest User

Scraping Google Maps Business Data With Python - rendrianarma

a guest
Jan 17th, 2022
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.96 KB | None | 0 0
  1. from datetime import datetime
  2. from fileinput import filename
  3. import logging
  4. from selenium.webdriver.chrome.options import Options
  5. from selenium.webdriver.common.keys import Keys
  6. from selenium.common.exceptions import NoSuchElementException
  7. from webdriver_manager.chrome import ChromeDriverManager
  8. import json
  9. import time
  10. import os
  11. import sys
  12. import clipboard
  13.  
  14. os.system('clear')
  15.  
  16. try:
  17. from selenium import webdriver
  18.  
  19. except:
  20.  
  21. seleniumcommand = "python3 -m pip install selenium"
  22. os.system(seleniumcommand)
  23.  
  24. try:
  25. from bs4 import BeautifulSoup
  26. except:
  27. bs4command = "python3 -m pip install bs4"
  28. os.system(bs4command)
  29.  
  30.  
  31. def scrolling(driver):
  32. try:
  33. scrollable_div = driver.find_element_by_xpath(
  34. '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]')
  35. driver.execute_script(
  36. 'arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)
  37. time.sleep(2)
  38.  
  39. except NoSuchElementException:
  40. print("Error: can't find scrollbar")
  41. print("")
  42.  
  43.  
  44. def doJob(query):
  45.  
  46. config = {
  47. "sampeQuery": query,
  48. "inputBox": '//*[@id="searchboxinput"]',
  49. 'searchButon': '//*[@id="searchbox-searchbutton"]',
  50. "firstListLink": 'V0h1Ob-haAclf',
  51. "leftPane": 'Yr7JMd-pane' # wait for scroll so the result will 20
  52. }
  53.  
  54. output = []
  55. output.clear()
  56. # open google maps with english
  57. chrome_options = Options()
  58. # chrome_options.add_argument('--headless')
  59. chrome_options.add_argument('--disable-dev-shm-usage')
  60. # chrome_options.add_argument('--remote-debugging-port=9222')
  61. chrome_options.add_argument('--no-sandbox')
  62. chrome_options.add_argument('--disable-gpu')
  63. chrome_options.add_experimental_option(
  64. "excludeSwitches", ["enable-logging"])
  65. driver = webdriver.Chrome(
  66. ChromeDriverManager(log_level=0).install(), options=chrome_options)
  67.  
  68. driver.get("https://google.com/maps?hl=id")
  69.  
  70. driver.find_element_by_xpath(
  71. config["inputBox"]).send_keys(config["sampeQuery"])
  72. driver.find_element_by_xpath(config["searchButon"]).click()
  73.  
  74. time.sleep(2)
  75.  
  76. scrolling(driver)
  77. scrolling(driver)
  78. scrolling(driver)
  79. entities = driver.find_elements_by_css_selector(
  80. 'a.a4gq8e-aVTXAb-haAclf-jRmmHf-hSRGPd')
  81.  
  82. links = [x.get_attribute('href') for x in driver.find_elements_by_css_selector(
  83. "a.a4gq8e-aVTXAb-haAclf-jRmmHf-hSRGPd")]
  84.  
  85. logging.info("Found: " + str(len(entities)) + " 20")
  86. results = []
  87. i = 1
  88. for item in links:
  89.  
  90. logging.info("Scraping: " + str(item))
  91. driver.get(item)
  92. time.sleep(2) # wait 2 sec before get the info
  93.  
  94. parser = BeautifulSoup(driver.page_source, "html.parser")
  95.  
  96. # get the info
  97. title = parser.select('h1')[0].text.strip()
  98.  
  99. # check if category is found
  100. if(parser.find('button', jsaction="pane.rating.category")):
  101. category = parser.find(
  102. 'button', jsaction="pane.rating.category").text.strip()
  103. else:
  104. category = ""
  105. # get full address
  106. if(parser.find('button', {'data-tooltip': 'Salin alamat'})):
  107. address = parser.find(
  108. 'button', {'data-tooltip': 'Salin alamat'}
  109. ).text.strip()
  110. else:
  111. address = ""
  112. # get phone number
  113. if(parser.find('button', {'data-tooltip': 'Salin nomor telepon'})):
  114. phone = parser.find(
  115. 'button', {'data-tooltip': 'Salin nomor telepon'}
  116. ).text.strip()
  117. else:
  118. phone = ""
  119. # get address with plus code
  120. if(parser.find('button', {'data-tooltip': 'Salin Plus Codes'})):
  121. plusCode = parser.find(
  122. 'button', {'data-tooltip': 'Salin Plus Codes'}
  123. ).text.strip()
  124. else:
  125. plusCode = ""
  126. # get business image
  127. if(parser.find('button', {'jsaction': 'pane.heroHeaderImage.click'})):
  128. img = parser.find(
  129. 'button', {'jsaction': 'pane.heroHeaderImage.click'}
  130. ).img['src']
  131. else:
  132. img = ""
  133. # find rating
  134. if(parser.find('span', {'class': 'aMPvhf-fI6EEc-KVuj8d'})):
  135. rating = parser.find(
  136. 'span', {'class': 'aMPvhf-fI6EEc-KVuj8d'}).text.strip()
  137. else:
  138. rating = ""
  139. # get the website through click
  140. if len(driver.find_elements_by_xpath('//img[@alt="Salin situs"]')) > 0:
  141. driver.find_element_by_xpath(
  142. '//img[@alt="Salin situs"]').click()
  143. website = clipboard.paste()
  144. else:
  145. website = ""
  146.  
  147. # find open hours
  148. if(parser.find('div', {'class': 'LJKBpe-open-R86cEd-haAclf'})):
  149. openHoursResults = {}
  150. openHours = parser.find(
  151. 'div', {'class': 'LJKBpe-open-R86cEd-haAclf'})['aria-label']
  152. for days in openHours.split('; '):
  153. dayTime = days.replace(
  154. 'hingga', '-').replace('. Sembunyikan jam buka untuk seminggu', '').split(',')
  155. dayInput = {'dayName': dayTime[0], 'openHour': dayTime[1]}
  156. # print(type(dayInput))
  157. openHoursResults[dayTime[0]] = dayTime[1]
  158.  
  159. else:
  160. openHoursResults = {}
  161.  
  162. result = {
  163. "link": driver.current_url,
  164. "title": title,
  165. "thumbnail": img,
  166. "category": category,
  167. "address": address,
  168. "phone": phone,
  169. "plusCode": plusCode,
  170. "openHours": openHoursResults,
  171. "rating": rating,
  172. "website": website
  173.  
  174. }
  175. logging.info("Scraping done, append results...")
  176.  
  177. results.append(result)
  178. # break #remove this to scrape all the data
  179.  
  180. i = i+1
  181.  
  182. return results
  183.  
  184.  
  185. result = doJob("bengkel near pekanbaru")
  186. print(json.dumps(result))
  187.  
Advertisement
Add Comment
Please, Sign In to add comment