Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- from datetime import datetime
- from fileinput import filename
- import logging
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.common.keys import Keys
- from selenium.common.exceptions import NoSuchElementException
- from webdriver_manager.chrome import ChromeDriverManager
- import json
- import time
- import os
- import sys
- import clipboard
# Clear the terminal so the run starts on a clean screen
# (POSIX `clear`; harmless failure on other platforms).
os.system('clear')

# Best-effort bootstrap of third-party dependencies: when an import fails,
# install the package with the *current* interpreter's pip. The import is
# not retried afterwards, so the script must be re-run after the install.
# NOTE(review): the unconditional selenium imports at the top of the file
# would already have raised before this point if selenium were missing —
# confirm whether this fallback is still reachable.
try:
    from selenium import webdriver
except ImportError:
    # Was a bare `except:` with a hardcoded "python3"; catch only the
    # import failure and target the interpreter actually running this file.
    os.system(sys.executable + " -m pip install selenium")
try:
    from bs4 import BeautifulSoup
except ImportError:
    os.system(sys.executable + " -m pip install bs4")
def scrolling(driver):
    """Scroll the Google Maps results pane to its bottom once.

    Driving the left-hand results list to the bottom makes Maps lazy-load
    the next batch of entries. On failure the function prints a warning and
    returns instead of raising, so callers can keep going.

    Args:
        driver: an active Selenium WebDriver showing a Maps results page.
    """
    pane_xpath = '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]'
    scroll_js = 'arguments[0].scrollTop = arguments[0].scrollHeight'
    try:
        pane = driver.find_element_by_xpath(pane_xpath)
        driver.execute_script(scroll_js, pane)
        # Give the lazy-loaded results a moment to render.
        time.sleep(2)
    except NoSuchElementException:
        print("Error: can't find scrollbar")
        print("")
def _tooltip_button_text(parser, tooltip):
    """Stripped text of the <button> whose data-tooltip matches, or "" if absent."""
    button = parser.find('button', {'data-tooltip': tooltip})
    return button.text.strip() if button else ""


def doJob(query):
    """Scrape Google Maps (Indonesian locale) for places matching *query*.

    Launches Chrome via webdriver-manager, searches for *query*, scrolls the
    results pane three times so more entries lazy-load, then opens every
    result link and parses the detail pane with BeautifulSoup.

    Args:
        query (str): free-text search, e.g. "bengkel near pekanbaru".

    Returns:
        list[dict]: one dict per place with keys: link, title, thumbnail,
        category, address, phone, plusCode, openHours (day name -> hours),
        rating, website. Missing fields are "" (or {} for openHours).

    NOTE(review): every selector and the Indonesian tooltip strings
    ("Salin alamat", ...) are tied to a specific Google Maps build/locale
    and will break when Google changes its markup.
    """
    input_box_xpath = '//*[@id="searchboxinput"]'
    search_button_xpath = '//*[@id="searchbox-searchbutton"]'
    result_link_css = 'a.a4gq8e-aVTXAb-haAclf-jRmmHf-hSRGPd'

    chrome_options = Options()
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # chrome_options.add_argument('--remote-debugging-port=9222')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    # Silence chromedriver's "DevTools listening ..." console noise.
    chrome_options.add_experimental_option(
        "excludeSwitches", ["enable-logging"])
    driver = webdriver.Chrome(
        ChromeDriverManager(log_level=0).install(), options=chrome_options)

    results = []
    # Ensure the browser is closed even when a selector breaks mid-scrape
    # (the original never called quit() and leaked the Chrome process).
    try:
        # hl=id: Indonesian UI — the "Salin ..." tooltips below depend on it.
        driver.get("https://google.com/maps?hl=id")
        driver.find_element_by_xpath(input_box_xpath).send_keys(query)
        driver.find_element_by_xpath(search_button_xpath).click()
        time.sleep(2)

        # Each scroll lazy-loads roughly one more page of results.
        for _ in range(3):
            scrolling(driver)

        # Query the DOM once (the original ran the same selector twice).
        links = [a.get_attribute('href')
                 for a in driver.find_elements_by_css_selector(result_link_css)]
        logging.info("Found: " + str(len(links)) + " 20")

        for item in links:
            logging.info("Scraping: " + str(item))
            driver.get(item)
            time.sleep(2)  # wait 2 sec before get the info
            parser = BeautifulSoup(driver.page_source, "html.parser")

            # Place name is the detail pane's first <h1>.
            title = parser.select('h1')[0].text.strip()

            category_btn = parser.find('button', jsaction="pane.rating.category")
            category = category_btn.text.strip() if category_btn else ""

            # "Salin ..." = "Copy ..." tooltips on the info rows.
            address = _tooltip_button_text(parser, 'Salin alamat')
            phone = _tooltip_button_text(parser, 'Salin nomor telepon')
            plusCode = _tooltip_button_text(parser, 'Salin Plus Codes')

            hero = parser.find('button', {'jsaction': 'pane.heroHeaderImage.click'})
            img = hero.img['src'] if hero else ""

            rating_span = parser.find('span', {'class': 'aMPvhf-fI6EEc-KVuj8d'})
            rating = rating_span.text.strip() if rating_span else ""

            # The website URL is only exposed via a "copy" button, so click
            # it and read the system clipboard.
            if len(driver.find_elements_by_xpath('//img[@alt="Salin situs"]')) > 0:
                driver.find_element_by_xpath('//img[@alt="Salin situs"]').click()
                website = clipboard.paste()
            else:
                website = ""

            # Opening hours live in an aria-label like
            # "Senin, 08.00 hingga 17.00; Selasa, ..." ("hingga" = "until").
            openHoursResults = {}
            open_div = parser.find('div', {'class': 'LJKBpe-open-R86cEd-haAclf'})
            if open_div:
                for days in open_div['aria-label'].split('; '):
                    dayTime = days.replace('hingga', '-').replace(
                        '. Sembunyikan jam buka untuk seminggu', '').split(',')
                    openHoursResults[dayTime[0]] = dayTime[1]

            logging.info("Scraping done, append results...")
            results.append({
                "link": driver.current_url,
                "title": title,
                "thumbnail": img,
                "category": category,
                "address": address,
                "phone": phone,
                "plusCode": plusCode,
                "openHours": openHoursResults,
                "rating": rating,
                "website": website,
            })
    finally:
        driver.quit()
    return results
- result = doJob("bengkel near pekanbaru")
- print(json.dumps(result))
Advertisement
Add Comment
Please sign in to add a comment.