Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- from datetime import datetime
- from fileinput import filename
- import logging
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.common.keys import Keys
- from selenium.common.exceptions import NoSuchElementException
- from webdriver_manager.chrome import ChromeDriverManager
- import json
- import time
- import os
- import sys
- import clipboard
# Clear the terminal so the run starts on a clean screen
# (POSIX `clear`; harmless failure on other platforms).
os.system('clear')

# Best-effort bootstrap of third-party dependencies: when an import fails,
# install the package with the *current* interpreter's pip. The import is
# not retried afterwards, so the script must be re-run after the install.
# NOTE(review): the unconditional selenium imports at the top of the file
# would already have raised before this point if selenium were missing —
# confirm whether this fallback is still reachable.
try:
    from selenium import webdriver
except ImportError:
    # Was a bare `except:` with a hardcoded "python3"; catch only the
    # import failure and target the interpreter actually running this file.
    os.system(sys.executable + " -m pip install selenium")
try:
    from bs4 import BeautifulSoup
except ImportError:
    os.system(sys.executable + " -m pip install bs4")
def scrolling(driver):
    """Scroll the Google Maps results pane to its bottom once.

    Driving the left-hand results list to the bottom makes Maps lazy-load
    the next batch of entries. On failure the function prints a warning and
    returns instead of raising, so callers can keep going.

    Args:
        driver: an active Selenium WebDriver showing a Maps results page.
    """
    pane_xpath = '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]'
    scroll_js = 'arguments[0].scrollTop = arguments[0].scrollHeight'
    try:
        pane = driver.find_element_by_xpath(pane_xpath)
        driver.execute_script(scroll_js, pane)
        # Give the lazy-loaded results a moment to render.
        time.sleep(2)
    except NoSuchElementException:
        print("Error: can't find scrollbar")
        print("")
def _tooltip_button_text(parser, tooltip):
    """Stripped text of the <button> whose data-tooltip matches, or "" if absent."""
    button = parser.find('button', {'data-tooltip': tooltip})
    return button.text.strip() if button else ""


def doJob(query):
    """Scrape Google Maps (Indonesian locale) for places matching *query*.

    Launches Chrome via webdriver-manager, searches for *query*, scrolls the
    results pane three times so more entries lazy-load, then opens every
    result link and parses the detail pane with BeautifulSoup.

    Args:
        query (str): free-text search, e.g. "bengkel near pekanbaru".

    Returns:
        list[dict]: one dict per place with keys: link, title, thumbnail,
        category, address, phone, plusCode, openHours (day name -> hours),
        rating, website. Missing fields are "" (or {} for openHours).

    NOTE(review): every selector and the Indonesian tooltip strings
    ("Salin alamat", ...) are tied to a specific Google Maps build/locale
    and will break when Google changes its markup.
    """
    input_box_xpath = '//*[@id="searchboxinput"]'
    search_button_xpath = '//*[@id="searchbox-searchbutton"]'
    result_link_css = 'a.a4gq8e-aVTXAb-haAclf-jRmmHf-hSRGPd'

    chrome_options = Options()
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # chrome_options.add_argument('--remote-debugging-port=9222')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    # Silence chromedriver's "DevTools listening ..." console noise.
    chrome_options.add_experimental_option(
        "excludeSwitches", ["enable-logging"])
    driver = webdriver.Chrome(
        ChromeDriverManager(log_level=0).install(), options=chrome_options)

    results = []
    # Ensure the browser is closed even when a selector breaks mid-scrape
    # (the original never called quit() and leaked the Chrome process).
    try:
        # hl=id: Indonesian UI — the "Salin ..." tooltips below depend on it.
        driver.get("https://google.com/maps?hl=id")
        driver.find_element_by_xpath(input_box_xpath).send_keys(query)
        driver.find_element_by_xpath(search_button_xpath).click()
        time.sleep(2)

        # Each scroll lazy-loads roughly one more page of results.
        for _ in range(3):
            scrolling(driver)

        # Query the DOM once (the original ran the same selector twice).
        links = [a.get_attribute('href')
                 for a in driver.find_elements_by_css_selector(result_link_css)]
        logging.info("Found: " + str(len(links)) + " 20")

        for item in links:
            logging.info("Scraping: " + str(item))
            driver.get(item)
            time.sleep(2)  # wait 2 sec before get the info
            parser = BeautifulSoup(driver.page_source, "html.parser")

            # Place name is the detail pane's first <h1>.
            title = parser.select('h1')[0].text.strip()

            category_btn = parser.find('button', jsaction="pane.rating.category")
            category = category_btn.text.strip() if category_btn else ""

            # "Salin ..." = "Copy ..." tooltips on the info rows.
            address = _tooltip_button_text(parser, 'Salin alamat')
            phone = _tooltip_button_text(parser, 'Salin nomor telepon')
            plusCode = _tooltip_button_text(parser, 'Salin Plus Codes')

            hero = parser.find('button', {'jsaction': 'pane.heroHeaderImage.click'})
            img = hero.img['src'] if hero else ""

            rating_span = parser.find('span', {'class': 'aMPvhf-fI6EEc-KVuj8d'})
            rating = rating_span.text.strip() if rating_span else ""

            # The website URL is only exposed via a "copy" button, so click
            # it and read the system clipboard.
            if len(driver.find_elements_by_xpath('//img[@alt="Salin situs"]')) > 0:
                driver.find_element_by_xpath('//img[@alt="Salin situs"]').click()
                website = clipboard.paste()
            else:
                website = ""

            # Opening hours live in an aria-label like
            # "Senin, 08.00 hingga 17.00; Selasa, ..." ("hingga" = "until").
            openHoursResults = {}
            open_div = parser.find('div', {'class': 'LJKBpe-open-R86cEd-haAclf'})
            if open_div:
                for days in open_div['aria-label'].split('; '):
                    dayTime = days.replace('hingga', '-').replace(
                        '. Sembunyikan jam buka untuk seminggu', '').split(',')
                    openHoursResults[dayTime[0]] = dayTime[1]

            logging.info("Scraping done, append results...")
            results.append({
                "link": driver.current_url,
                "title": title,
                "thumbnail": img,
                "category": category,
                "address": address,
                "phone": phone,
                "plusCode": plusCode,
                "openHours": openHoursResults,
                "rating": rating,
                "website": website,
            })
    finally:
        driver.quit()
    return results
- result = doJob("bengkel near pekanbaru")
- print(json.dumps(result))
Advertisement
Add Comment
Please sign in to add a comment.