Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #Way too many libraries to import
- #Automatically insert chromeditor
- import urllib.request
- from bs4 import BeautifulSoup
- import urllib.request
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.common.exceptions import NoSuchElementException
- from selenium.common.exceptions import TimeoutException
- from selenium.common.exceptions import ElementNotVisibleException
- from selenium.common.exceptions import StaleElementReferenceException
- import time
- import xlsxwriter
- import urllib
- import os
# Module-level switch: the `if(Start): Main()` statement at the bottom of the
# file runs the program only while this is truthy. ScrapeSession assigns a
# local variable with the same name, which does not change this global.
- Start = True
def FormattingExcel(ExcelFolderName, SheetName, worksheet, descriptionFormat, phoneFormat, formatTitle, format):
    """Style the cell formats, size the columns and write the header row.

    Args:
        ExcelFolderName: Accepted for call compatibility; not used here.
        SheetName: Sheet name echoed in the confirmation message.
        worksheet: Worksheet object that receives column widths, the row
            height and the six header cells.
        descriptionFormat: Format object used for the description column.
        phoneFormat: Format object used for the phone-number column.
        formatTitle: Format object used for the header row.
        format: Format object used for the remaining data cells. The name
            shadows the builtin but is kept because callers may pass it by
            keyword.

    Returns:
        None. The format and worksheet objects are modified in place.
    """
    # Font sizes go through the public setter instead of poking the
    # attribute directly.
    phoneFormat.set_align('center')
    phoneFormat.set_align('vcenter')
    phoneFormat.set_font_size(12)

    format.set_align('center')
    format.set_align('vcenter')
    format.set_text_wrap()
    format.set_font_size(12)

    descriptionFormat.set_align('top')
    descriptionFormat.set_text_wrap()
    descriptionFormat.set_font_size(9)

    formatTitle.set_bold()
    formatTitle.set_align('center')
    formatTitle.set_font_size(13)

    # Column widths, in the same order as the header titles below.
    for column, width in enumerate((40, 20, 15, 18, 18, 100)):
        worksheet.set_column(column, column, width)
    # Only row 1 (the first data row) gets the tall 120 height.
    worksheet.set_row(1, 120, 0)

    # Clear the console once, right before the confirmation messages.
    os.system('cls')
    headers = ('Title', 'Location', 'Price', 'Ad Link', 'Phone number', 'Description')
    for column, title in enumerate(headers):
        worksheet.write(0, column, title, formatTitle)
    print('New Excel WorkSheet created.')
    print('Name: ' + SheetName)
def Setting_RunningBrowser(Username, Password, browser):
    """Open the Gumtree login page in *browser* and submit the credentials.

    Args:
        Username: Text typed into the element with id ``email``.
        Password: Text typed into the element with id ``fld-password``.
        browser: Selenium WebDriver instance to drive.

    Returns:
        None. The function returns right after clicking the login button;
        it does not wait for the login to complete.
    """
    def show_status(message):
        # Each status line replaces whatever the console showed before it.
        os.system('cls')
        print(message)

    browser.maximize_window()
    browser.get('https://my.gumtree.com/login')
    show_status('Launching browser...')
    browser.execute_script('javascript:localStorage.clear()')

    # find_element(By...) works on Selenium 3 and 4; the find_element_by_*
    # helpers used previously were removed in Selenium 4.
    username = browser.find_element(By.ID, "email")
    show_status('Browser launched!')
    password = browser.find_element(By.ID, "fld-password")
    show_status('Getting ready to log in...')
    username.send_keys(Username)
    password.send_keys(Password)

    browser.find_element(
        By.XPATH,
        "//BUTTON[@class='btn-primary btn-full-width g-recaptcha'][text()='Login']/self::BUTTON").click()
    show_status('Logging in...')
def PrintToExcel(myAds_Details, worksheet, workbook, format, phoneFormat, descriptionFormat):
    """Write one worksheet row per scraped ad, then close the workbook.

    Rows start at index 1, directly under the header row. Columns are, in
    order: Title, Location, Price, adLink, adPhoneNumber, adDescription.

    Args:
        myAds_Details: Iterable of objects exposing the six attributes above.
        worksheet: Worksheet object whose ``write`` method receives the cells.
        workbook: Workbook object; ``close()`` is called once all rows are written.
        format: Cell format for the first four columns.
        phoneFormat: Cell format for the phone-number column.
        descriptionFormat: Cell format for the description column.
    """
    # (attribute name, cell format) for columns 0..5.
    layout = (
        ('Title', format),
        ('Location', format),
        ('Price', format),
        ('adLink', format),
        ('adPhoneNumber', phoneFormat),
        ('adDescription', descriptionFormat),
    )
    for row, details in enumerate(myAds_Details, start=1):
        for column, (attribute, cell_format) in enumerate(layout):
            worksheet.write(row, column, getattr(details, attribute), cell_format)
    workbook.close()
    print('Excel Workbook closed.')
    print('You can exit the program!')
class Ad:
    """Holder for a single advert id collected from a search-results page."""

    def __init__(self, ad):
        # Stored as given; ScrapeSession later types this value into the
        # site's search box.
        self.ad = ad
class AdDetails:
    """Plain record of the fields scraped from one advert page."""

    def __init__(self, Title, Price, Location, adLink, adDescription, adPhoneNumber):
        # Values are stored exactly as passed in; PrintToExcel reads these
        # attributes back by name when filling the worksheet.
        self.Title, self.Price, self.Location = Title, Price, Location
        self.adLink, self.adDescription, self.adPhoneNumber = adLink, adDescription, adPhoneNumber
- def ScrapeSession(myAds,myAds_Details,browser, SameExcelSheet, SheetName, ScrapingMore, StartingPage, EndingPage, Category, KeyWord, workbook, worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds, Output,TotalAdsRead, Total, Sessions):
# ScrapeSession runs one interactive scraping pass. Visible steps, in order:
#   1. When ScrapingMore is set, re-prompt for keyword, category, page range
#      and ad count.
#   2. When the sheet is not being reused, read '../../settings.txt' and build
#      a fresh workbook, worksheet and the four cell formats.
#   3. Download the Gumtree search-result pages with urllib and collect advert
#      ids into myAds.
#   4. For each id, type it into the site's search box through Selenium and
#      read description, phone number, location, price and title.
#   5. Append qualifying ads to myAds_Details, then ask whether to scrape more
#      (which calls ScrapeSession again) or write the sheet via PrintToExcel
#      and close the browser.
# NOTE(review): indentation was lost in this copy of the file, so the nesting
# of several branches below cannot be determined from the text alone. Confirm
# against a correctly indented copy before changing any logic.
- numberExists = True
# This Start is a local name (no `global` statement); the module-level flag
# is not modified.
- Start = False
- if(ScrapingMore):
- KeyWord = input('Enter what keyword you want to search: ')
- Category = input('Enter the category: ')
- StartingPage = input('Enter the page to start from: ')
- EndingPage = input('Enter the page to end at: ')
- HowManyAds = input('How many ads to read (if all from the page, type all): ')
- if (SameExcelSheet):
- print('Using the same Excel Sheet, called: ' + SheetName)
- else:
- if ScrapingMore:
- SheetName = input('\nEnter the new Excel Sheet name: ')
# New-sheet setup. Username and Password are parsed but not used again in
# this function, and the file handle opened here is never closed.
# NOTE(review): Main reads 'settings.txt' while this reads '../../settings.txt'
# - confirm which working directory is intended.
- settingsPath = '../../settings.txt'
- openFileToRead = open(settingsPath, 'r')
- fileLines = openFileToRead.readlines()
- Username = fileLines[0][6:-1]
- Password = fileLines[1][9:-1]
- ExcelFolderName = fileLines[2][12:]
- workbook = xlsxwriter.Workbook('../../' + ExcelFolderName + '/' + SheetName + '.xls')
- worksheet = workbook.add_worksheet()
- descriptionFormat = workbook.add_format()
- phoneFormat = workbook.add_format()
- formatTitle = workbook.add_format()
- format = workbook.add_format()
- FormattingExcel("", SheetName,worksheet,descriptionFormat,phoneFormat,formatTitle,format)
- print("\nProccess\n")
# params and paramsForDetails are assigned here and not read again in this function.
- params = ['Ad']
- paramsForDetails = ['adLink', 'adDescription', 'adPhoneNumber']
- ######### Getting all the ads #########
# Rebinding myAds to a new list means the list passed in by the caller is
# not the one filled below.
- myAds = []
# range() stops before EndingPage, so the ending page itself is not fetched.
- for i in range(int(StartingPage),int(EndingPage)):
- numberExists = True
- if (HowManyAds.lower().__eq__("all")):
- PrintOutput(Output)
- else:
- if (TotalAdsRead >= int(HowManyAds)):
- break
- if(HowManyAds.isnumeric()):
- if(int(HowManyAds) <= TotalAdsRead ):
- break
# NOTE(review): the URL carries both '&q=' + KeyWord and a fixed '&q=cars';
# confirm which of the two query values the site actually honours.
- sauce = urllib.request.urlopen('https://www.gumtree.com/search?search_category=' +
- Category + '&q=' + KeyWord + '&search_location=uk&q=cars&page=' + str(i)).read()
- os.system('cls')
- print('Getting ad links...')
- soup = BeautifulSoup(sauce, 'lxml')
- featuredAds = []
# The first find_all result is kept only in souptoprint (not read again in
# this function); ulPlace is immediately replaced by every <li> on the page.
- ulPlace = soup.find_all('ul', 'clearfix list-listing-mini')
- souptoprint = ulPlace
- ulPlace = soup.find_all('li')
- for ulas in ulPlace:
- for ul in ulas.find_all('article'):
# The id is taken from the button's data-savead attribute after dropping its
# first 15 characters; only the digits are kept, and only ids longer than
# 9 digits are collected.
- newAd = ul.find('button')['data-savead'][15:]
- if((newAd).__contains__(' ')):
- newAd = str(newAd).replace(' ', '')
- newAd = ''.join([i for i in newAd if i.isdigit()])
- if(len(newAd) > 9):
- myAds.append(Ad(newAd))
- Output.clear();
- Output.append("Excel Sheet name: " + SheetName)
- Output.append("Searching for '" + KeyWord + "' ads")
- Output.append("Reading " + HowManyAds + ' ads in total.')
- for ad in myAds:
# When the current ad is the last collected one and fewer than HowManyAds
# have been stored, the page window is shifted by one and ScrapeSession is
# called again; this invocation then returns.
# NOTE(review): myAds.index(ad) compares Ad objects by identity (Ad defines
# no __eq__) and scans the list on every iteration.
- if(HowManyAds.isnumeric()):
- if ((myAds.index(ad)+1) == myAds.__len__()) and (int(HowManyAds) > TotalAdsRead ):
- StartingPage = str(int(StartingPage) + 1)
- EndingPage = str(int(EndingPage) + 1)
- ScrapeSession(myAds, myAds_Details, browser, True, SheetName, False, StartingPage, EndingPage, Category, KeyWord, workbook, worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds, Output, TotalAdsRead, Total, Sessions)
- return
# NOTE(review): int(HowManyAds) raises ValueError for "all" unless this test
# sits under the isnumeric() guard above - nesting unknown, confirm.
- if int(HowManyAds) <= TotalAdsRead:
- break
- State = ''
- if (HowManyAds.lower().__eq__("all")):
- PrintOutput(Output)
- numberExists = True
- ######### End #########
- ######### Sending add to GumTree #########
# The advert id is typed into the element with id 'header-search-q'.
- searchKeyWord = browser.find_element_by_id('header-search-q')
- os.system('cls')
- PrintOutput(Output)
- try:
- searchKeyWord.send_keys(ad.ad)
- os.system('cls')
- #PrintOutput(Output)
- except ElementNotVisibleException:
# Recovery used throughout this function: wait up to 120 s for the element
# matched by the 'gumtree-text-svg hide-fully-to-s' XPath to be clickable and
# click it (presumably the site logo / home link - TODO confirm).
- try:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- PrintOutput(Output)
- element.click()
- os.system('cls')
- except TimeoutException:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- os.system('cls')
- PrintOutput(Output)
- element.click()
# Wait for a submit button to become clickable; it is clicked further below.
- try:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable((By.XPATH, "//BUTTON[@type='submit']/self::BUTTON")));
- except TimeoutException:
- try:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- os.system('cls')
- PrintOutput(Output)
- element.click()
- except TimeoutException:
# Pushes the counter past the requested amount, presumably so the limit
# checks end the loop - verify.
- PrintOutput(myAds_Details)
- TotalAdsRead = int(HowManyAds) + 1
- continue
- os.system('cls')
- PrintOutput(Output)
# On the very first session the button is clicked twice; otherwise once,
# skipping the ad when the click raises WebDriverException.
- if(Sessions == 0 and ScrapingMore == False):
- element.click()
- os.system('cls')
- PrintOutput(Output)
- element.click()
- os.system('cls')
- PrintOutput(Output)
- else:
- try:
- element.click()
- os.system('cls')
- PrintOutput(Output)
# WebDriverException is reached through the EC module alias; the file does
# not import it directly.
- except EC.WebDriverException:
- continue
- ######### End sending #########
- ######### Getting data #########
# Phone reveal: the page source decides which reveal anchor is clicked.
# State records which one ("Click" for the -pro button, 'Reveal' for the
# plain one); numberExists becomes False when neither marker text is present.
- if(browser.page_source.__contains__('Click to reveal phone number')):
- os.system('cls')
- PrintOutput(Output)
- try:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable((By.XPATH, "(//a[@id='reply-panel-reveal-btn-pro'])[2]")))
- os.system('cls')
- PrintOutput(Output)
- element.click()
- os.system('cls')
- PrintOutput(Output)
- State = "Click"
- except TimeoutException:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- os.system('cls')
- PrintOutput(Output)
- element.click()
- else:
- if(browser.page_source.__contains__("Reveal")):
- os.system('cls')
- PrintOutput(Output)
- try:
- element = WebDriverWait(browser, 120).until(EC.element_to_be_clickable(
- (By.XPATH, "(//a[@id='reply-panel-reveal-btn'])[2]")))
- os.system('cls')
- PrintOutput(Output)
- element.click()
- os.system('cls')
- PrintOutput(Output)
- State = 'Reveal'
- except TimeoutException:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- os.system('cls')
- PrintOutput(Output)
- element.click()
- else:
- numberExists = False
# Description text. When the element is missing or stale, the recovery click
# is performed, Total and Sessions are incremented and the ad is skipped.
- try:
- Description = browser.find_element_by_xpath(
- "//P[@class='ad-description']").text
- os.system('cls')
- except NoSuchElementException:
- try:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- elem = browser.find_element_by_xpath(
- "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV").click()
- os.system('cls')
- Total = Total + 1
- Sessions = Sessions + 1
- continue
- except TimeoutException:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- os.system('cls')
- PrintOutput(Output)
- element.click()
- PrintOutput(Output)
- except StaleElementReferenceException:
- try:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- elem = browser.find_element_by_xpath(
- "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV").click()
- os.system('cls')
- Total = Total + 1
- Sessions = Sessions + 1
- continue
- except TimeoutException:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- os.system('cls')
- PrintOutput(Output)
- element.click()
- PrintOutput(Output)
# Phone number text is read from the second DIV with class 'clearfix' once
# it is visible; "None" (a string) is used when no reveal marker was found.
- if(numberExists):
- try:
- os.system('cls')
- PrintOutput(Output)
- WebDriverWait(browser, 120).until(
- EC.visibility_of_element_located((By.XPATH, "(//DIV[@class='clearfix'])[2]")))
- os.system('cls')
- PrintOutput(Output)
- PhoneNumber = browser.find_element_by_xpath(
- "(//DIV[@class='clearfix'])[2]").text
- os.system('cls')
- PrintOutput(Output)
- except ElementNotVisibleException:
- print('not visible lul')
- if (State.__eq__("Reveal")):
- element = WebDriverWait(browser, 120).until(EC.element_to_be_clickable(
- (By.XPATH, "(//a[@id='reply-panel-reveal-btn'])[2]")))
- PhoneNumber = browser.find_element_by_xpath(
- "(//DIV[@class='clearfix'])[2]").text
- os.system('cls')
- PrintOutput(Output)
- if(State == "Click"):
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable((By.XPATH, "(//a[@id='reply-panel-reveal-btn-pro'])[2]")))
- PhoneNumber = browser.find_element_by_xpath(
- "(//DIV[@class='clearfix'])[2]").text
- os.system('cls')
- PrintOutput(Output)
- PrintOutput(Output)
- #int(re.findall(r'\d+', browser.find_element_by_xpath("(//DIV[@class='clearfix'])[2]").text)[0])
- else:
- PhoneNumber = "None"
# Location, Price and Title follow the same pattern as Description: read the
# element text, or click the recovery element and skip the ad when missing.
- try:
- Location = browser.find_element_by_xpath(
- "// SPAN[ @ itemprop = 'address']").text
- os.system('cls')
- PrintOutput(Output)
- except NoSuchElementException:
- try:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- elem = browser.find_element_by_xpath(
- "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV").click()
- os.system('cls')
- Total = Total + 1
- Sessions = Sessions + 1
- continue
- except TimeoutException:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- os.system('cls')
- PrintOutput(Output)
- element.click()
- try:
- Price = browser.find_element_by_xpath(
- "//STRONG[@class='ad-price txt-xlarge txt-emphasis inline-block']").text
- os.system('cls')
- PrintOutput(Output)
- except NoSuchElementException:
- try:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- elem = browser.find_element_by_xpath(
- "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV").click()
- os.system('cls')
- Total = Total + 1
- Sessions = Sessions + 1
- continue
- except TimeoutException:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- os.system('cls')
- PrintOutput(Output)
- element.click()
- try:
- Title = browser.find_element_by_xpath("//H1[@id='ad-title']").text
- os.system('cls')
- PrintOutput(Output)
- except NoSuchElementException:
- try:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- elem = browser.find_element_by_xpath(
- "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV").click()
- os.system('cls')
- Total = Total + 1
- Sessions = Sessions + 1
- continue
- except TimeoutException:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- os.system('cls')
- #PrintOutput(Output)
- element.click()
# Phone retries: while the text still contains 'X', 'Click' or 'Reveal' it is
# re-read from the page after a 2 s and then a 3 s pause.
# NOTE(review): the digits-only value computed before each sleep is
# overwritten by the following re-read - confirm this is intended.
- if (PhoneNumber.__contains__('X') or PhoneNumber.__contains__('Click') or PhoneNumber.__contains__(
- 'Reveal')):
- os.system('cls')
- #PrintOutput(Output)
- PhoneNumber = str(PhoneNumber).replace(' ', '')
- PhoneNumber = ''.join([i for i in PhoneNumber if i.isdigit()])
- time.sleep(2)
- PhoneNumber = browser.find_element_by_xpath(
- "(//DIV[@class='clearfix'])[2]").text
- if (PhoneNumber.__contains__('X') or PhoneNumber.__contains__('Click') or PhoneNumber.__contains__(
- 'Reveal')):
- if(PhoneNumber.__contains__('X') == False):
- PhoneNumber = str(PhoneNumber).replace(' ', '')
- PhoneNumber = ''.join([i for i in PhoneNumber if i.isdigit()])
- os.system('cls')
- #PrintOutput(Output)
- time.sleep(3)
- os.system('cls')
- #PrintOutput(Output)
- PhoneNumber = browser.find_element_by_xpath(
- "(//DIV[@class='clearfix'])[2]").text
- if (PhoneNumber.__contains__('X') or PhoneNumber.__contains__('Click') or PhoneNumber.__contains__(
- 'Reveal')):
- if (PhoneNumber.__contains__('X') == False):
- PhoneNumber = str(PhoneNumber).replace(' ', '')
- PhoneNumber = ''.join([i for i in PhoneNumber if i.isdigit()])
- os.system('cls')
- #PrintOutput(Output)
# An ad is stored and counted only when the phone text contains none of the
# three markers and is longer than 5 characters.
- if(PhoneNumber.__contains__('X') == False and PhoneNumber.__contains__('Click') == False and PhoneNumber.__contains__(
- 'Reveal') == False):
- if(PhoneNumber.__len__() > 5):
- myAds_Details.append(
- AdDetails(Title, Price, Location, ad.ad, Description, PhoneNumber))
- TotalAdsRead = TotalAdsRead + 1
- os.system('cls')
- PrintOutput(Output)
# Click the recovery element again before handling the next ad.
- try:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable((By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- os.system('cls')
- PrintOutput(Output)
- element.click()
- os.system('cls')
- except TimeoutException:
- element = WebDriverWait(browser, 120).until(
- EC.element_to_be_clickable(
- (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
- os.system('cls')
- PrintOutput(Output)
- element.click()
- Total = Total + 1
- os.system('cls')
- # for ad in myAds_Details:
- # worksheet.write(i+1,0,adDetails)
- Sessions = Sessions + 1
- PrintOutput(Output)
- print('Scraping session is over!')
- print('Amount Of Scraped Ads: ' + str(TotalAdsRead))
# Follow-up prompt: 'y' starts another ScrapeSession call (ScrapingMore=True,
# so the search parameters are asked for again); any other answer closes the
# browser and writes the collected rows.
- ScrapeMorePages = input('\nDo you want to scrape more? (y/n):')
- if(ScrapeMorePages.lower().__eq__('y')):
- SameExcel = input("Use the same Excel Sheet? (y/n): ")
- os.system('cls')
- print('Scraping session is over!')
- print('Amount Of Scraped Ads: ' + str(TotalAdsRead))
- if(SameExcel.lower().__eq__('y')):
- Output.clear()
- os.system('cls')
- TotalAdsRead = 0;
- ScrapeSession(myAds,myAds_Details,browser, True, SheetName, True, StartingPage, EndingPage, Category, "", workbook,
- worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds,Output,TotalAdsRead,Total,Sessions)
- else:
# New sheet requested: the current workbook is written and closed by
# PrintToExcel and the collected details are cleared before the next call.
# excelCounter is assigned here and not read again in this function.
- excelCounter = 1
- os.system('cls')
- PrintToExcel(myAds_Details, worksheet, workbook,format,phoneFormat,descriptionFormat)
- myAds_Details.clear()
- Output.clear()
- TotalAdsRead = 0
- ScrapeSession(myAds,myAds_Details,browser, False, "", True, StartingPage, EndingPage, Category, "", workbook,
- worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds,Output,TotalAdsRead,Total,Sessions)
- else:
- browser.delete_all_cookies()
- browser.close()
- print("Excel Sheet Name: " + SheetName)
- print('Browser closed.')
- PrintToExcel(myAds_Details,worksheet,workbook,format,phoneFormat,descriptionFormat)
- return
- ######### End #########
def PrintOutput(output):
    """Print every item of *output* on its own line; print nothing when it is empty."""
    for entry in output:
        print(entry)
def Main():
    """Interactive entry point: log in, collect search parameters, start scraping.

    Reads the login details and the Excel output folder from ``settings.txt``
    in the current working directory, starts Chrome, logs in through
    Setting_RunningBrowser, prompts for the search parameters, prepares the
    workbook and hands everything to ScrapeSession.
    """
    myAds = []
    myAds_Details = []
    Output = []
    print('Starting the browser!')

    # The first three lines of settings.txt carry fixed-width labels that are
    # sliced off. The context manager makes sure the file handle is closed.
    with open('settings.txt', 'r') as settings_file:
        fileLines = settings_file.readlines()
    Username = fileLines[0][6:-1]
    Password = fileLines[1][9:-1]
    # Drop a trailing line break so it cannot end up inside the workbook path.
    ExcelFolderName = fileLines[2][12:].rstrip('\r\n')

    options = webdriver.ChromeOptions()
    os.system('cls')
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors')
    options.add_argument('--ignore-certificate-errors-spki-list')
    options.add_argument("--incognito")
    desiredCapabilities = webdriver.DesiredCapabilities()
    desiredCapabilities.ACCEPT_SSL_CERT = True
    os.system('cls')
    # NOTE(review): chrome_options= / desired_capabilities= are the older
    # Selenium keyword names; left unchanged here - confirm against the
    # Selenium version this script is pinned to.
    browser = webdriver.Chrome(
        chrome_options=options, desired_capabilities=desiredCapabilities.CHROME)
    Setting_RunningBrowser(Username, Password, browser)

    KeyWord = input('Enter what keyword you want to search: ')
    Category = input('Enter the category: ')
    StartingPage = input('Enter the page to start from: ')
    EndingPage = input('Enter the page to end at: ')
    HowManyAds = input('How many ads to read (if all from the page, type all): ')
    SheetName = input('Enter the new Excel Sheet name: ')

    workbook = xlsxwriter.Workbook('../../' + ExcelFolderName + '/' + SheetName + '.xls')
    worksheet = workbook.add_worksheet()
    Output.append("Excel Sheet name: " + SheetName)
    Output.append("Searching for '" + KeyWord + "' ads...")
    if HowManyAds.lower() == "all":
        Output.append("Reading " + HowManyAds + ' ads.')
    else:
        Output.append("Reading " + HowManyAds + ' ads in total.')

    descriptionFormat = workbook.add_format()
    phoneFormat = workbook.add_format()
    formatTitle = workbook.add_format()
    format = workbook.add_format()
    FormattingExcel(ExcelFolderName, SheetName, worksheet, descriptionFormat, phoneFormat, formatTitle, format)
    TotalAdsRead = 0
    Total = 0
    os.system('cls')
    print('New Excel WorkSheet created.')
    print('Name: ' + SheetName)
    print('WorkSheet formatting done!')

    try:
        # Wait (up to 120 s) until the element with class 'js-advert-count'
        # is present after the login.
        WebDriverWait(browser, 120).until(
            EC.presence_of_element_located((By.CLASS_NAME, "js-advert-count"))
        )
        os.system('cls')
        print('Getting ready to scrape...')
    finally:
        # Runs even when the wait above fails, as in the original flow.
        # NOTE(review): indentation was lost in the source paste; the extent
        # of this finally block is reconstructed - confirm.
        browser.find_element(
            By.XPATH,
            "//DIV[@class='gumtree-logo-svg']/following-sibling::DIV").click()
        os.system('cls')
        print('Ready to scrape!')

    ScrapeSession(myAds, myAds_Details, browser, False, SheetName, False, StartingPage, EndingPage, Category, KeyWord, workbook,
                  worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds, Output, TotalAdsRead, Total, 0)
# Launch the scraper whenever the module-level Start flag is truthy.
if Start:
    Main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement