Advertisement
Guest User

Untitled

a guest
May 30th, 2018
160
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 27.39 KB | None | 0 0
  1. #Way too many libraries to import
  2. #Automatically insert chromedriver
  3.  
  4. import urllib.request
  5. from bs4 import BeautifulSoup
  6. import urllib.request
  7. from selenium import webdriver
  8. from selenium.webdriver.common.by import By
  9. from selenium.webdriver.support.ui import WebDriverWait
  10. from selenium.webdriver.support import expected_conditions as EC
  11. from selenium.common.exceptions import NoSuchElementException
  12. from selenium.common.exceptions import TimeoutException
  13. from selenium.common.exceptions import ElementNotVisibleException
  14. from selenium.common.exceptions import StaleElementReferenceException
  15. import time
  16. import xlsxwriter
  17. import urllib
  18. import os
  19.  
  # Module-level flag: the last statement of the file tests it before calling Main().
  20. Start = True
  21.  
  def FormattingExcel(ExcelFolderName, SheetName, worksheet, descriptionFormat, phoneFormat, formatTitle, format):
      """Configure the cell formats, size the columns and write the header row.

      ExcelFolderName is accepted but never read by this function.
      SheetName is only used in the confirmation message printed at the end.
      The four format objects are mutated in place; nothing is returned.
      """
      centred = ('center', 'vcenter')

      # Phone-number cells: centred horizontally and vertically, size 12.
      for alignment in centred:
          phoneFormat.set_align(alignment)
      phoneFormat.font_size = 12

      # Regular cells: centred both ways.
      for alignment in centred:
          format.set_align(alignment)

      # Description cells start at the top and wrap; regular cells wrap too.
      descriptionFormat.set_align('top')
      descriptionFormat.set_text_wrap()
      format.set_text_wrap()
      os.system('cls')
      format.set_align('center')

      # Header cells: bold and centred.
      formatTitle.set_bold()
      formatTitle.set_align('center')

      descriptionFormat.font_size = 9
      format.font_size = 12
      formatTitle.font_size = 13

      # One width per column, in column order.
      column_widths = (40, 20, 15, 18, 18, 100)
      for column, width in enumerate(column_widths):
          worksheet.set_column(column, column, width)
      worksheet.set_row(1, 120, 0)
      os.system('cls')

      # Header texts for row 0, in column order.
      headers = ('Title', 'Location', 'Price', 'Ad Link', 'Phone number', 'Description')
      for column, header in enumerate(headers):
          worksheet.write(0, column, header, formatTitle)

      print('New Excel WorkSheet created.')
      print('Name: ' + SheetName)
  57.  
  def Setting_RunningBrowser(Username, Password, browser):
      """Open the Gumtree login page in *browser* and submit the credentials.

      Maximises the window, loads the login URL, clears localStorage, types
      Username and Password into the "email" and "fld-password" fields and
      clicks the Login button. Progress messages are printed along the way
      and the console is cleared between steps. Returns None.
      """
      def clear_console():
          os.system('cls')

      clear_console()
      browser.maximize_window()
      clear_console()
      browser.get('https://my.gumtree.com/login')
      clear_console()
      print('Launching browser...')
      browser.execute_script('javascript:localStorage.clear()')
      clear_console()

      email_field = browser.find_element_by_id("email")
      clear_console()
      print('Browser launched!')
      password_field = browser.find_element_by_id("fld-password")
      clear_console()
      print('Getting ready to log in...')

      email_field.send_keys(Username)
      clear_console()
      password_field.send_keys(Password)
      clear_console()

      login_button = browser.find_element_by_xpath(
          "//BUTTON[@class='btn-primary btn-full-width g-recaptcha'][text()='Login']/self::BUTTON")
      login_button.click()

      # The status line is cleared and printed twice, as before.
      for _ in range(2):
          clear_console()
          print('Logging in...')
  84.  
  def PrintToExcel(myAds_Details, worksheet, workbook, format, phoneFormat, descriptionFormat):
      """Write one row per collected ad, then close the workbook.

      Rows start at index 1 (row 0 holds the headers). Each entry of
      myAds_Details must expose Title, Location, Price, adLink,
      adPhoneNumber and adDescription. Returns None.
      """
      # Attribute to write in each column, paired with the format for that column.
      layout = (
          ('Title', format),
          ('Location', format),
          ('Price', format),
          ('adLink', format),
          ('adPhoneNumber', phoneFormat),
          ('adDescription', descriptionFormat),
      )
      for row, details in enumerate(myAds_Details, start=1):
          for column, (attribute, cell_format) in enumerate(layout):
              worksheet.write(row, column, getattr(details, attribute), cell_format)

      workbook.close()
      print('Excel Workbook closed.')
      print('You can exit the program!')
  class Ad:
      """Holder for a single value, stored unchanged in the ``ad`` attribute."""

      def __init__(self, ad):
          self.ad = ad
  102.  
  class AdDetails:
      """Record of the fields collected for one ad.

      Every constructor argument is stored unchanged under the attribute of
      the same name.
      """

      def __init__(self, Title, Price, Location, adLink, adDescription, adPhoneNumber):
          self.Title, self.Price, self.Location = Title, Price, Location
          self.adLink, self.adDescription, self.adPhoneNumber = adLink, adDescription, adPhoneNumber
  111.  
  # ScrapeSession: run one scraping pass, then offer to start another one.
  #
  # NOTE(review): this paste has lost its indentation, so the nesting of the
  # statements below cannot be read off the text. Comments that depend on
  # nesting are marked as assumptions to confirm against the original file.
  #
  # What the visible statements do, in order:
  #   1. optionally re-prompt for the search parameters (ScrapingMore),
  #   2. announce the current sheet or set up a new workbook,
  #   3. collect ad ids from the Gumtree search result pages (urllib + BeautifulSoup),
  #   4. for each id, search for it in the browser and read description, phone
  #      number, location, price and title from the ad page,
  #   5. ask whether to scrape more: either call ScrapeSession again, or close
  #      the browser and write the collected records with PrintToExcel.
  #
  # Parameters, as they are used in the body:
  #   myAds             rebound to a new empty list before collection starts
  #   myAds_Details     list that receives AdDetails records; handed to PrintToExcel
  #   browser           Selenium driver used for all page interaction
  #   SameExcelSheet    truthy: keep using the current sheet
  #   SheetName         used in status messages and in the workbook path
  #   ScrapingMore      truthy: prompt again for keyword, category, pages, ad count
  #   StartingPage,
  #   EndingPage        strings converted with int() and fed to range()
  #   Category, KeyWord concatenated into the search URL
  #   workbook, worksheet, descriptionFormat, format, formatTitle, phoneFormat
  #                     xlsxwriter objects passed on to FormattingExcel / PrintToExcel
  #   HowManyAds        string, either "all" or a number
  #   Output            list of status lines shown with PrintOutput
  #   TotalAdsRead      number of records stored so far
  #   Total, Sessions   counters incremented while processing ads
  # No value is returned.
  112. def ScrapeSession(myAds,myAds_Details,browser, SameExcelSheet, SheetName, ScrapingMore, StartingPage, EndingPage, Category, KeyWord, workbook, worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds, Output,TotalAdsRead, Total, Sessions):
  113. numberExists = True
  # There is no 'global Start' statement, so this binds a local name only;
  # the module-level Start is not changed.
  114. Start = False
  # Step 1: on a follow-up session, ask for fresh search parameters.
  115. if(ScrapingMore):
  116. KeyWord = input('Enter what keyword you want to search: ')
  117. Category = input('Enter the category: ')
  118. StartingPage = input('Enter the page to start from: ')
  119. EndingPage = input('Enter the page to end at: ')
  120. HowManyAds = input('How many ads to read (if all from the page, type all): ')
  # Step 2: pick the output sheet.
  121. if (SameExcelSheet):
  122. print('Using the same Excel Sheet, called: ' + SheetName)
  123. else:
  124. if ScrapingMore:
  125. SheetName = input('\nEnter the new Excel Sheet name: ')
  # NOTE(review): it is unclear from the paste whether the workbook setup below
  # runs on every non-SameExcelSheet call or only when ScrapingMore is set.
  # The settings file is opened here and never closed in this function.
  126. settingsPath = '../../settings.txt'
  127. openFileToRead = open(settingsPath, 'r')
  128. fileLines = openFileToRead.readlines()
  129.  
  # Fixed-offset slices of the first three lines of the settings file.
  # Username and Password are not referenced again in this function.
  130. Username = fileLines[0][6:-1]
  131. Password = fileLines[1][9:-1]
  132. ExcelFolderName = fileLines[2][12:]
  133. workbook = xlsxwriter.Workbook('../../' + ExcelFolderName + '/' + SheetName + '.xls')
  134. worksheet = workbook.add_worksheet()
  135. descriptionFormat = workbook.add_format()
  136. phoneFormat = workbook.add_format()
  137. formatTitle = workbook.add_format()
  138. format = workbook.add_format()
  # The folder-name argument is passed as an empty string here.
  139. FormattingExcel("", SheetName,worksheet,descriptionFormat,phoneFormat,formatTitle,format)
  140.  
  141. print("\nProccess\n")
  142.  
  # params and paramsForDetails are assigned but not read anywhere in this function.
  143. params = ['Ad']
  144.  
  145. paramsForDetails = ['adLink', 'adDescription', 'adPhoneNumber']
  146.  
  147.  
  148. ######### Getting all the ads #########
  # Step 3: collect ad ids. The myAds argument is replaced by a new list here.
  149. myAds = []
  # range() stops before EndingPage, so the page numbered EndingPage is not fetched.
  150. for i in range(int(StartingPage),int(EndingPage)):
  151. numberExists = True
  # Stop fetching result pages once the requested number of ads has been stored.
  152. if (HowManyAds.lower().__eq__("all")):
  153. PrintOutput(Output)
  154. else:
  155. if (TotalAdsRead >= int(HowManyAds)):
  156. break
  157.  
  158. if(HowManyAds.isnumeric()):
  159.  
  160. if(int(HowManyAds) <= TotalAdsRead ):
  161. break
  162.  
  # The query string carries 'q' twice: once with KeyWord and once with the literal 'cars'.
  163. sauce = urllib.request.urlopen('https://www.gumtree.com/search?search_category=' +
  164. Category + '&q=' + KeyWord + '&search_location=uk&q=cars&page=' + str(i)).read()
  165. os.system('cls')
  166. print('Getting ad links...')
  167. soup = BeautifulSoup(sauce, 'lxml')
  168.  
  # featuredAds and souptoprint are assigned but not read later in this function.
  169. featuredAds = []
  170.  
  171. ulPlace = soup.find_all('ul', 'clearfix list-listing-mini')
  172.  
  173. souptoprint = ulPlace
  174.  
  # This overwrites the result of the previous find_all: every <li> of the page is scanned.
  175. ulPlace = soup.find_all('li')
  176.  
  # The ad id is taken from the 'data-savead' attribute of the first <button> in each
  # <article>: drop the first 15 characters, keep digits only, and keep the id when
  # more than 9 digits remain.
  177. for ulas in ulPlace:
  178. for ul in ulas.find_all('article'):
  179. newAd = ul.find('button')['data-savead'][15:]
  180. if((newAd).__contains__(' ')):
  181. newAd = str(newAd).replace(' ', '')
  182. newAd = ''.join([i for i in newAd if i.isdigit()])
  183. if(len(newAd) > 9):
  184. myAds.append(Ad(newAd))
  185.  
  186.  
  # Rebuild the status lines shown by PrintOutput.
  187. Output.clear();
  188. Output.append("Excel Sheet name: " + SheetName)
  189. Output.append("Searching for '" + KeyWord + "' ads")
  190. Output.append("Reading " + HowManyAds + ' ads in total.')
  191.  
  192.  
  # Step 4: visit every collected ad through the browser.
  193. for ad in myAds:
  194.  
  195. if(HowManyAds.isnumeric()):
  196.  
  # On the last collected ad with the requested count not yet reached, shift both
  # page bounds by one and start a new ScrapeSession, then return.
  197. if ((myAds.index(ad)+1) == myAds.__len__()) and (int(HowManyAds) > TotalAdsRead ):
  198. StartingPage = str(int(StartingPage) + 1)
  199. EndingPage = str(int(EndingPage) + 1)
  200. ScrapeSession(myAds, myAds_Details, browser, True, SheetName, False, StartingPage, EndingPage, Category, KeyWord, workbook, worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds, Output, TotalAdsRead, Total, Sessions)
  201. return
  202.  
  203. if int(HowManyAds) <= TotalAdsRead:
  204. break
  205.  
  206.  
  207. State = ''
  208. if (HowManyAds.lower().__eq__("all")):
  209. PrintOutput(Output)
  210.  
  211. numberExists = True
  212. ######### End #########
  213.  
  214. ######### Sending add to GumTree #########
  # Type the ad id into the header search box.
  215. searchKeyWord = browser.find_element_by_id('header-search-q')
  216. os.system('cls')
  217. PrintOutput(Output)
  218. try:
  219. searchKeyWord.send_keys(ad.ad)
  220. os.system('cls')
  221. #PrintOutput(Output)
  # Fallback used throughout this function: wait for the 'gumtree-text-svg' DIV to be
  # clickable and click it. NOTE(review): presumably the site logo / home link - confirm.
  222. except ElementNotVisibleException:
  223. try:
  224. element = WebDriverWait(browser, 120).until(
  225. EC.element_to_be_clickable(
  226. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  227. PrintOutput(Output)
  228. element.click()
  229. os.system('cls')
  230. except TimeoutException:
  231. element = WebDriverWait(browser, 120).until(
  232. EC.element_to_be_clickable(
  233. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  234. os.system('cls')
  235. PrintOutput(Output)
  236. element.click()
  # Wait for the submit button of the search form.
  237. try:
  238. element = WebDriverWait(browser, 120).until(
  239. EC.element_to_be_clickable((By.XPATH, "//BUTTON[@type='submit']/self::BUTTON")));
  240. except TimeoutException:
  241. try:
  242. element = WebDriverWait(browser, 120).until(
  243. EC.element_to_be_clickable(
  244. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  245. os.system('cls')
  246. PrintOutput(Output)
  247. element.click()
  # PrintOutput is given the AdDetails list here, so it prints the objects themselves.
  # int(HowManyAds) raises ValueError when HowManyAds is "all".
  248. except TimeoutException:
  249. PrintOutput(myAds_Details)
  250. TotalAdsRead = int(HowManyAds) + 1
  251. continue
  252.  
  253.  
  254. os.system('cls')
  255. PrintOutput(Output)
  256.  
  257.  
  # First ad of the first session: the element is clicked twice.
  # Otherwise it is clicked once and a WebDriverException skips the ad.
  258. if(Sessions == 0 and ScrapingMore == False):
  259. element.click()
  260. os.system('cls')
  261. PrintOutput(Output)
  262. element.click()
  263. os.system('cls')
  264. PrintOutput(Output)
  265. else:
  266. try:
  267. element.click()
  268. os.system('cls')
  269. PrintOutput(Output)
  270.  
  # NOTE(review): relies on the expected_conditions module exposing WebDriverException - confirm.
  271. except EC.WebDriverException:
  272. continue
  273.  
  274.  
  275. ######### End sending #########
  276.  
  277. ######### Getting data #########
  278.  
  # Reveal the phone number. State records which of the two reveal buttons was clicked;
  # numberExists is cleared when the page source contains neither marker text.
  279. if(browser.page_source.__contains__('Click to reveal phone number')):
  280. os.system('cls')
  281. PrintOutput(Output)
  282. try:
  283. element = WebDriverWait(browser, 120).until(
  284. EC.element_to_be_clickable((By.XPATH, "(//a[@id='reply-panel-reveal-btn-pro'])[2]")))
  285.  
  286. os.system('cls')
  287. PrintOutput(Output)
  288. element.click()
  289. os.system('cls')
  290. PrintOutput(Output)
  291. State = "Click"
  292. except TimeoutException:
  293. element = WebDriverWait(browser, 120).until(
  294. EC.element_to_be_clickable(
  295. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  296. os.system('cls')
  297. PrintOutput(Output)
  298. element.click()
  299. else:
  300.  
  301. if(browser.page_source.__contains__("Reveal")):
  302. os.system('cls')
  303. PrintOutput(Output)
  304. try:
  305. element = WebDriverWait(browser, 120).until(EC.element_to_be_clickable(
  306. (By.XPATH, "(//a[@id='reply-panel-reveal-btn'])[2]")))
  307. os.system('cls')
  308. PrintOutput(Output)
  309. element.click()
  310. os.system('cls')
  311. PrintOutput(Output)
  312. State = 'Reveal'
  313. except TimeoutException:
  314. element = WebDriverWait(browser, 120).until(
  315. EC.element_to_be_clickable(
  316. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  317. os.system('cls')
  318. PrintOutput(Output)
  319. element.click()
  320. else:
  321. numberExists = False
  322.  
  # Description. When the element is missing or stale, the fallback DIV is clicked,
  # Total and Sessions are incremented and the ad is skipped.
  323. try:
  324. Description = browser.find_element_by_xpath(
  325. "//P[@class='ad-description']").text
  326. os.system('cls')
  327. except NoSuchElementException:
  328. try:
  329. element = WebDriverWait(browser, 120).until(
  330. EC.element_to_be_clickable(
  331. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  332. elem = browser.find_element_by_xpath(
  333. "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV").click()
  334. os.system('cls')
  335. Total = Total + 1
  336. Sessions = Sessions + 1
  337. continue
  338. except TimeoutException:
  339. element = WebDriverWait(browser, 120).until(
  340. EC.element_to_be_clickable(
  341. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  342. os.system('cls')
  343. PrintOutput(Output)
  344. element.click()
  345. PrintOutput(Output)
  346. except StaleElementReferenceException:
  347. try:
  348. element = WebDriverWait(browser, 120).until(
  349. EC.element_to_be_clickable(
  350. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  351. elem = browser.find_element_by_xpath(
  352. "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV").click()
  353. os.system('cls')
  354. Total = Total + 1
  355. Sessions = Sessions + 1
  356. continue
  357. except TimeoutException:
  358. element = WebDriverWait(browser, 120).until(
  359. EC.element_to_be_clickable(
  360. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  361. os.system('cls')
  362. PrintOutput(Output)
  363. element.click()
  364. PrintOutput(Output)
  # Phone number: text of the second DIV with class 'clearfix', read once it is visible.
  # When no reveal marker was found the string "None" is used instead.
  365. if(numberExists):
  366. try:
  367. os.system('cls')
  368. PrintOutput(Output)
  369. WebDriverWait(browser, 120).until(
  370. EC.visibility_of_element_located((By.XPATH, "(//DIV[@class='clearfix'])[2]")))
  371. os.system('cls')
  372. PrintOutput(Output)
  373. PhoneNumber = browser.find_element_by_xpath(
  374. "(//DIV[@class='clearfix'])[2]").text
  375. os.system('cls')
  376. PrintOutput(Output)
  # Retry path: wait for the reveal button matching State, then read the text again.
  377. except ElementNotVisibleException:
  378. print('not visible lul')
  379. if (State.__eq__("Reveal")):
  380. element = WebDriverWait(browser, 120).until(EC.element_to_be_clickable(
  381. (By.XPATH, "(//a[@id='reply-panel-reveal-btn'])[2]")))
  382. PhoneNumber = browser.find_element_by_xpath(
  383. "(//DIV[@class='clearfix'])[2]").text
  384. os.system('cls')
  385. PrintOutput(Output)
  386. if(State == "Click"):
  387. element = WebDriverWait(browser, 120).until(
  388. EC.element_to_be_clickable((By.XPATH, "(//a[@id='reply-panel-reveal-btn-pro'])[2]")))
  389. PhoneNumber = browser.find_element_by_xpath(
  390. "(//DIV[@class='clearfix'])[2]").text
  391. os.system('cls')
  392. PrintOutput(Output)
  393.  
  394. PrintOutput(Output)
  395.  
  396. #int(re.findall(r'\d+', browser.find_element_by_xpath("(//DIV[@class='clearfix'])[2]").text)[0])
  397. else:
  398. PhoneNumber = "None"
  # Location, with the same skip-on-missing handling as the description.
  399. try:
  400. Location = browser.find_element_by_xpath(
  401. "// SPAN[ @ itemprop = 'address']").text
  402. os.system('cls')
  403. PrintOutput(Output)
  404. except NoSuchElementException:
  405. try:
  406. element = WebDriverWait(browser, 120).until(
  407. EC.element_to_be_clickable(
  408. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  409. elem = browser.find_element_by_xpath(
  410. "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV").click()
  411. os.system('cls')
  412. Total = Total + 1
  413. Sessions = Sessions + 1
  414. continue
  415. except TimeoutException:
  416. element = WebDriverWait(browser, 120).until(
  417. EC.element_to_be_clickable(
  418. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  419. os.system('cls')
  420. PrintOutput(Output)
  421. element.click()
  422.  
  # Price.
  423. try:
  424. Price = browser.find_element_by_xpath(
  425. "//STRONG[@class='ad-price txt-xlarge txt-emphasis inline-block']").text
  426. os.system('cls')
  427. PrintOutput(Output)
  428. except NoSuchElementException:
  429. try:
  430. element = WebDriverWait(browser, 120).until(
  431. EC.element_to_be_clickable(
  432. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  433. elem = browser.find_element_by_xpath(
  434. "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV").click()
  435. os.system('cls')
  436. Total = Total + 1
  437. Sessions = Sessions + 1
  438. continue
  439. except TimeoutException:
  440. element = WebDriverWait(browser, 120).until(
  441. EC.element_to_be_clickable(
  442. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  443. os.system('cls')
  444. PrintOutput(Output)
  445. element.click()
  # Title.
  446. try:
  447. Title = browser.find_element_by_xpath("//H1[@id='ad-title']").text
  448. os.system('cls')
  449. PrintOutput(Output)
  450. except NoSuchElementException:
  451. try:
  452. element = WebDriverWait(browser, 120).until(
  453. EC.element_to_be_clickable(
  454. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  455. elem = browser.find_element_by_xpath(
  456. "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV").click()
  457. os.system('cls')
  458. Total = Total + 1
  459. Sessions = Sessions + 1
  460. continue
  461. except TimeoutException:
  462. element = WebDriverWait(browser, 120).until(
  463. EC.element_to_be_clickable(
  464. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  465. os.system('cls')
  466. #PrintOutput(Output)
  467. element.click()
  468.  
  # While the text still contains 'X', 'Click' or 'Reveal' the number is treated as not yet
  # revealed: sleep (2 s, then 3 s) and read the element text again.
  # NOTE(review): the digit-only value built just before each re-read looks like it is
  # overwritten by that re-read - this depends on the lost nesting; confirm.
  469. if (PhoneNumber.__contains__('X') or PhoneNumber.__contains__('Click') or PhoneNumber.__contains__(
  470. 'Reveal')):
  471. os.system('cls')
  472. #PrintOutput(Output)
  473. PhoneNumber = str(PhoneNumber).replace(' ', '')
  474. PhoneNumber = ''.join([i for i in PhoneNumber if i.isdigit()])
  475. time.sleep(2)
  476. PhoneNumber = browser.find_element_by_xpath(
  477. "(//DIV[@class='clearfix'])[2]").text
  478. if (PhoneNumber.__contains__('X') or PhoneNumber.__contains__('Click') or PhoneNumber.__contains__(
  479. 'Reveal')):
  480. if(PhoneNumber.__contains__('X') == False):
  481. PhoneNumber = str(PhoneNumber).replace(' ', '')
  482. PhoneNumber = ''.join([i for i in PhoneNumber if i.isdigit()])
  483. os.system('cls')
  484. #PrintOutput(Output)
  485. time.sleep(3)
  486. os.system('cls')
  487. #PrintOutput(Output)
  488. PhoneNumber = browser.find_element_by_xpath(
  489. "(//DIV[@class='clearfix'])[2]").text
  490. if (PhoneNumber.__contains__('X') or PhoneNumber.__contains__('Click') or PhoneNumber.__contains__(
  491. 'Reveal')):
  492. if (PhoneNumber.__contains__('X') == False):
  493. PhoneNumber = str(PhoneNumber).replace(' ', '')
  494. PhoneNumber = ''.join([i for i in PhoneNumber if i.isdigit()])
  495. os.system('cls')
  496. #PrintOutput(Output)
  # Store the record only when the phone text contains none of the three markers and is
  # longer than 5 characters; the ad id is stored in the adLink position.
  497. if(PhoneNumber.__contains__('X') == False and PhoneNumber.__contains__('Click') == False and PhoneNumber.__contains__(
  498. 'Reveal') == False):
  499. if(PhoneNumber.__len__() > 5):
  500. myAds_Details.append(
  501. AdDetails(Title, Price, Location, ad.ad, Description, PhoneNumber))
  502. TotalAdsRead = TotalAdsRead + 1
  503. os.system('cls')
  504. PrintOutput(Output)
  # Click the fallback DIV again before moving on to the next ad.
  505. try:
  506. element = WebDriverWait(browser, 120).until(
  507. EC.element_to_be_clickable((By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  508. os.system('cls')
  509. PrintOutput(Output)
  510. element.click()
  511. os.system('cls')
  512. except TimeoutException:
  513. element = WebDriverWait(browser, 120).until(
  514. EC.element_to_be_clickable(
  515. (By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  516. os.system('cls')
  517. PrintOutput(Output)
  518. element.click()
  519.  
  520. Total = Total + 1
  521. os.system('cls')
  522.  
  523. # for ad in myAds_Details:
  524. # worksheet.write(i+1,0,adDetails)
  525.  
  526.  
  527. Sessions = Sessions + 1
  528.  
  529. PrintOutput(Output)
  530.  
  # Step 5: end of the session - report and ask whether to continue.
  531. print('Scraping session is over!')
  532. print('Amount Of Scraped Ads: ' + str(TotalAdsRead))
  533. ScrapeMorePages = input('\nDo you want to scrape more? (y/n):')
  534. if(ScrapeMorePages.lower().__eq__('y')):
  535. SameExcel = input("Use the same Excel Sheet? (y/n): ")
  536. os.system('cls')
  537. print('Scraping session is over!')
  538. print('Amount Of Scraped Ads: ' + str(TotalAdsRead))
  # Same sheet: keep the collected records, reset the counter and start a new session
  # that re-prompts for its parameters (ScrapingMore=True).
  539. if(SameExcel.lower().__eq__('y')):
  540. Output.clear()
  541. os.system('cls')
  542. TotalAdsRead = 0;
  543. ScrapeSession(myAds,myAds_Details,browser, True, SheetName, True, StartingPage, EndingPage, Category, "", workbook,
  544. worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds,Output,TotalAdsRead,Total,Sessions)
  545.  
  # New sheet: write and close the current workbook, drop the collected records, then
  # start a session that asks for a new sheet name. excelCounter is assigned but not read.
  546. else:
  547. excelCounter = 1
  548. os.system('cls')
  549. PrintToExcel(myAds_Details, worksheet, workbook,format,phoneFormat,descriptionFormat)
  550. myAds_Details.clear()
  551. Output.clear()
  552. TotalAdsRead = 0
  553. ScrapeSession(myAds,myAds_Details,browser, False, "", True, StartingPage, EndingPage, Category, "", workbook,
  554. worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds,Output,TotalAdsRead,Total,Sessions)
  555.  
  556.  
  # Finished: clear cookies, close the browser and write the collected records.
  557. else:
  558. browser.delete_all_cookies()
  559. browser.close()
  560. print("Excel Sheet Name: " + SheetName)
  561. print('Browser closed.')
  562. PrintToExcel(myAds_Details,worksheet,workbook,format,phoneFormat,descriptionFormat)
  563. return
  564. ######### End #########
  def PrintOutput(output):
      """Print every entry of *output* on its own line, in order."""
      entries = list(output)
      # print() without positional arguments would still emit a newline, so skip empty input.
      if entries:
          print(*entries, sep='\n')
  568.  
  def Main():
      """Start Chrome, log in, collect the search parameters and run the first session.

      Reads the credentials and the output folder name from ``settings.txt`` in
      the current directory, logs in through Setting_RunningBrowser, prompts on
      stdin for the search parameters, creates and formats the workbook, and
      finally hands everything to ScrapeSession. Returns None.

      Changes from the previous version: the settings file is now closed after
      reading (it used to be left open), and locals that were never read
      (``souptoprint``, ``element``, ``elem``) are gone.
      """
      myAds = []
      myAds_Details = []
      Output = []

      print('Starting the browser!')

      # A context manager guarantees the file handle is released once the lines are read.
      with open('settings.txt', 'r') as settingsFile:
          fileLines = settingsFile.readlines()

      # Fixed-offset slices: a label prefix is cut from the front, and for the first two
      # lines the last character (the newline) is cut from the end.
      # NOTE(review): the exact label texts are not visible here - confirm against settings.txt.
      Username = fileLines[0][6:-1]
      Password = fileLines[1][9:-1]
      ExcelFolderName = fileLines[2][12:]

      options = webdriver.ChromeOptions()
      os.system('cls')

      options.add_argument('--ignore-certificate-errors')
      options.add_argument('--ignore-ssl-errors')
      options.add_argument('--ignore-certificate-errors-spki-list')
      options.add_argument("--incognito")

      desiredCapabilities = webdriver.DesiredCapabilities()
      desiredCapabilities.ACCEPT_SSL_CERT = True

      os.system('cls')

      browser = webdriver.Chrome(
          chrome_options=options, desired_capabilities=desiredCapabilities.CHROME)

      Setting_RunningBrowser(Username, Password, browser)

      KeyWord = input('Enter what keyword you want to search: ')
      Category = input('Enter the category: ')
      StartingPage = input('Enter the page to start from: ')
      EndingPage = input('Enter the page to end at: ')
      HowManyAds = input('How many ads to read (if all from the page, type all): ')
      SheetName = input('Enter the new Excel Sheet name: ')

      workbook = xlsxwriter.Workbook('../../' + ExcelFolderName + '/' + SheetName + '.xls')
      worksheet = workbook.add_worksheet()

      Output.append("Excel Sheet name: " + SheetName)
      Output.append("Searching for '" + KeyWord + "' ads...")

      if HowManyAds.lower() == "all":
          Output.append("Reading " + HowManyAds + ' ads.')
      else:
          Output.append("Reading " + HowManyAds + ' ads in total.')

      descriptionFormat = workbook.add_format()
      phoneFormat = workbook.add_format()
      formatTitle = workbook.add_format()
      format = workbook.add_format()

      FormattingExcel(ExcelFolderName, SheetName, worksheet, descriptionFormat, phoneFormat, formatTitle, format)

      TotalAdsRead = 0
      Total = 0

      os.system('cls')
      print('New Excel WorkSheet created.')
      print('Name: ' + SheetName)
      print('WorkSheet formatting done!')

      # Wait until the element with class "js-advert-count" is present, then click the
      # element next to the logo. The click sits in 'finally', so it is attempted even
      # when the wait times out.
      # NOTE(review): the paste lost its indentation, so placing the two statements after
      # the click inside 'finally' is an assumption; it only differs when the wait fails.
      try:
          WebDriverWait(browser, 120).until(
              EC.presence_of_element_located((By.CLASS_NAME, "js-advert-count"))
          )
          os.system('cls')
          print('Getting ready to scrape...')
      finally:
          browser.find_element_by_xpath(
              "//DIV[@class='gumtree-logo-svg']/following-sibling::DIV").click()
          os.system('cls')
          print('Ready to scrape!')

      # First session: new sheet already prepared above, no re-prompting, session counter 0.
      ScrapeSession(myAds, myAds_Details, browser, False, SheetName, False, StartingPage, EndingPage, Category, KeyWord, workbook,
                    worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds, Output, TotalAdsRead, Total, 0)
  656.  
  657.  
  # Script entry point: run the scraper when the module-level Start flag is truthy.
  if Start:
      Main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement