Advertisement
Guest User

Untitled

a guest
Oct 16th, 2017
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 19.77 KB | None | 0 0
import os
import time
import urllib
import urllib.parse
import urllib.request

import xlsxwriter
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
  16.  
  17.  
  18. print('Starting the browser!')
  19.  
  20. settingsPath = '../../settings.txt'
  21. openFileToRead = open(settingsPath,'r')
  22. fileLines = openFileToRead.readlines()
  23.  
  24. Username = fileLines[0][6:-1]
  25. Password = fileLines[1][9:-1]
  26. ExcelFolderName = fileLines[2][12:]
  27.  
  28.  
  29. options = webdriver.ChromeOptions()
  30. os.system('cls')
  31. options.add_argument('--ignore-certificate-errors')
  32. options.add_argument('--ignore-ssl-errors')
  33. options.add_argument('--ignore-certificate-errors-spki-list')
  34. options.add_argument("--incognito")
  35. desiredCapabilities = webdriver.DesiredCapabilities()
  36. desiredCapabilities.__setattr__('ACCEPT_SSL_CERT', True)
  37. os.system('cls')
  38.  
  39. browser = webdriver.Chrome(
  40. chrome_options=options, desired_capabilities=desiredCapabilities.CHROME)
  41. os.system('cls')
  42. browser.maximize_window()
  43. os.system('cls')
  44. browser.get('https://my.gumtree.com/login')
  45. os.system('cls')
  46. print('Launching browser...')
  47. browser.execute_script('localStorage.clear()')
  48. os.system('cls')
  49. username = browser.find_element_by_id("email")
  50. os.system('cls')
  51. print('Browser launched!')
  52. password = browser.find_element_by_id("fld-password")
  53. os.system('cls')
  54. print('Getting ready to log in...')
  55.  
  56. username.send_keys(Username)
  57. os.system('cls')
  58. password.send_keys(Password)
  59. os.system('cls')
  60. elem = browser.find_element_by_xpath(
  61. "//BUTTON[@class='btn-primary btn-full-width g-recaptcha'][text()='Login']/self::BUTTON").click()
  62. os.system('cls')
  63. print('Logging in...')
  64. myAds = []
  65.  
  66. myAds_Details = []
  67.  
  68. souptoprint = []
  69.  
  70. Output = []
  71.  
  72. KeyWord = input('Enter what keyword you want to search: ')
  73. Category = input('Enter the category: ')
  74. PagesToRead = input('Enter the number of pages to scrape: ')
  75. HowManyAds = input('How many ads to read (if all from the page, type all): ')
  76. SheetName = input('Enter the new Excel Sheet name: ')
  77. workbook = xlsxwriter.Workbook('../../' +ExcelFolderName + '/' + SheetName +'.xls')
  78. worksheet = workbook.add_worksheet()
  79. Output.append("Excel Sheet name: " + SheetName)
  80. Output.append("Searching for '" + KeyWord + "' ads...")
  81. Output.append("Reading " + HowManyAds + ' ads in total.')
  82.  
  83. format = workbook.add_format()
  84. descriptionFormat = workbook.add_format()
  85. phoneFormat = workbook.add_format()
  86. phoneFormat.set_align('center')
  87. phoneFormat.set_align('vcenter')
  88. phoneFormat.font_size = 12
  89. format.set_align('center')
  90. format.set_align('vcenter')
  91. descriptionFormat.set_align('top')
  92. descriptionFormat.set_text_wrap()
  93. format.set_text_wrap()
  94. formatTitle = workbook.add_format()
  95. format.set_align('center')
  96. formatTitle.set_bold()
  97. formatTitle.set_align('center')
  98.  
  99. descriptionFormat.font_size = 9
  100. format.font_size = 12
  101. formatTitle.font_size = 13
  102.  
  103. worksheet.set_column(0, 0, 40)
  104. worksheet.set_column(1, 1, 20)
  105. worksheet.set_column(2, 2, 15)
  106. worksheet.set_column(3, 3, 18)
  107. worksheet.set_column(4, 4, 18)
  108. worksheet.set_column(5, 5, 100)
  109. worksheet.set_row(1, 120, 0)
  110. os.system('cls')
  111. worksheet.write(0, 0, 'Title', formatTitle)
  112. worksheet.write(0, 1, 'Location', formatTitle)
  113. worksheet.write(0, 2, 'Price', formatTitle)
  114. worksheet.write(0, 3, 'Ad Link', formatTitle)
  115. worksheet.write(0, 4, 'Phone number', formatTitle)
  116. worksheet.write(0, 5, 'Description', formatTitle)
  117. TotalAdsRead = 0
  118. Total = 0
  119. os.system('cls')
  120. print('New Excel WorkSheet created.')
  121. print('Name: ' + SheetName)
  122. print('WorkSheet formatting done!')
  123.  
  124. class Ad(object):
  125. def __init__(self, ad):
  126. self.ad = ad
  127.  
  128.  
  129. class AdDetails(object):
  130. def __init__(self, Title, Price, Location, adLink, adDescription, adPhoneNumber):
  131. self.Title = Title
  132. self.Price = Price
  133. self.Location = Location
  134. self.adLink = adLink
  135. self.adDescription = adDescription
  136. self.adPhoneNumber = adPhoneNumber
  137.  
  138.  
  139. try:
  140. element = WebDriverWait(browser, 120).until(
  141. EC.presence_of_element_located((By.CLASS_NAME, "js-advert-count"))
  142. )
  143. os.system('cls')
  144. print('Getting ready to scrape...')
  145.  
  146. finally:
  147. elem = browser.find_element_by_xpath(
  148. "//DIV[@class='gumtree-logo-svg']/following-sibling::DIV").click()
  149. os.system('cls')
  150. print('Ready to scrape!')
  151.  
  152.  
  153. def ScrapeSession(pagesToStartFrom, SameExcelSheet, SheetName, ScrapingMore, PagesToRead, Category, KeyWord, workbook, worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds, Output,TotalAdsRead, Total):
  154. sessionIndex = 0
  155. numberExists = True
  156. if(ScrapingMore):
  157. KeyWord = input('Enter what keyword you want to search: ')
  158. Category = input('Enter the category: ')
  159. PagesToRead = input('Enter the number of pages to scrape: ')
  160. HowManyAds = input(
  161. 'How many ads to read (if all from the page, type all): ')
  162. Output.append("Excel Sheet name: " + SheetName)
  163. Output.append("Searching for '" + KeyWord + "' ads...")
  164. Output.append("Reading " + HowManyAds + ' ads in total.')
  165. if (SameExcelSheet):
  166. print('Using the same Excel Sheet, called: ' + SheetName)
  167. else:
  168. if ScrapingMore:
  169. SheetName = input('Enter the new Excel Sheet name: ')
  170. workbook = xlsxwriter.Workbook('../../' + ExcelFolderName + '/' + SheetName + '.xls')
  171. worksheet = workbook.add_worksheet()
  172.  
  173. format = workbook.add_format()
  174. descriptionFormat = workbook.add_format()
  175. phoneFormat = workbook.add_format()
  176. phoneFormat.set_align('center')
  177. phoneFormat.set_align('vcenter')
  178. phoneFormat.font_size = 12
  179. format.set_align('center')
  180. format.set_align('vcenter')
  181. descriptionFormat.set_align('top')
  182. descriptionFormat.set_text_wrap()
  183. format.set_text_wrap()
  184. formatTitle = workbook.add_format()
  185. format.set_align('center')
  186. formatTitle.set_bold()
  187. formatTitle.set_align('center')
  188.  
  189. descriptionFormat.font_size = 9
  190. format.font_size = 12
  191. formatTitle.font_size = 13
  192.  
  193. worksheet.set_column(0, 0, 40)
  194. worksheet.set_column(1, 1, 20)
  195. worksheet.set_column(2, 2, 15)
  196. worksheet.set_column(3, 3, 18)
  197. worksheet.set_column(4, 4, 18)
  198. worksheet.set_column(5, 5, 100)
  199. worksheet.set_row(1, 120, 0)
  200.  
  201. worksheet.write(0, 0, 'Title', formatTitle)
  202. worksheet.write(0, 1, 'Location', formatTitle)
  203. worksheet.write(0, 2, 'Price', formatTitle)
  204. worksheet.write(0, 3, 'Ad Link', formatTitle)
  205. worksheet.write(0, 4, 'Phone number', formatTitle)
  206. worksheet.write(0, 5, 'Description', formatTitle)
  207. print('New Excel WorkSheet created.')
  208. print('Name: ' + SheetName)
  209.  
  210. print("\nProccess\n")
  211.  
  212. params = ['Ad']
  213.  
  214. paramsForDetails = ['adLink', 'adDescription', 'adPhoneNumber']
  215.  
  216.  
  217. ######### Getting all the ads #########
  218. myAds = []
  219. for i in range(int(PagesToRead)):
  220. numberExists = True
  221. if (TotalAdsRead >= int(HowManyAds)):
  222. break
  223. print('Searching for ' + KeyWord + ' ads')
  224. print('Reading ', i + 1, ' page')
  225. Output.append('Reading ' + (i + 1).__str__() + ' page')
  226. i = i + 1
  227. sauce = urllib.request.urlopen('https://www.gumtree.com/search?search_category=' +
  228. Category + '&q=' + KeyWord + '&search_location=uk&q=cars&page=' + str(i)).read()
  229. os.system('cls')
  230. print('Getting ad links...')
  231. soup = BeautifulSoup(sauce, 'lxml')
  232.  
  233. featuredAds = []
  234.  
  235. ulPlace = soup.find_all('ul', 'clearfix list-listing-mini')
  236.  
  237. souptoprint = ulPlace
  238.  
  239. ulPlace = soup.find_all('li')
  240.  
  241. for ulas in ulPlace:
  242. for ul in ulas.find_all('article'):
  243. newAd = ul.find('button')['data-savead'][15:]
  244. if((newAd).__contains__(' ')):
  245. newAd = str(newAd).replace(' ', '')
  246. newAd = ''.join([i for i in newAd if i.isdigit()])
  247. if(len(newAd) > 9):
  248. myAds.append(Ad(newAd))
  249.  
  250. for ad in myAds:
  251. print(ad.ad + ' Reading this ad')
  252. State = ''
  253. if(TotalAdsRead >= int(HowManyAds)):
  254. break
  255. numberExists = True
  256. ######### End #########
  257.  
  258. ######### Sending add to GumTree #########
  259. searchKeyWord = browser.find_element_by_id('header-search-q')
  260. os.system('cls')
  261. PrintOutput(Output)
  262. searchKeyWord.send_keys(ad.ad)
  263. os.system('cls')
  264. PrintOutput(Output)
  265. element = WebDriverWait(browser, 20).until(
  266. EC.element_to_be_clickable((By.XPATH, "//BUTTON[@type='submit']/self::BUTTON")));
  267. os.system('cls')
  268. PrintOutput(Output)
  269. if(sessionIndex == 0 and ScrapingMore == False):
  270. element.click()
  271. os.system('cls')
  272. PrintOutput(Output)
  273. element.click()
  274. os.system('cls')
  275. PrintOutput(Output)
  276. else:
  277. element.click()
  278. os.system('cls')
  279. PrintOutput(Output)
  280.  
  281. ######### End sending #########
  282.  
  283. ######### Getting data #########
  284.  
  285. if(browser.page_source.__contains__('Click to reveal phone number')):
  286. os.system('cls')
  287. #PrintOutput(Output)
  288. element = WebDriverWait(browser, 80).until(
  289. EC.element_to_be_clickable((By.XPATH, "(//a[@id='reply-panel-reveal-btn-pro'])[2]")))
  290. os.system('cls')
  291. PrintOutput(Output)
  292. element.click()
  293. os.system('cls')
  294. PrintOutput(Output)
  295. State = "Click"
  296. else:
  297.  
  298. if(browser.page_source.__contains__("Reveal")):
  299. os.system('cls')
  300. PrintOutput(Output)
  301. element = WebDriverWait(browser, 80).until(EC.element_to_be_clickable(
  302. (By.XPATH, "(//a[@id='reply-panel-reveal-btn'])[2]")))
  303. os.system('cls')
  304. PrintOutput(Output)
  305. element.click()
  306. os.system('cls')
  307. PrintOutput(Output)
  308. State = 'Reveal'
  309. else:
  310. numberExists = False
  311.  
  312. Description = browser.find_element_by_xpath(
  313. "//P[@class='ad-description']").text
  314. os.system('cls')
  315. PrintOutput(Output)
  316. if(numberExists):
  317. time.sleep(12)
  318. PhoneNumber = browser.find_element_by_xpath(
  319. "(//DIV[@class='clearfix'])[2]").text
  320. os.system('cls')
  321. #PrintOutput(Output)
  322.  
  323. #int(re.findall(r'\d+', browser.find_element_by_xpath("(//DIV[@class='clearfix'])[2]").text)[0])
  324. else:
  325. if(State.__eq__("Reveal")):
  326. time.sleep(12)
  327. PhoneNumber = browser.find_element_by_xpath(
  328. "(//DIV[@class='clearfix'])[2]").text
  329. os.system('cls')
  330. PrintOutput(Output)
  331. else:
  332. PhoneNumber = "None"
  333. if (State == "Click"):
  334. time.sleep(12)
  335. PhoneNumber = browser.find_element_by_xpath(
  336. "(//DIV[@class='clearfix'])[2]").text
  337. os.system('cls')
  338. PrintOutput(Output)
  339.  
  340. Location = browser.find_element_by_xpath(
  341. "// SPAN[ @ itemprop = 'address']").text
  342. os.system('cls')
  343. PrintOutput(Output)
  344. Price = browser.find_element_by_xpath(
  345. "//STRONG[@class='ad-price txt-xlarge txt-emphasis inline-block']").text
  346. os.system('cls')
  347. PrintOutput(Output)
  348. Title = browser.find_element_by_xpath("//H1[@id='ad-title']").text
  349. os.system('cls')
  350. PrintOutput(Output)
  351.  
  352.  
  353. if(PhoneNumber.__contains__("Reveal")):
  354. PhoneNumber = PhoneNumber.replace("Reveal", "")
  355. PrintOutput(Output)
  356. if(PhoneNumber.__contains__("Click")):
  357. if (browser.page_source.__contains__('Click to reveal phone number')):
  358. os.system('cls')
  359. PrintOutput(Output)
  360. element = WebDriverWait(browser, 80).until(
  361. EC.element_to_be_clickable((By.XPATH, "(//a[@id='reply-panel-reveal-btn-pro'])[2]")))
  362. os.system('cls')
  363. PrintOutput(Output)
  364. element.click()
  365. os.system('cls')
  366. PrintOutput(Output)
  367. State = "Click"
  368. else:
  369. time.sleep(12)
  370. PhoneNumber = browser.find_element_by_xpath(
  371. "(//DIV[@class='clearfix'])[2]").text
  372. os.system('cls')
  373. PrintOutput(Output)
  374. if(State == "Click"):
  375. time.sleep(12)
  376. PhoneNumber = browser.find_element_by_xpath(
  377. "(//DIV[@class='clearfix'])[2]").text
  378. os.system('cls')
  379. PrintOutput(Output)
  380.  
  381. if (PhoneNumber.__contains__('X') or PhoneNumber.__contains__('Click') or PhoneNumber.__contains__(
  382. 'Reveal')):
  383.  
  384. if (browser.page_source.__contains__('Click to reveal phone number')):
  385. os.system('cls')
  386. # PrintOutput(Output)
  387. element = WebDriverWait(browser, 80).until(
  388. EC.element_to_be_clickable((By.XPATH, "(//a[@id='reply-panel-reveal-btn-pro'])[2]")))
  389. os.system('cls')
  390. PrintOutput(Output)
  391. element.click()
  392. os.system('cls')
  393. PrintOutput(Output)
  394. State = "Click"
  395. else:
  396. if (browser.page_source.__contains__("Reveal")):
  397. os.system('cls')
  398. PrintOutput(Output)
  399. element = WebDriverWait(browser, 80).until(EC.element_to_be_clickable(
  400. (By.XPATH, "(//a[@id='reply-panel-reveal-btn'])[2]")))
  401. os.system('cls')
  402. PrintOutput(Output)
  403. element.click()
  404. os.system('cls')
  405. PrintOutput(Output)
  406. State = 'Reveal'
  407. else:
  408. numberExists = False
  409. if (numberExists):
  410. time.sleep(15)
  411. PhoneNumber = browser.find_element_by_xpath(
  412. "(//DIV[@class='clearfix'])[2]").text
  413. os.system('cls')
  414. else:
  415. if (State.__eq__("Reveal") or State.__eq__("Click")):
  416. time.sleep(15)
  417. PhoneNumber = browser.find_element_by_xpath(
  418. "(//DIV[@class='clearfix'])[2]").text
  419. os.system('cls')
  420.  
  421.  
  422. if (PhoneNumber.__contains__('X') or PhoneNumber.__contains__('Click') or PhoneNumber.__contains__(
  423. 'Reveal')):
  424. TotalAdsRead = TotalAdsRead - 1
  425. else:
  426.  
  427. myAds_Details.append(
  428. AdDetails(Title, Price, Location, ad.ad, Description, PhoneNumber))
  429.  
  430.  
  431.  
  432.  
  433.  
  434. element = WebDriverWait(browser, 30).until(
  435. EC.element_to_be_clickable((By.XPATH, "//DIV[@class='gumtree-text-svg hide-fully-to-s']/self::DIV")))
  436. os.system('cls')
  437. PrintOutput(Output)
  438. element.click()
  439. os.system('cls')
  440. TotalAdsRead = TotalAdsRead + 1
  441. Total = Total + 1
  442. PrintOutput(Output)
  443. print('Amount Of Scraped Ads for this search: ' + str(TotalAdsRead))
  444. print('Amount of Scraped Ads in Total: ' + str(Total))
  445. # for ad in myAds_Details:
  446. # worksheet.write(i+1,0,adDetails)
  447.  
  448. if(HowManyAds.__eq__('all')):
  449. sessionIndex = sessionIndex + 1
  450. else:
  451. sessionIndex = sessionIndex + 1
  452. if(sessionIndex >= int(HowManyAds)):
  453. break
  454. os.system('cls')
  455. PrintOutput(Output)
  456. print('Amount Of Scraped Ads: ' + str(TotalAdsRead))
  457. print('Amount of Scraped Ads in Total: ' + str(Total))
  458. ScrapeMorePages = input('Do you want to scrape more? (y/n):')
  459. if(ScrapeMorePages.lower().__eq__('y')):
  460. SameExcel = input("Use the same Excel Sheet? (y/n): ")
  461. if(SameExcel.lower().__eq__('y')):
  462. Output.clear()
  463. TotalAdsRead = 0;
  464. ScrapeSession(PagesToRead, True, SheetName, True, PagesToRead, Category, "", workbook,
  465. worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds,Output,TotalAdsRead,Total)
  466.  
  467. else:
  468. excelCounter = 1
  469.  
  470. excelCounter = 1
  471.  
  472. for ad in myAds_Details:
  473. worksheet.write(excelCounter, 0, ad.Title, format)
  474. worksheet.write(excelCounter, 1, ad.Location, format)
  475. worksheet.write(excelCounter, 2, ad.Price, format)
  476. worksheet.write(excelCounter, 3, ad.adLink, format)
  477. worksheet.write(excelCounter, 4, ad.adPhoneNumber, phoneFormat)
  478. worksheet.write(excelCounter, 5,
  479. ad.adDescription, descriptionFormat)
  480. excelCounter = excelCounter + 1
  481. workbook.close()
  482. myAds_Details.clear()
  483. print('Excel Workbook closed.')
  484. Output.clear()
  485. TotalAdsRead = 0
  486. ScrapeSession(PagesToRead, False, "", True, PagesToRead, Category, "", workbook,
  487. worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds,Output,TotalAdsRead,Total)
  488.  
  489.  
  490. else:
  491. browser.delete_all_cookies()
  492. browser.close()
  493. print('Browser closed.')
  494. excelCounter = 1
  495.  
  496. for ad in myAds_Details:
  497. worksheet.write(excelCounter, 0, ad.Title, format)
  498. worksheet.write(excelCounter, 1, ad.Location, format)
  499. worksheet.write(excelCounter, 2, ad.Price, format)
  500. worksheet.write(excelCounter, 3, ad.adLink, format)
  501. worksheet.write(excelCounter, 4, ad.adPhoneNumber, phoneFormat)
  502. worksheet.write(excelCounter, 5, ad.adDescription,
  503. descriptionFormat)
  504. excelCounter = excelCounter + 1
  505. workbook.close()
  506. print('Excel Workbook closed.')
  507. print('You can exit the program safely!')
  508.  
  509. ######### End #########
  510.  
  511. def PrintOutput(output):
  512. for text in output:
  513. print(text)
  514.  
  515.  
  516. # browser.quit()
  517. ScrapeSession(0, False, SheetName, False, PagesToRead, Category, KeyWord, workbook,
  518. worksheet, descriptionFormat, format, formatTitle, phoneFormat, HowManyAds,Output,TotalAdsRead,Total)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement