Advertisement
Guest User

Untitled

a guest
Apr 27th, 2017
43
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.15 KB | None | 0 0
  1. import urllib
  2. import urllib.request
  3. import bs4 as bs
  4. import re
  5. import time
  6. import pandas as pd
  7. import requests
  8. import random
  9. import xlsxwriter
  10. import urllib.parse
  11. import urllib.request as rq
  12. import scrapy
  13.  
  14. class ScrapeCompSource():
  15. def __init__(self, url):
  16. self.url = url
  17. self.soup = self.makeSoup(url)
  18.  
  19. def makeSoup(self, url):
  20. scr = scrapy.Request(url=url)
  21. r = requests.get(url)
  22. print(r)
  23.  
  24.  
  25. # req = urllib.request.Request(url)
  26. # print(req.headers)
  27. # response = urllib.request.urlopen(req)
  28. # thepage = response.read()
  29. # soupData = bs.BeautifulSoup(thepage, "lxml")
  30. # return soupData
  31.  
  32. def getCategories(self):
  33. mainCategories = []
  34. for i in self.soup.findAll("div", {"id" : "estores_vensearch"}):
  35. for j in i.findAll("a"):
  36. link = re.compile("http://" + j.get("href"))
  37. if(link not in mainCategories):
  38. mainCategories.append(link)
  39.  
  40. print(mainCategories)
  41. print(len(mainCategories))
  42.  
# Module-level entry point: the HTTP fetch and parse happen at import time.
# NOTE(review): consider guarding with `if __name__ == "__main__":` so the
# class can be imported without triggering a network request.
scrapeComp = ScrapeCompSource("http://www.compsource.com/")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement