Advertisement
Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- import urllib
- import urllib.request
- import bs4 as bs
- import re
- import time
- import pandas as pd
- import requests
- import random
- import xlsxwriter
- import urllib.parse
- import urllib.request as rq
- import scrapy
class ScrapeCompSource():
    """Scrape category links from compsource.com.

    On construction, fetches *url* and parses it with BeautifulSoup;
    ``getCategories`` then collects the unique category links found in
    the ``estores_vensearch`` div.
    """

    def __init__(self, url):
        # Keep the target URL and fetch/parse the page immediately.
        self.url = url
        self.soup = self.makeSoup(url)

    def makeSoup(self, url):
        """Fetch *url* and return the parsed BeautifulSoup tree.

        Args:
            url: page to download.

        Returns:
            bs.BeautifulSoup: document parsed with the "lxml" parser.
        """
        opener = urllib.request.build_opener()
        # a = str(random.randrange(1000))
        # Custom header sent with the request (name/value as in original).
        opener.addheaders = [("sasa", "Chrome/52.0.2743.116")]
        response = opener.open(url)
        # BUG FIX: the original created the soup but never returned it,
        # so self.soup was always None and getCategories() could not run.
        return bs.BeautifulSoup(response, "lxml")

    def getCategories(self):
        """Collect, print, and return unique category links.

        Scans every <a> inside div#estores_vensearch, prefixes each href
        with "http://", and de-duplicates while preserving order.

        Returns:
            list[str]: unique category URLs (also printed, with the count).
        """
        mainCategories = []
        for container in self.soup.findAll("div", {"id": "estores_vensearch"}):
            for anchor in container.findAll("a"):
                # BUG FIX: the original called re.compile() on the URL,
                # storing regex pattern objects instead of link strings;
                # plain concatenation is what was intended.
                link = "http://" + anchor.get("href")
                if link not in mainCategories:
                    mainCategories.append(link)
        print(mainCategories)
        print(len(mainCategories))
        # Backward-compatible addition: original printed only (returned None).
        return mainCategories
- scrapeComp = ScrapeCompSource("http://www.compsource.com/")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement