Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python2
# coding: utf-8
import logging
import re

import bs4
from PySide import QtCore
from PySide import QtNetwork

import useragents
# First number written as digits, one "." or "," separator, digits
# (e.g. "12.99" or "12,99"); used to pull the amount out of a price label.
extractprice = re.compile(r"[0-9]+(\.|,)[0-9]+")
# Sentence looked for in a search result page to detect that the search
# returned nothing.
regexproductnotfound = re.compile(r"There were no results found for")
logger = logging.getLogger(__name__)


# List of functions to retrieve different data from
# different base.com pages.
def _toprice(text):
    """Return the first price found in ``text`` as a float.

    Return None when nothing in ``text`` matches ``extractprice``.
    """
    found = extractprice.search(text)
    if found is None:
        return None
    # extractprice accepts "," as well as "." between the digits, but
    # float() only understands ".".
    # NOTE(review): a thousands separator ("1,299.00") would be read as
    # 1.299 -- confirm whether base.com ever prints one.
    return float(found.group().replace(",", "."))


def getprices(htmldata):
    """Return the product name and the sellers with their prices.

    htmldata -- markup of a base.com product page.

    The result is a tuple ``(productname, sellpric)``. ``productname`` is
    the text of the page title and ``sellpric`` is a list of
    ``(seller, price)`` tuples, ``price`` being a float. Only seller rows
    marked "New" are listed; rows whose seller or price cannot be read are
    skipped. When the page has no seller rows at all, the main price is
    reported with "base.com" as the seller.

    ``(None, None)`` is returned when htmldata is not a product page.
    """
    soup = bs4.BeautifulSoup(htmldata)

    # A product page carries its title inside this div; without it (or
    # without the title itself) there is nothing to report.
    header = soup.find("div", attrs={"class": "sub-section clearfix"})
    title = header.find("h1") if header is not None else None
    if title is None:
        return None, None
    productname = unicode(title.text)

    sellpric = []
    rows = soup.find_all("tr", attrs={"class": re.compile(r"rs-[0-1]")})
    if not rows:
        # No seller rows: the only price on the page is the main one,
        # reported under the website's own name.
        pricetag = soup.find("span", attrs={"class": "price"})
        price = _toprice(pricetag.text) if pricetag is not None else None
        if price is None:
            logger.warning(
                "No readable price for '{}'".format(productname))
        else:
            sellpric.append(("base.com", price))
        return productname, sellpric

    for row in rows:
        # Only offers for new items are of interest.
        if not row.find("td", text="New"):
            continue
        pricetag = row.find("span", attrs={"class": "seller-price"})
        details = row.find("p", attrs={"class": "seller-details"})
        link = details.find("a") if details is not None else None
        price = _toprice(pricetag.text) if pricetag is not None else None
        if link is None or price is None:
            # A malformed row should not discard the other sellers.
            logger.warning(
                "Skipping unreadable seller row for '{}'".format(
                    productname))
            continue
        sellpric.append((link.text, price))
    return productname, sellpric
def findproducturl(htmldata, productname):
    """Return the absolute URL of a product listed on a search result page.

    htmldata -- markup of a base.com search result page.
    productname -- product name; the link is found through an ``<a>`` tag
        whose ``title`` attribute equals it.

    Return None when no such link exists or when it has no usable
    ``href``.
    """
    soup = bs4.BeautifulSoup(htmldata)
    link = soup.find("a", attrs={"title": productname})
    # Tag.get() returns None for a missing attribute instead of raising
    # KeyError the way link["href"] does.
    href = link.get("href") if link is not None else None
    if not href:
        return None
    return u"http://www.base.com" + href
class BaseScrapper(QtCore.QObject):
    """Scrape seller and price information, one product at a time.

    The scrapper chains its steps through its own signals:

    1. ``start`` -> ``getwork`` takes a product from the queue;
    2. ``request`` asks base.com for the search result page;
    3. ``geturl`` extracts the product URL from that page;
    4. ``request`` asks for the product page;
    5. ``getsellernprice`` reads the sellers and prices;
    6. ``startagain`` marks the task as done and schedules ``start``.

    ``self.reason`` records which page the pending request is for
    (``REASON_SEARCH`` or ``REASON_PRODUCT``). There might be a better way
    of handling the request reason, check http://ur1.ca/gw6g8

    The parent object must expose a ``qnam`` attribute; it is used as the
    network access manager that sends the requests and reports replies.
    """

    # Values taken by self.reason.
    REASON_SEARCH = 0
    REASON_PRODUCT = 1
    # Delays in milliseconds: before repeating a failed request, and
    # before asking the queue for the next product.
    RETRY_DELAY_MS = 30000
    RESTART_DELAY_MS = 1000

    # Signals
    askwork = QtCore.Signal()
    askrequest = QtCore.Signal()
    askproducturl = QtCore.Signal()
    asksellernprice = QtCore.Signal()
    updateprogressbar = QtCore.Signal()
    # NOTE(review): __init__ stores a plain ``self.reason`` attribute that
    # shadows this signal on every instance, and nothing here emits it.
    reason = QtCore.Signal()
    noworkleft = QtCore.Signal(object)
    iamdone = QtCore.Signal()

    def __repr__(self):
        try:
            productname = self.product.name
        except AttributeError:
            # No product assigned yet (self.product is None).
            productname = None
        return "<BaseScrapper(name: {}, product: {}, phase: {})>".format(
            self.name, productname, self.reason)

    def __init__(self, name, session, queue, parent=None):
        """Set up the scrapper and wire its signals.

        name -- label used in log messages.
        session -- stored as ``self.session``; not used inside this class.
        queue -- source of products; must provide ``empty()``, ``get()``
            and ``task_done()``.
        parent -- owning object; must expose ``qnam`` (see class notes).
        """
        super(BaseScrapper, self).__init__(parent=parent)
        self.name = name  # holds the scrapper name
        self.searchurl = QtCore.QUrl("http://www.base.com/fsearch.htm")
        self.producturl = QtCore.QUrl()
        self.data = QtCore.QByteArray("")  # page received so far
        self.session = session
        self.queue = queue
        self.product = None
        self.reason = None
        # connecting signals
        self.askwork.connect(self.getwork)
        self.askrequest.connect(self.request)
        self.askproducturl.connect(self.geturl)
        self.asksellernprice.connect(self.getsellernprice)
        self.iamdone.connect(self.startagain)
        self.parent().qnam.finished[
            QtNetwork.QNetworkReply].connect(self.endRequest)

    @QtCore.Slot()
    def start(self):
        """Emit ``askwork`` from the event loop to request work."""
        logger.info("{}: started".format(self.name))
        QtCore.QTimer.singleShot(0, self.askwork.emit)

    @QtCore.Slot()
    def getwork(self):
        """Take the next product from the queue and request its search page.

        Nothing is scheduled when the queue is empty.
        """
        if self.queue.empty():
            # NOTE(review): ``noworkleft`` is declared but deliberately not
            # emitted here (the emit was commented out in the original).
            logger.info("{}: No more work left.".format(self.name))
            return
        logger.info("{}: getting work...".format(self.name))
        self.product = self.queue.get()
        logger.info("{}: work obtained '{}'".format(
            self.name, self.product.name))
        self.reason = self.REASON_SEARCH
        QtCore.QTimer.singleShot(0, self.askrequest.emit)

    @QtCore.Slot()
    def request(self):
        """Send a request to base.com through the parent's manager.

        ``self.reason`` selects the page:
        ``REASON_SEARCH`` (0) requests the search result page,
        ``REASON_PRODUCT`` (1) requests the product page.
        Any other value is logged and no request is sent.
        """
        if self.reason == self.REASON_SEARCH:
            # addQueryItem() appends rather than overwrites, so the
            # previous search term has to be removed first.
            if self.searchurl.hasQueryItem("search"):
                self.searchurl.removeQueryItem("search")
            self.searchurl.addQueryItem("search", self.product.name)
            target = self.searchurl
            logger.info("{}: Querying search for '{}'".format(
                self.name, self.product.name))
        elif self.reason == self.REASON_PRODUCT:
            target = self.producturl
            logger.info("{}: Querying for product page of '{}'".format(
                self.name, self.product.name))
        else:
            logger.error("{}: cannot send a request for reason {!r}".format(
                self.name, self.reason))
            return
        logger.debug("{}: '{}' - {}".format(
            self.name, self.product.name, target))
        myrequest = QtNetwork.QNetworkRequest(target)
        # change the user agent in order to not get caught
        myrequest.setRawHeader("User-Agent", useragents.getnewuseragent())
        # Tag the request so endRequest() can recognise its own replies.
        myrequest.setOriginatingObject(self)
        self.parent().qnam.get(myrequest)

    @QtCore.Slot(QtNetwork.QNetworkReply)
    def endRequest(self, reply):
        """Handle a reply reported by the parent's network manager.

        Replies to requests made by other objects are ignored. A failed
        reply schedules the same request again after ``RETRY_DELAY_MS``;
        a successful one is stored in ``self.data`` and handed over to
        ``geturl`` or ``getsellernprice`` depending on ``self.reason``.
        """
        sender = reply.request().originatingObject()
        if not sender == self:
            return

        if reply.error() != QtNetwork.QNetworkReply.NoError:
            logger.error("{}: {}".format(self.name, reply.error()))
            logger.info("{}: trying again in {} seconds".format(
                self.name, self.RETRY_DELAY_MS // 1000))
            reply.deleteLater()
            # self.reason is left untouched, so the retry asks for the
            # same page. Return here: a failed reply must not also be
            # processed as if it were a result.
            QtCore.QTimer.singleShot(
                self.RETRY_DELAY_MS, self.askrequest.emit)
            return

        if not reply.isFinished():
            return
        logger.info("{}: reply finished for {}".format(
            self.name, reply.url()))
        data = reply.readAll()
        # The payload has been copied out; release the reply whatever
        # happens next (deletion is deferred to the event loop).
        reply.deleteLater()

        # FIXME: Check if reply URL is the same as self.producturl url.
        # as another security check from receiving wrong replies.
        if len(data) and self.data != data:
            self.data += data
            logger.info("recibido: {}. Total: {}".format(
                len(data), len(self.data)))
            self._dumppage()
            if self.reason == self.REASON_SEARCH:
                # process the search result page
                QtCore.QTimer.singleShot(0, self.askproducturl.emit)
            elif self.reason == self.REASON_PRODUCT:
                # process the product page
                QtCore.QTimer.singleShot(0, self.asksellernprice.emit)
        else:
            # Also reached when the payload equals what is already stored.
            # NOTE(review): nothing is scheduled from here, so the
            # scrapper stops working on its queue -- confirm intended.
            logger.error("{}: 0 bytes received ".format(self.name))

    def _dumppage(self):
        """Append the page held in ``self.data`` to coverage/<name>.html."""
        path = "coverage/{}.html".format(
            self.product.name.replace("/", "|"))
        try:
            with open(path, "a") as arch:
                arch.write(str(self.data))
        except (IOError, OSError) as error:
            # Presumably a debugging aid: failing to write it (missing
            # directory, ...) should not abort the scraping itself.
            logger.warning("{}: could not write '{}': {}".format(
                self.name, path, error))

    @QtCore.Slot()
    def geturl(self):
        """Extract the product URL from the search result page received.

        When the URL is found the product page is requested; otherwise
        the product is given up through ``iamdone``.
        """
        page = str(self.data)
        # check if the product does not exist in the first place
        if regexproductnotfound.search(page) is not None:
            logger.error("{}: product don not exist: '{}'".format(
                self.name, self.product.name))
            QtCore.QTimer.singleShot(0, self.iamdone.emit)
            return

        logger.info("{}: We have search results for '{}'".format(
            self.name, self.product.name))
        url = findproducturl(page, self.product.name)
        if url is None:
            # The product may be listed under another name.
            # FIXME: set as 'may exist with other name'
            logger.error("{}: Product '{}' exists with other name".format(
                self.name, self.product.name))
            QtCore.QTimer.singleShot(0, self.iamdone.emit)
            return

        # We have the URL: move on to the product page.
        self.producturl.setUrl(url)
        self.reason = self.REASON_PRODUCT
        self.data.clear()
        QtCore.QTimer.singleShot(0, self.askrequest.emit)

    @QtCore.Slot()
    def getsellernprice(self):
        """Read sellers and prices from the product page received.

        The outcome is only logged; ``iamdone`` is emitted in every case.
        """
        productname, sellernprice = getprices(str(self.data))
        if productname is None:
            logger.error("{}: Page given for '{}' is incomplete".format(
                self.name, self.product.name))
        else:
            logger.info("{}: {} sellers found for product '{}'".format(
                self.name, len(sellernprice), self.product.name))
        logger.info("{}: Is done with product '{}'".format(
            self.name, self.product.name))
        QtCore.QTimer.singleShot(0, self.iamdone.emit)

    @QtCore.Slot()
    def startagain(self):
        """Finish the current product and schedule the next one.

        Connected to ``iamdone``: marks the queue task as done, clears the
        received data, emits ``updateprogressbar`` and calls ``start``
        again after ``RESTART_DELAY_MS``.
        """
        # The message is derived from the delay actually used (the
        # original announced 30 seconds while waiting 1000 ms).
        logger.info("{}: Requesting another product in {} seconds".format(
            self.name, self.RESTART_DELAY_MS // 1000))
        self.queue.task_done()
        self.data.clear()
        self.updateprogressbar.emit()
        QtCore.QTimer.singleShot(self.RESTART_DELAY_MS, self.start)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement