SHARE
TWEET

Event-driven scraper with PySide

shackra Mar 20th, 2014 123 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python2
  2. # coding: utf-8
  3.  
  4. import bs4
  5. import logging
  6. import re
  7. from PySide import QtCore
  8. from PySide import QtNetwork
  9. import useragents
  10.  
  11. extractprice = re.compile(r"[0-9]+(\.|,)[0-9]+")
  12. regexproductnotfound = re.compile(r"There were no results found for")
  13.  
  14. logger = logging.getLogger(__name__)
  15.  
  16. # List of functions to retrieve different data from
  17. # different base.com pages.
  18.  
  19. def getprices(htmldata):
  20.     """ Return a string and a list.
  21.  
  22.    The list of tuple are sellers the price of the product.
  23.    htmldata should be the page of the product, otherwise an exception
  24.    is triggered.
  25.    """
  26.     soup = bs4.BeautifulSoup(htmldata)
  27.     # check if the page giving is a product page
  28.     div = soup.find("div", attrs={"class": "sub-section clearfix"})
  29.     if div is None:
  30.         return None, None
  31.  
  32.     # look for the product name first
  33.     div = soup.find("div", attrs={"class": "sub-section clearfix"})
  34.     productname = unicode(div.find("h1").text)
  35.  
  36.     # look for the sellers and prices
  37.     sellpric = []
  38.     rs = soup.find_all("tr", attrs={"class": re.compile(r"rs-[0-1]")})
  39.     if rs:
  40.         for rsx in rs:
  41.             # Is it new?
  42.             if not rsx.find("td", text="New"):
  43.                 continue
  44.             else:
  45.                 # Yes, it is!
  46.                 price = rsx.find("span", attrs={"class": "seller-price"}).text
  47.                 price = float(extractprice.search(price).group())
  48.                 seller = rsx.find(
  49.                     "p", attrs={"class": "seller-details"}).find("a").text
  50.                 sellpric.append((seller, price))
  51.     else:
  52.         # getting the main price
  53.         price = soup.find("span", attrs={"class": "price"}).text
  54.         price = float(extractprice.search(price).group())
  55.         # Because there is no other seller than the website
  56.         seller = "base.com"
  57.         sellpric.append((seller, price))
  58.  
  59.     return productname, sellpric
  60.  
  61.  
  62. def findproducturl(htmldata, productname):
  63.     """ From a search result page looks for the product url
  64.    """
  65.     soup = bs4.BeautifulSoup(htmldata)
  66.     a = soup.find("a", attrs={"title": productname})
  67.  
  68.     if a is None:
  69.         return None
  70.     else:
  71.         a = u"http://www.base.com" + a["href"]
  72.         return a
  73.  
  74.  
  75. class BaseScrapper(QtCore.QObject):
  76.  
  77.     """ Scrap information from one product at the time
  78.    
  79.    There might be a better way of handling request reason
  80.    instead of using self.reason check http://ur1.ca/gw6g8
  81.    """
  82.  
  83.     # Signals
  84.     askwork = QtCore.Signal()
  85.     askrequest = QtCore.Signal()
  86.     askproducturl = QtCore.Signal()
  87.     asksellernprice = QtCore.Signal()
  88.     updateprogressbar = QtCore.Signal()
  89.     reason = QtCore.Signal()
  90.     noworkleft = QtCore.Signal(object)
  91.     iamdone = QtCore.Signal()
  92.  
  93.     def __repr__(self):
  94.         try:
  95.             return "<BaseScrapper(name: {}, product: {}, phase: {})>".format(
  96.                 self.name, self.product.name, self.reason)
  97.         except:
  98.             return "<BaseScrapper(name: {}, product: {}, phase: {})>".format(
  99.                 self.name, None, self.reason)
  100.  
  101.     def __init__(self, name, session, queue, parent=None):
  102.         super(BaseScrapper, self).__init__(parent=parent)
  103.         self.name = name  # holds the scrapper name
  104.         self.searchurl = QtCore.QUrl("http://www.base.com/fsearch.htm")
  105.         self.producturl = QtCore.QUrl()
  106.  
  107.         self.data = QtCore.QByteArray("")
  108.         self.session = session
  109.         self.queue = queue
  110.         self.product = None
  111.         self.reason = None
  112.  
  113.         # connecting signals
  114.         self.askwork.connect(self.getwork)
  115.         self.askrequest.connect(self.request)
  116.         self.askproducturl.connect(self.geturl)
  117.         self.asksellernprice.connect(self.getsellernprice)
  118.         self.iamdone.connect(self.startagain)
  119.         self.parent().qnam.finished[
  120.             QtNetwork.QNetworkReply].connect(self.endRequest)
  121.  
  122.     @QtCore.Slot()
  123.     def start(self):
  124.         """ emit a signal to request for work on the queue
  125.        """
  126.         logger.info("{}: started".format(self.name))
  127.         # from pudb import set_trace
  128.         # set_trace()
  129.         QtCore.QTimer.singleShot(0, self.askwork.emit)
  130.  
  131.     @QtCore.Slot()
  132.     def getwork(self):
  133.         """ gather work from the queue
  134.        """
  135.         if self.queue.empty():
  136.             # emit a signal to say that there is no more work
  137.             # for this scrapper on the queue.
  138.             logger.info("{}: No more work left.".format(self.name))
  139.             # self.noworkleft.emit(self)
  140.         else:
  141.             # there is work to do.
  142.             logger.info("{}: getting work...".format(self.name))
  143.             self.product = self.queue.get()
  144.             logger.info("{}: work obtained '{}'".format(
  145.                 self.name, self.product.name))
  146.             self.reason = 0
  147.             QtCore.QTimer.singleShot(0, self.askrequest.emit)
  148.  
  149.     @QtCore.Slot()
  150.     def request(self):
  151.         """ make a request and send to base.com.
  152.        reason specifies the reason for the request.
  153.        0 means request search result page
  154.        1 means request product page
  155.        """
  156.         if self.reason == 0:
  157.             # search for the product
  158.             if self.searchurl.hasQueryItem("search"):
  159.                 # The query is not overwritten but rather append
  160.                 self.searchurl.removeQueryItem("search")
  161.  
  162.             self.searchurl.addQueryItem("search", self.product.name)
  163.             myrequest = QtNetwork.QNetworkRequest(self.searchurl)
  164.             logger.info("{}: Querying search for '{}'".format(
  165.                 self.name, self.product.name))
  166.             logger.debug("{}: '{}' - {}".format(
  167.                 self.name, self.product.name, self.searchurl))
  168.         elif self.reason == 1:
  169.             # asking for the product page
  170.             myrequest = QtNetwork.QNetworkRequest(self.producturl)
  171.             logger.info("{}: Querying for product page of '{}'".format(
  172.                 self.name, self.product.name))
  173.             logger.debug("{}: '{}' - {}".format(
  174.                 self.name, self.product.name, self.producturl))
  175.  
  176.         # change the user agent in order to not get caught
  177.         myrequest.setRawHeader("User-Agent", useragents.getnewuseragent())
  178.         # this object is the sender of this request
  179.         myrequest.setOriginatingObject(self)
  180.         self.parent().qnam.get(myrequest)
  181.         # self.reply.readyRead.connect(self.replyReadData)
  182.         # self.reply.finished.connect(self.replyFinished)
  183.  
  184.     @QtCore.Slot()
  185.     def endRequest(self, reply):
  186.         """ receive the reply send by the parent QNetworkAccessManager
  187.        """
  188.         objrec = reply.request().originatingObject()
  189.         if objrec == self:
  190.             if reply.error() != QtNetwork.QNetworkReply.NoError:
  191.                 logger.error("{}: {}".format(self.name, reply.error()))
  192.                 logger.info("{}: trying again in 30 seconds".format(self.name))
  193.                 # delete the reply
  194.                 reply.deleteLater()
  195.                 # FIXME: by which reason, 0 or 1?
  196.                 QtCore.QTimer.singleShot(30000, self.askrequest.emit)
  197.             if reply.isFinished():
  198.                 logger.info("{}: reply finished for {}".format(self.name,
  199.                                                                reply.url()))
  200.                 data = reply.readAll()
  201.                 # we received some output AND the data isn't the same?
  202.                 # FIXME: Check if reply URL is the same as self.producturl url.
  203.                 # as another security check from receiving wrong replies.
  204.                 if len(data) and self.data != data:
  205.                     self.data += data
  206.                     logger.info("recibido: {}. Total: {}".format(
  207.                         len(data), len(self.data)))
  208.                     with open("coverage/{}.html".format(
  209.                             self.product.name.replace("/", "|")), "a") as arch:
  210.                         arch.write(str(self.data))
  211.                     # delete the reply
  212.                     QtCore.QTimer.singleShot(0, reply.deleteLater)
  213.                     # What method to call and process the page?
  214.                     if self.reason == 0:
  215.                         # process the search result page
  216.                         QtCore.QTimer.singleShot(0, self.askproducturl.emit)
  217.                     elif self.reason == 1:
  218.                         # process the product page
  219.                         QtCore.QTimer.singleShot(0, self.asksellernprice.emit)
  220.                 else:
  221.                     logger.error("{}: 0 bytes received ")
  222.  
  223.     @QtCore.Slot()
  224.     def geturl(self):
  225.         """ extract the url from a search result page
  226.        """
  227.         # check if the product do not exists in first place
  228.         regexresult = regexproductnotfound.search(str(self.data))
  229.         if regexresult is None:
  230.             logger.info("{}: We have search results for '{}'".format(
  231.                 self.name, self.product.name))
  232.             url = findproducturl(str(self.data), self.product.name)
  233.             if url is None:
  234.                 # el producto puede estar con otro nombre
  235.                 # FIXME: set as 'may exist with other name'
  236.                 logger.error("{}: Product '{}' exists with other name".format(
  237.                     self.name, self.product.name))
  238.                 QtCore.QTimer.singleShot(0, self.iamdone.emit)
  239.             else:
  240.                 # tenemos la URL!
  241.                 self.producturl.setUrl(url)
  242.                 self.reason = 1
  243.                 self.data.clear()
  244.                 QtCore.QTimer.singleShot(0, self.askrequest.emit)
  245.         else:
  246.             # This product really don't exists!
  247.             logger.error("{}: product don not exist: '{}'".format(
  248.                 self.name, self.product.name))
  249.             QtCore.QTimer.singleShot(0, self.iamdone.emit)
  250.  
  251.     @QtCore.Slot()
  252.     def getsellernprice(self):
  253.         """ From the product page get the seller and price
  254.        for a given product
  255.        """
  256.         productname, sellernprice = getprices(str(self.data))
  257.         if productname is None:
  258.             logger.error("{}: Page given for '{}' is incomplete".format(
  259.                 self.name, self.product.name))
  260.         else:
  261.             logger.info("{}: {} sellers found for product '{}'".format(
  262.                 self.name, len(sellernprice), self.product.name))
  263.  
  264.         logger.info("{}: Is done with product '{}'".format(
  265.             self.name, self.product.name))
  266.         QtCore.QTimer.singleShot(0, self.iamdone.emit)
  267.  
  268.     @QtCore.Slot()
  269.     def startagain(self):
  270.         """ Emit a signal in 30 seconds
  271.        this slot should be connected to the finished signal.
  272.        """
  273.         logger.info("{}: Requesting another product in 30 seconds".format(
  274.             self.name))
  275.         self.queue.task_done()
  276.         self.data.clear()
  277.         self.updateprogressbar.emit()
  278.         QtCore.QTimer.singleShot(1000, self.start)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top