Advertisement
Guest User

Untitled

a guest
Oct 17th, 2017
395
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.74 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. import csv, sys, socket, threading, logging, hashlib, urllib2, MySQLdb
  5. from Queue import Queue
  6. from time import time
  7. from pybing.query import WebQuery
  8. from SimpleXMLRPCServer import SimpleXMLRPCServer
  9.  
# Module authorship metadata.
__author__ = 'neanton'
__email__ = 'neanton@gmail.com'

# Host name the XML-RPC server binds to.
XML_RPC_SERVER_HOST = 'localhost'
# TCP port the XML-RPC server listens on.
XML_RPC_SERVER_PORT = 9890

# MySQL connection settings used by the writer thread.
# NOTE(review): credentials are hard-coded in the source; consider moving
# them to configuration outside the repository.
MYSQL_HOST = 'localhost'
MYSQL_USER = 'root'
MYSQL_PASS = 'root'
MYSQL_PORT = 3306
MYSQL_DATABASE = 'peoop'

# File extensions meant to be blacklisted.
# NOTE(review): this tuple is not referenced anywhere in the visible code,
# so no URL is actually filtered by extension.
EXTENSION_BLACKLIST = ('.pdf', '.torrent', '.jpg', '.jpeg', '.png', '.bmp', '.gif', '.doc', '.docx', '.xls', '.xlsx')

# Timeout in seconds; used both as the process-wide default socket timeout
# and as the explicit timeout of every urlopen() call in the workers.
TIMEOUT = 3.0
# Number of downloader (worker) threads started at import time.
MAX_THREADS = 100
# Log line layout: timestamp, thread name, message.
LOG_FORMAT = '[%(asctime)-15s] %(threadName)s: %(message)s'
# Bing AppId passed to WebQuery. Put your own key here.
# NOTE(review): an API key is committed in the source.
API_KEY = 'ACE47D6F867C923481DA0E35851962A6652C63F5'

# Everything in this script is logged at DEBUG level through the root logger.
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)

# Applies to every socket created afterwards that does not set its own timeout.
socket.setdefaulttimeout(TIMEOUT)

logging.debug('Parser started')
  42.  
  43. class TaskDone:
  44.     pass
  45.  
  46.  
  47. #def handleTimeout(qIn, i):
  48. #    logging.debug('Timeout in thread: %d' % i)
  49. #    qIn.task_done()
  50.  
  51. def worker(i):
  52.     logging.debug('Thread started')
  53.     while True:
  54.         (id, url) = qIn.get()
  55.         if url == TaskDone:
  56.             sys.exit(0)
  57.  
  58.         logging.debug('Processing URL: ' + url)
  59.         req = urllib2.Request(url, headers={
  60.             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0'
  61.         })
  62.         #timer = threading.Timer(TIMEOUT, handleTimeout, [qIn, i])
  63.         #timer.start()
  64.         try:
  65.             data = urllib2.urlopen(req, timeout=TIMEOUT).read()
  66.         except urllib2.HTTPError, e:
  67.             logging.debug('HTTPError: %d' % e.code)
  68.         except urllib2.URLError, e:
  69.             logging.debug('URLError: %s' % e.reason)
  70.         except Exception:
  71.             pass
  72.         else:
  73. #            fileName = hashlib.md5()
  74. #            fileName.update(url)
  75. #
  76. #            fo = open('./data/' + fileName.hexdigest() + '.html', 'w')
  77. #            fo.write(data)
  78. #            fo.close()
  79.             qOut.put((id, url, data))
  80.  
  81.         #timer.cancel()
  82.         #qIn.task_done()
  83.  
  84. def writer():
  85.     logging.debug('Thread started')
  86.     connection = MySQLdb.connect(host=MYSQL_HOST, port=MYSQL_PORT, user=MYSQL_USER, passwd=MYSQL_PASS, db=MYSQL_DATABASE)
  87.     cursor = connection.cursor()
  88.  
  89.     while True:
  90.         (id, url, data) = qOut.get()
  91.         logging.debug(url)
  92.         cursor.execute("""INSERT INTO search_result (keyword_id, url, data) VALUES(%s, %s, %s)""", (id, url, data))
  93.         connection.commit()
  94.  
  95. qIn = Queue()
  96. qOut = Queue()
  97.  
  98. #starting parser threads
  99. for i in range(MAX_THREADS):
  100.     t = threading.Thread(target=worker, args=(i,), name="Worker-%d" % i)
  101.     t.daemon = True
  102.     t.start()
  103.  
  104. #starting writer thread
  105. t = threading.Thread(target=writer, name="Writer")
  106. t.daemon = True
  107. t.start()
  108.  
  109. logging.debug('Threads started')
  110.  
  111. def add_keyword(id, kw):
  112.     """
  113.    XML-RPC function - addKeyword
  114.    """
  115.     start_time = time()
  116.     total_urls = 0
  117.  
  118.     logging.debug('Bing Parsing, KW: %s' % kw)
  119.  
  120.     query = WebQuery(API_KEY, kw)
  121.     results = query.execute()
  122.  
  123.     for result in results[:100]:
  124.         total_urls += 1
  125.         qIn.put((id, result.url))
  126.  
  127.     logging.debug('Bing parsed, found: %d' % total_urls)
  128.     logging.debug('Parser finished, runtime: %s' % (time() - start_time))
  129.  
  130.     return id, kw
  131.  
  132. logging.debug('Starting XML-RPC Server "%s" on port: %d' % (XML_RPC_SERVER_HOST, XML_RPC_SERVER_PORT))
  133. server = SimpleXMLRPCServer((XML_RPC_SERVER_HOST, XML_RPC_SERVER_PORT))
  134. server.register_function(add_keyword, 'addKeyword')
  135. server.serve_forever()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement