Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import csv, sys, socket, threading, logging, hashlib, urllib2, MySQLdb
- from Queue import Queue
- from time import time
- from pybing.query import WebQuery
- from SimpleXMLRPCServer import SimpleXMLRPCServer
__author__ = 'neanton'
__email__ = 'neanton@gmail.com'

# --- XML-RPC endpoint this service listens on -------------------------------
XML_RPC_SERVER_HOST = 'localhost'
XML_RPC_SERVER_PORT = 9890

# --- MySQL connection settings used by the writer thread --------------------
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a config file that is kept out of version control.
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASS = 'root'
MYSQL_DATABASE = 'peoop'

# --- Crawling limits ---------------------------------------------------------
# File extensions meant to be excluded from crawling.
# NOTE(review): nothing in the visible code reads this tuple yet.
EXTENSION_BLACKLIST = (
    '.pdf', '.torrent',
    '.jpg', '.jpeg', '.png', '.bmp', '.gif',
    '.doc', '.docx', '.xls', '.xlsx',
)
# Seconds allowed per request; also installed below as the default timeout
# for every socket created by this process.
TIMEOUT = 3.0
# Number of downloader threads started at import time.
MAX_THREADS = 100

# --- Logging -----------------------------------------------------------------
# The thread name is part of every record so worker output can be told apart.
LOG_FORMAT = '[%(asctime)-15s] %(threadName)s: %(message)s'

# --- Bing search API ---------------------------------------------------------
# Put your Bing AppId here.
API_KEY = 'ACE47D6F867C923481DA0E35851962A6652C63F5'

# --- One-time process setup --------------------------------------------------
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)
socket.setdefaulttimeout(TIMEOUT)
logging.debug('Parser started')
class TaskDone:
    """Sentinel: a worker that finds this class in the URL slot of a queue item exits."""
- #def handleTimeout(qIn, i):
- # logging.debug('Timeout in thread: %d' % i)
- # qIn.task_done()
def worker(i):
    """Download queued URLs forever and pass each fetched page to the writer.

    Every item taken from ``qIn`` is a ``(keyword_id, url)`` pair. The page is
    requested with a browser-like User-Agent; on success the tuple
    ``(keyword_id, url, data)`` is put on ``qOut``. A failed download is
    logged and skipped so that one bad URL never stops the thread.

    Receiving ``TaskDone`` in the URL slot terminates this thread.

    :param i: index of this worker. The loop does not use it; the thread name
        assigned at creation already identifies the worker in log output.
    """
    logging.debug('Thread started')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0'
    }
    while True:
        (keyword_id, url) = qIn.get()
        # Identity check: TaskDone is a sentinel class, not a value.
        if url is TaskDone:
            sys.exit(0)
        logging.debug('Processing URL: ' + url)
        req = urllib2.Request(url, headers=headers)
        try:
            response = urllib2.urlopen(req, timeout=TIMEOUT)
            try:
                data = response.read()
            finally:
                # Always release the socket, even when read() times out.
                response.close()
        except urllib2.HTTPError as e:
            logging.debug('HTTPError: %d' % e.code)
        except urllib2.URLError as e:
            # Wrap in a tuple so a tuple-valued reason cannot break formatting.
            logging.debug('URLError: %s' % (e.reason,))
        except Exception as e:
            # Timeouts, connection resets, malformed URLs, ...: report them
            # instead of dropping the URL without a trace.
            logging.debug('Failed to fetch %s: %r' % (url, e))
        else:
            qOut.put((keyword_id, url, data))
def writer():
    """Persist fetched pages from ``qOut`` into the ``search_result`` table.

    Opens one MySQL connection using the module-level ``MYSQL_*`` settings and
    then loops forever: each ``(keyword_id, url, data)`` tuple taken from
    ``qOut`` is inserted with a parameterized query and committed on its own.

    A database error on one row is logged and rolled back, and the loop
    continues with the next row. Without this, a single failing insert would
    end the only writer thread and every later result would sit in ``qOut``
    unwritten.
    """
    logging.debug('Thread started')
    connection = MySQLdb.connect(host=MYSQL_HOST, port=MYSQL_PORT, user=MYSQL_USER,
                                 passwd=MYSQL_PASS, db=MYSQL_DATABASE)
    try:
        cursor = connection.cursor()
        while True:
            (keyword_id, url, data) = qOut.get()
            logging.debug(url)
            try:
                cursor.execute(
                    """INSERT INTO search_result (keyword_id, url, data) VALUES(%s, %s, %s)""",
                    (keyword_id, url, data))
                connection.commit()
            except MySQLdb.Error as e:
                logging.debug('MySQL error while saving %s: %r' % (url, e))
                try:
                    # Discard the failed statement so the next row starts clean.
                    connection.rollback()
                except MySQLdb.Error as rollback_error:
                    logging.debug('MySQL rollback failed: %r' % (rollback_error,))
    finally:
        # Reached only if the loop is left by an unexpected exception.
        connection.close()
# qIn carries (keyword_id, url) jobs to the workers; qOut carries
# (keyword_id, url, data) results from the workers to the writer.
qIn = Queue()
qOut = Queue()


def _start_daemon(target, name, args=()):
    """Create a daemon thread running ``target(*args)``, start it and return it."""
    thread = threading.Thread(target=target, args=args, name=name)
    thread.daemon = True
    thread.start()
    return thread


# Downloader pool.
for i in range(MAX_THREADS):
    t = _start_daemon(worker, "Worker-%d" % i, (i,))

# Single database writer.
t = _start_daemon(writer, "Writer")

logging.debug('Threads started')
def add_keyword(id, kw):
    """XML-RPC entry point, registered under the name ``addKeyword``.

    Runs a Bing web query for ``kw`` and queues up to the first 100 result
    URLs on ``qIn``, each tagged with ``id``, for the worker threads.

    :param id: identifier stored alongside every URL found for this keyword.
    :param kw: search phrase sent to Bing.
    :returns: the ``(id, kw)`` pair that was passed in.
    """
    started_at = time()
    logging.debug('Bing Parsing, KW: %s' % kw)

    hits = WebQuery(API_KEY, kw).execute()[:100]

    queued = 0
    for queued, hit in enumerate(hits, 1):
        qIn.put((id, hit.url))

    logging.debug('Bing parsed, found: %d' % queued)
    logging.debug('Parser finished, runtime: %s' % (time() - started_at))
    return id, kw
# Publish add_keyword over XML-RPC as 'addKeyword' and serve requests until
# the process is stopped.
bind_address = (XML_RPC_SERVER_HOST, XML_RPC_SERVER_PORT)
logging.debug('Starting XML-RPC Server "%s" on port: %d' % bind_address)

server = SimpleXMLRPCServer(bind_address)
server.register_function(add_keyword, name='addKeyword')
server.serve_forever()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement