Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #@author: SL
- #@license: GPLv3
- # http client
- import urllib2
- # cookie handler
- import cookielib
- # time module for performance metrics
- import time
- # re module for simple replace
- import re
- # syncrinyzed queue
- from Queue import Queue
- # thread
- from threading import Thread
- # single thread worker
class Worker(Thread):
    """Daemon thread that consumes and executes tasks from a shared queue.

    Each queue item is a (func, args, kwargs) tuple; the worker calls
    func(*args, **kwargs) and acknowledges the item via task_done() so
    that Queue.join() can track completion.
    """

    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        # Daemon: the worker must not keep the process alive once the
        # main thread exits.
        self.daemon = True
        self.start()

    def run(self):
        while True:
            func, args, kwargs = self.tasks.get()
            try:
                func(*args, **kwargs)
            except Exception as e:  # was Py2-only `except Exception, e`
                # A failing task must not kill the worker thread.
                print(e)
            finally:
                # Always acknowledge the task -- even if something other
                # than Exception propagates -- so Queue.join() cannot
                # deadlock waiting for a task_done() that never comes.
                self.tasks.task_done()
class ThreadPool:
    """A fixed-size pool of daemon Worker threads fed from one queue."""

    def __init__(self, num_threads):
        # The queue is bounded at num_threads, so add_task() blocks once
        # every worker is busy and the buffer is full (back-pressure).
        self.tasks = Queue(num_threads)
        for _ in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, *args, **kwargs):
        """Schedule func(*args, **kwargs) to run on a pool thread."""
        self.tasks.put((func, args, kwargs))

    def wait_completion(self):
        """Block until every queued task has been processed."""
        self.tasks.join()
def getUrl(url):
    """Download *url* and save the response body to '<mangled-url>.txt'.

    The file name is derived from the URL with '?', '=' and '/' each
    replaced by '_', and the mangled 7-character scheme prefix
    ('http:__') dropped.
    """
    # Replace separator symbols so the URL can serve as a file name.
    fname = re.sub(r"[?=/]", "_", url)
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; WindowsNT)"}
    try:
        # BUG FIX: the User-Agent headers dict was previously built but
        # never attached to the request.
        request = urllib2.Request(url, headers=headers)
        # Use the opener directly instead of install_opener(): mutating
        # the process-global opener from many worker threads is a race.
        site = opener.open(request)
        data = site.read()
        site.close()
    except urllib2.HTTPError as e:
        print(e.code)
        return
    except urllib2.URLError as e:
        # DNS failures, refused connections, etc. -- previously uncaught
        # despite the intent to tolerate URLs that "don't work".
        print(e.reason)
        return
    # Write only after a successful fetch: the old code opened the file
    # before urlopen(), leaking the handle and leaving an empty file
    # behind whenever the download failed.
    out = open(fname[7:] + ".txt", "w")
    try:
        out.write(data)
    finally:
        out.close()
# 1) Start the clock and init a thread pool with the desired worker count.
start = time.time()
pool = ThreadPool(20)

# 2) Queue one download task per line of input.txt (one URL per line).
inp = open("input.txt", "r")
try:
    lines = inp.readlines()
finally:
    # Close the input file even if readlines() fails (was never closed).
    inp.close()

for line in lines:
    # rstrip the newline instead of line[:-1]: the old slice chopped the
    # last character of the URL when the file had no trailing newline.
    url = line.rstrip("\r\n")
    if url:  # skip blank lines rather than queueing an empty fetch
        pool.add_task(getUrl, url)

# 3) Wait for completion BEFORE taking the end timestamp.
# BUG FIX: `end` was previously sampled before wait_completion(), so the
# reported elapsed time excluded all of the actual download work.
pool.wait_completion()
end = time.time()
print("Elapsed Time: %s" % (end - start))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement