# config
# Prefix of the listing pages; the page number is appended to it.
base_url = 'http://www.islamhouse.com/pg/9892/articles/'
# Directory prefix that downloaded files are written under (keeps its
# trailing separator because file names are concatenated directly).
save_dir = 'd:\\islamhouse\\'
# Listing page numbers to crawl: 1 through 12 inclusive.
from_to_url = range(1, 13)
# Number of worker threads in the download pool.
num_threads = 2
from Queue import Queue
from threading import Thread, Lock
class Worker(Thread):
    """Daemon thread that endlessly runs tasks pulled from a shared queue.

    Each queue item is a ``(func, args, kargs)`` triple; the worker calls
    ``func(*args, **kargs)`` and then marks the item done.
    """

    def __init__(self, tasks):
        """Bind the worker to *tasks* and start it immediately."""
        Thread.__init__(self)
        # Daemon mode: a worker blocked on the queue never keeps the
        # interpreter alive on its own.
        self.daemon = True
        self.tasks = tasks
        self.start()

    def run(self):
        """Consume and execute tasks forever."""
        while True:
            job, positional, keyword = self.tasks.get()
            try:
                job(*positional, **keyword)
            except Exception as error:
                # A failing task is reported on stdout and otherwise ignored
                # so the worker keeps serving the queue.
                print(error)
            finally:
                # Always acknowledge the item, otherwise Queue.join() would
                # wait forever after a failed task.
                self.tasks.task_done()
class ThreadPool:
    """Fixed set of Worker threads fed through one bounded task queue."""

    def __init__(self, num_threads, backlog = 0):
        """Create the queue and spawn *num_threads* workers.

        *backlog* is the queue capacity; a value of 0, or one that does not
        exceed *num_threads*, is replaced by *num_threads*.
        """
        use_thread_count = backlog == 0 or backlog <= num_threads
        capacity = num_threads if use_thread_count else backlog
        self.tasks = Queue(capacity)
        for _ in range(num_threads):
            # Workers start themselves on construction; no reference is kept.
            Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        """Queue ``func(*args, **kargs)`` for execution by a worker."""
        job = (func, args, kargs)
        self.tasks.put(job)

    def wait_completion(self):
        """Block until every queued task has been marked done."""
        self.tasks.join()
import re
from urllib2 import urlopen
from urllib import urlretrieve
# Serialises writes to stdout between the main thread and the workers.
mutex_stdout = Lock()
print('compiling regex pattern')
# Listing-page entry: captures (title attribute, href, anchor text).
# DOTALL lets a capture span line breaks.
page_pattern = re.compile(
    ';" title="(.*?)" href="(.*?)">(.*?)</a></div>',
    re.DOTALL
)
# Article-page link: captures the href that directly precedes the icon <img>.
link_pattern = re.compile(
    ' href="(.*?)"><img dir="ltr" hspace="5"'
)
def extractlink(page):
    """Return the ``.pdf`` links found in the document at URL *page*.

    The document is fetched with ``urlopen`` and scanned with the
    module-level ``link_pattern``; captured hrefs that end in ``.pdf``
    (case-sensitive) are returned as a list, in document order.

    Any exception raised by ``urlopen`` or ``read`` propagates to the caller.
    """
    from contextlib import closing

    # urllib2 responses are not context managers, so closing() is used to
    # release the connection even when read() fails part-way.
    with closing(urlopen(page)) as response:
        raw_source = response.read()
    return [link for link in link_pattern.findall(raw_source)
            if link.endswith('.pdf')]
def loadfile(link, file_name = ''):
    """Download *link* into ``save_dir``.

    link      -- URL of the file to retrieve.
    file_name -- UTF-8 encoded name for the saved file; when empty, the
                 last ``/``-separated segment of *link* is used instead.

    The name is decoded from UTF-8 and stripped of the characters
    ``" * : < > ? \\ / |`` before being appended to ``save_dir``.
    Exceptions from decoding, printing or ``urlretrieve`` propagate.
    """
    if file_name == '':
        filename = link.split('/')[-1]
    else:
        filename = file_name
    filename = filename.decode('utf-8')
    # Presumably removed because they are not allowed in Windows file names
    # (save_dir is a drive path) -- confirm if the target platform changes.
    for forbidden in '"*:<>?\\/|':
        filename = filename.replace(forbidden, '')
    # The with-block guarantees the lock is released even if print raises
    # (e.g. the console cannot encode the name); a bare acquire/release
    # pair would leave every other thread blocked on the lock.
    with mutex_stdout:
        print('loading... ' + filename)
    urlretrieve(link, save_dir + filename)
def getfile(url):
    """Download every PDF linked from one listing entry.

    *url* is an indexable record: item 1 is the page address passed to
    ``extractlink``; item 2 is text that, stripped of surrounding whitespace
    and suffixed with ``.pdf``, names the saved file.
    """
    # NOTE(review): every link from the same page is saved under the same
    # name, so a later one would replace an earlier one -- confirm pages
    # carry a single PDF.
    for pdf_link in extractlink(url[1]):
        target_name = url[2].strip() + '.pdf'
        loadfile(pdf_link, target_name)
if __name__ == '__main__':
    # Script entry point: walk the configured listing pages, queue one
    # download task per entry found, then wait for the pool to drain.
    pool = ThreadPool(num_threads)
    for i in from_to_url:
        i = str(i)
        # The lock is held only around each print. The earlier code called
        # acquire() a second time where release() was intended, which blocks
        # the main thread forever on the non-reentrant lock.
        with mutex_stdout:
            print('fetching source... #' + i)
        raw_source = urlopen(base_url + i).read()
        for url in page_pattern.findall(raw_source):
            # add_task blocks while the bounded queue is full, which
            # throttles page parsing to the download rate.
            pool.add_task(getfile, url)
        with mutex_stdout:
            print('finish parsing source... #' + i)
    pool.wait_completion()
    print('##### finish #####')