Don't like ads? PRO users don't see any ads ;-)

Untitled

By: hiddenmin on Apr 24th, 2012  |  syntax: Python  |  size: 2.95 KB  |  hits: 42  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. # config
  2. base_url = 'http://www.islamhouse.com/pg/9892/articles/'
  3. save_dir = 'd:\\islamhouse\\'
  4. from_to_url = range(1,12 + 1)
  5. num_threads = 2
  6.  
  7. from Queue import Queue
  8. from threading import Thread, Lock
  9.  
  10. class Worker(Thread):
  11.     """Thread executing tasks from a given tasks queue"""
  12.     def __init__(self, tasks):
  13.         Thread.__init__(self)
  14.         self.tasks = tasks
  15.         self.daemon = True
  16.         self.start()
  17.  
  18.     def run(self):
  19.         while True:
  20.             func, args, kargs = self.tasks.get()
  21.             try:
  22.                 func(*args, **kargs)
  23.             except Exception, e:
  24.                 print e
  25.             finally:
  26.                 self.tasks.task_done()
  27.  
  28. class ThreadPool:
  29.     """Pool of threads consuming tasks from a queue"""
  30.     def __init__(self, num_threads, backlog = 0):
  31.         if backlog == 0 or backlog <= num_threads:
  32.             backlog = num_threads
  33.         self.tasks = Queue(backlog)
  34.         for _ in range(num_threads): Worker(self.tasks)
  35.  
  36.     def add_task(self, func, *args, **kargs):
  37.         """Add a task to the queue"""
  38.         self.tasks.put((func, args, kargs))
  39.  
  40.     def wait_completion(self):
  41.         """Wait for completion of all the tasks in the queue"""
  42.         self.tasks.join()
  43.  
  44.  
  45. import re
  46. from urllib2 import urlopen
  47. from urllib import urlretrieve
  48.  
  49. print 'compiling regex pattern'
  50. page_pattern = re.compile(';" title="(.*?)" href="(.*?)">(.*?)</a></div>', re.DOTALL)
  51. link_pattern = re.compile(' href="(.*?)"><img dir="ltr" hspace="5"')
  52.  
  53. mutex_stdout = Lock()
  54.  
  55. def extractlink(page):
  56.     links = list()
  57.     raw_source = urlopen(page).read()
  58.     for link in re.findall(link_pattern, raw_source):
  59.         if link[len(link)-4:] == '.pdf':
  60.             links.append(link)
  61.     return links
  62.  
  63.  
  64. def loadfile(link, file_name = ''):
  65.     if file_name == '':
  66.         filename = link.split('/')[-1]
  67.     else:
  68.         filename = file_name
  69.     filename = filename.decode('utf-8')
  70.     punc_char = '"*:<>?\/|'
  71.     for c in punc_char:
  72.         filename = filename.replace(c,'')
  73.        
  74.     mutex_stdout.acquire()
  75.     print 'loading...', filename
  76.     mutex_stdout.release()
  77.     urlretrieve(link, save_dir + filename)
  78. ##    mutex_stdout.acquire()
  79. ##    print 'finish....', filename
  80. ##    mutex_stdout.release()
  81.  
  82.  
  83. def getfile(url):    
  84.     links = extractlink(url[1])
  85.     for link in links:
  86.         loadfile(link,url[2].strip()+'.pdf')
  87.  
  88.  
  89. if __name__ == '__main__':
  90.  
  91.     pool = ThreadPool(num_threads)
  92.    
  93.     for i in from_to_url:
  94.         i = str(i)
  95.         mutex_stdout.acquire()
  96.         print 'fetching source... #'+i
  97.         mutex_stdout.acquire()
  98.         raw_source = urlopen(base_url + i).read()
  99.        
  100.         for url in re.findall(page_pattern, raw_source):
  101.             pool.add_task(getfile, url)
  102.            
  103.         mutex_stdout.acquire()
  104.         print 'finish parsing source... #'+i
  105.         mutex_stdout.acquire()
  106.     pool.wait_completion()
  107.     print '##### finish #####'