import urllib2
import threading
from Queue import Queue
import sys, os, re


class ThreadedDownload(object):
    # Downloads a list of URLs to a destination directory using a pool of
    # worker threads, retrying each URL up to url_tries times.

    REGEX = {
        'hostname_strip': re.compile(r'.*\..*?/', re.I)
    }

    class MissingDirectoryException(Exception):
        pass

    class Downloader(threading.Thread):
        # Worker thread: pulls URLTargets off the shared queue until it is empty.
        def __init__(self, queue, report):
            threading.Thread.__init__(self)
            self.queue = queue
            self.report = report

        def run(self):
            while self.queue.empty() == False:
                url = self.queue.get()

                response = url.download()
                if response == False and url.url_tried < url.url_tries:
                    # Failed but retries remain: put it back on the queue
                    self.queue.put(url)
                elif response == False and url.url_tried == url.url_tries:
                    self.report['failure'].append(url)
                elif response == True:
                    self.report['success'].append(url)

                self.queue.task_done()

    class URLTarget(object):
        # A single URL to fetch, with retry bookkeeping and the last error seen.
        def __init__(self, url, destination, url_tries):
            self.url = url
            self.destination = destination
            self.url_tries = url_tries
            self.url_tried = 0
            self.success = False
            self.error = None

        def download(self):
            self.url_tried = self.url_tried + 1

            try:
                if os.path.exists(self.destination):  # This file has already been downloaded
                    self.success = True
                    return self.success

                remote_file = urllib2.urlopen(self.url)
                package = remote_file.read()
                remote_file.close()

                if os.path.exists(os.path.dirname(self.destination)) == False:
                    os.makedirs(os.path.dirname(self.destination))

                dest_file = open(self.destination, 'wb')
                dest_file.write(package)
                dest_file.close()

                self.success = True

            except Exception, e:
                self.error = e

            return self.success

        def __str__(self):
            return 'URLTarget (%(url)s, %(success)s, %(error)s)' % {'url': self.url, 'success': self.success, 'error': self.error}

    def __init__(self, urls=[], destination='.', directory_structure=False, thread_count=5, url_tries=3):
        if os.path.exists(destination) == False:
            raise ThreadedDownload.MissingDirectoryException('Destination folder does not exist.')

        self.queue = Queue(0)  # Infinite sized queue
        self.report = {'success': [], 'failure': []}
        self.threads = []

        if destination[-1] != os.path.sep:
            destination = destination + os.path.sep
        self.destination = destination
        self.thread_count = thread_count
        self.directory_structure = directory_structure

        # Prepopulate queue with any values we were given
        for url in urls:
            self.queue.put(ThreadedDownload.URLTarget(url, self.fileDestination(url), url_tries))

    def fileDestination(self, url):
        # Map a URL to a local file path according to directory_structure.
        if self.directory_structure == False:
            # No directory structure, just filenames
            file_destination = '%s%s' % (self.destination, os.path.basename(url))

        elif self.directory_structure == True:
            # Strip off hostname, keep all other directories
            file_destination = '%s%s' % (self.destination, ThreadedDownload.REGEX['hostname_strip'].sub('', url))

        elif hasattr(self.directory_structure, '__len__') and len(self.directory_structure) == 2:
            # User supplied a custom (regex, replacement) pair
            regex = self.directory_structure[0]
            if isinstance(regex, str):
                regex = re.compile(regex)
            replace = self.directory_structure[1]
            file_destination = '%s%s' % (self.destination, regex.sub(replace, url))

        else:
            # No idea what's wanted
            file_destination = None

        if hasattr(file_destination, 'replace'):
            file_destination = file_destination.replace('/', os.path.sep)
        return file_destination

    def addTarget(self, url, url_tries=3):
        self.queue.put(ThreadedDownload.URLTarget(url, self.fileDestination(url), url_tries))

    def run(self):
        for i in range(self.thread_count):
            thread = ThreadedDownload.Downloader(self.queue, self.report)
            thread.start()
            self.threads.append(thread)
        if self.queue.qsize() > 0:
            self.queue.join()


if __name__ == "__main__":
    if len(sys.argv) == 1:
        print 'No source URLs given.'
        sys.exit()

    url_source_path = sys.argv[1]
    if not os.path.exists(url_source_path):
        print '`%s` not found.' % url_source_path
        sys.exit()

    # Load urls
    url_source = open(url_source_path, 'r')
    urls = [url.strip() for url in url_source.readlines()]
    url_source.close()

    # Download destination
    if len(sys.argv) >= 3:
        destination = sys.argv[2]
        if not os.path.exists(destination):
            print 'Destination `%s` does not exist.' % destination
            sys.exit()
    else:
        destination = '.'

    # Number of threads
    if len(sys.argv) >= 4:
        threads = int(sys.argv[3])
    else:
        threads = 5

    downloader = ThreadedDownload(urls, destination, True, threads, 3)

    print 'Downloading %s files' % len(urls)
    downloader.run()
    print 'Downloaded %(success)s of %(total)s' % {'success': len(downloader.report['success']), 'total': len(urls)}

    if len(downloader.report['failure']) > 0:
        print '\nFailed urls:'
        for url in downloader.report['failure']:
            print url
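
Besides the command-line entry point above, the class can be driven from other Python 2 code. The following is a minimal usage sketch; the module filename (threaded_download.py), the downloads directory, and the example URLs are placeholders assumed for illustration, not part of the original paste.

# Minimal usage sketch (assumes the code above was saved as threaded_download.py;
# the module name, URLs, and paths below are placeholders).
from threaded_download import ThreadedDownload

urls = [
    'http://example.com/files/a.zip',
    'http://example.com/files/b.zip',
]

# The destination directory must already exist, otherwise the constructor
# raises ThreadedDownload.MissingDirectoryException.
downloader = ThreadedDownload(urls, 'downloads', directory_structure=False,
                              thread_count=4, url_tries=3)
downloader.run()  # blocks until the queue has been drained

print 'Downloaded %d, failed %d' % (len(downloader.report['success']),
                                    len(downloader.report['failure']))
for target in downloader.report['failure']:
    print target  # URLTarget.__str__ shows the url, success flag, and last error

With directory_structure=False every file lands flat in the destination directory under its basename; passing True keeps the URL's path (minus the hostname), and a (regex, replacement) pair gives full control over the mapping.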