Guest User

Downloader

a guest
Dec 10th, 2013
191
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.58 KB | None | 0 0
import os
import re
import sys
import urllib
from Tkinter import TclError
from Tkinter import Tk
  5.  
  6. BASE_DIRECTORY = os.getcwd() + "\\"
  7. TEMPORARY_IMAGE_PATH = BASE_DIRECTORY + "temp.%s"
  8. TEMPORARY_IMAGE = TEMPORARY_IMAGE_PATH
  9.  
  10. THREAD_URL_TEMPLATE = "http://boards.4chan.org/%s/res/%s"
  11. THREAD_URL_PATTERN = "^(https?://boards.4chan.org/([a-z]+)/res/([0-9]+))$"
  12.  
  13. THREAD_DIRECTORY_TEMPLATE = "%s-%s\\"
  14. THREAD_DIRECTORY_404_TEMPLATE = "(404) " + THREAD_DIRECTORY_TEMPLATE
  15. THREAD_DIRECTORY_PATTERN = "^([a-z]+)-([0-9]+)$"
  16.  
  17. IMAGE_URL_PATTERN = '<a href="//' \
  18.     '(images.4chan.org/[a-z]+/src/([0-9]+)\.([a-z]+))' \
  19.     '" target="_blank">'
  20.  
  21. IMAGE_URL_PATTERN = '<a class="fileThumb" ' \
  22.                     'href="//(i.4cdn.org/[a-z]+/src/([0-9]+)\.([a-z]+))" ' \
  23.                     'target="_blank">'
  24.  
  25. THREAD_LINK_PATH = "_thread.url"
  26. THREAD_LINK_TEMPLATE = "[InternetShortcut]\nURL=%s\nLATEST=%d"
  27. LATEST_POST_PATTERN = "LATEST=(-?[0-9]+)"
  28.  
  29. ERROR_THREAD_404 = -1
  30.  
  31.  
  32. class Image:
  33.  
  34.     def __init__(self, thread, match):
  35.         self.__thread = thread
  36.         [self.__src, self.__number, self.__ext] = match
  37.         self.__number = int(self.__number)
  38.  
  39.         conn = urllib.urlopen(self.get_source())
  40.         self.__filesize = int(conn.headers["Content-Length"])
  41.         conn.close()
  42.  
  43.     def get_thread(self):
  44.         return self.__thread
  45.  
  46.     def get_filename(self):
  47.         return '%s.%s' % (self.get_number(), self.get_extension())
  48.  
  49.     def get_source(self):
  50.         return "http://" + self.__src
  51.  
  52.     def get_destination(self):
  53.         return self.get_thread().get_directory() + self.get_filename()
  54.  
  55.     def get_number(self):
  56.         return self.__number
  57.  
  58.     def get_extension(self):
  59.         return self.__ext
  60.  
  61.     def get_filesize(self):
  62.         return self.__filesize
  63.  
  64.     def download(self):
  65.                 global TEMPORARY_IMAGE
  66.         TEMPORARY_IMAGE = TEMPORARY_IMAGE_PATH % (self.get_extension())
  67.         urllib.urlretrieve(self.get_source(), TEMPORARY_IMAGE)
  68.         os.rename(TEMPORARY_IMAGE, self.get_destination())
  69.  
  70.         self.get_thread().update_latest(self.get_number())
  71.  
  72.     def __str__(self):
  73.         return '%s -> %s' % (self.get_source(), self.get_destination())
  74.  
  75.  
  76. class Thread:
  77.  
  78.     def __init__(self, URL):
  79.         [self.__URL, self.__board, self.__number] = \
  80.             re.findall(THREAD_URL_PATTERN, URL)[0]
  81.  
  82.         conn = urllib.urlopen(URL)
  83.         self.__src = conn.read()
  84.         conn.close()
  85.  
  86.         if os.path.isdir(self.get_directory()):
  87.             self.load_latest()
  88.         else:
  89.             self.__latest = -1
  90.             self.create_directory()
  91.  
  92.     def get_URL(self):
  93.         return self.__URL
  94.  
  95.     def get_board(self):
  96.         return self.__board
  97.  
  98.     def get_number(self):
  99.         return self.__number
  100.  
  101.     def get_source(self):
  102.         return self.__src
  103.  
  104.     def get_directory(self):
  105.         return BASE_DIRECTORY + THREAD_DIRECTORY_TEMPLATE \
  106.             % (self.get_board(), self.get_number())
  107.  
  108.     def get_404_directory(self):
  109.         return BASE_DIRECTORY + THREAD_DIRECTORY_404_TEMPLATE \
  110.             % (self.get_board(), self.get_number())
  111.  
  112.     def get_link_path(self):
  113.         return self.get_directory() + THREAD_LINK_PATH
  114.  
  115.     def get_latest(self):
  116.         return self.__latest
  117.  
  118.     def set_latest(self, latest):
  119.         self.__latest = latest
  120.  
  121.     def load_latest(self):
  122.         f = open(self.get_link_path(), 'r')
  123.         contents = f.read()
  124.         f.close()
  125.  
  126.         self.__latest = int(re.findall(LATEST_POST_PATTERN, contents)[0])
  127.  
  128.     def update_latest(self, new_latest):
  129.         self.set_latest(new_latest)
  130.         self.create_shortcut()
  131.  
  132.     def create_directory(self):
  133.         os.makedirs(self.get_directory())
  134.         self.create_shortcut()
  135.  
  136.     def create_shortcut(self):
  137.         contents = THREAD_LINK_TEMPLATE % (self.get_URL(), self.get_latest())
  138.  
  139.         f = open(self.get_link_path(), 'w')
  140.         f.write(contents)
  141.         f.close()
  142.  
  143.     def get_images(self):
  144.         matches = re.findall(IMAGE_URL_PATTERN, self.get_source())
  145.  
  146.         if len(matches) == 0: return ERROR_THREAD_404
  147.  
  148.         images = []
  149.         for match in matches:
  150.             image = Image(self, match)
  151.             if image.get_number() > self.get_latest():
  152.                 images.append(image)
  153.  
  154.         return images
  155.  
  156.     def kill(self):
  157.         os.rename(self.get_directory(), self.get_404_directory())
  158.  
  159.  
  160. def download_new_images():
  161.  
  162.         try:
  163.  
  164.                 def display_status():
  165.                         width = num_digits(maximum)
  166.                         line = "\r%s: %" + str(width) + "d/%" + str(width) + "d (%3d%%)"
  167.                         print (line % \
  168.                                 (message, index, maximum, 100 * index / maximum)),
  169.                         if index == maximum: print
  170.  
  171.                 print "Loading threads"
  172.  
  173.                 # Load threads
  174.                 threads = []
  175.                 files = os.listdir(BASE_DIRECTORY)
  176.                 for filename in files:
  177.                         if os.path.isdir(filename):
  178.                                 match = re.findall(THREAD_DIRECTORY_PATTERN, filename)
  179.                                 if match:
  180.                                         URL = THREAD_URL_TEMPLATE % match[0]
  181.                                         threads.append(Thread(URL))
  182.  
  183.                 num_threads = len(threads)
  184.  
  185.                 if num_threads == 0:
  186.                         print 'No threads were found!'
  187.                         return
  188.  
  189.                 print '%d threads found!' % num_threads
  190.  
  191.                 # Scan threads for new images
  192.                 message = "Scanning threads"
  193.                 maximum = num_threads
  194.  
  195.                 images = []
  196.  
  197.                 for index, thread in enumerate(threads):
  198.                         display_status()
  199.                         new_images = thread.get_images()
  200.                         if new_images == ERROR_THREAD_404:
  201.                                 thread.kill()
  202.                         else:  
  203.                                 images.extend(new_images)
  204.                 index = maximum
  205.                 display_status()
  206.  
  207.                 num_images = len(images)
  208.  
  209.                 if num_images == 0:
  210.                         print 'No new images were found!'
  211.                         return
  212.  
  213.                 print '%d new images found!' % num_images
  214.  
  215.                 # Calculate total filesize
  216.                 bytes = 0
  217.                 for image in images:
  218.                         bytes += image.get_filesize()
  219.  
  220.                 print "Total file size: %s" % format_file_size(bytes)
  221.  
  222.                 # Download images
  223.                 message = "Downloading images"
  224.                 maximum = num_images
  225.  
  226.                 for index, image in enumerate(images):
  227.                         display_status()
  228.                         image.download()
  229.                 index = maximum
  230.                 display_status()
  231.  
  232.         except KeyboardInterrupt:
  233.                 try:
  234.                         os.remove(TEMPORARY_IMAGE)
  235.                 except:
  236.                         pass
  237.                 print "\nDownload cancelled"
  238.  
  239.  
  240. def add_thread():
  241.         tk = Tk()
  242.     URL = tk.clipboard_get()
  243.     tk.destroy()
  244.     if re.findall(THREAD_URL_PATTERN, URL):
  245.         Thread(URL)
  246.         print "Successfully added: %s" % URL
  247.     else:
  248.         print "Invalid URL: %s" % URL
  249.  
  250.  
  251. def num_digits(number):
  252.     return len(str(number))
  253.  
  254. def format_file_size(bytes):
  255.     for x in ['B','KB','MB','GB']:
  256.         if bytes < 1024.0:
  257.             return "%.1f%s" % (bytes, x)
  258.         bytes /= 1024.0
  259.     return "%.1f%s" % (bytes, 'TB')
  260.  
  261.  
  262. actions = {
  263.     "a" : ["Add thread from Clipboard", add_thread],
  264.     "d" : ["Download new images", download_new_images],
  265.     "q" : ["Quit", exit]
  266. }
  267.  
  268. PROMPT = "\aPlease choose a command\n" + \
  269.     '\n'.join(["%s : %s" % (key, actions[key][0]) \
  270.         for key in sorted(actions.keys())]) + '\n'
  271.  
  272. def main():
  273.     while True:
  274.         user_input = raw_input(PROMPT).lower()[0]
  275.         print
  276.         if user_input in actions:
  277.             actions[user_input][1]()
  278.             print
  279.         else:
  280.             print "Invalid command\n"
  281.  
  282.  
  283. if __name__ == "__main__":
  284.     main()
Add Comment
Please, Sign In to add comment