Advertisement
Guest User

Megatokyo Scraper

a guest
Aug 9th, 2014
1,283
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.21 KB | None | 0 0
  1. #!/usr/bin/python
  2. """
  3. Megatokyo webcomic scraper by /u/tomkatt. Takes user input to specify a range of comics and downloads
  4. the images, saving each with its original extension (.gif, .png, or .jpg). Files are named after the
  4. strip number in the URL.
  5. """
  6. # For personal use only. Feel free to modify or use for non-commercial purposes.
  7. # Author claims no rights to the website 'megatokyo.com', webpages therein, images, or other information associated.
  8. #
  9. # Megatokyo.com and all information within is copyrighted by Fred Gallagher. The Megatokyo name is trademarked
  10. # by Fred Gallagher.
  11.  
  12.  
  13. import urllib2
  14. import os
  15.  
  16.  
  17. def try_url(url):
  18.     """For testing if the URL is a valid page"""
  19.     try:
  20.         urllib2.urlopen(url)
  21.         return True
  22.     except urllib2.URLError:
  23.         return False
  24.  
  25.  
  26. def write_file(address_url, filename):
  27.     """Writes the file from address_url to a file, as specified by the string passed to filename"""
  28.     if os.name == 'nt':
  29.         f = open(filename, 'wb')
  30.     else:
  31.         f = open(filename, 'w+')
  32.     site = urllib2.urlopen(address_url)
  33.     f.write(site.read())
  34.     f.close()
  35.  
  36.  
  37. def get_comics(low_val, high_val):
  38.     """gets the comics between the value ranges specified in main. The loop creates the url path, specifies multiple
  39.    possible urls based on potential filetype (GIF, PNG, JPG), and then runs a check to see which URL is valid.
  40.    After determining valid URL, write_file is called to download the file to local path."""
  41.     comic = low_val
  42.     last_comic = high_val
  43.  
  44.     while comic <= last_comic:
  45.         comic_str = ""
  46.  
  47.         if comic <= 9:
  48.             comic_str = "000" + str(comic)
  49.         if comic >= 10 and comic <= 99:
  50.             comic_str = "00" + str(comic)
  51.         if comic >= 100 and comic <= 999:
  52.             comic_str = "0" + str(comic)
  53.         if comic >= 1000:
  54.             comic_str = str(comic)
  55.  
  56.         url = "http://megatokyo.com/strips/" + comic_str + ".gif"
  57.         url2 = "http://megatokyo.com/strips/" + comic_str + ".png"
  58.         url3 = "http://megatokyo.com/strips/" + comic_str + ".jpg"
  59.         url_check = try_url(url)
  60.         url_check2 = try_url(url2)
  61.         url_check3 = try_url(url3)
  62.  
  63.         if url_check:
  64.             print "GIF found. Writing comic #" + comic_str
  65.             comic_str = comic_str + ".gif"
  66.             write_file(url, comic_str)
  67.  
  68.         elif url_check2:
  69.             print "PNG found. Writing comic #" + comic_str
  70.             comic_str = comic_str + ".png"
  71.             write_file(url2, comic_str)
  72.  
  73.         elif url_check3:
  74.             print "JPG found. Writing comic #" + comic_str
  75.             comic_str = comic_str + ".jpg"
  76.             write_file(url3, comic_str)
  77.  
  78.         else:
  79.             print "no comic found."
  80.  
  81.         comic += 1
  82.  
  83.  
  84. def main():
  85.     comic_low = raw_input("\nEnter the first comic to download by number (1, 5, 75, etc.): ")
  86.     comic_high = raw_input("Enter the last comic to download by number (10, 50, 1000, etc.): ")
  87.     print "\n"
  88.  
  89.     try:
  90.         if int(comic_low) < int(comic_high):
  91.             get_comics(int(comic_low), int(comic_high))
  92.         else:
  93.             print "invalid input\n\n"
  94.     except:
  95.         print "Input must be an integer value. "
  96.  
  97.  
  98. if __name__ == '__main__':
  99.         main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement