Advertisement
zinc55

Untitled

Jul 19th, 2012
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.17 KB | None | 0 0
  1. #!/usr/bin/env python2
  2. import sys
  3. from PyQt4 import QtGui, QtCore
  4. from ComicScrape import Ui_PieAndCake
  5. import requests
  6. from time import gmtime, strftime
  7. import os
  8. import platform
  9. import getpass
  10.  
  11. os = platform.system()
  12.  
  13. def Homestuck():
  14.     print 'Initiating download of the complete Homestuck archive.'
  15.         username = getpass.getuser()
  16.     #HTML
  17.     rootdata = "/home/" + username + "/Documents/Homestuck/"
  18.     #images
  19.     rootimg = "/home/" + username + "/Documents/Homestuck/images/"
  20.     print "Program started @", strftime("%Y-%m-%d %H:%M:%S", gmtime())
  21.  
  22.     #content is how many pages you want to download. Comment it out for auto-detection.
  23.     #The first 31 pages include all the differnt types of pages in the series: .gif, multigif, and .swf files.
  24.         global content
  25.     content = 1
  26.  
  27.     #endval is the comic's identification number
  28.     endval = "001901"
  29.  
  30.     #root page and image urls
  31.     page = "http://www.mspaintadventures.com/?s=6&p="
  32.     imgroot = "http://www.mspaintadventures.com/storyfiles/hs2/"
  33.     favipath = rootdata + "favicon.ico"
  34.  
  35.     #setting up variables for later
  36.     imgval = 0
  37.     flashcounter = 0
  38.  
  39.     #Homestuck uses non-unicode symbols in some panels. This makes python crash, so we'll
  40.     #remove the symbols.
  41.     trans_table = ''.join( [chr(i) for i in range(128)] + [' '] * 128 )
  42.  
  43.     #create the folders for the data if they dont exist
  44.     if not os.path.exists(rootdata):
  45.         os.mkdirs(rootdata)
  46.     if not os.path.exists(rootimg):
  47.         os.mkdirs(rootimg)
  48.         #download alignment images used on every page
  49.         fnames = ["v2_blankstrip.gif",
  50.               "v2_blanksquare.gif",
  51.               "spacer.gif",
  52.               "v2_blanksquare2.gif",
  53.               "v2_blanksquare3.gif",
  54.               "favicon.ico"]
  55.  
  56.         for i, name in enumerate(fnames):
  57.             print "Fetching spacers... (%s/5)" % i
  58.             f = requests.get("http://www.mspaintadventures.com/images/" + name)
  59.             q = open(rootdata + name, 'w+')
  60.             q.write(f.content)
  61.             q.close()
  62.  
  63.     #script will now attempt to identify how many comics there are.
  64.     print "Identifying amount of content to download..."
  65.     r = requests.get("http://www.mspaintadventures.com/")
  66.     html = r.text
  67.  
  68.     start  = html.find('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">')
  69.     end   =  html.find("<!-----------------------LATEST PAGES----------------------------->")
  70.     html = html[:start] + html[end:]
  71.  
  72.     start = html.find("<!-----------------------END LATEST PAGES------------------------->")
  73.     end = html.find("</html>")
  74.     html = html[:start] + html[end:]
  75.  
  76.     start = html.find("<!-----------------------LATEST PAGES----------------------------->")
  77.     end = html.find('?s=6&p=')
  78.     html = html[:start] + html[end:]
  79.  
  80.     start = html.find('">"')
  81.     end = html.find('</html>')
  82.     html = html[:start] + html[end:]
  83.  
  84.     html = html[13:]
  85.     html = html[:6]
  86.  
  87.     currentcomicval = html
  88.     currentcomicval = int(currentcomicval) - 1901
  89.  
  90.     print "Downloading " + str(currentcomicval) + " pages of comics."
  91.  
  92.     content = int(currentcomicval)
  93.     global h_content
  94.     h_content = content
  95.  
  96.     #main loop
  97.  
  98.     while content > 0:
  99.         #set up file detection service. In need to know if there's a .gif, multiple .gifs, or a .swf
  100.         gif =True
  101.         multigif = True
  102.         flash = True
  103.  
  104.         multigifid = 1
  105.  
  106.         imgval = int(imgval) + 1
  107.         imgval = str(imgval).rjust(5,'0')
  108.        
  109.         urlgif = str(imgroot) + str(imgval) + ".gif"
  110.         urlmultigif = str(imgroot) + str(imgval) + "_" + str(multigifid) + ".gif"
  111.         flaurl = str(imgroot) + str(imgval) +"/" + str(imgval) + ".swf"
  112.  
  113.         urlgifpath = rootdata + str(imgval) + ".gif"
  114.         urlmultigifpath = rootdata + str(imgval) + "_1.gif"
  115.         flashpath = rootdata + str(imgval) + ".swf"
  116.  
  117.         #This checks to see if the file already exists; if it doesn't, it downloads it
  118.  
  119.         if not os.path.exists(urlgifpath) and not os.path.exists(urlmultigifpath) and not os.path.exists(flashpath):
  120.             gif = True
  121.             multigif = True
  122.             Flash = True    
  123.  
  124.             response = requests.get(urlgif)
  125.             if response.status_code == 404:
  126.                 gif = False
  127.  
  128.             response = requests.get(urlmultigif)
  129.             if response.status_code == 404:
  130.                 multigif = False
  131.                
  132.             response = requests.get(flaurl)
  133.             if response.status_code == 404:
  134.                 Flash = False
  135.  
  136.             #now to download the file
  137.  
  138.             #regular, single .gifs
  139.             if multigif == False and Flash == False:
  140.                 try: f = requests.get(urlgif)
  141.                 except f.statuscode == 404:
  142.                     print "Something went wrong while downloading the .gif."
  143.                     print urlgif
  144.                     break
  145.                 imgpath = rootdata + str(imgval) + ".gif"
  146.                 q = open(imgpath, 'w+')
  147.                 q.write(f.content)
  148.                 q.close()
  149.  
  150.             #more than 1 gif on a page
  151.             elif gif == False and Flash == False:
  152.                 gifstatus = True
  153.                 while gifstatus == True:
  154.                     urlmultigif = imgroot + imgval + "_" + str(multigifid) + ".gif"
  155.                     print urlmultigif
  156.                     f = requests.get(urlmultigif)
  157.                     if f.status_code == 404:
  158.                         gifstatus = False
  159.                         break
  160.                     imgpath = rootdata + str(imgval) + "_" + str(multigifid) + ".gif"
  161.                     q = open(imgpath, 'w+')
  162.                     q.write(f.content)
  163.                     q.close()
  164.                     multigifid += 1
  165.  
  166.             #Flash content
  167.             elif gif == False and multigif == False:
  168.                 flashcounter +=1
  169.                 swfurl = imgroot + imgval + "/" + imgval + ".swf"
  170.                 print swfurl
  171.                 f = requests.get(swfurl)
  172.                 imgpath = rootdata + str(imgval) + ".swf"
  173.                 q = open(imgpath, 'w+')
  174.                 q.write(f.content)
  175.                 q.close()
  176.  
  177.             else:
  178.                 print "Something went horribly wrong!"
  179.         else:
  180.             print "Image number " + imgval + " skipped."
  181.  
  182.         # Now we download the html
  183.         root = rootdir + str(endval) + ".html"
  184.         if not os.path.exists(root):
  185.             #create page id
  186.             url = page + str(endval)
  187.  
  188.             #open the webpage  
  189.             response = requests.get(url)
  190.             html = response.text
  191.            
  192.             #write data to file and fix path associations    
  193.             q = open(root, 'w+')
  194.  
  195.             #fix paths and whatnot
  196.             html = html.replace("http://www.mspaintadventures.com/storyfiles/hs2/", rootdata)
  197.             htmlpath = rootdir + str(endval) + ".html"
  198.            
  199.             start  = html.find("<!------------------------end comic content----------------------------------->")
  200.             end   =  html.find("</html>")
  201.             html = html[:start] + html[end:]
  202.            
  203.             start  = html.find("<!------------------------begin nav----------------------------------->")
  204.             end   =  html.find("<!------------------------end nav----------------------------------->")
  205.             html = html[:start] + html[end:]
  206.             html = html.replace(str(endval), "")
  207.            
  208.             #we need to increase the emdval by one to link to the next comic
  209.             endval = int(endval) + 1
  210.             endval = str(endval).rjust(6,'0')
  211.             htmlpath = rootdir + str(endval) + ".html"
  212.             html = html.replace("?s=6&p=" + endval, htmlpath)
  213.             html = html.replace("images/", rootdata)
  214.             html = html.replace("favicon.ico", favipath)
  215.            
  216.             #flash URL repair code
  217.             if gif == False and multigif == False:
  218.                 print "Repairing flash code..."
  219.                 start  = html.find('<script language="javascript">AC_FL_RunContent = 0;</script>')
  220.                 objns = ''' </object>
  221.         </noscript>'''
  222.                 end   =  html.find(objns)
  223.                 html = html[:start] + html[end + 1:]
  224.                 swffilelink = rootdata + str(imgval) + ".swf"
  225.                 swflink = "<a href=" + '"' + swffilelink + '"' + 'target="_self" name="Flash Content Link">Click here for flash</a>'
  226.                 html = html.replace("</object>", swflink)
  227.  
  228.             html = html.encode('ascii', 'ignore')
  229.             q.write(html)
  230.             q.close()
  231.         else:
  232.             endval = int(endval) + 1
  233.             endval = str(endval).rjust(6,'0')
  234.             print "html page " + endval + " skipped."
  235.  
  236.         content -= 1
  237.         myapp.homeBar()
  238.  
  239.     print "Finsihed downloading @:", strftime("%Y-%m-%d %H:%M:%S", gmtime())
  240.  
  241. def QC():
  242.     print 'Initiating download of the complete Qestionable Content archive.'
  243.  
  244.     #HTML
  245.     rootdir = "/home/" + username + "/Documents/Questionable Content/"
  246.     #CSS Local
  247.     localcss = rootdir + 'newstyles.css'
  248.     #local logo
  249.     locallogo = rootdir + "logo.png"
  250.     #images
  251.     rootdata = "/home/" + username + "/Documents/Questionable Content/comics/"
  252.  
  253.     print "Program started @", strftime("%Y-%m-%d %H:%M:%S", gmtime())
  254.  
  255.     #root page and image urls
  256.     page = "http://www.questionablecontent.net/"
  257.     imgroot = "http://www.questionablecontent.net/comics/"
  258.     css = "http://questionablecontent.net/newstyles.css"
  259.     logourl = "http://questionablecontent.net/testing/logo.png"
  260.     favipath = rootdata + "favicon."
  261.  
  262.     if not os.path.exists(rootdir):
  263.         os.mkdir(rootdir)
  264.     if not os.path.exists(rootdata):
  265.         os.mkdir(rootdata)
  266.     if not os.path.exists(localcss):
  267.         r = requests.get(css)
  268.         css = r.text
  269.         q = open(localcss, 'w+')
  270.         q.write(css)
  271.         q.close()
  272.     if not os.path.exists(locallogo):
  273.         r = requests.get(logourl)
  274.         logo = r.content
  275.         q = open(locallogo, 'w+')
  276.         q.write(logo)
  277.         q.close()    
  278.  
  279.     #get current comic id.
  280.     print "Identifying amount of content to download..."
  281.     r = requests.get("http://www.questionablecontent.net")
  282.     html = r.text
  283.     start  = html.find('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">')
  284.     end   =  html.find('<img id="strip" src="http://www.questionablecontent.net/comics/')
  285.     html = html[:start] + html[end:]
  286.  
  287.     start  = html.find('.png">')
  288.     end   =  html.find('</html>')
  289.     html = html[:start] + html[end:]
  290.  
  291.     html = html[64:]
  292.     html = html[:4]
  293.     global qontent
  294.     qontent = html
  295.     global qc_content
  296.     qc_content = qontent
  297.     imgval = 1
  298.     urlroot = 'http://questionablecontent.net/view.php?comic='
  299.     print qontent
  300.     # main downlaod loop
  301.     while qontent > 0:
  302.         url = urlroot + str(imgval) + '.html'
  303.         print url
  304.         localpage = rootdir + str(imgval) + '.html'
  305.         localimage = rootdata + str(imgval) + '.png'
  306.         imgurl = imgroot + str(imgval) + '.png'
  307.        
  308.         #get the webpage
  309.         if not os.path.exists(localpage):
  310.             r = requests.get(urlroot)
  311.             html = r.text
  312.             #fix file path associations and write content to local file
  313.             nextcomic = int(imgval) + 1
  314.             html = html.replace('./comics/', './comics/' + str(imgval) + '.png')
  315.             html = html.replace('../testing/logo.png', './logo.png')
  316.             html = html.replace('view.php?comic=1', rootdir + str(nextcomic) + '.html')
  317.             start  = html.find('<b>Warning</b>')
  318.             end   =  html.find('<b>74</b><br />')
  319.             html = html[:start] + html[end + 5:]
  320.             q = open(localpage, 'w+')
  321.             q.write(html)
  322.             q.close()
  323.  
  324.         #get the image
  325.         print imgurl
  326.         if not os.path.exists(localimage):
  327.             r = requests.get(imgurl)
  328.             image = r.content
  329.             q = open(localimage, 'w+')
  330.             q.write(image)
  331.             q.close()
  332.  
  333.         imgval = int(imgval) + 1
  334.         qontent = int(qontent) - 1
  335.         myapp.qc_bar()
  336.  
  337. class MyApp(QtGui.QMainWindow):
  338.     def __init__(self):
  339.         QtGui.QMainWindow.__init__(self)
  340.         self.ui = Ui_PieAndCake()
  341.         self.ui.setupUi(self)
  342.         self.ui.the_button.clicked.connect(self.display_results)
  343.        
  344.     def display_results(self):
  345.         if self.ui.pie_check.isChecked() and self.ui.cake_check.isChecked():
  346.             Homestuck()
  347.             QC()
  348.         if self.ui.cake_check.isChecked():
  349.             QC()
  350.         if self.ui.pie_check.isChecked():
  351.             Homestuck()
  352.     def homeBar(self):
  353.         # h_content = total | content = current comic
  354.         addval = ((content * 100) / h_content)
  355.         addval = 100 - addval
  356.         self.ui.hBar.setValue(self.ui.hBar.value() + addval)
  357.     def qc_bar(self):
  358.         addval = ((int(qontent) * 100) / int(qc_content))
  359.         addval = 100 - addval
  360.         self.ui.qcBar.setValue(self.ui.qcBar.value() + addval)
  361.     def bugBar(self):
  362.                 print "bug comic"
  363.  
  364. if __name__ == '__main__':
  365.     app = QtGui.QApplication(sys.argv)
  366.     myapp = MyApp()
  367.     myapp.show()
  368.  
  369.     sys.exit(app.exec_())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement