Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python2
- import sys
- from PyQt4 import QtGui, QtCore
- from ComicScrape import Ui_PieAndCake
- import requests
- from time import gmtime, strftime
- import os
- import platform
- import getpass
# Name of the host operating system as reported by platform.system().
# Stored as `os_name` rather than `os`: binding it to `os` would replace the
# imported `os` module with a plain string and break every os.path /
# os.makedirs call in the download functions.
os_name = platform.system()
def _hs_write_bytes(path, data):
    """Write the byte string *data* to *path*, replacing any existing file."""
    # Binary mode: images, flash files and encoded HTML must not go through
    # newline translation or implicit text encoding.
    with open(path, 'wb') as handle:
        handle.write(data)


def _hs_splice(text, start_marker, end_marker):
    """Return text[:find(start_marker)] + text[find(end_marker):], unchecked.

    A missing marker makes find() return -1, which is then used as an
    ordinary slice index.
    NOTE(review): the fixed offsets in _hs_page_count may depend on this
    exact behaviour; confirm against a live page before adding checks here.
    """
    return text[:text.find(start_marker)] + text[text.find(end_marker):]


def _hs_cut(text, start_marker, end_marker, resume_offset=0):
    """Remove the span from *start_marker* up to *end_marker* from *text*.

    The text resumes *resume_offset* characters after the position of
    *end_marker*. If either marker is missing, or they appear in the wrong
    order, *text* is returned unchanged instead of being mangled by a -1
    slice index.
    """
    start = text.find(start_marker)
    end = text.find(end_marker)
    if start == -1 or end == -1 or end < start:
        return text
    return text[:start] + text[end + resume_offset:]


def _hs_page_count(front_page):
    """Return how many Homestuck pages the site's front page advertises.

    Trims the front page down around the first '?s=6&p=' link that follows
    the LATEST PAGES marker, reads six characters at a fixed offset as the
    newest page id and subtracts 1901 (the id the download starts from).

    Raises ValueError when those six characters are not a number.
    """
    latest = "<!-----------------------LATEST PAGES----------------------------->"
    html = _hs_splice(
        front_page,
        '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">',
        latest)
    html = _hs_splice(
        html,
        "<!-----------------------END LATEST PAGES------------------------->",
        "</html>")
    html = _hs_splice(html, latest, '?s=6&p=')
    html = _hs_splice(html, '">"', '</html>')
    return int(html[13:19]) - 1901


def Homestuck():
    """Download the Homestuck archive to /home/<user>/Documents/Homestuck/.

    Creates the target folders, fetches the layout images used on every
    page, reads the number of pages from the site's front page and then,
    page by page, saves the panel (a single .gif, a numbered series of
    .gifs, or a .swf) plus a copy of the page's HTML with links rewritten to
    the local files. Anything already on disk is skipped, so an interrupted
    run can be resumed.

    Side effects: sets the module globals `content` (pages still to do) and
    `h_content` (total pages) and calls `myapp.homeBar()` after each page,
    so a module-level `myapp` must exist when this runs.

    Raises ValueError if the page count cannot be read from the front page.
    """
    # Imported locally so this function always works with the real module,
    # even if the module-level name `os` has been rebound to something else.
    import os

    global content, h_content

    print('Initiating download of the complete Homestuck archive.')
    username = getpass.getuser()
    # HTML pages, panels and layout images all live in rootdata
    rootdata = "/home/" + username + "/Documents/Homestuck/"
    # created for images, although nothing below writes into it
    rootimg = "/home/" + username + "/Documents/Homestuck/images/"
    print("Program started @ " + strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # root page and image urls
    page = "http://www.mspaintadventures.com/?s=6&p="
    imgroot = "http://www.mspaintadventures.com/storyfiles/hs2/"
    favipath = rootdata + "favicon.ico"

    # create the folders for the data if they don't exist
    for folder in (rootdata, rootimg):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # download alignment images used on every page
    fnames = ["v2_blankstrip.gif",
              "v2_blanksquare.gif",
              "spacer.gif",
              "v2_blanksquare2.gif",
              "v2_blanksquare3.gif",
              "favicon.ico"]
    for i, name in enumerate(fnames):
        print("Fetching spacers... (%s/5)" % i)
        f = requests.get("http://www.mspaintadventures.com/images/" + name)
        _hs_write_bytes(rootdata + name, f.content)

    # identify how many comics there are
    print("Identifying amount of content to download...")
    r = requests.get("http://www.mspaintadventures.com/")
    content = _hs_page_count(r.text)
    print("Downloading " + str(content) + " pages of comics.")
    h_content = content

    page_number = 1901   # the comic's identification number (?s=6&p=001901)
    img_number = 0       # panels are numbered from 00001

    # main loop
    while content > 0:
        img_number += 1
        imgval = str(img_number).rjust(5, '0')
        endval = str(page_number).rjust(6, '0')

        urlgif = imgroot + imgval + ".gif"
        urlmultigif = imgroot + imgval + "_1.gif"
        flaurl = imgroot + imgval + "/" + imgval + ".swf"
        urlgifpath = rootdata + imgval + ".gif"
        urlmultigifpath = rootdata + imgval + "_1.gif"
        flashpath = rootdata + imgval + ".swf"

        if (os.path.exists(urlgifpath)
                or os.path.exists(urlmultigifpath)
                or os.path.exists(flashpath)):
            print("Image number " + imgval + " skipped.")
        else:
            # Probe all three possible panel types; only a 404 counts as
            # "not there". The probe responses are reused for saving.
            gif_resp = requests.get(urlgif)
            multi_resp = requests.get(urlmultigif)
            flash_resp = requests.get(flaurl)
            gif = gif_resp.status_code != 404
            multigif = multi_resp.status_code != 404
            flash = flash_resp.status_code != 404

            if not multigif and not flash:
                # regular, single .gif
                if not gif:
                    print("Something went wrong while downloading the .gif.")
                    print(urlgif)
                    break
                _hs_write_bytes(urlgifpath, gif_resp.content)
            elif not gif and not flash:
                # more than 1 gif on a page: _1.gif, _2.gif, ... until a 404
                multigifid = 1
                f = multi_resp
                while True:
                    print(urlmultigif)
                    if f.status_code == 404:
                        break
                    imgpath = rootdata + imgval + "_" + str(multigifid) + ".gif"
                    _hs_write_bytes(imgpath, f.content)
                    multigifid += 1
                    urlmultigif = imgroot + imgval + "_" + str(multigifid) + ".gif"
                    f = requests.get(urlmultigif)
            elif not gif and not multigif:
                # Flash content
                print(flaurl)
                _hs_write_bytes(flashpath, flash_resp.content)
            else:
                # more than one panel type answered for the same number
                print("Something went horribly wrong!")

        # Now we download the html
        root = rootdata + endval + ".html"
        # the id of the following page is needed to link to the next comic
        page_number += 1
        nextval = str(page_number).rjust(6, '0')
        if os.path.exists(root):
            print("html page " + endval + " skipped.")
        else:
            response = requests.get(page + endval)
            html = response.text
            # fix paths and whatnot
            html = html.replace(imgroot, rootdata)
            html = _hs_cut(
                html,
                "<!------------------------end comic content----------------------------------->",
                "</html>")
            html = _hs_cut(
                html,
                "<!------------------------begin nav----------------------------------->",
                "<!------------------------end nav----------------------------------->")
            html = html.replace(endval, "")
            htmlpath = rootdata + nextval + ".html"
            html = html.replace("?s=6&p=" + nextval, htmlpath)
            html = html.replace("images/", rootdata)
            html = html.replace("favicon.ico", favipath)
            # flash URL repair code; keyed on the .swf being on disk so it
            # also applies when the panel was saved by an earlier run
            if os.path.exists(flashpath):
                print("Repairing flash code...")
                # NOTE(review): the whitespace inside this marker has to
                # match the page source exactly - confirm against a real page.
                objns = " </object>\n</noscript>"
                html = _hs_cut(
                    html,
                    '<script language="javascript">AC_FL_RunContent = 0;</script>',
                    objns,
                    1)
                swflink = "<a href=" + '"' + flashpath + '"' + 'target="_self" name="Flash Content Link">Click here for flash</a>'
                html = html.replace("</object>", swflink)
            # Some panels contain non-ASCII symbols; drop them rather than
            # fail while writing. The file is only created once the page is
            # complete, so a failed fetch never leaves an empty page behind.
            _hs_write_bytes(root, html.encode('ascii', 'ignore'))

        content -= 1
        myapp.homeBar()

    print("Finsihed downloading @: " + strftime("%Y-%m-%d %H:%M:%S", gmtime()))
def _qc_write_bytes(path, data):
    """Write the byte string *data* to *path*, replacing any existing file."""
    # Binary mode so images and encoded text are stored exactly as given.
    with open(path, 'wb') as handle:
        handle.write(data)


def _qc_splice(text, start_marker, end_marker):
    """Return text[:find(start_marker)] + text[find(end_marker):], unchecked.

    A missing marker makes find() return -1, which is then used as an
    ordinary slice index.
    NOTE(review): the fixed offsets in _qc_comic_count may depend on this
    exact behaviour; confirm against a live page before adding checks here.
    """
    return text[:text.find(start_marker)] + text[text.find(end_marker):]


def _qc_cut(text, start_marker, end_marker, resume_offset=0):
    """Remove the span from *start_marker* up to *end_marker* from *text*.

    The text resumes *resume_offset* characters after the position of
    *end_marker*. If either marker is missing, or they appear in the wrong
    order, *text* is returned unchanged.
    """
    start = text.find(start_marker)
    end = text.find(end_marker)
    if start == -1 or end == -1 or end < start:
        return text
    return text[:start] + text[end + resume_offset:]


def _qc_comic_count(front_page):
    """Return the number of the newest comic shown on the front page.

    Trims the page down around the '<img id="strip" ...>' tag and reads four
    characters at a fixed offset as the comic number.

    Raises ValueError when those four characters are not a number.
    """
    html = _qc_splice(
        front_page,
        '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">',
        '<img id="strip" src="http://www.questionablecontent.net/comics/')
    html = _qc_splice(html, '.png">', '</html>')
    return int(html[64:68])


def QC():
    """Download Questionable Content to /home/<user>/Documents/Questionable Content/.

    Creates the target folders, saves the site's stylesheet and logo if they
    are not present yet, reads the newest comic number from the front page
    and then, for comic 1 up to that number, writes a local HTML page and
    the comic's .png into the `comics/` sub-folder. Files already on disk
    are skipped.

    Side effects: sets the module globals `qontent` (comics still to do) and
    `qc_content` (total comics) and calls `myapp.qc_bar()` after each comic,
    so a module-level `myapp` must exist when this runs.

    Raises ValueError if the comic count cannot be read from the front page.
    """
    # Imported locally so this function always works with the real module,
    # even if the module-level name `os` has been rebound to something else.
    import os

    global qontent, qc_content

    print('Initiating download of the complete Qestionable Content archive.')
    username = getpass.getuser()
    # HTML
    rootdir = "/home/" + username + "/Documents/Questionable Content/"
    # CSS Local
    localcss = rootdir + 'newstyles.css'
    # local logo
    locallogo = rootdir + "logo.png"
    # images
    rootdata = "/home/" + username + "/Documents/Questionable Content/comics/"
    print("Program started @ " + strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # image, stylesheet and logo urls
    imgroot = "http://www.questionablecontent.net/comics/"
    css = "http://questionablecontent.net/newstyles.css"
    logourl = "http://questionablecontent.net/testing/logo.png"

    for folder in (rootdir, rootdata):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # Stylesheet and logo are stored as the raw bytes the server sent.
    if not os.path.exists(localcss):
        _qc_write_bytes(localcss, requests.get(css).content)
    if not os.path.exists(locallogo):
        _qc_write_bytes(locallogo, requests.get(logourl).content)

    # get current comic id.
    print("Identifying amount of content to download...")
    r = requests.get("http://www.questionablecontent.net")
    qontent = _qc_comic_count(r.text)
    qc_content = qontent
    print(qontent)

    imgval = 1
    urlroot = 'http://questionablecontent.net/view.php?comic='

    # The viewer page is requested without a comic number and the same
    # response serves as the template for every local page, so it is fetched
    # once, and only if at least one page is missing.
    template = None
    template_encoding = 'utf-8'

    # main download loop
    while qontent > 0:
        url = urlroot + str(imgval) + '.html'
        print(url)
        localpage = rootdir + str(imgval) + '.html'
        localimage = rootdata + str(imgval) + '.png'
        imgurl = imgroot + str(imgval) + '.png'

        # get the webpage
        if not os.path.exists(localpage):
            if template is None:
                r = requests.get(urlroot)
                template = r.text
                template_encoding = r.encoding or 'utf-8'
            # fix file path associations and write content to local file
            nextcomic = imgval + 1
            html = template.replace('./comics/', './comics/' + str(imgval) + '.png')
            html = html.replace('../testing/logo.png', './logo.png')
            html = html.replace('view.php?comic=1', rootdir + str(nextcomic) + '.html')
            html = _qc_cut(html, '<b>Warning</b>', '<b>74</b><br />', 5)
            # Encode explicitly: writing decoded text straight to a byte
            # file fails on non-ASCII characters under Python 2.
            _qc_write_bytes(localpage, html.encode(template_encoding, 'replace'))

        # get the image
        print(imgurl)
        if not os.path.exists(localimage):
            _qc_write_bytes(localimage, requests.get(imgurl).content)

        imgval += 1
        qontent -= 1
        myapp.qc_bar()
class MyApp(QtGui.QMainWindow):
    """Main window: two check boxes, a start button and two progress bars.

    The widgets come from the generated `Ui_PieAndCake` form. The download
    functions report progress by calling `homeBar()` / `qc_bar()` on the
    module-level instance of this class.
    """

    def __init__(self):
        """Build the form and start downloads when the button is clicked."""
        QtGui.QMainWindow.__init__(self)
        self.ui = Ui_PieAndCake()
        self.ui.setupUi(self)
        self.ui.the_button.clicked.connect(self.display_results)

    def display_results(self):
        """Run the downloader for every ticked box, Homestuck first.

        Each downloader runs at most once per click, also when both boxes
        are ticked.
        """
        want_homestuck = self.ui.pie_check.isChecked()
        want_qc = self.ui.cake_check.isChecked()
        if want_homestuck:
            Homestuck()
        if want_qc:
            QC()

    def homeBar(self):
        """Show the Homestuck progress as a percentage on `hBar`.

        Reads the module globals `h_content` (total pages) and `content`
        (pages still to download).
        """
        total = int(h_content)
        if total <= 0:
            # nothing to download, so there is no ratio to show
            return
        # `//` keeps this an integer on Python 2 and Python 3 alike
        done = 100 - (int(content) * 100) // total
        # The value is already the overall percentage; set it directly
        # instead of adding it to what the bar shows.
        self.ui.hBar.setValue(done)

    def qc_bar(self):
        """Show the Questionable Content progress as a percentage on `qcBar`.

        Reads the module globals `qc_content` (total comics) and `qontent`
        (comics still to download); both may be ints or digit strings.
        """
        total = int(qc_content)
        if total <= 0:
            return
        done = 100 - (int(qontent) * 100) // total
        self.ui.qcBar.setValue(done)

    def bugBar(self):
        """Placeholder: only prints a marker line."""
        print("bug comic")
if __name__ == '__main__':
    # Homestuck() and QC() report progress through the module-level name
    # `myapp`, so the window has to be bound to exactly that name.
    app = QtGui.QApplication(sys.argv)
    myapp = MyApp()
    myapp.show()
    # Run the Qt event loop and hand its result to the shell as exit status.
    exit_status = app.exec_()
    sys.exit(exit_status)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement