Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- import urllib2
- import re
- import sys
- import os
- # @todo: error handling
- # @todo: s/src="+v1+"/src="+v2+"/g s/href="+v1+"/href="+v2+"/g instead of s/v1/v2/g
def saveFiles(page,urls,home,directory):
    """Download each URL in *urls* and point *page* at the local copies.

    Every URL is fetched and stored as ``home/directory/<basename of URL>``.
    Each occurrence of the URL inside *page* is replaced by the relative
    path ``directory/<basename>``.

    page      -- HTML text in which the URLs occur.
    urls      -- iterable of URL strings exactly as they appear in *page*.
    home      -- directory that receives the ``directory`` sub-folder.
    directory -- sub-folder name, also used as the relative link prefix.

    Returns the rewritten page.  Download and file-system errors are not
    caught here; they propagate to the caller.
    """
    dest = os.path.join(home, directory)
    # Create first, inspect on failure: avoids the check-then-create race
    # and also works when the directory already exists.
    try:
        os.makedirs(dest)
    except OSError:
        if not os.path.isdir(dest):
            raise

    for url in urls:
        base = os.path.basename(url)
        # Links in the page are rewritten with '/' regardless of platform,
        # because they are URL paths, not file-system paths.
        page = page.replace(url, directory + '/' + base)

        if url.startswith('//'):
            # Protocol-relative URL: pick https.
            fetchUrl = 'https:' + url
        elif url.startswith('/'):
            # Root-relative URL: resolve against the board host.
            fetchUrl = 'https://boards.4chan.org' + url
        elif '/' not in url:
            # Bare file name: resolve against the board host.
            fetchUrl = 'https://boards.4chan.org/' + url
        else:
            fetchUrl = url

        sys.stderr.write(fetchUrl + "\n")

        # The urllib2 response is not a context manager on Python 2, so
        # close it explicitly even when read() fails.
        response = urllib2.urlopen(fetchUrl)
        try:
            data = response.read()
        finally:
            response.close()

        with open(os.path.join(dest, base), 'wb') as out:
            out.write(data)
    return page
# Root directory under which every mirrored thread gets its own folder.
home = '/home/user/threads'

# Extension alternation for the image types that get mirrored.  An explicit
# alternation is used so that only real image suffixes match.
_IMAGE_EXT = r'(?:jpe?g|png|gif)'


def _findUrls(page, attribute, extension):
    """Return the sorted, unique values of *attribute* ending in *extension*.

    *attribute* is an HTML attribute name such as ``href`` or ``src`` and
    *extension* is a regex fragment for the suffix after the final dot.
    A value is captured up to and including the last matching suffix.
    """
    pattern = r'(?<= ' + attribute + r'=")([^"]*\.' + extension + r')'
    return sorted(set(re.findall(pattern, page)))


def _threadId(url):
    """Return the path segment following ``thread/`` in *url*, or None.

    The segment stops at '/', '?' or '#', so a fragment or query string
    never ends up in the directory name.
    """
    found = re.search(r'(?<=thread/)[^/?#]+', url)
    if found is None:
        return None
    return found.group(0)


def main(argv):
    """Mirror the thread whose URL is ``argv[1]`` below ``home``.

    The page is downloaded, its stylesheets, scripts and images are stored
    in ``css``, ``js`` and ``img`` sub-folders via ``saveFiles``, and the
    relinked page is written to ``index.html`` in the thread folder.

    Returns the process exit status: 0 on success, 2 when the URL argument
    is missing, 1 when the URL contains no thread id.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else 'script'
        sys.stderr.write('usage: ' + prog + " url\n")
        # A usage error must not report success to the calling shell.
        return 2

    url = argv[1]
    threadId = _threadId(url)
    if threadId is None:
        sys.stderr.write('error: no thread id found in ' + url + "\n")
        return 1
    threadDir = home + '/' + threadId

    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()

    # All URL lists are collected from the untouched page, before any
    # relinking changes the attribute values.
    highresUrls = _findUrls(page, 'href', _IMAGE_EXT)
    previewUrls = _findUrls(page, 'src', _IMAGE_EXT)
    cssUrls = _findUrls(page, 'href', 'css')
    jsUrls = _findUrls(page, 'src', 'js')

    page = saveFiles(page, cssUrls, threadDir, 'css')
    page = saveFiles(page, jsUrls, threadDir, 'js')
    page = saveFiles(page, highresUrls, threadDir, 'img')
    page = saveFiles(page, previewUrls, threadDir, 'img')

    with open(threadDir + '/index.html', 'w') as out:
        out.write(page)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement