Advertisement
overloop

save-thread.py

Oct 27th, 2015
227
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.61 KB | None | 0 0
  1. #!/usr/bin/python
  2. import urllib2
  3. import re
  4. import sys
  5. import os
  6.  
  7. # @todo: error handling
  8. # @todo: s/src="+v1+"/src="+v2+"/g s/href="+v1+"/href="+v2+"/g instead of s/v1/v2/g
  9.  
  10. def saveFiles(page,urls,home,directory):
  11.     dest = home + '/' + directory
  12.     if not os.path.exists(dest):
  13.         os.makedirs(dest)
  14.     for url in urls:
  15.         base = os.path.basename(url)
  16.         page = page.replace(url,directory + '/' + base)
  17.         url2 = url
  18.         if url2.find('//') == 0:
  19.             url2 = 'https:' + url2
  20.         if url2.find('/') < 0:
  21.             url2 = 'https://boards.4chan.org/' + url2  
  22.         sys.stderr.write(url2 + "\n")
  23.         data = urllib2.urlopen(url2).read()
  24.         f = open(dest + '/' + base,'wb')
  25.         f.write(data)
  26.         f.close()
  27.     return page
  28.  
  29. if len(sys.argv)<2:
  30.     sys.stderr.write('usage: ' + sys.argv[0] + " url\n" )
  31.     exit(0)
  32.  
  33. url = sys.argv[1]
  34.  
  35. home = '/home/user/threads'
  36.  
  37. f = urllib2.urlopen(url)
  38. #f = open('test.html','r') #debug
  39. page = f.read()
  40. #f.close()
  41.  
  42. #f = open('test.html','w')
  43. #f.write(page)
  44. #f.close()
  45.  
  46. highresUrls = list(set(re.findall('(?<= href=")([^"]*\.[jpegnif]{3,4})',page)))
  47. previewUrls = list(set(re.findall('(?<= src=")([^"]*\.[jpegnif]{3,4})',page)))
  48. cssUrls = list(set(re.findall('(?<= href=")([^"]*\.css)',page)))
  49. jsUrls = list(set(re.findall('(?<= src=")([^"]*\.js)',page)))
  50.  
  51. threadId = re.findall('(?<=thread/)([^/]*)',url)[0]
  52.  
  53. threadDir = home + '/' + threadId
  54.  
  55. page = saveFiles(page,cssUrls,threadDir,'css')
  56. page = saveFiles(page,jsUrls,threadDir,'js')
  57. page = saveFiles(page,highresUrls,threadDir,'img')
  58. page = saveFiles(page,previewUrls,threadDir,'img')
  59.  
  60. f = open(threadDir + '/index.html','w')
  61. f.write(page)
  62. f.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement