#!/usr/bin/python
# inb4404 - 4chan image downloader for threads and boards (Python 2)
import urllib2, argparse, logging
import os, re, time
import fileinput
from BeautifulSoup import BeautifulSoup, SoupStrainer
from multiprocessing import Process

log = logging.getLogger('inb4404')
workpath = os.path.dirname(os.path.realpath(__file__))
args = None

def load(url):
    # fetch a url, sending a custom User-Agent header instead of urllib2's default
    req = urllib2.Request(url, headers={'User-Agent': '4chan Browser'})
    return urllib2.urlopen(req).read()

def main():
    global args
    parser = argparse.ArgumentParser(description='inb4404')
    parser.add_argument('url', nargs=1, help='url of the thread or board (or filename; one url per line)')
    parser.add_argument('-n', '--use-names', action='store_true', help='use thread names instead of the thread ids (...4chan.org/board/thread/thread-id/thread-name)')
    parser.add_argument('-r', '--reload', action='store_true', help='reload the file every 5 minutes')
    parser.add_argument('-l', '--less', action='store_true', help='show less information (suppresses checking messages)')
    parser.add_argument('-d', '--date', action='store_true', help='show the date as well')
    args = parser.parse_args()

    if args.date:
        logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s', datefmt='%Y-%m-%d %I:%M:%S %p')
    else:
        logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s', datefmt='%I:%M:%S %p')

    url = args.url[0]
    if url[:4].lower() != 'http':
        # not a url, so treat the argument as a file with one url per line
        download_from_file(url)
        return

    # a thread url looks like .../board/thread/thread-id[/thread-name],
    # a board url like .../board/
    if 'thread' in url.split('/'):
        print 'Downloading from thread'
        download_thread(url)
    else:
        # else assume that we're trying to download the images off of a board
        board = url.split('/')[-2]
        print 'Downloading from board: ' + board
        download_board(url)

def download_board(url):
    # the board url is expected to end with '/'; pages 2-10 live at .../2 and so on
    urls = [url]
    for page in range(2, 11):
        urls.append(url + str(page))

    for pageurl in urls:
        data = load(pageurl)
        # only parse the anchor tags whose href points at a thread
        thread_links = SoupStrainer('a', href=re.compile('thread/'))
        for element in BeautifulSoup(data, parseOnlyThese=thread_links):
            link = str(element['href'])
            # skip in-thread anchors (thread/12345#p67890) and links carrying thread names
            if '#' not in link and len(link.split('/')) < 3:
                download_thread(url + link)

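The SoupStrainer above makes BeautifulSoup 3 build its tree out of nothing but the matching <a> tags, which keeps the parse small and the iteration trivial. A minimal sketch of the same idiom, with a made-up page snippet:

# sketch of the SoupStrainer idiom from download_board(); the html snippet is made up
from BeautifulSoup import BeautifulSoup, SoupStrainer
import re

html = '<a href="thread/12345">reply</a> <a href="/wg/catalog">catalog</a>'
only_threads = SoupStrainer('a', href=re.compile('thread/'))
for tag in BeautifulSoup(html, parseOnlyThese=only_threads):
    print tag['href']  # prints: thread/12345
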
def download_thread(thread_link):
    board = thread_link.split('/')[3]
    thread = thread_link.split('/')[5].split('#')[0]
    if len(thread_link.split('/')) > 6:
        # the url carries a thread name as well
        thread_tmp = thread_link.split('/')[6].split('#')[0]
        if args.use_names or os.path.exists(os.path.join(workpath, 'downloads', board, thread_tmp)):
            thread = thread_tmp

    directory = os.path.join(workpath, 'downloads', board, thread)
    if not os.path.exists(directory):
        os.makedirs(directory)

    # group 1 matches the protocol-relative image link (//i.4cdn.org/board/123.jpg),
    # group 2 just the filename
    regex = '(\/\/i(?:s|)\d*\.(?:4cdn|4chan)\.org\/\w+\/(\d+\.(?:jpg|png|gif|webm)))'
    for link, img in list(set(re.findall(regex, load(thread_link)))):
        img_path = os.path.join(directory, img)
        if not os.path.exists(img_path):
            try:
                data = load('https:' + link)

                log.info(board + '/' + thread + '/' + img)

                with open(img_path, 'wb') as f:
                    f.write(data)

                ##################################################################################
                # saves new images to a separate directory;
                # if you delete them there, they are not downloaded again
                # if you delete an image in the 'downloads' directory, it will be downloaded again
                copy_directory = os.path.join(workpath, 'new', board, thread)
                if not os.path.exists(copy_directory):
                    os.makedirs(copy_directory)
                copy_path = os.path.join(copy_directory, img)
                with open(copy_path, 'wb') as f:
                    f.write(data)
                ##################################################################################
            except Exception:
                log.warning('failed to download ' + img)

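The image regex pulls every (link, filename) pair out of the raw thread html in one findall. A quick standalone check of what it does and does not match (the sample urls are made up):

# quick check of the image regex from download_thread(); the sample html is made up
import re

regex = '(\/\/i(?:s|)\d*\.(?:4cdn|4chan)\.org\/\w+\/(\d+\.(?:jpg|png|gif|webm)))'
sample = '<a href="//i.4cdn.org/wg/1528800000000.jpg">full</a>' \
         '<img src="//i.4cdn.org/wg/1528800000000s.jpg">'
print re.findall(regex, sample)
# prints: [('//i.4cdn.org/wg/1528800000000.jpg', '1528800000000.jpg')]
# the thumbnail is not matched: its 's' suffix breaks the \d+\. filename group
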
def download_from_file(filename):
    running_links = []
    while True:
        processes = []
        # lines prefixed with '-' fail the http check and are skipped
        for link in filter(None, [line.strip() for line in open(filename) if line[:4] == 'http']):
            if link not in running_links:
                running_links.append(link)
                log.info('Added ' + link)

            # one worker process per thread url
            process = Process(target=download_thread, args=(link, ))
            process.start()
            processes.append([process, link])

        if len(processes) == 0:
            log.warning(filename + ' empty')

        if args.reload:
            time.sleep(60 * 5) # 5 minutes
            links_to_remove = []
            for process, link in processes:
                if not process.is_alive():
                    # finished a full pass over the thread; comment its link out below
                    links_to_remove.append(link)
                else:
                    # still busy after 5 minutes; stop it, it is respawned on the next pass
                    process.terminate()

            for link in links_to_remove:
                # prefix the finished link with '-' in the file so it is skipped from now on
                for line in fileinput.input(filename, inplace=True):
                    print line.replace(link, '-' + link),
                running_links.remove(link)
                log.info('Removed ' + link)
            if not args.less:
                log.info('Reloading ' + filename)
        else:
            break

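fileinput.input(..., inplace=True) temporarily redirects stdout into the file being read, so the print above rewrites the link file line by line; the trailing comma suppresses print's extra newline, since each line keeps its own. The same trick in isolation ('links.txt' is a made-up file name):

# sketch of the in-place rewrite used in download_from_file(); 'links.txt' is made up
import fileinput

finished = 'http://example.com/thread/1'
for line in fileinput.input('links.txt', inplace=True):
    # while inplace=True is active, anything printed replaces the current line
    print line.replace(finished, '-' + finished),
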
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        pass
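
Example invocations, assuming the script is saved as inb4404.py; the board, the thread id and the links.txt file name below are all placeholders:

# python inb4404.py https://boards.4chan.org/wg/thread/1234567
# python inb4404.py https://boards.4chan.org/wg/
# python inb4404.py -r links.txt   (one url per line; finished links get a '-' prefix)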