Guest User

Untitled

a guest
Aug 2nd, 2020
92
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.62 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: Anon
  4.  
  5. Changelog:
  6. 2020/08/02: Added ?_=RANDOM to get the non-recoded file version / Added request timeout and retries / Thread-level scraping
  7. 2020/08/03: Added post-level data scraping (comments & photos)
  8. """
  9. import json, requests, urllib.request, time, socket, os
  10.  
  11. def file_as_bytes(file):
  12.     with file:
  13.         return file.read()
  14.    
  15. def GetFile(Thread, myPath):
  16.     socket.setdefaulttimeout(15)
  17.     t0 = time.time()
  18.     Filepath = ("https://media.8kun.top/file_store/" + str(Thread["tim"]) + str(Thread['ext']+ "?_=RANDOM"))
  19.     LocalFileName = str(Thread['filename']) + str(Thread['ext'])
  20.     fullfilename = os.path.join(myPath , LocalFileName)
  21.     for attempt in range(3):
  22.         try:
  23.              urllib.request.urlretrieve(Filepath, fullfilename)
  24.         except Exception as x:
  25.             print('It failed :(', x.__class__.__name__)
  26.             print('Failed url: ' + Filepath)
  27.         else:
  28.             print('Downloaded: ' + LocalFileName)
  29.             break
  30.         finally:
  31.             t1 = time.time()
  32.             print('Took', t1 - t0, 'seconds')    
  33.    
  34. def main(myPath):
  35.     t0 = time.time()
  36.     ThreadsR = requests.get('https://8kun.top/vichan/catalog.json');
  37.     Pages = ThreadsR.json()
  38.     FilesCount = 0
  39.     ThreadCount = 0
  40.     PostCount = 0
  41.    
  42.     with open(myPath + "Data.txt", 'a', encoding='utf-8') as outfile:
  43.         print("-----------------------Vichan-----------------------", file=outfile)
  44.         for Page in Pages:
  45.             Threads = Page['threads']
  46.             for Thread in Threads:
  47.                 print ("|     Tread     | Replies: "  + str(Thread['replies']), file=outfile)
  48.                 ThreadCount += 1
  49.                 ThreadJsonUrl = "https://8kun.top/vichan/res/" + str(Thread['no']) +".json"
  50.                 print(ThreadJsonUrl)
  51.                 PostsR = requests.get(ThreadJsonUrl);
  52.                 PostList = PostsR.json()
  53.                 for Post in PostList["posts"]:
  54.                     print ("|     POST     |:" + str(Post['no']) + " | Time: " + str(Post['time']) + ' | com: '  + str(Post['com']) + '\n', file=outfile)
  55.                     PostCount += 1
  56.                     if 'tim' in Post:
  57.                          GetFile(Post, myPath)
  58.                          FilesCount += 1
  59.     t1 = time.time()
  60.     print("TotalFiles collected: " + str(FilesCount ), file=outfile)
  61.     print("Total threads collected: " + str(FilesCount ), file=outfile)
  62.     print("Total posts collected: " + str(FilesCount ), file=outfile)
  63.     print("Total time: ", t1 - t0, 'seconds', file=outfile)
  64.        
  65.    
  66. myPath = r"C:/Tmp/ChanArchiveOutput/"
  67. main(myPath)
Add Comment
Please, Sign In to add comment