Guest User

Untitled

a guest
Aug 3rd, 2020
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.35 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: Anon
  4.  
  5. Changelog:
  6. 2020/08/02: Added ?_=RANDOM to the file URL to get the non-recoded file version / Added timeout and retries to the request function / Thread level
  7. 2020/08/03: Added post-level data scraping (com & photos)
  8. 2020/08/03: Added file index creation, validation before downloading file based on index, some minor improvements
  9. """
  10. import json, requests, urllib.request, time, socket, os
  11. from datetime import datetime
  12. from CreateMD5Index import CreatePB64MD5Index, IsInIndex
  13.  
  14. def GetFile(Thread, myPath):
  15.     socket.setdefaulttimeout(150)
  16.     t0 = time.time()
  17.     Filepath = ("https://media.8kun.top/file_store/" + str(Thread["tim"]) + str(Thread['ext']+ "?_=RANDOM"))
  18.     LocalFileName = str(Thread['filename']) + str(Thread['ext'])
  19.     fullfilename = os.path.join(myPath , LocalFileName)
  20.     i = 0
  21.     print('Web checksum: ' + str(Thread["md5"]) )
  22.     if not IsInIndex(str(Thread["md5"])):
  23.         for attempt in range(20):
  24.             i += 1
  25.             print("Attempt #" +str(i))
  26.             try:
  27.                  urllib.request.urlretrieve(Filepath, fullfilename)
  28.             except Exception as x:
  29.                 print('It failed :(', x.__class__.__name__)
  30.                 print('Failed url: ' + Filepath)
  31.             else:
  32.                 print('Downloaded: ' + LocalFileName)
  33.                 break
  34.             finally:
  35.                 t1 = time.time()
  36.                 print('Took', t1 - t0, 'seconds')    
  37.    
  38. def main(myPath):
  39.     t0 = time.time()
  40.     ThreadsR = requests.get('https://8kun.top/vichan/catalog.json');
  41.     Pages = ThreadsR.json()
  42.     FilesCount = 0
  43.     ThreadCount = 0
  44.     PostCount = 0
  45.    
  46.     with open(r"C:\Tmp\Data.txt", 'a', encoding='utf-8') as outfile:
  47.         print("-----------------------Vichan-----------------------", file=outfile)
  48.         for Page in Pages:
  49.             Threads = Page['threads']
  50.             for Thread in Threads:
  51.                 print ("**********************************************************", file=outfile)
  52.                 print ("| Tread | Replies: "  + str(Thread['replies']), file=outfile)
  53.                 ThreadCount += 1
  54.                 ThreadJsonUrl = "https://8kun.top/vichan/res/" + str(Thread['no']) +".json"
  55.                 print(ThreadJsonUrl)
  56.                 PostsR = requests.get(ThreadJsonUrl);
  57.                 try:
  58.                     PostList = PostsR.json()
  59.                     for Post in PostList["posts"]:
  60.                         ts = int(Post['time'])
  61.                         ts = datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
  62.                         print ("| POST  |:" + str(Post['no']) + " | Time: " + str(ts) + ' | com: '  + str(Post['com']) + '\n', file=outfile)
  63.                         PostCount += 1
  64.                         if 'tim' in Post:
  65.                              GetFile(Post, myPath)
  66.                              FilesCount += 1
  67.                 except (ValueError, ConnectionError, AssertionError):
  68.                     print('Couldnt get posts list for URL:' + ThreadJsonUrl)
  69.     t1 = time.time()
  70.     print("TotalFiles collected: " + str(FilesCount))
  71.     print("Total threads collected: " + str(FilesCount))
  72.     print("Total posts collected: " + str(FilesCount))
  73.     print("Total time: ", t1 - t0, 'seconds')
  74.        
# Script entry point: these statements run whenever the module is executed or imported.
# CreatePB64MD5Index is imported from the project-local CreateMD5Index module;
# presumably it builds the checksum index of files already in myPath that
# IsInIndex consults before each download — TODO confirm against that module.
myPath = r"C:/Tmp/ChanArchiveOutput/"
CreatePB64MD5Index(myPath)
main(myPath)
Add Comment
Please, Sign In to add comment