Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- @author: Anon
- Changelog:
- 2020/08/02: Added ?_=RANDOM to get not recoded file version / Added timeout and retries request function / Thread level
- 2020/08/03: Added Post level data scraping (com & photos)
- 2020/08/03: Added file index creation, validation before downloading file based on index, some minor improvements
- """
import json
import os
import random
import socket
import time
import urllib.request
from datetime import datetime

import requests

from CreateMD5Index import CreatePB64MD5Index, IsInIndex
def GetFile(Thread, myPath):
    """Download the media file attached to one post into *myPath*.

    Thread: a post dict from the 8kun JSON API; must contain the keys
    "tim", "ext", "filename" and "md5" (the caller only invokes this
    when "tim" is present).
    Skips the download when the file's MD5 is already in the local index.
    Retries up to 20 times because the media host is flaky; each attempt's
    elapsed time since entry is printed in the `finally` clause.
    """
    socket.setdefaulttimeout(150)
    t0 = time.time()
    # Bug fix: the original appended the literal string "?_=RANDOM", which is
    # a constant and defeats the stated purpose (changelog 2020/08/02: use a
    # random cache-buster so the server returns the original, not a recoded,
    # file version). Use an actual random number instead.
    file_url = ("https://media.8kun.top/file_store/"
                + str(Thread["tim"]) + str(Thread["ext"])
                + "?_=" + str(random.randint(0, 2 ** 31)))
    local_name = str(Thread["filename"]) + str(Thread["ext"])
    full_path = os.path.join(myPath, local_name)
    print('Web checksum: ' + str(Thread["md5"]))
    if not IsInIndex(str(Thread["md5"])):
        for attempt in range(20):
            # `attempt` replaces the original redundant manual counter `i`.
            print("Attempt #" + str(attempt + 1))
            try:
                urllib.request.urlretrieve(file_url, full_path)
            except Exception as x:
                # Best-effort retry on any failure (timeouts, HTTP errors).
                print('It failed :(', x.__class__.__name__)
                print('Failed url: ' + file_url)
            else:
                print('Downloaded: ' + local_name)
                break
            finally:
                t1 = time.time()
                print('Took', t1 - t0, 'seconds')
def main(myPath):
    """Scrape the whole /vichan/ catalog.

    Fetches the board catalog JSON, then every thread's posts JSON; appends
    a textual log of every thread/post to Data.txt and downloads every
    attached file into *myPath* via GetFile. Prints summary counters at the
    end.
    """
    t0 = time.time()
    catalog_resp = requests.get('https://8kun.top/vichan/catalog.json')
    pages = catalog_resp.json()
    files_count = 0
    thread_count = 0
    post_count = 0
    with open(r"C:\Tmp\Data.txt", 'a', encoding='utf-8') as outfile:
        print("-----------------------Vichan-----------------------", file=outfile)
        for page in pages:
            for thread in page['threads']:
                print("**********************************************************", file=outfile)
                print("| Tread | Replies: " + str(thread['replies']), file=outfile)
                thread_count += 1
                thread_json_url = "https://8kun.top/vichan/res/" + str(thread['no']) + ".json"
                print(thread_json_url)
                posts_resp = requests.get(thread_json_url)
                try:
                    post_list = posts_resp.json()
                    for post in post_list["posts"]:
                        ts = datetime.utcfromtimestamp(int(post['time'])).strftime('%Y-%m-%d %H:%M:%S')
                        # Robustness: posts without a comment body have no 'com'
                        # key; the original Post['com'] raised an uncaught
                        # KeyError and aborted the whole run.
                        print("| POST |:" + str(post['no']) + " | Time: " + str(ts)
                              + ' | com: ' + str(post.get('com', '')) + '\n', file=outfile)
                        post_count += 1
                        if 'tim' in post:
                            GetFile(post, myPath)
                            files_count += 1
                except (ValueError, ConnectionError, AssertionError,
                        requests.exceptions.RequestException):
                    # requests' ConnectionError does NOT subclass the builtin
                    # ConnectionError, so catch RequestException explicitly too.
                    print('Couldnt get posts list for URL:' + thread_json_url)
    t1 = time.time()
    # Bug fix: the original printed FilesCount for all three totals.
    print("TotalFiles collected: " + str(files_count))
    print("Total threads collected: " + str(thread_count))
    print("Total posts collected: " + str(post_count))
    print("Total time: ", t1 - t0, 'seconds')
# Script entry point: build the MD5 index of already-downloaded files first,
# so GetFile can skip duplicates, then scrape the catalog.
# Guarded so importing this module does not trigger a full scrape.
if __name__ == "__main__":
    myPath = r"C:/Tmp/ChanArchiveOutput/"
    CreatePB64MD5Index(myPath)
    main(myPath)
Add Comment
Please, Sign In to add comment