Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- @author: Anon
- Changelog:
- 2020/08/02: Added ?_=RANDOM to get not recoded file version / Added timeout and retries request function / Thread level
- 2020/08/03: Added Post level data scraping (com & photos)
- 2020/08/03: Added file index creation, validation before downloading file based on index, some minor improvements
- """
import json
import os
import random
import socket
import time
import urllib.request
from datetime import datetime

import requests

from CreateMD5Index import CreatePB64MD5Index, IsInIndex
def GetFile(Thread, myPath):
    """Download the media file attached to one post into *myPath*.

    Thread: a post dict from the 8kun JSON API; must contain the keys
    "tim", "ext", "filename" and "md5" (the caller only invokes this
    when "tim" is present).
    Skips the download when the file's MD5 is already in the local index.
    Retries up to 20 times because the media host is flaky; each attempt's
    elapsed time since entry is printed in the `finally` clause.
    """
    socket.setdefaulttimeout(150)
    t0 = time.time()
    # Bug fix: the original appended the literal string "?_=RANDOM", which is
    # a constant and defeats the stated purpose (changelog 2020/08/02: use a
    # random cache-buster so the server returns the original, not a recoded,
    # file version). Use an actual random number instead.
    file_url = ("https://media.8kun.top/file_store/"
                + str(Thread["tim"]) + str(Thread["ext"])
                + "?_=" + str(random.randint(0, 2 ** 31)))
    local_name = str(Thread["filename"]) + str(Thread["ext"])
    full_path = os.path.join(myPath, local_name)
    print('Web checksum: ' + str(Thread["md5"]))
    if not IsInIndex(str(Thread["md5"])):
        for attempt in range(20):
            # `attempt` replaces the original redundant manual counter `i`.
            print("Attempt #" + str(attempt + 1))
            try:
                urllib.request.urlretrieve(file_url, full_path)
            except Exception as x:
                # Best-effort retry on any failure (timeouts, HTTP errors).
                print('It failed :(', x.__class__.__name__)
                print('Failed url: ' + file_url)
            else:
                print('Downloaded: ' + local_name)
                break
            finally:
                t1 = time.time()
                print('Took', t1 - t0, 'seconds')
def main(myPath):
    """Scrape the whole /vichan/ catalog.

    Fetches the board catalog JSON, then every thread's posts JSON; appends
    a textual log of every thread/post to Data.txt and downloads every
    attached file into *myPath* via GetFile. Prints summary counters at the
    end.
    """
    t0 = time.time()
    catalog_resp = requests.get('https://8kun.top/vichan/catalog.json')
    pages = catalog_resp.json()
    files_count = 0
    thread_count = 0
    post_count = 0
    with open(r"C:\Tmp\Data.txt", 'a', encoding='utf-8') as outfile:
        print("-----------------------Vichan-----------------------", file=outfile)
        for page in pages:
            for thread in page['threads']:
                print("**********************************************************", file=outfile)
                print("| Tread | Replies: " + str(thread['replies']), file=outfile)
                thread_count += 1
                thread_json_url = "https://8kun.top/vichan/res/" + str(thread['no']) + ".json"
                print(thread_json_url)
                posts_resp = requests.get(thread_json_url)
                try:
                    post_list = posts_resp.json()
                    for post in post_list["posts"]:
                        ts = datetime.utcfromtimestamp(int(post['time'])).strftime('%Y-%m-%d %H:%M:%S')
                        # Robustness: posts without a comment body have no 'com'
                        # key; the original Post['com'] raised an uncaught
                        # KeyError and aborted the whole run.
                        print("| POST |:" + str(post['no']) + " | Time: " + str(ts)
                              + ' | com: ' + str(post.get('com', '')) + '\n', file=outfile)
                        post_count += 1
                        if 'tim' in post:
                            GetFile(post, myPath)
                            files_count += 1
                except (ValueError, ConnectionError, AssertionError,
                        requests.exceptions.RequestException):
                    # requests' ConnectionError does NOT subclass the builtin
                    # ConnectionError, so catch RequestException explicitly too.
                    print('Couldnt get posts list for URL:' + thread_json_url)
    t1 = time.time()
    # Bug fix: the original printed FilesCount for all three totals.
    print("TotalFiles collected: " + str(files_count))
    print("Total threads collected: " + str(thread_count))
    print("Total posts collected: " + str(post_count))
    print("Total time: ", t1 - t0, 'seconds')
# Script entry point: build the MD5 index of already-downloaded files first,
# so GetFile can skip duplicates, then scrape the catalog.
# Guarded so importing this module does not trigger a full scrape.
if __name__ == "__main__":
    myPath = r"C:/Tmp/ChanArchiveOutput/"
    CreatePB64MD5Index(myPath)
    main(myPath)
Add Comment
Please, Sign In to add comment