Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import argparse
import json
import logging
import random
import sys
import time
from datetime import datetime
from pathlib import Path

import requests
CURIOUSCAT_BASE_URL = "https://curiouscat.live/api/v2.1/profile"
# Seconds to wait for the API; without a timeout requests.get can block forever.
REQUEST_TIMEOUT_SECONDS = 30
# Folder (relative to the execution directory) that receives the JSON files.
ARCHIVE_DIR_NAME = "archive"

logger = logging.getLogger(__name__)

### Curiouscat profile downloader
# Usage : [script] username
# Will use the curiouscat API to download all of a specific profile's messages
# The resulting json files are stored in a folder named "archive" in the execution directory
#


def build_arg_parser():
    """Return the command line parser: one positional ``username`` argument."""
    # The text is a description of the tool, not a usage synopsis, so it is
    # passed as ``description`` and argparse generates the usage line itself.
    parser = argparse.ArgumentParser(description="Save curiouscat profiles")
    parser.add_argument("username")
    return parser


def build_query_params(username, max_timestamp):
    """Return the query parameters for one page of ``username``'s profile.

    ``max_timestamp`` is the pagination cursor sent to the API.
    """
    return {
        "username": username,
        "max_timestamp": max_timestamp,
        "_ob": "noregisterOrSignin2",
    }


def archive_file_path(username, timestamp):
    """Return the file that stores the page queried with ``timestamp``.

    Files live in ``archive/<username>/`` and are named after the cursor
    rendered as a local-time ``YYYYmmdd-HHMMSS`` string.
    """
    date_string = datetime.fromtimestamp(timestamp).strftime("%Y%m%d-%H%M%S")
    return Path(ARCHIVE_DIR_NAME) / username / f"{username}_archive_{date_string}.json"


def next_max_timestamp(posts, current_timestamp):
    """Return the cursor for the next page, or ``None`` when crawling is over.

    The next "page" is deduced from the timestamp of the last message of the
    current response. Crawling stops when the response holds no posts, or when
    the cursor did not move (the same page would be requested forever).

    Raises:
        KeyError: if the last post has no ``["post"]["timestamp"]`` entry.
    """
    if not posts:
        return None
    candidate = int(posts[-1]["post"]["timestamp"])
    if candidate == current_timestamp:
        return None
    return candidate


def fetch_page(username, max_timestamp):
    """Query the API once and return the decoded JSON body.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
        json.JSONDecodeError: if the body is not valid JSON.
    """
    # Passing ``params`` lets requests URL-encode the username.
    response = requests.get(
        url=CURIOUSCAT_BASE_URL,
        params=build_query_params(username, max_timestamp),
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    return json.loads(response.content)


def save_page(page, file_path):
    """Write one decoded API response to ``file_path`` as indented JSON."""
    with open(file_path, "w", encoding="utf-8") as out_file:
        json.dump(page, out_file, indent=4)


def download_profile(username):
    """Download every page of ``username``'s profile into the archive folder.

    Every response is saved, including the final empty one. Returns ``True``
    when the crawl reached its normal end and ``False`` when a response had
    no ``"posts"`` entry (that response is still saved for inspection).
    """
    # Saving to the archive dir, in a subdirectory named after the username.
    # The directory does not depend on the page, so it is created only once.
    (Path(ARCHIVE_DIR_NAME) / username).mkdir(parents=True, exist_ok=True)

    # Integer cursor from the start, like the timestamps used for later pages.
    max_timestamp = int(datetime.now().timestamp())
    pages_saved = 0

    while True:
        logger.debug("Requesting %s with max_timestamp=%s", username, max_timestamp)
        page = fetch_page(username, max_timestamp)

        file_path = archive_file_path(username, max_timestamp)
        save_page(page, file_path)
        pages_saved += 1
        logger.info("Saved %s", file_path)

        posts = page.get("posts")
        if posts is None:
            logger.error("Response for %s has no 'posts' entry, stopping", username)
            return False

        # Curiouscat returns a limited number of messages per query, so keep
        # asking for older messages until the cursor stops moving.
        cursor = next_max_timestamp(posts, max_timestamp)
        if cursor is None:
            break
        max_timestamp = cursor

        # Randomized pause between two queries.
        time.sleep(1 + random.random())

    logger.info("Done: %d file(s) written for %s", pages_saved, username)
    return True


def main(argv=None):
    """Script entry point; returns the process exit status."""
    logging.basicConfig(level=logging.DEBUG)
    # Username command line argument
    args = build_arg_parser().parse_args(argv)
    return 0 if download_profile(args.username) else 1


if __name__ == "__main__":
    sys.exit(main())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement