Advertisement
Tosiaki

Download twitter media server

Sep 14th, 2023
5,276
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.89 KB | None | 0 0
import asyncio
import re
from pathlib import Path
from urllib.parse import urlsplit

import aiofiles
import requests
from httpx import AsyncClient
from tqdm.asyncio import tqdm_asyncio

from twitter.scraper import Scraper
from twitter.util import find_key

# NOTE(review): EMAIL, USERNAME and PASSWORD are not defined in this file --
# presumably placeholders to be filled in before running; confirm.
email, username, password = EMAIL, USERNAME, PASSWORD
scraper = Scraper(email, username, password)
  16. def already_downloadeds(username):
  17.     test_path = Path('media') / username
  18.     downloaded_ids = set()
  19.    
  20.     for test_item in test_path.iterdir():
  21.         result = re.match('i_status_(\d+)_.*\.(jpg|png)', test_item.name)
  22.         downloaded_ids.add(result.groups()[0])
  23.        
  24.     return downloaded_ids
  25.    
  26. def download_media(tweets, username, after_id=None):
  27.     urls = []
  28.  
  29.     out = Path('media') / username
  30.     out.mkdir(exist_ok = True)
  31.  
  32.     downloaded_ids = already_downloadeds(username)
  33.  
  34.     if not after_id:
  35.         response = requests.get(f'http://{tracking_server_ip}:8080/twitter-user?username={username}')
  36.         after_id = response.json()['lastTweetDownloaded']
  37.    
  38.     for tweet in tweets:
  39.         tweet_id = find_key(tweet, 'id_str')[0]
  40.  
  41.         if not tweet_id in downloaded_ids and int(tweet_id) > after_id:
  42.             url = f'https://twitter.com/i/status/{tweet_id}'
  43.             media = [y for x in find_key(tweet, 'media') for y in x]
  44.            
  45.             photo_urls = list({u for m in media if 'ext_tw_video_thumb' not in (u := m['media_url_https'])})
  46.             [urls.append([url, photo]) for photo in photo_urls]
  47.             downloaded_ids.add(tweet_id)
  48.    
  49.     chunk_size = 8192
  50.  
  51.     print(f"Downloading {len(urls)} items")
  52.  
  53.     async def process():
  54.         async with AsyncClient(headers=scraper.session.headers, cookies=scraper.session.cookies) as client:
  55.             tasks = (download(client, x, y) for x, y in urls)
  56.             if scraper.pbar:
  57.                 return await tqdm_asyncio.gather(*tasks, desc='Downloading media')
  58.             return await asyncio.gather(*tasks)
  59.    
  60.     async def download(client: AsyncClient, post_url: str, cdn_url: str) -> None:
  61.         name = urlsplit(post_url).path.replace('/', '_')[1:]
  62.         ext = urlsplit(cdn_url).path.split('/')[-1]
  63.         try:
  64.             r = await client.get(cdn_url)
  65.             async with aiofiles.open(out / f'{name}_{ext}', 'wb') as fp:
  66.                 for chunk in r.iter_bytes(chunk_size=chunk_size):
  67.                      await fp.write(chunk)
  68.         except Exception as e:
  69.              scraper.logger.error(f'[{RED}error{RESET}] Failed to download media: {post_url} {e}')
  70.    
  71.     asyncio.run(process())
  72.     latest_downloaded = max([int(download_id) for download_id in downloaded_ids])
  73.     print(f"Latest downloaded is {latest_downloaded}")
  74.  
  75.     requests.post(f'http://{tracking_server_ip}:8080/twitter-user-done', json={
  76.         'username': username,
  77.         'lastId': latest_downloaded
  78.     })
  79.  
  80. def entries_of_media(media_result):
  81.     entries = media_result['data']['user']['result']['timeline_v2']['timeline']['instructions'][0]['entries']
  82.     return [entry for entry in entries if entry['content']['entryType'] == 'TimelineTimelineItem']
  83.  
  84. def user_entries(username, after_id = None):
  85.     users = scraper.users([username])
  86.     user_id = users[0]['data']['user']['result']['rest_id']
  87.     media_results = scraper.media([user_id])
  88.     all_entries = [entry for media_result in media_results for entry in entries_of_media(media_result)]
  89.  
  90.     return all_entries
  91.  
  92. from fastapi import FastAPI
  93. from pydantic import BaseModel
  94.  
class UserModel(BaseModel):
    """Request body for POST /download."""

    # Twitter handle to archive -- presumably without a leading '@'; confirm.
    username: str

app = FastAPI()
  99.  
  100. @app.post("/download")
  101. async def download_user(json: UserModel):
  102.     username = json.username
  103.     media_entries = user_entries(username)
  104.     download_media(media_entries, username)
  105.     return f'{username} downloaded'
  106.  
  107. import asyncio
  108. import uvicorn
  109.  
  110. config = uvicorn.Config(app)
  111. server = uvicorn.Server(config)
  112. await server.serve()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement