Advertisement
Guest User

kgirls.py

a guest
May 21st, 2022
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.88 KB | None | 0 0
  1. #!/usr/bin/python3
  2.  
  3. import sys
  4. import os
  5. import requests
  6. from lxml import html
  7. import subprocess
  8. import urllib.parse
  9. import re
  10.  
  11. YTDL = "yt-dlp"
  12. ARIA2 = "aria2c"
  13.  
  14. def download(url, name):
  15.     subprocess.call([ARIA2, "-o", name, "-x", "4", "-s", "4", "-c", "-q", url])
  16.  
  17. def get_name(url):
  18.     try:
  19.         r = requests.head(url)
  20.         f = re.search(r'filename="(.*?)"', r.headers.get('content-disposition'), re.I)
  21.         return urllib.parse.unquote(f.group(1))
  22.     except:
  23.         filename = urllib.parse.unquote(os.path.split(url)[-1])
  24.         filename = filename.rsplit('?')[0]
  25.         return filename
  26.  
  27. def process_url(url):
  28.     if '.kakaocdn.net' in url and 'img1.daumcdn.net' in url:
  29.         return urllib.parse.unquote(re.search(r'fname=(.*)', url).group(1))
  30.     if '.daumcdn.net/cfile/tistory' in url and not 'original' in url:
  31.         return url+"?original"
  32.     if '.daumcdn.net/thumb' in url:
  33.         return process_url(urllib.parse.unquote(re.search(r'fname=(.*)', url).group(1)))
  34.     if 'uf.tistory.com' in url:
  35.         return url.replace('image', 'original')
  36.     return url
  37.  
  38. def download_file(url, prefix, n):
  39.     url = process_url(url)
  40.     name = prefix + str(n) + "_" + get_name(url)
  41.     print(url)
  42.     download(url, name)
  43.  
  44. def download_embed(url):
  45.     subprocess.call([YTDL, url])
  46.  
  47. def get_post(url):
  48.     print("Loading post {}".format(url))
  49.     site = s.get(url)
  50.     tree = html.fromstring(site.text)
  51.     link = tree.cssselect('link[rel=canonical]')[0]
  52.     link = link.get('href').split('/')
  53.     prefix = link[-2]+"_"+link[-1]+"_"
  54.     content = tree.cssselect('div.xe_content')[0]
  55.     images = content.cssselect('img')
  56.     files = []
  57.     embeds = []
  58.     for img in images:
  59.         files.append(img.get('src'))
  60.     videos = content.cssselect('video>source')
  61.     for v in videos:
  62.         files.append(v.get('src'))
  63.     iframes = content.cssselect('iframe')
  64.     for iframe in iframes:
  65.         embeds.append(iframe.get('src'))
  66.  
  67.     total = len(files) + len(embeds)
  68.  
  69.     n = 1
  70.     for fi in files:
  71.         print("[{}/{}]".format(n, total))
  72.         download_file(fi, prefix, n)
  73.         n += 1
  74.     for em in embeds:
  75.         print("[{}/{}]".format(n, total))
  76.         download_embed(em)
  77.         n += 1
  78.  
  79. s = requests.Session()
  80. s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'})
  81.  
  82. url = sys.argv[1]
  83.  
  84. while url is not None:
  85.     print("Loading category page {}".format(url))
  86.     site = s.get(url)
  87.     tree = html.fromstring(site.text)
  88.     tree.make_links_absolute(url)
  89.     links = tree.cssselect("ul#tmb_lst>li a.hx")
  90.     for li in links:
  91.         #print(li.get('href'))
  92.         get_post(li.get('href'))
  93.  
  94.     di = tree.cssselect('a.direction')
  95.     for d in di:
  96.         if "Next" in d.text:
  97.             url = d.get('href')
  98.             break
  99.     else:
  100.         url = None
  101.  
  102.  
  103.     #url = None
  104.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement