# kgirls.py

#!/usr/bin/python3

import sys
import os
import requests
from lxml import html
import subprocess
import urllib.parse
import re

# Executable names of the external command-line tools run through subprocess.
# yt-dlp is invoked by download_embed() for embedded players.
YTDL = "yt-dlp"
# aria2c is invoked by download() for direct file downloads.
ARIA2 = "aria2c"

def download(url, name):
    """Fetch ``url`` into the file ``name`` with the aria2c command-line tool.

    aria2c is started with ``-o name`` (output file), ``-x 4`` and ``-s 4``
    (up to four connections / four splits), ``-c`` (continue a partial
    download) and ``-q`` (quiet). Its exit status is not inspected.
    """
    aria2_options = ["-o", name, "-x", "4", "-s", "4", "-c", "-q"]
    subprocess.call([ARIA2] + aria2_options + [url])

def get_name(url):
    """Return a local file name for the resource at ``url``.

    A HEAD request is tried first; when the response carries a
    ``Content-Disposition`` header with a quoted ``filename="..."``
    parameter, that value is used. Otherwise (no such header, or the request
    failed for any reason) the name is taken from the last path segment of
    ``url`` with the query string removed.

    In both cases the name is percent-decoded and reduced to its final path
    component, so a remote-supplied value such as ``../../x`` cannot make the
    downloader write outside the working directory.

    Args:
        url: Absolute URL of the file to be downloaded.

    Returns:
        The derived file name as a string (may be empty if the URL ends
        with ``/``).
    """
    try:
        # A timeout keeps one unresponsive host from stalling the whole crawl.
        r = requests.head(url, timeout=30)
        disposition = r.headers.get('content-disposition') or ''
        f = re.search(r'filename="(.*?)"', disposition, re.I)
        if f:
            header_name = os.path.basename(urllib.parse.unquote(f.group(1)))
            if header_name:
                return header_name
    except Exception:
        # Best effort only: any failure of the HEAD request simply means the
        # name is derived from the URL below. KeyboardInterrupt and
        # SystemExit are deliberately not caught.
        pass

    # Drop the query string BEFORE locating the last segment and decoding it,
    # so that a "/" inside the query or an encoded "?" (%3F) inside the file
    # name cannot corrupt the result.
    path = url.split('?', 1)[0]
    segment = path.rsplit('/', 1)[-1]
    return os.path.basename(urllib.parse.unquote(segment))

def _fname_target(url):
    """Return the percent-decoded value following ``fname=`` in ``url``, or None."""
    match = re.search(r'fname=(.*)', url)
    if match is None:
        return None
    return urllib.parse.unquote(match.group(1))


def process_url(url):
    """Rewrite a thumbnail/resized image URL to the URL of the original file.

    Rules, checked in this order:

    1. URL mentions both ``.kakaocdn.net`` and ``img1.daumcdn.net``: return
       the decoded ``fname=`` target as-is.
    2. ``.daumcdn.net/cfile/tistory`` URL without ``original`` in it: append
       ``?original``.
    3. ``.daumcdn.net/thumb`` URL: decode the ``fname=`` target and process
       that URL again with the same rules.
    4. ``uf.tistory.com`` URL: replace ``image`` with ``original``.

    A URL matching none of the rules is returned unchanged. If rule 1 or 3
    matches but the URL carries no ``fname=`` parameter, the rule is skipped
    instead of raising ``AttributeError``.

    Args:
        url: Source URL taken from the page.

    Returns:
        The rewritten URL string.
    """
    if '.kakaocdn.net' in url and 'img1.daumcdn.net' in url:
        target = _fname_target(url)
        if target is not None:
            return target
    if '.daumcdn.net/cfile/tistory' in url and 'original' not in url:
        return url + "?original"
    if '.daumcdn.net/thumb' in url:
        target = _fname_target(url)
        if target is not None:
            # The wrapped URL may itself need one of the rewrites above.
            return process_url(target)
    if 'uf.tistory.com' in url:
        return url.replace('image', 'original')
    return url

def download_file(url, prefix, n):
    """Download one media file, naming it ``<prefix><n>_<remote name>``.

    The URL is first passed through process_url(); the rewritten URL is
    printed and then handed to download() together with the local name.
    """
    source = process_url(url)
    remote_name = get_name(source)
    target = "".join([prefix, str(n), "_", remote_name])
    print(source)
    download(source, target)

def download_embed(url):
    """Pass an embedded-player URL to the yt-dlp command-line tool.

    yt-dlp is run with no extra options and its exit status is not inspected.
    """
    command = [YTDL, url]
    subprocess.call(command)

def get_post(url):
    """Download every image, video source and embedded player of one post.

    The page is fetched with the shared session ``s``. ``src`` attributes of
    ``img`` and ``video>source`` elements inside ``div.xe_content`` are
    downloaded with download_file(); ``iframe`` sources are passed to
    download_embed(). File names are prefixed with the last two path
    components of the page's canonical link.

    Posts without a canonical link or without a ``div.xe_content`` block are
    reported and skipped so that one unusual page does not abort the crawl.

    Args:
        url: Absolute URL of the post page.
    """
    print("Loading post {}".format(url))
    site = s.get(url)
    tree = html.fromstring(site.text)
    # Same treatment as the category pages: turn relative and
    # protocol-relative ("//host/...") links into complete URLs, because the
    # external downloaders cannot resolve them.
    tree.make_links_absolute(url)

    canonical = tree.cssselect('link[rel=canonical]')
    href = canonical[0].get('href') if canonical else None
    bodies = tree.cssselect('div.xe_content')
    if not href or not bodies:
        print("Skipping post {}: canonical link or content block not found".format(url))
        return

    link = href.split('/')
    prefix = link[-2] + "_" + link[-1] + "_"
    content = bodies[0]

    # Images first, then video sources; elements without a src are ignored.
    media = content.cssselect('img') + content.cssselect('video>source')
    files = [el.get('src') for el in media if el.get('src')]
    embeds = [fr.get('src') for fr in content.cssselect('iframe') if fr.get('src')]

    total = len(files) + len(embeds)

    # One running counter across files and embeds, starting at 1.
    for n, fi in enumerate(files, start=1):
        print("[{}/{}]".format(n, total))
        download_file(fi, prefix, n)
    for n, em in enumerate(embeds, start=len(files) + 1):
        print("[{}/{}]".format(n, total))
        download_embed(em)

# Shared HTTP session used for fetching category and post pages; it presents
# a desktop Firefox User-Agent on every request.
s = requests.Session()
s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'

# The first command-line argument is the category page to start from.
if len(sys.argv) < 2:
    sys.exit("usage: {} CATEGORY_URL".format(sys.argv[0]))
url = sys.argv[1]

# Walk the category pages one after another: process every post linked from
# the current page, then follow the "Next" pagination link until none is left.
while url is not None:
    print("Loading category page {}".format(url))
    site = s.get(url)
    tree = html.fromstring(site.text)
    tree.make_links_absolute(url)
    for li in tree.cssselect("ul#tmb_lst>li a.hx"):
        get_post(li.get('href'))

    # d.text is None when the anchor starts with a child element, so guard
    # the substring test. No matching link ends the loop.
    url = next(
        (d.get('href') for d in tree.cssselect('a.direction')
         if "Next" in (d.text or "")),
        None,
    )