Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python3
- import sys
- import os
- import requests
- from lxml import html
- import subprocess
- import urllib.parse
- import re
# Name of the executable that download_embed() runs for embedded (iframe) media.
YTDL = "yt-dlp"
# Name of the executable that download() runs for direct file URLs.
ARIA2 = "aria2c"
def download(url, name):
    """Fetch *url* into the file *name* by running the ARIA2 executable.

    The command line is: output name (``-o``), ``-x 4 -s 4`` (up to four
    connections / pieces), ``-c`` (continue a partial file) and ``-q``
    (quiet).  The exit status of the process is discarded.
    """
    command = [ARIA2, "-o", name]
    command += ["-x", "4", "-s", "4"]
    command += ["-c", "-q"]
    command.append(url)
    subprocess.call(command)
def get_name(url):
    """Return a local file name for the resource at *url*.

    A HEAD request is tried first; when the response carries a
    ``Content-Disposition`` header with a ``filename="..."`` value, that
    value is used.  If the request fails or no such value is present, the
    name is taken from the text after the last ``/`` of *url*, with the
    query string cut off.

    In both cases the name is percent-decoded and any directory components
    are dropped, so the result never contains a path separator.
    """
    name = None
    try:
        # A timeout keeps one unresponsive host from stalling the whole run.
        response = requests.head(url, timeout=30)
        disposition = response.headers.get('content-disposition') or ''
        match = re.search(r'filename="(.*?)"', disposition, re.I)
        if match:
            name = urllib.parse.unquote(match.group(1))
    except Exception:
        # Best effort only: any failure of the lookup falls back to the URL.
        # (Unlike a bare ``except``, Ctrl-C still interrupts the program.)
        name = None

    if name is None:
        # Cut the query string *before* decoding so that an encoded "?"
        # (%3F) inside the file name does not truncate it.
        segment = url.rsplit('/', 1)[-1].split('?', 1)[0]
        name = urllib.parse.unquote(segment)

    # The name may come from a remote header or contain a decoded %2F; keep
    # only the final component so it cannot point outside the target folder.
    return re.split(r'[\\/]', name)[-1]
def process_url(url):
    """Rewrite a thumbnail/preview URL into the URL of the original file.

    Rules, checked in order:

    1. An ``img1.daumcdn.net`` URL that wraps a ``.kakaocdn.net`` URL in its
       ``fname=`` parameter -> the decoded ``fname`` value.
    2. A ``.daumcdn.net/cfile/tistory`` URL that does not contain
       ``original`` -> the same URL with ``?original`` appended.
    3. A ``.daumcdn.net/thumb`` URL -> the decoded ``fname`` value, itself
       run through these rules again.
    4. A ``uf.tistory.com`` URL -> ``image`` replaced by ``original``.

    Anything else, including a thumbnail URL that has no ``fname=``
    parameter, is returned unchanged.
    """
    # Both wrapper rules need the fname parameter; look it up once and treat
    # a missing parameter as "rule does not apply" instead of crashing.
    wrapped = re.search(r'fname=(.*)', url)

    if wrapped and '.kakaocdn.net' in url and 'img1.daumcdn.net' in url:
        return urllib.parse.unquote(wrapped.group(1))
    if '.daumcdn.net/cfile/tistory' in url and 'original' not in url:
        return url + "?original"
    if wrapped and '.daumcdn.net/thumb' in url:
        # The wrapped URL may itself need rewriting (e.g. a cfile URL).
        return process_url(urllib.parse.unquote(wrapped.group(1)))
    if 'uf.tistory.com' in url:
        return url.replace('image', 'original')
    return url
def download_file(url, prefix, n):
    """Download one file of a post.

    *url* is first rewritten by process_url(); the local file name is
    ``prefix`` + ``n`` + ``_`` + the name returned by get_name().  The
    rewritten URL is printed before the download starts.

    A missing or empty *url* (for example an ``<img>`` element that has no
    ``src`` attribute) is skipped with a message instead of raising.
    """
    if not url:
        print("Skipping item {}: no URL".format(n))
        return
    url = process_url(url)
    name = "{}{}_{}".format(prefix, n, get_name(url))
    print(url)
    download(url, name)
def download_embed(url):
    """Download embedded media at *url* by running the YTDL executable.

    A missing or empty *url* (an ``<iframe>`` without ``src``) is ignored.
    """
    if not url:
        return
    # The URL is scraped from a web page: "--" ends option parsing so a value
    # that starts with "-" is treated as a URL and never as a command-line
    # option of the downloader.
    subprocess.call([YTDL, "--", url])
def get_post(url):
    """Download every image, video and embed found in the post at *url*.

    The page is fetched through the module-level session ``s``.  Files are
    named with a prefix built from the last two path segments of the page's
    canonical link (or of *url* itself when the page has none).  Inside the
    first ``div.xe_content`` element, ``<img>`` and ``<video><source>``
    sources go to download_file() and ``<iframe>`` sources to
    download_embed().  A ``[current/total]`` line is printed before each item.

    A page without a ``div.xe_content`` element is reported and skipped.
    """
    print("Loading post {}".format(url))
    site = s.get(url)
    tree = html.fromstring(site.text)
    # Relative and protocol-relative src values cannot be handed to the
    # external downloaders; resolve them against the page URL first.
    tree.make_links_absolute(url)

    canonical = tree.cssselect('link[rel=canonical]')
    href = (canonical[0].get('href') if canonical else None) or url
    prefix = "_".join(href.split('/')[-2:]) + "_"

    bodies = tree.cssselect('div.xe_content')
    if not bodies:
        print("No content found in {}".format(url))
        return
    content = bodies[0]

    # Images first, then videos, as one numbered sequence.  Elements without
    # a src attribute carry nothing to download and are left out.
    media = content.cssselect('img') + content.cssselect('video>source')
    files = [el.get('src') for el in media if el.get('src')]
    embeds = [el.get('src') for el in content.cssselect('iframe')
              if el.get('src')]

    total = len(files) + len(embeds)
    for n, fi in enumerate(files, 1):
        print("[{}/{}]".format(n, total))
        download_file(fi, prefix, n)
    # Embeds continue the numbering where the files stopped.
    for n, em in enumerate(embeds, len(files) + 1):
        print("[{}/{}]".format(n, total))
        download_embed(em)
# HTTP session shared by the category loop below and by get_post(), which
# reads this module-level name.  A browser User-Agent is sent with each request.
s = requests.Session()
s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'})

if len(sys.argv) < 2:
    sys.exit("usage: {} CATEGORY_URL".format(sys.argv[0]))

# Walk the category listing page by page, starting at the URL given on the
# command line, and download every post linked from each page.
url = sys.argv[1]
while url is not None:
    print("Loading category page {}".format(url))
    site = s.get(url)
    tree = html.fromstring(site.text)
    tree.make_links_absolute(url)
    for li in tree.cssselect("ul#tmb_lst>li a.hx"):
        get_post(li.get('href'))
    # Follow the pagination link whose text contains "Next"; when there is
    # none the for/else ends the outer loop.
    for d in tree.cssselect('a.direction'):
        # d.text is None when the anchor starts with a child element.
        if "Next" in (d.text or ""):
            url = d.get('href')
            break
    else:
        url = None
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement