Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests, bs4
- from pprint import pprint
- import os
- from multiprocessing.dummy import Pool
def hentlinks():
    """Scrape the AU law exam page and return all exam-paper link hrefs.

    Returns:
        list[str]: the ``href`` of every anchor found inside the ``<li>``
        items of the ``<ul>`` lists under ``div#c1485-default``.

    Raises:
        requests.HTTPError: if the page request returns an error status.
    """
    url = ('http://studerende.au.dk/studier/fagportaler/jur/eksamen/'
           'eksamensopgaver-og-rettevejledninger/1-aar-paa-bacheloruddannelsen/')
    res = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.content, 'lxml')
    poi = soup.find('div', {'id': 'c1485-default'})
    linklist = []
    for ul in poi.find_all('ul'):
        for li in ul.find_all('li'):
            a = li.find('a')
            # Original did li.find('a')['href'] which raises TypeError
            # when a list item has no anchor; skip such items instead.
            if a is not None and a.get('href'):
                linklist.append(a['href'])
    return linklist
def downloader(url):
    """Download one PDF given its site-relative URL and save it to disk.

    Args:
        url: the href scraped by ``hentlinks`` (path portion of the link).

    Side effects:
        Creates ``/home/mathis/pdf_selma`` if missing and writes the PDF
        there under its original file name.

    Raises:
        requests.HTTPError: if the PDF request returns an error status.
    """
    baseurl = 'http://studerende.au.dk/'
    # Avoid a double slash when the scraped href already starts with '/'.
    pdfurl = baseurl.rstrip('/') + '/' + url.lstrip('/')
    filname = os.path.split(url)[1]
    sti = '/home/mathis/pdf_selma'
    pprint(pdfurl)
    pprint(filname)
    # exist_ok removes the check-then-create race: 20 pool threads run this
    # concurrently, and the original "if not exists: mkdir" could raise
    # FileExistsError. (The original also called os.path.isfile() with no
    # argument — a guaranteed TypeError — so that broken line is removed.)
    os.makedirs(sti, exist_ok=True)
    res = requests.get(pdfurl, stream=True, timeout=60)
    res.raise_for_status()
    with open(os.path.join(sti, filname), 'wb') as f:
        # stream=True + chunked writes keeps memory flat for large PDFs;
        # the with-block closes the file (the original's f.close() inside
        # the with was redundant).
        for chunk in res.iter_content(100000):
            f.write(chunk)
    print('hentet og skrevet')
def main():
    """Collect every exam-paper link and download them on 20 worker threads."""
    all_links = hentlinks()
    pprint(all_links)
    workers = Pool(20)
    workers.map(downloader, all_links)
    workers.close()
    workers.join()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement