Advertisement
Guest User

Tübitak Dergileri

a guest
Mar 29th, 2020
66
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.30 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup as Soup
  3. import re
  4. from pathlib import Path
  5. import os
  6.  
  7. session = requests.session()
  8. session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 Edg/80.0.361.69'
  9. requests.packages.urllib3.disable_warnings()
  10. session.verify=False
  11.  
  12. def get_session_cookie(session=session):
  13.     url = 'https://services.tubitak.gov.tr/edergi/user/index.jsp'
  14.     session.get(url, verify=False)
  15.  
  16. def downloader(url, filename):
  17.     path = Path(filename)
  18.     path.parent.mkdir(parents=True, exist_ok=True)
  19.     with session.get(url, stream=True) as r:
  20.         r.raise_for_status()
  21.         with open(path, 'wb') as f:
  22.             for chunk in r.iter_content(chunk_size=8*2**10):
  23.                 if chunk:
  24.                     f.write(chunk)
  25.  
  26. def download_pdf(page_link):
  27.     res = session.get(page_link)
  28.     soup = Soup(res.text, 'html.parser')
  29.     magazine_name = soup.find('font', {'class':'DergiAdi'})
  30.     info = magazine_name.parent.next_sibling.get_text()
  31.     info = re.match('(?P<ay>\w+) +(?P<yil>\d{4}).*? (?P<sayi>\d+)', info.strip())
  32.     ay, yil, sayi = info.groups()
  33.     magazine_name = magazine_name.get_text()
  34.     pdf_name = f'tubitak/{magazine_name}/{magazine_name}-{yil}-{ay}-{sayi}.pdf'
  35.     pdf_link = f'https://services.tubitak.gov.tr/edergi/user/{soup.find("a").get("href")}'
  36.     downloader(pdf_link, pdf_name)
  37.     print(f'{pdf_name} indirildi')
  38.  
  39. def index(year, magazine, session=session):
  40.     url = 'https://services.tubitak.gov.tr/edergi/user/yilList1.jsp'
  41.     data = {'submitButton': '', 'yil':year, 'dergiKodu':magazine}
  42.     res = session.post(url, data=data)
  43.     soup = Soup(res.text, 'html.parser')
  44.     return soup
  45.  
  46. def get_year(year, magazine, session=session):
  47.     soup = index(year, magazine, session)
  48.     links = soup.find_all('a')
  49.     links = [f'https://services.tubitak.gov.tr/edergi/user/{link.get("href")}' for link in links]
  50.     return links
  51.  
  52. def get_years(magazine, session=session):
  53.     soup = index(2020, magazine, session)
  54.     year_select = soup.find('select', {'name': 'yil'})
  55.     years = [option.get('value') for option in year_select.find_all('option')]
  56.     return years
  57.  
  58. def magazine_downloader(magazine):
  59.     years = get_years(magazine)
  60.     downloaded_path = f'tubitak/downloaded{magazine}.txt'
  61.     downloaded = []
  62.     if os.path.exists(downloaded_path):
  63.         with open(downloaded_path) as f:
  64.             downloaded = f.read().strip().split('\n')
  65.     for year in years:
  66.         links = get_year(year, magazine)
  67.         for link in links:
  68.             if link not in downloaded:
  69.                 download_pdf(link)
  70.                 with open(downloaded_path, 'a') as f:
  71.                     f.write(f'{link}\n')
  72.            
  73.  
  74. def get_magazine_codes():
  75.     soup = index(2020, 4, session)
  76.     magazine_select = soup.find('select', {'name': 'dergiKodu'})
  77.     options = [(option.get('value'), option.get_text()) for option in magazine_select.find_all('option')][1:]
  78.     print(f'{"kod":^5} - {"dergi adı":<30}')
  79.     for option in options:
  80.         print(f'{option[0]:^5} - {option[1]:<30})')
  81.  
  82.  
  83. if __name__ == '__main__':
  84.     get_magazine_codes()
  85.     dergi_kodu = input('indirmek istediğiniz derginin kodunu giriniz: ')
  86.     magazine_downloader(dergi_kodu)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement