import os, sys, time, sqlite3, urllib.request
import requests, click
import util, rename as ren, fileparser as parsr, database, login as log
from bs4 import BeautifulSoup
from pathvalidate import ValidationError, validate_filename
from PyPDF2 import PdfFileReader, PdfFileWriter
from shutil import copyfile, move
from lxml import html
from requests import Session

base_url = "https://www.etsjets.org"
url = "https://www.etsjets.org/JETS_Online_Archive"

# Build the download directory ("Articles/All/") next to this script.
path = os.path.dirname(os.path.realpath(__file__))
path = path + "/Articles/"
all_path = path + "All/"


def start(data):
    # data = [volume, issue, article, force]; '0' selects every
    # volume/issue/article for that field.
    get_volume_url(data)

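# Step 1: log in, scan the archive index page, and follow every volume link
# whose text matches the requested volume number.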
def get_volume_url(data):
    with Session() as s:
        # Log in on this session; the archive pages sit behind the member
        # login. The Drupal form token is read from the login page rather
        # than hard-coding a stale form_build_id value.
        site = s.get("https://www.etsjets.org/new_welcome?destination=node%2F1120")
        bs_content = BeautifulSoup(site.content, "html.parser")
        token = bs_content.find("input", {"name": "form_build_id"})
        login_data = {"name": log.get_username(), "pass": log.get_password(), "op": "Log in",
                      "form_build_id": token["value"] if token else "", "form_id": "user_login_block"}
        s.post("https://www.etsjets.org/new_welcome?destination=node%2F1120", login_data)

        volume_number = data[0]
        if volume_number == '0':
            file_start = "Vol "                      # matches every volume link
        else:
            file_start = " " + volume_number + " "

        response = s.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        link_list = soup.findAll('a')                # every <a> tag on the index
        for link in link_list:
            link = str(link)
            if file_start in link:
                # Slice the href out of the raw tag text.
                link_start = link.find('"') + 1
                link_end = link.find('"', link_start)
                url_append = link[link_start:link_end]
                volume_url = base_url + url_append
                get_issue_url(data, volume_url)

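# Step 2: scan a volume page for issue links ("<volume>.<issue>") and descend
# into each issue the user asked for ('0' = every issue).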
def get_issue_url(data, volume_url):
    with Session() as s:
        site = s.get("https://www.etsjets.org/new_welcome?destination=node%2F1120")
        bs_content = BeautifulSoup(site.content, "html.parser")
        token = bs_content.find("input", {"name": "form_build_id"})
        login_data = {"name": log.get_username(), "pass": log.get_password(), "op": "Log in",
                      "form_build_id": token["value"] if token else "", "form_id": "user_login_block"}
        s.post("https://www.etsjets.org/new_welcome?destination=node%2F1120", login_data)

        issue_number_original = data[1]
        issue_number = data[1]
        volume_number = data[0]

        # Use the logged-in session here; a plain requests.get would drop
        # the login cookies.
        response = s.get(volume_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        link_list = soup.findAll('a')

        i = 0
        for link in link_list:
            link = str(link)
            if volume_number + "." in link and "Go to" not in link:
                i = i + 1
                link_start = link.find('"') + 1
                link_end = link.find('"', link_start)
                url_append = link[link_start:link_end]
                issue_url = base_url + url_append
                print(issue_url)
                if issue_number == '0' or "." + issue_number in link:
                    # Pass the issue's position on the page down the chain,
                    # then restore the original request.
                    data[1] = str(i)
                    get_article_url(data, issue_url)
                    data[1] = issue_number_original

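# Step 3: scan an issue page for article PDF links, skipping the purchase
# links, and hand each matching article's title and URL to the next stage.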
def get_article_url(data, issue_url):
    with Session() as s:
        site = s.get("https://www.etsjets.org/new_welcome?destination=node%2F1120")
        bs_content = BeautifulSoup(site.content, "html.parser")
        token = bs_content.find("input", {"name": "form_build_id"})
        login_data = {"name": log.get_username(), "pass": log.get_password(), "op": "Log in",
                      "form_build_id": token["value"] if token else "", "form_id": "user_login_block"}
        s.post("https://www.etsjets.org/new_welcome?destination=node%2F1120", login_data)

        article_number_original = data[2]
        article_number = data[2]

        response = s.get(issue_url)      # use the logged-in session
        soup = BeautifulSoup(response.text, 'html.parser')
        link_list = soup.findAll('a')

        z = 0
        for link in link_list:
            link = str(link)
            if ".pdf" in link and "Purchase Articles" not in link and "Purchase Back Issue(s)" not in link:
                z = z + 1
                if str(z) == article_number or article_number == '0':
                    # Strip <em> markup so the title slice comes out clean.
                    link = link.replace("<em>", "").replace("</em>", "")
                    title_start = link.find('>') + 1
                    title_end = link.find('<', title_start)
                    link_start = link.find('"') + 1
                    link_end = link.find('"', link_start)
                    article_url = link[link_start:link_end]
                    title = link[title_start:title_end]
                    # Some hrefs are already absolute; strip the host so that
                    # base_url is prefixed exactly once.
                    article_url = article_url.replace("http://www.etsjets.org", "")
                    article_url = article_url.replace("https://www.etsjets.org", "")
                    article_url = base_url + article_url
                    data[2] = str(z)
                    get_title_and_author(data, title, article_url)
                    data[2] = article_number_original

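# Step 4: split the raw link text into a title and an author via fileparser
# and build the "<vol>.<issue>.<article> - <title>.pdf" file name.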
def get_title_and_author(data, title, article_url):
    original_file_name = title
    count = title.count(". . .")
    title = parsr.get_raw_title(count, original_file_name)
    file_name = parsr.get_file_name(title)

    util.p(original_file_name)

    # "volume.issue.article" prefix used in the final file name.
    full_number = util.check_digit(data[0]) + "." + util.check_digit(data[1]) + "." + util.check_digit(data[2])
    full_name = full_number + " - " + file_name + ".pdf"

    author = parsr.get_raw_author(full_name, count, original_file_name)

    try:
        validate_filename(file_name)
    except ValidationError as e:
        click.echo()
        click.echo("{}\n".format(e), file=sys.stderr)
        sys.exit()
    # Only reached when the file name validated; the original "finally:"
    # block ran the download even after sys.exit() had raised SystemExit.
    download(title, file_name, full_name, author, article_url, data, full_number)

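# Step 5: confirm with the user, fetch the PDF, tag it with title/author
# metadata, and file it under Articles/All/.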
def download(title, file_name, full_name, author, article_url, data, full_number):
    force = data[3]

    if os.path.exists(all_path + full_name) and force == False:
        util.p(full_name)
        click.echo("This File Already Exists")
    else:
        util.p(full_number)
        click.echo("File Name: " + file_name)
        click.echo("Title: " + title)
        click.echo("Author: " + author)
        if click.confirm("Download File?"):
            # The href was sliced out of raw tag text, so unescape &amp;.
            article_url = article_url.replace("&amp;", "&")
            # Stream to a temp file, stamp the title/author metadata, then
            # move it into place under its final name.
            r = requests.get(article_url, stream=True)
            with open(all_path + "temp.pdf", 'wb') as file:
                for chunk in r.iter_content(chunk_size=8192):
                    file.write(chunk)
            util.write_info(all_path + "temp.pdf", title, author)
            move(all_path + "temp.pdf", all_path + full_name)
            author_database_worker(full_name, full_number, author, force, title)
            time.sleep(1)                # be polite to the server
        else:
            # Let the user correct the parsed metadata and try again.
            value = click.prompt("Change (A)uthor or (T)itle or (N)either?", default="n")
            value = value.lower()
            if value == "a":
                util.p("Current Author: " + author)
                new_auth = click.prompt("New Author Name: ")
                download(title, file_name, full_name, new_auth, article_url, data, full_number)
            elif value == "t":
                util.p("Current Title: " + title)
                new_title = click.prompt("New Title: ")
                download(new_title, file_name, full_name, author, article_url, data, full_number)

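# Step 6: once the file is on disk, record each of its authors in the
# article database.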
def author_database_worker(full_name, full_num, author, force, title):
    # Only index the article if the file actually landed on disk.
    if os.path.exists(all_path + full_name):
        authors = parsr.get_authors(author)
        for name in authors:
            # Fall back to the parsed name when util has no preferred form.
            author_name = util.get_possible_names(name)
            if author_name is None:
                author_name = name
            database.add_to_table(author_name, full_num)
            database.print_table()
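

# A minimal sketch of how start() might be driven from the command line. This
# wiring is an assumption, not part of the original script: the real entry
# point presumably builds the [volume, issue, article, force] list with click.
if __name__ == "__main__":
    # e.g. `python downloader.py 9 2 0` downloads every article in JETS 9.2
    volume = sys.argv[1] if len(sys.argv) > 1 else "0"
    issue = sys.argv[2] if len(sys.argv) > 2 else "0"
    article = sys.argv[3] if len(sys.argv) > 3 else "0"
    start([volume, issue, article, False])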