Advertisement
Guest User

Untitled

a guest
Aug 21st, 2019
134
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.04 KB | None | 0 0
  1. import requests, urllib.request, sqlite3, time, os, sys, click, util, rename as ren, fileparser as parsr, database, login as log
  2. from bs4 import BeautifulSoup
  3. from pathvalidate import ValidationError, validate_filename
  4. from PyPDF2 import PdfFileReader, PdfFileWriter; from shutil import copyfile, move
  5. from lxml import html
  6. from requests import Session
  7. base_url = "https://www.etsjets.org"
  8. url = "https://www.etsjets.org/JETS_Online_Archive"
  9.  
  10. path = os.path.realpath(__file__)
  11. path = path.replace("downloader.py","")
  12. path = path + "/Articles/"
  13. all_path = path + "All/"
  14.  
  15.  
  16. def start(data):
  17. get_volume_url(data)
  18.  
  19. def get_volume_url(data):
  20. with Session() as s:
  21. site = s.get("https://www.etsjets.org/new_welcome?destination=node%2F1120")
  22. bs_content = BeautifulSoup(site.content, "html.parser")
  23. login_data = {"name":log.get_username(), "pass":log.get_password(), "op":"Log in", "form_build_id":"form-a0ed7b5c7437ac9afeb21b126e24633b", "form_id":"user_login_block"}
  24. s.post("https://www.etsjets.org/new_welcome?destination=node%2F1120", login_data)
  25. volume_number = data[0]
  26. if volume_number == '0':
  27. file_start = "Vol "
  28. else:
  29. file_start = " " + volume_number + " "
  30.  
  31. response = s.get(url)
  32. soup = BeautifulSoup(response.content,'html.parser')
  33.  
  34. link_list = soup.findAll('a') #finds the "a" tag on url
  35. for link in link_list:
  36. link = str(link)
  37. if file_start in link:
  38. link_start = link.find('"') + 1
  39. link_end = link.find('"', link_start)
  40. urlAppend = link[link_start:link_end]
  41. volume_url = base_url + urlAppend
  42. get_issue_url(data, volume_url)
  43.  
  44. def get_issue_url(data, volume_url):
  45. with Session() as s:
  46. site = s.get("https://www.etsjets.org/new_welcome?destination=node%2F1120")
  47. bs_content = BeautifulSoup(site.content, "html.parser")
  48. login_data = {"name":log.get_username(), "pass":log.get_password(), "op":"Log in", "form_build_id":"form-a0ed7b5c7437ac9afeb21b126e24633b", "form_id":"user_login_block"}
  49. s.post("https://www.etsjets.org/new_welcome?destination=node%2F1120", login_data)
  50. issue_number_original = data[1]
  51. issue_number = data[1]
  52. volume_number = data[0]
  53.  
  54. response = requests.get(volume_url)
  55. soup = BeautifulSoup(response.text, 'html.parser')
  56. link_list = soup.findAll('a')
  57.  
  58. i = 0
  59.  
  60. for link in link_list:
  61. link = str(link)
  62. if (" " + volume_number + "." in link or volume_number + "." in link)and "Go to" not in link:
  63. i = i + 1
  64. link_start = link.find('"') + 1
  65. link_end = link.find('"', link_start)
  66. url_append = link[link_start:link_end]
  67. issue_url = base_url + url_append
  68. print(issue_url)
  69. if issue_number == '0' or "." + issue_number in link:
  70. data[1] = str(i)
  71. get_article_url(data, issue_url)
  72. data[1] = issue_number_original
  73.  
  74. def get_article_url(data, issue_url):
  75. with Session() as s:
  76. site = s.get("https://www.etsjets.org/new_welcome?destination=node%2F1120")
  77. bs_content = BeautifulSoup(site.content, "html.parser")
  78. login_data = {"name":log.get_username(), "pass":log.get_password(), "op":"Log in", "form_build_id":"form-a0ed7b5c7437ac9afeb21b126e24633b", "form_id":"user_login_block"}
  79. s.post("https://www.etsjets.org/new_welcome?destination=node%2F1120", login_data)
  80. article_number_original = data[2]
  81. article_number = data[2]
  82.  
  83. response = requests.get(issue_url)
  84. soup = BeautifulSoup(response.text, 'html.parser')
  85. link_list = soup.findAll('a')
  86.  
  87. z = 0
  88. for link in link_list:
  89. link = str(link)
  90. if ".pdf" in link and "Purchase Articles" not in link and "Purchase Back Issue(s)" not in link:
  91. z = z + 1
  92. if str(z) == article_number or article_number == '0':
  93. if "<em>" in link:
  94. link = link.replace("<em>", "")
  95. link = link.replace("</em>", "")
  96. title_start = link.find('>') + 1
  97. title_end = link.find('<', title_start)
  98. link_start = link.find('"') + 1
  99. link_end = link.find('"', link_start)
  100. article_url = link[link_start:link_end]
  101. article_url = base_url + article_url
  102. title = link[title_start:title_end]
  103. if "http://www.etsjets.org/" in article_url:
  104. article_url = article_url.replace("http://www.etsjets.org", "")
  105. if "https://www.etsjets.org/" in article_url:
  106. article_url = article_url.replace("https://www.etsjets.org", "")
  107. article_url = base_url + article_url
  108. data[2] = str(z)
  109. get_title_and_author(data, title, article_url)
  110. data[2] = article_number_original
  111.  
  112. def get_title_and_author(data, title, article_url):
  113. original_file_name = title
  114. count = title.count(". . .")
  115. title = parsr.get_raw_title(count, original_file_name)
  116. file_name = parsr.get_file_name(title)
  117.  
  118. util.p(original_file_name)
  119.  
  120. full_number = util.check_digit(data[0]) + "." + util.check_digit(data[1]) + "." + util.check_digit(data[2])
  121. full_name = full_number + " - " + file_name + ".pdf"
  122.  
  123. author = parsr.get_raw_author(full_name, count, original_file_name)
  124.  
  125. try:
  126. validate_filename(file_name)
  127. except ValidationError as e:
  128. click.echo()
  129. click.echo("{}\n".format(e), file=sys.stderr)
  130. sys.exit()
  131. finally:
  132. download(title, file_name, full_name, author, article_url, data, full_number)
  133.  
  134. def download(title, file_name, full_name, author, article_url, data, full_number):
  135. force = data[3]
  136.  
  137. if os.path.exists(all_path + full_name) and force == False:
  138. util.p(full_name)
  139. click.echo("This File Already Exists")
  140. else:
  141. util.p(full_number)
  142. click.echo("File Name: " + file_name)
  143. click.echo("Title: " + title)
  144. click.echo("Author: " + author)
  145. if click.confirm("Download File?"):
  146. if "&amp;" in article_url:
  147. article_url = article_url.replace("&amp;", "&")
  148. r = requests.get(article_url, stream=True)
  149. with open(all_path + "temp.pdf", 'wb') as file:
  150. file.write(r.content)
  151. util.write_info(all_path + "temp.pdf", title, author)
  152. move(all_path + "temp.pdf", all_path + full_name)
  153. author_database_worker(full_name, full_number, author, force, title)
  154. time.sleep(1)
  155. else:
  156. value = click.prompt("Change (A)uthor or (T)itle or (N)either?", default="n")
  157. value = value.lower()
  158. if value == "a":
  159. util.p("Current Author: " + author)
  160. new_auth = click.prompt("New Author Name: ")
  161. download(title, file_name, full_name, new_auth, article_url, data, full_number)
  162. elif value == "t":
  163. util.p("Current Title: " + title)
  164. new_title = click.prompt("New Title: ")
  165. download(new_title,file_name, full_name, author, article_url, data, full_number)
  166.  
  167.  
  168. def author_database_worker(full_name, full_num, author, force, title):
  169. for file in os.listdir(all_path):
  170. if full_name == file:
  171. authors = parsr.get_authors(author)
  172. for name in authors:
  173. author_name = util.get_possible_names(name)
  174. if author_name == None:
  175. author_name = name
  176. database.add_to_table(author_name, full_num)
  177. database.print_table()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement