# Scrapes the staff list of the Faculty of Applied Mathematics (apmath.spbu.ru),
# follows each person's Pure (Elsevier) profile, pulls their Scopus publication
# export through the SPbU library proxy, looks up journal quartiles, CiteScore
# and SJR on SciMago, and prints a per-person summary table.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

import urllib.request
import requests
import certifi
from bs4 import BeautifulSoup
import re
import ssl

text = []
b = str()
s = []

if __name__ == '__main__':
    print(' Name ', ' Q1 count ', ' Q2 count ', ' Q3 count ',
          ' Q4 count ', ' Art count ', ' ConfP count ', ' Book count ', ' Total ', ' CiteSc ', ' SJR ')

    opener = urllib.request.FancyURLopener({})
    url = 'http://www.apmath.spbu.ru/ru/staff/'
    f = opener.open(url)
    content = f.read()

    soup = BeautifulSoup(content, 'html.parser')

    table = soup.find('table')
    links = table.find_all('a')
    name_depts = [link for link in links if 'depts' in str(link)]    # department links
    links = [link for link in links if 'depts' not in str(link)]     # links to personal staff pages
    # print(links)
    print(ssl.OPENSSL_VERSION)

    for i in links:
        teacher_link = i.get('href')
        teacher_desc = i.text
        # print(teacher_link, teacher_desc)

        url = 'http://www.apmath.spbu.ru/ru/staff/{}index.html'.format(teacher_link)

        with opener.open(url) as f:
            content = f.read()

        soup = BeautifulSoup(content, 'html.parser')

        links_pure = soup.find_all(href=re.compile('pure'))

        for i in links_pure:

            teach_pure_link = i.get('href')
            teach_pure_name = teach_pure_link[42:]
            if teach_pure_name == 'меджид-эльхан оглы-аббасов':
                teach_pure_name = 'меджид-эльхан-оглы-аббасов'

            # names to skip entirely
            skip_names = ('андрей-юрьевич-гарнаев', 'сергей-александрович-костромин', 'латыпов',
                          'игорь-николаевич-мешков', 'павловский', 'андрей-анатольевич-печников',
                          'олег-васильевич-рогачевский', 'анатолий-олегович-сидорин',
                          'григорий-владимирович-трубников', 'шарлай')
            if not any(name in teach_pure_name for name in skip_names):

                url = 'https://spbu.pure.elsevier.com/ru/persons/{}/publications'.format(teach_pure_name)

                r = requests.get(url, verify=True)
                content = r.text  # HTML of the Pure publications page
                # print(teach_pure_name)

                soup = BeautifulSoup(content, 'html.parser')

                # pull the Scopus author ID out of the first link carrying an "ID=" parameter
                scop_id = soup.find_all(href=re.compile('ID'))
                scop_id = str(scop_id)

                a = scop_id.find('ID=')
                b = scop_id.find('&')
                scop_id = scop_id[a + 3:b]
                # workaround: hard-coded Scopus IDs for these people
                if teach_pure_name == 'геннадий-викторович-алфёров':
                    scop_id = 56405478800
                if teach_pure_name == 'александр-александрович-давыденко':
                    scop_id = 57159745400
                if teach_pure_name == 'николай-семёнович-едаменко':
                    scop_id = 12345018300
                if teach_pure_name == 'владимир-степанович-королёв':
                    scop_id = 56651788200
                if teach_pure_name == 'дмитрий-юрьевич-куранов':
                    scop_id = 56405591200
                if teach_pure_name == 'ирина-васильевна-медведева':
                    scop_id = 55811039700
                if teach_pure_name == 'виктор-сергеевич-новосёлов':
                    scop_id = 7004972246
                if teach_pure_name == 'наталья-викторовна-распопова':
                    scop_id = 57160032200
                if teach_pure_name == 'михаил-николаевич-смирнов':
                    scop_id = 56962731000
                if teach_pure_name == 'олег-анатольевич-тумка':
                    scop_id = 57160483000
                if teach_pure_name == 'геннадий-михайлович-хитров':
                    scop_id = 57159817900
                if teach_pure_name == 'анастасия-павловна-широколобова':
                    scop_id = 57192959034
                # print(scop_id)

                dept = soup.find_all(string=re.compile('Кафедра'))
                dept = str(dept).replace('[', ' ').replace(']', ' ').replace("'", ' ').strip()
                if dept == '':
                    dept = 'No info'
                # print(dept)

                # log in to Scopus via the SPbU library proxy with Selenium
                opener = urllib.request.FancyURLopener({})
                driver = webdriver.Chrome('/Users/alexkozlov/Documents/учеба/Научник/ВекторПМ/chromedriver')
                url = 'https://proxy.library.spbu.ru:2092/authid/detail.uri?authorId={}'.format(scop_id)
                driver.get(url)  # open the author profile for this Scopus ID

                # driver.get('http://cufts.library.spbu.ru/CRDB/SPBGU/resource/79/goto')

                username = driver.find_element_by_css_selector("input#username")
                username.click()
                username.send_keys('st048842')
                password = driver.find_element_by_css_selector("input#password")
                password.click()
                password.send_keys('jzEB89de')
                submit = driver.find_element_by_name("_eventId_proceed")
                submit.click()

                # request a plain-text export of all of the author's documents from Scopus
                url = ('https://proxy.library.spbu.ru:2092/onclick/export.uri?oneClickExport=%7b"Format"%3a"TEXT"%2c"SelectedFields"%3a"+Authors++Title++Year++EID++SourceTitle++Volume+Issue+ArtNo+PageStart+PageEnd+PageCount++CitedBy++DocumentType+Source++DOI++ACCESSTYPE++Affiliations++ISSN+ISBN+CODEN++PubMedID++Publisher++Editors++LanguageOfOriginalDocument++CorrespondenceAddress++AbbreviatedSourceTitle+Link+"%2c"View"%3a"SpecifyFields"%7d&origin=AuthorProfile&zone=exportDropDown&dataCheckoutTest=false&sort=plf-f&tabSelected=docLi&authorId={}&txGid=01f0e61cd67a3da5eb6ead1e6403b8c5').format(scop_id)
                driver.get(url)
                http = driver.page_source
                soup = BeautifulSoup(http, 'html.parser')
                str_soup = str(soup)

                str_soup = '\n'.join(str_soup.split('\n')[2:])
                abz = str_soup.split('ИСТОЧНИКИ: Scopus')[:-1]  # one text block per publication

                tot_points = 0
                conf_count = 0
                book_count = 0
                article_count = 0
                q1_count = 0
                q2_count = 0
                q3_count = 0
                q4_count = 0
                cs_count = 0
                sjr_count = 0

                def parse_abz(abz):
                    # parse one publication block from the Scopus text export
                    abz = [i for i in abz.split('\n') if i != '']

                    year = int(abz[3].split(')', 1)[0][1:])

                    issn_str = [i for i in abz if 'ISSN:' in i]
                    if len(issn_str) == 0:
                        issn_str = None
                    else:
                        issn_str = issn_str[0].split(' ')[1]

                    # 'ТИП ДОКУМЕНТА:' is the "DOCUMENT TYPE:" field of the Russian-language export
                    type_doc = ' '.join([i for i in abz if 'ТИП ДОКУМЕНТА:' in i][0].split(' ')[2:])

                    name_doc1 = abz[2]
                    # print(name_doc1)
                    # if '2013' or '2014' or '2015' or '2016' or '2017' in name_doc1:
                    # if '2016' or '2015' in year:  # period selection: '2016' = 2016 only, '2' = all years
                    name_doc = ('').join(name_doc1.split(') ')[1:]).split(',')[0]

                    publ = {
                        'year': year,
                        'issn': issn_str,
                        'type': type_doc,
                        'name': name_doc
                    }

                    # print(publ)
                    return point_count(publ)

                def point_count(publ):
                    global conf_count
                    global book_count
                    global article_count
                    global tot_points
                    issn_n = publ['issn']
                    year = publ['year']
                    name = publ['name']

                    if year == 2018 or year == 2017 or year == 2016:
                        if publ['type'] == 'Article':
                            article_count += 1
                            get_best_for(issn_n, year)
                        elif publ['type'] == 'Conference Paper':
                            conf_count += 1
                            tot_points += 0.1
                        elif publ['type'] == 'Book':
                            book_count += 1

                def get_best_for(issn_n, year):
                    # look the journal up on SciMago and accumulate CiteScore, SJR and quartile points
                    global cs_count
                    global sjr_count
                    global tot_points
                    driver.get('http://www.scimagojr.com/')
                    enter_issn = driver.find_element_by_css_selector("input#searchinput")
                    enter_issn.click()
                    # print(issn_n)
                    enter_issn.send_keys(issn_n)
                    try:
                        enter_but = driver.find_element_by_css_selector("input#searchbutton")
                        enter_but.click()

                        publ = driver.find_element_by_css_selector('a[href*="journalsearch"]')
                        publ.click()

                        cross = driver.find_element_by_css_selector('html body div div div svg')
                        cross.click()

                        http = driver.page_source
                        soup = BeautifulSoup(http, 'html.parser')

                        tbody = soup.find_all('tbody')
                        bodies = [t for t in str(tbody).split('\n') if 'Q' in t]

                        # CiteScore: the 2016 row of the "3 years" citations table
                        cs = [t for t in str(tbody).split('\n') if '2016' in t and '3 years' in t][0]
                        cs = cs[8:-10].split('</td><td>')[2]
                        cs = float(cs)
                        cs_count = cs_count + cs

                        # SJR: the last 2016 value in the indicators table
                        sjr_text = soup.find(class_=re.compile('cell1x1 dynamiccell'))
                        sjr = [t for t in str(sjr_text).split('\n') if '2016' in t][-1]
                        sjr = sjr[8:-10].split('</td><td>')[1]
                        sjr = float(sjr)
                        sjr_count = sjr_count + sjr

                        return find_max_for_year(year, bodies)
                    except Exception:
                        # print(teach_pure_name, issn_n, 'ISSN lookup failed')
                        tot_points += 0.1

                def find_max_for_year(year, bodies):
                    global q1_count
                    global q2_count
                    global q3_count
                    global q4_count
                    global tot_points

                    year = 2016  # quartiles are always taken for 2016, whatever year was passed in
                    bodies = [b for b in bodies if str(year) in b]
                    if len(bodies) == 0:
                        return None
                    qs = [int(re.findall(r'Q[\d]', b)[0][1:]) for b in bodies]
                    min_qs = min(qs)
                    if min_qs == 1:
                        q1_count += 1
                        tot_points += 10
                    elif min_qs == 2:
                        q2_count += 1
                        tot_points += 7
                    elif min_qs == 3:
                        q3_count += 1
                        tot_points += 3
                    elif min_qs == 4:
                        q4_count += 1
                        tot_points += 1
                    tot_points = float(tot_points)
                    return min_qs

                res = [parse_abz(i) for i in abz]
                # print(' Name ', ' Q1 count ', ' Q2 count ', ' Q3 count ',
                #       ' Q4 count ', ' Art count ', ' ConfP count ', ' Total ', ' CiteSc ', ' SJR ')
                print("%-13s%13d%13d%13d%13d%13d%13d%13d%13.3f%13.3f%13.3f" % (teach_pure_name.split('-')[-1], q1_count, q2_count, q3_count,
                      q4_count, article_count, conf_count, book_count, tot_points, cs_count, sjr_count))
                print('-------------------------------------------------------------------------------------------------------------------------------------------')

                # print('#############')