Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from selenium import webdriver
- from selenium.webdriver.common.keys import Keys
- import urllib.request
- import requests
- import certifi
- from bs4 import BeautifulSoup
- import re
- import ssl
# Module-level accumulators shared with the scraping loop below.
text = []  # unused in this chunk — TODO confirm against the rest of the file
b = ''     # scratch value; reassigned while extracting the Scopus id
s = []     # unused in this chunk — TODO confirm against the rest of the file
if __name__ == '__main__':
    # Report header: one fixed-width row per teacher is printed at the end of
    # every inner-loop pass (see the summary print further below).
    print(' Name ', ' Q1 count ', ' Q2 count ', ' Q3 count ',
          ' Q4 count ', ' Art count ', ' ConfP count ', 'Book count', ' Total ', ' CiteSc ', ' SJR ')
    # NOTE(review): FancyURLopener is deprecated; kept to avoid changing the
    # request behaviour of this scraper.
    opener = urllib.request.FancyURLopener({})
    url = 'http://www.apmath.spbu.ru/ru/staff/'
    f = opener.open(url)
    content = f.read()
    soup = BeautifulSoup(content, 'html.parser')
    table = soup.find('table')
    links = table.find_all('a')
    # Keep only teacher links; department links are filtered out.
    links = [link for link in links if 'depts' not in str(link)]
    # NOTE(review): computed from the already-filtered list, so this is always
    # empty; it is unused below and kept only for compatibility.
    name_depts = [link for link in links if 'depts' in str(link)]
    print(ssl.OPENSSL_VERSION)
    for i in links:
        teacher_link = i.get('href')
        teacher_desc = i.text
        url = 'http://www.apmath.spbu.ru/ru/staff/{}index.html'.format(teacher_link)
        with opener.open(url) as f:
            content = f.read()
        soup = BeautifulSoup(content, 'html.parser')
        # Links from the teacher's page to their pure.elsevier profile.
        links_pure = soup.find_all(href=re.compile('pure'))
        for i in links_pure:
            teach_pure_link = i.get('href')
            # Everything after the fixed-length URL prefix is the person slug.
            teach_pure_name = teach_pure_link[42:]
            if teach_pure_name == 'меджид-эльхан оглы-аббасов':
                teach_pure_name = 'меджид-эльхан-оглы-аббасов'
            # BUG FIX: the original condition was
            #   teach_pure_name != 'андрей-юрьевич-гарнаев' or 'сергей-...' or ...
            # which is always true (every non-empty string literal is truthy),
            # so no teacher was ever skipped.  Proper set membership restores
            # the intended skip list.
            skip_list = {
                'андрей-юрьевич-гарнаев',
                'сергей-александрович-костромин',
                'латыпов',
                'игорь-николаевич-мешков',
                'павловский',
                'андрей-анатольевич-печников',
                'олег-васильевич-рогачевский',
                'анатолий-олегович-сидорин',
                'григорий-владимирович-трубников',
                'шарлай',
            }
            if teach_pure_name not in skip_list:
                url = 'https://spbu.pure.elsevier.com/ru/persons/{}/publications'.format(teach_pure_name)
                r = requests.get(url, verify=True)
                content = r.text  # HTML of the pure profile page
                soup = BeautifulSoup(content, 'html.parser')
                # Scrape the Scopus author id out of the first link containing "ID=".
                scop_id = soup.find_all(href=re.compile('ID'))
                scop_id = str(scop_id)
                a = scop_id.find('ID=')
                b = scop_id.find('&')
                scop_id = scop_id[a + 3:b]
                # Workaround: manual Scopus ids for profiles where scraping fails.
                scopus_id_overrides = {
                    'геннадий-викторович-алфёров': 56405478800,
                    'александр-александрович-давыденко': 57159745400,
                    'николай-семёнович-едаменко': 12345018300,
                    'владимир-степанович-королёв': 56651788200,
                    'дмитрий-юрьевич-куранов': 56405591200,
                    'ирина-васильевна-медведева': 55811039700,
                    'виктор-сергеевич-новосёлов': 7004972246,
                    'наталья-викторовна-распопова': 57160032200,
                    'михаил-николаевич-смирнов': 56962731000,
                    'олег-анатольевич-тумка': 57160483000,
                    'геннадий-михайлович-хитров': 57159817900,
                    'анастасия-павловна-широколобова': 57192959034,
                }
                scop_id = scopus_id_overrides.get(teach_pure_name, scop_id)
                dept = soup.find_all(string=re.compile('Кафедра'))
                dept = str(dept).replace('[', ' ').replace(']', ' ').replace("'", ' ').strip()
                if dept == '':
                    dept = 'No info'
                opener = urllib.request.FancyURLopener({})
                # NOTE(review): a fresh Chrome driver is started for every teacher
                # and never quit — this leaks browser processes; kept as-is.
                driver = webdriver.Chrome('/Users/alexkozlov/Documents/учеба/Научник/ВекторПМ/chromedriver')
                url = ('https://proxy.library.spbu.ru:2092/authid/detail.uri?authorId={}').format(scop_id)
                driver.get(url)  # open the Scopus author page for this id
                # Log in through the university library proxy.
                # NOTE(review): hard-coded credentials — move to env vars/config.
                username = driver.find_element_by_css_selector("input#username")
                username.click()
                username.send_keys('st048842')
                password = driver.find_element_by_css_selector("input#password")
                password.click()
                password.send_keys('jzEB89de')
                submit = driver.find_element_by_name("_eventId_proceed")
                submit.click()
                # Plain-text export of the author's full publication list.
                url = ('https://proxy.library.spbu.ru:2092/onclick/export.uri?oneClickExport=%7b"Format"%3a"TEXT"%2c"SelectedFields"%3a"+Authors++Title++Year++EID++SourceTitle++Volume+Issue+ArtNo+PageStart+PageEnd+PageCount++CitedBy++DocumentType+Source++DOI++ACCESSTYPE++Affiliations++ISSN+ISBN+CODEN++PubMedID++Publisher++Editors++LanguageOfOriginalDocument++CorrespondenceAddress++AbbreviatedSourceTitle+Link+"%2c"View"%3a"SpecifyFields"%7d&origin=AuthorProfile&zone=exportDropDown&dataCheckoutTest=false&sort=plf-f&tabSelected=docLi&authorId={}&txGid=01f0e61cd67a3da5eb6ead1e6403b8c5').format(scop_id)
                driver.get(url)
                http = driver.page_source
                soup = BeautifulSoup(http, 'html.parser')
                str_soup = str(soup)
                str_soup = '\n'.join(str_soup.split('\n')[2:])
                # One chunk ("abz") per publication record.
                abz = str_soup.split('ИСТОЧНИКИ: Scopus')[:-1]
                # Per-teacher accumulators, reset for every profile.
                tot_points = 0
                conf_count = 0
                book_count = 0
                article_count = 0
                q1_count = 0
                q2_count = 0
                q3_count = 0
                q4_count = 0
                cs_count = 0
                sjr_count = 0
def parse_abz(abz):
    """Parse one Scopus plain-text export record into a publication dict
    and hand it to point_count() for scoring.

    The record layout is assumed to be the Scopus TEXT export: line index 3
    starts with "(<year>)" and line index 2 carries "(<n>) <title>, ..." —
    TODO confirm against a real export.
    """
    lines = [ln for ln in abz.split('\n') if ln != '']
    pub_year = int(lines[3].split(')', 1)[0][1:])
    # The ISSN line ("ISSN: <digits>") is absent for some records.
    issn_lines = [ln for ln in lines if 'ISSN:' in ln]
    if issn_lines:
        issn = issn_lines[0].split(' ')[1]
    else:
        issn = None
    doc_type = ' '.join([ln for ln in lines if 'ТИП ДОКУМЕНТА:' in ln][0].split(' ')[2:])
    raw_title = lines[2]
    # Drop the leading "(<n>) " numbering, keep text up to the first comma.
    title = ''.join(raw_title.split(') ')[1:]).split(',')[0]
    publ = {
        'year': pub_year,
        'issn': issn,
        'type': doc_type,
        'name': title
    }
    return point_count(publ)
def point_count(publ):
    """Tally one parsed publication into the global per-teacher counters.

    Only publications from 2016-2018 are counted.  Articles are forwarded to
    get_best_for() for quartile scoring; conference papers are worth a flat
    0.1 points; books are only counted, never scored.
    """
    global conf_count
    global book_count
    global article_count
    global tot_points
    issn = publ['issn']
    pub_year = publ['year']
    title = publ['name']  # read for the key check, currently unused
    if pub_year not in (2016, 2017, 2018):
        return
    kind = publ['type']
    if kind == 'Article':
        article_count += 1
        get_best_for(issn, pub_year)
    elif kind == 'Conference Paper':
        conf_count += 1
        tot_points += 0.1
    elif kind == 'Book':
        book_count += 1
def get_best_for(issn_n, year):
    """Look the journal up on scimagojr.com by ISSN and accumulate its
    CiteScore / SJR metrics, then score its best quartile for *year*.

    Side effects: navigates the global selenium ``driver`` and updates the
    global ``cs_count`` / ``sjr_count`` / ``tot_points`` accumulators.
    Returns find_max_for_year()'s result, or None when the lookup fails.
    """
    global cs_count
    global sjr_count
    global tot_points
    driver.get('http://www.scimagojr.com/')
    enter_issn = driver.find_element_by_css_selector("input#searchinput")
    enter_issn.click()
    enter_issn.send_keys(issn_n)
    try:
        enter_but = driver.find_element_by_css_selector("input#searchbutton")
        enter_but.click()
        publ = driver.find_element_by_css_selector('a[href*="journalsearch"]')
        publ.click()
        cross = driver.find_element_by_css_selector('html body div div div svg')
        cross.click()
        http = driver.page_source
        soup = BeautifulSoup(http, 'html.parser')
        tbody = soup.find_all('tbody')
        # Table rows carrying a quartile marker ("Q1".."Q4").
        bodies = [t for t in str(tbody).split('\n') if 'Q' in t]
        # CiteScore: the 2016 "3 years" row, third cell.
        cs = [t for t in str(tbody).split('\n') if '2016' in t and '3 years' in t][0]
        cs = cs[8:-10].split('</td><td>')[2]
        cs = float(cs)
        cs_count = cs_count + cs
        # SJR: last 2016 row of the dynamic cell, second cell.
        sjr_text = (soup.find(class_=re.compile('cell1x1 dynamiccell')))
        sjr = [t for t in str(sjr_text).split('\n') if '2016' in t][-1]
        sjr = sjr[8:-10].split('</td><td>')[1]
        sjr = float(sjr)
        sjr_count = sjr_count + sjr
        return find_max_for_year(year, bodies)
    except Exception:
        # BUG FIX: was a bare ``except:`` which also swallowed SystemExit and
        # KeyboardInterrupt.  Failed ISSN lookup: award the minimal score.
        tot_points += 0.1
def find_max_for_year(year, bodies):
    """Find the journal's best (numerically smallest) quartile among the
    scraped table rows and add the corresponding points to the totals.

    Returns the best quartile number (1-4), or None when no row matches.
    """
    global q1_count
    global q2_count
    global q3_count
    global q4_count
    global tot_points
    # NOTE(review): the caller's year is deliberately overridden so that only
    # the 2016 quartiles are scored (period-selection hack) — confirm intent.
    year = 2016
    rows = [row for row in bodies if str(year) in row]
    if not rows:
        return None
    quartiles = [int(re.findall(r'Q[\d]', row)[0][1:]) for row in rows]
    best = min(quartiles)
    if best == 1:
        q1_count += 1
        tot_points += 10
    elif best == 2:
        q2_count += 1
        tot_points += 7
    elif best == 3:
        q3_count += 1
        tot_points += 3
    elif best == 4:
        q4_count += 1
        tot_points += 1
    tot_points = float(tot_points)
    return best
                # Parse every scraped record; the comprehension is run for its
                # side effects on the global per-teacher counters (res itself
                # holds quartile numbers / None values and is not used again).
                res = [parse_abz(i) for i in abz]
                # Summary row: surname only, then all per-teacher counters
                # (columns match the header printed at startup).
                print("%-13s%13d%13d%13d%13d%13d%13d%13d%13.3f%13.3f%13.3f" % (teach_pure_name.split('-')[-1], q1_count, q2_count, q3_count,
                      q4_count, article_count, conf_count, book_count, tot_points, cs_count, sjr_count))
                print('-------------------------------------------------------------------------------------------------------------------------------------------')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement