Untitled

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

import urllib.request
import requests
import certifi
from bs4 import BeautifulSoup
import re
import ssl
text = []
b = str()
s = []

if __name__ == '__main__':
    print('     Name   ', '         Q1 count  ', '  Q2 count  ', ' Q3 count  ',
          ' Q4 count ', '   Art count ', ' ConfP count ', 'Book count', '   Total  ', '     CiteSc  ', '     SJR ')

    opener = urllib.request.FancyURLopener({})
    url = 'http://www.apmath.spbu.ru/ru/staff/'
    f = opener.open(url)
    content = f.read()

    soup = BeautifulSoup(content, 'html.parser')

    table = soup.find('table')
    links = table.find_all('a')
    links = [link for link in links if 'depts' not in str(link)]
    name_depts = [link for link in links if 'depts' in str(link)]
    # print(links)
    print(ssl.OPENSSL_VERSION)

    for i in links:
        teacher_link = i.get('href')
        teacher_desc = i.text
        # print(teacher_link, teacher_desc)

        url = 'http://www.apmath.spbu.ru/ru/staff/{}index.html'.format(teacher_link)

        with opener.open(url) as f:
            content = f.read()

        soup = BeautifulSoup(content, 'html.parser')

        links_pure = soup.find_all(href=re.compile('pure'))

        for i in links_pure:

            teach_pure_link = i.get('href')
            teach_pure_name = teach_pure_link[42:]
            if teach_pure_name == 'меджид-эльхан оглы-аббасов':
                teach_pure_name = 'меджид-эльхан-оглы-аббасов'

            if teach_pure_name != 'андрей-юрьевич-гарнаев' or 'сергей-александрович-костромин' or 'латыпов' or 'игорь-николаевич-мешков' or 'павловский' or 'андрей-анатольевич-печников' or 'олег-васильевич-рогачевский' or'анатолий-олегович-сидорин' or 'григорий-владимирович-трубников' or 'шарлай':

                url = 'https://spbu.pure.elsevier.com/ru/persons/{}/publications'.format(teach_pure_name)

                r = requests.get(url, verify=True)
                content = r.text  # http текст pure!
                #print(teach_pure_name)

                soup = BeautifulSoup(content, 'html.parser')

                #if teach_pure_name ==

                scop_id = soup.find_all(href=re.compile('ID'))
                scop_id = str(scop_id)

                a = scop_id.find('ID=')
                b = scop_id.find('&')
                scop_id = scop_id[a + 3:b]
                ##### костыль с scop idd
                if teach_pure_name == 'геннадий-викторович-алфёров':
                    scop_id = 56405478800
                if teach_pure_name == 'александр-александрович-давыденко':
                    scop_id = 57159745400
                if teach_pure_name == 'николай-семёнович-едаменко':
                    scop_id = 12345018300
                if teach_pure_name == 'владимир-степанович-королёв':
                    scop_id = 56651788200
                if teach_pure_name == 'дмитрий-юрьевич-куранов':
                    scop_id = 56405591200
                if teach_pure_name == 'ирина-васильевна-медведева':
                    scop_id = 55811039700
                if teach_pure_name == 'виктор-сергеевич-новосёлов':
                    scop_id = 7004972246
                if teach_pure_name == 'наталья-викторовна-распопова':
                    scop_id = 57160032200
                if teach_pure_name == 'михаил-николаевич-смирнов':
                    scop_id = 56962731000
                if teach_pure_name == 'олег-анатольевич-тумка':
                    scop_id = 57160483000
                if teach_pure_name == 'геннадий-михайлович-хитров':
                    scop_id = 57159817900
                if teach_pure_name == 'анастасия-павловна-широколобова':
                    scop_id = 57192959034
                #if teach_pure_name == 'андрей-юрьевич-гарнаев' or 'костромин' or 'латыпов' or 'мешков' or 'павловский' or 'печников' or 'рогачевский' or'сидорин' or 'трубников' or 'шарлай':
                   #i += 1
                   #print('пропустили ', teach_pure_name)
                #print(scop_id)

                dept = soup.find_all(string=re.compile('Кафедра'))
                dept = str(dept).replace('[', ' ').replace(']', ' ').replace("'", ' ').strip()
                if dept == '':
                    dept = 'No info'
                #print(dept)

                ################# # # # # # #  # # # #  # # # # #  # # # #  # #

                opener = urllib.request.FancyURLopener({})
                driver = webdriver.Chrome('/Users/alexkozlov/Documents/учеба/Научник/ВекторПМ/chromedriver')
                url = ('https://proxy.library.spbu.ru:2092/authid/detail.uri?authorId={}').format(scop_id)
                driver.get(url) #56202287000') # прогоняем все id

                # driver.get('http://cufts.library.spbu.ru/CRDB/SPBGU/resource/79/goto')

                username = driver.find_element_by_css_selector("input#username")
                username.click()
                username.send_keys('st048842')
                password = driver.find_element_by_css_selector("input#password")
                password.click()
                password.send_keys('jzEB89de')
                submit = driver.find_element_by_name("_eventId_proceed")
                submit.click()

                url = ('https://proxy.library.spbu.ru:2092/onclick/export.uri?oneClickExport=%7b"Format"%3a"TEXT"%2c"SelectedFields"%3a"+Authors++Title++Year++EID++SourceTitle++Volume+Issue+ArtNo+PageStart+PageEnd+PageCount++CitedBy++DocumentType+Source++DOI++ACCESSTYPE++Affiliations++ISSN+ISBN+CODEN++PubMedID++Publisher++Editors++LanguageOfOriginalDocument++CorrespondenceAddress++AbbreviatedSourceTitle+Link+"%2c"View"%3a"SpecifyFields"%7d&origin=AuthorProfile&zone=exportDropDown&dataCheckoutTest=false&sort=plf-f&tabSelected=docLi&authorId={}&txGid=01f0e61cd67a3da5eb6ead1e6403b8c5').format(scop_id)
                driver.get(url)
                http = driver.page_source
                soup = BeautifulSoup(http, 'html.parser')
                str_soup = str(soup)

                str_soup = '\n'.join(str_soup.split('\n')[2:])
                abz = str_soup.split('ИСТОЧНИКИ: Scopus')[:-1]

                tot_points = 0
                conf_count = 0
                book_count = 0
                article_count = 0
                q1_count = 0
                q2_count = 0
                q3_count = 0
                q4_count = 0
                cs_count = 0
                sjr_count = 0

                def parse_abz(abz):
                    abz = [i for i in abz.split('\n') if i != '']

                    year = int(abz[3].split(')', 1)[0][1:])

                    issn_str = [i for i in abz if 'ISSN:' in i]
                    if len(issn_str) == 0:
                        issn_str = None
                    else:
                        issn_str = issn_str[0].split(' ')[1]

                    type_doc = ' '.join([i for i in abz if 'ТИП ДОКУМЕНТА:' in i][0].split(' ')[2:])

                    name_doc1 = abz[2]
                    #print(name_doc1)
                    #if '2013' or '2014' or '2015' or '2016' or '2017' in name_doc1:
                    #if '2016' or '2015' in year:   #  Выбор периода '2016' = только 2016, '2' = все
                    name_doc = ('').join(name_doc1.split(') ')[1:]).split(',')[0]


                    publ = {
                        'year': year,
                        'issn': issn_str,
                        'type': type_doc,
                        'name': name_doc
                    }

                    #print(publ)
                    return (point_count(publ))


                def point_count(publ):
                    global conf_count
                    global book_count
                    global article_count
                    global tot_points
                    issn_n = publ['issn']
                    year = publ['year']
                    name = publ['name']

                    if year == 2018 or year == 2017 or year == 2016 :
                        if publ['type'] == 'Article':
                            article_count += 1
                            get_best_for(issn_n, year)
                        elif publ['type'] == 'Conference Paper':
                            conf_count += 1
                            tot_points += 0.1
                        elif publ['type'] == 'Book':
                            book_count += 1

            #######
                def get_best_for(issn_n, year):
                    global cs_count
                    global sjr_count
                    global tot_points
                    driver.get('http://www.scimagojr.com/')
                    enter_issn = driver.find_element_by_css_selector("input#searchinput")
                    enter_issn.click()
                    #print(issn_n)
                    enter_issn.send_keys(issn_n)
                    try:
                        enter_but = driver.find_element_by_css_selector("input#searchbutton")
                        enter_but.click()

                        publ = driver.find_element_by_css_selector('a[href*="journalsearch"]')
                        publ.click()

                        cross = driver.find_element_by_css_selector('html body div div div svg')
                        cross.click()

                        http = driver.page_source
                        soup = BeautifulSoup(http, 'html.parser')

                        tbody = soup.find_all('tbody')
                        bodies = [t for t in str(tbody).split('\n') if 'Q' in t]

                        cs = [t for t in str(tbody).split('\n') if '2016' in t and '3 years' in t][0]
                        cs = cs[8:-10].split('</td><td>')[2]
                        cs = float(cs)
                        cs_count = cs_count + cs

                        sjr_text = (soup.find(class_=re.compile('cell1x1 dynamiccell')))
                        sjr = [t for t in str(sjr_text).split('\n') if '2016' in t][-1]
                        sjr = sjr[8:-10].split('</td><td>')[1]
                        sjr = float(sjr)
                        sjr_count = sjr_count + sjr

                        return find_max_for_year(year, bodies)
                    except:
                        #print(teach_pure_name, issn_n, 'ошибка в issn')
                        tot_points += 0.1


                def find_max_for_year(year, bodies):
                    global q1_count
                    global q2_count
                    global q3_count
                    global q4_count
                    global tot_points

                    year = 2016
                    bodies = [b for b in bodies if str(year) in b]
                    if len(bodies) == 0:
                        return None
                    qs = [int(re.findall(r'Q[\d]', b)[0][1:]) for b in bodies]
                    min_qs = min(qs)
                    if min_qs == 1:
                        q1_count += 1
                        tot_points += 10
                    elif min_qs == 2:
                        q2_count += 1
                        tot_points += 7
                    elif min_qs == 3:
                        q3_count += 1
                        tot_points += 3
                    elif min_qs == 4:
                        q4_count += 1
                        tot_points += 1
                    tot_points = float(tot_points)
                    return min_qs


                res = [parse_abz(i) for i in abz]
                #print('     Name   ', '       Q1 count  ', '    Q2 count  ', ' Q3 count  ',
                #      ' Q4 count ', '   Art count ', ' ConfP count ', ' Total  ', '     CiteSc  ', '      SJR ')
                print("%-13s%13d%13d%13d%13d%13d%13d%13d%13.3f%13.3f%13.3f" % (teach_pure_name.split('-')[-1], q1_count, q2_count, q3_count,
                      q4_count, article_count, conf_count, book_count, tot_points, cs_count, sjr_count))
                print('-------------------------------------------------------------------------------------------------------------------------------------------')


                #print('#############')'''