Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from urllib.request import urlopen
- from bs4 import BeautifulSoup
- from IPython.display import display, HTML
- import random
- from multiprocessing.dummy import Pool as ThreadPool
def _strip_trailing_labels(title, label_texts):
    """Cut the given label texts off the end of *title*, last label first."""
    for text in reversed(label_texts):
        # An empty label must be skipped: ``title[:-0]`` is ``title[:0]``,
        # which would wipe the whole title instead of removing nothing.
        if text:
            title = title[:-len(text)].strip()
    return title


def getart(number, timeout=30):
    """Scrape one article page from istina.msu.ru into ``insert`` lines.

    The page ``https://istina.msu.ru/publications/article/<number>/`` is
    downloaded (redirects are not followed) and parsed.  One output line is
    produced per comma-separated author found in the first detail list item.

    Args:
        number: Article id; it is converted to ``str`` and used both in the
            URL and in the ``number=[...]`` field of every output line.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a single stalled connection would block the
            caller forever.

    Returns:
        A ``(text, count)`` tuple: ``text`` is the concatenation of all
        generated lines and ``count`` is how many lines it contains.
        ``('', 0)`` is returned when the page reports
        'Страница не найдена' or lacks the elements this parser relies on
        (an ``h2`` heading and at least two detail list items).

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    number = str(number)
    pattern = 'insert author=[{0}] year=[{1}] number=[{2}] journal=[{3}] article=[{4}]\n'
    details_selector = 'html body div.container div.span-16.colborder ul.object_detail li'
    year_prefix = 'Год издания:'

    url = 'https://istina.msu.ru/publications/article/' + number + '/'
    resp = requests.get(url, allow_redirects=False, timeout=timeout)
    bs = BeautifulSoup(resp.text, 'lxml')
    # Drop <script> elements so their source does not leak into ``.text``.
    for script in bs('script'):
        script.extract()

    headings = bs.select('html body div.container h2')
    if not headings:
        # E.g. a redirect response with no page body: nothing to parse.
        return ('', 0)
    art = headings[0].text.strip()
    if art == 'Страница не найдена':
        return ('', 0)

    # The heading text also contains the text of its ``detail_label`` spans;
    # remove them from the end so only the title itself remains.
    labels = bs.select('html body div.container h2 span.detail_label')
    art = _strip_trailing_labels(art, [tag.text for tag in labels])
    # Length caps (127 here, 63 for the journal) are kept from the original
    # script; presumably they match the consumer's field widths - TODO confirm.
    art = art[:127]

    details = bs.select(details_selector)
    if len(details) < 2:
        return ('', 0)

    # First item: author list.  Everything after the first ':' is the value;
    # when there is no ':' ``find`` returns -1 and the whole text is kept.
    authors_text = details[0].text
    authors_text = authors_text[authors_text.find(':') + 1:]
    aut = [name.strip() for name in authors_text.split(',')]

    # Second item: journal, extracted the same way.
    jou = details[1].text
    jou = jou[jou.find(':') + 1:].strip()[:63]

    # The year is looked up by its label; if several items match, the last
    # one wins.  Fall back to the article number when no year is present.
    year = ''
    for item in details:
        text = item.text.lstrip()
        if text.startswith(year_prefix):
            year = text[len(year_prefix):].strip()
    if not year:
        year = number

    theres = ''.join(pattern.format(a, year, number, jou, art) for a in aut)
    return (theres, len(aut))
# Exclusive upper bound of the next batch of article ids to scrape; the
# crawl walks downwards from here.
curload = 115000000
# Running total of output rows collected so far.
count = 0
def myload(left, right, file):
    """Scrape article ids ``left`` .. ``right - 1`` and write the rows to *file*.

    Every id is passed to ``getart`` on a pool of 8 threads.  When the batch
    raises ``OSError``, ``'yep'`` is printed and the whole batch is attempted
    again; there is no retry limit.

    Args:
        left: First article id of the batch (inclusive).
        right: End of the batch (exclusive).
        file: Writable text file object receiving the generated rows.

    Returns:
        The sum of the per-article row counts reported by ``getart``.
    """
    results = None
    # ``pool.map`` always returns a list, so ``None`` reliably means
    # "no successful attempt yet".
    while results is None:
        try:
            with ThreadPool(8) as pool:
                results = pool.map(getart, range(left, right))
        except OSError:
            print('yep')

    total = 0
    for text, rows in results:
        file.write(text)
        total += rows
    return total
# Crawl downwards in batches of 1000 article ids, appending the generated
# rows to 'savehere', until at least 200 rows have been collected.
#
# The encoding is pinned to UTF-8: the scraped pages are Russian-language, so
# relying on the platform default encoding can raise UnicodeEncodeError or
# produce a file whose encoding depends on the machine it was written on.
with open('savehere', 'a', encoding='utf-8') as file:
    # ``curload > 0`` stops the crawl once the id space is exhausted instead
    # of requesting zero and negative ids forever.
    while count < 200 and curload > 0:
        print(curload, ' ', count)
        count += myload(curload - 1000, curload, file)
        curload -= 1000
print(curload, ' ', count)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement