Advertisement
Morgan_iv

Untitled

May 29th, 2018
93
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.54 KB | None | 0 0
  1. import requests
  2. from urllib.request import urlopen
  3. from bs4 import BeautifulSoup
  4. from IPython.display import display, HTML
  5. import random
  6. from multiprocessing.dummy import Pool as ThreadPool
  7.  
  8. def getart(number):
  9.    
  10.     number = str(number)
  11.     pattern = 'insert author=[{0}] year=[{1}] number=[{2}] journal=[{3}] article=[{4}]\n'
  12.     QURL = 'https://istina.msu.ru/publications/article/'
  13.     QURL += number + '/'
  14.    
  15.     resp = requests.get(QURL, allow_redirects=False)
  16.     bs = BeautifulSoup(resp.text, 'lxml')
  17.     for x in bs('script'):
  18.         x.extract()
  19.    
  20.     sel = bs.select('html body div.container h2')
  21.     art = sel[0].text
  22.     if (art == 'Страница не найдена'):
  23.         return ('', 0)
  24.    
  25.     art = art.strip()
  26.    
  27.     sel = bs.select('html body div.container h2 span.detail_label')
  28.     for tag in sel[-1:0:-1]:
  29.         rem = len(tag.text)
  30.         art = art[0:-rem].strip()
  31.     rem = len(sel[0].text)
  32.     art = art[0:-rem].strip()
  33.     if len(art) > 127:
  34.         art = art[:127]
  35.    
  36.     aut = []
  37.     sel = bs.select('html body div.container div.span-16.colborder ul.object_detail li')
  38.     tmp = sel[0].text
  39.     tmp = tmp[tmp.find(':') + 1:]
  40.     for s in tmp.split(','):
  41.         aut.append(s.strip())
  42.    
  43.     sel = bs.select('html body div.container div.span-16.colborder ul.object_detail li')
  44.     jou = sel[1].text
  45.     jou = jou[jou.find(':') + 1:].strip()
  46.     if len(jou) > 63:
  47.         jou = jou[:63]
  48.        
  49.     year = ''
  50.     sel = bs.select('html body div.container div.span-16.colborder ul.object_detail li')
  51.     for pos in sel:
  52.         tmp = pos.text.lstrip()
  53.         if (tmp.startswith('Год издания:')):
  54.             year = tmp[12:].strip()
  55.     if year == '':
  56.         year = number
  57.    
  58.     theres = ''
  59.    
  60.     cnt = 0
  61.    
  62.     for a in aut:
  63.         theres += pattern.format(a, year, number, jou, art)
  64.         cnt += 1
  65.        
  66.     #print(theres)
  67.    
  68.     return (theres, cnt)
  69.  
  70. count = 0
  71. curload = 115000000
  72.  
  73. def myload(left, right, file):
  74.     while True:
  75.         try:
  76.             with ThreadPool(8) as p:
  77.                 res = p.map(getart, range(left, right))
  78.             break
  79.         except OSError:
  80.             print ('yep')
  81.             pass
  82.        
  83.     count = 0
  84.  
  85.     for r in res:
  86.         file.write(r[0])
  87.         count += r[1]
  88.     return count
  89.  
  90. with open('savehere', 'a') as file:
  91.     while count < 200:
  92.         print (curload, ' ', count)
  93.         count += myload(curload - 1000, curload, file)
  94.         curload -= 1000
  95.  
  96. print (curload, ' ', count)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement