Advertisement
Guest User

Untitled

a guest
Dec 4th, 2016
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.66 KB | None | 0 0
  1. from multiprocessing.dummy import Pool as ThreadPool
  2. import dbconnect
  3. from bs4 import BeautifulSoup
  4. from urllib.request import *
  5. import random
  6. import re
  7. import time
# Module-level handle created once at import time; main() uses it to run its
# SELECT query. NOTE(review): presumably a DB-API cursor returned by the
# project's dbconnect helper -- confirm against dbconnect.connection().
cursor = dbconnect.connection()
  9.  
  10.  
  11. def reqs(url, encode='utf-8'):
  12. request = Request(url)
  13. ua_list = [
  14. 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2467.2 Safari/537.36',
  15. 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
  16. 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240',
  17. 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko']
  18. request.add_header('User-Agent', random.choice(ua_list))
  19. return urlopen(request).read()
  20.  
  21. def extraction(html):
  22. soup = BeautifulSoup(html,'html.parser')
  23. page = soup.find("div", class_="pages2").string
  24. return int(re.search(r'd+', page).group())
  25.  
  26. def main():
  27. cursor.execute("SELECT url FROM labirint")
  28. urls = cursor.fetchall()
  29. parse_urls = []
  30. t = time.clock()
  31. for url in urls:
  32. parse_urls.append(url[0])
  33. pool = ThreadPool(10)
  34. print('Метка 1 {:.3f} seconds'.format(time.clock() - t))
  35. result = pool.map(reqs, parse_urls)
  36. print('Метка 2 {:.3f} seconds'.format(time.clock() - t))
  37. print(pool.map(extraction, result))
  38. print('Метка 3 {:.3f} seconds'.format(time.clock() - t))
  39. pool.close()
  40. pool.join()
  41.  
  42.  
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
  45.  
  46. Метка 1 0.030 seconds
  47. Метка 2 5.769 seconds
  48. [320, 320, 124, 416, 12, 713, 583, 192, 448, 384]
  49. Метка 3 45.567 seconds
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement