Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from multiprocessing.dummy import Pool as ThreadPool
- import dbconnect
- from bs4 import BeautifulSoup
- from urllib.request import *
- import random
- import re
- import time
# Module-level DB cursor from the project-local dbconnect helper.
# NOTE(review): presumably a MySQL/SQLite cursor used by main() below — confirm
# against dbconnect; it is never closed, which is acceptable for a one-shot script.
cursor = dbconnect.connection()
def reqs(url, encode='utf-8'):
    """Fetch *url* and return the raw response body as bytes.

    A User-Agent header is picked at random from a small pool of desktop
    browser strings to vary the request fingerprint.  The ``encode``
    parameter is accepted for interface compatibility but is not used:
    the body is returned undecoded.
    """
    user_agents = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2467.2 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    )
    req = Request(url)
    req.add_header('User-Agent', random.choice(user_agents))
    return urlopen(req).read()
def extraction(html):
    """Extract the page count from a Labirint listing page.

    Parses *html*, reads the text of the ``<div class="pages2">`` paginator,
    and returns the first run of digits in it as an int.

    Raises AttributeError if the div is missing, and TypeError/ValueError
    if it contains no digits.
    """
    soup = BeautifulSoup(html, 'html.parser')
    page = soup.find("div", class_="pages2").string
    # BUG FIX: the original pattern r'd+' matched the literal letter "d",
    # not digits — \d+ is the digit class.
    return int(re.search(r'\d+', page).group())
def main():
    """Load all Labirint URLs from the DB, fetch them on a 10-thread pool,
    and print the extracted page counts with elapsed-time checkpoints."""
    cursor.execute("SELECT url FROM labirint")
    urls = cursor.fetchall()
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended high-resolution timer for measuring elapsed time.
    t = time.perf_counter()
    # fetchall() yields 1-tuples; flatten to a plain list of URL strings.
    parse_urls = [row[0] for row in urls]
    pool = ThreadPool(10)
    print('Метка 1 {:.3f} seconds'.format(time.perf_counter() - t))
    result = pool.map(reqs, parse_urls)
    print('Метка 2 {:.3f} seconds'.format(time.perf_counter() - t))
    print(pool.map(extraction, result))
    print('Метка 3 {:.3f} seconds'.format(time.perf_counter() - t))
    pool.close()
    pool.join()


if __name__ == '__main__':
    main()
Метка 1 0.030 seconds
Метка 2 5.769 seconds
[320, 320, 124, 416, 12, 713, 583, 192, 448, 384]
Метка 3 45.567 seconds
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement