Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# 1. choose a URL to visit
# 2. get username and password
# 3. fetch source of url
# 4. fetch all urls in parallel and count 'VIS' occurrences
- import sys
- import json
- from threading import Thread
- import queue
- import time
- import requests
- import util
# 1. choose a URL to visit
# URLS.json is indexed like a list below. The context manager closes the
# file handle as soon as parsing is done instead of leaking it.
with open('URLS.json') as urls_file:
    all_urls = json.load(urls_file)
url = all_urls[123]  # fixed sample entry, used only for the connectivity probe
print('analyzing', url)

# 2. get username and password
# NOTE(review): util.get_config() is project-local; presumably it returns a
# nested mapping with an 'auth' section -- confirm against util.
config = util.get_config()
USERNAME = config['auth']['username']
PASSWORD = config['auth']['password']
print('using username', USERNAME)

# 3. fetch source of one url, to see if we're connected
r = requests.get(url, auth=(USERNAME, PASSWORD))
if r.status_code == 401:
    # Credentials were rejected: stop before launching the bulk fetch.
    print("[ERROR] NOT AUTHORIZED")
    sys.exit(1)  # exit w/ error 1
else:
    print(USERNAME, "connected")
# 4. fetch all urls and do something
q = queue.Queue()


# this runs in a thread with a threadid, a queue and (optionally) the
# url to fetch
def get_page(i, q, page_url=None):
    """Fetch one page and put its number of 'VIS' occurrences on the queue.

    Args:
        i: Thread id; only used in the progress and error messages.
        q: Queue that receives exactly one integer per call.
        page_url: URL to fetch. When omitted, the module-level ``url`` is
            used, so existing two-argument callers keep working.
    """
    if page_url is None:
        page_url = url
    print('thread', i, 'started')
    hits = 0
    try:
        r = requests.get(page_url, auth=(USERNAME, PASSWORD))
        hits = r.text.count('VIS')
    except requests.RequestException as exc:
        print('thread', i, 'failed:', exc)
    finally:
        # Always report a result: the collector below performs one blocking
        # q.get() per started thread and would otherwise wait forever.
        q.put(hits)


start = time.time()
batch = all_urls[:100]
total = len(batch)  # may be fewer than 100 if the URL list is short

for i, page_url in enumerate(batch):
    # Pass the URL to the thread explicitly. Reading a shared global that
    # this loop keeps rebinding would let threads fetch the wrong page.
    Thread(target=get_page, args=(i, q, page_url)).start()

# sequential version
#
# for i, page_url in enumerate(batch):
#     get_page(i, q, page_url)

counter = 0
for done in range(1, total + 1):
    # One result per started thread, in completion order.
    counter += q.get()
    print('[{i}/{all}]: counter'.format(i=done, all=total), counter)

print('final counter', counter)
print('finished in', time.time()-start, 'seconds')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement