# 1. choose a url to visit
# 2. get username and password
# 3. fetch source of url

import sys
import json
from threading import Thread
import queue
import time

import requests

import util

# 1. choose a url to visit
all_urls = json.load(open('URLS.json'))
url = all_urls[123]

print('analyzing', url)

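# note (not part of the original paste): URLS.json is assumed to be a JSON
# array of URL strings, something like
#   ["http://example.com/page/1", "http://example.com/page/2", ...]
# so that all_urls[123] and the all_urls[:100] slices below make sense.
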
# 2. get username and password
config = util.get_config()
USERNAME = config['auth']['username']
PASSWORD = config['auth']['password']

print('using username', USERNAME)
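
# note (not part of the original paste): util is the author's own helper
# module and is not included in the paste. A minimal stand-in, assuming the
# credentials live in a local JSON file named config.json, might be:
#
#   # util.py
#   import json
#
#   def get_config(path='config.json'):
#       # expected shape: {"auth": {"username": "...", "password": "..."}}
#       with open(path) as f:
#           return json.load(f)
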
# 3. fetch source of one url, to see if we're connected

r = requests.get(url, auth=(USERNAME, PASSWORD))
if r.status_code == 401:
    print("[ERROR] NOT AUTHORIZED")
    sys.exit(1)  # exit w/ error 1
else:
    print(USERNAME, "connected")

# 4. fetch all urls and do something
q = queue.Queue()


# this runs in a thread with three arguments: a thread id, the url
# to fetch and a queue to put the result on
def get_page(i, url, q):
    print('thread', i, 'started')
    r = requests.get(url, auth=(USERNAME, PASSWORD))
    source = r.text
    q.put(source.count('VIS'))


start = time.time()
for i, url in enumerate(all_urls[:100]):
    Thread(target=get_page, args=(i, url, q)).start()

# sequential version
#
# for url in all_urls[:100]:
#     get_page(0, url, q)

counter = 0
for i, url in enumerate(all_urls[:100]):
    counter += q.get()
    print('[{i}/{all}]: counter'.format(i=i+1, all=100), counter)
print('final counter', counter)
print('finished in', time.time() - start, 'seconds')
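
# note (not part of the original paste): the same fetch-and-count could also
# be written with the standard-library concurrent.futures module instead of
# managing Thread and Queue by hand. A rough sketch, assuming the same
# all_urls, USERNAME and PASSWORD as above (max_workers=20 is an arbitrary
# choice here):
#
# from concurrent.futures import ThreadPoolExecutor
#
# def count_vis(url):
#     r = requests.get(url, auth=(USERNAME, PASSWORD))
#     return r.text.count('VIS')
#
# with ThreadPoolExecutor(max_workers=20) as pool:
#     counts = pool.map(count_vis, all_urls[:100])
#     print('final counter', sum(counts))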