python_taile

# Network
import urllib.request, urllib.parse, urllib.error
import http.cookiejar
import requests

# Concurrency
import threading
import queue
import itertools

# Etc
import os
import time

# Global variables
#in_filename = 'input/3.txt'
in_directory = './input/'
out_filename = 'output/out_filtered2.txt'
#test_url = 'http://www.google.com/humans.txt'
test_url = 'http://www.koolinar.ru/recipe/view/'
thread_number = 100
timeout_value = 10   # seconds, passed to opener.open()

ok_msg = "OK! "
fail_msg = "FAIL "

# Stats
good_proxy_num = itertools.count()   # advanced once per working proxy
start_time = time.time()
end_time = time.time()

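# Input format (assumed, not stated in the paste): each *.txt file under
# in_directory is expected to hold one proxy per line, e.g. "1.2.3.4:8080",
# which is what ProxyHandler({'http': proxy}) gets fed below.
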
# Safe print(): serialize console output from many threads
mylock = threading.Lock()
def sprint(*a, **b):
    with mylock:
        print(*a, **b)


#
# Printer
#
class PrintThread(threading.Thread):
    """Single writer thread: drains a queue of result lines into an output file."""

    def __init__(self, queue, filename):
        threading.Thread.__init__(self)
        self.queue = queue
        self.output = open(filename, 'a')
        self.shutdown = False

    def write(self, line):
        print(line, file=self.output)

    def run(self):
        while not self.shutdown:
            lines = self.queue.get()
            self.write(lines)
            self.queue.task_done()

    def terminate(self):
        # Stop the loop before releasing the file handle
        self.shutdown = True
        self.output.close()


#
# Processor
#
class ProcessThread(threading.Thread):
    def __init__(self, id, task_queue, out_queue):
        threading.Thread.__init__(self)
        self.task_queue = task_queue
        self.out_queue = out_queue
        self.id = id

    # Pull proxies off the task queue forever; push working ones to the out queue
    def run(self):
        while True:
            task = self.task_queue.get()
            result = self.process2(task)

            if result is not None:
                self.out_queue.put(result)
                next(good_proxy_num)

            self.task_queue.task_done()

    # Do the processing job here: try to fetch a recipe page through the proxy
    def process2(self, task):
        proxy = task
        log_msg = "Thread #%3d. Trying HTTP proxy %21s \t\t" % (self.id, proxy)

        cj = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cj),
            urllib.request.HTTPRedirectHandler(),
            urllib.request.ProxyHandler({'http': proxy})
        )
        # Candidate page ids on koolinar.ru used as test targets
        ch_pages = [139761,101689,139762,139770,60470,139773,139782,139780,139779,139794,139800,139797,139801,139770,139799,139806,139820,139825,139822,139831,139834,139736,139839,139836,139768,139843,139736,139831,139799,139825,139832,139838,139795,139838,139844,139843,139848,139849,139847,139801,139761,139856,139858,139755,139832,139801,139824,139853,139761,139872,139874,139875,139877,139875,139736,139882,139799,139887]
        #ch_pages.append(139799)

        for page in ch_pages:
            try:
                t1 = time.time()
                response = opener.open(test_url + str(page), timeout=timeout_value).read()
                t2 = time.time()
            except Exception as e:
                log_msg += "%s (%s)" % (fail_msg, str(e))
                sprint(log_msg)
                return None
            # Accept the proxy after the first page that loads successfully
            log_msg += ok_msg + " Response time: %d, length=%s" % (int((t2 - t1) * 1000), len(response))
            sprint(log_msg)
            return proxy

    def terminate(self):
        pass
        #print("Thread #%d is down..." % (self.id))

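
# A minimal alternative sketch: the 'requests' import at the top is never
# exercised anywhere in the paste, so here is roughly how the same single-page
# check could look with it. The helper name check_with_requests and the
# hard-coded page id are illustrative only, not part of the original flow.
def check_with_requests(proxy):
    try:
        r = requests.get(test_url + str(139761),
                         proxies={'http': 'http://' + proxy},
                         timeout=timeout_value)
        return proxy if r.ok else None
    except requests.RequestException:
        return None
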
#
# Main starts here
#
# Init the queues
input_queue = queue.Queue()
result_queue = queue.Queue()


# Spawn worker threads
workers = []
for i in range(0, thread_number):
    t = ProcessThread(i, input_queue, result_queue)
    t.daemon = True
    t.start()
    workers.append(t)

# Spawn the printer thread that writes results to the output file
f_printer = PrintThread(result_queue, out_filename)
f_printer.daemon = True
f_printer.start()

# Fill the input queue with proxies read from disk
start_time = time.time()

proxy_list = []
for root, dirs, files in os.walk(in_directory):
    for file in files:
        if file.endswith(".txt"):
            # read all lines from the file
            file_line_list = [line.rstrip('\n') for line in open(os.path.join(root, file), 'r')]
            # append to proxy_list
            proxy_list.extend(file_line_list)

for proxy in proxy_list:
    input_queue.put(proxy)

total_proxy_num = len(proxy_list)
print("got %d proxies to check" % total_proxy_num)

if total_proxy_num == 0:
    exit()

# Wait until both queues are fully processed
input_queue.join()
result_queue.join()

#while (not input_queue.empty()):
#    time.sleep(1)

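# Note: Queue.join() blocks until task_done() has been called for every item
# that was put() on the queue, so the commented-out polling loop above is not
# needed.
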

# Shutdown
f_printer.terminate()

for worker in workers:
    worker.terminate()

# Print some stats; the counter's next value equals the number of good proxies found
good_proxy_num = float(next(good_proxy_num))
print("In: %d. Good: %d, that's %.2f%%" % (total_proxy_num, good_proxy_num, 100.0 * good_proxy_num / total_proxy_num))

end_time = time.time()
print("Time elapsed: %.1f seconds." % (end_time - start_time))
print("Bye-bye!")
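
# Usage note (assumed, not stated in the paste): drop proxy lists as .txt files
# under ./input/ and make sure the output/ directory exists, since the printer
# thread opens output/out_filtered2.txt in append mode; then run the script
# with Python 3.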