crawler_meizitu_05_Proxy
by rs6000, Mar 1st, 2017
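Part 05 of the meizitu crawler series: a Downloader class that rotates random desktop User-Agent strings and, when direct requests keep failing, falls back to free HTTP proxies scraped from haoip.cc, picking a fresh proxy before each retry.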
# -*- coding: utf-8 -*-
"""
Created on Wed Mar  1 16:01:21 2017
@author: SmileHsu
blog: smilehsu.cc
requirements: Windows 7, Python 3.5.2
"""

# os, shutil, sqlite3 and BeautifulSoup are used by other parts of this
# crawler series; this part only needs requests, re, random and time.
import os, requests, shutil
import sqlite3 as lite
from bs4 import BeautifulSoup
import re, random, time

class Downloader(object):

    def __init__(self):
        # Build a pool of free proxies scraped from haoip.cc.
        self.ip_pool = []
        html = requests.get('http://haoip.cc/tiqu.htm')
        iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)

        for ip in iplistn:
            i = re.sub(r'\n', '', ip)  # raw string avoids an escape warning
            self.ip_pool.append(i.strip())

        # Pool of desktop User-Agent strings to rotate through.
        self.user_agent_pool = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        ]

    def get_user_agent(self):
        return random.choice(self.user_agent_pool)

    def get_url(self, url):
        ua = self.get_user_agent()
        return self.get_url_with_chosen_agent(url, ua)

    def get_url_with_chosen_agent(self, url, agent):
        headers = {'User-Agent': agent}
        response = requests.get(url, headers=headers)
        return response

    def get(self, url, timeout, proxy=None, num_retries=6):
        print('Fetching: {}'.format(url))
        ua = random.choice(self.user_agent_pool)
        headers = {'User-Agent': ua}

        # No proxy yet: try the request directly first.
        if proxy is None:
            try:
                return requests.get(url, headers=headers, timeout=timeout)

            except requests.exceptions.RequestException:
                if num_retries > 0:
                    time.sleep(10)
                    print('Fetch failed; retrying in 10 seconds ({} attempts left)'.format(num_retries))
                    # Pass the retry count by keyword so it is not taken
                    # as the proxy argument (a bug in the original paste).
                    return self.get(url, timeout, num_retries=num_retries - 1)

                else:
                    print('Switching to a proxy')
                    time.sleep(10)
                    ip = random.choice(self.ip_pool)
                    # requests expects a scheme in the proxy URL.
                    proxy = {'http': 'http://' + ip}
                    return self.get(url, timeout, proxy)

        # A proxy was requested: pick a fresh one from the pool on each call.
        else:
            try:
                ip = random.choice(self.ip_pool)
                proxy = {'http': 'http://' + ip}
                return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)

            except requests.exceptions.RequestException:
                if num_retries > 0:
                    time.sleep(10)
                    ip = random.choice(self.ip_pool)
                    proxy = {'http': 'http://' + ip}
                    print('Rotating proxy; retrying in 10 seconds ({} attempts left)'.format(num_retries))
                    print('Current proxy: {}'.format(proxy))
                    return self.get(url, timeout, proxy, num_retries - 1)

                else:
                    print('Giving up on proxies; falling back to a direct request')
                    # Return the Response object, not .text, so every code
                    # path hands back the same type.
                    return requests.get(url)

# Quick check against icanhazip.com, which echoes the requesting IP.
# Passing any non-None value as proxy forces the proxy branch; get()
# then substitutes a real proxy from the pool.
a = Downloader()
url = 'http://icanhazip.com'
print(a.get(url, 6, proxy=1).text)
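
# ---------------------------------------------------------------------
# A minimal sketch, not part of the original paste: one way the os
# import above could feed the image-saving step of the wider crawler.
# save_image(), the 'images' folder and the sample URL below are
# illustrative assumptions only.
def save_image(downloader, img_url, dest_dir='images'):
    os.makedirs(dest_dir, exist_ok=True)
    filename = os.path.join(dest_dir, img_url.split('/')[-1])
    res = downloader.get(img_url, 6)  # Response object on every code path
    with open(filename, 'wb') as f:
        f.write(res.content)
    return filename

# Example (hypothetical URL):
# save_image(a, 'http://example.com/sample.jpg')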