crawler_meizitu_05_Proxy
by rs6000, Mar 1st, 2017
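Part 05 of the meizitu crawler series: a Downloader class that rotates random desktop User-Agent strings and, when direct requests keep failing, falls back to free HTTP proxies scraped from haoip.cc, picking a fresh proxy before each retry.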
# -*- coding: utf-8 -*-
"""
Created on Wed Mar  1 16:01:21 2017
@author: SmileHsu
blog: smilehsu.cc
requirements: Windows 7, Python 3.5.2
"""

# os, shutil, sqlite3 and BeautifulSoup are used by other parts of this
# crawler series; this part only needs requests, re, random and time.
import os, requests, shutil
import sqlite3 as lite
from bs4 import BeautifulSoup
import re, random, time

class Downloader(object):

    def __init__(self):
        # Build a pool of free proxies scraped from haoip.cc.
        self.ip_pool = []
        html = requests.get('http://haoip.cc/tiqu.htm')
        iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)

        for ip in iplistn:
            i = re.sub(r'\n', '', ip)  # raw string avoids an escape warning
            self.ip_pool.append(i.strip())

        # Pool of desktop User-Agent strings to rotate through.
        self.user_agent_pool = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        ]

    def get_user_agent(self):
        return random.choice(self.user_agent_pool)

    def get_url(self, url):
        ua = self.get_user_agent()
        return self.get_url_with_chosen_agent(url, ua)

    def get_url_with_chosen_agent(self, url, agent):
        headers = {'User-Agent': agent}
        response = requests.get(url, headers=headers)
        return response

    def get(self, url, timeout, proxy=None, num_retries=6):
        print('Fetching: {}'.format(url))
        ua = random.choice(self.user_agent_pool)
        headers = {'User-Agent': ua}

        # No proxy yet: try the request directly first.
        if proxy is None:
            try:
                return requests.get(url, headers=headers, timeout=timeout)

            except requests.exceptions.RequestException:
                if num_retries > 0:
                    time.sleep(10)
                    print('Fetch failed; retrying in 10 seconds ({} attempts left)'.format(num_retries))
                    # Pass the retry count by keyword so it is not taken
                    # as the proxy argument (a bug in the original paste).
                    return self.get(url, timeout, num_retries=num_retries - 1)

                else:
                    print('Switching to a proxy')
                    time.sleep(10)
                    ip = random.choice(self.ip_pool)
                    # requests expects a scheme in the proxy URL.
                    proxy = {'http': 'http://' + ip}
                    return self.get(url, timeout, proxy)

        # A proxy was requested: pick a fresh one from the pool on each call.
        else:
            try:
                ip = random.choice(self.ip_pool)
                proxy = {'http': 'http://' + ip}
                return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)

            except requests.exceptions.RequestException:
                if num_retries > 0:
                    time.sleep(10)
                    ip = random.choice(self.ip_pool)
                    proxy = {'http': 'http://' + ip}
                    print('Rotating proxy; retrying in 10 seconds ({} attempts left)'.format(num_retries))
                    print('Current proxy: {}'.format(proxy))
                    return self.get(url, timeout, proxy, num_retries - 1)

                else:
                    print('Giving up on proxies; falling back to a direct request')
                    # Return the Response object, not .text, so every code
                    # path hands back the same type.
                    return requests.get(url)

# Quick check against icanhazip.com, which echoes the requesting IP.
# Passing any non-None value as proxy forces the proxy branch; get()
# then substitutes a real proxy from the pool.
a = Downloader()
url = 'http://icanhazip.com'
print(a.get(url, 6, proxy=1).text)
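
# ---------------------------------------------------------------------
# A minimal sketch, not part of the original paste: one way the os
# import above could feed the image-saving step of the wider crawler.
# save_image(), the 'images' folder and the sample URL below are
# illustrative assumptions only.
def save_image(downloader, img_url, dest_dir='images'):
    os.makedirs(dest_dir, exist_ok=True)
    filename = os.path.join(dest_dir, img_url.split('/')[-1])
    res = downloader.get(img_url, 6)  # Response object on every code path
    with open(filename, 'wb') as f:
        f.write(res.content)
    return filename

# Example (hypothetical URL):
# save_image(a, 'http://example.com/sample.jpg')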