Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Wed Mar 1 16:01:21 2017
- @author: SmileHsu
- blog:smilehsu.cc
- requirements: Windows 7, Python 3.5.2
- """
- import os, requests, shutil
- import sqlite3 as lite
- from bs4 import BeautifulSoup
- import re, random, time
class Downloader(object):
    """Fetch pages with a rotating User-Agent and an optional proxy fallback.

    On construction the instance scrapes a list of proxy addresses from
    ``PROXY_LIST_URL`` into ``ip_pool`` and copies ``DEFAULT_USER_AGENTS``
    into ``user_agent_pool``.  Both pools are plain lists and may be edited
    by the caller afterwards.
    """

    # Page that is scraped for proxy addresses when an instance is created.
    PROXY_LIST_URL = 'http://haoip.cc/tiqu.htm'
    # Seconds to wait for the proxy-list page before giving up on it.
    PROXY_LIST_TIMEOUT = 10

    DEFAULT_USER_AGENTS = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    )

    def __init__(self):
        """Build the proxy pool (best effort) and the User-Agent pool."""
        self.ip_pool = []
        try:
            html = requests.get(self.PROXY_LIST_URL,
                                timeout=self.PROXY_LIST_TIMEOUT)
        except requests.RequestException as err:
            # The downloader is still usable without proxies, so an
            # unreachable proxy list must not abort construction.
            print('無法取得proxy清單: {}'.format(err))
        else:
            self.ip_pool = self._parse_proxy_list(html.text)
        self.user_agent_pool = list(self.DEFAULT_USER_AGENTS)

    @staticmethod
    def _parse_proxy_list(page_text):
        """Return the non-empty proxy entries found in the proxy-list HTML."""
        entries = []
        for raw in re.findall(r'r/>(.*?)<b', page_text, re.S):
            entry = raw.replace('\n', '').strip()
            if entry:  # skip blank fragments between consecutive tags
                entries.append(entry)
        return entries

    def _pick_proxy(self):
        """Return a ``proxies`` mapping for a random pool entry, or None.

        None is returned when ``ip_pool`` is empty so callers can fall back
        instead of hitting ``IndexError`` from ``random.choice``.
        """
        if not self.ip_pool:
            return None
        # NOTE: only the 'http' scheme is mapped, so requests will not route
        # https:// URLs through the chosen proxy.
        return {'http': str(random.choice(self.ip_pool)).strip()}

    def get_user_agent(self):
        """Return one User-Agent string chosen at random from the pool."""
        return random.choice(self.user_agent_pool)

    def get_url(self, url, timeout=None):
        """Fetch *url* once with a random User-Agent and return the response.

        *timeout* is forwarded to ``requests.get``; the default ``None``
        keeps the original behavior of waiting indefinitely.
        """
        ua = self.get_user_agent()
        return self.get_url_with_chosen_agent(url, ua, timeout=timeout)

    def get_url_with_chosen_agent(self, url, agent, timeout=None):
        """Fetch *url* once, sending *agent* as the User-Agent header."""
        headers = {'User-Agent': agent}
        return requests.get(url, headers=headers, timeout=timeout)

    def get(self, url, timeout, proxy=None, num_retries=6):
        """Fetch *url* with retries, falling back between direct and proxy mode.

        Parameters:
            url: address to request.
            timeout: per-request timeout passed to ``requests.get``.
            proxy: ``None`` starts with direct requests.  A dict is used as
                the ``proxies`` mapping for the first proxied attempt.  Any
                other non-None value (for example ``1``) only switches proxy
                mode on, and a random entry from ``ip_pool`` is used.
            num_retries: how many more attempts to make in the current mode
                after a failure, sleeping 10 seconds before each.

        Returns:
            The ``requests`` response object, on every path.

        Raises:
            requests.RequestException: when direct retries are exhausted and
                no proxy is available, or when the final direct request made
                after the proxy retries are exhausted fails as well.
        """
        print('開始擷取: {}'.format(url))
        headers = {'User-Agent': self.get_user_agent()}

        if proxy is None:
            try:
                return requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException:
                if num_retries > 0:
                    time.sleep(10)
                    print('擷取網頁出錯10秒後,再嘗試{}次擷取'.format(num_retries))
                    # Pass by keyword: positionally this value would land in
                    # the ``proxy`` slot and reset the retry counter.
                    return self.get(url, timeout,
                                    num_retries=num_retries - 1)
                first_proxy = self._pick_proxy()
                if first_proxy is None:
                    # Nothing to fall back to; surface the real error.
                    raise
                print('開始使用prxoy')
                time.sleep(10)
                return self.get(url, timeout, proxy=first_proxy)

        # Proxy mode.  A non-dict flag means "choose a proxy for me".
        if not isinstance(proxy, dict):
            proxy = self._pick_proxy()
        if proxy is None:
            # Proxy mode was requested but the pool is empty.
            print('取消使用proxy')
            return requests.get(url, headers=headers, timeout=timeout)
        try:
            return requests.get(url, headers=headers, proxies=proxy,
                                timeout=timeout)
        except requests.RequestException:
            if num_retries > 0:
                time.sleep(10)
                # Keep the current proxy if the pool has nothing else.
                next_proxy = self._pick_proxy() or proxy
                print('正在更換proxy,10秒後再嘗試{}次擷取'.format(num_retries))
                print('目前使用的proxy:{}'.format(next_proxy))
                return self.get(url, timeout, proxy=next_proxy,
                                num_retries=num_retries - 1)
            print('取消使用proxy')
            # Return the response itself (not ``.text``) so every path of
            # this method has the same return type.
            return requests.get(url, headers=headers, timeout=timeout)
# Demo: request a "what is my IP" service in proxy mode.  Any non-None
# ``proxy`` value (here 1) makes ``get`` take its proxy branch.
url = 'http://icanhazip.com'
a = Downloader()
# The body text is evaluated but not stored or printed; it is only echoed
# when this runs in an interactive session.
a.get(url, timeout=6, proxy=1).text
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement