# -*- coding: utf-8 -*-
"""
Created on Tue Mar 7 04:37:23 2017
@author: SmileHsu
blog: smilehsu.cc
requirements: Windows 7, Python 3.5.2
"""
import os, requests, shutil
import sqlite3 as lite
from bs4 import BeautifulSoup
import re, random, time

#base_url='http://meizitu.com/a/'
fk = 1
all_link = []
error_page = []
# Raw strings keep the backslashes from being read as escape sequences
dir_path = r'd:\meizitu'
sqlite_path = r'd:\meizitu\meizituDB.sqlite'
class Downloader(object):
    def __init__(self):
        # Build a proxy pool by scraping a free proxy list
        self.ip_pool = []
        html = requests.get('http://haoip.cc/tiqu.htm')
        iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)
        for ip in iplistn:
            i = re.sub('\n', '', ip)
            self.ip_pool.append(i.strip())
        # Rotating User-Agent strings makes the crawler look less uniform
        self.user_agent_pool = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3"
        ]

    def get_user_agent(self):
        return random.choice(self.user_agent_pool)

    def get_url(self, url):
        ua = self.get_user_agent()
        return self.get_url_with_chosen_agent(url, ua)

    def get_url_with_chosen_agent(self, url, agent):
        headers = {'User-Agent': agent}
        response = requests.get(url, headers=headers)
        return response
    def get(self, url, timeout, proxy=None, num_retries=6, stream=True):
        print('Fetching: {}'.format(url))
        ua = random.choice(self.user_agent_pool)
        headers = {'User-Agent': ua}

        if proxy is None:
            try:
                return requests.get(url, headers=headers, timeout=timeout, stream=stream)
            except requests.exceptions.RequestException:
                if num_retries > 0:
                    time.sleep(10)
                    print('Fetch failed; retrying in 10 seconds, {} attempts left'.format(num_retries))
                    # Keyword arguments keep num_retries from being bound to
                    # the proxy parameter on the recursive call
                    return self.get(url, timeout, num_retries=num_retries - 1, stream=stream)
                else:
                    print('Switching to a proxy')
                    time.sleep(10)
                    ip = ''.join(str(random.choice(self.ip_pool)).strip())
                    proxy = {'http': ip}
                    return self.get(url, timeout, proxy, stream=stream)

        else:
            try:
                ip = ''.join(str(random.choice(self.ip_pool)).strip())
                proxy = {'http': ip}
                return requests.get(url, headers=headers, proxies=proxy, timeout=timeout, stream=stream)
            except requests.exceptions.RequestException:
                if num_retries > 0:
                    time.sleep(10)
                    ip = ''.join(str(random.choice(self.ip_pool)).strip())
                    proxy = {'http': ip}
                    print('Switching proxy; retrying in 10 seconds, {} attempts left'.format(num_retries))
                    print('Current proxy: {}'.format(proxy))
                    return self.get(url, timeout, proxy, num_retries - 1, stream)
                else:
                    print('Giving up on proxies; fetching directly')
                    # Return a Response like every other branch rather than
                    # the bare body text, so callers can use .text and .raw
                    return requests.get(url, headers=headers, timeout=timeout, stream=stream)
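
# A minimal usage sketch (illustrative only, not part of the crawler):
# Downloader.get returns a requests.Response; it tries a direct fetch
# first, then falls back to random proxies from ip_pool once the direct
# retries are exhausted. Passing any non-None proxy value forces the
# proxy branch from the first attempt.
#
#   d = Downloader()
#   res = d.get('http://meizitu.com/a/1.html', 6)            # direct fetch
#   res = d.get('http://meizitu.com/a/1.html', 6, proxy=1)   # via proxy pool
#   print(res.status_code)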
# SQL statements
# Drop the album table if one already exists
sql1 = "DROP TABLE IF EXISTS 'album';"
# Create a fresh album table
sql2 = "CREATE TABLE 'album' ('id' INTEGER PRIMARY KEY NOT NULL , 'title' VARCHAR);"
# Drop the album_images table if one already exists
sql3 = "DROP TABLE IF EXISTS 'album_images';"
# Create a fresh album_images table
# FOREIGN KEY(album_id) REFERENCES album(id) makes album_id a foreign key
# linked to the id column of the album table
sql4 = "CREATE TABLE 'album_images' ('img_id' INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL , 'album_id' INTEGER NOT NULL ,\
    'title' VARCHAR NOT NULL , 'img_src' VARCHAR NOT NULL , FOREIGN KEY(album_id) REFERENCES album(id) );"
# Connect to the database; meizituDB.sqlite is created automatically if it
# does not yet exist at sqlite_path
conn = lite.connect(sqlite_path)
# Build the schema before the crawler starts
cur = conn.cursor()
cur.execute(sql1)
cur.execute(sql2)
cur.execute(sql3)
cur.execute(sql4)
conn.commit()
conn.close()
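
# A query sketch (hypothetical, for after a crawl has run) showing how the
# foreign key ties the two tables together: list each stored image URL with
# the title of the album it belongs to.
#
#   conn = lite.connect(sqlite_path)
#   rows = conn.execute("SELECT album.title, album_images.img_src "
#                       "FROM album_images JOIN album "
#                       "ON album_images.album_id = album.id")
#   for row in rows:
#       print(row)
#   conn.close()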
class meizitu(Downloader):
    def all_url(self, url, maxpage):
        for i in range(1, maxpage + 1):
            page_url = url + str(i) + '.html'
            all_link.append(page_url)
        # Counter; doubles as the album's primary key
        counter = 1
        for p in all_link:
            html = self.get(p, 6)
            html.encoding = 'gb2312'
            soup = BeautifulSoup(html.text, 'lxml')
            try:
                # Grab the page title and the image links on the page
                title = soup.find('div', {'class': 'metaRight'}).find('a')
                img_url = soup.find('div', {'class': 'postContent'}).find_all('img')
                # Debugging aids
                #print(title.text)
                #print(len(img_url), img_url)
                # The page title becomes the name of the download folder
                dirname = title.text
                # Write the album row; parameterized SQL survives titles
                # that contain quotes
                conn = lite.connect(sqlite_path)
                cur = conn.cursor()
                cur.execute("INSERT OR IGNORE INTO album VALUES (?, ?);",
                            (counter, dirname))
                conn.commit()
                conn.close()
                # Create the folder
                self.mkdir(dirname)
                fk = counter
                #check fk value in main()
                #print('fk value in all_url():', fk)
                # Save the images
                self.save(img_url, dirname, fk)
                counter += 1
            except Exception as e:
                print('error: {}'.format(e))
                error_page.append(p)
    # Superseded by Downloader.get; kept for reference
    #def request(self, url):
    #    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
    #    res = requests.get(url, headers=headers, stream=True)
    #    res.encoding = 'gb2312'
    #    return res
    def mkdir(self, dirname):
        dirname = dirname.strip()
        mydir_path = os.path.join(dir_path, dirname)
        if not os.path.exists(mydir_path):
            print('Creating folder: ' + mydir_path)
            os.makedirs(mydir_path)
            os.chdir(mydir_path)
            return True
        else:
            print('Folder already exists: ' + mydir_path)
            os.chdir(mydir_path)
            return False
    def save(self, img_url, dirname, fk):
        #check fk value in save()
        #print('fk value in save():', fk)
        for pic in img_url:
            # Path check
            #print('current working directory: ' + os.getcwd())
            # Image link found on the page
            pic_src = pic['src']
            #print('image link to download: ' + pic_src)
            # The downloaded image keeps its original file name
            pic_name = pic_src.split('/')[-1]
            # Record the image in the database; parameterized SQL survives
            # quotes in titles and URLs
            conn = lite.connect(sqlite_path)
            cur = conn.cursor()
            cur.execute("INSERT INTO album_images (album_id, title, img_src) VALUES (?, ?, ?);",
                        (fk, dirname, pic_src))
            conn.commit()
            conn.close()
            # The saved file name matches the source file name, so an
            # existing file means the image was already downloaded
            if os.path.exists(pic_name):
                print('File {} already exists'.format(pic_name))
            else:
                # Download the image; proxy=1 makes Downloader.get use the
                # proxy pool from the first attempt
                print('Downloading {}'.format(pic_name))
                get_pic = self.get(pic_src, 6, proxy=1)
                with open(pic_name, 'wb') as f:
                    shutil.copyfileobj(get_pic.raw, f)
                del get_pic
Meizitu = meizitu()
Meizitu.all_url(url='http://meizitu.com/a/', maxpage=5)
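
# Pages that raised an exception during parsing were collected in
# error_page; printing them after the crawl makes a manual re-run easy
# (a hypothetical addition, not in the original script):
#
#   if error_page:
#       print('Pages that failed:')
#       for p in error_page:
#           print('  ' + p)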