Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
'''
author: smilehsu
blog: smilehsu.cc
requirements: Windows 7, Python 3.5.2
date: 2017/02/14
Rewrote the program in an object-oriented style
2017/02/14 10am
Added MongoDB
'''
- import os, datetime, requests, shutil
- from bs4 import BeautifulSoup
- from pymongo import MongoClient
# base_url = 'http://meizitu.com/a/'  (unused: the base URL is passed to all_url() instead)

# Every listing-page URL generated by the crawler so far.
all_link = []

# URLs of pages that raised an error while being processed.
error_page = []

# Root folder under which one sub-folder per page title is created.
# Raw string: '\m' is not a valid escape sequence, so the plain literal
# triggers a DeprecationWarning/SyntaxWarning on newer Python versions.
dir_path = r'd:\meizitu'
class meizitu():
    """Crawler that downloads the images of numbered pages and logs them in MongoDB.

    For every page ``<url><n>.html`` the crawler reads the page title and the
    ``<img>`` tags inside the ``postContent`` div, creates a folder named
    after the title below the module-level ``dir_path``, downloads each image
    into it and stores one MongoDB document per downloaded image.

    Attributes:
        meizitu_collection: the ``meizitu`` collection of the ``crawlerDB``
            database.
        title: title text of the page currently being processed.
        url: link (``href``) of the page currently being processed.
        img_urls: image URLs of the page most recently passed to ``save``.
    """

    def __init__(self):
        """Open the MongoDB collection and reset the per-page state."""
        # MongoClient() is called without arguments, i.e. with PyMongo's
        # default connection settings.
        client = MongoClient()
        db = client['crawlerDB']
        self.meizitu_collection = db['meizitu']
        self.title = ''
        self.url = ''
        self.img_urls = []

    def all_url(self, url, maxpage):
        """Crawl pages ``url + '1.html'`` .. ``url + '<maxpage>.html'``.

        Args:
            url: base URL the page number and ``.html`` are appended to.
            maxpage: number of the last page to crawl (inclusive).

        The generated URLs are appended to the module-level ``all_link``
        list.  A page that fails for any reason is reported, appended to the
        module-level ``error_page`` list, and the crawl continues with the
        next page.
        """
        page_urls = [url + str(i) + '.html' for i in range(1, maxpage + 1)]
        all_link.extend(page_urls)
        # Iterate only over the pages of this call, so that calling all_url()
        # again does not re-crawl everything already stored in all_link.
        for page_url in page_urls:
            try:
                self._crawl_page(page_url)
            except Exception as exc:
                # Best-effort crawl: remember the failing page and go on.
                # ``Exception`` (not a bare except) keeps Ctrl-C working.
                print('error: {}'.format(page_url))
                print('reason: {!r}'.format(exc))
                error_page.append(page_url)

    def _crawl_page(self, page_url):
        """Fetch one page and download its images unless already recorded."""
        html = self.request(page_url)
        soup = BeautifulSoup(html.text, 'lxml')
        # The page title and its link live in the first <a> of div.metaRight.
        title = soup.find('div', {'class': 'metaRight'}).find('a')
        # All images of the post live in div.postContent.
        img_url = soup.find('div', {'class': 'postContent'}).find_all('img')
        print(title.text)
        # Remember title and link; save() writes them into every record.
        self.title = title.text
        self.url = title['href']
        # Skip pages that already have at least one record in the database.
        if self.meizitu_collection.find_one({'頁面連結': self.url}):
            print('該頁面已抓取過')
            return
        # The folder that receives the images is named after the page title.
        self.mkdir(self.title)
        self.save(img_url)

    def request(self, url, timeout=30):
        """Send a streaming GET request with a browser User-Agent.

        Args:
            url: address to fetch.
            timeout: seconds passed to ``requests.get`` so that a stalled
                server cannot block the crawl forever.

        Returns:
            The ``requests`` response with its encoding set to ``gb2312``.
        """
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        res = requests.get(url, headers=headers, stream=True, timeout=timeout)
        res.encoding = 'gb2312'
        return res

    def mkdir(self, dirname):
        """Create ``dir_path/<dirname>`` if it is missing and chdir into it.

        Args:
            dirname: folder name; surrounding whitespace is stripped.

        Returns:
            True when the folder was created, False when it already existed.

        Note:
            The process working directory is changed to the folder in both
            cases; ``save`` writes its files relative to it.
        """
        mydir_path = os.path.join(dir_path, dirname.strip())
        created = not os.path.exists(mydir_path)
        if created:
            print('建立資料夾:' + mydir_path)
            os.makedirs(mydir_path)
        else:
            print('資料夾已存在' + mydir_path)
        os.chdir(mydir_path)
        return created

    def save(self, img_url):
        """Download every image and record it in MongoDB.

        Args:
            img_url: iterable of items whose ``['src']`` is the image URL
                (the ``<img>`` tags found on the page).

        Each file is written to the current working directory under the last
        path component of its URL.  The database record is inserted only
        after the file has been written, so a failed download is not logged
        as done.  ``self.img_urls`` ends up holding the URLs handled by this
        call.
        """
        self.img_urls = []
        for pic in img_url:
            pic_src = pic['src']
            self.img_urls.append(pic_src)
            print('要下載的圖檔連結' + pic_src)
            # File name on disk: last path component of the image URL.
            pic_name = pic_src.split('/')[-1]
            get_pic = self.request(pic_src)
            try:
                # Have urllib3 undo gzip/deflate transfer encoding, if any,
                # so that the bytes on disk are the actual image.
                get_pic.raw.decode_content = True
                with open(pic_name, 'wb') as f:
                    shutil.copyfileobj(get_pic.raw, f)
            finally:
                # Always release the streamed connection.
                get_pic.close()
            post = {
                '頁面標題': self.title,
                '頁面連結': self.url,
                '圖片連結': pic_src,
                '下載時間': datetime.datetime.now()
            }
            # Collection.save() is deprecated and gone in PyMongo 4.
            self.meizitu_collection.insert_one(post)
# Script entry: build the crawler (its constructor opens the MongoDB
# collection) and crawl pages 1.html through 5.html below the /a/ path.
Meizitu = meizitu()
Meizitu.all_url(
    url='http://meizitu.com/a/',
    maxpage=5,
)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement