Advertisement
rs6000

crawler_meizitu

Feb 11th, 2017
570
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.24 KB | None | 0 0
  1. '''
  2. author:smilehsu
  3. blog:smilehsu.cc
  4. requirements:Windows7、python3.52
  5. date:2017/02/12
  6. '''
  7.  
  8. import os, re, requests, shutil
  9. from bs4 import BeautifulSoup
  10.  
  11. headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
  12. base_url='http://meizitu.com/a/'
  13. dir_path='d:\meizitu'
  14. all_page_link=[]
  15.  
  16. #列出全部套圖的連結
  17. #num的範圍 1 <= num <= 5481
  18.  
  19. #先爬50頁試試看
  20. num=50
  21.  
  22. for i in range(1,num+1):
  23.     page=base_url+str(i)+'.html'
  24.     all_page_link.append(page)
  25.    
  26. #print(all_page_link)
  27.  
  28. error_page=[]
  29.  
  30. #把頁面連結的list傳進來,逐頁處理
  31. #ll_page_link[30:35] 先丟30~35頁測試用
  32. for get_album in all_page_link[30:35]:
  33.     page_html=requests.get(get_album)
  34.     page_html.encoding='gb2312'
  35.     page_soup=BeautifulSoup(page_html.text,'lxml')
  36.     try:
  37.         #取得頁面的title跟該頁面的圖片連結
  38.         title=page_soup.find('div',{'class':'metaRight'}).find('a')
  39.         #取得圖片連結
  40.         album_pics=page_soup.find('div',{'class':'postContent'}).find_all('img')
  41.         print(get_album)
  42.         print(title.text)
  43.         #print('目前工作目錄:'+os.getcwd())
  44.         dir_name=title.text
  45.         isExists = os.path.exists(os.path.join(dir_path, dir_name))
  46.         mydir_path=os.path.join(dir_path, dir_name)
  47.         if not isExists:
  48.             print('建立資料夾:'+mydir_path)
  49.             os.makedirs(mydir_path)
  50.            
  51.         else:
  52.             print('資料夾已存在'+mydir_path)
  53.  
  54.     except:
  55.         print('error: {}'.format(get_album))
  56.         error_page.append(get_album)
  57.         pass
  58.     #開始下載前先切換到要存放圖檔的資料夾    
  59.     os.chdir(mydir_path)    
  60.     for pic in album_pics:
  61.         #路徑check
  62.         #print('目前工作目錄:'+os.getcwd())
  63.         #頁面裡的圖片連結
  64.         pic_src=pic['src']
  65.         print('要下載的圖檔連結'+pic_src)
  66.         #下載圖片後要存檔的檔名
  67.         pic_name=dir_name+'_'+pic_src.split('/')[-1]
  68.         #下載圖片
  69.         get_pic=requests.get(pic_src,headers=headers,stream=True)
  70.         f=open(pic_name,'wb')
  71.         shutil.copyfileobj(get_pic.raw,f)
  72.         f.close()
  73.         del get_pic
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement