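# Daily crawler for the stockmarketpilipinas.com "thread-337" forum thread.
# It reads download_files.txt (first line = last page crawled, remaining lines =
# filenames already downloaded), fetches every thread page since then, downloads
# new attachments with wget, then updates the list and appends a daily report.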
import requests, re, os, csv, wget, time
from bs4 import BeautifulSoup

base_url = "http://stockmarketpilipinas.com/"
url = 'http://stockmarketpilipinas.com/thread-337.html'
#url2='http://stockmarketpilipinas.com/thread-337-page-453.html'
runat = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
workpath = os.getcwd()
# directory where downloaded files are saved
mydir = os.path.join('/root/python_lab/', "daily_csv")
# duplicate files go here
mydir2 = os.path.join('/root/python_lab/', "duplicate_files")
DailyReport = ''
downloads_TXT = os.path.join('/root/python_lab/', 'download_files.txt')
DailyReport_TXT = os.path.join('/root/python_lab/', 'DailyReport.txt')
filelist = []
with open(downloads_TXT, 'r') as f:
    data = f.readlines()
    for line in data:
        filelist.append(line.strip())
last_download_page = filelist[0]
#print("The last downloaded page was {}".format(last_download_page))
#print(len(filelist), filelist)
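# Fetch the thread's landing page and read the pagination spans to find the
# current page and the total number of pages.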
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html5lib')
get_lastpage = soup.find("span", {"class": "pages"})
get_curren = soup.find("span", {"class": "pagination_current"})
last_page = re.sub(r'\D', '', get_lastpage.text)
curren_page = re.sub(r'\D', '', get_curren.text)
#print('Currently on page {} of the thread; the last page is {}'.format(get_curren.text, last_page))
page_list = []
# starting page
pg = int(last_download_page)
# ending page
max_num = int(last_page)
download_files = []
error_page = []
if max_num != pg:
    for i in range(pg, max_num + 1):
        get_page = str(pg)
        pg += 1
        page_list.append(base_url + 'thread-337-page-' + get_page + '.html')
else:
    page_list.append(base_url + 'thread-337-page-' + str(max_num) + '.html')
#print("PG={}\nmax_num={}".format(pg, max_num))
#print(page_list)
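# Crawl each page in page_list: every attachment link sits inside a <fieldset>
# within the div#posts container; download anything not seen before.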
for crawling_page in page_list:
    page_html = requests.get(crawling_page)
    page_soup = BeautifulSoup(page_html.text, 'lxml')
    title = page_soup.find('div', {'id': 'posts'}).find_all('fieldset')
    if title:
        print("Downloading data from {}:".format(crawling_page))
        for i in title:
            try:
                # get the file name and convert it to lowercase
                f_name = i.find('a').text.lower()
                # get the file link
                f_href = base_url + i.find('a')['href']
                # first check whether the file is in the list downloaded last time
                if f_name in filelist:
                    print("Already have file: {}".format(f_name))
                    # if so, skip it and check the next one
                    continue
                else:
                    # check whether the file already exists on disk
                    isExists = os.path.exists(os.path.join(mydir, f_name))
                    if not isExists:
                        # download the file
                        print("Downloading file:", f_name)
                        # files downloaded in this run
                        download_files.append(f_name)
                        # full list of all downloaded files
                        filelist.append(f_name)
                        wget.download(f_href, out=os.path.join(mydir, f_name))
                        # pause 0.3 seconds after each download
                        time.sleep(0.3)
                    else:
                        # if the file already exists, download it to the other folder instead
                        print("Already have file: {}".format(f_name))
                        #wget.download(f_href, out=os.path.join(mydir2, f_name))
            except Exception as e:
                #msg = 'error: {0} {1} \n'.format(crawling_page, f_name)
                error_page.append(e)
                continue
    else:
        print("No data on:", crawling_page)
msg = ''
if len(download_files):
    for i in download_files:
        msg = msg + str(i) + '\n'
else:
    msg = 'No new files to download this run'
#print(msg)
msg2 = ''
if len(error_page):
    for i in error_page:
        msg2 = msg2 + str(i) + '\n'
    print(msg2)
else:
    msg2 = 'No error messages!!!'
# filelist[0] records the last page that was fetched
filelist[0] = str(max_num)
#print(len(filelist), filelist)
# write the list back to the text file, updating filelist[0]
with open(downloads_TXT, 'w') as f:
    for item in filelist:
        f.write("%s\n" % item)
# build the daily work log entry
s = '================================='
DailyReport = 'Daily update report @ {} \n{}\nFiles downloaded this run:\n{}\nError messages: {}\n{}\n'.format(runat, s, msg, msg2, s)
# append the report to the log text file
with open(DailyReport_TXT, 'a+') as f:
    f.write(DailyReport)