rs6000
2019_PSE_0_CheckNewCsv
Jan 24th, 2019 | Python
import requests, re, os, wget, time

from bs4 import BeautifulSoup
base_url = "http://stockmarketpilipinas.com/"
url = 'http://stockmarketpilipinas.com/thread-337.html'
#url2='http://stockmarketpilipinas.com/thread-337-page-453.html'

runat = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
workpath = os.getcwd()
# where downloaded files are saved
mydir = os.path.join('/root/python_lab/', "daily_csv")
# duplicate files go here
mydir2 = os.path.join('/root/python_lab/', "duplicate_files")
DailyReport = ''
downloads_TXT = os.path.join('/root/python_lab/', 'download_files.txt')
DailyReport_TXT = os.path.join('/root/python_lab/', 'DailyReport.txt')

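# Hedged sketch (an assumption, not in the original paste): the script expects
# mydir and mydir2 to exist before wget.download() writes into them, so create
# them up front; with exist_ok=True, os.makedirs() is a no-op when they exist.
for d in (mydir, mydir2):
    os.makedirs(d, exist_ok=True)
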
filelist = []
with open(downloads_TXT, 'r') as f:
    data = f.readlines()
    for line in data:
        filelist.append(line.strip())
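
# Hedged guard (an assumption, not in the original paste): download_files.txt
# keeps the last crawled page number on its first line and one downloaded
# filename per line after that. If the file is empty, the filelist[0] lookup
# below would raise an IndexError, so seed it with page 1 as a default.
if not filelist:
    filelist = ['1']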
last_download_page = filelist[0]
#print("The last downloaded page was {}".format(last_download_page))
#print(len(filelist), filelist)

res = requests.get(url)
soup = BeautifulSoup(res.content, 'html5lib')
get_lastpage = soup.find("span", {"class": "pages"})
get_curren = soup.find("span", {"class": "pagination_current"})

last_page = re.sub(r'\D', '', get_lastpage.text)
curren_page = re.sub(r'\D', '', get_curren.text)
#print('Currently on page {} of the thread\nThe last page is page {}'.format(get_curren.text, last_page))

page_list = []
# starting page number
pg = int(last_download_page)
# ending page number
max_num = int(last_page)

download_files = []
error_page = []

if max_num != pg:
    for i in range(pg, max_num + 1):
        page_list.append(base_url + 'thread-337-page-' + str(i) + '.html')
else:
    page_list.append(base_url + 'thread-337-page-' + str(max_num) + '.html')

#print("pg={}\nmax_num={}".format(pg, max_num))
#print(page_list)

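# Hedged sketch (an assumption, not in the original paste): a transient network
# error makes wget.download() raise, and the except clause below then drops the
# file for the whole run. A small retry wrapper like this could replace the
# direct wget.download() call inside the loop.
def download_with_retry(href, out_path, tries=3, delay=1.0):
    for attempt in range(tries):
        try:
            return wget.download(href, out=out_path)
        except Exception:
            if attempt == tries - 1:
                raise
            time.sleep(delay)
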
for crawling_page in page_list:
    page_html = requests.get(crawling_page)
    page_soup = BeautifulSoup(page_html.text, 'lxml')
    title = page_soup.find('div', {'id': 'posts'}).find_all('fieldset')
    if title:
        print("Starting download from {}:".format(crawling_page))
        for i in title:
            try:
                # get the filename and convert it to lowercase
                f_name = i.find('a').text.lower()
                # get the file link
                f_href = base_url + i.find('a')['href']
                # first check whether the file is in the list from previous downloads
                if f_name in filelist:
                    print("File already exists: {}".format(f_name))
                    # if it is, skip it and check the next one
                    continue
                else:
                    # check whether the file exists on disk
                    isExists = os.path.exists(os.path.join(mydir, f_name))
                    if not isExists:
                        # download the file
                        print("Downloading file:", f_name)
                        # list of files downloaded this run
                        download_files.append(f_name)
                        # full list of all downloaded files
                        filelist.append(f_name)
                        wget.download(f_href, out=os.path.join(mydir, f_name))
                        # pause 0.3 seconds after downloading a file
                        time.sleep(0.3)
                    else:
                        # if the file already exists, download it to the other folder
                        print("File already exists: {}".format(f_name))
                        #wget.download(f_href, out=os.path.join(mydir2, f_name))
            except Exception as e:
                #msg = 'error: {0} {1} \n'.format(crawling_page, f_name)
                error_page.append(e)
                continue
    else:
        print("No data:", crawling_page)

msg = ''
if len(download_files):
    for i in download_files:
        msg = msg + str(i) + '\n'
else:
    msg = 'No new files to download this run'
#print(msg)

msg2 = ''
if len(error_page):
    for i in error_page:
        msg2 = msg2 + str(i) + '\n'
    print(msg2)
else:
    msg2 = 'No error messages!!!'

# filelist[0] records the last page crawled
filelist[0] = str(max_num)
#print(len(filelist), filelist)

# write the list back to the text file, updating the contents of filelist[0]
with open(downloads_TXT, 'w') as f:
    for item in filelist:
        f.write("%s\n" % item)

# build the daily work log
s = '================================='
DailyReport = 'Daily update report @ {}\n{}\nFiles downloaded this run:\n{}\nError messages: {}\n{}\n'.format(runat, s, msg, msg2, s)

# append the daily report to the text file
with open(DailyReport_TXT, 'a+') as f:
    f.write(DailyReport)
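
# Usage note (an assumption, not part of the original paste): the script is
# self-contained, so a daily run can be scheduled with cron, e.g.:
#   0 18 * * * /usr/bin/python3 /root/python_lab/2019_PSE_0_CheckNewCsv.py
# The script filename and interpreter path above are hypothetical.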