Guest User

Untitled

a guest
Oct 21st, 2016
198
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.15 KB | None | 0 0
  #' Crawl PTT StupidClown index pages and collect every linked article.
  #'
  #' Builds one index URL per element of `GivenPage`, gathers the article links
  #' found on those pages, downloads each article and extracts its title and
  #' main text.
  #'
  #' The functions `GET()`, `content()`, `xml_find_all()`, `xml_attr()` and
  #' `xml_text()` must already be attached by the caller
  #' (presumably from httr and xml2 -- TODO confirm against the loading script).
  #'
  #' @param GivenPage Vector of index page identifiers. Each element is pasted
  #'   between "index" and ".html" of the board URL.
  #' @return Invisibly, a data frame with the columns `title` and `contents`,
  #'   one row per article link found. A page that cannot be downloaded or
  #'   parsed raises a warning and contributes `NA` values instead of aborting
  #'   the whole crawl. As a side effect the same data frame is assigned to
  #'   `hive` in the global environment.
  Crawler <- function(GivenPage) {
    site_root <- "https://www.ptt.cc"

    # Download and parse one page; NULL (plus a warning) when that fails.
    fetch_page <- function(url) {
      tryCatch(
        {
          page <- content(GET(url))
          if (!inherits(page, "xml_document")) {
            stop("response is not a parsed HTML document")
          }
          page
        },
        error = function(e) {
          warning(
            "Failed to fetch ", url, ": ", conditionMessage(e),
            call. = FALSE
          )
          NULL
        }
      )
    }

    # Text of the first node matching `xpath`, or NA when there is none.
    first_text <- function(doc, xpath) {
      if (is.null(doc)) {
        return(NA_character_)
      }
      found <- xml_text(xml_find_all(doc, xpath))
      if (length(found) == 0L) NA_character_ else found[[1L]]
    }

    # Absolute article URLs linked from one index page.
    collect_links <- function(index_url) {
      doc <- fetch_page(index_url)
      if (is.null(doc)) {
        return(character(0))
      }
      hrefs <- xml_attr(xml_find_all(doc, "//div[@class='title']/a"), "href")
      hrefs <- hrefs[!is.na(hrefs)]
      # paste0() would turn a zero-length vector into the bare site root,
      # which would then be fetched as if it were an article.
      if (length(hrefs) == 0L) {
        return(character(0))
      }
      paste0(site_root, hrefs)
    }

    # Article title taken from the page <title>.
    # NOTE(review): assumes the page title has the form
    # "<article title> - <board label> - <site name>"; only those two trailing
    # parts are dropped, so hyphens inside the article title survive.
    extract_title <- function(doc) {
      raw <- first_text(doc, "//title")
      if (is.na(raw)) {
        return(NA_character_)
      }
      pieces <- strsplit(raw, " - ", fixed = TRUE)[[1L]]
      keep <- min(max(length(pieces) - 2L, 1L), length(pieces))
      trimws(paste(pieces[seq_len(keep)], collapse = " - "))
    }

    # Article text up to the first line that consists solely of "--".
    # NOTE(review): that line is presumed to be the separator in front of the
    # footer and comments -- confirm against a live page.
    # Cutting on whole lines keeps a "--" inside the body text intact.
    extract_body <- function(doc) {
      raw <- first_text(doc, "//div[@id='main-content']")
      if (is.na(raw)) {
        return(NA_character_)
      }
      text_lines <- strsplit(raw, "\n", fixed = TRUE)[[1L]]
      separator <- which(trimws(text_lines) == "--")
      if (length(separator) > 0L) {
        text_lines <- text_lines[seq_len(separator[[1L]] - 1L)]
      }
      trimws(paste(text_lines, collapse = " "))
    }

    # Assemble the index pages to crawl.
    index_urls <- paste0(
      "https://www.ptt.cc/bbs/StupidClown/index", GivenPage, ".html"
    )

    # First collect the URL of every article listed on those index pages.
    article_urls <- as.character(
      unlist(lapply(index_urls, collect_links), use.names = FALSE)
    )

    # Then visit each article and pull out its title and text. vapply()
    # guarantees exactly one value per article, so both columns stay aligned.
    article_docs <- lapply(article_urls, fetch_page)
    titlelist <- vapply(
      article_docs, extract_title, character(1), USE.NAMES = FALSE
    )
    articlecontent <- vapply(
      article_docs, extract_body, character(1), USE.NAMES = FALSE
    )

    # Combine everything into one data frame for easier inspection.
    WhatUMightWant <- data.frame(
      title = titlelist,
      contents = articlecontent
    )

    # Kept for backward compatibility: existing callers read the global `hive`.
    assign("hive", WhatUMightWant, envir = globalenv())
    invisible(WhatUMightWant)
  }
Advertisement
Add Comment
Please, Sign In to add comment