Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
# Crawl the given index pages of the PTT "StupidClown" board and collect
# each listed article's title and body text.
#
# Requires httr (GET, content), xml2 (xml_find_all, xml_attr, xml_text),
# and magrittr (%>%), loaded elsewhere in the script.
#
# Args:
#   GivenPage: vector of board index-page numbers to crawl.
# Returns:
#   Invisibly, a data.frame with columns `title` and `contents`.
#   Side effect: also assigns the result to `hive` in the global
#   environment, preserved for backward compatibility with callers
#   that read that global.
Crawler <- function(GivenPage) {
  # Build the URLs of the index pages to crawl (paste0 is vectorized).
  target_pages <- paste0(
    "https://www.ptt.cc/bbs/StupidClown/index", GivenPage, ".html"
  )

  # From each index page, collect the URL of every article it lists.
  article_urls <- unlist(lapply(target_pages, function(page) {
    page %>%
      GET %>%
      content %>%
      xml_find_all("//div[@class='title']/a") %>%
      xml_attr("href") %>%
      paste0("https://www.ptt.cc", .)
  }))

  # Fetch and parse each article once; reuse the parsed docs below.
  docs <- lapply(article_urls, function(url) url %>% GET %>% content)

  # Title: the <title> tag reads "<article title> - <board name>"; keep
  # the part before the dash. (The original closed the anonymous
  # function's brace right after xml_find_all, piping the function
  # object itself into xml_text, and had a trailing comma in
  # strsplit(., "-", ) — both runtime errors, fixed here.)
  titles <- unlist(lapply(docs, function(doc) {
    doc %>%
      xml_find_all("//title") %>%
      xml_text %>%
      strsplit("-") %>%
      .[[1]] %>%
      .[1] %>%
      trimws
  }))

  # Body: text of div#main-content up to the first "--", which PTT uses
  # as the signature separator.
  contents <- unlist(lapply(docs, function(doc) {
    doc %>%
      xml_find_all("//div[@id='main-content']") %>%
      xml_text() %>%
      gsub("\n", " ", .) %>%
      strsplit("--") %>%
      .[[1]] %>%
      .[1] %>%
      trimws
  }))

  result <- data.frame(title = titles, contents = contents)

  # Keep the original global side effect for existing callers,
  # but also return the result so new callers need not touch globals.
  assign("hive", result, envir = globalenv())
  invisible(result)
}
Advertisement
Add Comment
Please, Sign In to add comment