celestialgod

抓幾米長城文章

May 23rd, 2016
200
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 1.57 KB | None | 0 0
  1. library(httr)
  2. library(xml2)
  3. library(pipeR)
  4. library(purrr)
  5. library(stringi)
  6. library(stringr)
  7.  
  8. URLdecodeToBIG5 <- function(url)
  9. {
  10.   url_raw <- charToRaw(url)
  11.   utf8code <- (url_raw == charToRaw("%")) %>>% (~ loc_utf8 <- . ) %>>% rle %>>%
  12.     (cumsum(.$lengths)[.$lengths == 1 & .$values == TRUE]) %>>%
  13.     (c(.+1, .+2)) %>>% sort %>>% (~ remove_loc <- .) %>>% (url_raw[.]) %>>% as.integer
  14.   utf8code[utf8code > 96L] <- utf8code[utf8code > 96L] - 32L
  15.   utf8code[utf8code > 57L] <- utf8code[utf8code > 57L] - 7L
  16.  
  17.   url_raw[loc_utf8] <- utf8code %>>% split(rep(1:(length(utf8code)/2), each = 2)) %>>%
  18.     map(~sum((. - 48L) * c(16L, 1L))) %>>% as.character %>>% as.raw
  19.   stringi::stri_conv(url_raw[setdiff(1:length(url_raw), remove_loc)], from = "UTF8", to = "BIG5")
  20. }
  21.  
  22. # url <- "http://www.jimmyfans.com/3/101658/%E6%84%9B%E6%83%85%E7%9A%84%E4%BA%94%E9%9D%A2%E9%8F%A1%E5%AD%90.html"
  23. url <- "http://www.jimmyfans.com/3/112612/%E4%B8%80%E8%88%AC%E4%BA%BA%E7%9C%8B%E4%B8%8D%E6%87%82%E7%9C%8B%E6%87%82%E7%9A%84%E7%B5%95%E5%B0%8D%E4%B8%8D%E6%98%AF%E4%B8%80%E8%88%AC%E4%BA%BA.html"
  24. outputName <- URLdecodeToBIG5(url) %>>% str_extract("\\d{6}/.*\\.html") %>>%
  25.   str_replace_all("\\d{6}/", "") %>>% str_replace_all("\\.html", "")
  26.  
  27. GET(url) %>>% content(encoding = "UTF8") %>>%
  28.   xml_find_all("//div[@class='contentBox']/div[@class='div_object_desc']") %>>%
  29.   xml_contents %>>% sapply(xml_text) %>>% (.[sapply(., str_length) > 3 & . != "Advertisement"]) %>>%
  30.   stri_conv(from = "UTF8", to = "BIG5") %>>% str_replace_all("\t|\n", "") %>>%
  31.   write(str_c(outputName, ".txt"))
Advertisement
Add Comment
Please, Sign In to add comment