# 抓幾米長城文章 (scrape an article from 幾米長城)

library(httr)
library(xml2)
library(pipeR)
library(purrr)
library(stringi)
library(stringr)

# Percent-decode a URL and convert the decoded bytes from UTF-8 to BIG5.
#
# Args:
#   url: a single character string. Every "%XX" escape (XX = two hex digits,
#        upper or lower case) is replaced by the byte it denotes; the decoded
#        byte sequence is expected to be UTF-8 text.
#
# Returns:
#   The value of stringi::stri_conv() applied to the decoded bytes with
#   from = "UTF8" and to = "BIG5".
#
# A "%" that is not followed by two hex digits is kept as a literal character,
# and input without any escapes (including "") is passed through unchanged.
URLdecodeToBIG5 <- function(url) {
  url_raw <- charToRaw(url)

  # Candidate escapes: every "%" that still has two bytes after it.
  pct_loc <- which(url_raw == charToRaw("%"))
  pct_loc <- pct_loc[pct_loc + 2L <= length(url_raw)]

  # The two characters following each "%", glued into one string per escape.
  hex_pairs <- paste0(
    rawToChar(url_raw[pct_loc + 1L], multiple = TRUE),
    rawToChar(url_raw[pct_loc + 2L], multiple = TRUE)
  )

  # Only genuine "%XX" sequences are decoded; anything else stays literal.
  # useBytes avoids encoding checks on stray non-ASCII bytes.
  is_escape <- grepl("^[0-9A-Fa-f]{2}$", hex_pairs, useBytes = TRUE)
  pct_loc <- pct_loc[is_escape]
  hex_pairs <- hex_pairs[is_escape]

  # Store the decoded byte where the "%" was, then drop the two hex digits.
  url_raw[pct_loc] <- as.raw(strtoi(hex_pairs, base = 16L))
  drop_loc <- c(pct_loc + 1L, pct_loc + 2L)
  # x[-integer(0)] would select nothing, so only drop when there is something
  # to drop.
  decoded <- if (length(drop_loc) > 0L) url_raw[-drop_loc] else url_raw

  stringi::stri_conv(decoded, from = "UTF8", to = "BIG5")
}

# Article page to scrape. An alternative article is kept below for reference.
# url <- "http://www.jimmyfans.com/3/101658/%E6%84%9B%E6%83%85%E7%9A%84%E4%BA%94%E9%9D%A2%E9%8F%A1%E5%AD%90.html"
url <- "http://www.jimmyfans.com/3/112612/%E4%B8%80%E8%88%AC%E4%BA%BA%E7%9C%8B%E4%B8%8D%E6%87%82%E7%9C%8B%E6%87%82%E7%9A%84%E7%B5%95%E5%B0%8D%E4%B8%8D%E6%98%AF%E4%B8%80%E8%88%AC%E4%BA%BA.html"

# Output file name: the decoded URL text between the six-digit path segment
# and the ".html" suffix.
outputName <- URLdecodeToBIG5(url) %>>% str_extract("\\d{6}/.*\\.html") %>>%
  str_replace_all("\\d{6}/", "") %>>% str_replace_all("\\.html", "")

# Fetch and parse the page. stop_for_status aborts on an HTTP error so that an
# error page is never parsed and saved as if it were the article.
page <- GET(url) %>>% stop_for_status %>>% content(encoding = "UTF8")

# Text of every child node of the article body. xml_text is vectorized over
# the node set and always returns a character vector, even when nothing matches.
pieces <- page %>>%
  xml_find_all("//div[@class='contentBox']/div[@class='div_object_desc']") %>>%
  xml_contents %>>% xml_text

# Keep pieces longer than 3 characters, excluding the "Advertisement" label.
pieces <- pieces[str_length(pieces) > 3 & pieces != "Advertisement"]

# Convert to BIG5, strip tabs and newlines, and write one piece per line.
pieces %>>% stri_conv(from = "UTF8", to = "BIG5") %>>%
  str_replace_all("\t|\n", "") %>>%
  write(str_c(outputName, ".txt"))