Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(httr)
- library(xml2)
- library(pipeR)
- library(purrr)
- library(stringi)
- library(stringr)
# Decode a percent-encoded URL and re-encode the resulting text as BIG5.
#
# Every well-formed escape ("%" followed by two hex digits, upper or lower
# case) is replaced by the single byte it denotes. The decoded byte sequence
# is then interpreted as UTF-8 and converted to BIG5 by stringi::stri_conv().
# A "%" that is not followed by two hex digits is kept as a literal character
# instead of being decoded into garbage.
#
# Args:
#   url: a single character string containing percent-encoded UTF-8 bytes.
#
# Returns:
#   A length-one character vector with the decoded text encoded as BIG5.
#   An empty input string yields "".
URLdecodeToBIG5 <- function(url) {
  url_bytes <- charToRaw(url)
  if (length(url_bytes) == 0L) {
    return("")
  }

  # Byte offsets of each "%XX" escape. useBytes = TRUE keeps the offsets
  # aligned with url_bytes even if the input holds multibyte characters.
  esc_start <- gregexpr("%[0-9A-Fa-f]{2}", url, useBytes = TRUE)[[1L]]
  # gregexpr() reports "no match" as -1 (and NA for NA input); drop both.
  esc_start <- as.integer(esc_start[!is.na(esc_start) & esc_start > 0L])

  if (length(esc_start) > 0L) {
    hex_pairs <- paste0(
      rawToChar(url_bytes[esc_start + 1L], multiple = TRUE),
      rawToChar(url_bytes[esc_start + 2L], multiple = TRUE)
    )
    # Overwrite each "%" with the decoded byte, then drop the two hex digits.
    url_bytes[esc_start] <- as.raw(strtoi(hex_pairs, base = 16L))
    url_bytes <- url_bytes[-c(esc_start + 1L, esc_start + 2L)]
  }

  stringi::stri_conv(url_bytes, from = "UTF8", to = "BIG5")
}
# Page to scrape. The path is expected to contain
# "<6-digit id>/<percent-encoded title>.html".
# Alternative example page:
# url <- "http://www.jimmyfans.com/3/101658/%E6%84%9B%E6%83%85%E7%9A%84%E4%BA%94%E9%9D%A2%E9%8F%A1%E5%AD%90.html"
url <- "http://www.jimmyfans.com/3/112612/%E4%B8%80%E8%88%AC%E4%BA%BA%E7%9C%8B%E4%B8%8D%E6%87%82%E7%9C%8B%E6%87%82%E7%9A%84%E7%B5%95%E5%B0%8D%E4%B8%8D%E6%98%AF%E4%B8%80%E8%88%AC%E4%BA%BA.html"

# Output file stem: the decoded page title, i.e. whatever sits between the
# 6-digit id and the ".html" suffix of the decoded URL.
outputName <- stringr::str_extract(URLdecodeToBIG5(url), "\\d{6}/.*\\.html")
if (is.na(outputName)) {
  # Fail with a clear message rather than letting write() choke on an NA path.
  stop("URL has no '<6-digit id>/<title>.html' component: ", url, call. = FALSE)
}
outputName <- stringr::str_replace_all(outputName, "\\d{6}/", "")
outputName <- stringr::str_replace_all(outputName, "\\.html", "")

# Download the page, pull the text out of the description block, and save it
# as "<outputName>.txt". Wrapped in local() so that the intermediates do not
# leak into the global environment; only `url` and `outputName` stay global.
local({
  response <- httr::GET(url)
  # Abort on 4xx/5xx instead of silently scraping an error page.
  httr::stop_for_status(response)
  page <- httr::content(response, encoding = "UTF8")

  desc_xpath <- "//div[@class='contentBox']/div[@class='div_object_desc']"
  nodes <- xml2::xml_contents(xml2::xml_find_all(page, desc_xpath))

  # xml_text() is vectorized over a nodeset and always returns a character
  # vector, including for an empty nodeset.
  texts <- xml2::xml_text(nodes)

  # Keep only fragments longer than 3 characters that are not the ad label.
  keep <- stringr::str_length(texts) > 3 & texts != "Advertisement"

  # Strip tabs/newlines while the text is still UTF-8, then convert to BIG5,
  # so the regex step never has to interpret BIG5 bytes.
  body_text <- stringr::str_replace_all(texts[keep], "\t|\n", "")
  body_text <- stringi::stri_conv(body_text, from = "UTF8", to = "BIG5")

  write(body_text, stringr::str_c(outputName, ".txt"))
})
Advertisement
Add Comment
Please, Sign In to add comment