celestialgod

typhoon info crawler

Oct 19th, 2018
176
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 1.18 KB | None | 0 0
  1. library(httr)
  2. library(xml2)
  3. library(rvest)
  4. library(pipeR)
  5. library(data.table)
  6. library(stringr)
  7. library(lubridate)
  8.  
  # Crawl the JAXA/EORC typhoon database search page, reshape the scraped
  # label/value pairs into one row per record, derive start/end timestamps
  # from the "Period" field, and save the table to "typhoonInfoDT.rds".
  #
  # NOTE(review): the query string carries sy=1997, sm=12, ey=2018, em=09 --
  # presumably the start/end year and month of the search window; confirm
  # against the site before changing them.
  url <- "https://sharaku.eorc.jaxa.jp/cgi-bin/typ_db/typ_db.cgi?lang=e&mode=search&GPM=ON&GW1AM2=ON&TRMM=ON&P1AME=ON&A2AMS=ON&area=&sy=1997&sm=12&ey=2018&em=09&tname="

  # Fail fast on an HTTP error instead of silently scraping an error page.
  typhoonInfoWeb <- GET(url) %>>%
    stop_for_status %>>%
    content

  # Field labels (<th class="c1">) and field values (<td class="c2">), both in
  # document order; they are paired positionally further down.
  cols <- typhoonInfoWeb %>>%
    xml_find_all("//th[@class='c1']") %>>%
    xml_text
  values <- typhoonInfoWeb %>>%
    xml_find_all("//td[@class='c2']") %>>%
    xml_text %>>%
    str_trim

  # Headings inside the result table. The leading "." keeps the second query
  # relative to the selected table; a bare "//h2" would restart from the
  # document root and return every <h2> on the page.
  typhoneNames <- typhoonInfoWeb %>>%
    xml_find_all("//table[@id='searchresult']") %>>%
    xml_find_all(".//h2") %>>%
    xml_text %>>%
    str_replace_all("^[^A-Za-z ]", "")

  # Guard against an empty or structurally unexpected page before reshaping.
  if (length(cols) == 0L) {
    stop("No typhoon records found in the search result page.", call. = FALSE)
  }
  if (length(cols) != length(values)) {
    stop(
      "Scraped ", length(cols), " field labels but ", length(values),
      " field values; the page layout may have changed.",
      call. = FALSE
    )
  }

  # A new record starts each time the first label reappears, so the running
  # count of that label is the row index of every label/value pair.
  colNames <- unique(cols)
  recordId <- cumsum(cols == cols[1])

  # Size the matrix from the record index itself (not from the heading count)
  # so the row dimension always matches the indices written below.
  typhoonMat <- matrix(
    NA_character_,
    nrow = max(recordId),
    ncol = length(colNames)
  )
  typhoonMat[cbind(recordId, match(cols, colNames))] <- values

  # Column names are the labels with trailing spaces/colons stripped.
  typhoonDT <- data.table(typhoonMat) %>>%
    setnames(colNames %>>% str_replace_all("[ :]+$", ""))

  # The first and last "Mon DD,YYYY" tokens of Period become start_dt/end_dt.
  typhoonDT[, `:=`(
    start_dt = parse_date_time(
      str_extract(Period, "^[A-Za-z]{3} \\d{2},\\d{4}"), "mdy"
    ),
    end_dt = parse_date_time(
      str_extract(Period, "[A-Za-z]{3} \\d{2},\\d{4}$"), "mdy"
    )
  )]

  saveRDS(typhoonDT, "typhoonInfoDT.rds")
Advertisement
Add Comment
Please, Sign In to add comment