typhoon info crawler 2
celestialgod
Oct 19th, 2018
# Crawls the JAXA/EORC real-time typhoon page for currently active storms and
# pulls the matching forecast/track/imagery tables from the RAMMB (CIRA)
# tc_realtime site.
library(httr)
library(xml2)
library(rvest)
library(pipeR)
library(data.table)
library(stringr)
library(lubridate)

getActiveTyphoonFunc <- function() {
  # scrape the list of currently active typhoons from the JAXA/EORC page
  url <- "https://sharaku.eorc.jaxa.jp/cgi-bin/typhoon_rt/main.cgi?lang=en"
  tmp <- GET(url) %>>% content
  areas <- c("asiaarea", "americaarea", "oceaniaarea")
  activeTyphoonDT <- lapply(areas, function(s){
    # each area block lists its active storms in a <ul class='act_typ'>
    t <- xml_find_first(tmp, sprintf("//div[@id='%s']//ul[@class='act_typ']", s))
    if (xml_length(t) == 0L) {
      return(NULL)
    } else {
      # the <a> nodes carry "NUMBER (NAME)" as text and a relative href
      urls <- t %>>% xml_children %>>% xml_children
      data.table(area = s,
                 typhoonNum = str_extract(xml_text(urls), "[^(]+"),
                 typhoonName = str_replace(xml_text(urls), "[^(]+\\(([A-Z\\-]+)\\)", "\\1"),
                 # turn the relative "./..." href into an absolute URL
                 url = xml_attr(urls, "href") %>>%
                   str_replace("\\./", str_replace(url, "[^/]+$", "")))
    }
  }) %>>% rbindlist
  return(activeTyphoonDT)
}

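# Note (added): getActiveTyphoonFunc() yields one row per active storm with
# columns area, typhoonNum, typhoonName and url (the storm's detail page).
# If none of the three areas lists an active storm, the result is an empty
# data.table, so it is worth checking nrow() before using it downstream.
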
getActiveTyphoonDetailFunc <- function(url) {
  # parse every <table class='typinfoS'> block on a storm's detail page;
  # per the variable names and regexes: row 1 satellite, row 2 observation
  # time, row 3 two latitude values (N/S), row 4 two longitude values (E/W),
  # row 7 central pressure, row 8 maximum wind; r1..r4 keep the first and
  # last value of each lat/lon pair
  outDT <- GET(url) %>>% content %>>%
    xml_find_all("//table[@class='typinfoS']") %>>%
    html_table(header = FALSE) %>>%
    lapply(function(df){
      data.table(satellite = df[1, 2], obs_date = parse_date_time(df[2, 2], "ymd HM"),
                 r1 = str_extract(df[3, 2], "^\\d+\\.\\d+[NS]"), r2 = str_extract(df[3, 2], "\\d+\\.\\d+[NS]$"),
                 r3 = str_extract(df[4, 2], "^\\d+\\.\\d+[EW]"), r4 = str_extract(df[4, 2], "\\d+\\.\\d+[EW]$"),
                 pressure = df[7, 2], winds = df[8, 2])
    }) %>>% rbindlist
  return(outDT)
}

# example usage: list the active typhoons, then fetch details for the first two
activeTyphoonDT <- getActiveTyphoonFunc()
getActiveTyphoonDetailFunc(activeTyphoonDT$url[1])
getActiveTyphoonDetailFunc(activeTyphoonDT$url[2])

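# A hedged sketch (not in the original paste): fetch the detail tables for
# every active storm in one pass instead of indexing rows by hand; assumes
# typhoonNum identifies each row uniquely.
if (nrow(activeTyphoonDT) > 0L) {
  allDetailDT <- activeTyphoonDT[ , getActiveTyphoonDetailFunc(url), by = typhoonNum]
}
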
getNasaDataFunc <- function(year, typhoonNames) {
  # despite the name, this pulls the season/storm pages from the RAMMB (CIRA)
  # tc_realtime site: forecast track, track history and the 4 km IR image archive
  url <- "http://rammb.cira.colostate.edu/products/tc_realtime/season.asp?storm_season=%i"
  # find the storm link whose text contains the requested name and make it absolute
  infoUrl <- GET(sprintf(url, year)) %>>%
    content %>>%
    xml_find_all(sprintf("//a[contains(.,'%s')]", typhoonNames)) %>>%
    xml_attr("href") %>>%
    (str_c(str_replace(url, "[^/]+$", ""), .))
  tmp <- GET(infoUrl) %>>% content
  # the storm page carries two tables: the latest forecast and the track history
  outList <- html_table(tmp, header = TRUE) %>>%
    setNames(c("forecast_track", "track_history")) %>>%
    lapply(as.data.table)
  # base time of the latest forecast, taken from the "Latest Forecast" heading
  forecastTime <- xml_find_all(tmp, "//h4[contains(.,'Latest Forecast')]") %>>%
    xml_text %>>% str_extract("\\d{12}") %>>% parse_date_time("ymdHM")
  # POSIXct arithmetic is in seconds, so forecast hours need * 3600 (not * 60)
  outList$forecast_track[ , forecastTime := forecastTime + `Forecast Hour` * 3600]

  # archive page of 4 km IR images for this storm identifier
  imageUrl <- "http://rammb.cira.colostate.edu/products/tc_realtime/archive.asp?product=4kmirimg&storm_identifier=%s" %>>%
    sprintf(str_extract(infoUrl, "[^=]+$"))
  outList$imageDT <- GET(imageUrl) %>>% content %>>% xml_find_all("//table") %>>% {
    header <- xml_find_all(., "//tr/th") %>>% sapply(xml_text)
    tmp2 <- xml_find_all(., "//tr/td")
    # each observation-time cell sits four cells before its "Image" link cell
    idx <- which(sapply(tmp2, xml_text) == "Image")
    data.table(urlTime = tmp2[idx - 4L] %>>% sapply(xml_text) %>>% parse_date_time("ymd HM"),
               imageUrls = tmp2[idx] %>>% xml_children %>>% sapply(xml_attr, attr = "href")) %>>%
      setnames(c(header[1], tail(header, 1)))
  }
  return(outList)
}

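# A minimal sketch (added, not in the original script) of how the image links
# collected in imageDT could be saved locally. The base URL prefix and the
# assumption that the hrefs are site-relative may need adjusting to whatever
# the archive page actually serves; the destination directory is arbitrary.
downloadTyphoonImagesFunc <- function(imageDT, destDir = "typhoon_img") {
  dir.create(destDir, showWarnings = FALSE)
  links <- imageDT[[2L]]  # second column holds the image hrefs
  for (lnk in links) {
    fullUrl <- ifelse(grepl("^http", lnk), lnk,
                      paste0("http://rammb.cira.colostate.edu", lnk))
    # skip files that fail to download instead of stopping the loop
    try(download.file(fullUrl, file.path(destDir, basename(lnk)), mode = "wb"),
        silent = TRUE)
  }
  invisible(destDir)
}
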
x <- getNasaDataFunc(2018, activeTyphoonDT$typhoonName[1])
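# Quick look at the result (added as a usage sketch): getNasaDataFunc returns a
# list with forecast_track, track_history and imageDT; column names come
# straight from the RAMMB page headers, so inspect them before relying on them.
str(x, max.level = 1)
head(x$forecast_track)
head(x$imageDT)
# downloadTyphoonImagesFunc(x$imageDT)  # optional: pull the archived IR images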