Advertisement
Guest User

Untitled

a guest
May 10th, 2016
174
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 1.94 KB | None | 0 0
  1. library(jsonlite)
  2. library(dplyr)
  3.  
  # Build the JSON API URLs of the /g/ threads selected by getOPinfo().
  #
  # input: nothing (the /g/ catalog is downloaded from the 4chan API)
  # output: character vector of thread URLs, one per selected thread;
  #         character(0) when no thread is selected
  getOPthreads <- function() {
    catalog_url <- "http://a.4cdn.org/g/catalog.json"

    # Per the original notes the parsed catalog has two components: the page
    # numbers and, second, a list holding one data frame of OPs per page.
    catalog.data <- fromJSON(readLines(catalog_url, warn = FALSE))
    catOPs <- catalog.data[[2]]

    OPinfo <- getOPinfo(catOPs)

    # Every element of OPinfo is built as c(thread number(s), creation time(s)),
    # so its first half holds the thread numbers. Taking the first half (rather
    # than forcing the list into a two-row data frame) also works when an
    # element describes more than one thread or when the list is empty.
    threadNos <- as.numeric(unlist(lapply(OPinfo, function(op) {
      op[seq_len(length(op) %/% 2L)]
    })))

    # Creation times, if ever needed, are the second half of each element:
    # as.POSIXct(<times>, origin = "1970-01-01")

    # "%.0f" prints the full integer; paste0()/as.character() would turn a
    # round number such as 60000000 into "6e+07" and produce a broken URL.
    # sprintf() is vectorised and returns character(0) for empty input.
    sprintf("http://a.4cdn.org/g/thread/%.0f.json", threadNos)
  }
  33.  
  # Collect the thread number and creation time of every catalog OP whose
  # semantic URL contains "mlds".
  #
  # catOPs: list of catalog pages; each page is a data frame with (at least)
  #         the columns `no`, `time` and `semantic_url`
  # returns: unnamed list with one element per matching thread, each element
  #          being c(threadNo, createdAt); an empty list when nothing matches
  getOPinfo <- function(catOPs) {
    matches_on_page <- function(page) {
      # `[[` matches column names exactly, so `no` can never be confused with
      # a similarly named column the way `$` on a plain data frame could.
      isMLDSthere <- grepl("mlds", page[["semantic_url"]])

      # One c(number, time) pair per matching thread. Pairing element-wise
      # keeps two matches on the same page as two separate entries instead of
      # one c(no1, no2, time1, time2) vector.
      Map(
        function(threadNo, createdAt) c(threadNo, createdAt),
        page[["no"]][isMLDSthere],
        page[["time"]][isMLDSthere]
      )
    }

    per_page <- lapply(catOPs, matches_on_page)

    # Flatten the per-page lists into a single list of pairs; c(list(), ...)
    # guarantees a list even when there are no matches at all (unlist() would
    # return NULL in that case).
    c(list(), unlist(per_page, recursive = FALSE, use.names = FALSE))
  }
  51.  
  # Download one thread and flatten it into a tibble.
  #
  # The URL is hard-coded for now; replace this assignment with a call to
  # getOPthreads() to follow the live threads instead.
  # WARNING: respect the request rates given in https://github.com/4chan/4chan-API
  # This currently fetches a single thread only.
  thread_url <- "http://a.4cdn.org/g/thread/54481707.json"
  thread.data <- thread_url %>%
    readLines(warn = FALSE) %>%  # warn takes a logical, not the string "F"
    fromJSON() %>%
    data.frame() %>%
    as_tibble()                  # tbl_df() is deprecated in current dplyr
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement