Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(jsonlite)
- library(dplyr)
#' Build the JSON API URLs of the /mlds/ threads currently in the /g/ catalog.
#'
#' Downloads the /g/ catalog, asks getOPinfo() for the OPs whose semantic URL
#' contains "mlds", and turns every matching thread number into the URL of
#' that thread's JSON document.
#'
#' @return A character vector of thread URLs, one per matching thread;
#'   `character(0)` when no matching thread is found.
getOPthreads <- function() {
  catalog_url <- "http://a.4cdn.org/g/catalog.json"

  # The catalog parses to a data frame with one row per catalog page; the
  # `threads` column is a list holding one data frame of OPs per page.
  catalog_data <- fromJSON(readLines(catalog_url, warn = FALSE))
  cat_ops <- catalog_data[["threads"]]

  # getOPinfo() returns one vector per page laid out as
  # c(thread numbers, creation times), so the first half of each vector holds
  # the thread numbers. Reading it this way also copes with several matching
  # threads on the same page (e.g. near the bump limit).
  op_info <- getOPinfo(cat_ops)
  thread_nos <- unlist(lapply(op_info, function(info) {
    info[seq_len(length(info) %/% 2L)]
  }))

  # paste0() would turn a zero-length number vector into one bogus URL.
  if (length(thread_nos) == 0L) {
    return(character(0))
  }

  # format(scientific = FALSE) keeps round thread numbers such as 60000000
  # from being rendered as "6e+07" in the URL.
  paste0(
    "http://a.4cdn.org/g/thread/",
    format(thread_nos, scientific = FALSE, trim = TRUE),
    ".json"
  )
}
#' Collect thread numbers and creation times of the /mlds/ OPs, page by page.
#'
#' @param catOPs A list of catalog pages. Each page is a data frame of OPs
#'   with the columns `no`, `time` and `semantic_url`.
#' @return An unnamed list with one element for every page that contains at
#'   least one OP whose `semantic_url` matches "mlds". Each element is the
#'   vector `c(threadNo, createdAt)`: all matching thread numbers of that page
#'   followed by their creation times. An empty list when nothing matches.
getOPinfo <- function(catOPs) {
  per_page <- lapply(catOPs, function(page) {
    is_mlds <- grepl("mlds", page$semantic_url)
    if (!any(is_mlds)) {
      return(NULL) # no /mlds/ thread on this page
    }
    # Creation time is not used by callers right now but is kept in the
    # result so the return shape stays the same.
    c(page$no[is_mlds], page$time[is_mlds])
  })

  # Drop the pages without a match; names are removed so the result is a
  # plain positional list.
  unname(Filter(Negate(is.null), per_page))
}
# Download one thread and flatten it into a tibble.
# The URL is hard-coded for now; replace this assignment with a call to
# getOPthreads() to follow the live thread(s).
# WARNING: respect the request rates given in https://github.com/4chan/4chan-API
# This currently fetches a single document only.
thread_url <- "http://a.4cdn.org/g/thread/54481707.json"
thread.data <- thread_url %>%
  readLines(warn = FALSE) %>%
  fromJSON() %>%
  data.frame() %>%
  as_tibble()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement