Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(rvest)
- library(stringr)
- library(tidyr)
- library(data.table)
- library(plyr)
- library(xml2)
- library(selectr)
- library(tibble)
- library(purrr)
- library(datapasta)
- library(jsonlite)
- library(countrycode)
- library(httr)
- library(stringi)
- library(tidyverse)
- library(dplyr)
## 00. Load the IFPRI Bangladesh datasets page
BIHS <- read_html("http://bangladesh.ifpri.info/resources/datasets/")

## 01. Extract the href attribute of every node matched by the link selector
BIHS_urls <- html_attr(
  html_nodes(BIHS, ".site-title a, .thetitle"),
  "href"
)

## 02. Extract the text of the title nodes
BIHS_title <- html_text(html_nodes(BIHS, "h2, .thetitle"))

## 03. Extract the text of the description nodes
BIHS_text <- html_text(html_nodes(BIHS, "h2 , .ifeb-col2"))

## 04. Constant "AreaName" value, used to merge this table with other
##     standardized tables later on
BIHS_AreaName <- "Bangladesh"

## 05. Bind all the variables column-wise into a single object.
##     NOTE(review): cbind() on character vectors yields a character matrix
##     and recycles shorter inputs; the three selectors above are not
##     guaranteed to match the same number of nodes -- confirm the lengths.
BIHS_df <- cbind(
  BIHS_urls,
  BIHS_title,
  BIHS_text,
  BIHS_AreaName
)
## 06. Setting word count

#' Count occurrences of a keyword in each row of a table
#'
#' The count is computed from `data` itself (the previous version ignored
#' `data` and always read the global `BIHS_df`). For a matrix or data frame
#' the matches found in all cells of a row are summed, so exactly one count
#' is produced per row; `NA` cells contribute zero. For a plain vector the
#' count is computed per element.
#'
#' @param data A character vector, matrix, or data frame to search.
#' @param keyword Pattern passed on to `str_count()`.
#' @return `data.frame(data, wcount)`: the input with an extra `wcount`
#'   column. If `data` already has a `wcount` column, `data.frame()` makes
#'   the new name unique (e.g. `wcount.1`).
getCount <- function(data, keyword) {
  if (is.null(dim(data))) {
    # Plain vector: one count per element.
    wcount <- str_count(as.character(data), keyword)
  } else {
    # as.matrix() gives a uniform character view of matrices and data frames.
    cells <- as.matrix(data)
    # str_count() works cell by cell (column-major), so restore the shape
    # before summing across each row.
    per_cell <- matrix(
      str_count(as.character(cells), keyword),
      nrow = nrow(cells),
      ncol = ncol(cells)
    )
    wcount <- as.integer(rowSums(per_cell, na.rm = TRUE))
  }
  data.frame(data, wcount)
}
## 07. Append keyword counts: first for "Household", then, on top of that
##     result, for "Undernourishment"
BIHS_Household <- getCount(data = BIHS_df, keyword = "Household")
BIHS_Unernourishment <- getCount(
  data = BIHS_Household,
  keyword = "Undernourishment"
)
# Assuming the data always arrives in the format below: one large body of
# text made of sections separated by a blank line, where each section starts
# with its initial letter followed by the country names for that letter.
dat <- "A Afghanistan Albania Algeria Andorra Angola Antigua and Barbuda Argentina
Armenia Australia Austria Azerbaijan

B Bahamas Bahrain Bangladesh Barbados Belarus Belgium Belize Benin Bhutan
Bolivia Bosnia and Herzegovina Botswana Brazil Brunei Darussalam Bulgaria
Burkina Faso Burundi"

# Break into one chunk per starting letter. The separator tolerates stray
# whitespace on the "blank" line, and empty chunks are dropped.
dat <- unlist(strsplit(dat, "\n[[:space:]]*\n"))
dat <- dat[nzchar(trimws(dat))]

# Turn each chunk into a character vector of country names.
# A new name is taken to begin wherever whitespace is followed by the
# section's own letter, so multi-word names such as "Antigua and Barbuda"
# stay intact. Limitation: a name that repeats the section letter at the
# start of a later word (e.g. "South Sudan" under S) would still be split.
dat_list <- lapply(dat, function(section) {
  # Collapse line breaks and runs of spaces so no entry keeps stray blanks.
  section <- trimws(gsub("[[:space:]]+", " ", section))
  current_letter <- substr(section, 1, 1)
  # Drop the one-letter section header; previously it survived as a bogus
  # first element ("AA", "BB").
  countries <- sub("^[[:alpha:]] ", "", section)
  # The lookahead keeps the initial letter attached to each name.
  split_at <- paste0(" (?=", current_letter, ")")
  trimws(unlist(strsplit(countries, split_at, perl = TRUE)))
})

print(dat_list)
# Expected content of dat_list:
# [[1]]  "Afghanistan", "Albania", "Algeria", "Andorra", "Angola",
#        "Antigua and Barbuda", "Argentina", "Armenia", "Australia",
#        "Austria", "Azerbaijan"
# [[2]]  "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus",
#        "Belgium", "Belize", "Benin", "Bhutan", "Bolivia",
#        "Bosnia and Herzegovina", "Botswana", "Brazil",
#        "Brunei Darussalam", "Bulgaria", "Burkina Faso", "Burundi"
Add Comment
Please, Sign In to add comment