# Untitled

library(rvest)
library(stringr)
library(tidyr)
library(data.table)
library(plyr)
library(xml2)
library(selectr)
library(tibble)
library(purrr)
library(datapasta)
library(jsonlite)
library(countrycode)
library(httr)
library(stringi)
library(tidyverse)
library(dplyr)


## 00. Download and parse the IFPRI Bangladesh dataset listing page
BIHS <- read_html("http://bangladesh.ifpri.info/resources/datasets/")


## 01. Collect the href attribute of every node matched by the link selector
BIHS_urls <- html_attr(
  html_nodes(BIHS, ".site-title a, .thetitle"),
  "href"
)


## 02. Collect the text of the title nodes
BIHS_title <- html_text(html_nodes(BIHS, "h2, .thetitle"))

## 03. Collect the text of the description nodes
BIHS_text <- html_text(html_nodes(BIHS, "h2 , .ifeb-col2"))


## 04. Constant "AreaName" value, so this table can later be merged with
##     other standardized data frames
BIHS_AreaName <- "Bangladesh"


## 05. Column-bind everything into a single object.
##     cbind() on character vectors yields a character matrix, not a
##     data.frame.
##     NOTE(review): the three selectors may match different numbers of nodes;
##     cbind() recycles shorter inputs, so confirm the lengths agree.
BIHS_df <- cbind(BIHS_urls, BIHS_title, BIHS_text, BIHS_AreaName)


## 06. Setting wordcount
# Count how often `keyword` occurs in each row of `data`.
#
# Arguments:
#   data    - a data.frame, matrix or atomic vector of text; every column is
#             searched.
#   keyword - pattern handed to stringr::str_count() (a regular expression by
#             default).
#
# Returns:
#   data.frame(data, wcount), where `wcount` is an integer vector holding one
#   total per row of `data`. NA cells contribute zero to the total. If `data`
#   already has a `wcount` column, data.frame() renames the new one to keep
#   column names unique (e.g. `wcount.1`).
getCount <- function(data, keyword) {
  # Work on the argument that was passed in, not on a global object.
  cells <- as.matrix(data)

  # str_count() returns one count per cell (column-major order); fold those
  # back into the shape of `data` so they can be summed row by row.
  hits <- str_count(as.vector(cells), keyword)
  hits <- matrix(hits, nrow = nrow(cells), ncol = ncol(cells))

  wcount <- as.integer(rowSums(hits, na.rm = TRUE))
  data.frame(data, wcount)
}

      # Add one keyword-count column per search term; the second call builds on
      # the result of the first.
      # NOTE(review): `BIHS_Unernourishment` looks like a misspelling of
      # "Undernourishment"; the name is kept because other code may refer to it.
      BIHS_Household <- BIHS_df %>% getCount("Household")
      BIHS_Unernourishment <- BIHS_Household %>% getCount("Undernourishment")

# Assumes the data always arrives in the format below: one large body of text
# in which each group starts with its heading letter and groups are separated
# by a blank line.
dat <- "A Afghanistan Albania Algeria Andorra Angola Antigua and Barbuda Argentina
  Armenia Australia Austria Azerbaijan

B Bahamas Bahrain Bangladesh Barbados Belarus Belgium Belize Benin Bhutan
Bolivia Bosnia and Herzegovina Botswana Brazil Brunei Darussalam Bulgaria
Burkina Faso Burundi"

# Break into rough chunks, one per starting letter
dat <- unlist(strsplit(dat, "\n\n"))

# Split every chunk into its countries. A new country is assumed to start
# wherever whitespace is followed by the chunk's heading letter, so multi-word
# names survive as long as no later word starts with that same capital letter
# (a name such as "Trinidad and Tobago" would still be split in two).
dat_list <- lapply(dat, function(chunk) {
  chunk <- trimws(chunk)
  current_letter <- substr(chunk, 1, 1)

  pieces <- strsplit(chunk, paste0("[[:space:]]", current_letter))[[1]]

  # Line breaks and indentation in the source text are layout only: collapse
  # them to single spaces and strip what is left at the edges.
  pieces <- trimws(gsub("[[:space:]]+", " ", pieces))

  # The split consumed the leading letter of every piece except the first.
  pieces[-1] <- paste0(current_letter, pieces[-1])

  # Drop the bare heading letter ("A", "B", ...) so only countries remain.
  pieces[pieces != current_letter]
})

# Expected result:
#
# > dat_list
# [[1]]
#  [1] "Afghanistan"         "Albania"             "Algeria"
#  [4] "Andorra"             "Angola"              "Antigua and Barbuda"
#  [7] "Argentina"           "Armenia"             "Australia"
# [10] "Austria"             "Azerbaijan"
#
# [[2]]
#  [1] "Bahamas"                "Bahrain"                "Bangladesh"
#  [4] "Barbados"               "Belarus"                "Belgium"
#  [7] "Belize"                 "Benin"                  "Bhutan"
# [10] "Bolivia"                "Bosnia and Herzegovina" "Botswana"
# [13] "Brazil"                 "Brunei Darussalam"      "Bulgaria"
# [16] "Burkina Faso"           "Burundi"