Guest User

Untitled

a guest
Jan 13th, 2018
54
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.95 KB | None | 0 0
  1. library(rvest)
  2. library(stringr)
  3. library(tidyr)
  4. library(data.table)
  5. library(plyr)
  6. library(xml2)
  7. library(selectr)
  8. library(tibble)
  9. library(purrr)
  10. library(datapasta)
  11. library(jsonlite)
  12. library(countrycode)
  13. library(httr)
  14. library(stringi)
  15. library(tidyverse)
  16. library(dplyr)
  17.  
  18.  
  19.  
  20. ## 00. Pick the webpage
  21. BIHS <- read_html("http://bangladesh.ifpri.info/resources/datasets/")
  22.  
  23.  
  24. ## 01. Get the urls I am interested in
  25. BIHS_urls <- BIHS %>%
  26. html_nodes(".site-title a, .thetitle") %>%
  27. html_attr("href")
  28.  
  29.  
  30. ## 02. Get the title of the urls
  31. BIHS_title <- BIHS %>%
  32. html_nodes("h2, .thetitle") %>%
  33. html_text()
  34.  
  35. ## 03. Get the title of the urls
  36. BIHS_text <- BIHS %>%
  37. html_nodes("h2 , .ifeb-col2") %>%
  38. html_text()
  39.  
  40.  
  41. ## 04. Adding the variable "AreaName", to merge the dataframe with future standardized ones
  42. BIHS_AreaName <- "Bangladesh"
  43.  
  44.  
  45. ## 05. Merging all the variables in a single dataframe
  46. BIHS_df <- cbind(BIHS_urls, BIHS_title, BIHS_text, BIHS_AreaName)
  47.  
  48.  
  49. ## 06. Setting wordcount
  50. getCount <- function(data,keyword)
  51. {
  52. wcount <- str_count(BIHS_df, keyword)
  53. return(data.frame(data,wcount))
  54. }
  55.  
  56. BIHS_Household <- getCount(BIHS_df, 'Household')
  57. BIHS_Unernourishment <- getCount(BIHS_Household, 'Undernourishment')
  58.  
  59. # Assuming your data is always in the below format i.e. a large body of text
  60. dat <- "A Afghanistan Albania Algeria Andorra Angola Antigua and Barbuda Argentina
  61. Armenia Australia Austria Azerbaijan
  62.  
  63. B Bahamas Bahrain Bangladesh Barbados Belarus Belgium Belize Benin Bhutan
  64. Bolivia Bosnia and Herzegovina Botswana Brazil Brunei Darussalam Bulgaria
  65. Burkina Faso Burundi"
  66.  
  67. #Break into rough chunks based on the starting letter
  68. dat <- unlist(strsplit(dat, "\n\n"))
  69.  
  70. #Loop through the character vector to get the different countries
  71. dat_list <- list()
  72. for(i in 1:length(dat)){
  73. current_letter <- substr(dat[i], 1, 1)
  74. smaller_chunks <- unlist(strsplit(dat[i], paste("[[:space:]]",current_letter, sep = "")))
  75. smaller_chunks <- paste(current_letter, gsub("\n","",smaller_chunks), sep = "")
  76. dat_list[[i]] <- smaller_chunks
  77. }
  78.  
  79. > dat_list
  80. [[1]]
  81. [1] "AA" "Afghanistan" "Albania" "Algeria"
  82. [5] "Andorra" "Angola" "Antigua and Barbuda" "Argentina "
  83. [9] "Armenia" "Australia" "Austria" "Azerbaijan"
  84.  
  85. [[2]]
  86. [1] "BB" "Bahamas" "Bahrain"
  87. [4] "Bangladesh" "Barbados" "Belarus"
  88. [7] "Belgium" "Belize" "Benin"
  89. [10] "Bhutan " "Bolivia" "Bosnia and Herzegovina"
  90. [13] "Botswana" "Brazil" "Brunei Darussalam"
  91. [16] "Bulgaria " "Burkina Faso" "Burundi"
Add Comment
Please, Sign In to add comment