Guest User

Untitled

a guest
Apr 22nd, 2018
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.69 KB | None | 0 0
  1. ############################################################################
  2. #Mel Schickel
  3. #15-04-2018
  4. #
  5. #This script merges the IMF corpus into a single text and adds it to the EUL corpus,
  6. #then applies tf.idf to the combined corpus to find the unique IMF terms. It creates:
  7. #1) "IMF_tfidf_2009.txt", a dataframe of tf.idf weighted terms per text
  8. #
  9. #platform x86_64-w64-mingw32
  10. #arch x86_64
  11. #os mingw32
  12. #system x86_64, mingw32
  13. #status
  14. #major 3
  15. #minor 4.3
  16. #year 2017
  17. #month 11
  18. #day 30
  19. #svn rev 73796
  20. #language R
  21. #version.string R version 3.4.3 (2017-11-30)
  22. #nickname Kite-Eating Tree
  23. ############################################################################
  24.  
  25.  
  26. rm(list=ls())
  27. library(stringr)
  28. library(quanteda)
  29. library(readtext)
  30. library(readr)
  31. require(lubridate)
  32.  
  33.  
  34. #Project root: the part of the working directory before the first "/Mel"
  35. workingdir <- str_split_fixed(getwd(), "/Mel", 2)[, 1]
  36. load(paste0(workingdir, "/Dataverse/super.dtm.RData"))
  37. load(paste0(workingdir, "/Dataverse/supercorpus.RData"))
  38. #Build the base corpus and list the distinct country labels it holds
  39. base_corpus <- corpus(supercorpus)
  40. levels(as.factor(base_corpus$documents$country))
  41.  
  42. #Parse the day-month-year date strings into Date objects, stored as int_date
  43. base_corpus$documents$int_date <- as.Date(base_corpus$documents$date,
  44.                                           format = "%d-%m-%Y")
  45. head(base_corpus$documents$int_date)
  46.  
  47.  
  48. #Control group, EUP side: documents whose country is "European Parliament" and whose int_date is before 1 October 2009
  49. EUP_control_corpus<-
  50. corpus_subset(base_corpus, country == "European Parliament" & int_date < as.Date( '01-10-2009', format = "%d-%m-%Y"))
  51. head(docnames(EUP_control_corpus), 50)
  52. ndoc(EUP_control_corpus)
  53. #The two calls above only inspect the selection (first 50 document names, document count)
  54.  
  55. #Create a lower-cased, unstemmed dfm out of the EUP corpus; ndoc() is a sanity check on the row count
  56. EUP_control_dfm <-
  57. dfm(EUP_control_corpus, tolower = TRUE, stem = FALSE)
  58. ndoc(EUP_control_dfm)
  59.  
  60.  
  61. #Control group, IMF side: documents whose country is "International Monetary Fund" and whose int_date is before 1 October 2009
  62. IMF_control_corpus<-
  63. corpus_subset(base_corpus, country == "International Monetary Fund" & int_date < as.Date( '01-10-2009', format = "%d-%m-%Y"))
  64. head(docnames(IMF_control_corpus), 50)
  65. ndoc(IMF_control_corpus)
  66.  
  67.  
  68. #Paste all IMF control documents into one document, separated by single spaces.
  69. #paste(collapse = ) joins however many texts the subset holds, so no document
  70. #count is hard-coded (a fixed 2:337 loop appends "NA" when there are fewer
  71. #texts and silently drops any beyond the 337th).
  72. temp <- paste(IMF_control_corpus$documents$texts, collapse = " ")
  73.  
  74.  
  75.  
  76. #Wrap the merged IMF text in a one-document corpus and set its docvars by hand; date and length are left as empty strings
  77. single_IMF_corpus <- corpus(temp)
  78. single_IMF_corpus$documents$title <- "IMF Text"
  79. single_IMF_corpus$documents$date <- ""
  80. single_IMF_corpus$documents$country <- "International Monetary Fund"
  81. single_IMF_corpus$documents$speaker <- "IMF"
  82. single_IMF_corpus$documents$length <- ""
  83. single_IMF_corpus$documents$language <- "en"
  84. single_IMF_corpus$documents$language2 <- "english"
  85. single_IMF_corpus$documents$int_date <- as.Date("", format = "%d-%m-%Y") #an empty string parses to NA, so int_date is a missing Date
  86.  
  87.  
  88. #Append the one-document IMF corpus to the EUP control corpus (per the original note, adding the raw text directly did not work); the two ndoc() calls let the combined and EUP-only counts be compared
  89. total_control_corpus<-
  90. c(EUP_control_corpus, single_IMF_corpus)
  91. ndoc(total_control_corpus)
  92. ndoc(EUP_control_corpus)
  93.  
  94.  
  95. #Build a document-feature matrix from the combined EUP + IMF control corpus
  96. control_dfm <- dfm(total_control_corpus)
  97. #Weight the counts by tf-idf using the dfm_tfidf() defaults
  98. tfidf <- dfm_tfidf(control_dfm)
  99. head(tfidf[1,10]) #NOTE(review): [1,10] picks only document 1 / feature 10; a peek at the first ten features would be [1, 1:10] -- confirm intent
  100.  
  101.  
  102.  
  103. write.table(tfidf, (paste0(workingdir,"/Mel/Results/tfidf_2009.txt")), row.names=FALSE, sep="\t")
  104. #Is tab delimited
  105. total_tfidf <- read.csv(paste0(workingdir,"/Mel/Results/tfidf_2009.txt"), sep="\t")
  106. #The merged IMF text was appended last by c(), so take the final row rather than a hard-coded
  107. #index (assumes row order follows document order); drop = FALSE keeps a one-row data frame
  108. IMFN_tfidf_2009 <- total_tfidf[nrow(total_tfidf), , drop = FALSE]
  109. #NOTE(review): read.csv already used the header line for column names, so row 1 holds the first document's weights, not the terms -- confirm the merge below is intended
  110. terms_tfidf_2009 <- total_tfidf[1, , drop = FALSE]
  111. IMF_tfidf_2009 <- merge(terms_tfidf_2009, IMFN_tfidf_2009)
  112. head(IMF_tfidf_2009)
  113. write.table(IMFN_tfidf_2009, (paste0(workingdir,"/Mel/Results/IMF_tfidf_2009.txt")), row.names=FALSE, sep="\t")
Add Comment
Please, Sign In to add comment