Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ############################################################################
- #Mel Schickel
- #15-04-2018
- #
- #This script merges the IMF corpus into a single text and adds it to the EUP corpus,
- #then it applies tf.idf to the corpus to find the unique IMF terms. It creates:
- #1) "IMF_tfidf_2009.txt", a dataframe of tf.idf weighted terms per text
- #
- #platform x86_64-w64-mingw32
- #arch x86_64
- #os mingw32
- #system x86_64, mingw32
- #status
- #major 3
- #minor 4.3
- #year 2017
- #month 11
- #day 30
- #svn rev 73796
- #language R
- #version.string R version 3.4.3 (2017-11-30)
- #nickname Kite-Eating Tree
- ############################################################################
- # ---- Setup: packages, input data, parsed dates ----
- # NOTE: the script used to start with rm(list = ls()). Wiping the caller's
- # workspace is an unwanted side effect; run the script in a fresh R session instead.
- library(stringr)
- library(quanteda)
- library(readtext)
- library(readr)
- # library() stops with an error when the package is missing, whereas require()
- # only returns FALSE and lets the script fail later in a less obvious place.
- library(lubridate)
- # Project root: the part of the working directory path before the first "/Mel".
- workingdir <- str_split_fixed(getwd(), "/Mel", 2)[, 1]
- # Load the saved objects; supercorpus.RData presumably provides `supercorpus`.
- load(file.path(workingdir, "Dataverse", "super.dtm.RData"))
- load(file.path(workingdir, "Dataverse", "supercorpus.RData"))
- base_corpus <- corpus(supercorpus)
- # Show the distinct values of the `country` document variable.
- levels(as.factor(base_corpus$documents$country))
- # Parse the "dd-mm-YYYY" date strings into Date objects so they can be compared.
- base_corpus$documents$int_date <-
-   as.Date(base_corpus$documents$date, format = "%d-%m-%Y")
- # Preview the parsed dates (NA marks a string that did not match the format).
- head(base_corpus$documents$int_date)
- # Control group, part 1: pre-crisis European Parliament speeches, i.e. those
- # dated strictly before 1 October 2009.
- eup_cutoff_date <- as.Date("2009-10-01")
- EUP_control_corpus <- corpus_subset(
-   base_corpus,
-   country == "European Parliament" & int_date < eup_cutoff_date
- )
- # Inspect the first 50 document names and the number of selected documents.
- head(docnames(EUP_control_corpus), 50)
- ndoc(EUP_control_corpus)
- # Document-feature matrix of the EUP control speeches (lower-cased, unstemmed).
- EUP_control_dfm <- dfm(EUP_control_corpus, stem = FALSE, tolower = TRUE)
- ndoc(EUP_control_dfm)
- # Control group, part 2: pre-crisis IMF speeches (dated before 1 October 2009).
- IMF_control_corpus <- corpus_subset(
-   base_corpus,
-   country == "International Monetary Fund" &
-     int_date < as.Date("01-10-2009", format = "%d-%m-%Y")
- )
- head(docnames(IMF_control_corpus), 50)
- ndoc(IMF_control_corpus)
- # Collapse all selected IMF texts into one space-separated string. Pasting the
- # whole vector at once replaces a loop over the hard-coded range 2:337, which
- # appended literal "NA" strings when fewer documents were selected and silently
- # dropped texts when more were selected.
- stopifnot(ndoc(IMF_control_corpus) > 0)
- temp <- paste(IMF_control_corpus$documents$texts, collapse = " ")
- # Wrap the combined text in a one-document corpus and give it the same document
- # variables as the base corpus so the two corpora can be concatenated.
- single_IMF_corpus <- corpus(temp)
- single_IMF_corpus$documents$title <- "IMF Text"
- single_IMF_corpus$documents$date <- ""
- single_IMF_corpus$documents$country <- "International Monetary Fund"
- single_IMF_corpus$documents$speaker <- "IMF"
- single_IMF_corpus$documents$length <- ""
- single_IMF_corpus$documents$language <- "en"
- single_IMF_corpus$documents$language2 <- "english"
- # The merged text has no single date; parsing "" yields an NA of class Date.
- single_IMF_corpus$documents$int_date <- as.Date("", format = "%d-%m-%Y")
- # Append the single-document IMF corpus to the EUP control corpus.
- total_control_corpus <- c(EUP_control_corpus, single_IMF_corpus)
- # Sanity check: the combined corpus should hold one document more than the EUP corpus.
- ndoc(total_control_corpus)
- ndoc(EUP_control_corpus)
- # Build the document-feature matrix of the combined corpus and weight it by tf-idf.
- control_dfm <- dfm(total_control_corpus)
- tfidf <- dfm_tfidf(control_dfm)
- head(tfidf[1, 10])
- # Write the weighted matrix as a tab-delimited file, then read it back as a
- # data frame with one row per document and one column per term.
- tfidf_path <- file.path(workingdir, "Mel", "Results", "tfidf_2009.txt")
- write.table(tfidf, tfidf_path, row.names = FALSE, sep = "\t")
- total_tfidf <- read.csv(tfidf_path, sep = "\t")
- # The IMF text was appended last, so it is the final row. Looking the row up
- # replaces the hard-coded index 852, which was only right for one specific
- # number of EUP documents.
- imf_row <- nrow(total_tfidf)
- IMFN_tfidf_2009 <- total_tfidf[imf_row, , drop = FALSE]
- # NOTE(review): read.csv() takes the first line of the file as column names, so
- # row 1 is the first document rather than a row of terms. Merging it with the IMF
- # row on all shared columns will usually return zero rows, and the result is only
- # printed, never written. Confirm whether this step is still needed.
- terms_tfidf_2009 <- total_tfidf[1, , drop = FALSE]
- IMF_tfidf_2009 <- merge(terms_tfidf_2009, IMFN_tfidf_2009)
- head(IMF_tfidf_2009)
- # Output: the tf-idf weights of the IMF document, with terms as column headers.
- write.table(
-   IMFN_tfidf_2009,
-   file.path(workingdir, "Mel", "Results", "IMF_tfidf_2009.txt"),
-   row.names = FALSE,
-   sep = "\t"
- )
Add Comment
Please, Sign In to add comment