Advertisement
sabinaarndt

country-regex

May 25th, 2012
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 0.96 KB | None | 0 0
  # Derive one country label per author address from the C1 (address) field
  # of `rec`. `rec` must already exist in the calling environment; its C1
  # column is expected to hold ";"-separated addresses, each optionally
  # prefixed by an author list in square brackets.
  #
  # Results:
  #   numbers - number of addresses found in each record (one entry per record)
  #   C1s     - flat character vector with one label per address, in record
  #             order: the text after the last comma, with US addresses
  #             collapsed to "USA", UK nations to "UNITED KINGDOM" and
  #             Humboldt/Germany addresses to "HUMBOLDT"
  C1 <- rec$C1
  RP <- rec$RP
  PY <- rec$PY
  WC <- rec$WC
  UT <- rec$UT

  nr.recs <- length(PY)

  # Every step below is vectorised over all records, so it runs exactly once.
  # An earlier version repeated the whole pipeline inside
  # `for (rn in 1:nr.recs)` without ever using `rn`; from the second pass on
  # that overwrote `numbers` with counts taken from the already-flattened
  # vector and re-processed the extracted labels.

  # Remove the "[author; author] " prefix that precedes each address.
  C1s <- gsub("\\[.*?\\] ", "", C1)
  C1s <- toupper(C1s)

  # Split each record into its individual addresses.
  C1s <- strsplit(C1s, ";")

  # Addresses per record; vapply() keeps this an integer vector even when
  # there are no records at all.
  numbers <- vapply(C1s, length, integer(1))

  C1s <- unlist(C1s)

  # Make sure every comma is followed by a space so the patterns below match.
  C1s <- gsub(",", ", ", C1s)

  # Replace 'HUMBOLDT...GERMANY' with 'HUMBOLDT'.
  C1s <- gsub("(.*)(HUMBOLDT)(.*)(GERMANY$)", "\\2", C1s)

  # Map the UK nations onto 'UNITED KINGDOM'.
  C1s <- gsub("ENGLAND|SCOTLAND|WALES|NORTH IRELAND", "UNITED KINGDOM", C1s)

  # US addresses: a two-letter state code followed by a five-digit zip ...
  C1s <- gsub(" [A-Z]{2} [0-9]{5}", " USA", C1s)
  # ... or a bare two-letter state code at the end of the address.
  C1s <- gsub(", [A-Z]{2}$", ", USA", C1s)
  # Collapse everything up to the last " USA" (there may be no comma before
  # it) into the plain label "USA".
  C1s <- gsub("(.*) USA", "USA", C1s)

  # The country name is whatever follows the last comma. sub() is already
  # vectorised, so no sapply() wrapper is needed.
  C1s <- sub("^.*, ([A-Za-z ]*)$", "\\1", C1s)
  C1s <- unname(C1s)

  # Strip leading/trailing spaces (all of them, not just a single one).
  C1s <- gsub("^ +| +$", "", C1s)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement