Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- > df
- kwd1 kwd2 sim
- 1 a b 1
- 2 b a 1
- 3 c a 2
- 4 a c 2
- > df
- kwd1 kwd2 sim
- a b 1
- a c 2
- b c 0
# Sample input: one row per directed keyword pair with its similarity score.
# Columns are named directly in the constructor instead of being renamed
# afterwards.
df <- data.frame(
  kwd1 = c("a", "b", "c", "a"),
  kwd2 = c("b", "a", "a", "c"),
  sim = c(0.1, 0.1, 0.2, 0.2)
)
- > dput(df)
- structure(list(kwd1 = structure(c(1L, 2L, 3L, 1L), .Label = c("a",
- "b", "c"), class = "factor"), kwd2 = structure(c(2L, 1L, 1L,
- 3L), .Label = c("a", "b", "c"), class = "factor"), sim = c(1,
- 1, 2, 2)), .Names = c("kwd1", "kwd2", "sim"), row.names = c(NA,
- -4L), class = "data.frame")
- library(plyr)
# Collapse the directed pairs in `df` into one row per unordered keyword pair.
#
# Keywords are handled as character strings, so this works for arbitrary
# labels (not only single lowercase letters) and for factor columns on any
# R version.

# Every keyword that occurs in either column, in sorted order.
keys <- sort(unique(c(as.character(df$kwd1), as.character(df$kwd2))))

# Character-keyed copy of the known similarities so merge() compares like
# with like.
known <- data.frame(
  kwd1 = as.character(df$kwd1),
  kwd2 = as.character(df$kwd2),
  sim = df$sim,
  stringsAsFactors = FALSE
)

# All ordered combinations; combinations missing from `df` get sim = NA.
res <- merge(
  expand.grid(kwd1 = keys, kwd2 = keys, stringsAsFactors = FALSE),
  known,
  all.x = TRUE
)

# A keyword paired with itself is not a pair of interest.
res <- res[which(res$kwd1 != res$kwd2), ]

# Put each pair into canonical orientation (kwd1 sorts before kwd2) so that
# "b, a" and "a, b" become the same row.
in_order <- res$kwd1 < res$kwd2
lo <- ifelse(in_order, res$kwd1, res$kwd2)
hi <- ifelse(in_order, res$kwd2, res$kwd1)
res$kwd1 <- lo
res$kwd2 <- hi
rownames(res) <- NULL

# Within each pair list rows with a known similarity first, then keep one row
# per pair. Deduplicating on the keys (rather than on whole rows) prevents an
# NA row from surviving next to a known value when only one direction of the
# pair was present in `df`.
res <- res[order(res$kwd1, res$kwd2, is.na(res$sim)), ]
res1 <- res[!duplicated(res[c("kwd1", "kwd2")]), ]
- > res1
- kwd1 kwd2 sim
- 1 a b 0.1
- 2 a c 0.2
- 4 b c NA
# Collapse a table of directed keyword pairs into one row per unordered pair.
#
# Args:
#   df:   data.frame with columns `kwd1` and `kwd2` (character or factor
#         keyword labels) and `sim` (the similarity of that pair).
#   fill: value stored in `sim` for pairs that never occur in `df`, in either
#         direction, with a non-NA similarity. The default NA keeps the
#         previous behaviour; pass `fill = 0` to get zeros instead.
#
# Returns:
#   A data.frame with character columns `kwd1`, `kwd2` (with kwd1 sorting
#   before kwd2) and `sim`, holding one row for every unordered pair of
#   distinct keywords found in `df`. If a pair occurs several times, the first
#   row of `df` with a non-NA similarity wins.
convert_df <- function(df, fill = NA) {
  stopifnot(
    is.data.frame(df),
    all(c("kwd1", "kwd2", "sim") %in% names(df))
  )

  # Work on character labels: comparing against `letters` only worked for
  # single lowercase letters, and c() on factors is version dependent.
  first <- as.character(df$kwd1)
  second <- as.character(df$kwd2)
  keys <- sort(unique(c(first, second)))

  # Fewer than two keywords means there is no pair to report.
  if (length(keys) < 2L) {
    return(data.frame(
      kwd1 = character(0),
      kwd2 = character(0),
      sim = df$sim[0],
      stringsAsFactors = FALSE
    ))
  }

  # One column per unordered pair; row 1 always sorts before row 2.
  pairs <- utils::combn(keys, 2L)

  # Canonical orientation of the rows of `df`, to line up with `pairs`.
  swap <- !is.na(first) & !is.na(second) & first > second
  lo <- ifelse(swap, second, first)
  hi <- ifelse(swap, first, second)

  # Only rows with complete keys and a known similarity can supply a value.
  usable <- which(!is.na(lo) & !is.na(hi) & !is.na(df$sim))
  hit <- usable[match(
    paste(pairs[1L, ], pairs[2L, ], sep = "\r"),
    paste(lo[usable], hi[usable], sep = "\r")
  )]

  sim <- df$sim[hit]
  sim[is.na(hit)] <- fill

  data.frame(
    kwd1 = pairs[1L, ],
    kwd2 = pairs[2L, ],
    sim = sim,
    stringsAsFactors = FALSE
  )
}
- # Then simply run this to convert your actual data.frame
- convert_df(df)
# Build every ordered combination of the keywords that occur in EITHER column.
# Crossing df$kwd1 with df$kwd2 directly would miss a pair whose two keywords
# both appear only in the same column. Combinations absent from df get NA
# for sim.
keys <- sort(unique(c(as.character(df$kwd1), as.character(df$kwd2))))
known <- data.frame(
  kwd1 = as.character(df$kwd1),
  kwd2 = as.character(df$kwd2),
  sim = df$sim,
  stringsAsFactors = FALSE
)
k <- merge(
  expand.grid(kwd1 = keys, kwd2 = keys, stringsAsFactors = FALSE),
  known,
  all = TRUE
)
# Rows where kwd1 and kwd2 are the same keyword are not wanted; remove them.
k <- k[which(k$kwd1 != k$kwd2), ]
# Order the result to put the NA rows at the end, so that rows that are in df
# have priority in the following step.
k <- k[order(k$sim), ]
# Remove every row whose unordered kwd1-kwd2 combination appeared earlier.
# The pair key is built from the two labels in sorted order, which avoids
# running apply() over a data.frame.
in_order <- k$kwd1 < k$kwd2
pair_key <- paste(
  ifelse(in_order, k$kwd1, k$kwd2),
  ifelse(in_order, k$kwd2, k$kwd1),
  sep = "\r"
)
k <- k[!duplicated(pair_key), ]
# Set the missing similarities to 0 (only the sim column is touched).
k$sim[is.na(k$sim)] <- 0
- kwd1 kwd2 sim
- 5 a b 0.1
- 7 a c 0.2
- 12 b c 0.0
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement