Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- df <- merge(df,df
- ,by.x=c("from_id","to_id"),by.y=c("to_id","from_id")
- ,all.x=TRUE,all.y=TRUE)
- df <- mutate(df,Messages_sent.x=coalesce(Messages_sent.x,0),
- Messages_sent.y=coalesce(Messages_sent.y,0))
- df$row <- 1:nrow(df)
- rbind(select(df,-Messages_sents.y) %>%
- rename(Messages_sent=Messages_sent.x),
- select(df,-Messages_sent.x) %>%
- rename(Messages_sent=Messages_sent.y,from_id=to_id,to_id=from_id)
- ) %>% arrange(row) %>% select(-row)
- df <- data.frame(from_id=c(624227,624227,624227,624227,624227,624227,667255,667255,667255,7134655,713465),
- to_id = c(352731,693915,184455,771100,503940,91558,626814,857601,862512,156874,419242),
- message_sent=c(1,6,2,1,1,1,2,7,3,1,1))
- #create another dataset where from and to columns are switched
- df.swap <- data.frame(from_id = df$to_id, to_id=df$from_id)
- # merge 2 together
- df.full <- merge(df,df.swap, by=c("from_id","to_id"), all=TRUE)
- # fill missing values with 0
- # those records will correspond to all the pairs where
- # someone did not send any messages back
- df.full$message_sent[is.na(df.full$message_sent)] <- 0
- #create a unique ID for each pair. This can be down various ways. One easy way is to
- df.full$pair_id = sprintf("%06d%6d",pmin(df.full$from_id,df.full$to_id ),
- pmax(df.full$from_id,df.full$to_id ))
- # calculate total time for each pair:
- total.time <- aggregate(df.full$message_sent, list(df.full$pair_id), sum)
- names(total.time) <- c("pair_id", "total_sum")
- # if this total time needs to be a part of the data.frame you can merge it back
- df.full <- merge(df.full, total.time, by = c("pair_id"), all=TRUE)
- head(df.full)
- # pair_id from_id to_id message_sent total_sum
- # 1 091558624227 91558 624227 0 1
- # 2 091558624227 624227 91558 1 1
- # 3 1568747134655 156874 7134655 0 1
- # 4 1568747134655 7134655 156874 1 1
- # 5 184455624227 624227 184455 2 2
- # 6 184455624227 184455 624227 0 2
Add Comment
Please, Sign In to add comment