Advertisement
celestialgod

duplicated rows

Aug 4th, 2015
264
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 0.96 KB | None | 0 0
  # Demonstration: why a left join can return more rows than the left table.
  #
  # A large left table with unique IDs is joined to a small right table whose
  # ID column contains repeated values. Every repeated ID in the right table
  # contributes one extra row to the join result. Sections 1-4 below count the
  # duplicates in several ways.
  library(data.table)
  library(plyr)
  library(dplyr)
  library(magrittr)

  # Problem sizes, named once instead of being repeated as literals.
  n_left <- 1821350L  # rows in the left table (IDs 1..n_left, all unique)
  n_right <- 1289L    # rows in the right table
  n_dup <- 130L       # right-table rows that reuse an ID already drawn
  n_cols <- 23L       # random numeric payload columns in each table
  stopifnot(n_dup < n_right, n_right - n_dup <= n_left)

  tbl_left <- data.table(
    ID = seq_len(n_left),
    matrix(rnorm(n_cols * n_left), n_left)
  )
  tbl_right <- data.table(matrix(rnorm(n_cols * n_right), n_right))

  # Draw n_right - n_dup distinct IDs from the left table, then append n_dup
  # more drawn (with replacement) from those, so every appended ID is a repeat.
  ID_right <- sample(seq_len(n_left), n_right - n_dup)
  ID_right <- c(ID_right, sample(ID_right, n_dup, replace = TRUE))
  tbl_right %<>% mutate(ID = ID_right)

  # left_join is exported by dplyr, so `::` (not `:::`) is the right operator.
  tbl_joined <- dplyr::left_join(tbl_left, tbl_right, by = "ID")
  tbl_joined %>% nrow() # n_left + n_dup = 1821480

  ## 1
  # Number of distinct right IDs minus the number of right rows.
  tbl_right %>%
    select(ID) %>%
    distinct() %>%
    nrow() %>%
    subtract(n_right) # -n_dup = -130

  ## 2
  # Rows of the right ID column flagged as repeats of an earlier row.
  right_ID <- tbl_right %>% select(ID)
  right_ID_dup <- right_ID %>% filter(duplicated(ID))
  right_ID_dup_sort <- right_ID_dup %>% arrange(ID)
  right_ID_dup_sort %>% nrow() # n_dup = 130

  ## 3
  # Distinct repeated IDs that also occur in the left table. semi_join keeps
  # the left-table rows (in left-table order) whose ID is among the repeated
  # ones, which is what the earlier rbind-then-duplicated approach selected.
  comm_ID <- tbl_left %>%
    select(ID) %>%
    distinct() %>%
    semi_join(right_ID_dup %>% distinct(), by = "ID")
  comm_ID %>% nrow() # 127 in the original run; varies with the random draw

  ## 4
  # Total number of right-table rows carrying one of those repeated IDs;
  # equals nrow(comm_ID) + n_dup (257 in the original run).
  sum(tbl_right$ID %in% comm_ID$ID)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement