Advertisement
Guest User

Untitled

a guest
Feb 10th, 2016
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.43 KB | None | 0 0
  # Console transcript: the dput() call on `df[1:30,-3]` followed by the
  # structure(...) literal it printed (a 30-row sample of `df`).
  # Columns in this sample:
  #   origpat  - numeric patent number (the same value on all 30 rows)
  #   ref.pat  - numeric patent number; note the dot in the column name
  #   mainprim - character class code ("442" on all 30 rows)
  # The object carries class c("data.table", "data.frame").
  1. dput(df[1:30,-3])
  2. structure(list(origpat = c(4247592, 4247592, 4247592, 4247592,
  3. 4247592, 4247592, 4247592, 4247592, 4247592, 4247592, 4247592,
  4. 4247592, 4247592, 4247592, 4247592, 4247592, 4247592, 4247592,
  5. 4247592, 4247592, 4247592, 4247592, 4247592, 4247592, 4247592,
  6. 4247592, 4247592, 4247592, 4247592, 4247592), ref.pat = c(4318978,
  7. 4436368, 4358181, 4478622, 4312654, 4293439, 4286061, 4363648,
  8. 4406517, 4478623, 4277285, 4375743, 4470520, 4328022, 4248614,
  9. 4297139, 4296607, 4296608, 4395271, 4321141, 4294190, 4431420,
  10. 4322467, 4285730, 4393138, 4246034, 4251278, 4339174, 4277322,
  11. 4290586), mainprim = c("442", "442", "442", "442", "442", "442",
  12. "442", "442", "442", "442", "442", "442", "442", "442", "442",
  13. "442", "442", "442", "442", "442", "442", "442", "442", "442",
  14. "442", "442", "442", "442", "442", "442")), .Names = c("origpat",
  15. "ref.pat", "mainprim"), row.names = c(NA, 30L), class = c("data.table",
  16. "data.frame"))
  17.  
  # Console transcript: the dput() call on `tmp` followed by the
  # structure(...) literal it printed. `tmp` has 147 rows and two character
  # columns:
  #   pnum - patent number stored as a string (repeated once per class)
  #   prim - class code for that patent
  # NOTE: the trailing `.internal.selfref = <pointer: ...>` attribute is
  # printed pointer text, not valid R syntax; it has to be removed before this
  # literal can be parsed and evaluated.
  18. dput(tmp)
  19. structure(list(pnum = c("4318978", "4318978", "4318978", "4318978",
  20. "4318978", "4318978", "4318978", "4318978", "4436368", "4436368",
  21. "4436368", "4436368", "4358181", "4358181", "4358181", "4358181",
  22. "4478622", "4312654", "4312654", "4312654", "4312654", "4312654",
  23. "4312654", "4293439", "4293439", "4293439", "4293439", "4293439",
  24. "4293439", "4293439", "4293439", "4293439", "4293439", "4293439",
  25. "4293439", "4293439", "4286061", "4286061", "4286061", "4286061",
  26. "4286061", "4286061", "4286061", "4286061", "4363648", "4363648",
  27. "4363648", "4406517", "4478623", "4478623", "4277285", "4375743",
  28. "4375743", "4375743", "4375743", "4470520", "4470520", "4470520",
  29. "4328022", "4328022", "4248614", "4248614", "4248614", "4248614",
  30. "4248614", "4248614", "4297139", "4297139", "4297139", "4297139",
  31. "4297139", "4296607", "4296607", "4296607", "4296607", "4296607",
  32. "4296607", "4296608", "4296608", "4296608", "4296608", "4296608",
  33. "4395271", "4395271", "4395271", "4321141", "4321141", "4321141",
  34. "4321141", "4294190", "4294190", "4294190", "4294190", "4294190",
  35. "4294190", "4431420", "4431420", "4431420", "4431420", "4431420",
  36. "4431420", "4322467", "4322467", "4322467", "4322467", "4322467",
  37. "4322467", "4322467", "4322467", "4322467", "4322467", "4285730",
  38. "4285730", "4393138", "4393138", "4393138", "4393138", "4393138",
  39. "4393138", "4393138", "4246034", "4246034", "4246034", "4246034",
  40. "4251278", "4251278", "4251278", "4339174", "4339174", "4339174",
  41. "4339174", "4277322", "4277322", "4290586", "4290586", "4290586",
  42. "4290586", "4290586", "4247592", "4247592", "4247592", "4247592",
  43. "4247592", "4247592", "4247592", "4247592", "4247592"), prim = c("430",
  44. "430", "430", "430", "430", "430", "430", "430", "340", "385",
  45. "385", "385", "385", "385", "65", "65", "65", "118", "427", "65",
  46. "65", "65", "65", "106", "106", "106", "501", "501", "501", "501",
  47. "501", "516", "516", "516", "516", "516", "435", "435", "435",
  48. "435", "435", "435", "435", "435", "156", "428", "65", "385",
  49. "65", "65", "501", "422", "53", "53", "53", "222", "422", "604",
  50. "65", "65", "385", "385", "65", "65", "65", "65", "106", "106",
  51. "501", "501", "501", "252", "423", "423", "501", "505", "62",
  52. "423", "501", "501", "505", "62", "65", "65", "65", "210", "210",
  53. "210", "435", "118", "118", "118", "118", "118", "118", "106",
  54. "433", "433", "433", "433", "501", "156", "427", "427", "428",
  55. "428", "428", "428", "428", "428", "428", "501", "501", "426",
  56. "426", "426", "435", "435", "435", "435", "428", "501", "501",
  57. "501", "501", "501", "65", "385", "385", "385", "65", "204",
  58. "204", "204", "266", "266", "432", "73", "427", "427", "428",
  59. "442", "442", "442", "442", "8", "8")), .Names = c("pnum", "prim"
  60. ), class = c("data.table", "data.frame"), row.names = c(NA, -147L
  61. ), .internal.selfref = <pointer: 0x0000000000100788>)
  62.  
  # Compute, for every row of `df`, the overlap between that row's `mainprim`
  # class and the set of `prim` classes that `tmp` lists for the row's cited
  # patent (`ref.pat`). The result is stored in the new numeric column
  # `df$compare`.
  #
  # Inputs expected in the calling environment:
  #   df  - table with columns `origpat`, `ref.pat` and character `mainprim`
  #   tmp - table with character columns `pnum` (patent number) and `prim`
  library(data.table)

  df <- data.table(df)
  # The dput() sample of `df` names the cited-patent column `ref.pat` (with a
  # dot); a column called `refpat` does not exist there, so the key and every
  # lookup use `ref.pat`.
  setkey(df, ref.pat, origpat)

  # Overlap of `a` with `b`: the number of elements of `a` that occur in `b`,
  # divided by length(b). Both arguments must be character vectors because
  # %chin% is character-only. Yields NaN when `b` is empty (0 / 0).
  overlap <- function(a, b) sum(a %chin% b) / length(b)

  # Render a patent number as the plain digit string used in `tmp$pnum`.
  # Implicit numeric-to-character coercion can switch to scientific notation
  # (4000000 becomes "4e+06"), which would silently never match "4000000".
  patent_key <- function(x) {
    if (is.numeric(x)) {
      format(x, scientific = FALSE, trim = TRUE)
    } else {
      as.character(x)
    }
  }

  # Lookup list built once: patent number -> all `prim` classes of that patent.
  # This replaces a full scan of `tmp` for every distinct cited patent.
  prims_by_pnum <- split(tmp$prim, tmp$pnum)

  # Grouped in-place update. Grouping by `ref.pat` keeps each row's `mainprim`
  # paired with its own cited patent, which removes the manual row-offset
  # bookkeeping (the previous offsets were shifted by one row for every group
  # after the first) and avoids copying `df` on each assignment.
  df[, compare := {
    refclass <- prims_by_pnum[[patent_key(.BY[[1L]])]]
    if (is.null(refclass)) {
      # Cited patent has no entry in `tmp`: empty class set, result is NaN.
      refclass <- character(0)
    }
    # Vectorised form of overlap(mainprim[i], refclass) for every row i.
    (mainprim %chin% refclass) / length(refclass)
  }, by = ref.pat]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement