Guest User

Untitled

a guest
Jan 19th, 2019
98
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.67 KB | None | 0 0
  1. region_id statistic genelist
  2. 1 2.5 A, B, C
  3. 2 0.5 B, C, D, E
  4. 3 3.2 <NA>
  5. 4 0.1 E, F
  6.  
  7. region_id statistic gene
  8. 1 2.5 A
  9. 1 2.5 B
  10. 1 2.5 C
  11. 2 0.5 B
  12. 2 0.5 C
  13. 2 0.5 D
  14. 2 0.5 E
  15. 3 3.2 <NA>
  16. 4 0.1 E
  17. 4 0.1 F
  18.  
  19. a <- structure(list(region_id = 1:4, statistic = c(2.5, 0.5, 3.2, # input frame: one row per region
  20. 0.1), genelist = structure(c(1L, 2L, NA, 3L), .Label = c("A, B, C", # factor codes; the NA is region 3's missing gene list
  21. "B, C, D, E", "E, F"), class = "factor")), .Names = c("region_id", # each level is a comma-joined list of genes
  22. "statistic", "genelist"), class = "data.frame", row.names = c(NA,
  23. -4L)) # c(NA, -4L) is the compact row.names form for 4 rows
  24.  
  25. b <- structure(list(region_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, # long form of `a`: one row per (region, gene)
  26. 4L, 4L), statistic = c(2.5, 2.5, 2.5, 0.5, 0.5, 0.5, 0.5, 3.2, # statistic repeated for every gene of its region
  27. 0.1, 0.1), gene = structure(c(1L, 2L, 3L, 2L, 3L, 4L, 5L, NA, # the NA code keeps region 3, which has no genes
  28. 5L, 6L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor")), .Names = c("region_id",
  29. "statistic", "gene"), class = "data.frame", row.names = c(NA,
  30. -10L))
  31.  
  32. library(data.table)
  33. DT <- as.data.table(a) # convert the input data.frame to a data.table
  34. DT[, .(statistic, # one value per region, recycled across that region's genes
  35. gene = unlist(strsplit(as.character(genelist), ', '))), # factor -> character -> one element per gene
  36. by = region_id]
  37.  
  38. DTL <- DT[, list(statistic, # variant that keeps one row per region
  39. gene = strsplit(as.character(genelist), ', ' )), # no unlist(): gene stays a list column of character vectors
  40. by = list(region_id)]
  41.  
  42. DTL
  43. ## region_id statistic gene
  44. ## 1: 1 2.5 A,B,C
  45. ## 2: 2 0.5 B,C,D,E
  46. ## 3: 3 3.2 NA
  47. ## 4: 4 0.1 E,F
  48.  
  49. DTL[region_id == 1,unlist(gene)] # flatten the gene vectors of the selected rows
  50. ## [1] "A" "B" "C"
  51. DTL[region_id == 2,unlist(gene)]
  52. ## [1] "B" "C" "D" "E"
  53. # or select rows by a condition on statistic and flatten all of their genes
  54. DTL[statistic < 2,unlist(gene)]
  55. ## [1] "B" "C" "D" "E" "E" "F"
  56.  
  57. use strict;
  58. use warnings;
  59.  
  60. # Print one tab-separated "region, statistic, gene" row per gene in each DATA row.
  61. while (my $row = <DATA>) {
  62.     chomp $row;                                      # remove newline
  63.     my ($reg, $stat, $gene) = split ' ', $row, 3;    # split fields on tabs or spaces; limit 3 keeps "A, B, C" whole
  64.     next unless defined $gene;                       # skip blank or short lines instead of warning on undef
  65.     for my $name (split /,\s*/, $gene) {             # split genes on comma plus optional whitespace
  66.         print join("\t", $reg, $stat, $name), "\n";
  67.     }
  68. }
  69.  
  70. __DATA__
  71. region_id statistic genelist
  72. 1 2.5 A, B, C
  73. 2 0.5 B, C, D, E
  74. 3 3.2 <NA>
  75. 4 0.1 E, F
  76.  
  77. region_id statistic genelist
  78. 1 2.5 A
  79. 1 2.5 B
  80. 1 2.5 C
  81. 2 0.5 B
  82. 2 0.5 C
  83. 2 0.5 D
  84. 2 0.5 E
  85. 3 3.2 <NA>
  86. 4 0.1 E
  87. 4 0.1 F
  88.  
  89. library(plyr); library(stringr) # plyr for ddply/join, stringr for str_split
  90. join(subset(a, select=c("region_id", "statistic")), # re-attach statistic via the shared region_id column
  91. ddply(a, .(region_id), summarise, gene=str_split(genelist, ",\\s*")[[1]])) # one row per gene; ",\\s*" also eats the space after each comma
  92.  
  93. ddply(a, .(region_id), # alternative: build gene and statistic together for each region
  94. function(x) data.frame(gene=str_split(x$genelist, ",\\s*")[[1]],
  95. statistic=x$statistic)) # the single statistic is recycled across the region's genes
  96.  
  97. data<-cbind(region_id=1:4, statistic=c(2.5, 0.5, 3.2, 0.1), genelist=c("A, B, C", "B, C, D, E", NA, "E, F")) # cbind of numeric and character vectors yields a character matrix
  98.  
  99. do.call(rbind, # stack the per-row data frames into one
  100. apply(data, 1, # visit each row of the matrix
  101. function(r) do.call(expand.grid, # cross the row's fixed fields with each of its genes
  102. c(unlist(r[-3]), # region_id and statistic (everything except column 3)
  103. strsplit(r[3], ", "))))) # genelist split on ", "
  104.  
  105. region_id statistic genelist
  106. 1 1 2.5 A
  107. 2 1 2.5 B
  108. 3 1 2.5 C
  109. 4 2 0.5 B
  110. 5 2 0.5 C
  111. 6 2 0.5 D
  112. 7 2 0.5 E
  113. 8 3 3.2 <NA>
  114. 9 4 0.1 E
  115. 10 4 0.1 F
  116.  
  117. #!/usr/bin/perl
  118. use strict; use warnings;
  119. <>; print "region_id\tstatistic\tgene\n";    # drop the input's header line, emit our own
  120. while (my $row = <>) {
  121.     chomp $row;
  122.     my ($reg, $stat, $genes) = split /\s+/, $row, 3;    # limit 3 keeps "A, B, C" whole
  123.     foreach my $gene (split /,\s*/, $genes // '') {     # one output row per gene; a blank line yields none
  124.         print "$reg\t$stat\t$gene\n";
  125.     }
  126. }
  127.  
  128. ddply(a, .(region_id), transform, gene = str_split(genelist, ',\\s*')[[1]]) # split on a comma plus any following whitespace so genes carry no leading space
Add Comment
Please, Sign In to add comment