daily pastebin goal
92%
SHARE
TWEET

Untitled

a guest Jan 19th, 2019 47 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. region_id  statistic      genelist
  2.           1        2.5       A, B, C
  3.           2        0.5    B, C, D, E
  4.           3        3.2          <NA>
  5.           4        0.1          E, F
  6.    
  7. region_id statistic gene
  8.            1       2.5    A
  9.            1       2.5    B
  10.            1       2.5    C
  11.            2       0.5    B
  12.            2       0.5    C
  13.            2       0.5    D
  14.            2       0.5    E
  15.            3       3.2 <NA>
  16.            4       0.1    E
  17.            4       0.1    F
  18.    
  19. a <- structure(list(region_id = 1:4, statistic = c(2.5, 0.5, 3.2,  # input frame: one row per region, genes packed into one comma-separated string
  20. 0.1), genelist = structure(c(1L, 2L, NA, 3L), .Label = c("A, B, C",  # genelist is a factor; the third region has NA (no genes)
  21. "B, C, D, E", "E, F"), class = "factor")), .Names = c("region_id",
  22. "statistic", "genelist"), class = "data.frame", row.names = c(NA,
  23. -4L))
  24.  
  25. b <- structure(list(region_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,  # long form: one row per (region_id, gene) pair
  26. 4L, 4L), statistic = c(2.5, 2.5, 2.5, 0.5, 0.5, 0.5, 0.5, 3.2,  # statistic is repeated for every gene of its region
  27. 0.1, 0.1), gene = structure(c(1L, 2L, 3L, 2L, 3L, 4L, 5L, NA,  # gene is a factor with levels A..F; NA for the region without genes
  28. 5L, 6L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor")), .Names = c("region_id",
  29. "statistic", "gene"), class = "data.frame", row.names = c(NA,
  30. -10L))
  31.    
  32. library(data.table)
  33. DT <- data.table(a)  # wrap the data.frame `a` as a data.table
  34. DT[, list(statistic,  # per region_id: split genelist on ', ' and flatten to one gene per row
  35.           gene = unlist(strsplit(as.character(genelist), ', ' ))),
  36.    by = list(region_id)]
  37.    
  38. DTL <- DT[, list(statistic,  # variant without unlist(): gene stays a list-column, one row per region
  39.          gene = strsplit(as.character(genelist), ', ' )),
  40.     by = list(region_id)]
  41.  
  42. DTL
  43. ##    region_id statistic    gene
  44. ## 1:         1       2.5   A,B,C
  45. ## 2:         2       0.5 B,C,D,E
  46. ## 3:         3       3.2      NA
  47. ## 4:         4       0.1     E,F
  48.    
  49. DTL[region_id == 1,unlist(gene)]  # flatten the list-column for the rows matching the filter
  50. ## [1] "A" "B" "C"
  51. DTL[region_id == 2,unlist(gene)]
  52. ## [1] "B" "C" "D" "E"
  53. # or if the following is of interest
  54. DTL[statistic < 2,unlist(gene)]  # several rows match, so their gene vectors are concatenated
  55. ## [1] "B" "C" "D" "E" "E" "F"
  56.    
  57. use strict;
  58. use warnings;
  59.  
  60. while (<DATA>) {                             # expand each input row into one output row per gene
  61.     chomp;                                   # remove newline
  62.     my ($reg, $stat, $gene) = split ' ', $_, 3;    # split fields on any whitespace (tabs or spaces); limit 3 keeps "A, B, C" in one piece
  63.     my @genes = defined $gene ? split(/,\s*/, $gene) : ();    # split genes on comma + optional spaces; blank/short lines yield none
  64.     for (@genes) {
  65.         local $\ = "\n";                     # output record separator: print appends a newline
  66.         print join "\t", $reg, $stat, $_;    # tab-separated: region, statistic, single gene
  67.     }
  68. }
  69.  
  70. __DATA__
  71. region_id   statistic   genelist
  72. 1   2.5 A, B, C
  73. 2   0.5 B, C, D, E
  74. 3   3.2 <NA>
  75. 4   0.1 E, F
  76.    
  77. region_id       statistic       genelist
  78. 1       2.5     A
  79. 1       2.5     B
  80. 1       2.5     C
  81. 2       0.5     B
  82. 2       0.5     C
  83. 2       0.5     D
  84. 2       0.5     E
  85. 3       3.2     <NA>
  86. 4       0.1     E
  87. 4       0.1     F
  88.    
  89. library(stringr) # for str_split; ddply(), join() and .() are plyr functions, so plyr must be attached too
  90. join(subset(a, select=c("region_id", "statistic")),  # re-attach statistic to the per-gene rows
  91.      ddply(a, .(region_id), summarise, gene=str_split(genelist, ",\\s*")[[1]]))  # split on comma plus any following whitespace
  92.    
  93. ddply(a, .(region_id),  # same result in one pass: build gene/statistic rows per region
  94.       function(x) data.frame(gene=str_split(x$genelist, ",\\s*")[[1]],
  95.                              statistic=x$statistic))
  96.    
  97. data<-cbind(region_id=1:4, statistic=c(2.5, 0.5, 3.2, 0.1), genelist=c("A, B, C", "B, C, D, E", NA, "E, F"))  # cbind() of numeric and character vectors yields a character matrix
  98.  
  99. do.call(rbind,  # stack the per-row results into one table
  100.         apply(data, 1,  # visit each matrix row as the vector r
  101.               function(r) do.call(expand.grid,  # cross the row's region_id/statistic with its genes
  102.                                   c(unlist(r[-3]),
  103.                                     strsplit(r[3], ", ")))))  # third element (genelist) split on ", "
  104.    
  105. region_id statistic genelist
  106. 1          1       2.5        A
  107. 2          1       2.5        B
  108. 3          1       2.5        C
  109. 4          2       0.5        B
  110. 5          2       0.5        C
  111. 6          2       0.5        D
  112. 7          2       0.5        E
  113. 8          3       3.2     <NA>
  114. 9          4       0.1        E
  115. 10         4       0.1        F
  116.    
  117. #!/usr/bin/perl
  118. <>;                                          # read and discard the input header line
  119. print "region_id\tstatistic\tgene\n";        # emit the new tab-separated header
  120. while(<>) {
  121.   chomp;
  122.   my ($reg, $stat, $genes) = split /\s+/, $_, 3;   # limit 3 keeps the whole gene list in $genes
  123.   foreach my $gene (split /,\s*/, $genes) {        # comma plus optional whitespace separates genes
  124.      print "$reg\t$stat\t$gene\n";                 # one tab-separated row per gene
  125.   }
  126. }
  127.    
  128. ddply(a, .(region_id), transform, gene = str_split(genelist, ',\\s*')[[1]])  # split on comma plus trailing whitespace so genes carry no leading space
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top