Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- region_id statistic genelist
- 1 2.5 A, B, C
- 2 0.5 B, C, D, E
- 3 3.2 <NA>
- 4 0.1 E, F
- region_id statistic gene
- 1 2.5 A
- 1 2.5 B
- 1 2.5 C
- 2 0.5 B
- 2 0.5 C
- 2 0.5 D
- 2 0.5 E
- 3 3.2 <NA>
- 4 0.1 E
- 4 0.1 F
- a <- structure(list(region_id = 1:4, statistic = c(2.5, 0.5, 3.2,  # input table: one row per region
- 0.1), genelist = structure(c(1L, 2L, NA, 3L), .Label = c("A, B, C",  # genelist is a factor: the integer codes index .Label, and the NA code means region 3 has no gene list
- "B, C, D, E", "E, F"), class = "factor")), .Names = c("region_id",
- "statistic", "genelist"), class = "data.frame", row.names = c(NA,
- -4L))  # c(NA, -4L) is the compact encoding of automatic row names 1..4
- b <- structure(list(region_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,  # target table: one row per (region, gene) pair
- 4L, 4L), statistic = c(2.5, 2.5, 2.5, 0.5, 0.5, 0.5, 0.5, 3.2,  # statistic is repeated for every gene of its region
- 0.1, 0.1), gene = structure(c(1L, 2L, 3L, 2L, 3L, 4L, 5L, NA,  # gene is a factor with one gene per row; the NA row is kept
- 5L, 6L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor")), .Names = c("region_id",
- "statistic", "gene"), class = "data.frame", row.names = c(NA,
- -10L))  # compact encoding of automatic row names 1..10
- library(data.table)
- DT <- data.table(a)
- # Long format in one step: split each region's gene string into separate
- # genes and let data.table recycle `statistic` across them.
- DT[, .(statistic,
-        gene = unlist(strsplit(as.character(genelist), ', '))),
-    by = region_id]
- # Alternative: keep one row per region and hold its genes in a list column.
- DTL <- DT[, .(statistic,
-               gene = strsplit(as.character(genelist), ', ')),
-           by = region_id]
- DTL
- ## region_id statistic gene
- ## 1: 1 2.5 A,B,C
- ## 2: 2 0.5 B,C,D,E
- ## 3: 3 3.2 NA
- ## 4: 4 0.1 E,F
- # Flatten the list column on demand for the rows of interest.
- DTL[region_id == 1, unlist(gene)]
- ## [1] "A" "B" "C"
- DTL[region_id == 2, unlist(gene)]
- ## [1] "B" "C" "D" "E"
- # The same works for any row filter, e.g. on the statistic.
- DTL[statistic < 2, unlist(gene)]
- ## [1] "B" "C" "D" "E" "E" "F"
- use strict;
- use warnings;
- # Expand each "region_id statistic genelist" row of the DATA section into one
- # tab-separated output row per gene. The header row has exactly three plain
- # fields, so it passes through as a single (tab-joined) row.
- while (my $line = <DATA>) {
-     chomp $line;
-     next unless $line =~ /\S/;    # ignore blank lines
-     # Split on any run of whitespace (tab or space) and cap at three fields,
-     # so the spaces inside "A, B, C" stay within the gene list.
-     my ($reg, $stat, $genelist) = split ' ', $line, 3;
-     if (!defined $genelist) {
-         warn "line $.: expected 3 fields, skipping: $line\n";
-         next;
-     }
-     $genelist =~ s/\s+\z//;       # trailing whitespace would end up in the last gene
-     # A comma plus any following whitespace separates genes; a value without
-     # commas (such as "<NA>") yields a single output row.
-     for my $gene (split /,\s*/, $genelist) {
-         print join("\t", $reg, $stat, $gene), "\n";
-     }
- }
- __DATA__
- region_id statistic genelist
- 1 2.5 A, B, C
- 2 0.5 B, C, D, E
- 3 3.2 <NA>
- 4 0.1 E, F
- region_id statistic genelist
- 1 2.5 A
- 1 2.5 B
- 1 2.5 C
- 2 0.5 B
- 2 0.5 C
- 2 0.5 D
- 2 0.5 E
- 3 3.2 <NA>
- 4 0.1 E
- 4 0.1 F
- library(plyr)    # for ddply, join, summarise and .()
- library(stringr) # for str_split
- # Variant 1: build a (region_id, gene) table with one row per gene, then join
- # the statistic back on by the shared region_id column.
- # The delimiter is a comma plus any following whitespace; the backslash must
- # be doubled inside an R string literal.
- join(subset(a, select=c("region_id", "statistic")),
-      ddply(a, .(region_id), summarise, gene=str_split(genelist, ",\\s*")[[1]]))
- # Variant 2: build each region's rows directly; the single statistic value is
- # recycled across that region's genes by data.frame().
- ddply(a, .(region_id),
-       function(x) data.frame(gene=str_split(x$genelist, ",\\s*")[[1]],
-                              statistic=x$statistic))
- data<-cbind(region_id=1:4, statistic=c(2.5, 0.5, 3.2, 0.1), genelist=c("A, B, C", "B, C, D, E", NA, "E, F"))  # cbind of mixed types yields a character matrix, so every column is text from here on
- do.call(rbind,  # stack the per-row data frames into one result
- apply(data, 1,  # visit each matrix row as a named character vector
- function(r) do.call(expand.grid,  # cross the row's fixed fields with each of its genes
- c(unlist(r[-3]),  # region_id and statistic (everything except column 3)
- strsplit(r[3], ", ")))))  # genelist split on ", "; an NA genelist stays a single NA entry
- region_id statistic genelist
- 1 1 2.5 A
- 2 1 2.5 B
- 3 1 2.5 C
- 4 2 0.5 B
- 5 2 0.5 C
- 6 2 0.5 D
- 7 2 0.5 E
- 8 3 3.2 <NA>
- 9 4 0.1 E
- 10 4 0.1 F
- #!/usr/bin/perl
- use strict;
- use warnings;
- # Read a whitespace-delimited "region_id statistic genelist" table from the
- # files named on the command line (or STDIN) and print one tab-separated row
- # per gene, under a fresh header.
- <>;    # read and discard the input header line
- print "region_id\tstatistic\tgene\n";
- while (my $line = <>) {
-     chomp $line;
-     next unless $line =~ /\S/;    # ignore blank lines
-     # Cap at three fields so the spaces inside "A, B, C" stay in the gene list.
-     my ($reg, $stat, $genes) = split ' ', $line, 3;
-     if (!defined $genes) {
-         warn "line $.: expected 3 fields, skipping: $line\n";
-         next;
-     }
-     $genes =~ s/\s+\z//;          # trailing whitespace would end up in the last gene
-     # A comma plus any following whitespace separates genes.
-     foreach my $gene (split /,\s*/, $genes) {
-         print "$reg\t$stat\t$gene\n";
-     }
- }
- # Split on a comma plus any following whitespace so genes come back as "B"
- # rather than " B". transform() keeps the original genelist column next to
- # the new gene column; the one-row group is recycled across its genes.
- ddply(a, .(region_id), transform, gene = str_split(genelist, ',\\s*')[[1]])
Add Comment
Please, Sign In to add comment