Guest User

Untitled

a guest
Feb 20th, 2018
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.29 KB | None | 0 0
  #' Build a long gene-name lookup table from the exon records of a GTF file
  #'
  #' Reads the GTF with `data.table::fread()`, keeps the rows whose feature
  #' column (`V3`) is `"exon"`, and pulls `gene_id`, `transcript_id` and
  #' `gene_name` out of the attribute column (`V9`). The distinct
  #' combinations are then melted so that each identifier sits on its own row
  #' next to the gene name.
  #'
  #' Attributes are looked up by key, so rows may carry any number of
  #' attributes in any order, and values are returned exactly as quoted in the
  #' file (identifier version suffixes such as `.4` are kept). A key that is
  #' absent from a row yields `NA` for that row.
  #'
  #' @param gtf_path Path to the GTF file. The first run of digits in the file
  #'   name (not the directory) is reported as `version`.
  #' @return A `data.table` with columns `gene_name`, `variable`
  #'   (`gene_id` or `transcript_id`), `value` (the identifier) and `version`.
  gtf_to_tbl <- function(gtf_path) {
    # Kept from the original: attaching data.table is what makes `fread`,
    # `melt` and `data.table` visible below.
    library(data.table)

    # Return the quoted value of `key "value";` for every attribute string.
    # The key must start the string or follow a ';' so that it can never match
    # inside a longer key or inside another attribute's value.
    extract_attribute <- function(attributes, key) {
      pattern <- paste0("^(?:.*;)?\\s*", key, "\\s+\"([^\"]*)\".*$")
      found <- grepl(pattern, attributes, perl = TRUE)
      values <- rep(NA_character_, length(attributes))
      values[found] <- sub(pattern, "\\1", attributes[found], perl = TRUE)
      values
    }

    message("read gtf")
    gtf <- fread(gtf_path)
    missing_cols <- setdiff(c("V3", "V9"), names(gtf))
    if (length(missing_cols) > 0) {
      stop(
        "'", gtf_path, "' was not read as a headerless GTF table; missing ",
        "column(s): ", paste(missing_cols, collapse = ", "),
        call. = FALSE
      )
    }
    exon_attributes <- gtf$V9[gtf$V3 %in% "exon"]

    message("split column")
    ids <- unique(data.table(
      gene_id = extract_attribute(exon_attributes, "gene_id"),
      transcript_id = extract_attribute(exon_attributes, "transcript_id"),
      gene_name = extract_attribute(exon_attributes, "gene_name")
    ))

    # Long format: one row per (gene_name, identifier kind, identifier).
    lookup <- melt(ids, id.vars = "gene_name")
    lookup$version <- stringr::str_extract(basename(gtf_path), "[0-9]+")
    message("fninsh")
    lookup
  }
Add Comment
Please, Sign In to add comment