Guest User

Untitled

a guest
Feb 24th, 2018
124
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.76 KB | None | 0 0
  1. library(dplyr)
  2.  
  3.  
  4. library(RMySQL)
  5. con = dbConnect(MySQL(),user='USERNAME',password='PASSWORD',host='solexadb.XXXXXXXXXX.us-east-1.rds.amazonaws.com',port=3306,dbname='solexa')
  6. res = dbGetQuery(con,
  7. "select study_id,source.name as source_name, sample.name as sample_name, fcb.type as software,
  8. fcb.dateStamp as basecalldate, fcb.softwareVersion as version, study_id,
  9. sr.date as run_date, sr.sequencer as sequencer, sm.model, ssr.library_id as library_id,
  10. sr.ID as run_id, sample.ID as sample_id, nt.value as sample_source,
  11. sample.sample_type, part.value as partitioning_method, ssr.ID as read_group_id
  12. from solexa_sample as sample
  13. join solexa_source_sample as sss on sss.sample_id=sample.ID
  14. join solexa_source as source on sss.source_id=source.ID
  15. join solexa_sample_library as sslib on sslib.sample_id=sample.ID
  16. join solexa_studyresult as ssr on ssr.library_id=sslib.library_id
  17. join solexa_lane as slane on ssr.lane_id=slane.ID
  18. join solexa_run as sr on sr.ID = slane.run_id
  19. join solexa_variable_sample as svs on svs.sample_id=sample.ID
  20. join solexa_variable as sv on svs.variable_id=sv.ID
  21. join flowCellBasecall as fcb on fcb.id = ssr.`basecall_id`
  22. join solexa_variable_library as svl on ssr.library_id=svl.library_id
  23. join solexa_machine sm on sr.sequencer=sm.name
  24. left join (select sv.ID as variable_id,value
  25. from solexa_variable as sv where name='Genome Partition') as part
  26. on part.variable_id=svl.variable_id
  27. left join (select sv.ID as variable_id,value
  28. from solexa_variable as sv where name='Normal/Tumor') as nt
  29. on nt.variable_id=svs.variable_id
  30. where study_id in (16,25,94,152,128,179,124,166)
  31. and ssr.include='yes'
  32. ")
  33. res[[3]] = gsub('_','-',res[[3]])
  34. res[[3]] = gsub('(0[0-9])$','\\1D',res[[3]])
  35. colnames(res)[3] = 'samplename'
  36. res = res %>% filter(!is.na(sample_source)) %>% unique()
  37.  
  38. captureTrans = function(partition_method) {
  39. retmodel = rep('',length(partition_method))
  40. retmodel[grepl("TARGETOsteo020416_Valid_BAC_KlenowBait",partition_method)] = "LCC"
  41. retmodel[grepl('SeqCap_141001_HG19_TargetOsteo1807_EZ',partition_method)] = 'Roche'
  42. retmodel[partition_method %in% c('Illumina_Nextera_Rapid_Capture_Exome',
  43. 'Illumina_TruSeq_Exome_Enrichment_Kit',
  44. 'Agilent_SureSelect_Human_All_Exon_50Mb_Kit')] = 'Exome'
  45. retmodel
  46. }
  47.  
  48.  
  49.  
  50. res = res[res$sample_type != 'mRNA',]
  51. res$capture_strategy = captureTrans(res$partitioning_method)
  52.  
  53. vcf_manifest = readr::read_delim('~/Downloads/vcf_manifest_md5.txt',delim="/",col_names=FALSE)
  54. discover_validation_map = data.frame(directory = sub('TargetOsteo','',vcf_manifest[[5]]),
  55. capture_strategy = sub('Discovery|Validation','',sub('TargetOsteo','',vcf_manifest[[5]])),
  56. filename = vcf_manifest[[8]],
  57. source_name = substr(vcf_manifest[[8]],1,6),
  58. sample_source = ifelse(grepl('Tumor',vcf_manifest[[8]]),'Tumor','Normal'),
  59. stringsAsFactors = FALSE)
  60. tmpmap = discover_validation_map[discover_validation_map$capture_strategy=='',]
  61. tmpmap$capture_strategy = 'Exome'
  62. discover_validation_map = rbind(discover_validation_map,
  63. tmpmap)
  64.  
  65. res2 = res[,-1] %>% left_join(discover_validation_map)
  66.  
  67. readr::write_tsv(res2,'target_vcf_metadata.tsv',col_names=TRUE)
Add Comment
Please, Sign In to add comment