Advertisement
Guest User

Untitled

a guest
Sep 1st, 2019
168
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.50 KB | None | 0 0
  1. #!/bin/bash
  2. ## small bash script to download and reformat dbNSFP for pipeline
  3. ## Miles Benton
  4. ## created: 2018-01-13
  5. ## modified: 2019-08-21
  6.  
  7. # Set to dbNSFP version to download and build
  8. version="4.0a"
  9. #TODO: add an option to 'scrape' this from the url to always return latest version
  10. # define thread number for parallel processing where able
  11. THREADS=6
  12.  
  13. # Download dbNSFP database
  14. wget -O dbNSFP${version}.zip "ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFP${version}.zip"
  15.  
  16. # Uncompress
  17. unzip dbNSFP${version}.zip
  18.  
  19. # grab header
  20. zcat dbNSFP${version}_variant.chr1.gz | head -n 1 | bgzip > header.gz
  21.  
  22. ### this section will produce data for hg38 capable pipelines
  23. ## hg38 version
  24.  
  25. # Create a single file version
  26. # NOTE: bgzip parameter -@ X represents number of threads
  27. cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz | zgrep -v '#chr' | bgzip -@ ${THREADS} > dbNSFPv${version}_custom.gz
  28. #TODO: there must be a 'nicer' way of ordering the input into the cat (to include the X,Y and M chrs without explicitly stating them)
  29.  
  30. # add header back into file
  31. cat header.gz dbNSFPv${version}_custom.gz > dbNSFPv${version}_custombuild.gz
  32.  
  33. # Create tabix index
  34. tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
  35.  
  36. # test annotation
  37. # java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
  38. #TODO: provide actual unit test files for testing purposes, i.e. a section of public data with known annotation rates.
  39. #TODO: the above is currently a placeholder but it had it's intended purpose in terms of identifying incorrect genome build.
  40.  
  41. # clean up
  42. #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
  43. #/END hg38
  44. ###
  45.  
  46. ### this section will produce data for hg19 capable pipelines
  47. ## hg19 version
  48. # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
  49. # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
  50. # NOTE: bgzip parameter -@ X represents number of threads
  51.  
  52. # create a file with ordered chrosome names
  53. echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt
  54.  
  55. # set awk params as variable for parallel
  56. awkBody1='$8 != "."'
  57. awkBody2='BEGIN{FS=OFS="\t"} {t = $2; $2 = $9; $9 = t; x = $1; $1 = $8; $8 = $1; print;}'
  58.  
  59. # parallel implementation of the hg19 format and sort
  60. parallel -j 4 \
  61. "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
  62. awk '${awkBody1}' | \
  63. awk '${awkBody2}' | \
  64. LC_ALL=C sort --parallel=2 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
  65. bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
  66. #TODO: need to implement a defined approach to GNU parallel, can't use $THREADS
  67.  
  68. # cat all files back together (should retain sort order)
  69. cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz
  70.  
  71. # Create tabix index
  72. tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
  73.  
  74. # test hg19 annotation
  75. # java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
  76.  
  77. # clean up
  78. #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
  79. #/END hg38
  80. ###
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement