Advertisement
Guest User

Untitled

a guest
Nov 29th, 2015
71
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 20.02 KB | None | 0 0
  1. # create fasta file from gff with fasta blocks using bedtools
  2. # see rationale at the end
  3.  
  4. gffFile=$1;
  5. tmpid=tmp_${$}
  6.  
  7. # make tmp dir
  8. mkdir $tmpid
  9. echo mving to tempdir >&2
  10. cd $tmpid
  11. echo $(pwd) >&2
  12.  
  13. # get fasta lines
  14. perl -lane 'print if $fasta_line; if(/^##FASTA/){$fasta_line=1}' ../$gffFile > fasta_from_gff.fa;
  15. # get only gff files
  16. perl -lane 'if(/^##FASTA/){last};print' ../$gffFile > test.nofasta.gff;
  17.  
  18. # generate de fasata from gff features
  19. bedtools getfasta -fi fasta_from_gff.fa -bed test.nofasta.gff -fullHeader -fo fasta_out.fa
  20. # put the final fasta in the initial dir and rename ass the gff file
  21. cp fasta_out.fa ../$gffFile.fa
  22.  
  23. # return to initial dir
  24. cd ..
  25. # tmp dir is not deleted on pourpose, just in case I need it for debuging. It is easy to remove by hand wit 'rm -rf tmp_*'
  26.  
  27. #####################
  28. #
  29. # rationale: bedtools does not work with gff with fasta blocks, you need to split in plain gff and fasta
  30. #
  31. # #script to print out fasta:
  32. # $ perl -lane 'print if $fasta_line; if(/^##FASTA/){$fasta_line=1}' test.gff > fasta_from_gff.fa;
  33. #
  34. # #scritp to print plain gff
  35. # $ perl -lane 'if(/^##FASTA/){last};print' test.gff > test.nofasta.gff;
  36. #
  37. # # get fasta
  38. # $ sh get_fasta_for_gff.sh test.gff
  39. # mving to tempdir
  40. # /Users/pmg/workspace/genomics/bedtools_tests/tmp_19560
  41. # index file fasta_from_gff.fa.fai not found, generating...
  42. #
  43. #######
  44. # gff tab delimited file with fasata blocks
  45. #######
  46. #| ##gff-version 3.2.1
  47. #| ##sequence-region ctg123 1 1500
  48. #| ctg123 . gene 100 900 . + . ID=gene00001;Name=EDEN
  49. #| ctg123 . TF_binding_site 100 101 . + . ID=tfbs00001;Parent=gene00001
  50. #| ctg123 . mRNA 105 900 . + . ID=mRNA00001;Parent=gene00001;Name=EDEN.1
  51. #| ctg123 . five_prime_UTR 105 120 . + . Parent=mRNA00001
  52. #| ctg123 . CDS 120 150 . + 0 ID=cds00001;Parent=mRNA00001
  53. #| ctg123 . CDS 300 390 . + 0 ID=cds00001;Parent=mRNA00001
  54. #| ctg123 . CDS 500 550 . + 0 ID=cds00001;Parent=mRNA00001
  55. #| ctg123 . CDS 700 760 . + 0 ID=cds00001;Parent=mRNA00001
  56. #| ctg123 . three_prime_UTR 761 900 . + . Parent=mRNA00001
  57. #| ctg123 . cDNA_match 105 150 5.8e-42 + . ID=match00001;Target=cdna0123+12+57
  58. #| ctg123 . cDNA_match 500 550 8.1e-43 + . ID=match00001;Target=cdna0123+463+963
  59. #| ctg123 . cDNA_match 700 900 1.4e-40 + . ID=match00001;Target=cdna0123+964+2964
  60. #| ##FASTA
  61. #| >ctg123
  62. #| cttctgggcgtacccgattctcggagaacttgccgcaccattccgccttg
  63. #| tgttcattgctgcctgcatgttcattgtctacctcggctacgtgtggcta
  64. #| tctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaa
  65. #| aagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttat
  66. #| aatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaat
  67. #| cttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattc
  68. #| gtatttgatttgggtttactatcgaataatgagaattttcaggcttaggc
  69. #| ttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggctt
  70. #| aggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttag
  71. #| aatctagctagctatccgaaattcgaggcctgaaaagtgtgacgccattc
  72. #| cttctgggcgtacccgattctcggagaacttgccgcaccattccgccttg
  73. #| tgttcattgctgcctgcatgttcattgtctacctcggctacgtgtggcta
  74. #| tctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaa
  75. #| aagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttat
  76. #| aatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaat
  77. #| cttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattc
  78. #| gtatttgatttgggtttactatcgaataatgagaattttcaggcttaggc
  79. #| ttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggctt
  80. #| aggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttag
  81. #| aatctagctagctatccgaaattcgaggcctgaaaagtgtgacgccattc
  82. #| cttctgggcgtacccgattctcggagaacttgccgcaccattccgccttg
  83. #| tgttcattgctgcctgcatgttcattgtctacctcggctacgtgtggcta
  84. #| tctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaa
  85. #| aagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttat
  86. #| aatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaat
  87. #| cttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattc
  88. #| gtatttgatttgggtttactatcgaataatgagaattttcaggcttaggc
  89. #| ttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggctt
  90. #| aggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttag
  91. #| aatctagctagctatccgaaattcgaggcctgaaaagtgtgacgccattc
  92. #| >cnda0123
  93. #| ttcaagtgctcagtcaatgtgattcacagtatgtcaccaaatattttggc
  94. #| agctttctcaagggatcaaaattatggatcattatggaatacctcggtgg
  95. #| aggctcagcgctcgatttaactaaaagtggaaagctggacgaaagtcata
  96. #| tcgctgtgattcttcgcgaaattttgaaaggtctcgagtatctgcatagt
  97. #| gaaagaaaaatccacagagatattaaaggagccaacgttttgttggaccg
  98. #| tcaaacagcggctgtaaaaatttgtgattatggttaaagg
  99. #|
  100. #
  101. ###########
  102. ## output
  103. ###########
  104. ## $ head test.gff.fa
  105. ## >ctg123:99-900
  106. ## atctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaaaagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttataatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaatcttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattcgtatttgatttgggtttactatcgaataatgag\
  107. aattttcaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttagaatctagctagctatccgaaattcgaggcctgaaaagtgtgacgccattccttctgggcgtacccgattctcggagaacttgccgcaccattccgccttgtgttcattgctgcctgcatg\
  108. ttcattgtctacctcggctacgtgtggctatctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaaaagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttataatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaatcttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattcgtatttg\
  109. atttgggtttactatcgaataatgagaattttcaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggctt
  110. ## >ctg123:99-101
  111. ## at
  112. ## >ctg123:104-900
  113. ## tcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaaaagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttataatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaatcttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattcgtatttgatttgggtttactatcgaataatgagaattt\
  114. tcaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttagaatctagctagctatccgaaattcgaggcctgaaaagtgtgacgccattccttctgggcgtacccgattctcggagaacttgccgcaccattccgccttgtgttcattgctgcctgcatgttcat\
  115. tgtctacctcggctacgtgtggctatctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaaaagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttataatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaatcttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattcgtatttgatttg\
  116. ggtttactatcgaataatgagaattttcaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggctt
  117. ## >ctg123:104-120
  118. ## tcctcggtgccctcgt
  119. ## >ctg123:119-150
  120. ## ....
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement