Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # create fasta file from gff with fasta blocks using bedtools
- # see rationale at the end
- gffFile=$1;
- tmpid=tmp_${$}
- # make tmp dir
- mkdir $tmpid
- echo mving to tempdir >&2
- cd $tmpid
- echo $(pwd) >&2
- # get fasta lines
- perl -lane 'print if $fasta_line; if(/^##FASTA/){$fasta_line=1}' ../$gffFile > fasta_from_gff.fa;
- # get only gff files
- perl -lane 'if(/^##FASTA/){last};print' ../$gffFile > test.nofasta.gff;
- # generate de fasata from gff features
- bedtools getfasta -fi fasta_from_gff.fa -bed test.nofasta.gff -fullHeader -fo fasta_out.fa
- # put the final fasta in the initial dir and rename ass the gff file
- cp fasta_out.fa ../$gffFile.fa
- # return to initial dir
- cd ..
- # tmp dir is not deleted on pourpose, just in case I need it for debuging. It is easy to remove by hand wit 'rm -rf tmp_*'
- #####################
- #
- # rationale: bedtools does not work with gff with fasta blocks, you need to split in plain gff and fasta
- #
- # #script to print out fasta:
- # $ perl -lane 'print if $fasta_line; if(/^##FASTA/){$fasta_line=1}' test.gff > fasta_from_gff.fa;
- #
- # #scritp to print plain gff
- # $ perl -lane 'if(/^##FASTA/){last};print' test.gff > test.nofasta.gff;
- #
- # # get fasta
- # $ sh get_fasta_for_gff.sh test.gff
- # mving to tempdir
- # /Users/pmg/workspace/genomics/bedtools_tests/tmp_19560
- # index file fasta_from_gff.fa.fai not found, generating...
- #
- #######
- # gff tab delimited file with fasata blocks
- #######
- #| ##gff-version 3.2.1
- #| ##sequence-region ctg123 1 1500
- #| ctg123 . gene 100 900 . + . ID=gene00001;Name=EDEN
- #| ctg123 . TF_binding_site 100 101 . + . ID=tfbs00001;Parent=gene00001
- #| ctg123 . mRNA 105 900 . + . ID=mRNA00001;Parent=gene00001;Name=EDEN.1
- #| ctg123 . five_prime_UTR 105 120 . + . Parent=mRNA00001
- #| ctg123 . CDS 120 150 . + 0 ID=cds00001;Parent=mRNA00001
- #| ctg123 . CDS 300 390 . + 0 ID=cds00001;Parent=mRNA00001
- #| ctg123 . CDS 500 550 . + 0 ID=cds00001;Parent=mRNA00001
- #| ctg123 . CDS 700 760 . + 0 ID=cds00001;Parent=mRNA00001
- #| ctg123 . three_prime_UTR 761 900 . + . Parent=mRNA00001
- #| ctg123 . cDNA_match 105 150 5.8e-42 + . ID=match00001;Target=cdna0123+12+57
- #| ctg123 . cDNA_match 500 550 8.1e-43 + . ID=match00001;Target=cdna0123+463+963
- #| ctg123 . cDNA_match 700 900 1.4e-40 + . ID=match00001;Target=cdna0123+964+2964
- #| ##FASTA
- #| >ctg123
- #| cttctgggcgtacccgattctcggagaacttgccgcaccattccgccttg
- #| tgttcattgctgcctgcatgttcattgtctacctcggctacgtgtggcta
- #| tctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaa
- #| aagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttat
- #| aatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaat
- #| cttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattc
- #| gtatttgatttgggtttactatcgaataatgagaattttcaggcttaggc
- #| ttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggctt
- #| aggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttag
- #| aatctagctagctatccgaaattcgaggcctgaaaagtgtgacgccattc
- #| cttctgggcgtacccgattctcggagaacttgccgcaccattccgccttg
- #| tgttcattgctgcctgcatgttcattgtctacctcggctacgtgtggcta
- #| tctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaa
- #| aagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttat
- #| aatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaat
- #| cttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattc
- #| gtatttgatttgggtttactatcgaataatgagaattttcaggcttaggc
- #| ttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggctt
- #| aggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttag
- #| aatctagctagctatccgaaattcgaggcctgaaaagtgtgacgccattc
- #| cttctgggcgtacccgattctcggagaacttgccgcaccattccgccttg
- #| tgttcattgctgcctgcatgttcattgtctacctcggctacgtgtggcta
- #| tctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaa
- #| aagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttat
- #| aatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaat
- #| cttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattc
- #| gtatttgatttgggtttactatcgaataatgagaattttcaggcttaggc
- #| ttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggctt
- #| aggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttag
- #| aatctagctagctatccgaaattcgaggcctgaaaagtgtgacgccattc
- #| >cnda0123
- #| ttcaagtgctcagtcaatgtgattcacagtatgtcaccaaatattttggc
- #| agctttctcaagggatcaaaattatggatcattatggaatacctcggtgg
- #| aggctcagcgctcgatttaactaaaagtggaaagctggacgaaagtcata
- #| tcgctgtgattcttcgcgaaattttgaaaggtctcgagtatctgcatagt
- #| gaaagaaaaatccacagagatattaaaggagccaacgttttgttggaccg
- #| tcaaacagcggctgtaaaaatttgtgattatggttaaagg
- #|
- #
- ###########
- ## output
- ###########
- ## $ head test.gff.fa
- ## >ctg123:99-900
- ## atctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaaaagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttataatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaatcttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattcgtatttgatttgggtttactatcgaataatgag\
- aattttcaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttagaatctagctagctatccgaaattcgaggcctgaaaagtgtgacgccattccttctgggcgtacccgattctcggagaacttgccgcaccattccgccttgtgttcattgctgcctgcatg\
- ttcattgtctacctcggctacgtgtggctatctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaaaagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttataatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaatcttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattcgtatttg\
- atttgggtttactatcgaataatgagaattttcaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggctt
- ## >ctg123:99-101
- ## at
- ## >ctg123:104-900
- ## tcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaaaagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttataatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaatcttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattcgtatttgatttgggtttactatcgaataatgagaattt\
- tcaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttagaatctagctagctatccgaaattcgaggcctgaaaagtgtgacgccattccttctgggcgtacccgattctcggagaacttgccgcaccattccgccttgtgttcattgctgcctgcatgttcat\
- tgtctacctcggctacgtgtggctatctttcctcggtgccctcgtgcacggagtcgagaaaccaaagaacaaaaaaagaaattaaaatatttattttgctgtggtttttgatgtgtgttttttataatgatttttgatgtgaccaattgtacttttcctttaaatgaaatgtaatcttaaatgtatttccgacgaattcgaggcctgaaaagtgtgacgccattcgtatttgatttg\
- ggtttactatcgaataatgagaattttcaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggcttaggctt
- ## >ctg123:104-120
- ## tcctcggtgccctcgt
- ## >ctg123:119-150
- ## ....
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement