Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## Overview
- This gist displays several strategies to parse and correct genbank files in the general format of a cookbook
- ### Delete and append a line by matching a pattern PATTERN
- `sed '/PATTERN/d' someFile.gb > outputFile.gb`
- `perl -pe '$_.= qq(TEXT N\n) if /PATTERN/' outputFile.gb > otherOutfile.gb`
- ### Modify a line 1
- *sed* can be used for this purpose, by matching PATTERN, and optionally using a variable $NM by using double-quotes
- `NM="some value"`
- `sed "s/ACCESSION <unknown id>/ACCESSION $NM/g" genbankFile.gb > genbankFile.mod.gb`
- ### Modify a line 2
- commandline *perl* can also be used
- `perl -pe 's/^\s*ORGANISM \./ORGANISM Macaca mulatta/' genbankFile.gb > genbankFile.mod.gb`
- ### Combine commands
- for i in *.gb ; do
- NM=$(echo "$i" | rev | cut -d. -f2- | rev) ;`
- # replace <unknown id> with name string`
- sed "s/ACCESSION <unknown id>/ACCESSION $NM/g" ${i} > ${i}.tmp.1 ;
- # when converting from genbank to embl
- # Biopython looks at the VERSION line for the identifier when converting a gb file
- sed "s/VERSION <unknown id>/VERSION $NM/g" ${i}.tmp.1 > ${i}.tmp.2 ;
- # misc aesthetic cleanup
- perl -pe 's/^\s*ORGANISM \./ORGANISM Macaca mulatta/' ${i}.tmp.3 > ${i}.gb ;
- rm *.tmp.* ;
- done
- ### Scripts
- bash
- #!/bin/bash
- for i in *.gb ; do
- NM=$(echo "$i" | rev | cut -d. -f2- | rev) ;
- # replace <unknown id> with name string
- sed "s/ACCESSION <unknown id>/ACCESSION $NM/g" ${i} > ${i}.tmp.1 ;
- sed "s/VERSION <unknown id>/VERSION $NM/g" ${i}.tmp.1 > ${i}.tmp.2 ;
- # when converting from genbank to embl
- # Biopython looks at the VERSION line for the identifier when converting a gb file
- perl -pe 's/^\s*ORGANISM \./ORGANISM Macaca mulatta/' ${i}.tmp.2 > ${i}.gb ;
- rm *.tmp.* ;
- done
- # then loop through the files, and use biopython to modify each to an embl file
- python
- #!/Users/caskey/anaconda3/bin/python
- # requires bash env variable FILENAME to be set or script will fail
- import sys
- import os
- from Bio import SeqIO
- fname=os.environ['FILENAME']
- fname = fname.replace('/',':')
- presentDir=os.environ['PWD']
- outFileName = presentDir + '/' + fname + '.embl'
- fnameIn = presentDir + '/' + fname
- count = SeqIO.convert(fnameIn, "genbank", outFileName, "embl")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement