Untitled

## Overview
This gist displays several strategies to parse and correct genbank files in the general format of a cookbook

### Delete and append a line by matching a pattern PATTERN

`sed '/PATTERN/d' someFile.gb > outputFile.gb`
`perl -pe '$_.= qq(TEXT     N\n) if /PATTERN/' outputFile.gb > otherOutfile.gb`


### Modify a line 1
*sed* can be used for this purpose, by matching PATTERN, and optionally using a variable $NM by using double-quotes

`NM="some value"`
`sed "s/ACCESSION   <unknown id>/ACCESSION   $NM/g" genbankFile.gb > genbankFile.mod.gb`

### Modify a line 2
commandline *perl* can also be used

`perl -pe 's/^\s*ORGANISM  \./ORGANISM    Macaca mulatta/' genbankFile.gb > genbankFile.mod.gb`

### Combine commands

    for i in *.gb ; do
      NM=$(echo "$i" | rev | cut -d. -f2- | rev) ;`
      # replace <unknown id> with name string`
      sed "s/ACCESSION   <unknown id>/ACCESSION   $NM/g" ${i} > ${i}.tmp.1 ;
      # when converting from genbank to embl
      # Biopython looks at the VERSION line for the identifier when converting a gb file
      sed "s/VERSION     <unknown id>/VERSION   $NM/g" ${i}.tmp.1 > ${i}.tmp.2 ;
      # misc aesthetic cleanup
      perl -pe 's/^\s*ORGANISM  \./ORGANISM    Macaca mulatta/' ${i}.tmp.3 > ${i}.gb ;
      rm *.tmp.* ;
    done

### Scripts

bash

      #!/bin/bash
      for i in *.gb ; do
        NM=$(echo "$i" | rev | cut -d. -f2- | rev) ;
        # replace <unknown id> with name string
        sed "s/ACCESSION   <unknown id>/ACCESSION   $NM/g" ${i} > ${i}.tmp.1 ;
        sed "s/VERSION     <unknown id>/VERSION   $NM/g" ${i}.tmp.1 > ${i}.tmp.2 ;
        # when converting from genbank to embl
        # Biopython looks at the VERSION line for the identifier when converting a gb file
        perl -pe 's/^\s*ORGANISM  \./ORGANISM    Macaca mulatta/' ${i}.tmp.2 > ${i}.gb ;
        rm *.tmp.* ;
      done
      # then loop through the files, and use biopython to modify each to an embl file


python

      #!/Users/caskey/anaconda3/bin/python
      # requires bash env variable FILENAME to be set or script will fail

      import sys
      import os

      from Bio import SeqIO


      fname=os.environ['FILENAME']
      fname = fname.replace('/',':')
      presentDir=os.environ['PWD']
      outFileName = presentDir + '/' + fname + '.embl'
      fnameIn = presentDir + '/' + fname
      count = SeqIO.convert(fnameIn, "genbank", outFileName, "embl")