# bio

from collections import OrderedDict

# Sample header section of a GenBank-style flat-file record (everything that
# precedes the FEATURES table), embedded as input for getHeaders().
# Top-level keywords start at column 0, sub-keywords (ORGANISM, AUTHORS,
# TITLE, JOURNAL) are indented by two spaces, and wrapped values continue on
# lines that begin with whitespace only. COMMENT appears twice, the first
# time with no value.
headers_string = '''LOCUS       pNEBR                   4287 bp ds-DNA     circular     16-JUL-2012
DEFINITION  Cloning vector pNEBR-X1, complete sequence.
ACCESSION   urn.local...1328823972725.496
KEYWORDS    .
SOURCE      Cloning vector pNEBR-X1
  ORGANISM  Cloning vector pNEBR-X1 other sequences; artificial sequences;
            vectors.
REFERENCE   1  (bases 1 to 4287)
  AUTHORS   Maina,C.V.
  TITLE     Direct Submission
  JOURNAL   Submitted (19-OCT-2007) Research Department, New England Biolabs,
            240 County Road, Ipswich, MA 01938, USA
COMMENT
COMMENT     ApEinfo:methylated:1'''

# Sample FEATURES table of the same record, embedded as input for
# getFeatures(). Each feature starts with a line indented by five spaces that
# carries the feature key and its location; the following, deeper-indented
# lines are its "/qualifier=value" entries. Long qualifier values wrap onto
# further lines (see the multi-line /note entries and the "width 5 offset 0"
# tail of /ApEinfo_graphicformat). The keys "gene" and "rep_origin" each
# occur twice.
features_string = '''FEATURES             Location/Qualifiers
     TATA_signal     3873..3879
                     /gene="lacZalpha"
                     /note="minimal TATA box for 5XRE"
                     /label=lacZalpha TATA signal
                     /ApEinfo_fwdcolor=pink
                     /ApEinfo_revcolor=pink
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     polyA_signal    4..855
                     /note="SV40 polyA"
                     /label=SV40 polyA polyA signal
                     /ApEinfo_fwdcolor=pink
                     /ApEinfo_revcolor=pink
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    3910..4068
                     /gene="lacZalpha"
                     /note="multiple cloning site (ApaI-StuI)"
                     /note="Geneious type: polylinker"
                     /label=multiple cloning site (ApaI-StuI)
                     /ApEinfo_fwdcolor=pink
                     /ApEinfo_revcolor=pink
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     promoter        3762..3854
                     /gene="lacZalpha"
                     /note="5X GAL4 response element (5XRE) (approx. transcript
                     start 3908 clockwise)"
                     /label=lacZalpha promoter
                     /ApEinfo_fwdcolor=pink
                     /ApEinfo_revcolor=pink
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     terminator      3167..3625
                     /note="SV40 transcription terminator; prevents
                     read-through transcription of the expression cassette from
                     sources upstream of 5XRE"
                     /label=terminator
                     /ApEinfo_fwdcolor=pink
                     /ApEinfo_revcolor=pink
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     gene            995..1855
                     /gene="bla"
                     /label=bla gene
                     /ApEinfo_fwdcolor=pink
                     /ApEinfo_revcolor=pink
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     gene            3735..4259
                     /gene="lacZalpha"
                     /label=lacZalpha gene
                     /ApEinfo_fwdcolor=pink
                     /ApEinfo_revcolor=pink
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     rep_origin      2517..3105
                     /note="pUC19 origin of replication (clockwise) (RNAII -35
                     to RNA/DNA switch point)"
                     /label=rep origin
                     /ApEinfo_fwdcolor=pink
                     /ApEinfo_revcolor=pink
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     rep_origin      complement(1897..2406)
                     /note="M13 origin of replication (+ -)"
                     /label=M13 origin of replication (+ -) rep origin
                     /ApEinfo_fwdcolor=pink
                     /ApEinfo_revcolor=pink
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
'''


#----------------------------------------------------------------------
def getHeaders(lines):
    """Group the lines of a record's header section by their leading keyword.

    Each line that starts with one of the known header keywords opens an
    entry for that keyword; the entry's first element is the remainder of the
    line with the keyword and the padding after it removed. Every following
    line that does not start with a keyword is treated as a continuation and
    is appended, unmodified (leading whitespace included), to the entry of
    the most recently seen keyword.

    Args:
        lines: iterable of header lines without trailing newlines, e.g. the
            result of ``str.splitlines()``.

    Returns:
        A dict mapping the keyword exactly as matched (sub-keywords keep
        their two leading spaces, e.g. ``'  AUTHORS'``) to a list of strings.

    Notes:
        * A keyword that occurs more than once (such as a repeated COMMENT)
          replaces the earlier entry; only the last occurrence is kept.
        * Lines that appear before the first keyword have no entry to belong
          to and are skipped.
    """
    headers_keywords = ('LOCUS',
                        'DEFINITION',
                        'ACCESSION',
                        'KEYWORDS',
                        'SOURCE',
                        '  ORGANISM',
                        'REFERENCE',
                        '  AUTHORS',
                        '  TITLE',
                        '  JOURNAL',
                        'COMMENT')

    #a dict to hold all the headers
    #headers = OrderedDict() # switch to this, if order is important.
    headers = {}

    #last valid header keyword; None until the first keyword line is seen
    last_keyword = None

    for line in lines:
        for kw in headers_keywords:
            if line.startswith(kw):
                #cut the keyword off as a *prefix*. str.lstrip(kw) would use
                # kw as a set of characters and, because the indented
                # keywords contain a space, would also eat the start of the
                # value ('  AUTHORS   Smith' -> 'mith').
                headers[kw] = [line[len(kw):].lstrip()]
                last_keyword = kw
                break
        else:
            #no keyword matched: this is a continuation of the previous
            # keyword (e.g. a wrapped JOURNAL or ORGANISM value). Without a
            # previous keyword there is nothing to attach it to.
            if last_keyword is not None:
                headers[last_keyword].append(line)

    return headers

#----------------------------------------------------------------------
def getFeatures(lines):
    """Placeholder for the FEATURES-table parser; not implemented yet.

    ``lines`` is accepted but never read. The function only assembles a
    local list of line prefixes and then falls off the end, so it always
    returns ``None``.
    """
    #prefixes a FEATURES-section line can start with: the section marker
    # itself, then each feature key behind its five-space indent. Duplicated
    # keys are listed once per occurrence. Currently unused.
    key_indent = '     '
    feature_keys = ('TATA_signal',
                    'polyA_signal',
                    'misc_feature',
                    'promoter',
                    'terminator',
                    'gene',
                    'gene',
                    'rep_origin',
                    'rep_origin')
    features_keywords = ['FEATURES'] + [key_indent + key for key in feature_keys]

#break both embedded sections into their individual lines
# (the header section is everything up to FEATURES)
headers_lines = headers_string.splitlines()
features_lines = features_string.splitlines()

#keyword -> list of lines for the header section
headers = getHeaders(headers_lines)

## TODO: feature parsing is unfinished; getFeatures() has no return
## statement yet, so `features` ends up as None.
features = getFeatures(features_lines)


pass