Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from collections import OrderedDict
# Sample header section (LOCUS through COMMENT) of what appears to be a
# GenBank-style record; it is split into lines and fed to getHeaders() below.
- headers_string = '''LOCUS pNEBR 4287 bp ds-DNA circular 16-JUL-2012
- DEFINITION Cloning vector pNEBR-X1, complete sequence.
- ACCESSION urn.local...1328823972725.496
- KEYWORDS .
- SOURCE Cloning vector pNEBR-X1
- ORGANISM Cloning vector pNEBR-X1 other sequences; artificial sequences;
- vectors.
- REFERENCE 1 (bases 1 to 4287)
- AUTHORS Maina,C.V.
- TITLE Direct Submission
- JOURNAL Submitted (19-OCT-2007) Research Department, New England Biolabs,
- 240 County Road, Ipswich, MA 01938, USA
- COMMENT
- COMMENT ApEinfo:methylated:1'''
# Sample FEATURES table belonging to the same record as headers_string; it is
# split into lines and passed to getFeatures() below. Each feature entry is a
# "<feature key> <location>" line followed by its /qualifier lines.
- features_string = '''FEATURES Location/Qualifiers
- TATA_signal 3873..3879
- /gene="lacZalpha"
- /note="minimal TATA box for 5XRE"
- /label=lacZalpha TATA signal
- /ApEinfo_fwdcolor=pink
- /ApEinfo_revcolor=pink
- /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
- width 5 offset 0
- polyA_signal 4..855
- /note="SV40 polyA"
- /label=SV40 polyA polyA signal
- /ApEinfo_fwdcolor=pink
- /ApEinfo_revcolor=pink
- /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
- width 5 offset 0
- misc_feature 3910..4068
- /gene="lacZalpha"
- /note="multiple cloning site (ApaI-StuI)"
- /note="Geneious type: polylinker"
- /label=multiple cloning site (ApaI-StuI)
- /ApEinfo_fwdcolor=pink
- /ApEinfo_revcolor=pink
- /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
- width 5 offset 0
- promoter 3762..3854
- /gene="lacZalpha"
- /note="5X GAL4 response element (5XRE) (approx. transcript
- start 3908 clockwise)"
- /label=lacZalpha promoter
- /ApEinfo_fwdcolor=pink
- /ApEinfo_revcolor=pink
- /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
- width 5 offset 0
- terminator 3167..3625
- /note="SV40 transcription terminator; prevents
- read-through transcription of the expression cassette from
- sources upstream of 5XRE"
- /label=terminator
- /ApEinfo_fwdcolor=pink
- /ApEinfo_revcolor=pink
- /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
- width 5 offset 0
- gene 995..1855
- /gene="bla"
- /label=bla gene
- /ApEinfo_fwdcolor=pink
- /ApEinfo_revcolor=pink
- /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
- width 5 offset 0
- gene 3735..4259
- /gene="lacZalpha"
- /label=lacZalpha gene
- /ApEinfo_fwdcolor=pink
- /ApEinfo_revcolor=pink
- /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
- width 5 offset 0
- rep_origin 2517..3105
- /note="pUC19 origin of replication (clockwise) (RNAII -35
- to RNA/DNA switch point)"
- /label=rep origin
- /ApEinfo_fwdcolor=pink
- /ApEinfo_revcolor=pink
- /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
- width 5 offset 0
- rep_origin complement(1897..2406)
- /note="M13 origin of replication (+ -)"
- /label=M13 origin of replication (+ -) rep origin
- /ApEinfo_fwdcolor=pink
- /ApEinfo_revcolor=pink
- /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
- width 5 offset 0
- '''
- #----------------------------------------------------------------------
def getHeaders(lines):
    """Group the header lines of a record by their leading keyword.

    A line that starts with one of the known header keywords opens a new
    entry whose first element is the rest of that line, with the keyword and
    the whitespace after it removed. Any other line is treated as a
    continuation (e.g. a second JOURNAL or COMMENT line) and is appended,
    unmodified, to the entry of the most recently seen keyword.

    Args:
        lines: iterable of header lines (strings without line endings).

    Returns:
        dict mapping each keyword found (spelled exactly as in the keyword
        table, including any leading space) to a list of strings. If a
        keyword occurs more than once, only its last occurrence (plus that
        occurrence's continuation lines) is kept.
    """
    headers_keywords = (
        'LOCUS',
        'DEFINITION',
        'ACCESSION',
        'KEYWORDS',
        'SOURCE',
        ' ORGANISM',
        'REFERENCE',
        ' AUTHORS',
        ' TITLE',
        ' JOURNAL',
        'COMMENT',
    )
    # Use collections.OrderedDict here instead if header order matters and
    # the interpreter's dict does not preserve insertion order.
    headers = {}
    # Keyword of the entry that continuation lines should be attached to.
    last_keyword = None
    for line in lines:
        keyword = next(
            (kw for kw in headers_keywords if line.startswith(kw)), None)
        if keyword is not None:
            # Slice the keyword off by length. str.lstrip(kw) would treat kw
            # as a *set of characters* and could also eat the start of the
            # value (' AUTHORS Smith' -> 'mith').
            headers[keyword] = [line[len(keyword):].lstrip()]
            last_keyword = keyword
        elif last_keyword is not None:
            headers[last_keyword].append(line)
        # else: a stray line before the first keyword has no entry to
        # extend, so it is skipped instead of raising KeyError(None).
    return headers
- #----------------------------------------------------------------------
def getFeatures(lines):
    """Unfinished placeholder for parsing the FEATURES table.

    Only the table of expected feature keywords is set up so far; ``lines``
    is not read and the function implicitly returns None.
    """
    # One entry per feature line of the sample table (hence the repeated
    # 'gene' and 'rep_origin' entries). Not used yet.
    features_keywords = [
        'FEATURES',
        ' TATA_signal', ' polyA_signal', ' misc_feature',
        ' promoter', ' terminator',
        ' gene', ' gene',
        ' rep_origin', ' rep_origin',
    ]
# Break both sample sections into individual lines: the header section is
# everything up to FEATURES, the feature table is everything from it onward.
headers_lines = headers_string.splitlines()
features_lines = features_string.splitlines()

# Group the header lines by keyword.
headers = getHeaders(headers_lines)

# TODO: feature parsing was never finished; getFeatures() is still a stub,
# so `features` is None for now.
features = getFeatures(features_lines)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement