# Untitled

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import psycopg2
import  sys

__author__ = 'pedro'

def split_keywords(keywords):
    """Split a raw keyword string into a list of clean keywords.

    Newlines and commas are treated as semicolons.  If the text contains a
    semicolon it is split on semicolons (so keywords may contain spaces);
    otherwise, if it contains a space, it is split on spaces.  Each keyword
    is stripped of surrounding whitespace and trailing periods, and empty
    results are discarded.

    keywords -- the raw text; may be None or empty.

    Returns a list of keyword strings, empty when there is nothing usable
    in the input.
    """
    if not keywords:
        return []

    # Strip whitespace *before* the trailing period so that input such as
    # "foo. " or "foo.\n" still loses its final period.
    keywords = keywords.strip().rstrip(".").rstrip()
    keywords = keywords.replace("\n", ";").replace(",", ";")

    # ';' takes precedence over ' ' so multi-word keywords survive whenever
    # an explicit separator is present.
    for separator in (";", " "):
        if separator in keywords:
            cleaned = (word.strip().rstrip(".").rstrip()
                       for word in keywords.split(separator))
            # Filter after cleaning so a token made only of periods or
            # whitespace does not end up as an empty keyword.
            return [word for word in cleaned if word]

    return [keywords] if keywords else []

# Open the database connection, aborting the script with a readable message
# if it cannot be established.
# NOTE(review): the credentials are hard-coded in this string; consider
# reading them from the environment or a config file.
conn_string = "host='localhost' dbname='sinbiota_legacy' user='pedro' password='pedro'"
try:
    conn = psycopg2.connect(conn_string)
except Exception as connect_error:
    sys.exit("Database connection failed!\n ->%s" % connect_error)

# Select every row's identifier together with its raw keyword text.
cursor = conn.cursor()
cursor.execute("select identificador, palavras_chave from descritivo")

current_id = 1            # next keyword id to hand out
keyword_dict = dict()     # keyword text -> keyword id
occurrence_dict = dict()  # occurrence id -> list of keyword ids

# Parse all keywords, assigning one id per distinct keyword.
# psycopg2 cursors are iterable, yielding one row tuple at a time.
for occurrence_id, keywords in cursor:
    # Some occurrences don't have keywords: the column may be NULL (None),
    # empty, or contain only whitespace.
    if keywords is None or keywords.strip() == "":
        continue

    # split the keywords into a list
    keyword_list = split_keywords(keywords)

    # assign a fresh id to every keyword not seen before:
    for word in keyword_list:
        if word not in keyword_dict:
            keyword_dict[word] = current_id
            current_id += 1

    # build the list of keyword ids of this occurrence and save it.
    # NOTE(review): a keyword repeated within one occurrence yields a repeated
    # id here (and thus a repeated output row) - confirm whether that is wanted.
    occurrence_dict[occurrence_id] = [keyword_dict[word] for word in keyword_list]

# Now we write the output; choose one of the formats below:
#   "dump"   -> pipe-separated rows
#   "insert" -> one SQL INSERT statement per row
OUTPUT = "dump"
#OUTPUT = "insert"

# keywords: one row per distinct keyword (id, version fixed at 1, name).
# The 'with' block guarantees the file is closed even if a write fails.
with open("Keyword.data", 'w') as keyword_file:
    for keyword, keyword_id in keyword_dict.items():
        if OUTPUT == "dump":
            keyword_file.write("%d|1|%s\n" % (keyword_id, keyword))
        elif OUTPUT == "insert":
            # Double any single quote so the keyword stays a valid SQL
            # string literal (e.g. "d'Orbigny" -> 'd''Orbigny').
            escaped_keyword = keyword.replace("'", "''")
            keyword_file.write("INSERT INTO Keyword (Id, Version, Name) VALUES (%d, 1, '%s')\n" % (keyword_id, escaped_keyword))

# occurrences: one row per (occurrence, keyword) association.
with open("KeywordToOccurrence.data", 'w') as occurrence_file:
    for occurrence_id, keyword_id_list in occurrence_dict.items():
        for keyword_id in keyword_id_list:
            if OUTPUT == "dump":
                occurrence_file.write("%d|%d\n" % (occurrence_id, keyword_id))
            elif OUTPUT == "insert":
                occurrence_file.write("INSERT INTO KeywordToOccurrence (Occurrence_id, Keyword_id) VALUES (%d, %d)\n" % (occurrence_id, keyword_id))