Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import psycopg2
- import sys
- __author__ = 'pedro'
def split_keywords(keywords):
    """Split a free-text keyword field into a list of clean keywords.

    Newlines and commas are treated as equivalent to ``;``.  If the text
    contains a ``;`` it is split on ``;``; otherwise, if it contains a space,
    it is split on spaces; otherwise the whole text is a single keyword.
    Surrounding whitespace and trailing periods are removed from every
    keyword, and keywords that end up empty are dropped.

    Args:
        keywords: Raw keyword text.  ``None`` is treated as empty text.

    Returns:
        A list of keyword strings in their original order (duplicates are
        kept).  The list is empty when the text holds no keyword.
    """
    if keywords is None:
        return []

    # Trim trailing whitespace *before* the trailing period so that input
    # such as "foo. " or "foo.\n" loses its period too.  Only the right side
    # is trimmed here so the choice of separator below is not affected by
    # leading newlines or commas.
    text = keywords.rstrip().rstrip(".").rstrip()
    text = text.replace("\n", ";").replace(",", ";")

    # The first separator present wins: ';' and only then a plain space.
    separator = next((sep for sep in (";", " ") if sep in text), None)
    pieces = [text] if separator is None else text.split(separator)

    cleaned = (piece.strip().rstrip(".").rstrip() for piece in pieces)
    # Drop pieces that were blank or consisted only of periods.
    return [word for word in cleaned if word]
# Open the connection to the legacy database and create the cursor used by
# the rest of the script.
# NOTE(review): the credentials are hard-coded in the connection string;
# consider moving them to the environment or a config file.
conn_string = "host='localhost' dbname='sinbiota_legacy' user='pedro' password='pedro'"
try:
    conn = psycopg2.connect(conn_string)
except psycopg2.Error as error:
    # Exit with a readable message instead of a traceback.  Only database
    # errors are caught so that programming errors still surface normally.
    sys.exit("Database connection failed!\n ->%s" % error)
cursor = conn.cursor()
cursor.execute("select identificador, palavras_chave from descritivo")
current_id = 1
# keyword text -> numeric keyword id, assigned in order of first appearance
keyword_dict = dict()
# occurrence id -> list of keyword ids attached to that occurrence
occurrence_dict = dict()
# Parse all keywords, testing for repeated keywords:
try:
    for occurrence_id, keywords in cursor:
        # Some occurrences don't have keywords (empty string or NULL).
        if not keywords:
            continue
        keyword_ids = []
        for word in split_keywords(keywords):
            # Never assign an id to an empty keyword.
            if not word:
                continue
            # Assign a fresh id the first time a keyword is seen.
            if word not in keyword_dict:
                keyword_dict[word] = current_id
                current_id += 1
            keyword_id = keyword_dict[word]
            # A keyword repeated within one occurrence is linked only once,
            # so the output never contains duplicate pairs.
            if keyword_id not in keyword_ids:
                keyword_ids.append(keyword_id)
        # Save the keyword ids of this occurrence.
        occurrence_dict[occurrence_id] = keyword_ids
finally:
    # Nothing after this point reads from the database.
    cursor.close()
    conn.close()
# Now we write the output.  Output format: "dump" writes pipe-delimited rows,
# "insert" writes SQL INSERT statements; any other value writes empty files.
OUTPUT = "dump"
# keywords (written in ascending id order so the file is reproducible):
with open("Keyword.data", 'w') as keyword_file:
    for keyword, keyword_id in sorted(keyword_dict.items(), key=lambda item: item[1]):
        if OUTPUT == "dump":
            keyword_file.write("%d|1|%s\n" % (keyword_id, keyword))
        elif OUTPUT == "insert":
            # Double any single quote so the keyword stays a valid SQL
            # string literal (e.g. "d'agua").
            escaped_keyword = keyword.replace("'", "''")
            keyword_file.write("INSERT INTO Keyword (Id, Version, Name) VALUES (%d, 1, '%s')\n" % (keyword_id, escaped_keyword))
# occurrences (written in ascending occurrence id order):
with open("KeywordToOccurrence.data", 'w') as occurrence_file:
    for occurrence_id in sorted(occurrence_dict):
        for keyword_id in occurrence_dict[occurrence_id]:
            if OUTPUT == "dump":
                occurrence_file.write("%d|%d\n" % (occurrence_id, keyword_id))
            elif OUTPUT == "insert":
                occurrence_file.write("INSERT INTO KeywordToOccurrence (Occurrence_id, Keyword_id) VALUES (%d, %d)\n" % (occurrence_id, keyword_id))
Add Comment
Please, Sign In to add comment