Guest User

Untitled

a guest
Apr 19th, 2018
114
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.57 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. import psycopg2
  4. import  sys
  5.  
  6. __author__ = 'pedro'
  7.  
  8. def split_keywords(keywords):
  9.     keywords = keywords.rstrip(".").rstrip();
  10.     keywords = keywords.replace("\n", ";")
  11.     keywords = keywords.replace(",", ";")
  12.     for separator in [';' ,' ']:
  13.         if keywords.find(separator) > -1:
  14.             return [word.strip().rstrip(".") for word in keywords.split(separator) if word.strip() != ""]
  15.  
  16.     return [keywords]
  17.  
  18. # connection:
  19. conn_string = "host='localhost' dbname='sinbiota_legacy' user='pedro' password='pedro'"
  20. try:
  21.     conn = psycopg2.connect(conn_string)
  22. except Exception:
  23.     exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
  24.     sys.exit("Database connection failed!\n ->%s" % exceptionValue)
  25.  
  26. cursor = conn.cursor()
  27. cursor.execute("select identificador, palavras_chave from descritivo")
  28.  
  29. current_id = 1
  30. keyword_dict = dict()
  31. occurrence_dict = dict()
  32.  
  33. # Parse all keywords, testing for repeated keywords:
  34. while True:
  35.     row = cursor.fetchone()
  36.     if row is None: break
  37.     occurrence_id, keywords = row
  38.  
  39.     # some occurrences don't have keywords:
  40.     if keywords == "":
  41.         continue
  42.  
  43.     # split the keywords into a list
  44.     keyword_list = split_keywords(keywords)
  45.  
  46.     # verify if keywords already exist, assigning ids along the process:
  47.     for word in keyword_list:
  48.         if not keyword_dict.has_key(word):
  49.             keyword_dict[word] = current_id
  50.             current_id += 1
  51.  
  52.     # build the list of keyword ids of this occurrence and save it:
  53.     occurrence_dict[occurrence_id] = [keyword_dict[word] for word in keyword_list]
  54.  
  55. # Now we write the output; choose one of the formats below:
  56. OUTPUT = "dump"
  57. #OUTPUT = "insert"
  58. # keywords:
  59. keyword_file = open("Keyword.data",'w')
  60. for keyword, id in keyword_dict.iteritems():
  61.     if OUTPUT == "dump":
  62.         keyword_file.write("%d|1|%s\n" % (id,keyword))
  63.     elif OUTPUT == "insert":
  64.         keyword_file.write("INSERT INTO Keyword (Id, Version, Name) VALUES (%d, 1, '%s')\n" % (id,keyword))
  65. keyword_file.close()
  66.  
  67. # occurrences:
  68. occurrence_file = open("KeywordToOccurrence.data",'w')
  69. for occurrence_id, keyword_id_list in  occurrence_dict.iteritems():
  70.     for keyword_id in keyword_id_list:
  71.         if OUTPUT == "dump":
  72.             occurrence_file.write("%d|%d\n" % (occurrence_id, keyword_id))
  73.         elif OUTPUT == "insert":
  74.             occurrence_file.write("INSERT INTO KeywordToOccurrence (Occurrence_id, Keyword_id) VALUES (%d, %d)\n" % (occurrence_id, keyword_id))
  75. occurrence_file.close()
Add Comment
Please, Sign In to add comment