Advertisement
Guest User

Untitled

a guest
Apr 21st, 2015
217
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.50 KB | None | 0 0
  1. # -*- coding: UTF-8 -*-
  2.  
  3. # Script to rewrite MALLET output so that each row's weights are ordered
  4. # by topic rather than by weight.
  5.  
  6. # To use, run the following command:
  7. # $ python [path]/mallet-out-by-topic.py [path to mallet output file]
  8. # [(optional) delimiter character]
  9.  
  10. import csv
  11. from sys import argv
  12.  
  13. def mallet_out_by_topic(file_in):
  14.  
  15. delimiter = argv[2] if len(argv) > 2 else '\t'
  16. f_out = open(file_in[:-4] + '_by_topic.txt', 'a')
  17.  
  18. with open(file_in, 'rU') as data:
  19.  
  20. dialect = csv.Sniffer().sniff(data.read())
  21. data.seek(0)
  22. data_reader = csv.reader(data, dialect=dialect, delimiter=delimiter)
  23.  
  24.  
  25. topic_count = int((len(next(data_reader)) - 2) / 2)
  26.  
  27. while topic_count < 4:
  28.  
  29. topic_count = int((len(next(data_reader)) - 2) / 2)
  30.  
  31. data.seek(0)
  32.  
  33. header = "\t"
  34. for i in range(topic_count):
  35. header += "\t" + str(i)
  36. header += "\n"
  37. f_out.write(header)
  38.  
  39. for row in data_reader:
  40.  
  41. if len(row) > 4:
  42. curr_weights = [0] * topic_count
  43. line_out = row.pop(0)
  44. line_out += '\t' + row.pop(0)
  45.  
  46. for i in range (len(row) - 2):
  47. if not i % 2:
  48. curr_weights[int(row[i])] = row[i+1]
  49. for weight in curr_weights:
  50. line_out += "\t" + str(weight)
  51. line_out += "\n"
  52. f_out.write(line_out)
  53.  
  54. f_out.close()
  55.  
  56.  
  57. if __name__ == '__main__':
  58.  
  59. mallet_out_by_topic(argv[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement