Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: UTF-8 -*-
# Script that rewrites MALLET output so that each row's weights are
# ordered by topic number rather than by weight.
# Usage:
#   $ python [path]/mallet-out-by-topic.py [path to mallet output file]
#       [(optional) delimiter character]
- import csv
- from sys import argv
def mallet_out_by_topic(file_in, delimiter=None):
    """Rewrite a MALLET output file so weights are ordered by topic number.

    Each data row of ``file_in`` is read as two leading columns followed by
    alternating ``topic index, weight`` columns.  The two leading columns are
    copied through unchanged and the weights are re-emitted in topic order:
    output column ``k`` holds the weight of topic ``k``, and topics that do
    not appear in a row get ``0``.  The result is written tab-separated,
    preceded by a header row of topic numbers, to
    ``<file_in without its extension>_by_topic.txt``.

    Rows with four or fewer columns (for example a header line) are skipped.

    Args:
        file_in: Path of the MALLET output file to convert.
        delimiter: Single-character column separator used in ``file_in``.
            When omitted, the second command-line argument is used if
            present, otherwise a tab.

    Raises:
        ValueError: If no row with at least four topic/weight pairs is found,
            so the number of topics cannot be determined.
    """
    import os  # local import so this block does not depend on file-level edits

    if delimiter is None:
        delimiter = argv[2] if len(argv) > 2 else '\t'

    # The first row describing at least this many topics fixes the topic count.
    min_topics = 4
    out_path = os.path.splitext(file_in)[0] + '_by_topic.txt'

    # Plain 'r' already gives universal newlines on Python 3; the old 'rU'
    # mode is rejected by Python 3.11+.
    with open(file_in, 'r') as data:
        # The sniffed dialect only supplies quoting conventions, since the
        # delimiter is always overridden below; fall back to the default
        # dialect when sniffing cannot make sense of the file.
        try:
            dialect = csv.Sniffer().sniff(data.read())
        except csv.Error:
            dialect = csv.excel
        data.seek(0)

        # First pass: find the number of topics.
        topic_count = None
        for row in csv.reader(data, dialect=dialect, delimiter=delimiter):
            candidate = (len(row) - 2) // 2
            if candidate >= min_topics:
                topic_count = candidate
                break
        if topic_count is None:
            raise ValueError(
                'could not determine the number of topics in %r: no row has '
                'at least %d topic/weight pairs' % (file_in, min_topics)
            )
        data.seek(0)

        header = '\t' + ''.join('\t' + str(i) for i in range(topic_count)) + '\n'

        # NOTE(review): append mode is kept from the original, so re-running
        # on the same input adds a second header and table to the output
        # file -- confirm whether 'w' was intended.
        with open(out_path, 'a') as f_out:
            f_out.write(header)
            # Second pass: a fresh reader over the rewound file.
            for row in csv.reader(data, dialect=dialect, delimiter=delimiter):
                if len(row) <= 4:
                    continue
                pairs = row[2:]
                weights = [0] * topic_count
                # Step through (topic, weight) pairs.  Stopping at
                # len(pairs) - 1 keeps the final pair when the row has no
                # trailing empty field and ignores that field when it does.
                for idx in range(0, len(pairs) - 1, 2):
                    weights[int(pairs[idx])] = pairs[idx + 1]
                fields = [row[0], row[1]] + [str(w) for w in weights]
                f_out.write('\t'.join(fields) + '\n')
if __name__ == '__main__':
    # Exit with a usage hint instead of an IndexError when the required
    # input path is missing from the command line.
    if len(argv) < 2:
        raise SystemExit(
            'usage: python mallet-out-by-topic.py '
            '<path to mallet output file> [delimiter]'
        )
    mallet_out_by_topic(argv[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement