Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
# Paired input files: the i-th name under 'first' is compared against the
# i-th name under 'second'.
FILENAMES = {
    'first': ['shuffled_review_with_tag_{0}'.format(i) for i in range(8)],
    'second': ['shuffled_review_without_tag_{0}'.format(i) for i in range(8)],
}

# Eight categories plus one more for "neutral".
NUMBER_OF_CATEGORIES = 9
def read_entry(input_file):
    """Read one three-line entry (key, content, tag) from *input_file*.

    Each line is stripped of surrounding whitespace.  If the key line is
    empty after stripping (end of file or a blank line), nothing further is
    read and ``None`` is returned.  Otherwise the result is a tuple
    ``(key, content, tag)`` where ``tag`` is the tag line converted with
    ``int``, or ``-1`` when the tag line is blank.
    """
    entry_key = input_file.readline().strip()
    if not entry_key:
        return None

    text = input_file.readline().strip()
    raw_tag = input_file.readline().strip()
    # A blank tag line is represented by -1.
    label = int(raw_tag) if raw_tag else -1
    return (entry_key, text, label)
def compare_entries(first_file, second_file):
    """Read two entry files in lockstep and tabulate how their tags pair up.

    Entries are pulled from both files with ``read_entry`` until both are
    exhausted at the same time.

    Returns a tuple ``(counts, not_matched_entries)``:

    * ``counts`` is a NUMBER_OF_CATEGORIES x NUMBER_OF_CATEGORIES matrix
      where ``counts[a][b]`` is the number of entries tagged ``a`` in
      *first_file* and ``b`` in *second_file*.
    * ``not_matched_entries`` lists the ``(first_entry, second_entry)``
      pairs whose tags differ.

    Terminates the process through ``sys.exit`` when one file runs out of
    entries before the other, or when the keys of a pair differ (the two
    keys are printed first).
    """
    counts = [[0] * NUMBER_OF_CATEGORIES for _ in range(NUMBER_OF_CATEGORIES)]
    not_matched_entries = []

    while True:
        first_entry = read_entry(first_file)
        second_entry = read_entry(second_file)

        if first_entry is None or second_entry is None:
            if first_entry is None and second_entry is None:
                break
            sys.exit('size of entry is not matched')

        if first_entry[0] != second_entry[0]:
            # Call form of print: valid on both Python 2 and Python 3.
            print('key1: {0}\nkey2: {1}'.format(first_entry[0], second_entry[0]))
            sys.exit('key is not matched')

        # read_entry yields -1 for a blank tag; as a list index that selects
        # the last row/column (presumably the "neutral" slot -- confirm).
        counts[first_entry[2]][second_entry[2]] += 1
        if first_entry[2] != second_entry[2]:
            not_matched_entries.append((first_entry, second_entry))

    return (counts, not_matched_entries)
def calculate_kappa(counts):
    """Return Cohen's kappa for a square agreement matrix.

    ``counts[i][j]`` is the number of items placed in category ``i`` by the
    first annotator and in category ``j`` by the second.

    kappa = (p_a - p_e) / (1 - p_e), where ``p_a`` is the observed agreement
    (the diagonal mass) and ``p_e`` is the agreement expected by chance, i.e.
    the sum over categories of (row marginal * column marginal).

    Raises ``ZeroDivisionError`` when the matrix holds no observations, or
    when chance agreement is exactly 1 (every item in one single cell), in
    which case kappa is undefined.
    """
    size = len(counts)
    total = sum(sum(row) for row in counts)
    # Joint proportions; list comprehensions keep this indexable on Python 3.
    p = [[float(cell) / total for cell in row] for row in counts]

    p_a = sum(p[k][k] for k in range(size))

    # Chance agreement uses each annotator's marginal distribution, not the
    # element-wise products p[i][j] * p[j][i].
    row_marginals = [sum(row) for row in p]
    col_marginals = [sum(p[i][k] for i in range(size)) for k in range(size)]
    p_e = sum(r * c for r, c in zip(row_marginals, col_marginals))

    return (p_a - p_e) / (1.0 - p_e)
# Compare each file listed under 'first' with its counterpart under 'second'
# exactly once and print the kappa of the pair.  (There was previously an
# enclosing `while True:` with no exit, which repeated the whole run forever.)
for first_filename, second_filename in zip(FILENAMES['first'], FILENAMES['second']):
    with open(first_filename) as first_file:
        with open(second_filename) as second_file:
            # The list of mismatching entries is not used here.
            counts, _ = compare_entries(first_file, second_file)
    kappa = calculate_kappa(counts)
    print(kappa)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement