Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# A script to calculate Frisch/Broe/Pierrehumbert similarity, based on a feature file
import sys
import re
import os.path
from FeatureFileTools import cleanup_line

# Input: a tab-separated feature file -- a header row of feature names,
# then one row per segment (segment label followed by +/- values).
feature_filename = 'VowelsRound.txt'

# Output files share the feature file's basename: .log for the natural
# classes, .stb for the pairwise similarity table.
filename_prefix = os.path.splitext(feature_filename)[0]
log_filename = filename_prefix + '.log'
log_file = open(log_filename, 'w')
similarity_table_filename = filename_prefix + '.stb'
similarity_table_file = open(similarity_table_filename, 'w')

# How a natural class should be described: with the shortest/most economic
# description, or with the longest/most complete description.
# For example, in a five vowel system a,e,i,o,u
#   Minimal: o,u = +rd
#   Maximal: o,u = +bk,-lo,+rd
# The code below does not make use of this setting, so it's up to you to
# implement or ignore it. Uncomment the one you want to use.
specification = 'minimal'
#specification = 'maximal'

print("Feature file: %s" % feature_filename)
# Read the whole file at once, since the first line (header) is handled
# separately from the remaining lines. A context manager guarantees the
# file is closed even if a later line raises.
with open(feature_filename, 'r') as feature_file:
    feature_lines = feature_file.readlines()

# The first line contains the feature names. Strip any leading whitespace:
# headers often begin without tabs, but just in case they do have them.
firstline = cleanup_line(feature_lines[0])
firstline = re.sub(r'^\s+', '', firstline)
# Break up the header row into a list of feature names
features = firstline.split('\t')
number_of_features = len(features)

# feature_matrix maps each segment to its list of feature values;
# segments keeps the segment labels in file order, for convenience.
feature_matrix = {}
segments = []
# Go through all the lines after the first (header) line
for line in feature_lines[1:]:
    line = cleanup_line(line)
    # The first field is the segment label; the rest are its feature values.
    seg, *values = line.split("\t")
    # Skip lines with a blank segment
    if seg == '':
        continue
    print('Segment %s: %s' % (seg, ','.join(values)))
    # NOTE(review): digraph handling (as in the perl version) is not
    # implemented here -- it would need a .trans file, checking every
    # segment against it, inventing single-character stand-ins, etc.
    # Ignore blank lines; process only lines with the right number of values.
    if len(values) == number_of_features:
        # If the first value is nothing but digits (and is not simply '0'),
        # or is completely empty, it's a code column, not a feature value.
        # (Assumes no scalar features -- this script can't handle those anyway.)
        # BUG FIX: the original compared the *string* values[0] to the *int* 0,
        # which is always unequal; compare against the string '0' instead.
        if (re.match(r'^\d*$', values[0]) and values[0] != '0') or values[0] == "":
            del values[0]
        # Store the segments in a list of segments, for convenience
        segments.append(seg)
        # And remember the feature values
        feature_matrix[seg] = values
    else:
        # BUG FIX: the original message used invalid string escapes \( \)
        # which printed literal backslashes.
        print('Warning! segment %s has an incorrect number of features. (%s instead of %s)' % (seg, len(values), number_of_features))
# Something we could do here: print the feature matrix to the log file,
# to double-check that it's been read correctly.
##########################################################
##### Now here's where the core of the script belongs
##### The strategy we discussed in the lab has two parts:
#####   1. Find all of the natural classes that can be described using the features
#####   2. For all pairs of segments, figure out which natural classes they both
#####      occur in (shared), and which contain one segment but not the other (unshared)
##########################################################
# Given:
#   features       -- a list containing the names of the features
#   segments       -- a list containing all of the segments
#   feature_matrix -- a dictionary mapping each segment to its value list
# build some custom data structures.
#
# my_matrix maps each segment to a list of (value, feature) pairs,
# e.g. 'u' -> [('+', 'round'), ...], so a single feature specification
# can be tested with a simple membership check.
my_matrix = {}
for segment in segments:
    my_matrix[segment] = [(feature_matrix[segment][i], feature)
                          for i, feature in enumerate(features)]

# feature_values lists every possible single feature specification:
# ('+', f) followed by ('-', f), for each feature f in order.
feature_values = [(sign, feature) for feature in features for sign in '+-']
def get_feature_name(x):
    """Render a feature specification pair as a string, e.g. ('+', 'high') -> '+high'."""
    value, feature = x[0], x[1]
    return value + feature
# 1. Find all the natural classes
# This could be done by going through all logically possible combinations of
# feature values, or it could be done by starting with the natural classes
# described by a single feature value ([+high], [-high], etc.) and then
# combining (=intersecting) them. Here: the inefficient powerset method.
# Explicit imports instead of the original wildcard import -- only these
# two itertools names are used anywhere in this script.
from itertools import chain, combinations

def powerset(iterable):
    """Yield every subset of *iterable* as a tuple, from () up to the full set.

    Recipe from the itertools documentation page.
    """
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))
# Enumerate every subset of feature specifications and record the natural
# class (set of segments, as a sorted string) each one picks out.
# PERF FIX: the original built the entire powerset into a list first; with
# F features that's 2**(2*F) tuples held in memory at once. Iterating the
# generator lazily produces identical results one combo at a time.
natural_classes = {}

def find_segment_class(feature_set):
    """Return the segments matching every (value, feature) pair in *feature_set*,
    sorted and joined into one string (e.g. 'ou')."""
    matching = [segment for segment in segments
                if all(spec in my_matrix[segment] for spec in feature_set)]
    return ''.join(sorted(matching))

for combo in powerset(feature_values):
    seg_class = find_segment_class(combo)
    # Skip combinations that no segment satisfies.
    if not seg_class:
        continue
    # Several feature combinations can describe the same class; keep the one
    # with the fewest features (the most economical description).
    existing_set = natural_classes.get(seg_class)
    if existing_set is None or len(combo) < len(existing_set):
        natural_classes[seg_class] = combo

# Write the natural classes to the log file, sorted by the number of
# features that describe each class.
log_file.write("Natural classes: \n")
for nc, feature_set in sorted(natural_classes.items(), key=lambda kv: len(kv[1])):
    log_file.write(nc + "\t" + ", ".join(get_feature_name(f) for f in feature_set) + "\n")
# 2. Now go through all pairs of segments (including identical pairs -- that
# is, comparing a segment with itself. The similarity for those pairs should
# always come out to 1, if it's calculated right).
def find_similarity_info(s1, s2, classes=None):
    """Compute Frisch/Broe/Pierrehumbert similarity for segments *s1* and *s2*.

    Counts the natural classes containing both segments (shared) and those
    containing exactly one of them (unshared), and returns the tuple
    (shared, shared + unshared, similarity) with similarity =
    shared / (shared + unshared), rounded to 4 decimal places.

    *classes* defaults to the module-level natural_classes mapping; any
    iterable of class strings works. A natural class is the string of its
    member segments, so membership is a substring test -- NOTE(review):
    this assumes single-character segment labels.
    """
    if classes is None:
        classes = natural_classes
    shared = 0
    unshared = 0
    for nc in classes:
        in_first = s1 in nc
        in_second = s2 in nc
        if in_first and in_second:
            shared += 1
        elif in_first != in_second:
            # exactly one of the two segments is in this class
            unshared += 1
    total = shared + unshared
    if total == 0:
        # Neither segment occurs in any class; avoid ZeroDivisionError.
        return (0, 0, 0.0)
    return (shared, total, round(shared / total, 4))
# Compute similarity for every ordered pair of segments (a segment paired
# with itself included -- those rows should show similarity 1).
similarities = []
for first in segments:
    for second in segments:
        similarities.append((first, second, find_similarity_info(first, second)))

# Write the similarity table file: a header row, then one row per pair.
similarity_table_file.write("Seg1\tSeg2\t#Shared\tTotal\tSimilarity\n")
for first, second, info in similarities:
    row = [first, second] + [str(n) for n in info]
    similarity_table_file.write("\t".join(row) + "\n")

log_file.close()
similarity_table_file.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement