Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# A script to calculate Frisch/Broe/Pierrehumbert similarity, based on a feature file
import sys
import re
import os.path
from FeatureFileTools import cleanup_line

# Input: a tab-separated feature file -- a header row of feature names,
# then one row per segment (segment label followed by +/- values).
feature_filename = 'VowelsRound.txt'

# Output files share the feature file's basename: .log for the natural
# classes, .stb for the pairwise similarity table.
filename_prefix = os.path.splitext(feature_filename)[0]
log_filename = filename_prefix + '.log'
log_file = open(log_filename, 'w')
similarity_table_filename = filename_prefix + '.stb'
similarity_table_file = open(similarity_table_filename, 'w')

# How a natural class should be described: with the shortest/most economic
# description, or with the longest/most complete description.
# For example, in a five vowel system a,e,i,o,u
#   Minimal: o,u = +rd
#   Maximal: o,u = +bk,-lo,+rd
# The code below does not make use of this setting, so it's up to you to
# implement or ignore it. Uncomment the one you want to use.
specification = 'minimal'
#specification = 'maximal'

print("Feature file: %s" % feature_filename)
# Read the whole file at once, since the first line (header) is handled
# separately from the remaining lines. A context manager guarantees the
# file is closed even if a later line raises.
with open(feature_filename, 'r') as feature_file:
    feature_lines = feature_file.readlines()

# The first line contains the feature names. Strip any leading whitespace:
# headers often begin without tabs, but just in case they do have them.
firstline = cleanup_line(feature_lines[0])
firstline = re.sub(r'^\s+', '', firstline)
# Break up the header row into a list of feature names
features = firstline.split('\t')
number_of_features = len(features)

# feature_matrix maps each segment to its list of feature values;
# segments keeps the segment labels in file order, for convenience.
feature_matrix = {}
segments = []
# Go through all the lines after the first (header) line
for line in feature_lines[1:]:
    line = cleanup_line(line)
    # The first field is the segment label; the rest are its feature values.
    seg, *values = line.split("\t")
    # Skip lines with a blank segment
    if seg == '':
        continue
    print('Segment %s: %s' % (seg, ','.join(values)))
    # NOTE(review): digraph handling (as in the perl version) is not
    # implemented here -- it would need a .trans file, checking every
    # segment against it, inventing single-character stand-ins, etc.
    # Ignore blank lines; process only lines with the right number of values.
    if len(values) == number_of_features:
        # If the first value is nothing but digits (and is not simply '0'),
        # or is completely empty, it's a code column, not a feature value.
        # (Assumes no scalar features -- this script can't handle those anyway.)
        # BUG FIX: the original compared the *string* values[0] to the *int* 0,
        # which is always unequal; compare against the string '0' instead.
        if (re.match(r'^\d*$', values[0]) and values[0] != '0') or values[0] == "":
            del values[0]
        # Store the segments in a list of segments, for convenience
        segments.append(seg)
        # And remember the feature values
        feature_matrix[seg] = values
    else:
        # BUG FIX: the original message used invalid string escapes \( \)
        # which printed literal backslashes.
        print('Warning! segment %s has an incorrect number of features. (%s instead of %s)' % (seg, len(values), number_of_features))
# Something we could do here: print the feature matrix to the log file,
# to double-check that it's been read correctly.
##########################################################
##### Now here's where the core of the script belongs
##### The strategy we discussed in the lab has two parts:
#####   1. Find all of the natural classes that can be described using the features
#####   2. For all pairs of segments, figure out which natural classes they both
#####      occur in (shared), and which contain one segment but not the other (unshared)
##########################################################
# Given:
#   features       -- a list containing the names of the features
#   segments       -- a list containing all of the segments
#   feature_matrix -- a dictionary mapping each segment to its value list
# build some custom data structures.
#
# my_matrix maps each segment to a list of (value, feature) pairs,
# e.g. 'u' -> [('+', 'round'), ...], so a single feature specification
# can be tested with a simple membership check.
my_matrix = {}
for segment in segments:
    my_matrix[segment] = [(feature_matrix[segment][i], feature)
                          for i, feature in enumerate(features)]

# feature_values lists every possible single feature specification:
# ('+', f) followed by ('-', f), for each feature f in order.
feature_values = [(sign, feature) for feature in features for sign in '+-']
def get_feature_name(x):
    """Render a feature specification pair as a string, e.g. ('+', 'high') -> '+high'."""
    value, feature = x[0], x[1]
    return value + feature
# 1. Find all the natural classes
# This could be done by going through all logically possible combinations of
# feature values, or it could be done by starting with the natural classes
# described by a single feature value ([+high], [-high], etc.) and then
# combining (=intersecting) them. Here: the inefficient powerset method.
# Explicit imports instead of the original wildcard import -- only these
# two itertools names are used anywhere in this script.
from itertools import chain, combinations

def powerset(iterable):
    """Yield every subset of *iterable* as a tuple, from () up to the full set.

    Recipe from the itertools documentation page.
    """
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))
# Enumerate every subset of feature specifications and record the natural
# class (set of segments, as a sorted string) each one picks out.
# PERF FIX: the original built the entire powerset into a list first; with
# F features that's 2**(2*F) tuples held in memory at once. Iterating the
# generator lazily produces identical results one combo at a time.
natural_classes = {}

def find_segment_class(feature_set):
    """Return the segments matching every (value, feature) pair in *feature_set*,
    sorted and joined into one string (e.g. 'ou')."""
    matching = [segment for segment in segments
                if all(spec in my_matrix[segment] for spec in feature_set)]
    return ''.join(sorted(matching))

for combo in powerset(feature_values):
    seg_class = find_segment_class(combo)
    # Skip combinations that no segment satisfies.
    if not seg_class:
        continue
    # Several feature combinations can describe the same class; keep the one
    # with the fewest features (the most economical description).
    existing_set = natural_classes.get(seg_class)
    if existing_set is None or len(combo) < len(existing_set):
        natural_classes[seg_class] = combo

# Write the natural classes to the log file, sorted by the number of
# features that describe each class.
log_file.write("Natural classes: \n")
for nc, feature_set in sorted(natural_classes.items(), key=lambda kv: len(kv[1])):
    log_file.write(nc + "\t" + ", ".join(get_feature_name(f) for f in feature_set) + "\n")
# 2. Now go through all pairs of segments (including identical pairs -- that
# is, comparing a segment with itself. The similarity for those pairs should
# always come out to 1, if it's calculated right).
def find_similarity_info(s1, s2, classes=None):
    """Compute Frisch/Broe/Pierrehumbert similarity for segments *s1* and *s2*.

    Counts the natural classes containing both segments (shared) and those
    containing exactly one of them (unshared), and returns the tuple
    (shared, shared + unshared, similarity) with similarity =
    shared / (shared + unshared), rounded to 4 decimal places.

    *classes* defaults to the module-level natural_classes mapping; any
    iterable of class strings works. A natural class is the string of its
    member segments, so membership is a substring test -- NOTE(review):
    this assumes single-character segment labels.
    """
    if classes is None:
        classes = natural_classes
    shared = 0
    unshared = 0
    for nc in classes:
        in_first = s1 in nc
        in_second = s2 in nc
        if in_first and in_second:
            shared += 1
        elif in_first != in_second:
            # exactly one of the two segments is in this class
            unshared += 1
    total = shared + unshared
    if total == 0:
        # Neither segment occurs in any class; avoid ZeroDivisionError.
        return (0, 0, 0.0)
    return (shared, total, round(shared / total, 4))
# Compute similarity for every ordered pair of segments (a segment paired
# with itself included -- those rows should show similarity 1).
similarities = []
for first in segments:
    for second in segments:
        similarities.append((first, second, find_similarity_info(first, second)))

# Write the similarity table file: a header row, then one row per pair.
similarity_table_file.write("Seg1\tSeg2\t#Shared\tTotal\tSimilarity\n")
for first, second, info in similarities:
    row = [first, second] + [str(n) for n in info]
    similarity_table_file.write("\t".join(row) + "\n")

log_file.close()
similarity_table_file.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement