# Source: Pastebin paste "Untitled", posted by a guest on Mar 24th, 2019.
# (Pastebin page header - share buttons and sign-up banner - omitted.)
  1. # A script to calculate Frisch/Broe/Pierrehumbert similarity, based on a feature file
  2. import sys
  3. import re
  4. import os.path
  5. from FeatureFileTools import cleanup_line
  6.  
  7. feature_filename = 'VowelsRound.txt'
  8. feature_file = open(feature_filename, 'r')
  9.  
  10. # Now also the output files for the similarity table, natural classes, and similarity matrix.
  11. # First, find the "prefix" of the feature filename
  12. filename_prefix = re.sub(r'\.[^\.]*$', '', feature_filename)
  13. log_filename = filename_prefix + '.log'
  14. log_file = open(log_filename, 'w')
  15.  
  16. similarity_table_filename = filename_prefix + '.stb'
  17. similarity_table_file = open(similarity_table_filename, 'w')
  18.  
  19. # The following bit of code can set how a natural class should be described: with the shortest/most economic description, or with the longest/most complete description.
  20. # For example, in a five vowel system a,e,i,o,u
  21. #   Minimal: o,u = +rd
  22. #   Maximal: o,u = +bk,-lo,+rd
  23.  
  24. # The code below does not make use of this setting, so it's up to you to implement or ignore it.
  25. # Uncomment the one you want to use.
  26. specification = 'minimal'
  27. #specification = 'maximal'
  28.  
  29. # Now we'll read the feature file
  30. print ("Feature file: %s" % feature_filename)
  31. # We'll read the whole file at once, since we want to look at the first line (header) separately from the remaining lines.
  32. feature_lines = feature_file.readlines()
  33.  
  34. # The first line contains the feature names
  35. firstline = cleanup_line(feature_lines[0])
  36. # also for some reason these often begin without tabs, but just in case they do have tabs
  37. firstline = re.sub(r'^\s+', '', firstline)
  38.  
  39. # Break up the header row into a list of feature names
  40. features = firstline.split('\t')
  41. number_of_features = len(features)
  42.  
  43. # Now the rest of the file, which contains the feature values
  44. # We need to remember the feature values, and also it's handy to have a list of segments
  45. feature_matrix = {}
  46. segments = []
  47.  
  48. # Go through all the lines after the first line
  49. for line in feature_lines[1:]:
  50.     line = cleanup_line(line)
  51.     # Split up the line. The following puts the first value into the variable 'seg', and all the remaining values into the list 'values'
  52.     seg, *values = line.split("\t")
  53.     # Skip lines with a blank segment
  54.     if seg == '':
  55.         continue
  56.     print('Segment %s: %s' % (seg, ','.join(values)))
  57.     # Ideally, here we'd check for digraphs. The perl version does this, but it requires a bit of overhead. (The .trans file, and checking all the segments against it, creating new arbitrary single character ways to represent them, etc.)
  58.  
  59.  
  60.     # Ignore blank lines, process only those lines with values
  61.     if len(values) == number_of_features:
  62.         # If the first value is nothing but digits (and is not simply a 0), or is completely empty, it's a code (assuming no scalar features!! but this script can't handle scalar features, anyway)
  63.         if (re.match(r'^\d*$', values[0]) and values[0] != 0) or values[0] == "":
  64.             del values[0]
  65.  
  66.         # Store the segments in a list of segments, for convenience
  67.         segments.append(seg)
  68.  
  69.         # And remember the feature values
  70.         feature_matrix[seg] = values
  71.  
  72.     else:
  73.             print('Warning! segments %s has an incorrect number of features. \(%s instead of %s\)' % (seg, len(values), number_of_features))
  74.  
  75. feature_file.close()
  76.  
  77. # Something we could do here: print the feature matrix to the log file, to double-check that it's been read correctly
  78.  
  79. ##########################################################
  80. ##### Now here's where the core of the script belongs
  81. ##### The strategy we discussed in the lab has two parts:
  82. #####     1. Find all of the natural classes that can be described using the features
  83. #####     2. For all pairs of segments, figure out which natural classes they both occur in (shared), and which natural classes contains one segment but not the other (unshared)
  84. ##########################################################
  85.  
  86. # Given:
  87. # a list called features containing the names of the features
  88. # a list called segments, containing all of the segment
  89. # a dictionary called feature_matrix, segments to vector list
  90.  
  91. # make some custom data structures
  92.  
  93. my_matrix = {}
  94. for segment in segments:
  95.     my_matrix[segment] = []
  96.     for index, feature in enumerate(features):
  97.         value = feature_matrix[segment][index]
  98.         my_matrix[segment].append((value, feature))
  99.  
  100. feature_values = []
  101. for feature in features:
  102.     feature_values.append(('+', feature))
  103.     feature_values.append(('-', feature))
  104.  
  105. def get_feature_name(x):
  106.     return x[0] + x[1]
  107.  
  108. # 1. Find all the natural classes
  109. # This could be done by going through all logically possible combinations of feature values, or it could be done by starting with the natural classes described by a single feature value ([+high], [-high], etc.) and then combining (=intersecting) them.
  110.  
  111. # inefficient powerset method
  112.  
  113. from itertools import *
  114. # from itertools documentation page
  115. def powerset(iterable):
  116.     s = list(iterable)
  117.     return chain.from_iterable(combinations(s, r) for r in range(0, len(s)+1))
  118.  
  119. # powerset all the features
  120. power = list(powerset(feature_values))
  121.  
  122. natural_classes = {}
  123.  
  124. def find_segment_class(feature_set):
  125.     return ''.join(sorted([segment for segment in segments if all(feature in my_matrix[segment] for feature in feature_set)]))
  126.  
  127. for combo in power:
  128.     seg_class = find_segment_class(combo)
  129.     # no segments found!
  130.     if len(seg_class) == 0:
  131.         continue
  132.     if seg_class in natural_classes:
  133.         # collapse the classes, but prefer the smaller set of features that describe it
  134.         existing_set = natural_classes[seg_class]
  135.         if len(combo) < len(existing_set):
  136.             natural_classes[seg_class] = combo
  137.     else:
  138.         # add the class to the natural classes
  139.         natural_classes[seg_class] = combo
  140.  
  141. # write out to log file
  142. log_file.write("Natural classes: \n")
  143. # sort by number of features that describe the class
  144. for nc, feature_set in sorted(natural_classes.items(), key=lambda kv: len(kv[1])):
  145.     log_file.write(nc + "\t" + ", ".join(get_feature_name(f) for f in feature_set) + "\n")
  146.  
  147.  
  148. # 2. Now go through all pairs of segments (including identical pairs-- that is, comparing a segment with itself. The similarity for those pairs should always come out to 1, if you've calculated it right)
  149.  
  150. def find_similarity_info(s1, s2):
  151.     shared = 0
  152.     unshared = 0
  153.     for nc in natural_classes:
  154.         if s1 in nc and s2 in nc:
  155.             shared += 1
  156.         # what a condition!
  157.         elif s1 in nc and s2 not in nc or s1 not in nc and s2 in nc:
  158.             unshared += 1
  159.     total = shared + unshared
  160.     return (shared, total, round(shared / total, 4))
  161.  
  162. similarities = [(s1, s2, find_similarity_info(s1, s2)) for s1 in segments for s2 in segments]
  163.  
  164. # write to the similarity table file
  165. similarity_table_file.write("Seg1\tSeg2\t#Shared\tTotal\tSimilarity\n")
  166. for s in similarities:
  167.     s1, s2, info = s
  168.     similarity_table_file.write(s1 + "\t" + s2 + "\t" + "\t".join(str(n) for n in info) + "\n")
  169.  
  170. log_file.close()
  171. similarity_table_file.close()
# End of paste. (Pastebin page footer - raw-data link and cookie notice - omitted.)