Advertisement
Guest User

Untitled

a guest
Mar 24th, 2019
131
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.63 KB | None | 0 0
  1. # A script to calculate Frisch/Broe/Pierrehumbert similarity, based on a feature file
  2. import sys
  3. import re
  4. import os.path
  5. from FeatureFileTools import cleanup_line
  6.  
  7. feature_filename = 'VowelsRound.txt'
  8. feature_file = open(feature_filename, 'r')
  9.  
  10. # Now also the output files for the similarity table, natural classes, and similarity matrix.
  11. # First, find the "prefix" of the feature filename
  12. filename_prefix = re.sub(r'\.[^\.]*$', '', feature_filename)
  13. log_filename = filename_prefix + '.log'
  14. log_file = open(log_filename, 'w')
  15.  
  16. similarity_table_filename = filename_prefix + '.stb'
  17. similarity_table_file = open(similarity_table_filename, 'w')
  18.  
  19. # The following bit of code can set how a natural class should be described: with the shortest/most economic description, or with the longest/most complete description.
  20. # For example, in a five vowel system a,e,i,o,u
  21. # Minimal: o,u = +rd
  22. # Maximal: o,u = +bk,-lo,+rd
  23.  
  24. # The code below does not make use of this setting, so it's up to you to implement or ignore it.
  25. # Uncomment the one you want to use.
  26. specification = 'minimal'
  27. #specification = 'maximal'
  28.  
  29. # Now we'll read the feature file
  30. print ("Feature file: %s" % feature_filename)
  31. # We'll read the whole file at once, since we want to look at the first line (header) separately from the remaining lines.
  32. feature_lines = feature_file.readlines()
  33.  
  34. # The first line contains the feature names
  35. firstline = cleanup_line(feature_lines[0])
  36. # also for some reason these often begin without tabs, but just in case they do have tabs
  37. firstline = re.sub(r'^\s+', '', firstline)
  38.  
  39. # Break up the header row into a list of feature names
  40. features = firstline.split('\t')
  41. number_of_features = len(features)
  42.  
  43. # Now the rest of the file, which contains the feature values
  44. # We need to remember the feature values, and also it's handy to have a list of segments
  45. feature_matrix = {}
  46. segments = []
  47.  
  48. # Go through all the lines after the first line
  49. for line in feature_lines[1:]:
  50. line = cleanup_line(line)
  51. # Split up the line. The following puts the first value into the variable 'seg', and all the remaining values into the list 'values'
  52. seg, *values = line.split("\t")
  53. # Skip lines with a blank segment
  54. if seg == '':
  55. continue
  56. print('Segment %s: %s' % (seg, ','.join(values)))
  57. # Ideally, here we'd check for digraphs. The perl version does this, but it requires a bit of overhead. (The .trans file, and checking all the segments against it, creating new arbitrary single character ways to represent them, etc.)
  58.  
  59.  
  60. # Ignore blank lines, process only those lines with values
  61. if len(values) == number_of_features:
  62. # If the first value is nothing but digits (and is not simply a 0), or is completely empty, it's a code (assuming no scalar features!! but this script can't handle scalar features, anyway)
  63. if (re.match(r'^\d*$', values[0]) and values[0] != 0) or values[0] == "":
  64. del values[0]
  65.  
  66. # Store the segments in a list of segments, for convenience
  67. segments.append(seg)
  68.  
  69. # And remember the feature values
  70. feature_matrix[seg] = values
  71.  
  72. else:
  73. print('Warning! segments %s has an incorrect number of features. \(%s instead of %s\)' % (seg, len(values), number_of_features))
  74.  
  75. feature_file.close()
  76.  
  77. # Something we could do here: print the feature matrix to the log file, to double-check that it's been read correctly
  78.  
  79. ##########################################################
  80. ##### Now here's where the core of the script belongs
  81. ##### The strategy we discussed in the lab has two parts:
  82. ##### 1. Find all of the natural classes that can be described using the features
  83. ##### 2. For all pairs of segments, figure out which natural classes they both occur in (shared), and which natural classes contains one segment but not the other (unshared)
  84. ##########################################################
  85.  
  86. # Given:
  87. # a list called features containing the names of the features
  88. # a list called segments, containing all of the segment
  89. # a dictionary called feature_matrix, segments to vector list
  90.  
  91. # make some custom data structures
  92.  
  93. my_matrix = {}
  94. for segment in segments:
  95. my_matrix[segment] = []
  96. for index, feature in enumerate(features):
  97. value = feature_matrix[segment][index]
  98. my_matrix[segment].append((value, feature))
  99.  
  100. feature_values = []
  101. for feature in features:
  102. feature_values.append(('+', feature))
  103. feature_values.append(('-', feature))
  104.  
  105. def get_feature_name(x):
  106. return x[0] + x[1]
  107.  
  108. # 1. Find all the natural classes
  109. # This could be done by going through all logically possible combinations of feature values, or it could be done by starting with the natural classes described by a single feature value ([+high], [-high], etc.) and then combining (=intersecting) them.
  110.  
  111. # inefficient powerset method
  112.  
  113. from itertools import *
  114. # from itertools documentation page
  115. def powerset(iterable):
  116. s = list(iterable)
  117. return chain.from_iterable(combinations(s, r) for r in range(0, len(s)+1))
  118.  
# Enumerate every combination of feature values; each combination is a
# candidate description of a natural class.
power = list(powerset(feature_values))

# Maps a natural class (its member segments joined as one sorted string) to
# the feature-value combination chosen to describe it.
natural_classes = {}
  123.  
  124. def find_segment_class(feature_set):
  125. return ''.join(sorted([segment for segment in segments if all(feature in my_matrix[segment] for feature in feature_set)]))
  126.  
  127. for combo in power:
  128. seg_class = find_segment_class(combo)
  129. # no segments found!
  130. if len(seg_class) == 0:
  131. continue
  132. if seg_class in natural_classes:
  133. # collapse the classes, but prefer the smaller set of features that describe it
  134. existing_set = natural_classes[seg_class]
  135. if len(combo) < len(existing_set):
  136. natural_classes[seg_class] = combo
  137. else:
  138. # add the class to the natural classes
  139. natural_classes[seg_class] = combo
  140.  
  141. # write out to log file
  142. log_file.write("Natural classes: \n")
  143. # sort by number of features that describe the class
  144. for nc, feature_set in sorted(natural_classes.items(), key=lambda kv: len(kv[1])):
  145. log_file.write(nc + "\t" + ", ".join(get_feature_name(f) for f in feature_set) + "\n")
  146.  
  147.  
  148. # 2. Now go through all pairs of segments (including identical pairs-- that is, comparing a segment with itself. The similarity for those pairs should always come out to 1, if you've calculated it right)
  149.  
  150. def find_similarity_info(s1, s2):
  151. shared = 0
  152. unshared = 0
  153. for nc in natural_classes:
  154. if s1 in nc and s2 in nc:
  155. shared += 1
  156. # what a condition!
  157. elif s1 in nc and s2 not in nc or s1 not in nc and s2 in nc:
  158. unshared += 1
  159. total = shared + unshared
  160. return (shared, total, round(shared / total, 4))
  161.  
  162. similarities = [(s1, s2, find_similarity_info(s1, s2)) for s1 in segments for s2 in segments]
  163.  
  164. # write to the similarity table file
  165. similarity_table_file.write("Seg1\tSeg2\t#Shared\tTotal\tSimilarity\n")
  166. for s in similarities:
  167. s1, s2, info = s
  168. similarity_table_file.write(s1 + "\t" + s2 + "\t" + "\t".join(str(n) for n in info) + "\n")
  169.  
  170. log_file.close()
  171. similarity_table_file.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement