Advertisement
Guest User

alternation.py

a guest
Jan 29th, 2014
56
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 16.80 KB | None | 0 0
  1. # -*- coding: UTF-8 -*-
  2.  
  3. # alternation.py: finds rate of hand alternation for layouts and input
  4. # Copyright (C) 2014
  5. # Version 29 Jun 2014
  6.  
  7. # This program is free software: you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation, either version 3 of the License, or
  10. # (at your option) any later version.
  11.  
  12. # This program is distributed in the hope that it will be useful,
  13. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15. # GNU General Public License for more details.
  16.  
  17. # You should have received a copy of the GNU General Public License
  18. # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19.  
  20. DEBUG = True
  21.  
  22. import sys
  23. import codecs
  24. import re
  25. import random
  26.  
  27. from collections import namedtuple
  28.  
  29.  
  30. #some undesired fancy characters to replace
  31. fancy2normal = [
  32.     #transformations to dots
  33.     (u"…",u"..."),(u". . .",u"..."),
  34.  
  35.     #trimming of dots; put after dot transformations
  36.     (u" ... ",u"..."),        
  37.    
  38.     #dash transformations
  39.     (u"—",u"-"),
  40.    
  41.     #trimming of dashes; put after dash transformations
  42.     (u" - '",u"-'"),(u"' - ",u"'-"),(u' - "',u'-"'),(u'" - ',u'"-'),
  43.    
  44.     #quote transformations
  45.     (u"“",u'"'),
  46.     (u"”",u'"'),
  47.     (u"’",u"'"),
  48.     (u"’",u"'"),
  49.     (u"‘",u"'")
  50. ]
  51.  
  52.  
  53. identity_func = lambda p : p
  54.  
  55. if len(sys.argv) < 2:
  56.     print("Usage: python alternation.py INPUT-FILE")
  57.     sys.exit(1)
  58. elif len(sys.argv) < 3:
  59.     output_file = input_file = sys.argv[1]
  60. else:
  61.     input_file = sys.argv[1]
  62.     output_file = sys.argv[2]
  63.    
  64. with codecs.open(input_file, "r", "utf_8_sig") as f:
  65.     content = f.read().split("\n")
  66.  
  67. def replace_fancy(l):
  68.     #designated replacement for some undesired fancy characters
  69.     for fancy, normal in fancy2normal:
  70.         l = l.replace(fancy, normal)
  71.  
  72.     #replaces all non-ascii characters
  73.     try:
  74.         ascii_line = l.decode('ascii')
  75.     except UnicodeEncodeError:
  76.         ascii_line = ''
  77.         for c in l:
  78.             if ord(c) < 128:
  79.                 ascii_line += c
  80.             else:
  81.                 ascii_line += ""
  82.  
  83.     return ascii_line.strip()
  84.    
  85.  
  86. modified_content = map(replace_fancy, content)
  87. content = "\n".join(modified_content)
  88.  
  89.  
  90. #random numbers
  91. LEFT = -3
  92. RIGHT = 9
  93. SPACE = 0
  94. ENTER = 2
  95.  
  96. #assumes that used by both hands (so does not break same-hand combo)
  97. #contrast with None, i.e. either hand (which does break same-hand combo)
  98. BOTH = 999  
  99.  
  100. SAME_HAND = 442343
  101. EXPECTED_SAME_HAND = -234324
  102. ALTERNATING_HAND = 3829172
  103. ALTERNATING_HAND_EXCLUSIVE = 3822222
  104.  
  105. TOP_ROW = -23823934
  106.  
  107. class Char_Data(namedtuple('Char_Data',['char','hand'])):
  108.     "Tuple with fields: char (a character) and hand (LEFT,RIGHT,BOTH,SPACE,ENTER)"
  109.     def __new__(self, char, hand):
  110.         return super(Char_Data, self).__new__(self,char,hand)
  111.  
  112. def hand_typing_char(layout_hands, space_hand = None, enter_hand = None):
  113.     '''Returns a function that, given a character c, returns
  114.    Char_Data(c, hand that typed c)
  115.  
  116. Arguments:
  117. layout_hands is a 2-tuple of two strings, respectively containing the left and right characters
  118. space_hand: which hand space is assigned to; LEFT, RIGHT, BOTH, or None
  119. enter_hand: similar to space_hand, but for Return
  120. '''
  121.  
  122.     def hand_typing(c):
  123.         left_keys,right_keys = layout_hands
  124.         if c == " ":
  125.             return SPACE if space_hand == None else space_hand
  126.         elif c == "\n":
  127.             return ENTER if enter_hand == None else enter_hand
  128.         elif c in left_keys or c in left_keys.upper():
  129.             return LEFT
  130.         elif c in right_keys or c in right_keys.upper():
  131.             return RIGHT
  132.         else:
  133.             if DEBUG:
  134.                 print "Character not found: " + c
  135.             return None
  136.            
  137.     def hand_typing_tuple(c):
  138.         return Char_Data(char = c, hand = hand_typing(c))
  139.  
  140.     return hand_typing_tuple
  141.  
  142. class Same_Hand_Data(object):
  143.     '''[Immutable] data representing same hand results.
  144.  
  145.    Attributes:
  146.    length is the length of the same hand strings
  147.    examples are the same hand strings of that length
  148.    num is the number of examples
  149.    total is the total number of substrings (whether or not same hand) of that length'''
  150.     def __init__(self,length,examples=None,total=-1):
  151.         self.length = length
  152.         self.total = total
  153.        
  154.         if examples == None:
  155.             examples = []
  156.         self.num = len(examples)
  157.         self.examples_list = examples
  158.        
  159.     @property
  160.     def examples(self,value):
  161.         if value == None:
  162.             value = []
  163.         self.examples_list = value
  164.         self.num = len(value)
  165.        
  166.     @property
  167.     def examples(self):
  168.         return self.examples_list
  169.  
  170.     @property
  171.     def unique_examples(self):
  172.         '''Returns same hand examples, no duplicates.'''
  173.         return list(set(self.examples))
  174.        
  175.     def same_hand_percentage(self):
  176.         '''Percentage of strings of length that are same hand'''
  177.         return 1.0 * self.num / self.total
  178.  
  179.     def query(self,query_type, percentage = True):
  180.         '''Queries the data.
  181.        
  182.        query_type can be:
  183.        SAME_HAND - returns proportion of same_hand strings
  184.        ALTERNATING_HAND or ALTERNATING_HAND_EXCLUSIVE - returns proportion of alternating hand strings
  185.        EXPECTED_SAME_HAND - returns length of string where E(same hand string of length) = 1
  186.  
  187.        If percentage true, displays as percentage rather than as decimal'''
  188.         if query_type == SAME_HAND:
  189.             return (100.0 if percentage else 1.0) * self.same_hand_percentage()
  190.         elif query_type == ALTERNATING_HAND or query_type == ALTERNATING_HAND_EXCLUSIVE:
  191.             return (100.0 if percentage else 1.0) * (1 - self.same_hand_percentage())
  192.         elif query_type == EXPECTED_SAME_HAND:
  193.             p = self.same_hand_percentage()
  194.             return float("inf") if p == 0 else 1/p
  195.         else:
  196.             if DEBUG:
  197.                 print("Query type not recognized: " + str(query_type))
  198.             return None
  199.        
  200.            
  201. def not_all_same_hand(char_data_list):
  202.     '''char_data_list is a list of Char_Data
  203.    Any of the latter three are assumed to belong to either hand.
  204.  
  205.    Returns true if not all elements in hand_list are the same hand, false otherwise'''
  206.     hand_list = map(lambda c : c.hand, char_data_list)
  207.     return SPACE in hand_list or ENTER in hand_list or None in hand_list or LEFT in hand_list and RIGHT in hand_list
  208.  
  209. def all_same_hand(char_data_list):
  210.     '''hand_list is a list of elements that are either LEFT, RIGHT, SPACE, ENTER, or None
  211.    Any of the latter three are assumed to belong to either hand.
  212.  
  213.    Returns true if all elements in hand_list are the same hand, false otherwise'''
  214.     return not not_all_same_hand(char_data_list)
  215.  
  216.  
  217.  
  218. def find_same_hand(s, length, layout_hands, space_hand = None, enter_hand = None, include_repeats = False, include_unknowns = True, random_samples = -1):
  219.     '''Finds (and counts) the number of instances of same hand
  220.  
  221. Arguments:
  222. s is string to find same hand in
  223. length is length of same-hand strings to look for (if any alternation within that length, not counted)
  224. layout_hands is a 2-tuple of two strings, respectively containing the left and right characters
  225. space_hand: which hand space is assigned to; LEFT, RIGHT, BOTH or None
  226. enter_hand: similar to space_hand, but for Return
  227. include_repeats: if False, repeated letters (e.g. "jjj") are compressed into a single letter ("j") before analysis
  228. include_unknowns: if False, any unknown letters are removed completely from analysis
  229.  
  230. random_samplings: if > 0, get hand alternation results by random sampling n-substrings random_samplings times.  
  231.    Otherwise, get results by searching through all n-substrings.
  232.  
  233.  
  234. Returns SameHandData'''
  235.  
  236.     if not include_repeats:
  237.         #compress all repeating characters into one
  238.         s_prev = None
  239.         while s != s_prev:
  240.             s_prev = s
  241.             s = re.sub(r'(.)\1', r'\1', s)
  242.            
  243.     #list of tuples [ ... Char_Data(char = c, hand = hand that typed c) ... ]
  244.     chars = map(hand_typing_char(layout_hands, space_hand, enter_hand), s)
  245.  
  246.     if not include_unknowns:
  247.         chars = filter(lambda h : h.hand != None, chars)
  248.  
  249.     same_hand = []
  250.    
  251.     def append_if_all_same_hand(i):
  252.         '''If the length n substring starting from i is all same hand, append to same_hand'''
  253.         chs = chars[i: i+length]
  254.         if all_same_hand(chs):
  255.             same_hand.append("".join(map(lambda c : c.char, chs)))
  256.  
  257.     #get results by sampling random strings of length n
  258.     if random_samples > 0:
  259.         for number in range(random_samples):
  260.             append_if_all_same_hand(random.randint(0, len(chars) - length - 1))
  261.         return Same_Hand_Data(length, same_hand, random_samples)
  262.     else: #get results by sampling all strings of length n
  263.         for i in range(len(chars) - length):
  264.             append_if_all_same_hand(i)
  265.         return Same_Hand_Data(length, same_hand, len(chars) - length)
  266.  
  267.  
  268. def padder(length, front = True, padding = " "):
  269.     '''Given length, returns a function that converts single input into string and pads string to length
  270.  
  271.    If front is True, pads in front; otherwise, pads in back.
  272.  
  273. padding is the character to pad with (space by default)'''
  274.     if front:
  275.         return lambda s : padding * ((length - len(str(s)))/len(padding)) + str(s)
  276.     else:
  277.         return lambda s : str(s) + padding * ((length - len(str(s)))/len(padding))
  278.  
  279.  
  280. def create_table(layout_alternation_results, mode = ALTERNATING_HAND, decimal_places = 1):
  281.     '''
  282. Arguments:
  283. layout_alternation_results is a #dict -> dict -> Same_Hand_Data
  284.    defined by length -> layout_name -> SameHandData for that length and layout
  285.  
  286. mode can be SAME_HAND to display the same hand percentages
  287.            ALTERNATING_HAND to display the alternating hand percentages
  288.            ALTERNATING_HAND_EXCLUSIVE to display the alternating hand percentages, but with columns n-1 instead of n
  289.            EXPECTED_SAME_HAND to display the number of characters before E(same hand of length l) = 1
  290.  
  291. decimal_places is the number of decimal places to which to display results
  292.    
  293. Returns dict -> list of strings defined by
  294. row_name (either TOP_ROW or layout_name) -> list of columns in that row
  295. '''
  296.     headers = {SAME_HAND:"P(l same hand)\\l",
  297.                ALTERNATING_HAND:"P(l alternating) \ n",
  298.                ALTERNATING_HAND_EXCLUSIVE:"P(alt w/i n) \ l",
  299.                EXPECTED_SAME_HAND:"E(same length l)=1\\l"}
  300.     header = headers[mode]
  301.  
  302.     #dict of layout -> Same_Hand_Data for some random length
  303.     sample_layout_alternation_results = layout_alternation_results[layout_alternation_results.keys()[0]]
  304.  
  305.     layout_names = [name for name in sample_layout_alternation_results.iterkeys()]
  306.     max_name_length = max(len(header),max(map(len,layout_names)))
  307.  
  308.     top_label_transform = lambda length : length + (-1 if mode == ALTERNATING_HAND_EXCLUSIVE else 0)
  309.  
  310.     table_rows = dict([(layout_name,[]) for layout_name, alternation_results in sample_layout_alternation_results.iteritems()]
  311.                       + [(TOP_ROW,[])] )
  312.  
  313.     pad_first_column = padder(max_name_length,front= False)
  314.  
  315.     #first column, top row
  316.     table_rows[TOP_ROW].append(pad_first_column(header))
  317.  
  318.     #first column, lower rows listing layouts
  319.     for layout_name in layout_names:
  320.         table_rows[layout_name].append(pad_first_column(layout_name))
  321.  
  322.     #other columns, listing data
  323.     for length in sorted(layout_alternation_results.keys()):
  324.         unpadded_column = dict([(TOP_ROW,str(top_label_transform(length)))])
  325.  
  326.         for layout_name, alternation_results in layout_alternation_results[length].iteritems():
  327.             unpadded_column[layout_name] = "{0:.{1}f}".format(alternation_results.query(mode),decimal_places)
  328.            
  329.         max_column_length = max([len(entry) for entry in unpadded_column.itervalues()])
  330.         pad = padder(max_column_length)
  331.        
  332.         for layout_name, row in table_rows.iteritems():
  333.             row.append(pad(unpadded_column[layout_name]))
  334.        
  335.     return table_rows
  336.  
  337.  
  338. def print_table(layout_alternation_results, mode = ALTERNATING_HAND, decimal_places = 1):
  339.     '''
  340. Arguments:
  341. layout_alternation results is a #dict -> dict -> Same_Hand_Data
  342.    defined by length -> layout_name -> SameHandData for that length and layout
  343.  
  344.    the n ranges for each layout_alternation_results[key] are assumed to be the same
  345.  
  346. mode can be SAME_HAND to display the same hand percentages
  347.            ALTERNATING_HAND to display the alternating hand percentages
  348.            ALTERNATING_HAND_EXCLUSIVE to display the alternating hand percentages, but with columns n-1 instead of n
  349.            EXPECTED_SAME_HAND to display the number of characters before E(same hand of length l) = 1
  350.  
  351. decimal_places is the number of decimal places to display it to
  352. '''
  353.     print table_str(create_table(layout_alternation_results, mode, decimal_places))
  354.  
  355.    
  356. def table_str(table):
  357.     '''Given a table as defined by create_table, converts to str'''
  358.     s = table_row_str(table[TOP_ROW]) + "\n"
  359.  
  360.     for layout_name, row in table.iteritems():
  361.         if not layout_name == TOP_ROW:
  362.             s += table_row_str(row) + "\n"
  363.  
  364.     return s
  365.  
  366. def table_row_str(row):
  367.     '''Given a row of a table (as a list) returns as a string'''
  368.     return " ".join(row)
  369.  
  370.  
  371. layouts_hands = {
  372.                  'Wide-Colemak':('''`[](){}qwfpgarstdzxcvb!@#$%^123456''', '''&=-+_$@~%#*^_+-jlu?/y;:<>'"\\hneio/km,.=7890_+-'''),
  373.                  # 'T9-QWERTY':('''`~1cyksuthjevwnbz23456!@#$%^''', ''',adfolximgrpq.>?;':@[]#{}~789</0-\\|=&*()_+"'''),
  374.                  'Dvorak':( '''aoeuipyqjkx;:.>,<'"`~123456!@#$%^''', '''fgcrldhtnsbmwvz-_\\/?+|[]{}7890&*()='''),
  375.                  'QWERTY':('''`~qwertasdfgzxcvb123456!@#$%^''', '''yuiophjklnm,.>?;':@[]#{}~789</0-\\|=&*()_+"'''),
  376.                  # 'MTGAP':('''`~,./?<;:>ypoujineaqz123456!@#$%^''', '''kdlcwmhtsrbfgvx'@[]#{}~7890-\\|=&*()_+"'''),
  377.                  # 'QGMLWY':('''`~qgmlwdstnrzxcvj123456!@#$%^''', '''yfubiaeohkp,.>?;':@[]#{}~789</0-\\|=&*()_+"'''),
  378.                  # 'QGMLWB':('''`~qgmlwdstnrzxcfj123456!@#$%^''', '''yvubiaeohkp,.>?;':@[]#{}~789</0-\\|=&*()_+"'''),
  379.                  'HIEAQMTSRN':( '''~`byou'hiea,/().-*{}:_;=123456+%#$&^''', '''zkdclpqmt!srnvwgf?7890jx|@<>p"\\P[]''')
  380. }
  381.  
  382.  
  383. lengths_range = range(2,12)
  384.  
  385. alternation_results = {}  #dict -> dict -> Same_Hand_Data :: length -> layout_name -> SameHandData for that length and layout
  386.  
  387. #gets alternation results
  388. for n in lengths_range:
  389.     alternation_results[n] = dict([(layout_name, find_same_hand(content, n, layout_hands, space_hand = None, enter_hand = RIGHT))
  390.                                    for layout_name, layout_hands in layouts_hands.iteritems()])
  391.    
  392.  
  393. print_table(alternation_results, mode = ALTERNATING_HAND_EXCLUSIVE, decimal_places = 1)
  394. print ""
  395. print_table(alternation_results, mode = SAME_HAND, decimal_places = 2)
  396. print ""
  397. print_table(alternation_results, mode = EXPECTED_SAME_HAND, decimal_places = 0)
  398.  
  399.  
  400. # #gets alternation results by random sampling
  401. # for n in lengths_range:
  402. #     alternation_results[n] = dict([(layout_name, find_same_hand(content, n, layout_hands, space_hand = None, enter_hand = RIGHT, random_samples=100000))
  403. #                                    for layout_name, layout_hands in layouts_hands.iteritems()])
  404.  
  405. # print_table(alternation_results, mode = ALTERNATING_HAND_EXCLUSIVE, decimal_places = 1)
  406. # print ""
  407. # print_table(alternation_results, mode = SAME_HAND, decimal_places = 2)
  408. # print ""
  409. # print_table(alternation_results, mode = EXPECTED_SAME_HAND, decimal_places = 0)
  410.  
  411.  
  412.  
  413.  
  414.  
  415. #
  416. #prints same-hand examples
  417. #
  418.  
  419. #lengths to get same-hand examples for.
  420. #NOTE: needs to go from greatest to least
  421. #in order for substrings to be filtered
  422. examples_lengths = range(13,4,-1)
  423. # examples_lengths = []
  424.  
  425. previous_examples = dict((layout_name,[]) for layout_name in layouts_hands.iterkeys())
  426. for n in examples_lengths:
  427.     print ""
  428.     for layout_name, layout_hands in layouts_hands.iteritems():
  429.         #fetches the examples (nt, nt2 are dummy variables)
  430.         examples = find_same_hand(content, n, layout_hands, space_hand = None, enter_hand = None, include_repeats = True).unique_examples
  431.        
  432.         #filters out any example that's a substring of a previous, longer example
  433.         examples = filter(lambda ex : not any(map(lambda prev_ex : ex.lower() in prev_ex.lower(), previous_examples[layout_name])), examples)
  434.  
  435.         if examples:
  436.             #prints out examples for this length and layout
  437.             print "{0} (length {1}):".format(layout_name,n)
  438.             for e in examples: print e
  439.             print ""
  440.  
  441.         previous_examples[layout_name] += examples
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement