Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: UTF-8 -*-
- # alternation.py: finds rate of hand alternation for layouts and input
- # Copyright (C) 2014
- # Version 29 Jun 2014
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- DEBUG = True
- import sys
- import codecs
- import re
- import random
- from collections import namedtuple
# Typographic ("fancy") sequences and the plain text that replaces them.
# The pairs are applied in order, so every trimming rule is listed after the
# rule that produces the text it trims.
fancy2normal = [
    # ellipsis variants -> three dots
    (u"…", u"..."),
    (u". . .", u"..."),
    # drop the spaces around three dots (keep after the dot rules)
    (u" ... ", u"..."),
    # em dash -> hyphen
    (u"—", u"-"),
    # drop the spaces between a dash and a neighbouring quote
    # (keep after the dash rule)
    (u" - '", u"-'"),
    (u"' - ", u"'-"),
    (u' - "', u'-"'),
    (u'" - ', u'"-'),
    # curly quotes -> straight quotes
    (u"“", u'"'),
    (u"”", u'"'),
    (u"’", u"'"),
    (u"’", u"'"),
    (u"‘", u"'"),
]


def identity_func(p):
    '''Returns p unchanged.'''
    return p
# Command line: INPUT-FILE is required.  An optional second argument is
# stored as output_file; without it output_file is the input path.
if len(sys.argv) < 2:
    print("Usage: python alternation.py INPUT-FILE")
    sys.exit(1)

input_file = sys.argv[1]
output_file = sys.argv[2] if len(sys.argv) >= 3 else input_file

# Decode as UTF-8 ("utf_8_sig" also drops a leading byte-order mark) and
# keep the text as a list of lines.
with codecs.open(input_file, "r", "utf_8_sig") as f:
    content = f.read().split("\n")
def replace_fancy(l, replacements=None):
    '''Returns line l with fancy characters normalised and non-ASCII removed.
    Arguments:
    l is a line of (unicode) text
    replacements is an optional sequence of (fancy, normal) pairs, applied
        in order; if None, the module-level fancy2normal table is used
    Any character still outside ASCII after the replacements is dropped,
    and leading/trailing whitespace is stripped from the result.'''
    if replacements is None:
        replacements = fancy2normal
    #designated replacement for some undesired fancy characters
    for fancy, normal in replacements:
        l = l.replace(fancy, normal)
    #drops all remaining non-ascii characters in a single pass; unlike
    #l.decode('ascii') this does not rely on Python 2's implicit encode and
    #works for Python 2 unicode and Python 3 str alike
    return l.encode('ascii', 'ignore').decode('ascii').strip()
# Clean every line with replace_fancy, then rebuild the text as a single
# newline-separated string for the analysis below.
modified_content = list(map(replace_fancy, content))
content = "\n".join(modified_content)
# Sentinel values.  The numbers themselves are arbitrary; they only need to
# be distinct from one another.

# Hand markers.
LEFT, RIGHT = -3, 9
SPACE, ENTER = 0, 2
# BOTH marks a key treated as used by both hands, so it does not break a
# same-hand combo.  Contrast with None, i.e. either hand, which does break
# a same-hand combo.
BOTH = 999

# Query / table modes.
SAME_HAND = 442343
EXPECTED_SAME_HAND = -234324
ALTERNATING_HAND = 3829172
ALTERNATING_HAND_EXCLUSIVE = 3822222

# Key under which a table dict stores its header row.
TOP_ROW = -23823934
class Char_Data(namedtuple('Char_Data', ['char', 'hand'])):
    '''Tuple with two fields.
    char is a character
    hand is the marker assigned to that character
        (LEFT, RIGHT, BOTH, SPACE, ENTER)'''

    def __new__(cls, char, hand):
        # both fields are required; construction is delegated to the
        # namedtuple base class
        base = super(Char_Data, cls)
        return base.__new__(cls, char, hand)
def hand_typing_char(layout_hands, space_hand=None, enter_hand=None):
    '''Returns a function that, given a character c, returns
    Char_Data(c, hand that typed c)
    Arguments:
    layout_hands is a 2-tuple of two strings, respectively containing the left and right characters
    space_hand: which hand space is assigned to; LEFT, RIGHT, BOTH, or None
        (None reports space as SPACE)
    enter_hand: similar to space_hand, but for Return
        (None reports "\\n" as ENTER)
    A character is also matched against the upper-cased key strings. The
    left keys are checked first, so a character listed on both sides counts
    as LEFT. A character found on neither side gets hand None (and is
    reported on stdout when DEBUG is set).
    '''
    left_keys, right_keys = layout_hands
    #upper-cased once here instead of once per looked-up character
    left_keys_upper = left_keys.upper()
    right_keys_upper = right_keys.upper()

    def hand_typing(c):
        if c == " ":
            return SPACE if space_hand is None else space_hand
        if c == "\n":
            return ENTER if enter_hand is None else enter_hand
        if c in left_keys or c in left_keys_upper:
            return LEFT
        if c in right_keys or c in right_keys_upper:
            return RIGHT
        if DEBUG:
            #parenthesised form: same output on Python 2, valid on Python 3
            print("Character not found: " + c)
        return None

    def hand_typing_tuple(c):
        return Char_Data(char=c, hand=hand_typing(c))

    return hand_typing_tuple
class Same_Hand_Data(object):
    '''[Immutable] data representing same hand results.
    Attributes:
    length is the length of the same hand strings
    examples are the same hand strings of that length (read-only)
    num is the number of examples
    total is the total number of substrings (whether or not same hand) of that length'''

    def __init__(self, length, examples=None, total=-1):
        self.length = length
        self.total = total
        if examples is None:
            examples = []
        self.num = len(examples)
        self.examples_list = examples

    #NOTE: an earlier, mis-decorated "setter" for examples was dead code (it
    #was immediately replaced by this getter) and has been removed; examples
    #stays read-only, as the class docstring promises.
    @property
    def examples(self):
        '''Returns the same hand examples, duplicates included.'''
        return self.examples_list

    @property
    def unique_examples(self):
        '''Returns same hand examples, no duplicates (in no particular order).'''
        return list(set(self.examples))

    def same_hand_percentage(self):
        '''Proportion (0.0 to 1.0, not scaled by 100) of strings of length that are same hand.
        Returns 0.0 when total is not positive, i.e. when there were no
        substrings to examine, instead of dividing by zero.'''
        if self.total <= 0:
            return 0.0
        return 1.0 * self.num / self.total

    def query(self, query_type, percentage=True):
        '''Queries the data.
        query_type can be:
        SAME_HAND - returns proportion of same_hand strings
        ALTERNATING_HAND or ALTERNATING_HAND_EXCLUSIVE - returns proportion of alternating hand strings
        EXPECTED_SAME_HAND - returns 1 / (proportion of same hand strings), or infinity if that proportion is 0
        If percentage true, the two proportions are scaled to a percentage rather than a decimal
        (EXPECTED_SAME_HAND ignores percentage)
        Returns None for an unrecognized query_type (reported on stdout when DEBUG is set)'''
        scale = 100.0 if percentage else 1.0
        if query_type == SAME_HAND:
            return scale * self.same_hand_percentage()
        if query_type == ALTERNATING_HAND or query_type == ALTERNATING_HAND_EXCLUSIVE:
            return scale * (1 - self.same_hand_percentage())
        if query_type == EXPECTED_SAME_HAND:
            p = self.same_hand_percentage()
            return float("inf") if p == 0 else 1 / p
        if DEBUG:
            print("Query type not recognized: " + str(query_type))
        return None
def not_all_same_hand(char_data_list):
    '''char_data_list is a list of Char_Data
    A hand of SPACE, ENTER or None breaks a same-hand run; any other hand
    value apart from LEFT and RIGHT (e.g. BOTH) does not.
    Returns true if not all elements in char_data_list are the same hand,
    i.e. if one of them has hand SPACE, ENTER or None, or if both LEFT and
    RIGHT occur; false otherwise (including for an empty list)'''
    #collected once into a set: a lazy map() object would be used up by the
    #first membership test on Python 3
    hands = set(c.hand for c in char_data_list)
    if SPACE in hands or ENTER in hands or None in hands:
        return True
    return LEFT in hands and RIGHT in hands
def all_same_hand(char_data_list):
    '''char_data_list is a list of Char_Data
    Returns true if all elements in char_data_list are the same hand, false otherwise.
    This is exactly the negation of not_all_same_hand(char_data_list).'''
    mixed_hands = not_all_same_hand(char_data_list)
    return not mixed_hands
def find_same_hand(s, length, layout_hands, space_hand=None, enter_hand=None, include_repeats=False, include_unknowns=True, random_samples=-1):
    '''Finds (and counts) the number of instances of same hand
    Arguments:
    s is string to find same hand in
    length is length of same-hand strings to look for (if any alternation within that length, not counted)
    layout_hands is a 2-tuple of two strings, respectively containing the left and right characters
    space_hand: which hand space is assigned to; LEFT, RIGHT, BOTH or None
    enter_hand: similar to space_hand, but for Return
    include_repeats: if False, repeated letters (e.g. "jjj") are compressed into a single letter ("j") before analysis
    include_unknowns: if False, any unknown letters are removed completely from analysis
    random_samples: if > 0, get hand alternation results by random sampling n-substrings random_samples times.
        Otherwise, get results by searching through all n-substrings.
    Returns Same_Hand_Data; its total is random_samples when sampling,
    otherwise the number of n-substrings examined (0 if s is shorter than length)'''
    if not include_repeats:
        #compress every run of a repeated character into one character, in
        #a single pass ('.' does not match "\n", so repeated newlines stay)
        s = re.sub(r'(.)\1+', r'\1', s)
    #list of tuples [ ... Char_Data(char = c, hand = hand that typed c) ... ]
    #(a real list, so that it can be sliced on Python 3 as well)
    to_char_data = hand_typing_char(layout_hands, space_hand, enter_hand)
    chars = [to_char_data(c) for c in s]
    if not include_unknowns:
        chars = [ch for ch in chars if ch.hand is not None]
    #a sequence of len(chars) items has len(chars) - length + 1 substrings
    #of the given length; the previous count missed the last one
    num_substrings = max(0, len(chars) - length + 1)
    same_hand = []

    def append_if_all_same_hand(i):
        '''If the length n substring starting from i is all same hand, append to same_hand'''
        chs = chars[i: i + length]
        if all_same_hand(chs):
            same_hand.append("".join(ch.char for ch in chs))

    #get results by sampling random strings of length n
    if random_samples > 0:
        #nothing can be sampled when no substring of that length exists
        if num_substrings > 0:
            for _ in range(random_samples):
                #randint includes both end points
                append_if_all_same_hand(random.randint(0, num_substrings - 1))
        return Same_Hand_Data(length, same_hand, random_samples)
    #get results by examining all strings of length n
    for i in range(num_substrings):
        append_if_all_same_hand(i)
    return Same_Hand_Data(length, same_hand, num_substrings)
def padder(length, front=True, padding=" "):
    '''Given length, returns a function that converts single input into string and pads string to length
    If front is True, pads in front; otherwise, pads in back.
    padding is the string to pad with (space by default); it is repeated
    as many whole times as fit, so a multi-character padding may leave the
    result shorter than length.
    Input that is already length or longer is returned unpadded.'''
    def pad(s):
        text = str(s)
        #floor division keeps the repeat count an integer on Python 3 too
        #(str * float raises TypeError); a negative count gives no padding
        fill = padding * ((length - len(text)) // len(padding))
        return fill + text if front else text + fill
    return pad
def create_table(layout_alternation_results, mode=ALTERNATING_HAND, decimal_places=1):
    '''
    Arguments:
    layout_alternation_results is a #dict -> dict -> Same_Hand_Data
        defined by length -> layout_name -> Same_Hand_Data for that length and layout
        (every length is assumed to list the same layouts)
    mode can be SAME_HAND to display the same hand percentages
        ALTERNATING_HAND to display the alternating hand percentages
        ALTERNATING_HAND_EXCLUSIVE to display the alternating hand percentages, but with columns n-1 instead of n
        EXPECTED_SAME_HAND to display the number of characters before E(same hand of length l) = 1
        (any other mode raises KeyError)
    decimal_places is the number of decimal places to which to display results
    Returns dict -> list of strings defined by
        row_name (either TOP_ROW or layout_name) -> list of columns in that row
        (only the TOP_ROW row, holding the header, if layout_alternation_results is empty)
    '''
    #backslashes are written as \\ throughout; the string values are unchanged
    headers = {SAME_HAND: "P(l same hand)\\l",
               ALTERNATING_HAND: "P(l alternating) \\ n",
               ALTERNATING_HAND_EXCLUSIVE: "P(alt w/i n) \\ l",
               EXPECTED_SAME_HAND: "E(same length l)=1\\l"}
    header = headers[mode]
    lengths = sorted(layout_alternation_results)
    #layout names are read from one length's results (the smallest length)
    layout_names = list(layout_alternation_results[lengths[0]]) if lengths else []
    max_name_length = max([len(header)] + [len(name) for name in layout_names])
    #ALTERNATING_HAND_EXCLUSIVE labels each column with length - 1
    label_offset = -1 if mode == ALTERNATING_HAND_EXCLUSIVE else 0
    pad_first_column = padder(max_name_length, front=False)
    #first column: header in the top row, layout names in the lower rows
    table_rows = {TOP_ROW: [pad_first_column(header)]}
    for layout_name in layout_names:
        table_rows[layout_name] = [pad_first_column(layout_name)]
    #other columns, listing data
    for length in lengths:
        unpadded_column = {TOP_ROW: str(length + label_offset)}
        for layout_name, alternation_results in layout_alternation_results[length].items():
            unpadded_column[layout_name] = "{0:.{1}f}".format(alternation_results.query(mode), decimal_places)
        #every entry of a column is padded in front to the widest entry
        pad = padder(max(len(entry) for entry in unpadded_column.values()))
        for layout_name, row in table_rows.items():
            row.append(pad(unpadded_column[layout_name]))
    return table_rows
def print_table(layout_alternation_results, mode=ALTERNATING_HAND, decimal_places=1):
    '''Prints the table built by create_table to stdout.
    Arguments:
    layout_alternation_results is a #dict -> dict -> Same_Hand_Data
        defined by length -> layout_name -> Same_Hand_Data for that length and layout
        the n ranges for each layout_alternation_results[key] are assumed to be the same
    mode can be SAME_HAND to display the same hand percentages
        ALTERNATING_HAND to display the alternating hand percentages
        ALTERNATING_HAND_EXCLUSIVE to display the alternating hand percentages, but with columns n-1 instead of n
        EXPECTED_SAME_HAND to display the number of characters before E(same hand of length l) = 1
    decimal_places is the number of decimal places to display it to
    '''
    #parenthesised form: same output on Python 2, valid on Python 3
    print(table_str(create_table(layout_alternation_results, mode, decimal_places)))
def table_str(table):
    '''Given a table as defined by create_table, converts to str
    The TOP_ROW row comes first, followed by the other rows in the dict's
    iteration order; every row ends with a newline.'''
    lines = [table_row_str(table[TOP_ROW])]
    #items() works on Python 2 and 3 alike
    lines.extend(table_row_str(row)
                 for layout_name, row in table.items()
                 if layout_name != TOP_ROW)
    return "\n".join(lines) + "\n"
def table_row_str(row):
    '''Given a row of a table (a list of strings), returns its columns joined by single spaces'''
    column_separator = " "
    return column_separator.join(row)
# layout_name -> (left hand characters, right hand characters)
# Layouts that are commented out are not analysed.
layouts_hands = {
    'Wide-Colemak': (
        '''`[](){}qwfpgarstdzxcvb!@#$%^123456''',
        '''&=-+_$@~%#*^_+-jlu?/y;:<>'"\\hneio/km,.=7890_+-''',
    ),
    # 'T9-QWERTY':('''`~1cyksuthjevwnbz23456!@#$%^''', ''',adfolximgrpq.>?;':@[]#{}~789</0-\\|=&*()_+"'''),
    'Dvorak': (
        '''aoeuipyqjkx;:.>,<'"`~123456!@#$%^''',
        '''fgcrldhtnsbmwvz-_\\/?+|[]{}7890&*()=''',
    ),
    'QWERTY': (
        '''`~qwertasdfgzxcvb123456!@#$%^''',
        '''yuiophjklnm,.>?;':@[]#{}~789</0-\\|=&*()_+"''',
    ),
    # 'MTGAP':('''`~,./?<;:>ypoujineaqz123456!@#$%^''', '''kdlcwmhtsrbfgvx'@[]#{}~7890-\\|=&*()_+"'''),
    # 'QGMLWY':('''`~qgmlwdstnrzxcvj123456!@#$%^''', '''yfubiaeohkp,.>?;':@[]#{}~789</0-\\|=&*()_+"'''),
    # 'QGMLWB':('''`~qgmlwdstnrzxcfj123456!@#$%^''', '''yvubiaeohkp,.>?;':@[]#{}~789</0-\\|=&*()_+"'''),
    'HIEAQMTSRN': (
        '''~`byou'hiea,/().-*{}:_;=123456+%#$&^''',
        '''zkdclpqmt!srnvwgf?7890jx|@<>p"\\P[]''',
    ),
}
#substring lengths to analyse: 2 to 11
lengths_range = range(2, 12)
#dict -> dict -> Same_Hand_Data :: length -> layout_name -> Same_Hand_Data for that length and layout
alternation_results = {}
#gets alternation results (Return counts as typed by the right hand)
for n in lengths_range:
    alternation_results[n] = dict(
        (layout_name, find_same_hand(content, n, layout_hands, space_hand=None, enter_hand=RIGHT))
        for layout_name, layout_hands in layouts_hands.items())
#prints one table per mode, separated by blank lines; print(...) with a
#single argument gives the same output on Python 2 and is valid on Python 3
print_table(alternation_results, mode=ALTERNATING_HAND_EXCLUSIVE, decimal_places=1)
print("")
print_table(alternation_results, mode=SAME_HAND, decimal_places=2)
print("")
print_table(alternation_results, mode=EXPECTED_SAME_HAND, decimal_places=0)
- # #gets alternation results by random sampling
- # for n in lengths_range:
- # alternation_results[n] = dict([(layout_name, find_same_hand(content, n, layout_hands, space_hand = None, enter_hand = RIGHT, random_samples=100000))
- # for layout_name, layout_hands in layouts_hands.iteritems()])
- # print_table(alternation_results, mode = ALTERNATING_HAND_EXCLUSIVE, decimal_places = 1)
- # print ""
- # print_table(alternation_results, mode = SAME_HAND, decimal_places = 2)
- # print ""
- # print_table(alternation_results, mode = EXPECTED_SAME_HAND, decimal_places = 0)
#
#prints same-hand examples
#
#lengths to get same-hand examples for.
#NOTE: needs to go from greatest to least
#in order for substrings to be filtered
examples_lengths = range(13, 4, -1)
# examples_lengths = []
#layout_name -> examples already printed for longer lengths
previous_examples = dict((layout_name, []) for layout_name in layouts_hands)
for n in examples_lengths:
    print("")
    for layout_name, layout_hands in layouts_hands.items():
        #fetches the distinct same-hand strings of length n (repeats kept)
        found = find_same_hand(content, n, layout_hands, space_hand=None, enter_hand=None, include_repeats=True).unique_examples
        #filters out any example that's a substring of a previous, longer
        #example (ignoring case); the previous examples are lower-cased once
        #per layout rather than once per candidate
        longer_examples = [prev_ex.lower() for prev_ex in previous_examples[layout_name]]
        #a real list, so that the emptiness test below also works on Python 3
        examples = [ex for ex in found
                    if not any(ex.lower() in longer for longer in longer_examples)]
        if examples:
            #prints out examples for this length and layout
            print("{0} (length {1}):".format(layout_name, n))
            for e in examples:
                print(e)
            print("")
        previous_examples[layout_name] += examples
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement