Untitled

"""
Have fun comparing various spellchecking methods
You should carefully follow the handout for this lab
so that you do things in the right order.
"""
import re
import os
from time import clock, perf_counter
import doctest

# setup a ruled line string
LINE_WIDTH = 50
SINGLE_LINE = '-' *LINE_WIDTH


def print_line():
    """ Prints a single line """
    print(SINGLE_LINE)


def _c_mul(value_a, value_b):
    '''Substitute for c multiply function'''
    return ((int(value_a) * int(value_b)) & 0xFFFFFFFF)


def nice_hash(input_string):
    '''Takes a string name and returns a hash for the string. This hash value
    will be os independent, unlike the default Python hash function.'''
    if input_string is None:
        return 0  # empty
    value = ord(input_string[0]) << 7
    for char in input_string:
        value = _c_mul(1000003, value) ^ ord(char)
    value = value ^ len(input_string)
    if value == -1:
        value = -2
    return value


#---------------------------------------------
# Start of Class definitions
#---------------------------------------------
class ChainingHashTable():
    """A simple HashTable.
    ***** IMPORTANT
    ***** =========
    ***** NOTE: These will fail initially, ie, when _hash returns 0
    ***** DON'T worry you will fix this later:)
    >>> hash_table = ChainingHashTable(5)
    >>> hash_table.store('George')
    >>> hash_table.store('Bungle')
    >>> hash_table.store('Zippy')
    >>> hash_table.store('Jane')
    >>> hash_table.store('Rod')
    >>> hash_table.store('Freddy')
    >>> print(hash_table)
    ChainingHashTable:
    slot        0 = ['Rod']
    slot        1 = ['George', 'Jane']
    slot        2 = ['Bungle']
    slot        3 = ['Zippy', 'Freddy']
    slot        4 = []
    n_slots = 5
    n_items in table = 6
    Load factor =  1.200
    Average chain length = 1.200
    <BLANKLINE>
    NOTE: ChainingHashTable print doctests will fail initially, ie, when _hash returns 0
    Don't worry you will fix this later... See handout
    <BLANKLINE>
    >>> print('Freddy' in hash_table)
    True
    >>> print('Dipsy' in hash_table)
    False
    >>> print('Joey' in hash_table)
    False
    >>> print('Jane' in hash_table)
    True
    >>> print('Bungle' in hash_table)
    True
    """

    def __init__(self, slots=11):
        """
        Creates a new hashtable with a number of slots (default: 11).
        It will help if you always make your hash table size a prime number.

        Each slot will contain a list of items that hash to the slot.
        Initially the list in each slot contains a None.
        n_items contains the number of items that have been put in the table.
        """
        self._data = [[] for i in range(slots)]
        self.n_slots = slots
        self.n_items = 0

    def _hash(self, item):
        """Computes the hash of an item.
        First calculate using nice_hash(item) and then use modulo
        to reduce it down to a number in the range 0..slef.n_slots """
        # NOTE:
        # We will use a trivial hash function here to start with
        # Don't worry, you will get to update it later in the lab...
        thing = nice_hash(item)
        output = thing % self.n_slots
        return output

    def store(self, item):
        """Appends an item to the list in the slot at _data[hash]."""
        index = self._hash(item)
        self._data[index].append(item)
        # Keep track of number of items in hash table
        self.n_items += 1

    def average_chain_length(self):
        """ Returns the average chain length """
        return self.n_items / self.n_slots

    def __str__(self):
        output = 'ChainingHashTable:\n'
        for i in range(self.n_slots):
            list_at_slot = self._data[i]
            output = output +'slot {0:8d} = '.format(i)
            if list_at_slot == None:
                output = output + 'None'
            else:
                output = output + str(list_at_slot)
            output += '\n'
        load_factor = float(self.n_items) /self.n_slots
        output += 'n_slots = {0:d}\n'.format(self.n_slots)
        output += 'n_items in table = {0:d}\n'.format(self.n_items)
        output += 'Load factor = {0:6.3f}\n'.format(load_factor)
        output += 'Average chain length = {0:.3f}\n'.format(self.average_chain_length())
        output += '\n'
        output += 'NOTE: ChainingHashTable print doctests will fail initially, ie, when _hash returns 0\n'
        output += "Don't worry you will fix this later... See handout\n"
        return output

    def __contains__(self, item):
        """
        Checks to see if an item is stored within the hashtable.
        Returns True if it does, otherwise it returns False.
        You can call this method using Python's 'in' keyword:
            ht = HashTable()
            ...
            if 'foo' in ht:
            ...
        """
        # remember self._data[index] contains a list of items that hash to
        # the slot at the given index
        # ---start student section---
        index = self._hash(item)
        if item in self._data[index]:
            return True
        else:
            return False
        # ===end student section===


#----------------------------------------------------------------------------
class LinearHashTable():
    """A simple Open Addressing HashTable with Linear Probing.
    Called simply LinearHashTable to make the name shorter...

    >>> hash_table = LinearHashTable(11)
    >>> hash_table.store('Dianna')
    >>> hash_table.store('Bobby')
    >>> hash_table.store('Dirk')
    >>> hash_table.store('Darlene')
    >>> hash_table.store('Paul')
    >>> hash_table.store('Peter')
    >>> hash_table.store('Paula')
    >>> hash_table.store('David')
    >>> hash_table.store('Zoe')
    >>> print(hash_table)
    LinearHashTable:
    slot        0 = David
    slot        1 = Darlene
    slot        2 = Paul
    slot        3 = Zoe
    slot        4 = Peter
    slot        5 = Paula
    slot        6 = Dirk
    slot        7 = -
    slot        8 = -
    slot        9 = Dianna
    slot       10 = Bobby
    n_slots = 11
    n_items in table = 9
    Load factor =  0.818
    <BLANKLINE>
    >>> 'Paul' in hash_table
    True
    >>> 'Peter' in hash_table
    True
    >>> 'Paula' in hash_table
    True
    >>> 'David' in hash_table
    True
    >>> 'Bobby' in hash_table
    True
    >>> 'Dianna' in hash_table
    True
    >>> 'Dirk' in hash_table
    True
    >>> 'Darlene' in hash_table
    True
    >>> 'Zoe' in hash_table
    True
    >>> print('Dingus' in hash_table)
    False
    """

    def __init__(self, slots=11):
        """
        Creates a new hashtable with the given number of slots.
        Remember we can't have a load factor greater than 1 for an open hash...
        """
        self._data = [None for i in range(slots)]
        self.n_slots = slots
        self.n_items = 0


    def store(self, item):
        """Stores an item in the hashtable."""
        if self.n_slots == self.n_items:
            #Argh - crash, who made the hashtable too small?
            print(self._data)
            print('Size = ' + str(self.n_slots))
            print('Slots used = ' + str(self.n_items))
            print("Whoops, hash table is full!!!!")
            print("In reality you would have made it bigger by now...")
            print("But, you're not expected to for this lab.")
            raise IndexError("Hash table is full!!!!")
        # ***********************************************************
        # ---start student section---
        index_initial = self._hash(item)
        if not self.__contains__(item):
            slot_found = False
            while slot_found == False:
                if self._data[index_initial] == None:
                    self._data[index_initial] = item
                    slot_found = True
                else:
                    index_initial += 1
                    if index_initial == 11:
                        index_initial = 1
        # ===end student section===

        # Keep track of number of items in hash table
        self.n_items += 1

    def _hash(self, item):
        """Computes the hash of an item."""
        return nice_hash(item) % self.n_slots

    def __contains__(self, item):
        """
        Checks to see if an item is stored within the hashtable.
        Returns True if it does, otherwise it returns False.
        You can call this method using Python's 'in' keyword:
            ht = HashTable()
            ...
            print('foo' in ht)   # invoked ht.__contains__
                ...
        """
        first_hash = self._hash(item)
        have_wrapped = False
        if self._data[first_hash] == item:
            return True
        else:
            current_index = first_hash
            while self._data[current_index] is not None:
                if self._data[current_index] == item:
                    # horay we found it
                    return True
                if (current_index == first_hash) and have_wrapped:
                    # back to original hash and didn't find item
                    # phew - the hashtable is full!
                    return False
                if current_index == (self.n_slots -1):
                    # wrap back to start of hash table
                    current_index = 0
                    have_wrapped = True
                else:
                    current_index += 1

    def __str__(self):
        """ Returns a string representation of the hashtable
        Along with some summary stats.
        """
        output = 'LinearHashTable:\n'
        for i in range(self.n_slots):
            output += f'slot {i:8} = '
            item = self._data[i]
            if item == None:
                output = output + '-'
            else:
                output = output + str(item)
            output += '\n'
        load_factor = float(self.n_items) / self.n_slots
        output += f'n_slots = {self.n_slots}\n'
        output += f'n_items in table = {self.n_items}\n'
        output += f'Load factor = {load_factor:6.3f}\n'
        return output

    def __repr__(self):
        """ Just uses __str__ """
        return str(self)


#----------------------------------------------------------------------------
class QuadraticHashTable():
    """Is a child class of OpenAddressHashTable.
    If a collision then uses a quadratic probing function to find a free slot
    >>> hash_table = QuadraticHashTable(11)
    >>> hash_table.store('Dianna')
    >>> hash_table.store('Bobby')
    >>> hash_table.store('Dirk')
    >>> hash_table.store('Darlene')
    >>> hash_table.store('Paul')
    >>> hash_table.store('Paula')
    >>> hash_table.store('David')
    >>> hash_table.store('Harry')
    >>> print(hash_table)
    QuadraticOpenAddressHashTable:
    slot        0 = David
    slot        1 = Darlene
    slot        2 = Paul
    slot        3 = -
    slot        4 = Paula
    slot        5 = Harry
    slot        6 = Dirk
    slot        7 = -
    slot        8 = -
    slot        9 = Dianna
    slot       10 = Bobby
    n_slots = 11
    n_items in table = 8
    Load factor =  0.727
    <BLANKLINE>
    >>> 'Harry' in hash_table
    True
    >>> 'Paul' in hash_table
    True
    >>> 'Paula' in hash_table
    True
    >>> 'David' in hash_table
    True
    >>> 'Bobby' in hash_table
    True
    >>> 'Dianna' in hash_table
    True
    >>> 'Dirk' in hash_table
    True
    >>> 'Darlene' in hash_table
    True
    >>> 'Dingus' in hash_table
    False

    """

    def __init__(self, slots=11):
        """
        Creates a new hashtable with a number of slots (default: 10).
        Remember we can't have a load factor greater than 1 for an open hash...
        """
        self._data = [None for i in range(slots)]
        self.n_slots = slots
        self.n_items = 0

    def _next_free_slot(self, first_hash):
        """
        Keeps incrementing hash index until an empty slot is found.
        Then returns the index of that slot.
        Used by the store method to help deal with a collisions.
        Note: this method isn't useful in __contains__ but you shoul
              follow a similar series of probing there.
        """
        curr_index = first_hash
        try_number = 0
        tried = []
        #print self._data
        while self._data[curr_index] is not None:
            tried.append(curr_index)
            try_number += 1
            if try_number > self.n_slots // 2:
                #print self._data
                print('Size = ' + str(self.n_slots))
                print('Number of items = ' + str(self.n_items))
                print('Failed to find an empty slot...')
                print('Try number = ' + str(try_number))
                print('List of tried slots = ' + str(tried))
                print('Current table = ' + str(self._data))
                print('*** Failed to find an empty slot!!!! ')
                print('*** This can happen with quadratic probing')
                print('*** if the table is over half full')
                raise ValueError('Failed to find an empty slot!!!!')
            else:
                curr_index = (first_hash + try_number**2) % self.n_slots
        return curr_index

    def store(self, item):
        """Stores an item in the hashtable."""
        if self.n_slots == self.n_items:
            #Argh - crash, who made the hashtable too small?
            print(self._data)
            print('Size = ' + str(self.n_slots))
            print('Slots used = ' + str(self.n_items))
            print("Hash table is full!!!! You eeediot")
            print("A good Hasher would have resized the hash table by now!")
            raise ValueError("Hash table is full!!!!")
        # **************************************************
        # ---start student section---
        pass
        # ===end student section===
        self.n_items += 1

    def _hash(self, item):
        """Computes the hash of an item."""
        return nice_hash(item) % self.n_slots

    def __contains__(self, item):
        """
        Checks to see if an item is stored within the hashtable.
        Returns True if it is, otherwise it returns False.
        Note: The function should give up and return False if
        the try_number reaches the number of slots in the table.
        At this stage we definitely know we are in a never ending
        cycle and will never find the item...
        You can call this method using Python's 'in' keyword:
            ht = HashTable()
            ...
            if 'foo' in ht:
                ...
        """
        first_hash = self._hash(item)
        try_number = 0
        current_index = first_hash
        # ---start student section---
        pass
        # ===end student section===

    def __str__(self):
        output = 'QuadraticOpenAddressHashTable:\n'
        for i in range(self.n_slots):
            output += f'slot {i:8} = '
            item = self._data[i]
            if item == None:
                output = output + '-'
            else:
                output = output + str(item)
            output += '\n'
        load_factor = float(self.n_items) /self.n_slots
        output += f'n_slots = {self.n_slots}\n'
        output += f'n_items in table = {self.n_items}\n'
        output += f'Load factor = {load_factor:6.3f}\n'
        return output

    def __repr__(self):
        return str(self)

# ==============================================


def load_dict_words(filename):
    """
    Load a dictionary from a file, and returns a list of all words in the
    dictionary.
    Dictionary files should have one word per line.
    """
    with open(filename, 'r', encoding= 'latin_1') as file:
        words = [line.strip().lower() for line in file]
    return words


def load_doc_words(filename):
    """
    Loads a document from a file, and returns a list of all words in the
    document. 'Words' are sequences of one or more [A-Za-z] characters.
    Words are converted to lowercase.
    """
    with open(filename, 'r', encoding= 'ascii') as file:
        words = [word.lower() for word in re.findall(r'[A-Za-z]+', file.read())]
    return words


#----------------------------------------------------------------------------
def spellcheck_with_list(document_word_list, dictionary_word_list, quiet_mode=False):
    """
    Checks the spelling of each word in document_word_list (a list of words from a text)
    against the dictionary_word_list (the list of correct words).
    """
    #Check all words in the document_word_list list
    #Printing out misspelled words and counting them
    if not quiet_mode:
        print_line()
        print('Misspelled words:')
        print_line()

    # ***********************************************************
    # Write your spell check code below
    dictionary_length = len(dictionary_word_list)
    document_length = len(document_word_list)
    num_errors = 0
    unique_errors = set()  # hint hint :)
    start_check_time = perf_counter()
    # start
    # ---start student section---
    pass

    # ===end student section===
    # end
    end_check_time = perf_counter()
    check_time = end_check_time - start_check_time
    ms_per_word = (check_time /len(document_word_list)) *1000
    if not quiet_mode:
        print_line()
        print(f'Number of errors = {num_errors} words')
        print(f'Number of unique errors = {len(unique_errors)} words')
        print_line()
        print()
        print_line()
        print('Summary stats (using simple linear search):')
        print_line()
        print(f'Words in dictionary = {dictionary_length} words')
        print(f'Document length = {document_length} words')
        print(f'Document check time = {check_time:8.4f}s')
        print(f'Check time per word in document = {ms_per_word:10.6f} ms')
        print_line()
        print()

    return check_time


#----------------------------------------------------------------------------
def build_hash_table(ht_type, ht_size, dictionary_word_list, quiet_mode=False):
    """ Stores all the dictionary words in a hash table of the
    given type and size.
    Returns the hash table and the time taken to build it
    """
    #Build Hash Table by adding words from the dictionary word list
    start_build_time = perf_counter()
    if ht_type == 'Chaining':
        hash_table = ChainingHashTable(ht_size)
    elif ht_type == 'Linear':
        hash_table = LinearHashTable(ht_size)
    elif ht_type == 'Quadratic':
        hash_table = QuadraticHashTable(ht_size)
    else:
        print('Hash type must be Chaining, Linear or Quadratic.')
    for word in dictionary_word_list:
        hash_table.store(word)
    end_build_time = perf_counter()
    build_time = end_build_time - start_build_time
    return hash_table, build_time


#----------------------------------------------------------------------------
def spellcheck_with_hashtable(document_word_list,
                              dictionary_word_list,
                              ht_type,
                              ht_size,
                              quiet_mode=False):
    """
    Checks the spelling of each word in 'document' (a list of words) against
    the dictionary (another list of words, using the given hash_table).

    """
    hash_table, build_time = build_hash_table(ht_type, ht_size, dictionary_word_list)

    #Check all words in the document list
    #Printing out misspelled words and counting them
    if not quiet_mode:
        print_line()
        print('Misspelled words:')
        print_line()

    # Write your spell check code below
    dictionary_length = len(dictionary_word_list)
    document_length = len(document_word_list)
    num_errors = 0
    counter_10 = 0
    unique_errors = set()  # hint hint :)
    start_check_time = perf_counter()
    # start
    # ---start student section---
    for word in document_word_list:
        if word not in dictionary_word_list:
            num_errors += 1
            unique_errors.add(word)
            if num_errors % 10 == 0:
                print("This is the {}th word: {}".format(num_errors, word))
            else:
                print(word)
    # ===end student section===
    end_check_time = perf_counter()
    # ---- end ----
    check_time = end_check_time - start_check_time
    load_factor = float(hash_table.n_items) /hash_table.n_slots
    ms_per_word = (check_time /len(document_word_list)) *1000
    if not quiet_mode:
        print_line()
        print(f'Number of errors = {num_errors} words')
        print(f'Number of unique errors = {len(unique_errors)} words')
        print_line()
        print()
        print_line()
        print('Summary stats (using ' +ht_type +' hash table):')
        print_line()
        print(f'Words in dictionary = {dictionary_length} words')
        print('Hash table size = {0:d} slots'.format(hash_table.n_slots))
        print('Hash table load factor = {0:8.4f}'.format(load_factor))
        print('Hash table build time = {0:8.4f}s'.format(build_time))
        print(f'Document length = {document_length} words')
        print(f'Document check time = {check_time:8.4f}s')
        print(f'Check time per word in document = {ms_per_word:10.6f} ms')
        print_line()
        print()
    return check_time


def binary_search(values, needle):
    lowerBound = 0
    upperBound = len(values)
    index = 0
    # ---start student section---
    pass
    # ===end student section===
    return False


# ------------------------------------------------
def spellcheck_bin(document_word_list, dictionary_word_list, quiet_mode=False):
    #Check all words in the document list
    #Printing out misspelled words and counting them
    if not quiet_mode:
        print_line()
        print('Misspelled words:')
        print_line()
    num_errors = 0

    dict_sort_start = perf_counter()
    dictionary = sorted(dictionary_word_list)
    dict_sort_end = perf_counter()

    dictionary_length = len(dictionary_word_list)
    document_length = len(document_word_list)
    unique_errors = set()  # hint hint :)
    num_errors = 0
    # ---- start -----
    start_check_time = perf_counter()
    for word in document_word_list:
        if not binary_search(dictionary, word):
            if not quiet_mode:
                print(num_errors, word)
            num_errors += 1
    end_check_time = perf_counter()
    # ---- end ----

    check_time = end_check_time - start_check_time
    ms_per_word = (check_time /len(document_word_list)) *1000
    dict_sort_time = (dict_sort_end - dict_sort_start) *1000

    if not quiet_mode:
        print_line()
        print(f'Number of errors = {num_errors} words')
        print(f'Number of unique errors = {len(unique_errors)} words')
        print_line()
        print()
        print_line()
        print('Summary stats (using Binary Search on Dictionary):')
        print_line()
        print(f'Words in dictionary = {dictionary_length} words')
        print(f'Time to sort dictionary = {dict_sort_time:.4f} ms')
        print(f'Document length = {document_length} words')
        print(f'Document check time = {check_time:8.4f}s')
        print(f'Check time per word in document = {ms_per_word:10.6f} ms')
        print_line()
        print()
    return check_time


if __name__ == '__main__':
    # you don't need to worry about what the next line does :)
    os.environ['TERM'] = 'linux'  # Suppress ^[[?1034h to keep doctests happy

    # Idividual tests:
    # doctest.run_docstring_examples(ChainingHashTable, None)
    doctest.run_docstring_examples(LinearHashTable, None)
    # doctest.run_docstring_examples(QuadraticHashTable, None)

    # run all the doctests:
    # doctest.testmod()

    # NOTE:
    # Use tester.py to do some test runs with hash tables and
    # to do some spellchecks on various files