Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- Have fun comparing various spellchecking methods
- You should carefully follow the handout for this lab
- so that you do things in the right order.
- """
- import re
- import os
- from time import clock, perf_counter
- import doctest
- # setup a ruled line string
- LINE_WIDTH = 50
- SINGLE_LINE = '-' *LINE_WIDTH
- def print_line():
- """ Prints a single line """
- print(SINGLE_LINE)
- def _c_mul(value_a, value_b):
- '''Substitute for c multiply function'''
- return ((int(value_a) * int(value_b)) & 0xFFFFFFFF)
- def nice_hash(input_string):
- '''Takes a string name and returns a hash for the string. This hash value
- will be os independent, unlike the default Python hash function.'''
- if input_string is None:
- return 0 # empty
- value = ord(input_string[0]) << 7
- for char in input_string:
- value = _c_mul(1000003, value) ^ ord(char)
- value = value ^ len(input_string)
- if value == -1:
- value = -2
- return value
- #---------------------------------------------
- # Start of Class definitions
- #---------------------------------------------
- class ChainingHashTable():
- """A simple HashTable.
- ***** IMPORTANT
- ***** =========
- ***** NOTE: These will fail initially, ie, when _hash returns 0
- ***** DON'T worry you will fix this later:)
- >>> hash_table = ChainingHashTable(5)
- >>> hash_table.store('George')
- >>> hash_table.store('Bungle')
- >>> hash_table.store('Zippy')
- >>> hash_table.store('Jane')
- >>> hash_table.store('Rod')
- >>> hash_table.store('Freddy')
- >>> print(hash_table)
- ChainingHashTable:
- slot 0 = ['Rod']
- slot 1 = ['George', 'Jane']
- slot 2 = ['Bungle']
- slot 3 = ['Zippy', 'Freddy']
- slot 4 = []
- n_slots = 5
- n_items in table = 6
- Load factor = 1.200
- Average chain length = 1.200
- <BLANKLINE>
- NOTE: ChainingHashTable print doctests will fail initially, ie, when _hash returns 0
- Don't worry you will fix this later... See handout
- <BLANKLINE>
- >>> print('Freddy' in hash_table)
- True
- >>> print('Dipsy' in hash_table)
- False
- >>> print('Joey' in hash_table)
- False
- >>> print('Jane' in hash_table)
- True
- >>> print('Bungle' in hash_table)
- True
- """
- def __init__(self, slots=11):
- """
- Creates a new hashtable with a number of slots (default: 11).
- It will help if you always make your hash table size a prime number.
- Each slot will contain a list of items that hash to the slot.
- Initially the list in each slot contains a None.
- n_items contains the number of items that have been put in the table.
- """
- self._data = [[] for i in range(slots)]
- self.n_slots = slots
- self.n_items = 0
- def _hash(self, item):
- """Computes the hash of an item.
- First calculate using nice_hash(item) and then use modulo
- to reduce it down to a number in the range 0..slef.n_slots """
- # NOTE:
- # We will use a trivial hash function here to start with
- # Don't worry, you will get to update it later in the lab...
- thing = nice_hash(item)
- output = thing % self.n_slots
- return output
- def store(self, item):
- """Appends an item to the list in the slot at _data[hash]."""
- index = self._hash(item)
- self._data[index].append(item)
- # Keep track of number of items in hash table
- self.n_items += 1
- def average_chain_length(self):
- """ Returns the average chain length """
- return self.n_items / self.n_slots
- def __str__(self):
- output = 'ChainingHashTable:\n'
- for i in range(self.n_slots):
- list_at_slot = self._data[i]
- output = output +'slot {0:8d} = '.format(i)
- if list_at_slot == None:
- output = output + 'None'
- else:
- output = output + str(list_at_slot)
- output += '\n'
- load_factor = float(self.n_items) /self.n_slots
- output += 'n_slots = {0:d}\n'.format(self.n_slots)
- output += 'n_items in table = {0:d}\n'.format(self.n_items)
- output += 'Load factor = {0:6.3f}\n'.format(load_factor)
- output += 'Average chain length = {0:.3f}\n'.format(self.average_chain_length())
- output += '\n'
- output += 'NOTE: ChainingHashTable print doctests will fail initially, ie, when _hash returns 0\n'
- output += "Don't worry you will fix this later... See handout\n"
- return output
- def __contains__(self, item):
- """
- Checks to see if an item is stored within the hashtable.
- Returns True if it does, otherwise it returns False.
- You can call this method using Python's 'in' keyword:
- ht = HashTable()
- ...
- if 'foo' in ht:
- ...
- """
- # remember self._data[index] contains a list of items that hash to
- # the slot at the given index
- # ---start student section---
- index = self._hash(item)
- if item in self._data[index]:
- return True
- else:
- return False
- # ===end student section===
- #----------------------------------------------------------------------------
- class LinearHashTable():
- """A simple Open Addressing HashTable with Linear Probing.
- Called simply LinearHashTable to make the name shorter...
- >>> hash_table = LinearHashTable(11)
- >>> hash_table.store('Dianna')
- >>> hash_table.store('Bobby')
- >>> hash_table.store('Dirk')
- >>> hash_table.store('Darlene')
- >>> hash_table.store('Paul')
- >>> hash_table.store('Peter')
- >>> hash_table.store('Paula')
- >>> hash_table.store('David')
- >>> hash_table.store('Zoe')
- >>> print(hash_table)
- LinearHashTable:
- slot 0 = David
- slot 1 = Darlene
- slot 2 = Paul
- slot 3 = Zoe
- slot 4 = Peter
- slot 5 = Paula
- slot 6 = Dirk
- slot 7 = -
- slot 8 = -
- slot 9 = Dianna
- slot 10 = Bobby
- n_slots = 11
- n_items in table = 9
- Load factor = 0.818
- <BLANKLINE>
- >>> 'Paul' in hash_table
- True
- >>> 'Peter' in hash_table
- True
- >>> 'Paula' in hash_table
- True
- >>> 'David' in hash_table
- True
- >>> 'Bobby' in hash_table
- True
- >>> 'Dianna' in hash_table
- True
- >>> 'Dirk' in hash_table
- True
- >>> 'Darlene' in hash_table
- True
- >>> 'Zoe' in hash_table
- True
- >>> print('Dingus' in hash_table)
- False
- """
- def __init__(self, slots=11):
- """
- Creates a new hashtable with the given number of slots.
- Remember we can't have a load factor greater than 1 for an open hash...
- """
- self._data = [None for i in range(slots)]
- self.n_slots = slots
- self.n_items = 0
- def store(self, item):
- """Stores an item in the hashtable."""
- if self.n_slots == self.n_items:
- #Argh - crash, who made the hashtable too small?
- print(self._data)
- print('Size = ' + str(self.n_slots))
- print('Slots used = ' + str(self.n_items))
- print("Whoops, hash table is full!!!!")
- print("In reality you would have made it bigger by now...")
- print("But, you're not expected to for this lab.")
- raise IndexError("Hash table is full!!!!")
- # ***********************************************************
- # ---start student section---
- index_initial = self._hash(item)
- if not self.__contains__(item):
- slot_found = False
- while slot_found == False:
- if self._data[index_initial] == None:
- self._data[index_initial] = item
- slot_found = True
- else:
- index_initial += 1
- if index_initial == 11:
- index_initial = 1
- # ===end student section===
- # Keep track of number of items in hash table
- self.n_items += 1
- def _hash(self, item):
- """Computes the hash of an item."""
- return nice_hash(item) % self.n_slots
- def __contains__(self, item):
- """
- Checks to see if an item is stored within the hashtable.
- Returns True if it does, otherwise it returns False.
- You can call this method using Python's 'in' keyword:
- ht = HashTable()
- ...
- print('foo' in ht) # invoked ht.__contains__
- ...
- """
- first_hash = self._hash(item)
- have_wrapped = False
- if self._data[first_hash] == item:
- return True
- else:
- current_index = first_hash
- while self._data[current_index] is not None:
- if self._data[current_index] == item:
- # horay we found it
- return True
- if (current_index == first_hash) and have_wrapped:
- # back to original hash and didn't find item
- # phew - the hashtable is full!
- return False
- if current_index == (self.n_slots -1):
- # wrap back to start of hash table
- current_index = 0
- have_wrapped = True
- else:
- current_index += 1
- def __str__(self):
- """ Returns a string representation of the hashtable
- Along with some summary stats.
- """
- output = 'LinearHashTable:\n'
- for i in range(self.n_slots):
- output += f'slot {i:8} = '
- item = self._data[i]
- if item == None:
- output = output + '-'
- else:
- output = output + str(item)
- output += '\n'
- load_factor = float(self.n_items) / self.n_slots
- output += f'n_slots = {self.n_slots}\n'
- output += f'n_items in table = {self.n_items}\n'
- output += f'Load factor = {load_factor:6.3f}\n'
- return output
- def __repr__(self):
- """ Just uses __str__ """
- return str(self)
- #----------------------------------------------------------------------------
- class QuadraticHashTable():
- """Is a child class of OpenAddressHashTable.
- If a collision then uses a quadratic probing function to find a free slot
- >>> hash_table = QuadraticHashTable(11)
- >>> hash_table.store('Dianna')
- >>> hash_table.store('Bobby')
- >>> hash_table.store('Dirk')
- >>> hash_table.store('Darlene')
- >>> hash_table.store('Paul')
- >>> hash_table.store('Paula')
- >>> hash_table.store('David')
- >>> hash_table.store('Harry')
- >>> print(hash_table)
- QuadraticOpenAddressHashTable:
- slot 0 = David
- slot 1 = Darlene
- slot 2 = Paul
- slot 3 = -
- slot 4 = Paula
- slot 5 = Harry
- slot 6 = Dirk
- slot 7 = -
- slot 8 = -
- slot 9 = Dianna
- slot 10 = Bobby
- n_slots = 11
- n_items in table = 8
- Load factor = 0.727
- <BLANKLINE>
- >>> 'Harry' in hash_table
- True
- >>> 'Paul' in hash_table
- True
- >>> 'Paula' in hash_table
- True
- >>> 'David' in hash_table
- True
- >>> 'Bobby' in hash_table
- True
- >>> 'Dianna' in hash_table
- True
- >>> 'Dirk' in hash_table
- True
- >>> 'Darlene' in hash_table
- True
- >>> 'Dingus' in hash_table
- False
- """
- def __init__(self, slots=11):
- """
- Creates a new hashtable with a number of slots (default: 10).
- Remember we can't have a load factor greater than 1 for an open hash...
- """
- self._data = [None for i in range(slots)]
- self.n_slots = slots
- self.n_items = 0
- def _next_free_slot(self, first_hash):
- """
- Keeps incrementing hash index until an empty slot is found.
- Then returns the index of that slot.
- Used by the store method to help deal with a collisions.
- Note: this method isn't useful in __contains__ but you shoul
- follow a similar series of probing there.
- """
- curr_index = first_hash
- try_number = 0
- tried = []
- #print self._data
- while self._data[curr_index] is not None:
- tried.append(curr_index)
- try_number += 1
- if try_number > self.n_slots // 2:
- #print self._data
- print('Size = ' + str(self.n_slots))
- print('Number of items = ' + str(self.n_items))
- print('Failed to find an empty slot...')
- print('Try number = ' + str(try_number))
- print('List of tried slots = ' + str(tried))
- print('Current table = ' + str(self._data))
- print('*** Failed to find an empty slot!!!! ')
- print('*** This can happen with quadratic probing')
- print('*** if the table is over half full')
- raise ValueError('Failed to find an empty slot!!!!')
- else:
- curr_index = (first_hash + try_number**2) % self.n_slots
- return curr_index
- def store(self, item):
- """Stores an item in the hashtable."""
- if self.n_slots == self.n_items:
- #Argh - crash, who made the hashtable too small?
- print(self._data)
- print('Size = ' + str(self.n_slots))
- print('Slots used = ' + str(self.n_items))
- print("Hash table is full!!!! You eeediot")
- print("A good Hasher would have resized the hash table by now!")
- raise ValueError("Hash table is full!!!!")
- # **************************************************
- # ---start student section---
- pass
- # ===end student section===
- self.n_items += 1
- def _hash(self, item):
- """Computes the hash of an item."""
- return nice_hash(item) % self.n_slots
- def __contains__(self, item):
- """
- Checks to see if an item is stored within the hashtable.
- Returns True if it is, otherwise it returns False.
- Note: The function should give up and return False if
- the try_number reaches the number of slots in the table.
- At this stage we definitely know we are in a never ending
- cycle and will never find the item...
- You can call this method using Python's 'in' keyword:
- ht = HashTable()
- ...
- if 'foo' in ht:
- ...
- """
- first_hash = self._hash(item)
- try_number = 0
- current_index = first_hash
- # ---start student section---
- pass
- # ===end student section===
- def __str__(self):
- output = 'QuadraticOpenAddressHashTable:\n'
- for i in range(self.n_slots):
- output += f'slot {i:8} = '
- item = self._data[i]
- if item == None:
- output = output + '-'
- else:
- output = output + str(item)
- output += '\n'
- load_factor = float(self.n_items) /self.n_slots
- output += f'n_slots = {self.n_slots}\n'
- output += f'n_items in table = {self.n_items}\n'
- output += f'Load factor = {load_factor:6.3f}\n'
- return output
- def __repr__(self):
- return str(self)
- # ==============================================
- def load_dict_words(filename):
- """
- Load a dictionary from a file, and returns a list of all words in the
- dictionary.
- Dictionary files should have one word per line.
- """
- with open(filename, 'r', encoding= 'latin_1') as file:
- words = [line.strip().lower() for line in file]
- return words
- def load_doc_words(filename):
- """
- Loads a document from a file, and returns a list of all words in the
- document. 'Words' are sequences of one or more [A-Za-z] characters.
- Words are converted to lowercase.
- """
- with open(filename, 'r', encoding= 'ascii') as file:
- words = [word.lower() for word in re.findall(r'[A-Za-z]+', file.read())]
- return words
- #----------------------------------------------------------------------------
- def spellcheck_with_list(document_word_list, dictionary_word_list, quiet_mode=False):
- """
- Checks the spelling of each word in document_word_list (a list of words from a text)
- against the dictionary_word_list (the list of correct words).
- """
- #Check all words in the document_word_list list
- #Printing out misspelled words and counting them
- if not quiet_mode:
- print_line()
- print('Misspelled words:')
- print_line()
- # ***********************************************************
- # Write your spell check code below
- dictionary_length = len(dictionary_word_list)
- document_length = len(document_word_list)
- num_errors = 0
- unique_errors = set() # hint hint :)
- start_check_time = perf_counter()
- # start
- # ---start student section---
- pass
- # ===end student section===
- # end
- end_check_time = perf_counter()
- check_time = end_check_time - start_check_time
- ms_per_word = (check_time /len(document_word_list)) *1000
- if not quiet_mode:
- print_line()
- print(f'Number of errors = {num_errors} words')
- print(f'Number of unique errors = {len(unique_errors)} words')
- print_line()
- print()
- print_line()
- print('Summary stats (using simple linear search):')
- print_line()
- print(f'Words in dictionary = {dictionary_length} words')
- print(f'Document length = {document_length} words')
- print(f'Document check time = {check_time:8.4f}s')
- print(f'Check time per word in document = {ms_per_word:10.6f} ms')
- print_line()
- print()
- return check_time
- #----------------------------------------------------------------------------
- def build_hash_table(ht_type, ht_size, dictionary_word_list, quiet_mode=False):
- """ Stores all the dictionary words in a hash table of the
- given type and size.
- Returns the hash table and the time taken to build it
- """
- #Build Hash Table by adding words from the dictionary word list
- start_build_time = perf_counter()
- if ht_type == 'Chaining':
- hash_table = ChainingHashTable(ht_size)
- elif ht_type == 'Linear':
- hash_table = LinearHashTable(ht_size)
- elif ht_type == 'Quadratic':
- hash_table = QuadraticHashTable(ht_size)
- else:
- print('Hash type must be Chaining, Linear or Quadratic.')
- for word in dictionary_word_list:
- hash_table.store(word)
- end_build_time = perf_counter()
- build_time = end_build_time - start_build_time
- return hash_table, build_time
- #----------------------------------------------------------------------------
- def spellcheck_with_hashtable(document_word_list,
- dictionary_word_list,
- ht_type,
- ht_size,
- quiet_mode=False):
- """
- Checks the spelling of each word in 'document' (a list of words) against
- the dictionary (another list of words, using the given hash_table).
- """
- hash_table, build_time = build_hash_table(ht_type, ht_size, dictionary_word_list)
- #Check all words in the document list
- #Printing out misspelled words and counting them
- if not quiet_mode:
- print_line()
- print('Misspelled words:')
- print_line()
- # Write your spell check code below
- dictionary_length = len(dictionary_word_list)
- document_length = len(document_word_list)
- num_errors = 0
- counter_10 = 0
- unique_errors = set() # hint hint :)
- start_check_time = perf_counter()
- # start
- # ---start student section---
- for word in document_word_list:
- if word not in dictionary_word_list:
- num_errors += 1
- unique_errors.add(word)
- if num_errors % 10 == 0:
- print("This is the {}th word: {}".format(num_errors, word))
- else:
- print(word)
- # ===end student section===
- end_check_time = perf_counter()
- # ---- end ----
- check_time = end_check_time - start_check_time
- load_factor = float(hash_table.n_items) /hash_table.n_slots
- ms_per_word = (check_time /len(document_word_list)) *1000
- if not quiet_mode:
- print_line()
- print(f'Number of errors = {num_errors} words')
- print(f'Number of unique errors = {len(unique_errors)} words')
- print_line()
- print()
- print_line()
- print('Summary stats (using ' +ht_type +' hash table):')
- print_line()
- print(f'Words in dictionary = {dictionary_length} words')
- print('Hash table size = {0:d} slots'.format(hash_table.n_slots))
- print('Hash table load factor = {0:8.4f}'.format(load_factor))
- print('Hash table build time = {0:8.4f}s'.format(build_time))
- print(f'Document length = {document_length} words')
- print(f'Document check time = {check_time:8.4f}s')
- print(f'Check time per word in document = {ms_per_word:10.6f} ms')
- print_line()
- print()
- return check_time
- def binary_search(values, needle):
- lowerBound = 0
- upperBound = len(values)
- index = 0
- # ---start student section---
- pass
- # ===end student section===
- return False
- # ------------------------------------------------
- def spellcheck_bin(document_word_list, dictionary_word_list, quiet_mode=False):
- #Check all words in the document list
- #Printing out misspelled words and counting them
- if not quiet_mode:
- print_line()
- print('Misspelled words:')
- print_line()
- num_errors = 0
- dict_sort_start = perf_counter()
- dictionary = sorted(dictionary_word_list)
- dict_sort_end = perf_counter()
- dictionary_length = len(dictionary_word_list)
- document_length = len(document_word_list)
- unique_errors = set() # hint hint :)
- num_errors = 0
- # ---- start -----
- start_check_time = perf_counter()
- for word in document_word_list:
- if not binary_search(dictionary, word):
- if not quiet_mode:
- print(num_errors, word)
- num_errors += 1
- end_check_time = perf_counter()
- # ---- end ----
- check_time = end_check_time - start_check_time
- ms_per_word = (check_time /len(document_word_list)) *1000
- dict_sort_time = (dict_sort_end - dict_sort_start) *1000
- if not quiet_mode:
- print_line()
- print(f'Number of errors = {num_errors} words')
- print(f'Number of unique errors = {len(unique_errors)} words')
- print_line()
- print()
- print_line()
- print('Summary stats (using Binary Search on Dictionary):')
- print_line()
- print(f'Words in dictionary = {dictionary_length} words')
- print(f'Time to sort dictionary = {dict_sort_time:.4f} ms')
- print(f'Document length = {document_length} words')
- print(f'Document check time = {check_time:8.4f}s')
- print(f'Check time per word in document = {ms_per_word:10.6f} ms')
- print_line()
- print()
- return check_time
- if __name__ == '__main__':
- # you don't need to worry about what the next line does :)
- os.environ['TERM'] = 'linux' # Suppress ^[[?1034h to keep doctests happy
- # Idividual tests:
- # doctest.run_docstring_examples(ChainingHashTable, None)
- doctest.run_docstring_examples(LinearHashTable, None)
- # doctest.run_docstring_examples(QuadraticHashTable, None)
- # run all the doctests:
- # doctest.testmod()
- # NOTE:
- # Use tester.py to do some test runs with hash tables and
- # to do some spellchecks on various files
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement