Advertisement
Guest User

Untitled

a guest
Aug 26th, 2019
382
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 23.99 KB | None | 0 0
  1. """
  2. Have fun comparing various spellchecking methods
  3. You should carefully follow the handout for this lab
  4. so that you do things in the right order.
  5. """
  6. import re
  7. import os
  8. from time import clock, perf_counter
  9. import doctest
  10.  
  11. # setup a ruled line string
  12. LINE_WIDTH = 50
  13. SINGLE_LINE = '-' *LINE_WIDTH
  14.  
  15.  
  16. def print_line():
  17. """ Prints a single line """
  18. print(SINGLE_LINE)
  19.  
  20.  
  21. def _c_mul(value_a, value_b):
  22. '''Substitute for c multiply function'''
  23. return ((int(value_a) * int(value_b)) & 0xFFFFFFFF)
  24.  
  25.  
  26. def nice_hash(input_string):
  27. '''Takes a string name and returns a hash for the string. This hash value
  28. will be os independent, unlike the default Python hash function.'''
  29. if input_string is None:
  30. return 0 # empty
  31. value = ord(input_string[0]) << 7
  32. for char in input_string:
  33. value = _c_mul(1000003, value) ^ ord(char)
  34. value = value ^ len(input_string)
  35. if value == -1:
  36. value = -2
  37. return value
  38.  
  39.  
  40. #---------------------------------------------
  41. # Start of Class definitions
  42. #---------------------------------------------
  43. class ChainingHashTable():
  44. """A simple HashTable.
  45. ***** IMPORTANT
  46. ***** =========
  47. ***** NOTE: These will fail initially, ie, when _hash returns 0
  48. ***** DON'T worry you will fix this later:)
  49. >>> hash_table = ChainingHashTable(5)
  50. >>> hash_table.store('George')
  51. >>> hash_table.store('Bungle')
  52. >>> hash_table.store('Zippy')
  53. >>> hash_table.store('Jane')
  54. >>> hash_table.store('Rod')
  55. >>> hash_table.store('Freddy')
  56. >>> print(hash_table)
  57. ChainingHashTable:
  58. slot 0 = ['Rod']
  59. slot 1 = ['George', 'Jane']
  60. slot 2 = ['Bungle']
  61. slot 3 = ['Zippy', 'Freddy']
  62. slot 4 = []
  63. n_slots = 5
  64. n_items in table = 6
  65. Load factor = 1.200
  66. Average chain length = 1.200
  67. <BLANKLINE>
  68. NOTE: ChainingHashTable print doctests will fail initially, ie, when _hash returns 0
  69. Don't worry you will fix this later... See handout
  70. <BLANKLINE>
  71. >>> print('Freddy' in hash_table)
  72. True
  73. >>> print('Dipsy' in hash_table)
  74. False
  75. >>> print('Joey' in hash_table)
  76. False
  77. >>> print('Jane' in hash_table)
  78. True
  79. >>> print('Bungle' in hash_table)
  80. True
  81. """
  82.  
  83. def __init__(self, slots=11):
  84. """
  85. Creates a new hashtable with a number of slots (default: 11).
  86. It will help if you always make your hash table size a prime number.
  87.  
  88. Each slot will contain a list of items that hash to the slot.
  89. Initially the list in each slot contains a None.
  90. n_items contains the number of items that have been put in the table.
  91. """
  92. self._data = [[] for i in range(slots)]
  93. self.n_slots = slots
  94. self.n_items = 0
  95.  
  96. def _hash(self, item):
  97. """Computes the hash of an item.
  98. First calculate using nice_hash(item) and then use modulo
  99. to reduce it down to a number in the range 0..slef.n_slots """
  100. # NOTE:
  101. # We will use a trivial hash function here to start with
  102. # Don't worry, you will get to update it later in the lab...
  103. thing = nice_hash(item)
  104. output = thing % self.n_slots
  105. return output
  106.  
  107. def store(self, item):
  108. """Appends an item to the list in the slot at _data[hash]."""
  109. index = self._hash(item)
  110. self._data[index].append(item)
  111. # Keep track of number of items in hash table
  112. self.n_items += 1
  113.  
  114. def average_chain_length(self):
  115. """ Returns the average chain length """
  116. return self.n_items / self.n_slots
  117.  
  118. def __str__(self):
  119. output = 'ChainingHashTable:\n'
  120. for i in range(self.n_slots):
  121. list_at_slot = self._data[i]
  122. output = output +'slot {0:8d} = '.format(i)
  123. if list_at_slot == None:
  124. output = output + 'None'
  125. else:
  126. output = output + str(list_at_slot)
  127. output += '\n'
  128. load_factor = float(self.n_items) /self.n_slots
  129. output += 'n_slots = {0:d}\n'.format(self.n_slots)
  130. output += 'n_items in table = {0:d}\n'.format(self.n_items)
  131. output += 'Load factor = {0:6.3f}\n'.format(load_factor)
  132. output += 'Average chain length = {0:.3f}\n'.format(self.average_chain_length())
  133. output += '\n'
  134. output += 'NOTE: ChainingHashTable print doctests will fail initially, ie, when _hash returns 0\n'
  135. output += "Don't worry you will fix this later... See handout\n"
  136. return output
  137.  
  138. def __contains__(self, item):
  139. """
  140. Checks to see if an item is stored within the hashtable.
  141. Returns True if it does, otherwise it returns False.
  142. You can call this method using Python's 'in' keyword:
  143. ht = HashTable()
  144. ...
  145. if 'foo' in ht:
  146. ...
  147. """
  148. # remember self._data[index] contains a list of items that hash to
  149. # the slot at the given index
  150. # ---start student section---
  151. index = self._hash(item)
  152. if item in self._data[index]:
  153. return True
  154. else:
  155. return False
  156. # ===end student section===
  157.  
  158.  
  159. #----------------------------------------------------------------------------
  160. class LinearHashTable():
  161. """A simple Open Addressing HashTable with Linear Probing.
  162. Called simply LinearHashTable to make the name shorter...
  163.  
  164. >>> hash_table = LinearHashTable(11)
  165. >>> hash_table.store('Dianna')
  166. >>> hash_table.store('Bobby')
  167. >>> hash_table.store('Dirk')
  168. >>> hash_table.store('Darlene')
  169. >>> hash_table.store('Paul')
  170. >>> hash_table.store('Peter')
  171. >>> hash_table.store('Paula')
  172. >>> hash_table.store('David')
  173. >>> hash_table.store('Zoe')
  174. >>> print(hash_table)
  175. LinearHashTable:
  176. slot 0 = David
  177. slot 1 = Darlene
  178. slot 2 = Paul
  179. slot 3 = Zoe
  180. slot 4 = Peter
  181. slot 5 = Paula
  182. slot 6 = Dirk
  183. slot 7 = -
  184. slot 8 = -
  185. slot 9 = Dianna
  186. slot 10 = Bobby
  187. n_slots = 11
  188. n_items in table = 9
  189. Load factor = 0.818
  190. <BLANKLINE>
  191. >>> 'Paul' in hash_table
  192. True
  193. >>> 'Peter' in hash_table
  194. True
  195. >>> 'Paula' in hash_table
  196. True
  197. >>> 'David' in hash_table
  198. True
  199. >>> 'Bobby' in hash_table
  200. True
  201. >>> 'Dianna' in hash_table
  202. True
  203. >>> 'Dirk' in hash_table
  204. True
  205. >>> 'Darlene' in hash_table
  206. True
  207. >>> 'Zoe' in hash_table
  208. True
  209. >>> print('Dingus' in hash_table)
  210. False
  211. """
  212.  
  213. def __init__(self, slots=11):
  214. """
  215. Creates a new hashtable with the given number of slots.
  216. Remember we can't have a load factor greater than 1 for an open hash...
  217. """
  218. self._data = [None for i in range(slots)]
  219. self.n_slots = slots
  220. self.n_items = 0
  221.  
  222.  
  223. def store(self, item):
  224. """Stores an item in the hashtable."""
  225. if self.n_slots == self.n_items:
  226. #Argh - crash, who made the hashtable too small?
  227. print(self._data)
  228. print('Size = ' + str(self.n_slots))
  229. print('Slots used = ' + str(self.n_items))
  230. print("Whoops, hash table is full!!!!")
  231. print("In reality you would have made it bigger by now...")
  232. print("But, you're not expected to for this lab.")
  233. raise IndexError("Hash table is full!!!!")
  234. # ***********************************************************
  235. # ---start student section---
  236. index_initial = self._hash(item)
  237. if not self.__contains__(item):
  238. slot_found = False
  239. while slot_found == False:
  240. if self._data[index_initial] == None:
  241. self._data[index_initial] = item
  242. slot_found = True
  243. else:
  244. index_initial += 1
  245. if index_initial == 11:
  246. index_initial = 1
  247. # ===end student section===
  248.  
  249. # Keep track of number of items in hash table
  250. self.n_items += 1
  251.  
  252. def _hash(self, item):
  253. """Computes the hash of an item."""
  254. return nice_hash(item) % self.n_slots
  255.  
  256. def __contains__(self, item):
  257. """
  258. Checks to see if an item is stored within the hashtable.
  259. Returns True if it does, otherwise it returns False.
  260. You can call this method using Python's 'in' keyword:
  261. ht = HashTable()
  262. ...
  263. print('foo' in ht) # invoked ht.__contains__
  264. ...
  265. """
  266. first_hash = self._hash(item)
  267. have_wrapped = False
  268. if self._data[first_hash] == item:
  269. return True
  270. else:
  271. current_index = first_hash
  272. while self._data[current_index] is not None:
  273. if self._data[current_index] == item:
  274. # horay we found it
  275. return True
  276. if (current_index == first_hash) and have_wrapped:
  277. # back to original hash and didn't find item
  278. # phew - the hashtable is full!
  279. return False
  280. if current_index == (self.n_slots -1):
  281. # wrap back to start of hash table
  282. current_index = 0
  283. have_wrapped = True
  284. else:
  285. current_index += 1
  286.  
  287. def __str__(self):
  288. """ Returns a string representation of the hashtable
  289. Along with some summary stats.
  290. """
  291. output = 'LinearHashTable:\n'
  292. for i in range(self.n_slots):
  293. output += f'slot {i:8} = '
  294. item = self._data[i]
  295. if item == None:
  296. output = output + '-'
  297. else:
  298. output = output + str(item)
  299. output += '\n'
  300. load_factor = float(self.n_items) / self.n_slots
  301. output += f'n_slots = {self.n_slots}\n'
  302. output += f'n_items in table = {self.n_items}\n'
  303. output += f'Load factor = {load_factor:6.3f}\n'
  304. return output
  305.  
  306. def __repr__(self):
  307. """ Just uses __str__ """
  308. return str(self)
  309.  
  310.  
  311. #----------------------------------------------------------------------------
  312. class QuadraticHashTable():
  313. """Is a child class of OpenAddressHashTable.
  314. If a collision then uses a quadratic probing function to find a free slot
  315. >>> hash_table = QuadraticHashTable(11)
  316. >>> hash_table.store('Dianna')
  317. >>> hash_table.store('Bobby')
  318. >>> hash_table.store('Dirk')
  319. >>> hash_table.store('Darlene')
  320. >>> hash_table.store('Paul')
  321. >>> hash_table.store('Paula')
  322. >>> hash_table.store('David')
  323. >>> hash_table.store('Harry')
  324. >>> print(hash_table)
  325. QuadraticOpenAddressHashTable:
  326. slot 0 = David
  327. slot 1 = Darlene
  328. slot 2 = Paul
  329. slot 3 = -
  330. slot 4 = Paula
  331. slot 5 = Harry
  332. slot 6 = Dirk
  333. slot 7 = -
  334. slot 8 = -
  335. slot 9 = Dianna
  336. slot 10 = Bobby
  337. n_slots = 11
  338. n_items in table = 8
  339. Load factor = 0.727
  340. <BLANKLINE>
  341. >>> 'Harry' in hash_table
  342. True
  343. >>> 'Paul' in hash_table
  344. True
  345. >>> 'Paula' in hash_table
  346. True
  347. >>> 'David' in hash_table
  348. True
  349. >>> 'Bobby' in hash_table
  350. True
  351. >>> 'Dianna' in hash_table
  352. True
  353. >>> 'Dirk' in hash_table
  354. True
  355. >>> 'Darlene' in hash_table
  356. True
  357. >>> 'Dingus' in hash_table
  358. False
  359.  
  360. """
  361.  
  362. def __init__(self, slots=11):
  363. """
  364. Creates a new hashtable with a number of slots (default: 10).
  365. Remember we can't have a load factor greater than 1 for an open hash...
  366. """
  367. self._data = [None for i in range(slots)]
  368. self.n_slots = slots
  369. self.n_items = 0
  370.  
  371. def _next_free_slot(self, first_hash):
  372. """
  373. Keeps incrementing hash index until an empty slot is found.
  374. Then returns the index of that slot.
  375. Used by the store method to help deal with a collisions.
  376. Note: this method isn't useful in __contains__ but you shoul
  377. follow a similar series of probing there.
  378. """
  379. curr_index = first_hash
  380. try_number = 0
  381. tried = []
  382. #print self._data
  383. while self._data[curr_index] is not None:
  384. tried.append(curr_index)
  385. try_number += 1
  386. if try_number > self.n_slots // 2:
  387. #print self._data
  388. print('Size = ' + str(self.n_slots))
  389. print('Number of items = ' + str(self.n_items))
  390. print('Failed to find an empty slot...')
  391. print('Try number = ' + str(try_number))
  392. print('List of tried slots = ' + str(tried))
  393. print('Current table = ' + str(self._data))
  394. print('*** Failed to find an empty slot!!!! ')
  395. print('*** This can happen with quadratic probing')
  396. print('*** if the table is over half full')
  397. raise ValueError('Failed to find an empty slot!!!!')
  398. else:
  399. curr_index = (first_hash + try_number**2) % self.n_slots
  400. return curr_index
  401.  
  402. def store(self, item):
  403. """Stores an item in the hashtable."""
  404. if self.n_slots == self.n_items:
  405. #Argh - crash, who made the hashtable too small?
  406. print(self._data)
  407. print('Size = ' + str(self.n_slots))
  408. print('Slots used = ' + str(self.n_items))
  409. print("Hash table is full!!!! You eeediot")
  410. print("A good Hasher would have resized the hash table by now!")
  411. raise ValueError("Hash table is full!!!!")
  412. # **************************************************
  413. # ---start student section---
  414. pass
  415. # ===end student section===
  416. self.n_items += 1
  417.  
  418. def _hash(self, item):
  419. """Computes the hash of an item."""
  420. return nice_hash(item) % self.n_slots
  421.  
  422. def __contains__(self, item):
  423. """
  424. Checks to see if an item is stored within the hashtable.
  425. Returns True if it is, otherwise it returns False.
  426. Note: The function should give up and return False if
  427. the try_number reaches the number of slots in the table.
  428. At this stage we definitely know we are in a never ending
  429. cycle and will never find the item...
  430. You can call this method using Python's 'in' keyword:
  431. ht = HashTable()
  432. ...
  433. if 'foo' in ht:
  434. ...
  435. """
  436. first_hash = self._hash(item)
  437. try_number = 0
  438. current_index = first_hash
  439. # ---start student section---
  440. pass
  441. # ===end student section===
  442.  
  443. def __str__(self):
  444. output = 'QuadraticOpenAddressHashTable:\n'
  445. for i in range(self.n_slots):
  446. output += f'slot {i:8} = '
  447. item = self._data[i]
  448. if item == None:
  449. output = output + '-'
  450. else:
  451. output = output + str(item)
  452. output += '\n'
  453. load_factor = float(self.n_items) /self.n_slots
  454. output += f'n_slots = {self.n_slots}\n'
  455. output += f'n_items in table = {self.n_items}\n'
  456. output += f'Load factor = {load_factor:6.3f}\n'
  457. return output
  458.  
  459. def __repr__(self):
  460. return str(self)
  461.  
  462. # ==============================================
  463.  
  464.  
  465. def load_dict_words(filename):
  466. """
  467. Load a dictionary from a file, and returns a list of all words in the
  468. dictionary.
  469. Dictionary files should have one word per line.
  470. """
  471. with open(filename, 'r', encoding= 'latin_1') as file:
  472. words = [line.strip().lower() for line in file]
  473. return words
  474.  
  475.  
  476. def load_doc_words(filename):
  477. """
  478. Loads a document from a file, and returns a list of all words in the
  479. document. 'Words' are sequences of one or more [A-Za-z] characters.
  480. Words are converted to lowercase.
  481. """
  482. with open(filename, 'r', encoding= 'ascii') as file:
  483. words = [word.lower() for word in re.findall(r'[A-Za-z]+', file.read())]
  484. return words
  485.  
  486.  
  487. #----------------------------------------------------------------------------
  488. def spellcheck_with_list(document_word_list, dictionary_word_list, quiet_mode=False):
  489. """
  490. Checks the spelling of each word in document_word_list (a list of words from a text)
  491. against the dictionary_word_list (the list of correct words).
  492. """
  493. #Check all words in the document_word_list list
  494. #Printing out misspelled words and counting them
  495. if not quiet_mode:
  496. print_line()
  497. print('Misspelled words:')
  498. print_line()
  499.  
  500. # ***********************************************************
  501. # Write your spell check code below
  502. dictionary_length = len(dictionary_word_list)
  503. document_length = len(document_word_list)
  504. num_errors = 0
  505. unique_errors = set() # hint hint :)
  506. start_check_time = perf_counter()
  507. # start
  508. # ---start student section---
  509. pass
  510.  
  511. # ===end student section===
  512. # end
  513. end_check_time = perf_counter()
  514. check_time = end_check_time - start_check_time
  515. ms_per_word = (check_time /len(document_word_list)) *1000
  516. if not quiet_mode:
  517. print_line()
  518. print(f'Number of errors = {num_errors} words')
  519. print(f'Number of unique errors = {len(unique_errors)} words')
  520. print_line()
  521. print()
  522. print_line()
  523. print('Summary stats (using simple linear search):')
  524. print_line()
  525. print(f'Words in dictionary = {dictionary_length} words')
  526. print(f'Document length = {document_length} words')
  527. print(f'Document check time = {check_time:8.4f}s')
  528. print(f'Check time per word in document = {ms_per_word:10.6f} ms')
  529. print_line()
  530. print()
  531.  
  532. return check_time
  533.  
  534.  
  535. #----------------------------------------------------------------------------
  536. def build_hash_table(ht_type, ht_size, dictionary_word_list, quiet_mode=False):
  537. """ Stores all the dictionary words in a hash table of the
  538. given type and size.
  539. Returns the hash table and the time taken to build it
  540. """
  541. #Build Hash Table by adding words from the dictionary word list
  542. start_build_time = perf_counter()
  543. if ht_type == 'Chaining':
  544. hash_table = ChainingHashTable(ht_size)
  545. elif ht_type == 'Linear':
  546. hash_table = LinearHashTable(ht_size)
  547. elif ht_type == 'Quadratic':
  548. hash_table = QuadraticHashTable(ht_size)
  549. else:
  550. print('Hash type must be Chaining, Linear or Quadratic.')
  551. for word in dictionary_word_list:
  552. hash_table.store(word)
  553. end_build_time = perf_counter()
  554. build_time = end_build_time - start_build_time
  555. return hash_table, build_time
  556.  
  557.  
  558. #----------------------------------------------------------------------------
  559. def spellcheck_with_hashtable(document_word_list,
  560. dictionary_word_list,
  561. ht_type,
  562. ht_size,
  563. quiet_mode=False):
  564. """
  565. Checks the spelling of each word in 'document' (a list of words) against
  566. the dictionary (another list of words, using the given hash_table).
  567.  
  568. """
  569. hash_table, build_time = build_hash_table(ht_type, ht_size, dictionary_word_list)
  570.  
  571. #Check all words in the document list
  572. #Printing out misspelled words and counting them
  573. if not quiet_mode:
  574. print_line()
  575. print('Misspelled words:')
  576. print_line()
  577.  
  578. # Write your spell check code below
  579. dictionary_length = len(dictionary_word_list)
  580. document_length = len(document_word_list)
  581. num_errors = 0
  582. counter_10 = 0
  583. unique_errors = set() # hint hint :)
  584. start_check_time = perf_counter()
  585. # start
  586. # ---start student section---
  587. for word in document_word_list:
  588. if word not in dictionary_word_list:
  589. num_errors += 1
  590. unique_errors.add(word)
  591. if num_errors % 10 == 0:
  592. print("This is the {}th word: {}".format(num_errors, word))
  593. else:
  594. print(word)
  595. # ===end student section===
  596. end_check_time = perf_counter()
  597. # ---- end ----
  598. check_time = end_check_time - start_check_time
  599. load_factor = float(hash_table.n_items) /hash_table.n_slots
  600. ms_per_word = (check_time /len(document_word_list)) *1000
  601. if not quiet_mode:
  602. print_line()
  603. print(f'Number of errors = {num_errors} words')
  604. print(f'Number of unique errors = {len(unique_errors)} words')
  605. print_line()
  606. print()
  607. print_line()
  608. print('Summary stats (using ' +ht_type +' hash table):')
  609. print_line()
  610. print(f'Words in dictionary = {dictionary_length} words')
  611. print('Hash table size = {0:d} slots'.format(hash_table.n_slots))
  612. print('Hash table load factor = {0:8.4f}'.format(load_factor))
  613. print('Hash table build time = {0:8.4f}s'.format(build_time))
  614. print(f'Document length = {document_length} words')
  615. print(f'Document check time = {check_time:8.4f}s')
  616. print(f'Check time per word in document = {ms_per_word:10.6f} ms')
  617. print_line()
  618. print()
  619. return check_time
  620.  
  621.  
  622. def binary_search(values, needle):
  623. lowerBound = 0
  624. upperBound = len(values)
  625. index = 0
  626. # ---start student section---
  627. pass
  628. # ===end student section===
  629. return False
  630.  
  631.  
  632. # ------------------------------------------------
  633. def spellcheck_bin(document_word_list, dictionary_word_list, quiet_mode=False):
  634. #Check all words in the document list
  635. #Printing out misspelled words and counting them
  636. if not quiet_mode:
  637. print_line()
  638. print('Misspelled words:')
  639. print_line()
  640. num_errors = 0
  641.  
  642. dict_sort_start = perf_counter()
  643. dictionary = sorted(dictionary_word_list)
  644. dict_sort_end = perf_counter()
  645.  
  646. dictionary_length = len(dictionary_word_list)
  647. document_length = len(document_word_list)
  648. unique_errors = set() # hint hint :)
  649. num_errors = 0
  650. # ---- start -----
  651. start_check_time = perf_counter()
  652. for word in document_word_list:
  653. if not binary_search(dictionary, word):
  654. if not quiet_mode:
  655. print(num_errors, word)
  656. num_errors += 1
  657. end_check_time = perf_counter()
  658. # ---- end ----
  659.  
  660. check_time = end_check_time - start_check_time
  661. ms_per_word = (check_time /len(document_word_list)) *1000
  662. dict_sort_time = (dict_sort_end - dict_sort_start) *1000
  663.  
  664. if not quiet_mode:
  665. print_line()
  666. print(f'Number of errors = {num_errors} words')
  667. print(f'Number of unique errors = {len(unique_errors)} words')
  668. print_line()
  669. print()
  670. print_line()
  671. print('Summary stats (using Binary Search on Dictionary):')
  672. print_line()
  673. print(f'Words in dictionary = {dictionary_length} words')
  674. print(f'Time to sort dictionary = {dict_sort_time:.4f} ms')
  675. print(f'Document length = {document_length} words')
  676. print(f'Document check time = {check_time:8.4f}s')
  677. print(f'Check time per word in document = {ms_per_word:10.6f} ms')
  678. print_line()
  679. print()
  680. return check_time
  681.  
  682.  
  683. if __name__ == '__main__':
  684. # you don't need to worry about what the next line does :)
  685. os.environ['TERM'] = 'linux' # Suppress ^[[?1034h to keep doctests happy
  686.  
  687. # Idividual tests:
  688. # doctest.run_docstring_examples(ChainingHashTable, None)
  689. doctest.run_docstring_examples(LinearHashTable, None)
  690. # doctest.run_docstring_examples(QuadraticHashTable, None)
  691.  
  692. # run all the doctests:
  693. # doctest.testmod()
  694.  
  695. # NOTE:
  696. # Use tester.py to do some test runs with hash tables and
  697. # to do some spellchecks on various files
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement