Advertisement
Guest User

Untitled

a guest
Dec 12th, 2018
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.84 KB | None | 0 0
  1.  
  2. from pathlib import Path
  3. import os
  4. import re
  5.  
  6.  
  7. class Markov:
  8. def __init__(self,filenames):
  9. self.__sources = filenames
  10. self.tree = dict()
  11.  
  12.  
  13.  
  14. # Section from assignment 5
  15. def clean_line(self, line):
  16. # YOUR CODE HERE
  17. # Returning line without punctuation and numeric Symbols
  18. line = re.sub("'", '', line)
  19. line = re.sub("-", '', line)
  20. line = re.sub("/", '', line)
  21. # line = re.sub("\", '', line) ger något fel
  22. line = re.sub('[^A-Za-z.,]+', ' ', line)
  23. line = line.split()
  24.  
  25. #print(line)
  26. #return []
  27. return line
  28.  
  29.  
  30. def text_gen(self):
  31. for fname in self.__sources:
  32. with open(fname, encoding='utf8', errors='ignore') as f:
  33. for line in f:
  34.  
  35. yield self.clean_line(line)
  36.  
  37.  
  38.  
  39. def build_tree(self):
  40. """
  41. Build vocabulary of words from the provided text files
  42. """
  43. print("Du lyckas köra build_vocabulary")
  44. # YOUR CODE HERE
  45.  
  46. # Go through all words and if they don't exist in self.__vocab = set() (are unique)
  47. # Add to self.__vocab = set()
  48. word_prev=""
  49. count=0
  50. factor=1
  51. bool=False
  52. for line in self.text_gen():
  53. for a in line:
  54. a=a.lower()
  55. if bool:
  56. #print("a: ",word_prev," b: ",word )
  57. #count+=1
  58. if b not in self.tree:
  59. count+=1
  60. print(a," count: ", count)
  61. self.tree[b] = dict()
  62. #self.tree[a][b] = factor if b not in self.tree[a] else self.tree[a][b] + self.tree[a][b] * factor
  63. if a not in self.tree[b]:
  64. self.tree[b][a]= factor
  65. else:
  66. self.tree[b][a] = self.tree[b][a] + self.tree[b][a]*factor
  67. else:
  68. bool=True
  69. print("a: ",a)
  70. #self.tree[b] = dict()
  71. b=a
  72.  
  73.  
  74.  
  75.  
  76. '''for word in line:
  77. if bool:
  78. #print("a: ",word_prev," b: ",word )
  79. #count+=1
  80. if word not in self.tree:
  81. self.tree[word] = dict()
  82.  
  83. self.tree[word][word_prev] = factor if word_prev not in self.tree[word] else self.tree[word][word_prev] + self.tree[word][word_prev] * factor
  84.  
  85. else:
  86. bool=True
  87. word_prev=word'''
  88. #print(self.tree)
  89. #print(count)
  90.  
  91.  
  92.  
  93. #if w not in self.__vocab:
  94. #self.__vocab.add(w)
  95. #print("Updated vocab with: ", w)
  96.  
  97. #self.write_vocabulary()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement