Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from pathlib import Path
- import os
- import re
class Markov:
    """First-order word transition table built from a set of text files.

    After ``build_tree()`` has run, ``tree[prev][nxt]`` holds the number of
    times the word ``nxt`` was seen directly after the word ``prev``.
    """

    def __init__(self, filenames):
        """Remember the source files and start with an empty transition table.

        Args:
            filenames: iterable of paths to the text files to read.
        """
        self.__sources = filenames
        # Maps a word to a dict of {following word: occurrence count}.
        self.tree = dict()

    def clean_line(self, line: str):
        """Split a raw line into tokens made of ASCII letters, '.' and ','.

        Apostrophes, hyphens and slashes are deleted outright so that the
        pieces around them are joined ("don't" -> "dont"). Every other run of
        characters that is not an ASCII letter, a period or a comma (digits,
        whitespace, backslashes, other symbols) is collapsed to one space.
        Periods and commas are kept and stay attached to their word.

        Args:
            line: one line of raw text.

        Returns:
            The list of tokens, in order; empty if nothing survives.
        """
        # Delete joiners first so they do not split a word in two.
        line = re.sub(r"['\-/]", "", line)
        # Everything else outside the allowed set becomes a separator.
        line = re.sub(r"[^A-Za-z.,]+", " ", line)
        return line.split()

    def text_gen(self):
        """Yield the cleaned token list of every line of every source file.

        Files are read in the order given to the constructor, decoded as
        UTF-8 with undecodable bytes ignored.
        """
        for fname in self.__sources:
            with open(fname, encoding='utf8', errors='ignore') as f:
                for line in f:
                    yield self.clean_line(line)

    def build_tree(self) -> None:
        """Count word-to-next-word transitions and store them in ``self.tree``.

        Tokens are lower-cased before counting. The previous word is carried
        over line and file boundaries, so the last word of one line is
        followed by the first word of the next. A word only gets its own key
        in ``self.tree`` once something has been seen following it, so the
        very last word read has no entry unless it also occurred earlier.

        Existing content of ``self.tree`` is kept; calling this method again
        adds to the counts already stored.
        """
        increment = 1  # weight contributed by a single observed transition
        previous = None  # None until the first word has been read
        for tokens in self.text_gen():
            for token in tokens:
                word = token.lower()
                if previous is not None:
                    followers = self.tree.setdefault(previous, {})
                    # Add one step per occurrence. The earlier formula
                    # (count + count * factor) doubled the value each time.
                    followers[word] = followers.get(word, 0) + increment
                previous = word
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement