Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class Ngram:
    """Node of a word-level n-gram prefix tree.

    Each node stores one word (``name``), how many times the word sequence
    leading from the root to this node was seen (``count``), the child
    nodes for the words that followed it (``children``), and the relative
    frequency of this node among its siblings (``probability``, filled in
    by ``calc_prob``).
    """

    # Declared for documentation only. The values are assigned per instance
    # in __init__, so no mutable list is shared through the class object.
    name: str
    children: list
    count: int
    probability: float

    # Deletion table for the single characters that construct() strips.
    _STRIP_CHARS = str.maketrans("", "", "',.-\"")

    def __init__(self, name):
        """Create a node called *name* with no children and zero counts."""
        self.name = name
        self.children = []
        self.count = 0
        self.probability = 0

    def calc_prob(self):
        """Set ``probability`` on every descendant of this node.

        A child's probability is its count divided by the summed count of
        all its siblings. The node's own probability is left untouched; it
        is assigned when the method runs on its parent.
        """
        total = sum(child.count for child in self.children)
        for child in self.children:
            # If every sibling has a zero count there is nothing to divide by.
            child.probability = child.count / total if total else 0
            child.calc_prob()  # recurse; stops at nodes without children

    def set_count(self, count):
        """Overwrite this node's occurrence count."""
        self.count = count

    def increment_count(self):
        """Add one to this node's occurrence count."""
        self.count += 1

    def add_child(self, child):
        """Append *child* to this node's children."""
        self.children.append(child)

    def get_child(self, name):
        """Return the first direct child called *name*, or None if absent."""
        for child in self.children:
            if child.name == name:
                return child
        return None

    def find_children(self, names):
        """Follow a whitespace-separated path of child names from this node.

        Returns the node reached after the last name, or None as soon as
        one step of the path is missing. An empty path returns this node.
        """
        node = self
        for name in names.split():
            node = node.get_child(name)
            if node is None:
                return None
        return node

    def construct(self, data, length):
        """Count every run of *length* consecutive words from *data*.

        The text is lower-cased, ``'s`` is removed, and the characters
        ``'`` ``,`` ``.`` ``-`` ``"`` are deleted (so hyphenated words are
        joined) before the text is split on whitespace. For each window of
        *length* words, the matching path below this node is created or has
        its counts incremented, one node per word.

        Raises:
            ValueError: if *length* is smaller than 1.
        """
        if length < 1:
            # Reject before touching the tree; a window shorter than one
            # word would otherwise run past the end of the word list.
            raise ValueError("length must be at least 1")

        # "'s" must go first: once the apostrophes are deleted it can no
        # longer be told apart from a plain trailing "s".
        words = (
            data.lower()
            .replace("'s", "")
            .translate(self._STRIP_CHARS)
            .split()
        )

        for start in range(len(words) - length + 1):
            node = self
            # Descend from the node reached so far instead of re-walking
            # the whole prefix from the root for every word.
            for word in words[start:start + length]:
                child = node.get_child(word)
                if child is None:
                    child = Ngram(word)
                    child.set_count(1)
                    node.add_child(child)
                else:
                    child.increment_count()
                node = child

    def print_index(self, i):
        """Print this node and its subtree, indenting one space per level.

        *i* is the depth of this node and sets the width of the indent.
        """
        print(" "*i, self.name, "(", self.count, ")", "probability:", self.probability)
        for child in self.children:
            child.print_index(i + 1)

    def print(self):
        """Print the whole subtree rooted at this node, starting at depth 0."""
        self.print_index(0)
# Build a bigram model from the text file "sogstory", print the whole tree,
# then report how likely "dog" is to follow "the".
n = Ngram("root")
with open("sogstory", "r") as f:
    # Read the file in one call instead of concatenating it line by line.
    paragraph = f.read()
n.construct(paragraph, 2)
n.calc_prob()
n.print()
# find_children returns None when the text never contains "the dog"; report
# a probability of 0 in that case instead of failing on None.
the_dog = n.find_children("the dog")
print("dogprob", the_dog.probability if the_dog is not None else 0)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement