SHARE
TWEET

Untitled

a guest Jan 23rd, 2019 43 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. class Ngram ():
  2.     children = [] # these are class variables
  3.     name = ""
  4.     count = 0
  5.     probability = 0
  6.  
  7.     def __init__(self, name): # called the constructor, allocates memory for the class
  8.         self.name = name
  9.         self.children = []
  10.         self.count = 0
  11.         self.probability = 0
  12.  
  13.     def calc_prob(self):
  14.         child_count=0 #the total number of child occurances
  15.         for n in self.children: # self.chidlren is basically a list of all the children
  16.             child_count+= n.count #the total sum of all the children, aka the number following the word on the worksheet
  17.         for n in self.children:
  18.             n.probability= n.count / child_count
  19.             n.calc_prob() # basically means that it'll repeat so long as there are children
  20.  
  21.  
  22.     def set_count(self, count):
  23.         self.count = count
  24.  
  25.     def increment_count(self):
  26.         self.count += 1
  27.  
  28.     def add_child(self, child):
  29.         self.children.append(child)
  30.  
  31.     def get_child(self, name):
  32.         for ng in self.children:
  33.             if ng.name == name:
  34.                 return ng
  35.  
  36.         return None
  37.  
  38.     def find_children(self, names):
  39.         nms = names.split() # split on space
  40.  
  41.         node = self
  42.  
  43.         for n in nms:
  44.             node = node.get_child(n)
  45.  
  46.             if node == None:
  47.                 break;
  48.  
  49.         return node
  50.  
  51.     def construct(self, data, length):
  52.         # changes all words to lowercase
  53.         # removes 's
  54.         # removes '
  55.         # removes ,
  56.         # removes .
  57.         words = data.lower()\
  58.             .replace("'s","")\
  59.             .replace("'","") \
  60.             .replace(",","") \
  61.             .replace(".","") \
  62.             .replace("-","") \
  63.             .replace("\"","").split()
  64.  
  65.         for lower_lim in range(len(words)+1-length):
  66.  
  67.             word_string = words[lower_lim]
  68.  
  69.             n = self.find_children(word_string)
  70.  
  71.             if n is None:
  72.                 n = Ngram(words[lower_lim])
  73.                 n.set_count(1)
  74.  
  75.                 self.add_child(n)
  76.             else:
  77.                 n.increment_count()
  78.  
  79.  
  80.  
  81.             node = n
  82.  
  83.             for upper_lim in range(1, length):
  84.                 word_string += " " + words[lower_lim+upper_lim]
  85.                 n2 = self.find_children(word_string)
  86.  
  87.                 if n2 is None:
  88.                     n2 = Ngram(words[lower_lim+upper_lim])
  89.                     n2.set_count(1)
  90.  
  91.                     node.add_child(n2)
  92.                 else:
  93.                     n2.increment_count()
  94.  
  95.                 node = n2
  96.  
  97.     def print_index(self, i):
  98.         print(" "*i, self.name, "(", self.count, ")", "probability:",self.probability) #" "*i indents it according to depth
  99.  
  100.         for c in self.children:
  101.             c.print_index(i+1)
  102.  
  103.     def print(self):
  104.         self.print_index(0)
  105.  
  106. n= Ngram("root")
  107.  
  108. paragraph=""
  109. with open("sogstory","r") as f:
  110.     for line in f:
  111.         paragraph+=line
  112.  
  113. n.construct(paragraph,2)
  114. n.calc_prob()
  115. n.print()
  116. print("dogprob",n.get_child("the").get_child("dog").probability) #prints the probability of "the dog" happens
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top