# Untitled

class Ngram:
    """A node in a word-level n-gram tree.

    Each node stores one word (``name``), how many times the word sequence
    leading from the root to this node was seen (``count``), and the share of
    its parent's child occurrences that it accounts for (``probability``,
    filled in by the parent's ``calc_prob``).  ``children`` holds the nodes for
    the words observed immediately after this one.
    """

    # Class-level defaults. __init__ always replaces them with per-instance
    # values; they are kept only so that attribute access on the class itself
    # keeps working.
    children = []
    name = ""
    count = 0
    probability = 0

    # Characters deleted from the text by construct() (after "'s" is removed).
    _STRIP_TABLE = str.maketrans("", "", "',.-\"")

    def __init__(self, name):
        """Create a childless node called *name* with zero count/probability."""
        self.name = name
        self.children = []
        self.count = 0
        self.probability = 0

    def calc_prob(self):
        """Set ``probability`` on every descendant of this node.

        A child's probability is its count divided by the summed count of all
        of this node's children.  The node's own probability is not touched.
        """
        total = sum(child.count for child in self.children)
        for child in self.children:
            # A zero total (e.g. children attached with their default count)
            # would otherwise raise ZeroDivisionError.
            child.probability = child.count / total if total else 0
            child.calc_prob()

    def set_count(self, count):
        """Overwrite this node's occurrence count."""
        self.count = count

    def increment_count(self):
        """Add one to this node's occurrence count."""
        self.count += 1

    def add_child(self, child):
        """Append *child* to this node's children (no duplicate check)."""
        self.children.append(child)

    def get_child(self, name):
        """Return the first direct child called *name*, or None if absent."""
        for child in self.children:
            if child.name == name:
                return child
        return None

    def find_children(self, names):
        """Follow a whitespace-separated path of words down the tree.

        Returns the node reached after the last word, None as soon as one
        word on the path is missing, or this node itself when *names* holds
        no words.
        """
        node = self
        for name in names.split():
            node = node.get_child(name)
            if node is None:
                return None
        return node

    def construct(self, data, length):
        """Add every run of *length* consecutive words in *data* to the tree.

        The text is lower-cased, every "'s" is removed, then the characters
        ' , . - and " are deleted before splitting on whitespace.  For each
        window of *length* words, the node at every depth along the window's
        path is created with count 1, or has its count incremented if it
        already exists.  Text with fewer than *length* words adds nothing.

        Raises:
            ValueError: if *length* is smaller than 1.
        """
        if length < 1:
            # Reject up front instead of failing part-way through with an
            # IndexError after the tree has already been modified.
            raise ValueError("length must be at least 1")

        words = (
            data.lower()
            .replace("'s", "")
            .translate(self._STRIP_TABLE)
            .split()
        )

        for start in range(len(words) - length + 1):
            # Descend one level per word; `node` is always the node for the
            # words of this window consumed so far.
            node = self
            for word in words[start:start + length]:
                child = node.get_child(word)
                if child is None:
                    child = Ngram(word)
                    child.set_count(1)
                    node.add_child(child)
                else:
                    child.increment_count()
                node = child

    def print_index(self, i):
        """Print this subtree, one node per line, starting at depth *i*."""
        # " " * i indents the line according to the node's depth.
        print(" "*i, self.name, "(", self.count, ")", "probability:",self.probability)

        for child in self.children:
            child.print_index(i + 1)

    def print(self):
        """Print the subtree rooted at this node, starting at depth 0."""
        self.print_index(0)

# Build a bigram tree from the story file, compute the child probabilities,
# and print the tree followed by the probability of "dog" after "the".
n = Ngram("root")

# Read the whole file in one call instead of concatenating it line by line.
with open("sogstory", "r") as f:
    paragraph = f.read()

n.construct(paragraph, 2)
n.calc_prob()
n.print()

# find_children returns None when "the" or "the dog" never occurs in the
# text; report 0 in that case instead of failing with AttributeError.
the_dog = n.find_children("the dog")
print("dogprob", the_dog.probability if the_dog is not None else 0)