# Untitled

import urllib2
import time
# Module-wide URL opener used for every page download in this file.
# Its header list is replaced so requests carry a browser-like User-Agent
# (presumably because the target site rejects urllib2's default agent --
# TODO confirm).
opener = urllib2.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]


from bs4.element import Comment
from bs4 import BeautifulSoup


def tag_visible(element):
    """Tell whether a parsed text node should count as visible page text.

    A node is rejected when its parent tag is one of the containers listed
    below, or when the node itself is an HTML comment; anything else is
    accepted.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check runs first and short-circuits the comment check.
    return not (element.parent.name in hidden_parents
                or isinstance(element, Comment))


def text_from_html(body):
    """Return the visible text of an HTML document as one unicode string.

    ``body`` is parsed with BeautifulSoup's 'html.parser'. Every text node
    accepted by ``tag_visible`` is stripped of surrounding whitespace, and
    the results are joined with single spaces.
    """
    parsed = BeautifulSoup(body, 'html.parser')
    pieces = []
    for node in parsed.findAll(text=True):
        if tag_visible(node):
            pieces.append(node.strip())
    return u" ".join(pieces)


def compute_ngrams(toks, n=2):
    """Returns an n-gram dictionary based on the provided list of tokens.

    Every token that starts a complete n-gram is mapped to a list of tuples,
    one per occurrence and in input order, each holding the ``n - 1`` tokens
    that follow it.

    Args:
        toks: sliceable sequence of hashable tokens (e.g. a list of words).
        n: size of each n-gram, counting the leading token. With ``n=1``
            every follower tuple is empty.

    Returns:
        dict mapping token -> list of follower tuples. The dict is empty
        when ``toks`` holds fewer than ``n`` tokens.
    """
    # Only the first len(toks) - n + 1 positions can start a full n-gram.
    # Clamp at zero: a negative bound would be read by the slice as
    # "count from the end" and yield truncated n-grams for short inputs.
    start_count = max(len(toks) - n + 1, 0)

    ngrams = {}
    for index, tok in enumerate(toks[:start_count]):
        followers = tuple(toks[index + 1:index + n])
        ngrams.setdefault(tok, []).append(followers)
    return ngrams


import random

def gen_passage(ngram_dict, length=100):
    """Generate a space-separated passage by randomly walking an n-gram table.

    The walk starts at a random key. At each step the next token is the
    first element of a randomly chosen follower tuple of the current token.
    When the current token is not a key (or has no usable follower), the
    walk restarts from a random key instead.

    Args:
        ngram_dict: mapping token -> list of follower tuples, in the shape
            produced by ``compute_ngrams``.
        length: number of tokens to emit.

    Returns:
        The generated tokens joined by single spaces. An empty string is
        returned when ``length`` is not positive or ``ngram_dict`` is empty.
    """
    if length <= 0 or not ngram_dict:
        return ''

    # Sorted once up front; the key set does not change during the walk.
    start_tokens = sorted(ngram_dict)

    current = random.choice(start_tokens)
    words = [current]

    for _ in range(length - 1):
        followers = ngram_dict.get(current)
        successor = random.choice(followers) if followers else ()
        if successor:
            current = successor[0]
        else:
            # Dead end: unknown token, or an empty follower tuple such as
            # those in a table built with n=1. Restart from a random key.
            current = random.choice(start_tokens)
        words.append(current)

    # Join once instead of growing a string inside the loop.
    return ' '.join(words)

def process_link(link):
    """Download a page and return a slice of its visible text as tokens.

    The page is fetched through the module-level ``opener`` and reduced to
    visible text with ``text_from_html``. Non-ASCII characters are dropped,
    a few leftover apostrophe fragments are removed, and the text is
    lowercased and split on single spaces with empty pieces discarded.

    Only the tokens starting 12 positions after the first 'email' token and
    ending 45 tokens before the end are returned (presumably to cut site
    navigation and footer text around the story -- TODO confirm against the
    current page layout).

    Args:
        link: URL of the page to download.

    Returns:
        list of lowercase ASCII tokens.

    Raises:
        ValueError: if no token equal to 'email' appears in the page text.
    """
    # Local import keeps this block self-contained.
    from contextlib import closing

    # closing() guarantees the HTTP response is released even if read()
    # fails; urllib2 responses are not context managers themselves.
    with closing(opener.open(link)) as response:
        raw_html = response.read()

    page_text = text_from_html(raw_html).encode('ascii', 'ignore')
    for leftover in ('&aposs', '&apost', '&aposre'):
        page_text = page_text.replace(leftover, '')
    page_text = page_text.lower()

    # Splitting on a single space leaves empty strings for runs of spaces;
    # filter(None, ...) drops them.
    tokens = filter(None, page_text.split(" "))

    return tokens[tokens.index('email') + 12:len(tokens) - 45]

# Download every page, pooling the tokens into one list, and time the downloads.
startWatch = time.time()

chapter_links = [
    "https://www.wattpad.com/454070831-jojo%27s-lemon-gayventure-true-love%27s-kiss-dio-x.php?",
    "https://www.wattpad.com/457208090-jojo%27s-lemon-gayventure-teen-angst-jotaro-x",
    "https://www.wattpad.com/475882497-jojo%27s-lemon-gayventure-the-dinner-party-josuke-x",
    "https://www.wattpad.com/481136554-jojo%27s-lemon-gayventure-halloween-party-jonathan-x",
]

text = []
for chapter_link in chapter_links:
    text += process_link(chapter_link)

print("It took " + str(time.time()-startWatch) + " seconds to make the gayest shit ever")

# Build the bigram table from the pooled tokens and generate the passage.
# The result is printed explicitly: as a bare expression it would be
# discarded when this file runs as a script.
passage = gen_passage(compute_ngrams(text, 2), 9969)
print(passage)