Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import urllib2
- import time
# Shared URL opener used by process_link(); every request it makes carries a
# browser-like User-Agent header (presumably because the default urllib2
# agent is rejected by the target site -- confirm).
opener = urllib2.build_opener()
opener.addheaders = [("User-Agent", "Mozilla/5.0")]
- from bs4.element import Comment
- from bs4 import BeautifulSoup
def tag_visible(element):
    """Tell whether a BeautifulSoup text node counts as visible page text.

    A node is rejected when its direct parent is one of the non-content
    tags listed below (or the document root), or when the node itself is
    an HTML comment. Everything else is accepted.
    """
    hidden_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
    is_hidden = (element.parent.name in hidden_parents
                 or isinstance(element, Comment))
    return not is_hidden
def text_from_html(body):
    """Return the visible text of an HTML document as one unicode string.

    Every text node BeautifulSoup finds is kept only if ``tag_visible``
    accepts it; the kept nodes are stripped of surrounding whitespace and
    joined with single spaces.
    """
    parsed = BeautifulSoup(body, 'html.parser')
    pieces = []
    for node in parsed.findAll(text=True):
        if tag_visible(node):
            pieces.append(node.strip())
    return u" ".join(pieces)
def compute_ngrams(toks, n=2):
    """Returns an n-gram dictionary based on the provided list of tokens.

    Every token that starts a complete n-gram is mapped to a list of
    tuples, each holding the ``n - 1`` tokens that follow one occurrence
    of that token, in order of appearance.

    Args:
        toks: the tokens, in reading order. Any iterable is accepted; it
            is copied into a list before processing.
        n: size of each n-gram, counting the leading token. With ``n=1``
            the follower tuples are empty.

    Returns:
        A dict mapping token -> list of follower tuples. It is empty when
        ``toks`` holds fewer than ``n`` tokens.
    """
    toks = list(toks)
    # Clamp at zero: a negative stop would be read by the slice as "count
    # from the end" and yield truncated n-grams for inputs shorter than n.
    last_start = max(len(toks) - n + 1, 0)
    toks_dict = {}
    for index, tok in enumerate(toks[:last_start]):
        followers = tuple(toks[index + 1:index + n])
        toks_dict.setdefault(tok, []).append(followers)
    return toks_dict
- import random
def gen_passage(ngram_dict, length=100):
    """Generate a random passage by walking an n-gram dictionary.

    The walk starts at a randomly chosen key. At each step one of the
    current token's follower tuples is picked at random and its first
    element becomes the next token. When the current token has no usable
    follower (it is not a key, its follower list is empty, or the picked
    tuple is empty) the walk restarts from another random key.

    Args:
        ngram_dict: dict mapping token -> list of follower tuples, as
            produced by ``compute_ngrams``.
        length: number of tokens in the generated passage.

    Returns:
        The tokens joined by single spaces. An empty string is returned
        when ``length`` is not positive or ``ngram_dict`` is empty.
    """
    if length <= 0 or not ngram_dict:
        return ''

    # Sort the keys once; restarts reuse this list instead of re-sorting
    # the whole dictionary on every dead end.
    start_tokens = sorted(ngram_dict)
    cur_tok = random.choice(start_tokens)
    words = [cur_tok]
    for _ in range(length - 1):
        followers = ngram_dict.get(cur_tok)
        picked = random.choice(followers) if followers else ()
        if picked:
            cur_tok = picked[0]
        else:
            # Dead end: no follower to move to, so jump to a random key.
            cur_tok = random.choice(start_tokens)
        words.append(cur_tok)
    return ' '.join(words)
def process_link(link):
    """Download a page and return its body text as a list of lowercase tokens.

    The page is fetched with the module-level ``opener`` and reduced to
    its visible text with ``text_from_html``. Non-ASCII characters and the
    leftover ``&apos`` fragments are removed, the text is lower-cased and
    split on spaces, and empty tokens are dropped. Finally the surrounding
    page text is trimmed: everything up to and including the 11 tokens
    after the first ``'email'`` token is discarded, as are the last 45
    tokens (offsets presumably tuned to the target site's layout --
    confirm before reusing on other pages).

    Args:
        link: URL of the page to download.

    Returns:
        List of tokens between the two trim points; empty when the page
        is too short to contain any.

    Raises:
        ValueError: if the page text contains no ``'email'`` token.
    """
    response = opener.open(link)
    try:
        raw_html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    page_text = text_from_html(raw_html).encode('ascii', 'ignore')
    for fragment in ('&aposs', '&apost', '&aposre'):
        page_text = page_text.replace(fragment, '')

    # A list comprehension (rather than filter) guarantees a real list,
    # which the index() and slicing below require.
    tokens = [tok for tok in page_text.lower().split(" ") if tok]

    try:
        start = tokens.index('email') + 12
    except ValueError:
        raise ValueError("token 'email' not found in text of %s" % link)
    # Clamp at zero so a short page yields an empty result instead of a
    # slice whose negative stop counts from the end of the list.
    stop = max(len(tokens) - 45, 0)
    return tokens[start:stop]
# Driver: download every chapter, time the downloads, then build a bigram
# model from the combined tokens and print a generated passage.
startWatch = time.time()
text = []
for chapter_url in (
    "https://www.wattpad.com/454070831-jojo%27s-lemon-gayventure-true-love%27s-kiss-dio-x.php?",
    "https://www.wattpad.com/457208090-jojo%27s-lemon-gayventure-teen-angst-jotaro-x",
    "https://www.wattpad.com/475882497-jojo%27s-lemon-gayventure-the-dinner-party-josuke-x",
    "https://www.wattpad.com/481136554-jojo%27s-lemon-gayventure-halloween-party-jonathan-x",
):
    text += process_link(chapter_url)
print("It took " + str(time.time()-startWatch) + " seconds to make the gayest shit ever")
# The passage was previously computed and thrown away; print it so the
# script actually produces its output.
print(gen_passage(compute_ngrams(text, 2), 9969))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement