SHARE
TWEET

Untitled

a guest Sep 11th, 2019 92 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import urllib2
  2. import time
  3. opener = urllib2.build_opener()
  4. opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
  5.  
  6.  
  7. from bs4.element import Comment
  8. from bs4 import BeautifulSoup
  9.  
  10.  
  11.  
  12.  
  def tag_visible(element):
      """Return True when a parsed text node should count as visible page text.

      A node is rejected when its parent tag is one of the non-rendered
      containers listed below, or when the node itself is an HTML comment.
      """
      hidden_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
      inside_hidden_tag = element.parent.name in hidden_parents
      # The parent check runs first; the Comment check is only reached for
      # nodes that sit inside a rendered tag.
      return not (inside_hidden_tag or isinstance(element, Comment))
  19.  
  20.  
  21.  
  def text_from_html(body):
      """Extract the visible text of an HTML document as a single string.

      Parses *body* with BeautifulSoup's 'html.parser', keeps the text nodes
      accepted by tag_visible, strips surrounding whitespace from each one and
      joins them with a single space.
      """
      document = BeautifulSoup(body, 'html.parser')
      pieces = [
          node.strip()
          for node in document.findAll(text=True)
          if tag_visible(node)
      ]
      return u" ".join(pieces)
  27.  
  28.  
  29.  
  30.  
  def compute_ngrams(toks, n=2):
      """Return a dictionary mapping each token to the tuples that follow it.

      Every position in *toks* that still has ``n - 1`` tokens after it
      contributes one entry: the token at that position is the key, and the
      tuple of the next ``n - 1`` tokens is appended to that key's list in
      order of appearance (duplicates are kept).

      Args:
          toks: Sequence of hashable tokens supporting ``len`` and slicing.
          n: Size of each n-gram, counting the key token. Defaults to 2.

      Returns:
          dict mapping token -> list of tuples of following tokens. The dict
          is empty when *toks* holds fewer than ``n`` tokens.
      """
      # Number of positions that start a complete n-gram. Clamp at zero: a
      # negative value used as a slice bound would count from the END of the
      # list and produce truncated n-grams whenever n > len(toks) + 1.
      start_count = max(len(toks) - n + 1, 0)

      ngrams = {}
      for index, tok in enumerate(toks[:start_count]):
          ngrams.setdefault(tok, []).append(tuple(toks[index + 1:index + n]))
      return ngrams
  49.        
  50.  
  51.  
  52. import random
  53.  
  def gen_passage(ngram_dict, length=100):
      """Generate a random passage by walking an n-gram dictionary.

      The walk starts at a randomly chosen key. At each step one of the
      current token's follower tuples is picked at random and its first
      element becomes the next token. When the current token has no
      followers (it is not a key, or its list is empty) the walk restarts
      from a randomly chosen key.

      Args:
          ngram_dict: Mapping of token -> list of non-empty tuples of
              following tokens, as built by compute_ngrams with n >= 2.
              Tokens must be strings.
          length: Number of tokens in the generated passage. Defaults to 100.

      Returns:
          The tokens joined by single spaces, or an empty string when
          *ngram_dict* is empty or *length* is less than 1.
      """
      if length < 1 or not ngram_dict:
          return ''

      # Sort once so restarts do not re-sort the whole key set every time;
      # sorting keeps the choice reproducible for a given random seed.
      start_tokens = sorted(ngram_dict)

      current = random.choice(start_tokens)
      words = [current]
      for _ in range(length - 1):
          followers = ngram_dict.get(current)
          if followers:
              # Only the first element of the chosen tuple is used.
              current = random.choice(followers)[0]
          else:
              # Dead end: jump to a fresh random starting token.
              current = random.choice(start_tokens)
          words.append(current)
      return ' '.join(words)
  82.  
  def process_link(link):
      """Download a page and return part of its visible text as lowercase tokens.

      The page is fetched with the module-level ``opener``, reduced to visible
      text with text_from_html, stripped of non-ASCII characters and of the
      literal fragments '&aposs', '&apost' and '&aposre', lowercased, and split
      on single spaces (empty pieces are dropped). Only the tokens from 12
      positions after the first 'email' token up to 45 tokens before the end
      are returned.

      Args:
          link: URL of the page to download.

      Returns:
          list of lowercase token strings.

      Raises:
          ValueError: if the token 'email' does not occur in the page text.
      """
      response = opener.open(link)
      try:
          raw_html = response.read()
      finally:
          # Release the connection even when the read fails.
          response.close()

      page_text = text_from_html(raw_html)
      # Drop every character that cannot be represented in ASCII.
      page_text = page_text.encode('ascii', 'ignore')

      for fragment in ('&aposs', '&apost', '&aposre'):
          page_text = page_text.replace(fragment, '')
      page_text = page_text.lower()

      words = [word for word in page_text.split(" ") if word]

      # NOTE(review): the offsets 12 and 45 presumably trim site header and
      # footer text around the story body - confirm against the page layout.
      return words[words.index('email') + 12:len(words) - 45]
  101.  
  # Driver: scrape each page, report the elapsed time, then build a bigram
  # dictionary from the combined tokens and generate a passage from it.
  startWatch = time.time()

  chapter_links = (
      "https://www.wattpad.com/454070831-jojo%27s-lemon-gayventure-true-love%27s-kiss-dio-x.php?",
      "https://www.wattpad.com/457208090-jojo%27s-lemon-gayventure-teen-angst-jotaro-x",
      "https://www.wattpad.com/475882497-jojo%27s-lemon-gayventure-the-dinner-party-josuke-x",
      "https://www.wattpad.com/481136554-jojo%27s-lemon-gayventure-halloween-party-jonathan-x",
  )

  # Accumulate the tokens of every page, in order, into one list.
  text = []
  for chapter_link in chapter_links:
      text += process_link(chapter_link)

  print("It took " + str(time.time()-startWatch) + " seconds to make the gayest shit ever")

  # NOTE: the generated passage is not stored or printed here; it is only
  # displayed when this runs as the last expression of an interactive session.
  gen_passage(compute_ngrams(text,2),9969)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top