Advertisement
Guest User

Untitled

a guest
Sep 11th, 2019
123
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.64 KB | None | 0 0
  1. import urllib2
  2. import time
# Module-wide URL opener used by process_link(). Its default headers are
# replaced with a browser-like User-Agent.
# NOTE(review): presumably the target site rejects urllib2's default agent
# string -- confirm before removing this header.
opener = urllib2.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
  5.  
  6.  
  7. from bs4.element import Comment
  8. from bs4 import BeautifulSoup
  9.  
  10.  
  11.  
  12.  
  13. def tag_visible(element):
  14. if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
  15. return False
  16. if isinstance(element, Comment):
  17. return False
  18. return True
  19.  
  20.  
  21.  
  22. def text_from_html(body):
  23. soup = BeautifulSoup(body, 'html.parser')
  24. texts = soup.findAll(text=True)
  25. visible_texts = filter(tag_visible, texts)
  26. return u" ".join(t.strip() for t in visible_texts)
  27.  
  28.  
  29.  
  30.  
  31. def compute_ngrams(toks, n=2):
  32. """Returns an n-gram dictionary based on the provided list of tokens."""
  33. toksDict={}
  34. for index,tok in enumerate(toks[:(len(toks)-n+1)]): #go through the list but give us an index # to work with
  35. #the line toks[:len(toks)-n+1] is just to prevent the list from going past the data we care about
  36.  
  37. if tok not in toksDict: #check to see if we need to add tok
  38. toksDict[tok] = []
  39.  
  40.  
  41. toksDict[tok].append(tuple(toks[(index+1):(index+n)])) #add tuple to the tok of len n to the tok
  42.  
  43.  
  44. # print(toksDict)
  45.  
  46. #return dict(sorted(toksDict.items())) use this if it needs to be returned sorted
  47.  
  48. return toksDict
  49.  
  50.  
  51.  
  52. import random
  53.  
  54. def gen_passage(ngram_dict, length=100):
  55. #select a random key from the dictionary use as start token
  56. curTok = random.choice(sorted(ngram_dict.keys()))
  57. output = curTok
  58. # print("Starting with value: " + curTok)
  59.  
  60. for i in range(0,length-1):
  61. #select random tuple from the current token
  62.  
  63. #make randomly selected token next current token
  64. #check if randomly selected token is bad (aka not in token) while loop
  65. if curTok in ngram_dict:
  66. oldTok = curTok
  67. curTok = random.choice(ngram_dict[curTok])[0]
  68. #print("branching to value: " + curTok + " from " + oldTok + " : " + str(ngram_dict[oldTok]))
  69. #print can be removed, this is just to show the pattern the computer is following
  70.  
  71.  
  72. else:
  73. curTok = random.choice(sorted(ngram_dict.keys()))
  74. # print("choosing new random value: " + curTok)
  75.  
  76. #print('resorting')
  77. # print(curTok)
  78.  
  79. output += ' ' + curTok
  80. #then check if curTok has any children, if not re randomize
  81. return output
  82.  
  83. def process_link(link):
  84.  
  85. response = opener.open(link)
  86. myfile = response.read()
  87. myfile = text_from_html(myfile)
  88. myfile = myfile.encode('ascii', 'ignore')
  89.  
  90. myfile = myfile.replace('&aposs','')
  91. myfile = myfile.replace('&apost','')
  92. myfile = myfile.replace('&aposre', '')
  93. myfile = myfile.lower()
  94.  
  95. myfile = myfile.split(" ")
  96.  
  97. myfile = filter(None,myfile)
  98. myfile = myfile[myfile.index('email')+12:len(myfile)-45]
  99.  
  100. return myfile
  101.  
  102. startWatch = time.time()
  103.  
  104. text = process_link("https://www.wattpad.com/454070831-jojo%27s-lemon-gayventure-true-love%27s-kiss-dio-x.php?")
  105. text += process_link("https://www.wattpad.com/457208090-jojo%27s-lemon-gayventure-teen-angst-jotaro-x")
  106. text += process_link("https://www.wattpad.com/475882497-jojo%27s-lemon-gayventure-the-dinner-party-josuke-x")
  107. text += process_link("https://www.wattpad.com/481136554-jojo%27s-lemon-gayventure-halloween-party-jonathan-x")
  108.  
  109. print("It took " + str(time.time()-startWatch) + " seconds to make the gayest shit ever")
  110.  
  111. gen_passage(compute_ngrams(text,2),9969)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement