Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- import praw
- import pdb
- import re
- import os
- import nltk
- import sys
- import pickle
- import datetime
class TWSS:
    """Naive Bayes "that's what she said" (TWSS) detector.

    Trains an nltk.NaiveBayesClassifier on two line-per-sentence corpus
    files (positive.txt / negative.txt next to this module by default) and
    classifies phrases via ``instance(phrase)``.
    """

    # Class-level defaults kept for backward compatibility with any code
    # reading them off the class; __init__ shadows them with per-instance
    # values so instances no longer share one mutable list.
    training_data = []  # [("sentence 1", bool), ("sentence 2", bool), ... ]
    classifier = None

    def __init__(self, sentence=None, training_data=None,
                 positive_corpus_file=None, negative_corpus_file=None):
        """Optionally load training data and/or classify *sentence* at once."""
        self.training_data = []
        self.classifier = None
        if training_data:
            self.training_data = training_data
        if positive_corpus_file and negative_corpus_file:
            self.import_training_data(positive_corpus_file, negative_corpus_file)
        if sentence:
            self.__call__(sentence)

    def __call__(self, phrase):
        """Classify *phrase*, lazily training the classifier on first use."""
        if not self.classifier:
            self.train()
        return self.is_twss(phrase)

    def import_training_data(self,
                             positive_corpus_file=os.path.join(os.path.dirname(__file__),
                                                               "positive.txt"),
                             negative_corpus_file=os.path.join(os.path.dirname(__file__),
                                                               "negative.txt")
                             ):
        """
        This method imports the positive and negative training data from the
        two corpus files and creates the training data list.
        """
        # Context managers close both handles (the original leaked them).
        with open(positive_corpus_file) as positive_corpus:
            positive_training_data = [(line, True) for line in positive_corpus]
        with open(negative_corpus_file) as negative_corpus:
            negative_training_data = [(line, False) for line in negative_corpus]
        self.training_data = positive_training_data + negative_training_data

    def train(self):
        """
        This method generates the classifier. This method assumes that the
        training data has been loaded
        """
        if not self.training_data:
            self.import_training_data()
        # Decode only when the line is actually a byte string; under Python 3
        # (or caller-supplied unicode data) lines are already str and the
        # original unconditional .decode('utf-8') would raise AttributeError.
        training_feature_set = [
            (self.extract_features(line.decode('utf-8')
                                   if isinstance(line, bytes) else line), label)
            for (line, label) in self.training_data
        ]
        self.classifier = nltk.NaiveBayesClassifier.train(training_feature_set)

    def extract_features(self, phrase):
        """
        This function will extract features from the phrase being used.
        Currently, the feature we are extracting are unigrams of the text corpus.
        """
        words = nltk.word_tokenize(phrase)
        features = {}
        for word in words:
            # Bag-of-words presence features: every token of the phrase maps
            # to True (word is trivially in its own token list); what matters
            # is WHICH 'contains(...)' keys exist in the feature dict.
            features['contains(%s)' % word] = (word in words)
        return features

    def is_twss(self, phrase):
        """
        The magic function- this accepts a phrase and tells you if it
        classifies as an entendre
        """
        featureset = self.extract_features(phrase)
        return self.classifier.classify(featureset)

    def save(self, filename='classifier.dump'):
        """
        Pickles the classifier and dumps it into a file
        """
        # Binary mode: pickle output is bytes; the original text-mode 'w+'
        # corrupts the dump on Python 3 and on Windows.
        with open(filename, 'wb') as ofile:
            pickle.dump(self.classifier, ofile)

    def load(self, filename='classifier.dump'):
        """
        Unpickles the classifier used
        """
        with open(filename, 'rb') as ifile:
            self.classifier = pickle.load(ifile)
# --- Script entry: connect to Reddit and warm up the classifier. ---
# NOTE(review): Python 2 print-statement syntax throughout this script.
print "Starting bot..."
user_agent = ("TWSS 0")
# Credentials are redacted placeholders ("****"); fill in before running.
r = praw.Reddit(username = "_TWSSBot_",password = "****",user_agent=user_agent,client_id = "****", client_secret = "****")
twss = TWSS()
print "Training..."
# First call triggers lazy training of the classifier (see TWSS.__call__).
twss("That was hard")
print "Loading files..."
# ---------------------------------------------------------------------------
# Persistent state files, one entry per line:
#   TWSSyes.txt / TWSSno.txt   - user-suggested positive/negative training
#   TWSSreplied.txt            - ids of comments already replied to
#   TWSSuserIgnores.txt        - usernames that opted out (!ignoreme)
#   TWSSsubIgnores.txt         - subreddits that opted out (!ignoresubreddit)
# ---------------------------------------------------------------------------


def load_lines(path):
    """Return the non-empty lines of *path* as a list, or [] if it is missing.

    Replaces five copy-pasted load blocks.  Also fixes a bug in the
    original: when TWSSyes.txt was absent it initialised ``no = []``
    instead of ``yes = []``, leaving ``yes`` undefined and crashing the
    first "!train yes" command with a NameError.
    """
    if not os.path.isfile(path):
        return []
    with open(path, "r") as f:
        # Drop the empty strings produced by blank/trailing newlines
        # (equivalent to the original filter(None, ...)).
        return [line for line in f.read().split("\n") if line]


yes = load_lines("TWSSyes.txt")
replied = load_lines("TWSSreplied.txt")
no = load_lines("TWSSno.txt")
uIgnores = load_lines("TWSSuserIgnores.txt")
sIgnores = load_lines("TWSSsubIgnores.txt")
def getParent(comment):
    """Fetch the parent comment of *comment* via the global praw client.

    ``parent_id`` is a fullname like ``t1_abc123``; the type prefix is
    stripped before the lookup.
    """
    parent_fullname = comment.parent_id
    bare_id = parent_fullname.rsplit('_', 1)[1]
    return r.comment(bare_id)
def responds():
    """Process every unread inbox message and act on bot commands.

    Recognised commands (first match wins): !info, !ignoreme,
    !ignoresubreddit, !train yes, !train no.  ``rs``/``rf`` count
    successes/failures per command, indexed in that order.
    """
    print "REPLYING TO COMMANDS..."
    rf = [0,0,0,0,0]  # failure count per command
    rs = [0,0,0,0,0]  # success count per command
    for message in r.inbox.unread(limit = None):
        if "!info" in message.body.lower():
            try:
                # Static help text describing all commands.
                message.reply("**TWSSBot**\n-\n\nThis bot uses a [Naive Bayes classifier](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) to detect the classic [double entendre](https://en.wikipedia.org/wiki/Double_entendre) of \"That's what she said\" jokes. I have several commands available:\n\n**!info**: Displays this message\n\n**!ignoreme**: Will ignore your comments in the future\n\n**!ignoresubreddit**: Will ignore all comments from this subreddit in the future. Requires moderator status.\n\n**!train yes**: Will add the comment to positive training\n\n**!train no**: Will add the comment to negative training\n\n^(Note: all suggestions made by the **!train** commands will be reviewed by the creator. Do not use them as a reply to this message, instead reply to the bot's original reply.)")
                message.mark_read()
                print "Replied to !info command by ",message.author
                rs[0] = rs[0] + 1
            except Exception as e:
                print e
                rf[0] = rf[0] + 1
        elif "!ignoreme" in message.body.lower():
            try:
                message.reply("**Thank you**, I will ignore your posts in the future.")
                message.mark_read()
                # Persist the whole ignore list after each addition.
                uIgnores.append(message.author.name)
                with open("TWSSuserIgnores.txt", "w") as f:
                    for u in uIgnores:
                        f.write(u + "\n")
                print "Ignored user ",message.author.name
                rs[1] = rs[1] + 1
            except Exception as e:
                print e
                rf[1] = rf[1] + 1
        elif "!ignoresubreddit" in message.body.lower():
            # Only a moderator of the subreddit may opt the subreddit out.
            mods = []
            for mod in r.subreddit(message.subreddit.display_name).moderator:
                mods.append(mod)
            try:
                if message.author in mods:
                    message.reply("**Thank you**, I will ignore this subreddit in the future.")
                    message.mark_read()
                    sIgnores.append(message.subreddit.display_name)
                    with open("TWSSsubIgnores.txt", "w") as f:
                        for u in sIgnores:
                            f.write(u + "\n")
                    print "Ignored subreddit ",message.subreddit
                    rs[2] = rs[2] + 1
                # NOTE(review): a non-moderator request is never marked read,
                # so it will be reprocessed on every call of responds().
            except Exception as e:
                print e
                rf[2] = rf[2] + 1
        elif "!train yes" in message.body.lower():
            try:
                # Only accept training replies made directly under the bot's
                # canonical reply (exact body match); the grandparent is then
                # the comment the bot originally classified.
                if getParent(message).body == "##That's what she said!\n\n-\nI am a bot that uses a [Naive Bayes classifier](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) to detect \"That's what she said\" jokes. Reply with **!info** to get more information.\n\n**PLEASE READ !INFO BEFORE DOWNVOTING!**\n\nThe more downvotes this bot gets, the longer it takes me to reply to your commands! Also, visit /r/TWSSBot":
                    bod = getParent(getParent(message)).body
                    message.reply("**Thank you**, my creator will review your suggestion to train \""+bod+"\" as a TWSS joke.")
                    message.mark_read()
                    print "Trained yes: ",bod
                    yes.append(getParent(getParent(message)).body)
                    with open("TWSSyes.txt", "w") as f:
                        for u in yes:
                            f.write(u + "\n")
                    rs[3] = rs[3] + 1
            except Exception as e:
                # NOTE(review): unlike the other branches, the exception is
                # swallowed here without being printed.
                rf[3] = rf[3] + 1
        elif "!train no" in message.body.lower():
            try:
                # Mirror of "!train yes" for negative training suggestions.
                if getParent(message).body == "##That's what she said!\n\n-\nI am a bot that uses a [Naive Bayes classifier](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) to detect \"That's what she said\" jokes. Reply with **!info** to get more information.\n\n**PLEASE READ !INFO BEFORE DOWNVOTING!**\n\nThe more downvotes this bot gets, the longer it takes me to reply to your commands! Also, visit /r/TWSSBot":
                    bod = getParent(getParent(message)).body
                    message.reply("**Thank you**, my creator will review your suggestion to train \""+bod+"\" as not a TWSS joke.")
                    message.mark_read()
                    print "Trained no: ",bod
                    no.append(getParent(getParent(message)).body)
                    with open("TWSSno.txt", "w") as f:
                        for u in no:
                            f.write(u + "\n")
                    rs[4] = rs[4] + 1
            except Exception as e:
                print e
                rf[4] = rf[4] + 1
    # Summary of successes (rs) and failures (rf) for this pass.
    print "Done replying!\n !info: ",rs[0]," : ",rf[0],"\n !ignoreme: ",rs[1]," : ",rf[1],"\n !ignoresubreddit: ",rs[2]," : ",rf[2],"\n !train yes: ",rs[3]," : ",rf[3],"\n !train no: ",rs[4]," : ",rf[4]
    print rs,rf
# --- Main loop: scan r/all, reply to detected TWSS comments, and service the
# inbox every 10 reply attempts. Runs forever. ---
responds()
print "Comment Loop:"
c = 0  # reply attempts (success or failure) since startup
while True:
    for comment in r.subreddit('all').comments():
        mlen = len(nltk.word_tokenize(comment.body))
        # Reply only to short (3-14 token) comments the classifier flags,
        # skipping anything already replied to and opted-out users/subreddits.
        if not comment.id in replied and twss(comment.body) and mlen > 2 and mlen<15 and not comment.author.name in uIgnores and not comment.subreddit.display_name in sIgnores:
            try:
                comment.reply("##That's what she said!\n\n-\nI am a bot that uses a [Naive Bayes classifier](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) to detect \"That's what she said\" jokes. Reply with **!info** to get more information.\n\n**PLEASE READ !INFO BEFORE DOWNVOTING!**\n\nThe more downvotes this bot gets, the longer it takes me to reply to your commands! Also, visit /r/TWSSBot")
                replied.append(comment.id)
                print "\n==================================\n",comment.body
                # Rewrite the full replied-id list so a restart doesn't
                # double-reply.
                with open("TWSSreplied.txt", "w") as f:
                    for u in replied:
                        f.write(u + "\n")
                c = c + 1
            except Exception as e:
                # NOTE(review): failures are counted but silently discarded.
                c = c + 1
            # Every 10th attempt, pause scanning to answer inbox commands.
            # NOTE(review): indentation reconstructed from a flattened paste;
            # this check is placed after each reply attempt — confirm against
            # the original layout.
            if c % 10 == 0:
                responds()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement