Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Characters removed from a tweet before it is split into words.  '#', '@'
# and '&' are deliberately absent so the prefix filter below can still see
# hashtags, mentions and HTML entities.
_PUNCTUATION = '!"$%\'()*+,-./:;<=>?[\\]^_`{|}~' + u"\u2014" + u"\u2026"

_STOP_WORDS_SHORT = frozenset([
    "a", "an", "the", "this", "that", "of", "for", "or", "and", "on", "to",
    "be", "if", "we", "you", "in", "is", "at", "it", "rt", "mt",
])
_HRC_NAMES = frozenset(["clinton", "hillary", "tim", "timothy", "kaine"])
_DJT_NAMES = frozenset(["donald", "trump", "mike", "michael", "pence"])

# Stop-word sets selectable through filterTweet's second argument.
_STOP_WORDS = {
    "basic": _STOP_WORDS_SHORT,
    "hrc": _STOP_WORDS_SHORT | _HRC_NAMES,
    "djt": _STOP_WORDS_SHORT | _DJT_NAMES,
    "both": _STOP_WORDS_SHORT | _HRC_NAMES | _DJT_NAMES,
    "none": frozenset(),
}

# Word prefixes selectable through filterTweet's third argument.  Stored as
# tuples because str.startswith accepts a tuple of alternatives.
_STOP_PREFIXES = {
    "default": ("@", "#", "http", "&"),
    "hashtags_only": ("#",),
    "none": (),
}


def filterTweet(str1, str2, str3):
    """Return the words of a tweet that survive stop-word and prefix filtering.

    The tweet has its punctuation stripped, is split on single spaces, and
    each resulting word is kept (with its original casing) unless its
    lowercase form is a stop word or starts with one of the stop prefixes.

    Args:
        str1: The tweet text.
        str2: Stop-word set to apply: "basic", "hrc", "djt", "both" or "none".
        str3: Prefix set to apply: "default", "hashtags_only" or "none".

    Returns:
        A list of the kept words, in their original order.

    Raises:
        KeyError: If str2 or str3 is not one of the names listed above.
    """
    stop_words = _STOP_WORDS[str2]
    stop_prefixes = _STOP_PREFIXES[str3]

    cleaned = ''.join(ch for ch in str1 if ch not in _PUNCTUATION)

    kept = []
    for word in cleaned.split(' '):
        # An empty tweet, consecutive spaces, or a token that consisted only
        # of punctuation all produce empty strings here; they carry no
        # content and used to crash the first-character prefix check.
        if not word:
            continue
        lowered = word.lower()
        if lowered in stop_words or lowered.startswith(stop_prefixes):
            continue
        kept.append(word)
    return kept
def main(s1, s2, s3):
    """Filter a tweet with filterTweet and print the resulting word list.

    Args:
        s1: The tweet text, forwarded as filterTweet's first argument.
        s2: Forwarded as filterTweet's second argument.
        s3: Forwarded as filterTweet's third argument.
    """
    # A parenthesised single argument prints the same under the Python 2
    # print statement and the Python 3 print function, so this line runs on
    # both interpreters.
    print(filterTweet(s1, s2, s3))
# Demo run: pass one sample tweet through main() using the "basic" stop-word
# set and the "default" prefix set.
hrctweet = "RT @HillaryforIA: The #iacaucus starts in 24 hours! #ImWithHer https://t.co/bF5fyfKbFt"
main(hrctweet, "basic", "default")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement