Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import ast

import pandas

# Count every word stored in the "Tweet_stopped" column of the sample CSV.
df = pandas.read_csv("twitter_cleanedsample.csv")

# Each cell of "Tweet_stopped" is a *string* that merely resembles a list of
# strings, for example:
#     "['micosapiens', 'faqstv', 'hannahbcn', 'joancbaez', 'tvcat']"
# Calling len() on that cell gives 60 -- every bracket, quote mark, comma and
# space counts as a character -- rather than the 5 entries it appears to hold.
# To get meaningful counts the text has to become a real list first.
# ast.literal_eval does that conversion and only accepts Python literals, so
# it avoids the security problems of evaluating arbitrary text from the file.
all_words = []
for cell_text in df["Tweet_stopped"]:
    # Parse the list-looking text and append its entries to the running total.
    all_words += ast.literal_eval(cell_text)

word_total = len(all_words)
print("Found {} words.".format(word_total))
# Output reported by the original author for their copy of the CSV:
# Found 7489 words.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement