Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Build DIM_WORD.csv: the unique, normalised words found in a news CSV.

The second column of every input row is split on whitespace, lower-cased
and stripped of ASCII punctuation.  The distinct words that remain, minus a
short list of filler words, are written one per row to the output file.
"""
import csv
import string

# Input CSV; the text to tokenise is read from the second column of each row.
NEWS_CSV_PATH = 'C:/Users/Fishers/Desktop/RedditNews.csv'
# Output CSV; one word per row.
WORD_CSV_PATH = 'DIM_WORD.csv'
# "Filler words" that must not appear in the output.
EXCLUDE_WORDS = ('in', 'the', 'a', 'on', 'if', 'an')

# Translation table that deletes every character in string.punctuation.
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def _normalise_words(text):
    """Return the words of *text*, lower-cased and without punctuation.

    Tokens that consist only of punctuation (for example ``--``) collapse to
    an empty string and are dropped, so no blank "word" reaches the output.
    """
    cleaned = (
        token.lower().translate(_STRIP_PUNCTUATION) for token in text.split()
    )
    return [word for word in cleaned if word]


def collect_unique_words(rows, exclude=EXCLUDE_WORDS):
    """Return the sorted unique words found in the second column of *rows*.

    Args:
        rows: Iterable of sequences of strings, such as a ``csv.reader``.
        exclude: Words to leave out of the result.

    Returns:
        A sorted list of distinct normalised words with every word in
        *exclude* removed.  Sorting makes the result reproducible; plain set
        iteration order can differ from one run to the next.
    """
    excluded = set(exclude)
    found = set()
    for row in rows:
        # Blank lines and short rows have no second column; skip them rather
        # than failing with IndexError.
        if len(row) < 2:
            continue
        found.update(_normalise_words(row[1]))
    return sorted(found - excluded)


def main():
    """Read the news CSV, write the word CSV and print a short summary."""
    # Read everything before touching the output file, so a failure while
    # parsing does not leave a truncated DIM_WORD.csv behind.
    # newline='' is what the csv module documentation asks for on files
    # handed to csv.reader.
    with open(NEWS_CSV_PATH, newline='') as news:
        final_words = collect_unique_words(csv.reader(news))

    print('The excluded words are: ', ', '.join(EXCLUDE_WORDS))

    with open(WORD_CSV_PATH, 'w', newline='') as myfile:
        # Write the *filtered* list so the file matches the reported count.
        csv.writer(myfile).writerows([word] for word in final_words)

    print('Parsing Complete. There are ', str(len(final_words)), ' total unique words.')


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement