Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from sklearn import cross_validation
- import numpy as np # linear algebra
- import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
- from sklearn.model_selection import train_test_split # function for splitting data to train and test sets
- import nltk
- from nltk.corpus import stopwords
- from nltk.classify import SklearnClassifier
- from wordcloud import WordCloud,STOPWORDS
- import matplotlib.pyplot as plt
- %matplotlib inline
- # Input data files are available in the "../input/" directory.
- # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
- from subprocess import check_output
# Load the first 6200 rows of the dataset; only the article text and its
# REAL/FAKE label are used downstream.
data = pd.read_csv('999.csv', nrows=6200)
data = data[['text', 'label']]

# Hold out 10% of the rows as a test set.
train, test = train_test_split(data, test_size=0.1)

# Drop rows whose label is an empty string (unlabeled entries).
# NOTE(review): pandas reads missing labels as NaN, not "" — confirm this
# filter actually removes anything.
train = train[train.label != ""]

# Per-class text columns of the training set, used for the word clouds below.
train_pos = train[train['label'] == 'REAL']['text']
train_neg = train[train['label'] == 'FAKE']['text']
def wordcloud_draw(data, color='black'):
    """Render a word cloud for an iterable of text documents.

    Tokens containing 'http', @-mentions, #-hashtags and the literal
    'RT' marker are removed before the cloud is generated.

    Parameters
    ----------
    data : iterable of str
        The documents to visualise.
    color : str, optional
        Background colour passed to WordCloud (default 'black').
    """
    words = ' '.join(data)
    cleaned_word = " ".join([word for word in words.split()
                             if 'http' not in word
                             and not word.startswith('@')
                             and not word.startswith('#')
                             and word != 'RT'])
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color=color,
                          width=2500,
                          height=2000).generate(cleaned_word)
    plt.figure(1, figsize=(13, 13))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
- print("Positive words")
- wordcloud_draw(train_pos,'white')
- print("Negative words")
- wordcloud_draw(train_neg)
# Build (tokens, label) training pairs: lowercase, drop short tokens,
# URLs, mentions, hashtags, retweet markers and English stopwords.
tweets = []
stopwords_set = set(stopwords.words("english"))
for index, row in train.iterrows():
    words_filtered = [e.lower() for e in row.text.split() if len(e) >= 3]
    words_cleaned = [word for word in words_filtered
                     if 'http' not in word
                     and not word.startswith('@')
                     and not word.startswith('#')
                     and word != 'RT']
    words_without_stopwords = [word for word in words_cleaned
                               if word not in stopwords_set]
    # BUG FIX: the original appended words_cleaned here, silently
    # discarding the stopword filtering computed on the line above.
    tweets.append((words_without_stopwords, row.label))
# Per-class text columns of the held-out test set.
test_pos = test[test['label'] == 'REAL']['text']
test_neg = test[test['label'] == 'FAKE']['text']

# FIX: the original redrew the *training* clouds here (with misspelled
# "Positiv2e"/"Negativ2e" captions); the test-set split built just above
# was clearly what was meant to be plotted.
print("Positive words")
wordcloud_draw(test_pos, 'white')
print("Negative words")
wordcloud_draw(test_neg)
# Extracting word features
def get_words_in_tweets(tweets):
    """Flatten a list of (words, sentiment) pairs into one flat word list.

    The sentiment element of each pair is ignored; only the token lists
    are concatenated, in order.
    """
    # FIX: the original accumulator was named `all`, shadowing the builtin.
    all_words = []
    for words, _sentiment in tweets:
        all_words.extend(words)
    return all_words
def get_word_features(wordlist):
    """Return the vocabulary (unique words) of *wordlist*.

    Builds an nltk.FreqDist and returns its keys view; the counts
    themselves are discarded, so this is effectively "unique words in
    first-seen order".
    """
    freq_dist = nltk.FreqDist(wordlist)
    return freq_dist.keys()
# Global feature vocabulary consumed by extract_features below.
w_features = get_word_features(get_words_in_tweets(tweets))
def extract_features(document, vocabulary=None):
    """Map a token list to NLTK-style boolean presence features.

    Parameters
    ----------
    document : iterable of str
        Tokens of one document.
    vocabulary : iterable of str, optional
        Feature vocabulary; defaults to the module-level `w_features`,
        preserving the original call signature.

    Returns
    -------
    dict
        Maps 'contains(word)' -> True/False for every vocabulary word.
    """
    if vocabulary is None:
        vocabulary = w_features
    document_words = set(document)
    # FIX: feature-name typo 'containts' -> 'contains'. The key is only
    # used internally, and both training and classification go through
    # this function, so the change is behaviour-consistent.
    return {'contains(%s)' % word: (word in document_words)
            for word in vocabulary}
# Train a Naive Bayes classifier; apply_features wraps the (tokens, label)
# pairs so feature dicts are computed lazily instead of all at once.
training_set = nltk.classify.apply_features(extract_features, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)
# Count how many held-out documents of each class the classifier labels
# correctly (whitespace tokenisation only — no cleaning, unlike training).
neg_cnt = 0
pos_cnt = 0
for obj in test_neg:
    res = classifier.classify(extract_features(obj.split()))
    if res == 'FAKE':
        neg_cnt += 1
for obj in test_pos:
    res = classifier.classify(extract_features(obj.split()))
    if res == 'REAL':
        pos_cnt += 1
- print("Result")
- print('[FAKE]: %s/%s ' % (len(test_neg),neg_cnt))
- print('[REAL]: %s/%s ' % (len(test_pos),pos_cnt))
- print("ciota")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement