daily pastebin goal
6%
SHARE
TWEET

Untitled

a guest Apr 16th, 2018 57 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from sklearn import cross_validation
  2. import numpy as np # linear algebra
  3. import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
  4. from sklearn.model_selection import train_test_split # function for splitting data to train and test sets
  5.  
  6. import nltk
  7. from nltk.corpus import stopwords
  8. from nltk.classify import SklearnClassifier
  9.  
  10. from wordcloud import WordCloud,STOPWORDS
  11. import matplotlib.pyplot as plt
  12. %matplotlib inline
  13.  
  14. # Input data files are available in the "../input/" directory.
  15. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
  16.  
  17. from subprocess import check_output
  18.  
  # Load at most the first 6200 rows and keep only the two columns used below.
  data = pd.read_csv('999.csv', nrows=6200)
  data = data[['text', 'label']]

  # Hold out 10% of the rows for evaluation.
  train, test = train_test_split(data, test_size=0.1)

  # Drop training rows whose label is the empty string.
  train = train[train.label != ""]

  # Training texts per class: 'REAL' is treated as positive, 'FAKE' as negative.
  train_pos = train.loc[train['label'] == 'REAL', 'text']
  train_neg = train.loc[train['label'] == 'FAKE', 'text']
  32.  
  def wordcloud_draw(data, color='black'):
      """Render a word cloud for the given texts and show it with matplotlib.

      The strings in ``data`` are joined with spaces and re-split on
      whitespace. Tokens that contain ``http``, start with ``@`` or ``#``,
      or equal ``RT`` are dropped before the cloud is generated.

      Args:
          data: Iterable of strings to visualise.
          color: Background colour handed to ``WordCloud``.
      """
      def is_wanted(token):
          # Reject links, mentions, hashtags and the retweet marker.
          return not ('http' in token
                      or token.startswith(('@', '#'))
                      or token == 'RT')

      corpus = ' '.join(data)
      kept_text = " ".join(filter(is_wanted, corpus.split()))

      cloud = WordCloud(
          stopwords=STOPWORDS,
          background_color=color,
          width=2500,
          height=2000,
      )
      image = cloud.generate(kept_text)

      plt.figure(1, figsize=(13, 13))
      plt.imshow(image)
      plt.axis('off')
      plt.show()
  50.    
  # One cloud per training class: REAL texts on white, FAKE texts on black.
  for heading, texts, background in (
      ("Positive words", train_pos, 'white'),
      ("Negative words", train_neg, 'black'),
  ):
      print(heading)
      wordcloud_draw(texts, background)
  55.  
  # Build the labelled training documents: a list of (tokens, label) pairs.
  tweets = []
  stopwords_set = set(stopwords.words("english"))

  for text, label in zip(train['text'], train['label']):
      # Keep tokens of three or more characters, lower-cased.
      tokens = [token.lower() for token in text.split() if len(token) >= 3]
      # Drop links, mentions, hashtags and English stop words.
      # The former `word != 'RT'` test is gone: it could never fire because
      # tokens are already lower-cased and two-character tokens are filtered
      # out above.
      kept = [
          token for token in tokens
          if 'http' not in token
          and not token.startswith(('@', '#'))
          and token not in stopwords_set
      ]
      # Store the stop-word-free tokens; previously the filtered list was
      # computed but the unfiltered one was appended.
      tweets.append((kept, label))
  68.  
  # Held-out texts per class; these are what the classifier is scored on later.
  test_pos = test.loc[test['label'] == 'REAL', 'text']
  test_neg = test.loc[test['label'] == 'FAKE', 'text']

  # NOTE(review): these clouds are drawn from the *training* texts again, not
  # from test_pos / test_neg -- confirm that this is intended.
  for heading, texts, background in (
      ("Positiv2e words", train_pos, 'white'),
      ("Negativ2e words", train_neg, 'black'),
  ):
      print(heading)
      wordcloud_draw(texts, background)
  78.  
  79. # Extracting word features
  def get_words_in_tweets(tweets):
      """Flatten labelled documents into one list of words.

      Args:
          tweets: Iterable of ``(words, sentiment)`` pairs, where ``words``
              is an iterable of tokens.

      Returns:
          A list with every token of every document, in input order and
          with duplicates kept. The sentiment labels are ignored.
      """
      # A comprehension avoids the former local named `all`, which shadowed
      # the builtin of the same name.
      return [word for words, _sentiment in tweets for word in words]
  85.  
  def get_word_features(wordlist):
      """Return the vocabulary of ``wordlist``.

      The words are counted with ``nltk.FreqDist`` and the keys of that
      frequency distribution are returned.

      Args:
          wordlist: Iterable of word tokens.

      Returns:
          The keys view of the frequency distribution built from ``wordlist``.
      """
      return nltk.FreqDist(wordlist).keys()
  # Vocabulary of the training documents; extract_features() reads this global.
  all_training_words = get_words_in_tweets(tweets)
  w_features = get_word_features(all_training_words)
  91.  
  def extract_features(document):
      """Turn a tokenised document into a dict of boolean features.

      For every word in the module-level ``w_features`` the result holds a
      key ``'containts(<word>)'`` whose value says whether that word occurs
      in ``document``.

      Args:
          document: Iterable of word tokens.

      Returns:
          dict mapping each feature name to ``True`` or ``False``.
      """
      present = set(document)
      return {
          'containts(%s)' % candidate: candidate in present
          for candidate in w_features
      }
  98.  
  99.  
  100. # wordcloud_draw(w_features)
  101.  
  102.  
  # Training the Naive Bayes classifier
  training_set = nltk.classify.apply_features(extract_features, tweets)
  classifier = nltk.NaiveBayesClassifier.train(training_set)


  def _count_correct(texts, expected_label):
      """Return how many of ``texts`` the classifier labels ``expected_label``.

      Each text is lower-cased before it is split into tokens. The training
      tokens were lower-cased too, so without this step any capitalised
      word in a test text could never match a feature.
      """
      return sum(
          1
          for text in texts
          if classifier.classify(extract_features(text.lower().split()))
          == expected_label
      )


  # Number of held-out texts of each class that were classified correctly.
  neg_cnt = _count_correct(test_neg, 'FAKE')
  pos_cnt = _count_correct(test_pos, 'REAL')

  print("Result")

  # Each line is printed as <number of test texts>/<number classified correctly>.
  print('[FAKE]: %s/%s ' % (len(test_neg), neg_cnt))
  print('[REAL]: %s/%s ' % (len(test_pos), pos_cnt))

  # NOTE(review): looks like a leftover debug print -- confirm before removing.
  print("ciota")
RAW Paste Data
Top