Guest User

Untitled

a guest
Nov 22nd, 2017
66
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.37 KB | None | 0 0
  1. import os, json
  2. from sklearn.feature_extraction.text import CountVectorizer
  3. from sklearn.ensemble import RandomForestClassifier
  4. from sklearn.model_selection import train_test_split
  5. import pandas as pd
  6.  
  7. #finds all json files in a directory
  8. json_files = [pos_json for pos_json in os.listdir('./') if pos_json.endswith('.json')]
  9.  
  10. words = []
  11.  
  12. #these iterations is to get all body text which has at least one topic and body is not blank from articles to a words list.
  13. for file in json_files:
  14. for article in json.loads(open(file).read()):
  15. if 'topics' in article:
  16. if len(article['topics']) > 0:
  17. if 'body' in article:
  18. article['body'] = article['body'].lower().replace('\n', ' ')
  19. words.append(article['body'])
  20.  
  21. vectorizer = CountVectorizer(token_pattern=r"\b\w+\b")
  22. data = vectorizer.fit_transform(words).todense()
  23.  
  24. #prints data matrix and the size of that matrix
  25. print (data)
  26. print ('Bag of words matrix size is', data.shape)
  27.  
  28. #spliting data to train and test data
  29. X, y = make_classification(n_samples=100, n_features=28473, n_informative=2, n_redundant=0, random_state=0, shuffle=False)
  30.  
  31. df = pd.DataFrame(data=data)
  32. train, test = train_test_split(df, test_size=0.2)
  33.  
  34. #running Random Forest Classifier with 50 estimators
  35. forest = RandomForestClassifier(n_estimators = 50)
  36. forest = forest.fit(train, y)
  37. predict = forest.predict(test)
  38. #printing out the accuracy result
  39. print (predict)
Add Comment
Please, Sign In to add comment