# Untitled

import os, json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

def load_articles(directory='./'):
    """Collect training texts and labels from the ``.json`` files in *directory*.

    Every file whose name ends with ``.json`` is parsed and iterated as a
    sequence of article objects. An article is kept only when it has a
    non-empty ``topics`` value and a non-blank string ``body``. The body is
    lower-cased and its newlines are replaced by spaces.

    The label of an article is its first topic.
    NOTE(review): this assumes ``topics`` is a list of topic names and that a
    single-label classifier is wanted -- confirm against the data set.

    Args:
        directory: Folder to scan (not recursive). Defaults to the current
            working directory.

    Returns:
        A ``(bodies, labels)`` tuple of two equally long lists.
    """
    bodies = []
    labels = []
    # Sorted so the sample order does not depend on the file system.
    for name in sorted(os.listdir(directory)):
        if not name.endswith('.json'):
            continue
        # The context manager closes the handle even if parsing fails.
        with open(os.path.join(directory, name), encoding='utf-8') as handle:
            articles = json.load(handle)
        for article in articles:
            if not isinstance(article, dict):
                continue
            topics = article.get('topics')
            body = article.get('body')
            if not topics or not isinstance(body, str) or not body.strip():
                continue
            bodies.append(body.lower().replace('\n', ' '))
            labels.append(topics[0])
    return bodies, labels


def main():
    """Train a random forest on bag-of-words features and report its accuracy.

    Reads the articles from the current directory, vectorizes their bodies,
    holds out 20% of the samples for testing, and prints the feature matrix,
    its shape, the predictions for the held-out samples and the accuracy on
    them.

    Raises:
        SystemExit: If no usable article is found.
    """
    words, topics = load_articles('./')
    if not words:
        raise SystemExit('No articles with both topics and a body were found in ./')

    vectorizer = CountVectorizer(token_pattern=r"\b\w+\b")
    # toarray() yields a plain ndarray; todense() returns the deprecated np.matrix.
    data = vectorizer.fit_transform(words).toarray()

    # Print the data matrix and its size.
    print(data)
    print('Bag of words matrix size is', data.shape)

    # Split features and labels together so every row keeps its own label.
    df = pd.DataFrame(data=data)
    train, test, y_train, y_test = train_test_split(df, topics, test_size=0.2)

    # Run a random forest classifier with 50 estimators.
    forest = RandomForestClassifier(n_estimators=50)
    forest = forest.fit(train, y_train)
    predict = forest.predict(test)

    # Print the predictions and the accuracy on the held-out samples.
    print(predict)
    print('Accuracy is', forest.score(test, y_test))


if __name__ == '__main__':
    main()