Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import json
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
def find_json_files(directory='./'):
    """Return the paths of all ``.json`` files directly inside *directory*.

    Names are sorted so the corpus is assembled in a reproducible order
    (``os.listdir`` returns entries in arbitrary order).
    """
    return [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.endswith('.json')
    ]


def collect_labeled_bodies(articles):
    """Extract normalised body texts and their labels from *articles*.

    An article is kept only when it has at least one topic and a non-blank
    string body.  The body is lower-cased and its newlines are replaced by
    spaces.  The input dictionaries are not modified.

    NOTE(review): the label is the article's FIRST topic.  This assumes
    ``topics`` is a list of topic names and that single-label classification
    is what is wanted -- confirm against the data set.

    Returns:
        A ``(bodies, labels)`` tuple of two equally long lists.
    """
    bodies = []
    labels = []
    for article in articles:
        topics = article.get('topics')
        body = article.get('body')
        if not topics:
            continue
        if not isinstance(body, str) or not body.strip():
            continue
        bodies.append(body.lower().replace('\n', ' '))
        labels.append(topics[0])
    return bodies, labels


def load_corpus(json_files):
    """Read every file in *json_files* and merge their labelled bodies.

    Each file is expected to hold a JSON array of article objects.

    Returns:
        A ``(bodies, labels)`` tuple accumulated over all files, in order.
    """
    bodies = []
    labels = []
    for path in json_files:
        # The context manager closes the handle even if parsing fails.
        with open(path, encoding='utf-8') as handle:
            articles = json.load(handle)
        file_bodies, file_labels = collect_labeled_bodies(articles)
        bodies.extend(file_bodies)
        labels.extend(file_labels)
    return bodies, labels


def main():
    """Train a random forest on a bag-of-words matrix and report the results.

    Loads the articles from the JSON files in the current directory, builds
    the bag-of-words matrix, holds out 20% of the articles for testing, and
    prints the matrix, its shape, the test predictions and the accuracy.
    """
    words, labels = load_corpus(find_json_files('./'))
    if not words:
        # CountVectorizer raises an opaque "empty vocabulary" error otherwise.
        raise SystemExit('No articles with both a topic and a body were found.')

    vectorizer = CountVectorizer(token_pattern=r"\b\w+\b")
    # Keep the matrix sparse for training; it is densified only for display.
    data = vectorizer.fit_transform(words)

    # Prints the data matrix and the size of that matrix.
    print(data.toarray())
    print('Bag of words matrix size is', data.shape)

    # Split features and labels together so every row keeps its own label.
    train_x, test_x, train_y, test_y = train_test_split(
        data, labels, test_size=0.2)

    # Random forest classifier with 50 estimators.
    forest = RandomForestClassifier(n_estimators=50)
    forest.fit(train_x, train_y)
    predict = forest.predict(test_x)

    # Print the predictions followed by the accuracy on the held-out set.
    print(predict)
    print('Accuracy is', forest.score(test_x, test_y))


if __name__ == '__main__':
    main()
Add Comment
Please, Sign In to add comment