Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json
- import pandas as pd
- import os
- import collections, re
- from sklearn.feature_extraction.text import CountVectorizer
- import timeit
- import numpy as np
- import hashlib
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.datasets import make_classification
- def bag_the_words():
- c = 0
- file_counter = 0
- texts = []
- topics = []
- hashed_features = []
- for filename in os.listdir('.\\'):
- #print (file_counter)
- if filename.endswith(".json"):
- with open(filename) as data_file:
- file_counter += 1
- json_object = json.load(data_file)
- #print(data[4]["title"])
- for data in json_object:
- # now song is a dictionary
- if 'body' not in data:
- data.pop
- elif 'topics' not in data:
- data.pop
- else:
- #print (topic)
- body = (data['body']).lower()
- body = re.sub(r"[^\w.,?!]", " ", body)
- body = re.sub(' +',' ', body)
- texts.append(body)
- hashed_array = []
- hashed_array = hashing_vectorizer(body.split(' '), 1000)
- hashed_features.append(hashed_array)
- if 'earn' in data['topics']:
- topics.append(1)
- else:
- topics.append(0)
- c+=1
- else:
- continue
- print (c)
- return verctorize_bag_of_words(texts), hashed_features, topics
- def create_bag_of_words(texts):
- start_time = timeit.default_timer()
- bagsofwords = [collections.Counter(re.findall(r"[^\w.,?!]", txt)) for txt in texts]
- sumbags = sum(bagsofwords, collections.Counter())
- elapsed = timeit.default_timer() - start_time
- print("Bag_of_words_created_in " + str(elapsed) + " seconds")
- print (len(sumbags))
- def verctorize_bag_of_words(texts):
- vectorizer = CountVectorizer()
- X = vectorizer.fit_transform(texts)
- array = X.toarray()
- #print (numpy.shape(array))
- featurenames = vectorizer.get_feature_names()
- #print (len(featurenames))
- return X
- def hashing_vectorizer(features, n_features):
- hashed_features = np.zeros(n_features, dtype=int)
- for feature in features:
- #print ("the word is: " + feature)
- hashed = hashlib.md5(feature.encode()).hexdigest()
- index = int_generator(hashed) % n_features
- #print ("index " + str(index))
- hashed_features[index] = 1
- #print (hashed_features)
- #print (hashed.hexdigest())
- return hashed_features
- def int_generator(md5_hashed_string):
- value = 0
- counter = 0
- for char in md5_hashed_string:
- value += ord(char) + counter*4242
- counter += 1
- return value
- def randomtree(X, y):
- print ("\n--------------Growing new Tree-----------------")
- y = np.asarray(y)
- trainX = X[:8300]
- trainY = y[:8300]
- testX = X[8300:]
- testy = y[8300:]
- print("total input: " + str(len(trainX) + len(testX)))
- print("training data count: " + str(len(trainX)))
- print("test data count: " + str(len(testX)))
- start_time = timeit.default_timer()
- clf = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=50, n_jobs = 2)
- clf.fit(trainX , trainY)
- #print (clf.feature_importances_)
- predictions = clf.predict(testX)
- #print ("number of predictions " + str(len(predictions)))
- pos = 0
- neg = 0
- posneg = 0
- for i in range(0, len(testy)):
- if predictions[i] == testy[i]:
- pos += 1
- elif predictions[i] == 1 and testy[i] == 0:
- posneg += 1
- else:
- neg += 1
- elapsed = timeit.default_timer() - start_time
- print("Yggdrassil done in " + str(elapsed) + " seconds")
- print (" ------------------results--------------------")
- print ("negetives "+ str(neg))
- print ("positive negatives "+ str(posneg))
- print ("correct estimations "+ str(pos))
- print ("correctness ratio " + str(pos / len(testy)))
- #print(int_generator("637fgb0e587e46c79448da3a2fea83fe") % 1000)
- #ins = "stor fed bold"
- #hh = hashing_vectorizer(ins.split(' '), 50)
- #print(hh)
- X, h_X, y = bag_the_words()
- randomtree(X.toarray(),y)
- randomtree(h_X,y)
Add Comment
Please, Sign In to add comment