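"""Binary topic classification on a folder of JSON news articles.

Each article is labelled 1 if 'earn' appears in its 'topics' list and 0
otherwise (the 'earn' tag suggests a Reuters-style corpus, but any JSON
files with 'body' and 'topics' fields will work). Two feature
representations are compared: a full CountVectorizer bag-of-words and a
1000-bucket hashing-trick vectorizer built by hand below.
"""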
import json
import os
import collections
import re
import timeit
import hashlib

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

def bag_the_words():
    """Read every .json file in the current directory and build features."""
    c = 0
    file_counter = 0
    texts = []
    topics = []
    hashed_features = []
    for filename in os.listdir('.'):
        if not filename.endswith(".json"):
            continue
        with open(filename) as data_file:
            file_counter += 1
            json_object = json.load(data_file)
            for data in json_object:
                # skip articles that lack a body or topic labels
                if 'body' not in data or 'topics' not in data:
                    continue
                body = data['body'].lower()
                body = re.sub(r"[^\w.,?!]", " ", body)  # strip odd characters
                body = re.sub(' +', ' ', body)          # collapse runs of spaces
                texts.append(body)
                hashed_features.append(hashing_vectorizer(body.split(' '), 1000))
                # binary label: does the article carry the 'earn' topic?
                topics.append(1 if 'earn' in data['topics'] else 0)
                c += 1

    print(c)
    return vectorize_bag_of_words(texts), hashed_features, topics

def create_bag_of_words(texts):
    """Count word frequencies across all texts (unused by the main script)."""
    start_time = timeit.default_timer()
    # r"\w+" pulls out the words; the per-text counters are summed into one bag
    bagsofwords = [collections.Counter(re.findall(r"\w+", txt)) for txt in texts]
    sumbags = sum(bagsofwords, collections.Counter())
    elapsed = timeit.default_timer() - start_time
    print("Bag_of_words created in " + str(elapsed) + " seconds")
    print(len(sumbags))
    return sumbags

def vectorize_bag_of_words(texts):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(texts)  # sparse document-term matrix
    #print(np.shape(X.toarray()))
    featurenames = vectorizer.get_feature_names()  # get_feature_names_out in scikit-learn >= 1.0
    #print(len(featurenames))
    return X

def hashing_vectorizer(features, n_features):
    """Hashing trick: map each word into one of n_features binary buckets."""
    hashed_features = np.zeros(n_features, dtype=int)
    for feature in features:
        # md5 the word, fold the digest to an int, and mark that bucket
        hashed = hashlib.md5(feature.encode()).hexdigest()
        index = int_generator(hashed) % n_features
        hashed_features[index] = 1
    return hashed_features
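
# For comparison, scikit-learn ships the same hashing trick as
# HashingVectorizer; a roughly equivalent call would look like this
# (a sketch only, not used by the script above):
#from sklearn.feature_extraction.text import HashingVectorizer
#hv = HashingVectorizer(n_features=1000, binary=True, norm=None, alternate_sign=False)
#hashed = hv.transform(["some document text"])  # sparse 1 x 1000 matrix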

def int_generator(md5_hashed_string):
    """Fold a hex digest into a single integer, weighting chars by position."""
    value = 0
    for counter, char in enumerate(md5_hashed_string):
        value += ord(char) + counter * 4242
    return value
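
# Note: Python can parse the hex digest directly, so
# int(md5_hashed_string, 16) % n_features would serve the same purpose.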

def randomtree(X, y):

    print("\n--------------Growing new Tree-----------------")

    y = np.asarray(y)

    # fixed split: first 8300 samples train, the rest test
    trainX = X[:8300]
    trainY = y[:8300]
    testX = X[8300:]
    testy = y[8300:]

    print("total input: " + str(len(trainX) + len(testX)))
    print("training data count: " + str(len(trainX)))
    print("test data count: " + str(len(testX)))

    start_time = timeit.default_timer()
    clf = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=50, n_jobs=2)
    clf.fit(trainX, trainY)

    #print(clf.feature_importances_)
    predictions = clf.predict(testX)
    correct = 0
    false_neg = 0
    false_pos = 0

    for i in range(len(testy)):
        if predictions[i] == testy[i]:
            correct += 1
        elif predictions[i] == 1 and testy[i] == 0:
            false_pos += 1
        else:
            false_neg += 1

    elapsed = timeit.default_timer() - start_time
    print("Yggdrasil done in " + str(elapsed) + " seconds")

    print(" ------------------results--------------------")
    print("false negatives " + str(false_neg))
    print("false positives " + str(false_pos))
    print("correct predictions " + str(correct))
    print("accuracy " + str(correct / len(testy)))
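
# Note: sklearn.metrics.accuracy_score and confusion_matrix would compute
# the same counts as the manual loop above.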

#print(int_generator("637fgb0e587e46c79448da3a2fea83fe") % 1000)
#ins = "stor fed bold"
#hh = hashing_vectorizer(ins.split(' '), 50)
#print(hh)

X, h_X, y = bag_the_words()

# train once on the dense CountVectorizer matrix, once on the hashed features
randomtree(X.toarray(), y)
randomtree(h_X, y)