Advertisement
Guest User

Untitled

a guest
May 22nd, 2015
236
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.14 KB | None | 0 0
  1. import csv
  2. from sklearn.feature_extraction.text import TfidfVectorizer
  3. from sklearn.metrics.pairwise import linear_kernel
  4. from sklearn import linear_model
  5.  
  6. import numpy
  7. from sklearn.metrics import mean_squared_error
  8.  
  9. class TfidfModel:
  10.  
  11. #reading the file. must be in same folder
  12. def read_file(self, filename):
  13. data = []
  14. with open(filename, newline='') as csvfile:
  15. spamreader = csv.reader(csvfile, delimiter=',')
  16. for row in spamreader:
  17. if row[0] != 'id' and filename == 'train.csv':
  18. row_details = (row[1], row[2], row[3], row[4], row[5])
  19. data.append((row[0], row_details))
  20. elif row[0] != 'id' and filename == 'test.csv':
  21. row_details = (row[1], row[2], row[3])
  22. data.append((row[0], row_details))
  23. return data
  24.  
  25. #getting corpus of current row for tfidfvectorizer
  26. def make_corpus_for_qeury(self, row_details):
  27. result = []
  28. result.append(row_details[0])
  29. result.append(row_details[1])
  30. descriptrion = row_details[2]
  31. if '<' in descriptrion:
  32. descriptrion = ''
  33. result.append(descriptrion)
  34. return result
  35.  
  36. # calculating tfidf between query with title and query with desctiprion
  37. def calc_tfidf_for_row(self, row_details):
  38. tfidf = TfidfVectorizer(stop_words='english')
  39. #analyzer='word', ngram_range=(1,3), min_df=0, stop_words='english')
  40. corpus = self.make_corpus_for_qeury(row_details)
  41. tfidf_matrix = tfidf.fit_transform(corpus)
  42. cosine_similarities = linear_kernel(
  43. tfidf_matrix[0:1], tfidf_matrix).flatten()
  44. return cosine_similarities[1], cosine_similarities[2]
  45.  
  46. #calclulate tfidf for all rows and prepare data for prediction model
  47. def calc_tfidf(self, filename):
  48. result = self.read_file(filename)
  49. result_data = []
  50. for row_id, row_details in result:
  51. tfidf_title, tfidf_descr = self.calc_tfidf_for_row(row_details)
  52. row_model_details = []
  53. row_model_details.append(row_id)
  54. if filename == 'train.csv':
  55. row_model_details.append(row_details[3])
  56. else:
  57. row_model_details.append('0')
  58. row_model_details.append(tfidf_title)
  59. row_model_details.append(tfidf_descr)
  60. if filename == 'train.csv':
  61. row_model_details.append(row_details[4])
  62. else:
  63. row_model_details.append('0')
  64. row_model_details.append(row_details[0])
  65. row_model_details.append(row_details[1])
  66. row_model_details.append(row_details[2])
  67.  
  68. result_data.append(row_model_details)
  69. return result_data
  70.  
  71. class PredictionModel:
  72.  
  73. # get x and y for model from tfidf data
  74. def get_xy(self, data):
  75. x = []
  76. y = []
  77. for row_details in data:
  78. y.append(int(row_details[1]))
  79. x.append((float(row_details[2]), float(row_details[3])))
  80. return numpy.matrix(x), numpy.array(y)
  81.  
  82. def get_test_x(self, data):
  83. x = []
  84. for row_details in data:
  85. x.append((float(row_details[2]), float(row_details[3])))
  86. return numpy.matrix(x)
  87.  
  88. #train model with ridge regression, calc mse and get predictions for test data
  89. def ridge_regression_model(self, data_fit, data_test):
  90. x, y = self.get_xy(data_fit)
  91. linear_regression = linear_model.Ridge()
  92. model = linear_regression.fit(x, y)
  93. #mean_squared_error(y_train, model.predict(X_train))
  94. predicted_y = model.predict(x)
  95. normalized_predictions = self.normalize_predictions(predicted_y)
  96. mse = mean_squared_error(y, normalized_predictions)
  97. print(mse)
  98. x_predict = self.get_test_x(data_test)
  99. predicted_values = model.predict(x_predict)
  100. self.get_predictions(data_test, predicted_values)
  101.  
  102. #write predictions for test data into a file
  103. def get_predictions(self, data_test, predicted_values):
  104. with open('result.csv', 'w') as f_res:
  105. for index, row in enumerate(data_test):
  106. row_id = row[0]
  107. prediction = predicted_values[index]
  108. f_res.write('{},{},\n'.format(row_id, prediction))
  109.  
  110. #if predictions greater then 4 or less then 1, ceil them
  111. def normalize_predictions(self, predictions):
  112. normalized_predictions = []
  113. for pred in predictions:
  114. if float(pred) > 4:
  115. normalized_predictions.append(float(4))
  116. elif float(pred) < 1:
  117. normalized_predictions.append(float(1))
  118. else:
  119. normalized_predictions.append(pred)
  120. return numpy.array(normalized_predictions)
  121.  
  122.  
  123. if __name__ == '__main__':
  124. tfidf_model = TfidfModel()
  125. data_for_model = tfidf_model.calc_tfidf(filename='train.csv')
  126. test_data = tfidf_model.calc_tfidf(filename='test.csv')
  127. predict_model = PredictionModel()
  128. regression_model = predict_model.ridge_regression_model(
  129. data_for_model, test_data)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement