Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import csv
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import linear_kernel
- from sklearn import linear_model
- import numpy
- from sklearn.metrics import mean_squared_error
class TfidfModel:
    """Turn the rows of the train/test CSV files into TF-IDF similarity features.

    For every row the query text is compared with the title and with the
    description; the two similarity scores are what PredictionModel later
    uses as its regression inputs.
    """

    # Number of columns kept after the leading id column, keyed by the
    # base name of the file being read.
    _FIELD_COUNTS = {'train.csv': 5, 'test.csv': 3}

    @staticmethod
    def _dataset_name(filename):
        """Return the final path component of *filename* (e.g. 'train.csv')."""
        # Local import so this class does not rely on a file-level import.
        import os
        return os.path.basename(filename)

    def read_file(self, filename):
        """Read a CSV file into a list of ``(row_id, row_details)`` tuples.

        The base name of *filename* selects the layout: for 'train.csv'
        columns 1-5 are kept, for 'test.csv' columns 1-3. Any other name
        yields an empty list (the file is still opened). Matching on the
        base name means 'data/train.csv' is treated like 'train.csv'.

        Rows whose first cell is 'id' (the header) and blank rows are
        skipped.

        Raises:
            IndexError: if a data row has fewer columns than the layout needs.
        """
        field_count = self._FIELD_COUNTS.get(self._dataset_name(filename))
        data = []
        with open(filename, newline='') as csvfile:
            for row in csv.reader(csvfile, delimiter=','):
                # csv.reader yields [] for a blank line; row[0] would fail.
                if not row or row[0] == 'id' or field_count is None:
                    continue
                # Explicit indexing (not slicing) so short rows fail loudly.
                row_details = tuple(
                    row[column] for column in range(1, field_count + 1))
                data.append((row[0], row_details))
        return data

    def make_corpus_for_qeury(self, row_details):
        """Return the three documents ``[query, title, description]`` for a row.

        A description containing '<' is replaced by an empty string
        (presumably to drop HTML markup -- confirm against the data).
        """
        description = row_details[2]
        if '<' in description:
            description = ''
        return [row_details[0], row_details[1], description]

    def calc_tfidf_for_row(self, row_details):
        """Return the query/title and query/description similarity of a row.

        A TF-IDF model (English stop words removed) is fitted on the
        row's own three documents, and the query vector is multiplied
        with the title and description vectors via ``linear_kernel``.

        Returns ``(0.0, 0.0)`` when no usable term is left in any of the
        three documents, instead of letting the vectorizer's ValueError
        abort the whole run.
        """
        corpus = self.make_corpus_for_qeury(row_details)
        try:
            tfidf_matrix = TfidfVectorizer(
                stop_words='english').fit_transform(corpus)
        except ValueError:
            # Empty vocabulary: every document was empty or stop words only,
            # so there is nothing the query could share with the others.
            return 0.0, 0.0
        # Row 0 is the query; index 0 of the result is query-vs-query.
        similarities = linear_kernel(
            tfidf_matrix[0:1], tfidf_matrix).flatten()
        return similarities[1], similarities[2]

    def calc_tfidf(self, filename):
        """Compute the similarity features for every row of *filename*.

        Each returned row is a list::

            [row_id, target, title_similarity, description_similarity,
             extra, query, title, description]

        For 'train.csv' ``target`` and ``extra`` are columns 4 and 5 of
        the file (``target`` is what PredictionModel.get_xy reads as y);
        for any other file both are the placeholder string '0'.
        """
        is_train = self._dataset_name(filename) == 'train.csv'
        result_data = []
        for row_id, row_details in self.read_file(filename):
            tfidf_title, tfidf_descr = self.calc_tfidf_for_row(row_details)
            result_data.append([
                row_id,
                row_details[3] if is_train else '0',
                tfidf_title,
                tfidf_descr,
                row_details[4] if is_train else '0',
                row_details[0],
                row_details[1],
                row_details[2],
            ])
        return result_data
class PredictionModel:
    """Fit a ridge regression on the TF-IDF features and write predictions.

    Input rows are the lists produced by TfidfModel.calc_tfidf: index 0
    is the row id, index 1 the target, indexes 2 and 3 the two
    similarity features.
    """

    def get_xy(self, data):
        """Split feature rows into a feature array and a target vector.

        Returns:
            ``(x, y)`` where ``x`` is a float ndarray of shape ``(n, 2)``
            built from indexes 2 and 3 of each row, and ``y`` is an
            array of ``int(row[1])``.
        """
        targets = numpy.array([int(row_details[1]) for row_details in data])
        return self.get_test_x(data), targets

    def get_test_x(self, data):
        """Return the two similarity features of each row as an ``(n, 2)`` array.

        A plain ndarray is used rather than ``numpy.matrix``: NumPy no
        longer recommends the matrix class, and current scikit-learn
        rejects it as estimator input.
        """
        pairs = [(float(row_details[2]), float(row_details[3]))
                 for row_details in data]
        # reshape keeps the result two-dimensional even when data is empty.
        return numpy.array(pairs, dtype=float).reshape(-1, 2)

    def ridge_regression_model(self, data_fit, data_test):
        """Train a ridge regression, print the training MSE, write test predictions.

        The model is fitted on *data_fit*. Its training-set predictions
        are clipped with normalize_predictions before the mean squared
        error is printed. Predictions for *data_test* are clipped the
        same way, so the written values match what the printed error
        describes, and are then written by get_predictions.

        Returns:
            The fitted ridge regression model.
        """
        x, y = self.get_xy(data_fit)
        model = linear_model.Ridge().fit(x, y)

        train_predictions = self.normalize_predictions(model.predict(x))
        print(mean_squared_error(y, train_predictions))

        test_predictions = self.normalize_predictions(
            model.predict(self.get_test_x(data_test)))
        self.get_predictions(data_test, test_predictions)
        return model

    def get_predictions(self, data_test, predicted_values,
                        output_path='result.csv'):
        """Write one ``id,prediction,`` line per test row to *output_path*.

        The line format (including the trailing comma and the absence
        of a header) is kept exactly as it was, so existing consumers of
        the file are unaffected.

        Raises:
            IndexError: if *predicted_values* is shorter than *data_test*.
        """
        with open(output_path, 'w') as f_res:
            for index, row in enumerate(data_test):
                f_res.write('{},{},\n'.format(row[0], predicted_values[index]))

    def normalize_predictions(self, predictions, lower=1.0, upper=4.0):
        """Clip predictions into the closed range ``[lower, upper]``.

        Values above *upper* become *upper*, values below *lower* become
        *lower*, everything else is unchanged. The defaults reproduce
        the original fixed range of 1 to 4.

        Returns:
            A float ndarray with the same length as *predictions*.
        """
        return numpy.clip(
            numpy.asarray(predictions, dtype=float), lower, upper)
def main():
    """Build features for both CSV files, then train and write predictions.

    Reads 'train.csv' and 'test.csv' from the current working directory.
    """
    featurizer = TfidfModel()
    train_rows = featurizer.calc_tfidf(filename='train.csv')
    test_rows = featurizer.calc_tfidf(filename='test.csv')

    PredictionModel().ridge_regression_model(train_rows, test_rows)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement