# Untitled

import csv
import os

import numpy
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import linear_kernel

class TfidfModel:
    """Turn rows of the train/test CSV files into TF-IDF similarity features.

    For every CSV row, the first text column (treated as the query) is
    compared with the second and third text columns (title and description)
    and the two cosine similarities are returned as features.
    """

    # Base names of the two supported input files, and how many CSV columns
    # after the id column each layout contributes to a row's details.
    TRAIN_FILE = 'train.csv'
    TEST_FILE = 'test.csv'
    _DETAIL_COLUMNS = {TRAIN_FILE: 5, TEST_FILE: 3}

    def _is_train_file(self, filename):
        """Return True when the base name of *filename* is the train file."""
        return os.path.basename(filename) == self.TRAIN_FILE

    def read_file(self, filename):
        """Read a train/test CSV file into a list of ``(id, details)`` pairs.

        The base name of *filename* selects the layout, so the file may live
        in any directory: ``train.csv`` rows contribute CSV columns 1-5 and
        ``test.csv`` rows contribute columns 1-3. Column 0 is the row id.
        Rows whose first cell is ``'id'`` (the header) and blank rows are
        skipped. Any other file name yields an empty list.

        Raises:
            OSError: if the file cannot be opened.
            IndexError: if a data row has fewer columns than the layout needs.
        """
        detail_columns = self._DETAIL_COLUMNS.get(os.path.basename(filename))
        data = []
        with open(filename, newline='') as csvfile:
            if detail_columns is None:
                # Unknown layout: nothing can be extracted from this file.
                return data
            for row in csv.reader(csvfile, delimiter=','):
                # csv.reader yields [] for blank lines; skip those and the
                # header row.
                if not row or row[0] == 'id':
                    continue
                # Index explicitly (rather than slice) so that a truncated
                # row fails loudly here instead of further downstream.
                row_details = tuple(
                    row[i] for i in range(1, detail_columns + 1))
                data.append((row[0], row_details))
        return data

    def make_corpus_for_qeury(self, row_details):
        """Return the three documents to vectorize for one row.

        The result is ``[row_details[0], row_details[1], description]`` where
        ``description`` is ``row_details[2]``, replaced by an empty string
        when it contains ``'<'`` (presumably to discard HTML markup).
        """
        description = row_details[2]
        if '<' in description:
            description = ''
        return [row_details[0], row_details[1], description]

    def calc_tfidf_for_row(self, row_details):
        """Return the query/title and query/description similarities.

        A fresh TF-IDF vectorizer (English stop words removed) is fitted on
        the row's three documents. TfidfVectorizer L2-normalises each row by
        default, so the linear kernel against document 0 is the cosine
        similarity.

        Returns:
            A ``(title_similarity, description_similarity)`` pair. Both are
            ``0.0`` when no document contains a usable term.
        """
        tfidf = TfidfVectorizer(stop_words='english')
        corpus = self.make_corpus_for_qeury(row_details)
        try:
            tfidf_matrix = tfidf.fit_transform(corpus)
        except ValueError:
            # scikit-learn raises ValueError ("empty vocabulary") when every
            # document is empty or consists only of stop words. Such a row
            # has nothing in common with anything, so report zero similarity
            # instead of aborting the whole run.
            return 0.0, 0.0
        cosine_similarities = linear_kernel(
            tfidf_matrix[0:1], tfidf_matrix).flatten()
        # Index 0 is the query compared with itself; 1 and 2 are the title
        # and the description.
        return cosine_similarities[1], cosine_similarities[2]

    def calc_tfidf(self, filename):
        """Compute similarity features for every row of *filename*.

        Returns:
            A list with one list per CSV row, laid out as
            ``[id, details[3], title_similarity, description_similarity,
            details[4], details[0], details[1], details[2]]``.
            ``details[3]`` and ``details[4]`` only exist in the train layout;
            for any other file both positions hold the string ``'0'``.
        """
        is_train = self._is_train_file(filename)
        result_data = []
        for row_id, row_details in self.read_file(filename):
            tfidf_title, tfidf_descr = self.calc_tfidf_for_row(row_details)
            result_data.append([
                row_id,
                row_details[3] if is_train else '0',
                tfidf_title,
                tfidf_descr,
                row_details[4] if is_train else '0',
                row_details[0],
                row_details[1],
                row_details[2],
            ])
        return result_data

class PredictionModel:
    """Ridge regression over the two TF-IDF similarity features.

    Every input row is a sequence in which index 0 is the row id, index 1 is
    the integer target, and indices 2 and 3 are the two similarity features.
    """

    def _features(self, rows):
        """Return an ``(n_rows, 2)`` float array of the features of *rows*."""
        features = [(float(row[2]), float(row[3])) for row in rows]
        # A plain ndarray is used because scikit-learn estimators reject
        # numpy.matrix input. The reshape keeps the result two-dimensional
        # even when there are no rows.
        return numpy.array(features, dtype=float).reshape(-1, 2)

    def get_xy(self, data):
        """Split training rows into a feature array and a target array.

        Returns:
            ``(x, y)`` where ``x`` has shape ``(n_rows, 2)`` and ``y`` holds
            ``int(row[1])`` for every row.
        """
        rows = list(data)  # materialise once: the rows are traversed twice
        y = numpy.array([int(row[1]) for row in rows])
        return self._features(rows), y

    def get_test_x(self, data):
        """Return the ``(n_rows, 2)`` feature array for rows without targets."""
        return self._features(data)

    def ridge_regression_model(self, data_fit, data_test):
        """Fit a Ridge model, print the training MSE and write predictions.

        The model is fitted on *data_fit*. The mean squared error of its
        (clipped) predictions on that same data is printed, and the
        predictions for *data_test* are written through ``get_predictions``.

        Returns:
            The fitted ``Ridge`` estimator.
        """
        x, y = self.get_xy(data_fit)
        model = linear_model.Ridge().fit(x, y)
        train_predictions = self.normalize_predictions(model.predict(x))
        mse = mean_squared_error(y, train_predictions)
        print(mse)
        # NOTE(review): the test predictions are written unclipped although
        # the MSE above is computed on clipped values -- confirm whether the
        # output file should be clipped to [1, 4] as well.
        predicted_values = model.predict(self.get_test_x(data_test))
        self.get_predictions(data_test, predicted_values)
        return model

    def get_predictions(self, data_test, predicted_values):
        """Write one ``id,prediction,`` line per test row to ``result.csv``.

        The file is created in the current working directory and overwritten
        if it already exists. Each line ends with a trailing comma.
        """
        with open('result.csv', 'w') as f_res:
            for index, row in enumerate(data_test):
                f_res.write('{},{},\n'.format(row[0], predicted_values[index]))

    def normalize_predictions(self, predictions):
        """Clip every prediction into the closed interval [1, 4].

        Returns:
            A float numpy array with the same length as *predictions*.
        """
        return numpy.clip(numpy.asarray(predictions, dtype=float), 1.0, 4.0)


if __name__ == '__main__':
    # Feature extraction and regression live in separate helper objects.
    tfidf_model = TfidfModel()
    predict_model = PredictionModel()

    # Build the feature rows for the training file and for the test file.
    data_for_model = tfidf_model.calc_tfidf('train.csv')
    test_data = tfidf_model.calc_tfidf('test.csv')

    # Fit on the training rows; this prints the training MSE and writes the
    # test predictions to result.csv.
    regression_model = predict_model.ridge_regression_model(
        data_for_model,
        test_data,
    )