Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# One-hot encode the categorical features, fit linear-model weights on the
# encoded training data, and compare the resulting quadratic error with the
# ordinal-encoded model on both the test and the training set.
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and 1.4
# removed the old name. Try the current keyword first and fall back so the
# script runs on both old and new installations.
try:
    hot_enc = OneHotEncoder(sparse_output=False)
except TypeError:
    hot_enc = OneHotEncoder(sparse=False)

# Encoded columns removed before fitting, indexed in the encoder's output.
# NOTE(review): these look like the last category column of each feature
# (category counts 5, 7 and 8), presumably dropped to avoid collinearity with
# the intercept -- confirm against the printed hot_enc.categories_.
DROPPED_HOT_COLUMNS = [4, 11, 19]

categorical_train_hot = data_train[:, categorical_features]
hot_enc.fit(categorical_train_hot)
print(hot_enc.categories_)
categorical_train_hot = hot_enc.transform(categorical_train_hot)

# Prepend a column of ones. This shifts every encoded column right by one,
# hence the +1 applied to the dropped indices for the training matrix.
categorical_train_hot = np.column_stack(
    (np.ones(len(categorical_train_hot)), categorical_train_hot)
)
categorical_train_hot = np.delete(
    categorical_train_hot, [index + 1 for index in DROPPED_HOT_COLUMNS], axis=1
)
print("Categorical train shape: " + str(categorical_train_hot.shape))

categorical_weights_hot = calculate_approximate_coefficients(categorical_train_hot, y_v)
print("shape of weights: " + str(categorical_weights_hot.shape))
print("Categorical weights: ", categorical_weights_hot)

# The test matrix gets no column of ones, so the unshifted indices apply.
categorical_test = data_test[:, categorical_features]
categorical_test_hot = np.array(hot_enc.transform(categorical_test))
categorical_test_hot = np.delete(categorical_test_hot, DROPPED_HOT_COLUMNS, axis=1)
print("Categorical test shape: " + str(categorical_test_hot.shape))


def apply_categorical_coefficients_hot(input_matrix):
    """Apply the one-hot categorical weights to ``input_matrix``.

    Thin wrapper that forwards ``input_matrix`` together with
    ``categorical_weights_hot`` to ``apply_coefficients``. Both call sites
    below pass a matrix without the leading column of ones.
    """
    return apply_coefficients(input_matrix, categorical_weights_hot)


categorical_error = calculate_quadratic_error(
    apply_categorical_coefficients_hot, categorical_test_hot, y_test
)
print("Categorical error with hot encoding: " + str(round(categorical_error/10**6, 3)) + " million")
# bruh moment. The error is 1,707 million on the test data :-(
# Without taking the exponent the error is not as large (only 31 million).
# Let's check on the training set.
# Column 0 (the ones column) is removed before applying the coefficients.
categorical_error_train_hot = calculate_quadratic_error(
    apply_categorical_coefficients_hot,
    np.delete(categorical_train_hot, 0, axis=1),
    y_v,
)
categorical_error_train = calculate_quadratic_error(
    apply_categorical_coefficients,
    np.delete(categorical_train, 0, axis=1),
    y_v,
)
print("Error on training data (hot): " + str(round(categorical_error_train_hot/10**6, 3)) + " million")
print("Error on training data (ordinal): " + str(round(categorical_error_train/10**6, 3)) + " million")
print()
# The difference is almost the same. Apparently, when given more variables,
# the model distributes responsibility for the price among them in a very
# strange way, which ultimately leads to a huge error.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement