import csv
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.decomposition import PCA
import math

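# Overview (a summary of the code below): the script builds an "interpretable"
# classifier in three steps -- (1) project the features onto 9 principal
# components, (2) for each component pick an upper and a lower cutoff by
# minimizing an entropy-style criterion over a 100-point grid, (3) fit a plain
# logistic regression on the resulting 18 binary indicators. The model is then
# evaluated with stratified 5-fold cross-validation and a held-out 20% split.
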
class InterpretableModel:
    """Logistic regression over dichotomized principal components."""

    NUM_COMPONENTS = 9

    def __init__(self):
        pass

    def _optimal_cutoff(self, values, y, compare):
        """Grid-search a threshold on `values` that minimizes the entropy
        criterion below. `compare` is np.greater_equal (predict 1 at or above
        the cutoff) or np.less_equal (predict 1 at or below it)."""
        data_size = len(values)
        # split the component's value range into 100 candidate cutoffs
        grid = np.linspace(np.min(values), np.max(values), 100, endpoint=False)
        min_entropy = None
        optimal_cutoff = None
        for cutoff in grid:
            y_pred = np.where(compare(values, cutoff), 1, 0)
            cm = confusion_matrix(y, y_pred)
            N0 = cm[0, 0] + cm[1, 0]  # number of predicted "0"
            N1 = cm[0, 1] + cm[1, 1]  # number of predicted "1"
            if N1 == 0 or N0 == 0:
                continue
            p0 = cm[0, 0] / N0  # precision within the predicted-"0" group
            p1 = cm[1, 1] / N1  # precision within the predicted-"1" group
            if p0 == 0 or p1 == 0:
                continue
            entropy = (-(N1 / data_size) * math.log(p1)
                       - (N0 / data_size) * math.log(p0)
                       + (N1 / data_size) * math.log(N1 / data_size)
                       + (N0 / data_size) * math.log(N0 / data_size))
            if min_entropy is None or entropy < min_entropy:
                min_entropy = entropy
                optimal_cutoff = cutoff
        return optimal_cutoff

    def fit(self, x, y):
        num_components = self.NUM_COMPONENTS
        pca = PCA(n_components=num_components)
        pca.fit(x)
        print(pca.components_)
        print(pca.explained_variance_ratio_)
        components = pca.transform(x)
        data_size = x.shape[0]

        # one upper ("predict 1 if >=") and one lower ("predict 1 if <=")
        # cutoff per principal component
        cutoffs1 = np.zeros(num_components)
        cutoffs2 = np.zeros(num_components)

        # find the cutoffs for each principal component
        for k in range(num_components):
            cutoffs1[k] = self._optimal_cutoff(components[:, k], y, np.greater_equal)
            print("k =", k, " upper cutoff =", cutoffs1[k])
            cutoffs2[k] = self._optimal_cutoff(components[:, k], y, np.less_equal)
            print("k =", k, " lower cutoff =", cutoffs2[k])

        # fit a logistic regression on the dichotomized principal components
        bin_components = np.zeros((data_size, 2 * num_components))
        for i in range(data_size):
            for k in range(num_components):
                bin_components[i, k] = 1 if components[i, k] >= cutoffs1[k] else 0
                bin_components[i, num_components + k] = 1 if components[i, k] <= cutoffs2[k] else 0

        log_reg = LogisticRegression()
        log_reg.fit(bin_components, y)

        self.pca = pca
        self.cutoffs_upper = cutoffs1
        self.cutoffs_lower = cutoffs2
        self.log_reg = log_reg

    def predict_proba(self, x):
        components = self.pca.transform(x)
        data_size = x.shape[0]
        num_components = self.NUM_COMPONENTS

        # binarize the components with the cutoffs learned in fit()
        bin_components = np.zeros((data_size, 2 * num_components))
        for i in range(data_size):
            for k in range(num_components):
                bin_components[i, k] = 1 if components[i, k] >= self.cutoffs_upper[k] else 0
                bin_components[i, num_components + k] = 1 if components[i, k] <= self.cutoffs_lower[k] else 0

        return self.log_reg.predict_proba(bin_components)

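# Note on the criterion minimized by _optimal_cutoff: with N rows, N0/N1 the
# counts predicted in each group and p0/p1 the within-group precisions,
#
#     H = -(N1/N)*log(p1) - (N0/N)*log(p0)
#         + (N1/N)*log(N1/N) + (N0/N)*log(N0/N)
#
# The first two terms reward cutoffs whose groups are pure (p0, p1 -> 1); the
# last two equal minus the entropy of the predicted labels, so minimizing H
# also favors balanced splits over degenerate ones. Worked example (numbers
# are illustrative, not from the data): cm = [[40, 10], [5, 45]] on N = 100
# gives N0 = 45, N1 = 55, p0 = 40/45, p1 = 45/55, and H ≈ -0.525.
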
def read_data(file_name, x_num, y_num):
    with open(file_name) as fp:
        reader = csv.reader(fp, delimiter=";")
        next(reader, None)  # skip the header row
        rows = list(reader)
    data = []
    for i, row in enumerate(rows):
        try:
            values = list(map(float, row))
            if len(values) != x_num + y_num:
                raise ValueError(f"expected {x_num + y_num} columns, got {len(values)}")
            # keep only fully parsed rows; empty placeholders for bad rows
            # would break the array construction below
            data.append(values)
        except ValueError as e:
            print(f"Error processing row {i + 1}: {e}")
    data_x = np.array([row[:x_num] for row in data])
    data_y = np.array([row[x_num:x_num + y_num] for row in data])
    return data_x, data_y

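# Assumed input layout (inferred from x_num/y_num below, not from a spec):
# 'data0.csv' is ';'-separated with one header row, and every data row holds
# 29 numeric feature values followed by 3 numeric target values.
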
file_name = 'data0.csv'  # replace with the path to your data file
x_num = 29  # number of feature columns
y_num = 3   # number of target columns
data_x, data_y = read_data(file_name, x_num, y_num)

def k_fold_cross_validation(data_x, data_y, k=5):
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    accuracy_scores = []
    sensitivity_scores = []
    specificity_scores = []
    auc_scores = []

    # binarize the targets: class 1 if the mean of the target columns is >= 75
    data_z = np.where(np.mean(data_y, axis=1) >= 75, 1, 0)

    for i, (train_indices, test_indices) in enumerate(skf.split(data_x, data_z)):
        train_x, test_x = data_x[train_indices], data_x[test_indices]
        train_z, test_z = data_z[train_indices], data_z[test_indices]

        # model = LogisticRegression()
        model = InterpretableModel()
        model.fit(train_x, train_z)  # train the model

        predictions = model.predict_proba(test_x)[:, 1]
        accuracy = accuracy_score(test_z, predictions > 0.5)
        accuracy_scores.append(accuracy)

        cm = confusion_matrix(test_z, predictions > 0.5)
        sensitivity = cm[1, 1] / (cm[1, 0] + cm[1, 1])
        specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
        sensitivity_scores.append(sensitivity)
        specificity_scores.append(specificity)

        # compute the AUC for this fold
        auc = roc_auc_score(test_z, predictions)
        auc_scores.append(auc)

    return accuracy_scores, sensitivity_scores, specificity_scores, auc_scores

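# A quick worked example of the label rule used above and in final_test
# (values are illustrative): targets (80, 70, 76) have mean 75.33 -> class 1,
# while (80, 70, 74) have mean 74.67 -> class 0. The threshold 75 comes from
# the original code; its domain meaning is not documented in this paste.
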
def final_test(data_x, data_y):
    data_z = np.where(np.mean(data_y, axis=1) >= 75, 1, 0)

    train_x, test_x, train_z, test_z = train_test_split(data_x, data_z, test_size=0.2, random_state=42)

    # model = LogisticRegression()
    model = InterpretableModel()
    model.fit(train_x, train_z)  # train the model

    predictions = model.predict_proba(test_x)[:, 1]
    accuracy = accuracy_score(test_z, predictions > 0.5)

    print(test_z)
    print(predictions)

    # compute the AUC on the held-out test set
    final_auc = roc_auc_score(test_z, predictions)

    return accuracy, final_auc

# report the cross-validation statistics
accuracy_scores, sensitivity_scores, specificity_scores, auc_scores = k_fold_cross_validation(data_x, data_y)
print(f'Cross-Validation Accuracy: {np.mean(accuracy_scores):.2%} (std: {np.std(accuracy_scores):.2%})')
print(f'Cross-Validation Sensitivity: {np.mean(sensitivity_scores):.2%}')
print(f'Cross-Validation Specificity: {np.mean(specificity_scores):.2%}')
print(f'Cross-Validation AUC: {np.mean(auc_scores):.2%} (std: {np.std(auc_scores):.2%})')

# final hold-out test
final_accuracy, final_auc = final_test(data_x, data_y)
print(f'Final Test Accuracy: {final_accuracy:.2%}')
print(f'Final Test AUC: {final_auc:.2%}')


# standalone re-run of the hold-out evaluation (same split as final_test)
model = InterpretableModel()
# model = LogisticRegression()
data_z = np.where(np.mean(data_y, axis=1) >= 75, 1, 0)

print(data_z)

train_x, test_x, train_z, test_z = train_test_split(data_x, data_z, test_size=0.2, random_state=42)

model.fit(train_x, train_z)
pred = model.predict_proba(test_x)[:, 1]
auc = roc_auc_score(test_z, pred)
print(auc)
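
# A minimal sketch of how one might read off the fitted model in its
# interpretable form: one logistic regression weight per dichotomized
# principal component (using the attributes set by InterpretableModel.fit).
n = InterpretableModel.NUM_COMPONENTS
for k in range(n):
    w_hi = model.log_reg.coef_[0][k]      # weight of the "PC_k >= upper cutoff" indicator
    w_lo = model.log_reg.coef_[0][n + k]  # weight of the "PC_k <= lower cutoff" indicator
    print(f"PC{k}: {w_hi:+.3f} if >= {model.cutoffs_upper[k]:.3f}, "
          f"{w_lo:+.3f} if <= {model.cutoffs_lower[k]:.3f}")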