Guest User

Python Machine Learning

a guest
Jun 16th, 2023
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 19.36 KB | Source Code | 0 0
  1. print("Loading external modules...")
  2. print("Loading pytorch...")
  3. import torch
  4. import torch.nn as nn
  5. import torch.optim as optim
  6. from torch.utils.data import Dataset, DataLoader
  7. print("Finished loading pytorch!")
  8.  
  9. print("Loading sklearn / scikit-learn...")
  10. from sklearn.feature_extraction.text import TfidfVectorizer
  11. from sklearn.preprocessing import LabelEncoder
  12. from sklearn.model_selection import train_test_split
  13. from sklearn.preprocessing import MultiLabelBinarizer
  14. print("Finished loading sklearn / scikit-learn...")
  15.  
  16. print("Loading other modules...")
  17. import numpy as np
  18. import matplotlib.pyplot as plt
  19. import random
  20. print("Finished loading other modules")
  21. print("Finished loading all external modules!")
  22.  
  23. print("Loading internal modules...")
  24. import ml.dataset
  25. import ml.config
  26. import ml.utils
  27. import ml.devices
  28. print("Finished loaded internal modules!")
  29.  
  30.  
  31. gpu_available = ml.devices.is_cuda_available()
  32.  
  33. # Check if pytorch is compiled with CUDA enabled
  34. try:
  35.     torch.cuda.current_device() # Try to get current CUDA device, throws AssertionError if not
  36.     torch_cuda_compiled = True
  37.     print("[DEVICE] PyTorch compiled with CUDA enabled")
  38. except:
  39.     torch_cuda_compiled = False
  40.     gpu_available = False
  41.     print("[DEVICE] PyTorch not compiled with CUDA enabled")
  42.  
  43.  
  44. # Check for devices
  45. if (gpu_available and ml.config.use_gpu): # Prefer GPU and GPU is availbale
  46.     print("[GPU] CUDA Devices available:", ml.devices.get_device_count())
  47.     print("[GPU] Current CUDA Device:", ml.devices.get_device_name(ml.devices.get_current_device()))
  48.     device = torch.device("cuda")
  49. elif (gpu_available == False and ml.config.use_gpu): # Prefer GPU / CUDA but not available, using CPU
  50.     if (torch_cuda_compiled):
  51.         print("[GPU] Failed to use GPU, CUDA device not found")
  52.     else:
  53.         print("[GPU] Failed to use GPU, PyTorch not compiled with CUDA enabled")
  54.     print("[CPU] Defaulting to CPU")
  55.     device = torch.device("cpu")
  56. else:
  57.     print("[GPU] CUDA Device found but not used, use GPU is false")
  58.     print("[CPU] Defaulting to CPU")
  59.     device = torch.device("cpu")
  60.  
  61. device_type = ""
  62. if (device.type == "cpu"):
  63.     device_type = "CPU"
  64. elif (device.type == "cuda"):
  65.     device_type = "GPU"
  66. else:
  67.     device_type = "UNKNOWN"
  68.  
  69. print("[DEVICE] Using device:", device_type)
  70.  
  71.  
  72.  
  73. # Original training data
  74. inputs = ml.dataset.multi_texts
  75. labels = ml.dataset.multi_labels
  76.  
  77.  
  78. ml.config.random_split_seed = random.randint(0,9999) # Generate random split seed
  79.  
  80. # Split original inputs into training and temporary testing sets
  81. inputs_text_train, inputs_text_temp_test, labels_train, labels_temp_test = train_test_split(inputs, labels, test_size=ml.config.test_split_ratio, random_state=ml.config.random_split_seed)
  82.  
  83. # Split the temporary test set into validation and test sets
  84. inputs_text_val, inputs_text_test, labels_val, labels_test = train_test_split(inputs_text_temp_test, labels_temp_test, test_size=ml.config.validation_split_ratio, random_state=ml.config.random_split_seed)
  85.  
  86. # Convert texts to TF-IDF vectors
  87. vectorizer = TfidfVectorizer()
  88. X_train = vectorizer.fit_transform(inputs_text_train)
  89. X_test = vectorizer.transform(inputs_text_test)
  90. X_val = vectorizer.transform(inputs_text_val)
  91.  
  92. # Labels are already in a form that resembles one-hot encoding, so no need to transform them
  93. y_train = labels_train
  94. y_val = labels_val
  95. y_test = labels_test
  96.  
  97. mlb = MultiLabelBinarizer()
  98. mlb.fit(labels)
  99.  
  100. # One-hot encoding
  101. """
  102. print("Converting labels to one-hot encodings...")
  103. mlb = MultiLabelBinarizer()
  104. y = mlb.fit_transform(labels)
  105.  
  106. # Split original inputs into training and temporary testing sets
  107. inputs_text_train, inputs_text_temp_test, labels_train, labels_temp_test = train_test_split(inputs, labels, test_size=ml.config.test_split_ratio, random_state=ml.config.random_split_seed)
  108.  
  109. # Split the temporary test set into validation and test sets
  110. inputs_text_val, inputs_text_test, labels_val, labels_test = train_test_split(inputs_text_temp_test, labels_temp_test, test_size=ml.config.validation_split_ratio, random_state=ml.config.random_split_seed)
  111.  
  112.  
  113. # Split original inputs into training and testing sets
  114. #inputs_text_train, inputs_text_test = train_test_split(inputs, test_size=ml.config.test_split_ratio, random_state=ml.config.random_split_seed)
  115.  
  116.  
  117. # Convert texts to TF-IDF vectors
  118. vectorizer = TfidfVectorizer()
  119. X_train = vectorizer.fit_transform(inputs_text_train)
  120. X_test = vectorizer.transform(inputs_text_test)
  121. X_val = vectorizer.transform(inputs_text_val)
  122.  
  123.  
  124. # Convert labels to integers
  125. if (ml.config.enable_timing): start = ml.utils.timer()
  126. print("Converting labels to integers...")
  127. #le = LabelEncoder()
  128. #y_train = le.fit_transform(labels_train)
  129. #y_test = le.transform(labels_test)
  130. #y_val = le.transform(labels_val)
  131. y_train = mlb.fit_transform(labels_train)
  132. y_test = mlb.transform(labels_test)
  133. y_val = mlb.transform(labels_val)
  134. print("Finished converting labels to integers!")
  135. if (ml.config.enable_timing): end = ml.utils.timer()
  136. if (ml.config.enable_timing): print(f"Converting labels took {ml.utils.elapsed_time(start, end)} seconds")
  137. """
  138.  
  139. # Unique labels is the labels list without duplicates
  140. # Unique labels holds 1 of each possible output / label
  141. # Unique int labels holds the index or id of the label
  142. # Number of unique labels is the length of labels list without duplicates
  143. #unique_labels = le.classes_
  144. #unique_labels = mlb.classes_
  145.  
  146.  
  147. unique_int_labels = ml.utils.array_to_list(np.unique(y_train,), "%i")
  148. num_unique_labels = len(np.unique(y_train))
  149.  
  150. label_counts = {}
  151.  
  152. if (ml.config.display_train_labels or ml.config.display_train_label_counts):
  153.     if (ml.config.display_train_labels): print("\nLabels:")
  154.     for l_idx in range(len(y_train)):
  155.         for i in range(len(labels[l_idx])):
  156.             if labels[l_idx][i] not in label_counts:
  157.                 label_counts[labels[l_idx][i]] = 0
  158.             label_counts[labels[l_idx][i]] += 1
  159.  
  160.  
  161.         if (ml.config.display_train_labels): print(f"    {inputs[l_idx]}: {str(labels[l_idx])}")
  162.    
  163.     if (ml.config.display_train_label_counts):
  164.         for key, value in label_counts.items():
  165.             print("Label:", key, "- Count:", value)
  166.  
  167. if (ml.config.display_train_unique_labels):
  168.     print("\nUnique Labels:")
  169.     for u_l_idx in range(num_unique_labels):
  170.         break
  171.         #print("    "+unique_labels[u_l_idx]+":"+str(unique_int_labels[u_l_idx]))
  172.     print()
  173.  
  174.  
  175. # Split data into training and testing sets
  176. #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ml.config.test_split_ratio, random_state=ml.config.random_split_seed) # X: inputs, y: labels
  177.  
  178.  
  179. # Define the Dataset
  180. class TextDataset(Dataset):
  181.     def __init__(self, X, y):
  182.         self.X = torch.from_numpy(X.toarray()).float()  # Convert to PyTorch tensor
  183.         self.y = torch.from_numpy(np.array(y)).float()  # Labels should be floats
  184.  
  185.     def __len__(self):
  186.         return len(self.y)
  187.  
  188.     def __getitem__(self, idx):
  189.         return self.X[idx], self.y[idx], idx  # Return idx along with the data and labels
  190.  
  191.  
  192. # Create Dataloaders
  193. if (ml.config.enable_timing): start = ml.utils.timer()
  194. print("Creating Dataloaders...")
  195. train_data = TextDataset(X_train, y_train)
  196. test_data = TextDataset(X_test, y_test)
  197. val_data = TextDataset(X_val, y_val)  # Validation data
  198. train_loader = DataLoader(train_data, batch_size=ml.config.train_batch_size, shuffle=ml.config.shuffle_train_data)
  199. test_loader = DataLoader(test_data, batch_size=ml.config.test_batch_size)
  200. val_loader = DataLoader(val_data, batch_size=ml.config.validation_batch_size)  # Validation DataLoader
  201. print("Finished creating Dataloaders!")
  202. if (ml.config.enable_timing): end = ml.utils.timer()
  203. if (ml.config.enable_timing): print(f"Created Dataloaders in {ml.utils.elapsed_time(start, end)} seconds")
  204.  
  205. # Long Short-Term Memory Model (LSTM)
  206. class LSTMClassifier(nn.Module):
  207.     def __init__(self, input_size, hidden_size, output_size, num_layers):
  208.         super(LSTMClassifier, self).__init__()
  209.  
  210.         self.hidden_size = hidden_size
  211.         self.num_layers = num_layers
  212.         self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=False)
  213.         self.fc = nn.Linear(hidden_size, output_size)
  214.  
  215.     def forward(self, x):
  216.         # Initialize hidden state with zeros
  217.         h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
  218.  
  219.         # Initialize cell state
  220.         c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
  221.  
  222.         # Forward propagate LSTM
  223.         out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size)
  224.  
  225.         # Decode the hidden state of the last time step
  226.         out = self.fc(out[:, -1, :])
  227.        
  228.         return out
  229.  
  230. # Multi-class classifier model
  231. class MultiLabelClassifier(nn.Module):
  232.     def __init__(self, input_size, hidden_size, num_classes):
  233.         super(MultiLabelClassifier, self).__init__()
  234.         print("Input size:", input_size)
  235.         print("Hidden size:", hidden_size)
  236.         print("Num classes:", num_classes)
  237.         self.layer1 = nn.Linear(input_size, hidden_size)
  238.         self.layer2 = nn.Linear(hidden_size, num_classes)
  239.        
  240.     def forward(self, x):
  241.         out = self.layer1(x)
  242.         out = nn.functional.relu(out)
  243.         return self.layer2(out)
  244.     def save(self, path):
  245.         torch.save(model.state_dict(), path)
  246.  
  247. # Linear model
  248. class LinearModel(nn.Module):
  249.     def __init__(self, input_dim, output_dim):
  250.         super(LinearModel, self).__init__()
  251.         self.fc1 = nn.Linear(input_dim, ml.config.hidden_units)
  252.         self.fc2 = nn.Linear(ml.config.hidden_units, output_dim)
  253.  
  254.     def forward(self, x):
  255.         x = nn.functional.relu(self.fc1(x))
  256.         x = self.fc2(x)
  257.         return nn.functional.log_softmax(x, dim=1)
  258.  
  259.  
  260. # LINEAR
  261. # Create the model
  262. #model = LinearModel(X_train.shape[1], len(mlb.classes_)).to(device)
  263.  
  264. # Define the loss function and the optimizer
  265. #criterion = nn.CrossEntropyLoss()
  266. #optimizer = optim.SGD(model.parameters(), lr=ml.config.learning_rate)
  267.  
  268. # Multi Label Classifier
  269. #model = MultiLabelClassifier(X_train.shape[1], ml.config.hidden_units, len(mlb.classes_)).to(device)
  270. model = MultiLabelClassifier(X_train.shape[1], ml.config.hidden_units, len(y_train[0])).to(device)
  271. criterion = nn.BCEWithLogitsLoss()
  272. optimizer = torch.optim.SGD(model.parameters(), lr=ml.config.learning_rate)
  273.  
  274.  
  275. # LSTM
  276. #model = LSTMClassifier(X_train.shape[1], ml.config.hidden_units, len(y_train[0]), num_layers=1).to(device)
  277.  
  278.  
  279. # Train the model
  280. if (ml.config.enable_timing): start = ml.utils.timer()
  281. print(f"\nTraining model...")
  282. total_step = len(train_loader)
  283. step_iter = 0
  284. loss_list = [] # List for storing average loss per step
  285. acc_list = [] # List for storing average accuracy per step
  286. epoch_loss_list = []  # List for storing average loss per epoch
  287. epoch_acc_list = []  # List for storing average accuracy per epoch
  288. val_epoch_loss_list = []  # List for storing average loss per epoch for validation data
  289. val_epoch_acc_list = []  # List for storing average accuracy per epoch for validation data
  290. val_loss_list = [] # List for storing loss for validation data
  291. val_acc_list = [] # List for storing accuracy for validation data
  292. epoch_timers = []
  293. for epoch in range(ml.config.num_epochs+1):
  294.     if (ml.config.enable_epoch_average_time): epoch_start = ml.utils.timer()
  295.     epoch_correct = 0  # Total correct predictions in the epoch
  296.     epoch_total = 0  # Total labels processed in the epoch
  297.     for i, data in enumerate(train_loader):
  298.         """
  299.        inputs, labels, indices = data
  300.        inputs = inputs.to(device)
  301.        labels = labels.to(device)
  302.        optimizer.zero_grad()
  303.        outputs = model(inputs)
  304.        loss = criterion(outputs, labels)
  305.        loss_list.append(loss.item())
  306.        loss.backward()
  307.        optimizer.step()
  308.        """
  309.         inputs, labels, indices = data
  310.         inputs = inputs.to(device)
  311.         labels = labels.to(device).float()
  312.  
  313.         # Forward pass
  314.         outputs = model(inputs)
  315.         loss = criterion(outputs, labels)
  316.         loss_list.append(loss.item())
  317.  
  318.         # Backward and optimize
  319.         optimizer.zero_grad()
  320.         loss.backward()
  321.         optimizer.step()
  322.  
  323.  
  324.         # Track the accuracy
  325.         total = labels.size(0)
  326.         _, predicted = torch.max(outputs.data, 1)
  327.  
  328.         # Get predictions: apply the threshold to the outputs
  329.         predicted = (torch.sigmoid(outputs.data) > ml.config.threshold).float()
  330.  
  331.         # Calculate the number of correctly predicted labels
  332.         correct = (predicted == labels).sum().item() / (labels.size(0) * labels.size(1))
  333.         #correct = (predicted == labels).sum().item()
  334.         epoch_correct += correct  # Increment by number of correct predictions
  335.         epoch_total += total  # Increment by number of total predictions
  336.         acc_list.append(correct / total)
  337.        
  338.         if (ml.config.display_step_info):
  339.             print(f'Epoch [{epoch}/{ml.config.num_epochs}] - Step [{i+1}/{total_step}] - Loss: {loss.item():.4f} - Accuracy: {correct}/{total} ({100*correct / total:.2f}%)')
  340.         elif (ml.config.display_train_progress):
  341.             print(f"Training: {100*(epoch)/(ml.config.num_epochs):.2f}%  - Epoch: [{epoch}/{ml.config.num_epochs}] - Step [{step_iter-total_step+1}/{(total_step*ml.config.num_epochs)}]", end="\r")
  342.             step_iter += 1
  343.        
  344.     # Add average loss and accuracy for this epoch to respective lists
  345.     # Calcute mean (average) before resetting accuracy and loss lists
  346.     epoch_loss_list.append(np.mean(loss_list))
  347.     epoch_acc_list.append(np.mean(acc_list))
  348.  
  349.     # Reset loss and accuracy lists
  350.     loss_list = []
  351.     acc_list = []
  352.  
  353.     if (ml.config.display_epoch_info): # Print at end of epoch
  354.         print(f'Epoch {epoch} - Loss: {np.mean(loss_list):.4f} - Accuracy: {epoch_correct}/{epoch_total} ({100 * np.mean(acc_list):.2f}%)')
  355.  
  356.     if (ml.config.enable_epoch_average_time):
  357.         epoch_end = ml.utils.timer()
  358.         epoch_timers.append(ml.utils.elapsed_time(epoch_start, epoch_end))
  359.    
  360.     # Validation
  361.     model.eval()  # Set the model to evaluation mode
  362.     with torch.no_grad():
  363.         val_correct = 0
  364.         val_total = 0
  365.         for data in val_loader:
  366.             inputs, labels, indices = data
  367.             inputs = inputs.to(device)
  368.             labels = labels.to(device).float()
  369.             outputs = model(inputs)
  370.             loss = criterion(outputs, labels)
  371.             val_loss_list.append(loss.item())
  372.  
  373.             # Track the accuracy
  374.             total = labels.size(0)
  375.             _, predicted = torch.max(outputs.data, 1)
  376.             # Get predictions: apply the threshold to the outputs
  377.             predicted = (torch.sigmoid(outputs.data) > ml.config.threshold).float()
  378.  
  379.             # Calculate the number of correctly predicted labels
  380.             correct = (predicted == labels).sum().item() / (labels.size(0) * labels.size(1))
  381.             #correct = (predicted == labels).sum().item()
  382.             val_correct += correct  # Increment by number of correct predictions
  383.             val_total += total  # Increment by number of total predictions
  384.             val_acc_list.append(correct / total)
  385.          
  386.         # Add average loss and accuracy for this epoch to respective lists
  387.         val_epoch_loss_list.append(np.mean(val_loss_list))
  388.         val_epoch_acc_list.append(np.mean(val_acc_list))
  389.         # Reset loss and accuracy lists for the next epoch
  390.         val_loss_list = []
  391.         val_acc_list = []
  392.     model.train()  # Set the model back to training mode
  393.  
  394.  
  395.  
  396. print("\nFinished training model!")
  397. if (ml.config.enable_timing): end = ml.utils.timer()
  398. if (ml.config.enable_timing): print(f"Training model took {ml.utils.elapsed_time(start, end)} seconds")
  399. if (ml.config.enable_epoch_average_time): print(f"Average epoch training time: {ml.utils.list_average(epoch_timers, ml.config.time_average_precision)} seconds")
  400.  
  401. if (ml.config.auto_show_graph):
  402.     ml.utils.show_graph(epoch_loss_list, epoch_acc_list, val_epoch_loss_list, val_epoch_acc_list)
  403.  
  404. # Test the model on testing data
  405. if (ml.config.enable_timing): start = ml.utils.timer()
  406. print(f"\nTesting model on training data...")
  407. model.eval()
  408. with torch.no_grad():
  409.     #test_iter = 0
  410.     for data in test_loader:
  411.         inputs, labels, indices = data  # Update to accept indices
  412.         inputs = inputs.to(device)
  413.         labels = labels.to(device).float()
  414.         outputs = model(inputs)
  415.        
  416.         _, predicted = torch.max(outputs, 1)
  417.  
  418.         #if (ml.config.display_test_info):
  419.             #for p in range(len(predicted)):
  420.                 #original_text = inputs_text_test[indices[p]]  # Access the original text using the index
  421.                 #print(f"Batch: [{p+1}/{ml.config.batch_size}] - Predicted: {le.inverse_transform([predicted[p].cpu()])[0]} - Expected: {le.inverse_transform([labels[p].cpu()])[0]}")
  422.                 #print(f"Batch: [{p+1}/{ml.config.batch_size}] - Predicted: {mlb.inverse_transform([predicted[p].cpu()])[0]} - Expected: {mlb.inverse_transform([labels[p].cpu()])[0]}")
  423.                 #test_iter += 1
  424.  
  425. print("Finished testing model!")
  426. if (ml.config.enable_timing): end = ml.utils.timer()
  427. if (ml.config.enable_timing): print(f"Testing model took {ml.utils.elapsed_time(start, end)} seconds")
  428.  
  429.  
  430. def predict(input_text):
  431.     #with torch.no_grad():
  432.     if (ml.config.enable_timing): start = ml.utils.timer()
  433.     print("Predicting using model...")
  434.     new_text = vectorizer.transform([input_text])
  435.     new_text = torch.tensor(new_text.toarray()).float().to(device)
  436.     output = model(new_text)
  437.     predicted = (torch.sigmoid(outputs) > ml.config.threshold).float()
  438.     print("Finished predicting!")
  439.     if (ml.config.enable_timing): end = ml.utils.timer()
  440.     if (ml.config.enable_timing): print(f"Prediction took {ml.utils.elapsed_time(start, end)} seconds")
  441.     return output
  442.  
  443. # Prediction
  444. while True:
  445.     prompt = input("\nPrompt: ")
  446.     if (prompt.startswith("/")):
  447.         if (prompt == "/exit"):
  448.             print("Exitting...")
  449.             exit()
  450.         elif (prompt == "/device"):
  451.             print(f"Device {device_type}")
  452.         elif (prompt == "/graph"):
  453.             ml.utils.show_graph(epoch_loss_list, epoch_acc_list, val_epoch_loss_list, val_epoch_acc_list)
  454.     else:
  455.         output = predict(prompt)
  456.         #print("Output:", output.detach().numpy())
  457.         #ml.utils.print_formatted_output(output)
  458.         probabilities = ml.utils.output_to_probabilities(output)
  459.  
  460.         probabilities = torch.sigmoid(output)
  461.  
  462.         # Convert probabilities to numpy array for easier manipulation
  463.         probabilities = probabilities.detach().cpu().numpy()
  464.  
  465.         # Apply threshold to get predicted labels
  466.         predicted_labels = (probabilities > ml.config.threshold).astype(int)
  467.  
  468.         print("Predictions:")
  469.         for i in range(len(labels[0])):
  470.             prob = probabilities[0][i]*100
  471.             print(f"{predicted_labels[0][i]} - {ml.utils.round_down(prob, ml.config.probabilities_round_precision)}")
  472.         #sorted_probabilities, formatted_probabilities = ml.utils.sort_probabilities(probabilities, encoder=mlb) # Sorted probabilities and indices
  473.         #print(f"Prediction: {next(iter(formatted_probabilities))} ({round(formatted_probabilities[next(iter(formatted_probabilities))], ml.config.probabilities_round_precision)})") # Print first element in probabilities dict
  474.         #ml.utils.print_formatted_probabilities(formatted_probabilities)
  475.  
  476.  
Tags: python
Advertisement
Add Comment
Please, Sign In to add comment