Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_absolute_percentage_error
- import numpy as np
- import torch
- import torch.nn as nn
- import torch.optim as optim
- from keras.preprocessing.text import Tokenizer
- from keras.preprocessing.sequence import pad_sequences
- from IPython.display import clear_output
# Partition the labelled data 60/20/20 into train / validation / test,
# then pull out the description text as features and the observed sale
# price as the regression target for each split.
# NOTE(review): `train_valid_test_split` and `label_df` are defined outside
# this file (likely fast_ml.model_development) — confirm at the import site.
train_df, valid_df, test_df = train_valid_test_split(
    label_df,
    target='PRECO_VENDA_OBSERVADO',
    train_size=0.6,
    valid_size=0.2,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Feature / label arrays per split.
X_train = train_df['DESCRICAO_TEXTO'].values
y_train = train_df['PRECO_VENDA_OBSERVADO'].astype(float).values

X_valid = valid_df['DESCRICAO_TEXTO'].values
y_valid = valid_df['PRECO_VENDA_OBSERVADO'].astype(float).values

X_test = test_df['DESCRICAO_TEXTO'].values
y_test = test_df['PRECO_VENDA_OBSERVADO'].astype(float).values
# Tokenization
# -----------------------------------
# Fit the vocabulary on the TRAINING texts only (no leakage from the
# validation/test splits), then convert every split to integer sequences
# padded to one common length.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Train
train_sequences = tokenizer.texts_to_sequences(X_train)
# FIX: a single padding length derived from the training data is used for
# all three splits. The original code padded the test split to its OWN max
# length (and the validation split to the training max via a re-computed
# expression), so the splits did not share a consistent input length.
max_sequence_length = max(len(seq) for seq in train_sequences)
X_train_padded_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length)

# Validation — padded/truncated to the training length.
valid_sequences = tokenizer.texts_to_sequences(X_valid)
X_valid_padded_sequences = pad_sequences(valid_sequences, maxlen=max_sequence_length)

# Test — padded/truncated to the training length.
test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length)
# Deep Learning Network configuration
# =========================================================================
class LSTMModel(nn.Module):
    """Embedding -> single-layer LSTM -> linear head emitting one scalar.

    Regresses a single value (here, a sale price) from a padded batch of
    token-id sequences shaped (batch, seq_len).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Submodule names kept identical to the original so any saved
        # state_dict remains loadable.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Only the final time step feeds the regression head.
        last_step = lstm_out[:, -1, :]
        return self.linear(last_step)
# Model, loss, and optimizer
# -------------------------------------------------------------------------
# +1 on the vocabulary size: Keras' Tokenizer numbers words from 1, so
# index 0 (padding) needs its own embedding row.
model = LSTMModel(vocab_size=len(word_index) + 1, embedding_dim=100, hidden_dim=128)

# Targets as float32 tensors, matching the model's output dtype for MSE.
y_train = torch.tensor(y_train, dtype=torch.float32)
y_valid = torch.tensor(y_valid, dtype=torch.float32)

# Plain regression setup: mean-squared-error loss with an Adam optimizer.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# One (sequence, price) pair per batch.
# NOTE(review): batch_size=1 makes training slow and very noisy — consider
# a larger batch if memory allows.
batch_size = 1
train_data = list(zip(X_train_padded_sequences, y_train))
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
# TRAIN
# ========================================================================
# Per-epoch MAPE history, used for the learning-curve plots after training.
mapes_train_list = []
mapes_valid_list = []

# FIX: hoisted out of the epoch loop — these tensors are loop-invariant,
# and the original rebuilt them from the numpy arrays on every one of the
# 1000 epochs.
X_train_tensor = torch.tensor(X_train_padded_sequences, dtype=torch.long)
X_valid_tensor = torch.tensor(X_valid_padded_sequences, dtype=torch.long)

for epoch in range(1000):
    total_loss = 0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_x)
        # view(-1, 1) aligns the target with the model's (batch, 1) output.
        loss = criterion(output, batch_y.view(-1, 1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Epoch-level MAPE on the full train and validation sets; no gradients
    # are needed for evaluation.
    with torch.no_grad():
        y_train_hat = model(X_train_tensor)
        mape_train = mean_absolute_percentage_error(y_train, y_train_hat.view(-1))
        y_valid_hat = model(X_valid_tensor)
        mape_valid = mean_absolute_percentage_error(y_valid, y_valid_hat.view(-1))

    # Progress report for the epoch (summed loss, not averaged per batch).
    print(f"Epoch {epoch + 1}, Loss: {total_loss}, MAPE_VALID: {mape_valid.item()} , MAPE_TRAIN: {mape_train.item()} ")

    mapes_train_list.append(mape_train)
    mapes_valid_list.append(mape_valid)
# TEST EVALUATION & PLOTS
# ===================================================================
# FIX: inference wrapped in no_grad — the original tracked gradients here,
# needlessly building an autograd graph over the whole test set.
with torch.no_grad():
    y_test_hat = model(torch.tensor(X_test_padded_sequences, dtype=torch.long))
mape_test = mean_absolute_percentage_error(y_test, y_test_hat.view(-1))
print(mape_test)

# Learning curves: per-epoch train vs. validation MAPE.
# NOTE(review): `plt` is never imported anywhere in this file — add
# `import matplotlib.pyplot as plt` at the top or these calls will raise
# NameError outside a notebook session that pre-imported it.
plt.plot(mapes_train_list)
plt.plot(mapes_valid_list)
plt.ylabel('data')    # set the label for y axis
plt.xlabel('index')   # set the label for x-axis
plt.title("Plotting a list")    # set the title of the graph
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement