- #!/usr/bin/env python
- # coding: utf-8
- # In[1]:
- import pandas as pd
- from torch.nn import TransformerEncoder, TransformerEncoderLayer
- from torch.nn import TransformerDecoder, TransformerDecoderLayer
- import torch.nn.functional as F
- import torch
- import torch.nn as nn
- import torch.optim as optim
- import numpy as np
- import math
- import random
- import os
- import re
- from tqdm import tqdm
- from transformers import AutoModel
- from transformers import AutoTokenizer
- from transformers import AdamW, get_linear_schedule_with_warmup
- from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset, Sampler
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- from sklearn.model_selection import KFold
- import time
- # In[2]:
- model_name_ar = 'moha/arabert_c19'
- model_name_en = 'bert-base-uncased'
- batch_size = 32
- n_epochs = 3
- # In[3]:
- tokenizer_ar = AutoTokenizer.from_pretrained(model_name_ar, do_lower_case=True)
- tokenizer_en = AutoTokenizer.from_pretrained(model_name_en, do_lower_case=True)
- # In[4]:
- seed = 99 #Important for reproducing the results
- # In[5]:
- def set_seed():
- """Set seed for reproducibility.
- """
- random.seed(seed)
- np.random.seed(seed)
- torch.manual_seed(seed)
- torch.cuda.manual_seed_all(seed)
- os.environ['PYTHONHASHSEED'] = str(seed)
- set_seed()
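# Optional (not part of the original run): for stricter GPU reproducibility one
# could also pin cuDNN to deterministic kernels, at some speed cost.
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False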
- # In[ ]:
- # In[6]:
- xls = pd.ExcelFile("../input/arab2arabizi/dataset.xlsx")
- dataset = pd.read_excel(xls, "Sheet1")
- known = dataset[dataset.from_source == True]
- dataset = dataset[["arabizi", "arabic", "from_source"]]
- dataset.columns = ["Arabize", "Arabic", "from_source"]
#TODO: drop duplicate Arabic rows from the dataset
- # In[7]:
- known = known[["arabizi", "arabic"]].set_index("arabizi", drop=True).arabic.to_dict()
- known_idx = list(known.keys())
- # In[8]:
- in_max = dataset.apply(lambda x: len(str(x.Arabize)), axis=1).max()
- out_max = dataset.apply(lambda x: len(x.Arabic), axis=1).max() + 2 #Take into account eos and sos
- pad_token = 0
- eos_token = 2
- sos_token = 1
- device = "cuda" if torch.cuda.is_available() else "cpu"
- # In[9]:
- def preprocess(a):
- x = a.copy()
- def filter_letters_arabizi(word):
- word = word.replace("$", "s")
- word = word.replace("å", "a")
- word = word.replace("é", "e")
- word = word.replace("ê", "e")
- word = word.replace("ÿ", "y")
- word = word.replace("ą", "a")
- word = word.replace("ī", "i")
- word = word.replace("\n", "")
- word = word.replace("′", "'")
- return word
- x.Arabize = filter_letters_arabizi(str(x.Arabize))
- x.Arabic = x.Arabic
- return x
- # In[10]:
- dataset[["Arabize","Arabic"]] = dataset[["Arabize","Arabic"]].apply(preprocess, axis=1)
- # In[11]:
- in_tokens = set(" ".join(dataset.Arabize.values.tolist()).lower())
- in_token_to_int = {token: (i+1) for i,token in enumerate(sorted(in_tokens))}
in_token_to_int["<pad>"] = pad_token #Reserve id 0 for padding (character ids above start at 1)
- out_tokens = set(" ".join(dataset.Arabic.values.tolist()))
- out_token_to_int = {token: (i+3) for i,token in enumerate(sorted(out_tokens))}
- out_token_to_int["<pad>"] = pad_token
- out_token_to_int["<sos>"] = sos_token
- out_token_to_int["<eos>"] = eos_token
- # In[12]:
- def tokenize(a):
- x = a.copy()
- x.Arabize = [in_token_to_int[i] for i in x.Arabize.lower()]
- x.Arabic = [sos_token] + [out_token_to_int[i] for i in x.Arabic] + [eos_token]
- x.Arabize = x.Arabize + (in_max - len(x.Arabize)) * [pad_token]
- x.Arabic = x.Arabic + (out_max - len(x.Arabic)) * [pad_token]
- return x
- # In[13]:
- dataset[["Arabize","Arabic"]] = dataset[["Arabize","Arabic"]].apply(tokenize, axis=1)
- validation = dataset.sample(frac=0.1)
- train = dataset.drop(validation.index)
- X_train = train.Arabize
- y_train = train.Arabic
- X_valid = validation.Arabize
- y_valid = validation.Arabic
- # In[14]:
- class PositionalEncoding(nn.Module):
- def __init__(self, d_model, dropout=0.1, max_len=9000):
- super(PositionalEncoding, self).__init__()
- self.dropout = nn.Dropout(p=dropout)
- self.scale = nn.Parameter(torch.ones(1))
- pe = torch.zeros(max_len, d_model)
- position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
- div_term = torch.exp(torch.arange(
- 0, d_model, 2).float() * (-math.log(10000.0) / d_model))
- pe[:, 0::2] = torch.sin(position * div_term)
- pe[:, 1::2] = torch.cos(position * div_term)
- pe = pe.unsqueeze(0).transpose(0, 1)
- self.register_buffer('pe', pe)
- def forward(self, x):
- x = x + self.scale * self.pe[:x.size(0), :]
- return self.dropout(x)
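# The buffer above is the standard sinusoidal encoding,
#   PE(pos, 2i) = sin(pos / 10000^(2i/d_model)),  PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)),
# plus a learnable scalar `scale`. A small, illustrative shape check (inputs are
# expected as (seq_len, batch_size, d_model)):
_pe_demo = PositionalEncoding(128)
assert _pe_demo(torch.zeros(10, 2, 128)).shape == (10, 2, 128)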
- # In[15]:
- class TransformerModel(nn.Module):
- def __init__(self, intoken, outtoken ,hidden, enc_layers=1, dec_layers=1, dropout=0.15, nheads=4):
- super(TransformerModel, self).__init__()
- ff_model = hidden*4
- self.encoder = nn.Embedding(intoken, hidden)
- self.pos_encoder = PositionalEncoding(hidden, dropout)
- self.decoder = nn.Embedding(outtoken, hidden)
- self.pos_decoder = PositionalEncoding(hidden, dropout)
- encoder_layers = TransformerEncoderLayer(d_model=hidden, nhead = nheads, dim_feedforward = ff_model, dropout=dropout, activation='relu')
- self.transformer_encoder = TransformerEncoder(encoder_layers, enc_layers)
- encoder_layers = TransformerDecoderLayer(hidden, nheads, ff_model, dropout, activation='relu')
- self.transformer_decoder = TransformerDecoder(encoder_layers, dec_layers)
- self.fc_out = nn.Linear(hidden, outtoken)
- self.src_mask = None
- self.trg_mask = None
- self.memory_mask = None
- def generate_square_subsequent_mask(self, sz, sz1=None):
if sz1 is None:
- mask = torch.triu(torch.ones(sz, sz), 1)
- else:
- mask = torch.triu(torch.ones(sz, sz1), 1)
- return mask.masked_fill(mask==1, float('-inf'))
- def make_len_mask_enc(self, inp):
- return (inp == pad_token).transpose(0, 1) #(batch_size, output_seq_len)
- def make_len_mask_dec(self, inp):
- return (inp == pad_token).transpose(0, 1) #(batch_size, input_seq_len)
- def forward(self, src, trg): #SRC: (seq_len, batch_size)
- if self.trg_mask is None or self.trg_mask.size(0) != len(trg):
- self.trg_mask = self.generate_square_subsequent_mask(len(trg)).to(trg.device)
- #Adding padding mask
- src_pad_mask = self.make_len_mask_enc(src)
- trg_pad_mask = self.make_len_mask_dec(trg)
- #Add embeddings Encoder
- src = self.encoder(src) #Embedding, (seq_len, batch_size, d_model)
- src = self.pos_encoder(src) #Pos embedding
- #Add embedding decoder
- trg = self.decoder(trg) #(seq_len, batch_size, d_model)
- trg = self.pos_decoder(trg)
- memory = self.transformer_encoder(src, None, src_pad_mask)
- output = self.transformer_decoder(tgt = trg, memory = memory, tgt_mask = self.trg_mask, memory_mask = None,
- tgt_key_padding_mask = trg_pad_mask, memory_key_padding_mask = src_pad_mask)
- output = self.fc_out(output)
- return output
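# Illustrative smoke test (not in the original notebook; the sizes are arbitrary):
# a toy batch through an untrained model should yield one row of logits over the
# output vocabulary per target position.
_toy_model = TransformerModel(intoken=40, outtoken=50, hidden=32)
_toy_src = torch.randint(1, 40, (7, 2))  # (src_seq_len, batch_size), ids > 0 so nothing is masked as padding
_toy_trg = torch.randint(3, 50, (5, 2))  # (trg_seq_len, batch_size)
assert _toy_model(_toy_src, _toy_trg).shape == (5, 2, 50)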
- # In[16]:
- len(in_token_to_int)
- # In[17]:
- len(out_token_to_int)
- # In[18]:
- set_seed()
- model = TransformerModel(len(in_token_to_int), len(out_token_to_int), 128).to(device)
- # In[19]:
- class NoamOpt:
- "Optim wrapper that implements rate."
- def __init__(self, model_size, factor, warmup, optimizer):
- self.optimizer = optimizer
- self._step = 0
- self.warmup = warmup
- self.factor = factor
- self.model_size = model_size
- self._rate = 0
- def step(self):
- "Update parameters and rate"
- self._step += 1
- rate = self.rate()
- for p in self.optimizer.param_groups:
- p['lr'] = rate
- self._rate = rate
- self.optimizer.step()
- def rate(self, step = None):
- "Implement `lrate` above"
- if step is None:
- step = self._step
- return self.factor * (self.model_size ** (-0.5) *
- min(step ** (-0.5), step * self.warmup ** (-1.5)))
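# NoamOpt follows the schedule from "Attention Is All You Need":
#   lr(step) = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
# i.e. a linear warm-up for `warmup` steps followed by inverse-square-root decay.
# A quick, illustrative look at the curve used later (d_model=128, factor=1, warmup=4000):
for _s in (100, 4000, 20000):
    print(_s, 128 ** -0.5 * min(_s ** -0.5, _s * 4000 ** -1.5))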
- # In[20]:
- class Arab2ArabizDS(Dataset):
- def __init__(self, data, label):
- self.data = data.values.tolist()
- self.labels = label.values.tolist()
- self.lengths_source = [len(i) for i in data]
- self.lengths_label = [len(i) for i in label]
- def __len__(self):
- return len(self.data)
- def __getitem__(self, idx):
- return (self.data[idx], self.labels[idx], self.lengths_source[idx], self.lengths_label[idx])
- # In[21]:
- def data_collator_Arab2Arabiz(data):
- word, label, length_source, length_label = zip(*data)
- tensor_dim_1 = max(length_source)
- tensor_dim_2 = max(length_label)
- out_word = torch.full((len(word), tensor_dim_1), dtype=torch.long, fill_value=pad_token)
- label_word = torch.full((len(word), tensor_dim_2), dtype=torch.long, fill_value=pad_token)
- for i in range(len(word)):
- out_word[i][:len(word[i])] = torch.Tensor(word[i])
- label_word[i][:len(label[i])] = torch.Tensor(label[i])
- return (out_word, label_word)
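# Toy illustration (values are made up): the collator right-pads each source and
# target sequence in the batch to the longest one, using pad_token.
_src_batch, _trg_batch = data_collator_Arab2Arabiz([
    ([5, 6], [1, 7, 2], 2, 3),
    ([8, 9, 10, 11], [1, 12, 13, 2], 4, 4),
])
assert _src_batch.shape == (2, 4) and _trg_batch.shape == (2, 4)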
- # In[22]:
- class KSampler(Sampler):
- def __init__(self, data_source, batch_size):
- self.lens = [x[1] for x in data_source]
- self.batch_size = batch_size
- def __iter__(self):
- idx = list(range(len(self.lens)))
- arr = list(zip(self.lens, idx))
- random.shuffle(arr)
- n = self.batch_size*100
- iterator = []
- for i in range(0, len(self.lens), n):
- dt = arr[i:i+n]
- dt = sorted(dt, key=lambda x: x[0])
- for j in range(0, len(dt), self.batch_size):
- indices = list(map(lambda x: x[1], dt[j:j+self.batch_size]))
- iterator.append(indices)
- random.shuffle(iterator)
- return iter([item for sublist in iterator for item in sublist]) #Flatten nested list
- def __len__(self):
- return len(self.lens)
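# KSampler does simple length bucketing: the indices are shuffled, grouped into
# chunks of batch_size*100 examples, sorted within each chunk (the sort key is the
# example's second field, used here as a proxy for sequence length) so each
# mini-batch holds similarly sized sequences and needs less padding, and the batch
# order is then shuffled again. It yields flat indices, so the DataLoader itself
# still forms the batches.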
- # In[23]:
- def seed_worker(worker_id):
- worker_seed = torch.initial_seed() % 2**32
np.random.seed(worker_seed) #numpy is imported as np
- random.seed(worker_seed)
- # In[24]:
- batch_size = 32
- train_data = Arab2ArabizDS(X_train, y_train)
- train_sampler = KSampler(train_data, batch_size)
- train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, worker_init_fn=seed_worker, collate_fn=data_collator_Arab2Arabiz)
- valid_data = Arab2ArabizDS(X_valid, y_valid)
- valid_sampler = KSampler(valid_data, batch_size)
- valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size,worker_init_fn=seed_worker, collate_fn=data_collator_Arab2Arabiz)
- # In[25]:
- criterion = nn.CrossEntropyLoss(ignore_index=pad_token)
- optimizer = NoamOpt(128, 1, 4000 ,optim.Adam(model.parameters(), lr=0))
- # In[26]:
- def run_epoch(iterator):
- total_loss = 0
- for src, trg in iterator:
- src = src.T.to(device)
- trg = trg.T.to(device)
- output = model(src, trg[:-1, :])
- output = output.reshape(-1, output.shape[2])
- optimizer.optimizer.zero_grad()
- loss = criterion(output, trg[1:].reshape(-1))
- total_loss += loss.item()
- loss.backward()
- torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
- optimizer.step()
- return total_loss / len(iterator)
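# Training uses teacher forcing: the decoder is fed the gold target shifted right
# (trg[:-1, :], which starts with <sos>), and the loss compares its predictions to
# the gold target shifted left (trg[1:], which ends with <eos>); padded positions
# are ignored via CrossEntropyLoss(ignore_index=pad_token).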
- # In[27]:
@torch.no_grad() #No gradients are needed for validation
def run_validation(iterator):
model.eval() #Disable dropout so the validation loss is stable
total_loss = 0
for src, trg in iterator:
src = src.T.to(device)
trg = trg.T.to(device)
output = model(src, trg[:-1, :])
output = output.reshape(-1, output.shape[2])
loss = criterion(output, trg[1:].reshape(-1))
total_loss += loss.item()
model.train() #Restore training mode before the next training epoch
return total_loss / len(iterator)
- # In[28]:
- set_seed()
- min_loss = 99
- #Change model size
- for i in range(100):
- loss = run_epoch(train_dataloader)
- loss_val = run_validation(valid_dataloader)
- if loss_val < min_loss:
- min_loss = loss_val
- torch.save(model, "convert_best")
- print("EPOCH %d -- %f -- Val Loss: %f" % (i, loss, loss_val))
- # In[29]:
- model = torch.load("convert_best").eval()
- # In[30]:
- min_loss
- # In[31]:
- out_int_to_token = {out_token_to_int[t]:t for t in out_token_to_int}
- # In[32]:
- def arabizi_2_arabic(inp):
- input_sentence = [in_token_to_int[i] for i in inp.lower()]
- preds = [sos_token]
- input_sentence = torch.Tensor(input_sentence).unsqueeze(-1).long().to(device)
- new_char = -1
- while new_char != eos_token:
- output_sentence = torch.Tensor(preds).unsqueeze(-1).long().to(device)
- src = model.pos_encoder(model.encoder(input_sentence))
trg = model.pos_decoder(model.decoder(output_sentence)) #Use the decoder's positional encoding, as in forward()
- memory = model.transformer_encoder(src)
- output = model.transformer_decoder(tgt = trg, memory = memory)
- output = model.fc_out(output)
- new_char = output.argmax(-1)[-1, 0].item()
- preds.append(new_char)
- if len(preds) > 50:
- break
- return "".join([out_int_to_token[i] for i in preds[1:-1]])
- # In[33]:
- train = pd.read_csv("../input/zindidd/Train.csv")[["textt", "labell"]]
- train.columns = ["texts", "data_labels"]
- data = train
- # In[34]:
def preprocess(text): #The same settings might also work for other languages (English and French)
- text = text.replace('ß',"b")
- text = text.replace('à',"a")
- text = text.replace('á',"a")
- text = text.replace('ç',"c")
- text = text.replace('è',"e")
- text = text.replace('é',"e")
- text = text.replace('$',"s")
- text = text.replace("1","")
- text = text.lower()
- text = re.sub(r'[^A-Za-z0-9 ,!?.]', '', text)
- # Remove '@name'
- text = re.sub(r'(@.*?)[\s]', ' ', text)
- # Replace '&' with '&'
- text = re.sub(r'&', '&', text)
- # Remove trailing whitespace
- text = re.sub(r'\s+', ' ', text).strip()
- text = re.sub(r'([h][h][h][h])\1+', r'\1', text)
text = re.sub(r'([a-gi-z])\1+', r'\1', text) #Collapse repeated characters (except 'h', handled above)
- text = re.sub(r' [0-9]+ ', " ", text)
- text = re.sub(r'^[0-9]+ ', "", text)
- return text
- # In[35]:
- #Keep numbers block
- def split(text):
- splits = re.findall(r"[\w']+|[?!.,]", text)
- to_be_added = []
- idx_to_be_added = []
- forbidden = ["?", "!", ".", ","] + known_idx
- for i, split in enumerate(splits):
- if split in forbidden:
- if split in known_idx:
- to_be_added.append(known[split])
- else:
- to_be_added.append(split)
- idx_to_be_added.append(i)
- #else:
- #splits[i] = splits[i][:1000]
- splits = [i for i in splits if not i in forbidden]
- return splits, to_be_added, idx_to_be_added
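# Toy illustration (hypothetical input, assuming neither word is in `known`):
# split() keeps the words to transliterate, and separately returns the pass-through
# tokens (punctuation and known words) together with the positions to re-insert them at.
#split("ya3tik essa7a !")  # -> (['ya3tik', 'essa7a'], ['!'], [2])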
- # In[36]:
- problematic = []
- def convert_phrase_2(text):
- text = text.replace("0","")
- text = text.replace("6","")
- #print("\nTEXT: "+text)
- phrase, to_be_added, idx_to_be_added = split(text.lower())
- max_len_phrase = max([len(i) for i in phrase])
- input_sentence = []
- for word in phrase:
- input_sentence.append([in_token_to_int[i] for i in word] + [pad_token]*(max_len_phrase-len(word)))
- input_sentence = torch.Tensor(input_sentence).long().T.to(device)
- preds = [[sos_token] * len(phrase)]
- end_word = len(phrase) * [False]
- src_pad_mask = model.make_len_mask_enc(input_sentence)
- while not all(end_word):
- output_sentence = torch.Tensor(preds).long().to(device)
- src = model.pos_encoder(model.encoder(input_sentence))
trg = model.pos_decoder(model.decoder(output_sentence)) #Use the decoder's positional encoding, as in forward()
- memory = model.transformer_encoder(src, None ,src_pad_mask)
- output = model.transformer_decoder(tgt = trg, memory = memory, memory_key_padding_mask = src_pad_mask)
- output = model.fc_out(output)
- output = output.argmax(-1)[-1].cpu().detach().numpy()
- preds.append(output.tolist())
- end_word = (output == eos_token) | end_word
- if len(preds) > 50:
- global problematic
- problematic.append(text)
- #print(text)
- break
- preds = np.array(preds).T
- result = []
- for word in preds:
- tmp = []
- for i in word[1:]:
- if out_int_to_token[i] == "<eos>":
- break
- tmp.append(out_int_to_token[i])
- result.append("".join(tmp))
- #Re-add removed punctuation
- for item, idx in zip(to_be_added, idx_to_be_added):
- if item == "?":
- item = "؟"
- elif item == ",":
- item = "،"
- result.insert(idx, item)
- result = " ".join(result)
- return result
- # In[37]:
- train.texts = train.texts.apply(preprocess)
- # In[38]:
- results = []
- step_size = 100
- texts = train.texts.values.tolist()
- for i in tqdm(range(0, len(texts), step_size)):
- out = convert_phrase_2(" lkrb3 ".join(texts[i:i+step_size]))
- splitted_sentences = [ex.lstrip().rstrip() for ex in out.split(" " + convert_phrase_2("lkrb3") + " ")]
- if len(splitted_sentences) != len(texts[i:i+step_size]):
- print("DANGER")
- break
- results.extend(splitted_sentences)
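# The loop above converts 100 sentences per call by joining them with the separator
# word " lkrb3 " (assumed not to occur in the corpus), transliterating the joined
# string once, and splitting the result on the transliterated separator; the length
# check guards against a separator collision, which would break the alignment.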
- # In[39]:
- train["converted"] = results.copy()
- train.to_csv("train_data.csv")
- # In[40]:
- test = pd.read_csv("../input/zindidd/Test.csv")
- test.textt = test.textt.apply(preprocess)
- # In[41]:
- results = []
- step_size = 50
- texts = test.textt.values.tolist()
- for i in tqdm(range(0, len(texts), step_size)):
- out = convert_phrase_2(" lkrb3 ".join(texts[i:i+step_size]))
- splitted_sentences = [ex.lstrip().rstrip() for ex in out.split(" " + convert_phrase_2("lkrb3") + " ")]
- if len(splitted_sentences) != len(texts[i:i+step_size]):
- print("DANGER")
- break
- results.extend(splitted_sentences)
- # In[42]:
- test["converted"] = results
- test.to_csv("test_data.csv")
- # In[43]:
- def preprocessing_for_bert(data, tokenizer, preprocess_text, max_len=256):
- input_ids = []
- attention_masks = []
- tmp = tokenizer.encode("ab")[-1]
- for sentence in data:
- encoding = tokenizer.encode(preprocess_text(sentence))
- if len(encoding) > max_len:
- encoding = encoding[:max_len-1] + [tmp]
- in_ids = encoding
- att_mask = [1]*len(encoding)
- input_ids.append(in_ids)
- attention_masks.append(att_mask)
- return input_ids, attention_masks
- # In[44]:
- class BertDataset(Dataset):
- def __init__(self, data, masks, label=None):
- self.data = data
- self.masks = masks
if label is not None:
- self.labels = label
- else:
- self.labels = None
- self.lengths = [len(i) for i in data]
- def __len__(self):
- return len(self.data)
- def __getitem__(self, idx):
if self.labels is not None:
- return (self.data[idx], self.masks[idx], self.labels[idx], self.lengths[idx])
- else: #For validation
- return (self.data[idx], self.masks[idx], None, self.lengths[idx])
- # In[45]:
- def data_collator(data):
- sentence, mask, label, length = zip(*data)
- tensor_dim = max(length)
- out_sentence = torch.full((len(sentence), tensor_dim), dtype=torch.long, fill_value=pad)
- out_mask = torch.zeros(len(sentence), tensor_dim, dtype=torch.long)
- for i in range(len(sentence)):
- out_sentence[i][:len(sentence[i])] = torch.Tensor(sentence[i])
- out_mask[i][:len(mask[i])] = torch.Tensor(mask[i])
if label[0] is not None:
- return (out_sentence, out_mask, torch.Tensor(label).long())
- else:
- return (out_sentence, out_mask)
- # In[46]:
- class KSampler(Sampler):
- def __init__(self, data_source, batch_size):
- self.lens = [x[1] for x in data_source]
- self.batch_size = batch_size
- def __iter__(self):
- idx = list(range(len(self.lens)))
- arr = list(zip(self.lens, idx))
- random.shuffle(arr)
- n = self.batch_size*100
- iterator = []
- for i in range(0, len(self.lens), n):
- dt = arr[i:i+n]
- dt = sorted(dt, key=lambda x: x[0])
- for j in range(0, len(dt), self.batch_size):
- indices = list(map(lambda x: x[1], dt[j:j+self.batch_size]))
- iterator.append(indices)
- random.shuffle(iterator)
- return iter([item for sublist in iterator for item in sublist]) #Flatten nested list
- def __len__(self):
- return len(self.lens)
- # In[47]:
- # Create the BertClassfier class
- class BertClassifier(nn.Module):
def __init__(self, model_name, dropout, freeze_bert=False):
#NB: the `dropout` argument is accepted here but never applied in this head as written
super(BertClassifier, self).__init__()
D_in, H, D_out = 768, 200, 3 #BERT hidden size, classifier hidden size, number of classes
- self.bert = AutoModel.from_pretrained(model_name)
- self.classifier = nn.Sequential(
- nn.Linear(D_in, H),
- nn.ReLU(),
- nn.Linear(H, D_out)
- )
- if freeze_bert:
- for param in self.bert.parameters():
- param.requires_grad = False
- def forward(self, input_ids, attention_mask):
- outputs = self.bert(input_ids=input_ids,
- attention_mask=attention_mask)
- last_hidden_state_cls = outputs[0][:, 0, :]
- logits = self.classifier(last_hidden_state_cls)
- return logits
- # In[48]:
- def initialize_model(model_name, epochs=4, dropout=0.1):
- bert_classifier = BertClassifier(model_name, dropout=dropout, freeze_bert=False)
- bert_classifier.to(device)
- optimizer = AdamW(bert_classifier.parameters(),
- lr=5e-5,
- eps=1e-8
- )
- total_steps = len(train_dataloader) * epochs
- scheduler = get_linear_schedule_with_warmup(optimizer,
- num_warmup_steps=0, # Default value
- num_training_steps=total_steps)
- return bert_classifier, optimizer, scheduler
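# Note: transformers.AdamW has been deprecated in newer versions of the library;
# an equivalent setup there would be (sketch, same hyper-parameters assumed):
#optimizer = torch.optim.AdamW(bert_classifier.parameters(), lr=5e-5, eps=1e-8)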
- # In[49]:
- loss_fn = nn.CrossEntropyLoss()
- def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False, fold=0, prefix=""):
- global max_acc
- print("Start training...\n")
- for epoch_i in range(epochs):
- print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
- print("-"*70)
- t0_epoch, t0_batch = time.time(), time.time()
- total_loss, batch_loss, batch_counts = 0, 0, 0
- model.train()
- for step, batch in enumerate(train_dataloader):
- batch_counts +=1
- b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
- model.zero_grad()
- logits = model(b_input_ids, b_attn_mask)
- loss = loss_fn(logits, b_labels)
- batch_loss += loss.item()
- total_loss += loss.item()
- loss.backward()
- torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
- optimizer.step()
- scheduler.step()
- if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
- time_elapsed = time.time() - t0_batch
- print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
- batch_loss, batch_counts = 0, 0
- t0_batch = time.time()
- if step%200 == 0 and step != 0 and epoch_i != 0 and epoch_i != 1:
- print("-"*70)
- if evaluation == True:
- val_loss, val_accuracy = evaluate(model, val_dataloader)
- if val_accuracy > max_acc:
- max_acc = val_accuracy
- torch.save(model, prefix + "_best_"+str(fold))
- print("new max")
- print(val_accuracy)
- print("-"*70)
- print("\n")
- model.train()
- avg_train_loss = total_loss / len(train_dataloader)
- print("-"*70)
- if evaluation == True:
- val_loss, val_accuracy = evaluate(model, val_dataloader)
- if val_accuracy > max_acc:
- max_acc = val_accuracy
- torch.save(model, prefix+"_best_"+str(fold))
- print("new max")
- time_elapsed = time.time() - t0_epoch
- print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
- print("-"*70)
- print("\n")
- print("Training complete!")
- def evaluate(model, val_dataloader):
- model.eval()
- val_accuracy = []
- val_loss = []
- for batch in val_dataloader:
- b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
- with torch.no_grad():
- logits = model(b_input_ids, b_attn_mask)
- loss = loss_fn(logits, b_labels)
- val_loss.append(loss.item())
- preds = torch.argmax(logits, dim=1).flatten()
- accuracy = (preds == b_labels).cpu().numpy().mean() * 100
- val_accuracy.append(accuracy)
- val_loss = np.mean(val_loss)
- val_accuracy = np.mean(val_accuracy)
- return val_loss, val_accuracy
- # In[50]:
- def get_indices(arr, idxs): #Helper function to get multiple indexes from a list
- output = []
- for idx in idxs:
- output.append(arr[idx])
- return output
- def text_preprocessing_1(text):
- text = text.lower()
- text = re.sub(r'\s+', ' ', text).strip()
- return text
- def text_preprocessing_2(text):
- text = text.lower()
- text = re.sub(r'\s+', ' ', text).strip()
text = re.sub(r'([a-gi-z][a-gi-z])\1+', r'\1', text) #Collapse repeated two-character groups (excluding 'h')
- return text
- def text_preprocessing_3(text):
- text = text.replace('ß',"b")
- text = text.replace('à',"a")
- text = text.replace('á',"a")
- text = text.replace('ç',"c")
- text = text.replace('è',"e")
- text = text.replace('é',"e")
- text = text.replace('$',"s")
- text = text.replace("1","")
- text = text.lower()
- text = re.sub(r'[^A-Za-z0-9 ,!?.]', '', text)
- # Remove '@name'
- text = re.sub(r'(@.*?)[\s]', ' ', text)
- # Replace '&' with '&'
- text = re.sub(r'&', '&', text)
- # Remove trailing whitespace
- text = re.sub(r'\s+', ' ', text).strip()
- text = re.sub(r'([h][h][h][h])\1+', r'\1', text)
text = re.sub(r'([a-gi-z])\1+', r'\1', text) #Collapse repeated characters (except 'h', handled above)
- text = re.sub(r' [0-9]+ ', " ", text)
- text = re.sub(r'^[0-9]+ ', "", text)
- return text
- # In[51]:
- data = pd.read_csv("../input/zindidd/Train.csv")[["textt", "labell"]].iloc[1000:]
- data.columns = ["texts", "data_labels"]
- data.data_labels = data.data_labels.replace(0,2) #Neutral 2, Positive 1, Negative 0
- data.data_labels = data.data_labels.replace(-1,0)
- X = data.texts.values
- y = data.data_labels.values
- preprocessed_data, masks = preprocessing_for_bert(X, tokenizer_en, text_preprocessing_2, max_len=256)
- pad = tokenizer_en.pad_token_id
- # In[52]:
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
- fold = 0
- bests = []
- for train_ids, val_ids in kfold.split(preprocessed_data):
- print("\n\tFOLD %d \n" % (fold))
- max_acc = -99
- X_train = get_indices(preprocessed_data, train_ids)
- y_train = get_indices(y, train_ids)
- train_masks = get_indices(masks, train_ids)
- X_val = get_indices(preprocessed_data, val_ids)
- y_val = get_indices(y, val_ids)
- val_masks = get_indices(masks, val_ids)
- X_val, y_val, val_masks = list(zip(*sorted(zip(X_val, y_val, val_masks), key=lambda x: len(x[0])))) #Order the validation data for faster validation
- X_val, y_val, val_masks = list(X_val), list(y_val), list(val_masks)
- # Convert other data types to torch.Tensor
- y_train = torch.tensor(y_train)
- y_val = torch.tensor(y_val)
- # Create the DataLoader for our training set
- train_data = BertDataset(X_train, train_masks, y_train)
- train_sampler = KSampler(train_data, batch_size)
- train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, collate_fn=data_collator)
- # Create the DataLoader for our validation set
- val_data = BertDataset(X_val, val_masks, y_val)
- val_sampler = SequentialSampler(val_data)
- val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size, collate_fn=data_collator)
- set_seed() # Set seed for reproducibility
- bert_classifier, optimizer, scheduler = initialize_model(model_name=model_name_en, epochs=n_epochs, dropout=0.05)
- train(bert_classifier, train_dataloader, val_dataloader, epochs=n_epochs, evaluation=True, fold=fold, prefix="en")
- fold += 1
- bests.append(max_acc)
- # In[53]:
- bests
- # In[54]:
- data = pd.read_csv("train_data.csv")[["converted", "data_labels"]].iloc[1000:]
- data.columns = ["texts", "data_labels"]
- data.data_labels = data.data_labels.replace(0,2) #Neutral 2, Positive 1, Negative 0
- data.data_labels = data.data_labels.replace(-1,0)
- X = data.texts.values
- y = data.data_labels.values
- preprocessed_data, masks = preprocessing_for_bert(X, tokenizer_ar, lambda x: x, max_len=256)
- pad = tokenizer_ar.pad_token_id
- # In[55]:
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
- fold = 0
- bests = []
- for train_ids, val_ids in kfold.split(preprocessed_data):
- print("\n\tFOLD %d \n" % (fold))
- max_acc = -99
- X_train = get_indices(preprocessed_data, train_ids)
- y_train = get_indices(y, train_ids)
- train_masks = get_indices(masks, train_ids)
- X_val = get_indices(preprocessed_data, val_ids)
- y_val = get_indices(y, val_ids)
- val_masks = get_indices(masks, val_ids)
- X_val, y_val, val_masks = list(zip(*sorted(zip(X_val, y_val, val_masks), key=lambda x: len(x[0])))) #Order the validation data for faster validation
- X_val, y_val, val_masks = list(X_val), list(y_val), list(val_masks)
- # Convert other data types to torch.Tensor
- y_train = torch.tensor(y_train)
- y_val = torch.tensor(y_val)
- # Create the DataLoader for our training set
- train_data = BertDataset(X_train, train_masks, y_train)
- train_sampler = KSampler(train_data, batch_size)
- train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, collate_fn=data_collator)
- # Create the DataLoader for our validation set
- val_data = BertDataset(X_val, val_masks, y_val)
- val_sampler = SequentialSampler(val_data)
- val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size, collate_fn=data_collator)
- set_seed() # Set seed for reproducibility
- bert_classifier, optimizer, scheduler = initialize_model(model_name=model_name_ar, epochs=n_epochs, dropout=0)
- train(bert_classifier, train_dataloader, val_dataloader, epochs=n_epochs, evaluation=True, fold=fold, prefix="ar")
- fold += 1
- bests.append(max_acc)
- # In[56]:
- bests
- # In[ ]:
- # In[57]:
- def bert_single_predict(model, test_dataloader):
- model.eval()
- all_logits = []
- for batch in tqdm(test_dataloader):
- b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]
- with torch.no_grad():
- logits = model(b_input_ids, b_attn_mask)
- all_logits.append(logits)
- all_logits = torch.cat(all_logits, dim=0)
- probs = F.softmax(all_logits, dim=1).cpu().numpy()
- return probs
- # In[58]:
- def bert_ensemble_predict(sentences, models, tokenizer, preprocess, truncate=True, max_len=256):
- inputs, masks = preprocessing_for_bert(sentences, tokenizer, preprocess, max_len=max_len)
- dataset = BertDataset(inputs, masks)
- sample = SequentialSampler(dataset)
- dataloader = DataLoader(dataset, sampler=sample, batch_size=128, collate_fn=data_collator)
- preds = []
- for model in models:
- preds.append(bert_single_predict(model, dataloader))
- return preds
- # In[59]:
- def predict_lang(lang_prefix, directory, preprocess_fn, dataset, model_name, n=1, truncate=True, max_len=256):
- print("Loading the models ....")
- global pad
- tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
- pad = tokenizer.pad_token_id
- lang_models = []
- for i in range(n):
- lang_models.append(torch.load(directory + "/" + lang_prefix + "best_"+str(i), map_location=device))
- print("Inference ....")
- out = bert_ensemble_predict(dataset, lang_models, tokenizer, preprocess_fn, truncate=truncate, max_len=max_len)
- out_sum = out[0]
- for i in range(1,n):
- out_sum = out[i] + out_sum
- return out_sum
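# The fold models are ensembled by summing their softmax probabilities (soft voting);
# taking the argmax of this sum is equivalent to taking the argmax of the averaged
# probabilities.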
- # In[60]:
- #Sort the list for faster inference
- df = pd.read_csv("../input/zindidd/Test.csv")
- df_converted = pd.read_csv("test_data.csv")
- df["lens"] = df.textt.apply(len)
- df = df.sort_values(by="lens").set_index("IDD", drop=True)
- df_converted = df_converted.set_index("IDD", drop=True).loc[df.index]
- #Convert to list
- test = df.textt.tolist()
- test_converted = df_converted[["converted"]].converted.tolist()
- # In[61]:
- output = predict_lang("ar_", "./", lambda x:x, test_converted, model_name_ar, n=5, truncate=True, max_len=512)
- # In[62]:
- df["preds"] = (output).argmax(1)
- df.preds = df.preds.replace(0,-1)
- df.preds = df.preds.replace(2,0)
- the_output = df.reset_index()[["IDD", "preds"]]
- the_output.columns = ["ID", "label"]
- the_output.to_csv("lessvalid_convvalid150.csv", index=False)
- # In[63]:
#Differences between this notebook and the "Extra BERT" variant:
# - text preprocessing: version one here, version two in the other
# - this one uses dropout, the other does not
# - the final hidden dimension is 50 here and 32 in the other
# - the seed value used for the k-folds and for set_seed differs
#Suggested submissions: sub1 (first fold, allnote v13), sub2 (first fold, allnote v13 + no dropout), sub3 (first fold, allnote v13 + no dropout + preprocess two),
#sub4 (first fold, allnote v13 + no dropout + preprocess two + dim 30), sub6 (ensemble, allnote v13)
#Comparing sub1 and sub5 would show the effect of the seed value; with a fixed seed, the remaining
#submissions show the effect of dropout, preprocessing one vs. two, and the final dimension on a submission.
- # In[ ]: