#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd

from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import TransformerDecoder, TransformerDecoderLayer
import torch.nn.functional as F

import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np

import math
import random
import time

import os
import re
from tqdm import tqdm

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset, Sampler

import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

# In[2]:


model_name_ar = 'moha/arabert_c19'
model_name_en = 'bert-base-uncased'

batch_size = 32
n_epochs = 3


# In[3]:


tokenizer_ar = AutoTokenizer.from_pretrained(model_name_ar, do_lower_case=True)
tokenizer_en = AutoTokenizer.from_pretrained(model_name_en, do_lower_case=True)


# In[4]:


seed = 99  # Important for reproducing the results

# In[5]:


def set_seed():
    """Set the seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed()


# In[ ]:


# In[6]:


xls = pd.ExcelFile("../input/arab2arabizi/dataset.xlsx")
dataset = pd.read_excel(xls, "Sheet1")

known = dataset[dataset.from_source == True]
dataset = dataset[["arabizi", "arabic", "from_source"]]

dataset.columns = ["Arabize", "Arabic", "from_source"]

#dataset. #Drop Arabic duplicates


# In[7]:


known = known[["arabizi", "arabic"]].set_index("arabizi", drop=True).arabic.to_dict()
known_idx = list(known.keys())

# In[8]:


in_max = dataset.apply(lambda x: len(str(x.Arabize)), axis=1).max()
out_max = dataset.apply(lambda x: len(x.Arabic), axis=1).max() + 2  # Take <sos> and <eos> into account

pad_token = 0
eos_token = 2
sos_token = 1

device = "cuda" if torch.cuda.is_available() else "cpu"

# In[9]:


def preprocess(a):

    x = a.copy()

    def filter_letters_arabizi(word):

        word = word.replace("$", "s")
        word = word.replace("å", "a")
        word = word.replace("é", "e")
        word = word.replace("ê", "e")
        word = word.replace("ÿ", "y")
        word = word.replace("ą", "a")
        word = word.replace("ī", "i")
        word = word.replace("\n", "")
        word = word.replace("′", "'")

        return word

    x.Arabize = filter_letters_arabizi(str(x.Arabize))

    return x

# In[10]:


dataset[["Arabize", "Arabic"]] = dataset[["Arabize", "Arabic"]].apply(preprocess, axis=1)


# In[11]:


in_tokens = set(" ".join(dataset.Arabize.values.tolist()).lower())
in_token_to_int = {token: (i + 1) for i, token in enumerate(sorted(in_tokens))}

in_token_to_int["<pad>"] = pad_token  # Reserve index 0 for padding

out_tokens = set(" ".join(dataset.Arabic.values.tolist()))
out_token_to_int = {token: (i + 3) for i, token in enumerate(sorted(out_tokens))}


out_token_to_int["<pad>"] = pad_token
out_token_to_int["<sos>"] = sos_token
out_token_to_int["<eos>"] = eos_token

# In[12]:


def tokenize(a):

    x = a.copy()

    x.Arabize = [in_token_to_int[i] for i in x.Arabize.lower()]
    x.Arabic = [sos_token] + [out_token_to_int[i] for i in x.Arabic] + [eos_token]

    x.Arabize = x.Arabize + (in_max - len(x.Arabize)) * [pad_token]
    x.Arabic = x.Arabic + (out_max - len(x.Arabic)) * [pad_token]

    return x

# In[13]:


dataset[["Arabize", "Arabic"]] = dataset[["Arabize", "Arabic"]].apply(tokenize, axis=1)

validation = dataset.sample(frac=0.1)
train = dataset.drop(validation.index)

X_train = train.Arabize
y_train = train.Arabic

X_valid = validation.Arabize
y_valid = validation.Arabic

# In[14]:


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=9000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.scale = nn.Parameter(torch.ones(1))

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.scale * self.pe[:x.size(0), :]
        return self.dropout(x)

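# Illustrative check (not part of the original notebook): the module above stores `pe`
# with shape (max_len, 1, d_model) and adds it, scaled by a learnable factor, to inputs
# shaped (seq_len, batch, d_model), following the usual sinusoidal encoding
# PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)).
_pe = PositionalEncoding(d_model=16, dropout=0.0)
_x = torch.zeros(5, 2, 16)   # (seq_len, batch, d_model)
print(_pe(_x).shape)         # torch.Size([5, 2, 16])
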
# In[15]:


class TransformerModel(nn.Module):

    def __init__(self, intoken, outtoken, hidden, enc_layers=1, dec_layers=1, dropout=0.15, nheads=4):
        super(TransformerModel, self).__init__()

        ff_model = hidden * 4

        self.encoder = nn.Embedding(intoken, hidden)
        self.pos_encoder = PositionalEncoding(hidden, dropout)

        self.decoder = nn.Embedding(outtoken, hidden)
        self.pos_decoder = PositionalEncoding(hidden, dropout)


        encoder_layers = TransformerEncoderLayer(d_model=hidden, nhead=nheads, dim_feedforward=ff_model, dropout=dropout, activation='relu')
        self.transformer_encoder = TransformerEncoder(encoder_layers, enc_layers)

        decoder_layers = TransformerDecoderLayer(hidden, nheads, ff_model, dropout, activation='relu')
        self.transformer_decoder = TransformerDecoder(decoder_layers, dec_layers)

        self.fc_out = nn.Linear(hidden, outtoken)

        self.src_mask = None
        self.trg_mask = None
        self.memory_mask = None


    def generate_square_subsequent_mask(self, sz, sz1=None):

        if sz1 is None:
            mask = torch.triu(torch.ones(sz, sz), 1)
        else:
            mask = torch.triu(torch.ones(sz, sz1), 1)

        return mask.masked_fill(mask == 1, float('-inf'))

    def make_len_mask_enc(self, inp):
        return (inp == pad_token).transpose(0, 1)  # (batch_size, input_seq_len)

    def make_len_mask_dec(self, inp):
        return (inp == pad_token).transpose(0, 1)  # (batch_size, output_seq_len)


    def forward(self, src, trg):  # src: (seq_len, batch_size)

        if self.trg_mask is None or self.trg_mask.size(0) != len(trg):
            self.trg_mask = self.generate_square_subsequent_mask(len(trg)).to(trg.device)

        # Padding masks
        src_pad_mask = self.make_len_mask_enc(src)
        trg_pad_mask = self.make_len_mask_dec(trg)

        # Encoder embeddings
        src = self.encoder(src)      # (seq_len, batch_size, d_model)
        src = self.pos_encoder(src)  # Positional embedding

        # Decoder embeddings
        trg = self.decoder(trg)      # (seq_len, batch_size, d_model)
        trg = self.pos_decoder(trg)

        memory = self.transformer_encoder(src, None, src_pad_mask)
        output = self.transformer_decoder(tgt=trg, memory=memory, tgt_mask=self.trg_mask, memory_mask=None,
                                          tgt_key_padding_mask=trg_pad_mask, memory_key_padding_mask=src_pad_mask)

        output = self.fc_out(output)

        return output

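# Shape sanity check (illustrative, tiny random inputs; not part of the original notebook):
# the model takes a (src_len, batch) and a (trg_len, batch) LongTensor and returns logits
# of shape (trg_len, batch, output_vocab_size).
_m = TransformerModel(intoken=10, outtoken=12, hidden=32)
_src = torch.randint(1, 10, (7, 2))   # indices start at 1 so no position equals pad_token
_trg = torch.randint(3, 12, (5, 2))
print(_m(_src, _trg).shape)           # torch.Size([5, 2, 12])
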
# In[16]:


len(in_token_to_int)


# In[17]:


len(out_token_to_int)


# In[18]:


set_seed()
model = TransformerModel(len(in_token_to_int), len(out_token_to_int), 128).to(device)

# In[19]:


class NoamOpt:
    "Optim wrapper that implements the Noam learning-rate schedule."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        "Compute the learning rate for the given step"
        if step is None:
            step = self._step
        return self.factor * (self.model_size ** (-0.5) *
                              min(step ** (-0.5), step * self.warmup ** (-1.5)))

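# Illustrative check (not part of the original notebook): the schedule above is
# lr = factor * model_size**-0.5 * min(step**-0.5, step * warmup**-1.5),
# i.e. a linear warm-up over `warmup` steps followed by inverse-square-root decay,
# peaking around step == warmup.
_demo_opt = NoamOpt(model_size=128, factor=1, warmup=4000,
                    optimizer=optim.Adam([torch.zeros(1, requires_grad=True)], lr=0))
for _s in (100, 4000, 20000):
    print(_s, round(_demo_opt.rate(_s), 6))
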
# In[20]:


class Arab2ArabizDS(Dataset):

    def __init__(self, data, label):

        self.data = data.values.tolist()
        self.labels = label.values.tolist()

        self.lengths_source = [len(i) for i in data]
        self.lengths_label = [len(i) for i in label]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return (self.data[idx], self.labels[idx], self.lengths_source[idx], self.lengths_label[idx])

# In[21]:


def data_collator_Arab2Arabiz(data):

    word, label, length_source, length_label = zip(*data)

    tensor_dim_1 = max(length_source)
    tensor_dim_2 = max(length_label)

    out_word = torch.full((len(word), tensor_dim_1), dtype=torch.long, fill_value=pad_token)
    label_word = torch.full((len(word), tensor_dim_2), dtype=torch.long, fill_value=pad_token)

    for i in range(len(word)):

        out_word[i][:len(word[i])] = torch.Tensor(word[i])
        label_word[i][:len(label[i])] = torch.Tensor(label[i])

    return (out_word, label_word)

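# Illustrative check (not part of the original notebook): the collator pads every source
# and target in a batch to the longest one with pad_token and returns two LongTensors of
# shape (batch, max_src_len) and (batch, max_trg_len).
_batch = [([5, 6], [1, 7, 2], 2, 3), ([5, 6, 7, 8], [1, 9, 8, 2], 4, 4)]
_src, _trg = data_collator_Arab2Arabiz(_batch)
print(_src.shape, _trg.shape)   # torch.Size([2, 4]) torch.Size([2, 4])
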
# In[22]:


class KSampler(Sampler):
    """Length-bucketing sampler: shuffles the data in windows of batch_size * 100 items,
    sorts each window by the field stored at index 1 of every sample (a length proxy),
    and yields the resulting batch-sized groups in random order."""

    def __init__(self, data_source, batch_size):
        self.lens = [x[1] for x in data_source]
        self.batch_size = batch_size

    def __iter__(self):

        idx = list(range(len(self.lens)))
        arr = list(zip(self.lens, idx))

        random.shuffle(arr)
        n = self.batch_size * 100

        iterator = []

        for i in range(0, len(self.lens), n):
            dt = arr[i:i + n]
            dt = sorted(dt, key=lambda x: x[0])

            for j in range(0, len(dt), self.batch_size):
                indices = list(map(lambda x: x[1], dt[j:j + self.batch_size]))
                iterator.append(indices)

        random.shuffle(iterator)
        return iter([item for sublist in iterator for item in sublist])  # Flatten the nested list

    def __len__(self):
        return len(self.lens)

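# Toy illustration (not part of the original pipeline): with 16 samples and batch_size=4,
# the sampler groups indices whose index-1 field (here a label-like list whose length acts
# as the length proxy) is similar, so each batch needs little padding.
_toy = [([0], [1] * random.randint(1, 8)) for _ in range(16)]
print(list(KSampler(_toy, batch_size=4)))
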
# In[23]:


def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# In[24]:


batch_size = 32

train_data = Arab2ArabizDS(X_train, y_train)
train_sampler = KSampler(train_data, batch_size)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, worker_init_fn=seed_worker, collate_fn=data_collator_Arab2Arabiz)

valid_data = Arab2ArabizDS(X_valid, y_valid)
valid_sampler = KSampler(valid_data, batch_size)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size, worker_init_fn=seed_worker, collate_fn=data_collator_Arab2Arabiz)

# In[25]:


criterion = nn.CrossEntropyLoss(ignore_index=pad_token)
optimizer = NoamOpt(128, 1, 4000, optim.Adam(model.parameters(), lr=0))

# In[26]:


def run_epoch(iterator):

    total_loss = 0

    for src, trg in iterator:

        src = src.T.to(device)
        trg = trg.T.to(device)

        output = model(src, trg[:-1, :])              # Teacher forcing: feed the target shifted right
        output = output.reshape(-1, output.shape[2])

        optimizer.optimizer.zero_grad()
        loss = criterion(output, trg[1:].reshape(-1))
        total_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()


    return total_loss / len(iterator)

# In[27]:


def run_validation(iterator):

    total_loss = 0

    with torch.no_grad():  # No gradients are needed for validation
        for src, trg in iterator:

            src = src.T.to(device)
            trg = trg.T.to(device)

            output = model(src, trg[:-1, :])
            output = output.reshape(-1, output.shape[2])

            loss = criterion(output, trg[1:].reshape(-1))
            total_loss += loss.item()


    return total_loss / len(iterator)

# In[28]:


set_seed()


min_loss = 99
# Change model size
for i in range(100):

    loss = run_epoch(train_dataloader)
    loss_val = run_validation(valid_dataloader)

    if loss_val < min_loss:
        min_loss = loss_val
        torch.save(model, "convert_best")

    print("EPOCH %d -- %f -- Val Loss: %f" % (i, loss, loss_val))

# In[29]:


model = torch.load("convert_best").eval()


# In[30]:


min_loss


# In[31]:


out_int_to_token = {out_token_to_int[t]: t for t in out_token_to_int}

# In[32]:


def arabizi_2_arabic(inp):

    input_sentence = [in_token_to_int[i] for i in inp.lower()]
    preds = [sos_token]

    input_sentence = torch.Tensor(input_sentence).unsqueeze(-1).long().to(device)


    new_char = -1

    while new_char != eos_token:

        output_sentence = torch.Tensor(preds).unsqueeze(-1).long().to(device)

        src = model.pos_encoder(model.encoder(input_sentence))
        trg = model.pos_decoder(model.decoder(output_sentence))  # Use the decoder-side positional encoding

        memory = model.transformer_encoder(src)
        output = model.transformer_decoder(tgt=trg, memory=memory)

        output = model.fc_out(output)
        new_char = output.argmax(-1)[-1, 0].item()

        preds.append(new_char)

        if len(preds) > 50:
            break


    return "".join([out_int_to_token[i] for i in preds[1:-1]])

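# Hypothetical usage sketch ("3asslema" is an arbitrary Arabizi word, not taken from the
# dataset): greedy decoding runs character by character until <eos> or 50 generated tokens.
# The guard skips the call if the word contains characters unseen in the training vocabulary.
_word = "3asslema"
if all(ch in in_token_to_int for ch in _word):
    print(arabizi_2_arabic(_word))
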
# In[33]:


train = pd.read_csv("../input/zindidd/Train.csv")[["textt", "labell"]]
train.columns = ["texts", "data_labels"]

data = train

# In[34]:


def preprocess(text):  # The same settings might also work for other languages (English and French)

    text = text.replace('ß', "b")
    text = text.replace('à', "a")
    text = text.replace('á', "a")
    text = text.replace('ç', "c")
    text = text.replace('è', "e")
    text = text.replace('é', "e")
    text = text.replace('$', "s")
    text = text.replace("1", "")


    text = text.lower()
    text = re.sub(r'[^A-Za-z0-9 ,!?.]', '', text)


    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Collapse whitespace and strip
    text = re.sub(r'\s+', ' ', text).strip()

    text = re.sub(r'([h][h][h][h])\1+', r'\1', text)
    text = re.sub(r'([a-g-i-z])\1+', r'\1', text)  # Collapse repeated characters (the class skips 'h', handled above)
    text = re.sub(r' [0-9]+ ', " ", text)
    text = re.sub(r'^[0-9]+ ', "", text)

    return text

# In[35]:


# Keep number blocks
def split(text):

    splits = re.findall(r"[\w']+|[?!.,]", text)

    to_be_added = []
    idx_to_be_added = []

    forbidden = ["?", "!", ".", ","] + known_idx

    for i, split in enumerate(splits):

        if split in forbidden:
            if split in known_idx:
                to_be_added.append(known[split])
            else:
                to_be_added.append(split)
            idx_to_be_added.append(i)
        #else:
        #    splits[i] = splits[i][:1000]


    splits = [i for i in splits if i not in forbidden]

    return splits, to_be_added, idx_to_be_added

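# Illustrative example (made-up sentence): split() pulls punctuation and already-known
# words out of the token list and remembers their positions, so convert_phrase_2 can put
# them back after the character-level model has transliterated the remaining words.
print(split("hello , world !"))
# e.g. (['hello', 'world'], [',', '!'], [1, 3]) when neither word appears in known_idx
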
# In[36]:


problematic = []

def convert_phrase_2(text):
    global problematic

    text = text.replace("0", "")
    text = text.replace("6", "")

    #print("\nTEXT: " + text)
    phrase, to_be_added, idx_to_be_added = split(text.lower())

    max_len_phrase = max([len(i) for i in phrase])

    input_sentence = []
    for word in phrase:
        input_sentence.append([in_token_to_int[i] for i in word] + [pad_token] * (max_len_phrase - len(word)))

    input_sentence = torch.Tensor(input_sentence).long().T.to(device)
    preds = [[sos_token] * len(phrase)]

    end_word = len(phrase) * [False]
    src_pad_mask = model.make_len_mask_enc(input_sentence)


    while not all(end_word):
        output_sentence = torch.Tensor(preds).long().to(device)

        src = model.pos_encoder(model.encoder(input_sentence))
        trg = model.pos_decoder(model.decoder(output_sentence))  # Use the decoder-side positional encoding

        memory = model.transformer_encoder(src, None, src_pad_mask)
        output = model.transformer_decoder(tgt=trg, memory=memory, memory_key_padding_mask=src_pad_mask)

        output = model.fc_out(output)

        output = output.argmax(-1)[-1].cpu().detach().numpy()
        preds.append(output.tolist())

        end_word = (output == eos_token) | end_word

        if len(preds) > 50:
            problematic.append(text)
            #print(text)
            break


    preds = np.array(preds).T
    result = []

    for word in preds:

        tmp = []
        for i in word[1:]:
            if out_int_to_token[i] == "<eos>":
                break
            tmp.append(out_int_to_token[i])

        result.append("".join(tmp))


    # Re-add the removed punctuation and known words
    for item, idx in zip(to_be_added, idx_to_be_added):

        if item == "?":
            item = "؟"
        elif item == ",":
            item = "،"

        result.insert(idx, item)


    result = " ".join(result)

    return result

# In[37]:


train.texts = train.texts.apply(preprocess)

# In[38]:


results = []
step_size = 100

texts = train.texts.values.tolist()

for i in tqdm(range(0, len(texts), step_size)):

    out = convert_phrase_2(" lkrb3 ".join(texts[i:i + step_size]))
    splitted_sentences = [ex.strip() for ex in out.split(" " + convert_phrase_2("lkrb3") + " ")]

    if len(splitted_sentences) != len(texts[i:i + step_size]):
        print("DANGER")
        break

    results.extend(splitted_sentences)

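# Note on the loop above: " lkrb3 " acts as a sentinel word that the seq2seq model is
# expected to transliterate the same way every time, so up to 100 tweets can be converted
# in a single call and then split apart again on the transliterated sentinel; the length
# check catches the case where the sentinel comes out differently inside a batch.
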
# In[39]:


train["converted"] = results.copy()
train.to_csv("train_data.csv")


# In[40]:


test = pd.read_csv("../input/zindidd/Test.csv")
test.textt = test.textt.apply(preprocess)

# In[41]:


results = []
step_size = 50

texts = test.textt.values.tolist()

for i in tqdm(range(0, len(texts), step_size)):

    out = convert_phrase_2(" lkrb3 ".join(texts[i:i + step_size]))
    splitted_sentences = [ex.strip() for ex in out.split(" " + convert_phrase_2("lkrb3") + " ")]

    if len(splitted_sentences) != len(texts[i:i + step_size]):
        print("DANGER")
        break

    results.extend(splitted_sentences)

# In[42]:


test["converted"] = results
test.to_csv("test_data.csv")

# In[43]:


def preprocessing_for_bert(data, tokenizer, preprocess_text, max_len=256):

    input_ids = []
    attention_masks = []
    tmp = tokenizer.encode("ab")[-1]  # id of the tokenizer's final special token ([SEP]), used to re-close truncated sequences

    for sentence in data:

        encoding = tokenizer.encode(preprocess_text(sentence))

        if len(encoding) > max_len:
            encoding = encoding[:max_len - 1] + [tmp]

        in_ids = encoding
        att_mask = [1] * len(encoding)

        input_ids.append(in_ids)
        attention_masks.append(att_mask)

    return input_ids, attention_masks

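# Illustrative check (not part of the original notebook): encodings longer than max_len
# are cut to max_len - 1 ids and re-closed with the tokenizer's final special token, so a
# truncated input still ends with a proper separator.
_ids, _masks = preprocessing_for_bert(["hello world, this is a long-ish example sentence"],
                                      tokenizer_en, lambda s: s, max_len=8)
print(len(_ids[0]), _ids[0][-1] == tokenizer_en.encode("ab")[-1])   # expected: 8 True
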
# In[44]:


class BertDataset(Dataset):

    def __init__(self, data, masks, label=None):

        self.data = data
        self.masks = masks

        if label is not None:
            self.labels = label
        else:
            self.labels = None

        self.lengths = [len(i) for i in data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.labels is not None:
            return (self.data[idx], self.masks[idx], self.labels[idx], self.lengths[idx])
        else:  # For inference without labels
            return (self.data[idx], self.masks[idx], None, self.lengths[idx])

# In[45]:


def data_collator(data):

    sentence, mask, label, length = zip(*data)

    tensor_dim = max(length)

    out_sentence = torch.full((len(sentence), tensor_dim), dtype=torch.long, fill_value=pad)  # `pad` is the tokenizer's pad_token_id, set below
    out_mask = torch.zeros(len(sentence), tensor_dim, dtype=torch.long)

    for i in range(len(sentence)):

        out_sentence[i][:len(sentence[i])] = torch.Tensor(sentence[i])
        out_mask[i][:len(mask[i])] = torch.Tensor(mask[i])

    if label[0] is not None:
        return (out_sentence, out_mask, torch.Tensor(label).long())
    else:
        return (out_sentence, out_mask)

# In[46]:


# Same length-bucketing sampler as above, re-declared for the classification stage
class KSampler(Sampler):

    def __init__(self, data_source, batch_size):
        self.lens = [x[1] for x in data_source]
        self.batch_size = batch_size

    def __iter__(self):

        idx = list(range(len(self.lens)))
        arr = list(zip(self.lens, idx))

        random.shuffle(arr)
        n = self.batch_size * 100

        iterator = []

        for i in range(0, len(self.lens), n):
            dt = arr[i:i + n]
            dt = sorted(dt, key=lambda x: x[0])

            for j in range(0, len(dt), self.batch_size):
                indices = list(map(lambda x: x[1], dt[j:j + self.batch_size]))
                iterator.append(indices)

        random.shuffle(iterator)
        return iter([item for sublist in iterator for item in sublist])  # Flatten the nested list

    def __len__(self):
        return len(self.lens)

# In[47]:


# Create the BertClassifier class
class BertClassifier(nn.Module):

    def __init__(self, model_name, dropout, freeze_bert=False):

        super(BertClassifier, self).__init__()
        D_in, H, D_out = 768, 200, 3

        self.bert = AutoModel.from_pretrained(model_name)

        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(dropout),  # Apply the `dropout` argument in the classification head
            nn.Linear(H, D_out)
        )

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):

        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Hidden state of the [CLS] token
        last_hidden_state_cls = outputs[0][:, 0, :]

        logits = self.classifier(last_hidden_state_cls)

        return logits

# In[48]:


def initialize_model(model_name, epochs=4, dropout=0.1):

    bert_classifier = BertClassifier(model_name, dropout=dropout, freeze_bert=False)

    bert_classifier.to(device)

    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,
                      eps=1e-8)

    total_steps = len(train_dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

# In[49]:


loss_fn = nn.CrossEntropyLoss()

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False, fold=0, prefix=""):

    global max_acc

    print("Start training...\n")
    for epoch_i in range(epochs):

        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-" * 70)

        t0_epoch, t0_batch = time.time(), time.time()

        total_loss, batch_loss, batch_counts = 0, 0, 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts += 1

            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):

                time_elapsed = time.time() - t0_batch

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

            # Evaluate every 200 steps from the third epoch onwards
            if step % 200 == 0 and step != 0 and epoch_i != 0 and epoch_i != 1:

                print("-" * 70)

                if evaluation:

                    val_loss, val_accuracy = evaluate(model, val_dataloader)

                    if val_accuracy > max_acc:
                        max_acc = val_accuracy
                        torch.save(model, prefix + "_best_" + str(fold))
                        print("new max")

                    print(val_accuracy)

                print("-" * 70)
                print("\n")

                model.train()

        avg_train_loss = total_loss / len(train_dataloader)

        print("-" * 70)

        if evaluation:

            val_loss, val_accuracy = evaluate(model, val_dataloader)

            if val_accuracy > max_acc:
                max_acc = val_accuracy
                torch.save(model, prefix + "_best_" + str(fold))
                print("new max")

            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-" * 70)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):

    model.eval()

    val_accuracy = []
    val_loss = []

    for batch in val_dataloader:

        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        preds = torch.argmax(logits, dim=1).flatten()

        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

# In[50]:


def get_indices(arr, idxs):  # Helper function to gather multiple indices from a list

    output = []
    for idx in idxs:
        output.append(arr[idx])

    return output


def text_preprocessing_1(text):

    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def text_preprocessing_2(text):

    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()

    text = re.sub(r'([a-g-i-z][a-g-i-z])\1+', r'\1', text)

    return text


def text_preprocessing_3(text):

    text = text.replace('ß', "b")
    text = text.replace('à', "a")
    text = text.replace('á', "a")
    text = text.replace('ç', "c")
    text = text.replace('è', "e")
    text = text.replace('é', "e")
    text = text.replace('$', "s")
    text = text.replace("1", "")


    text = text.lower()
    text = re.sub(r'[^A-Za-z0-9 ,!?.]', '', text)


    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Collapse whitespace and strip
    text = re.sub(r'\s+', ' ', text).strip()

    text = re.sub(r'([h][h][h][h])\1+', r'\1', text)
    text = re.sub(r'([a-g-i-z])\1+', r'\1', text)  # Collapse repeated characters
    text = re.sub(r' [0-9]+ ', " ", text)
    text = re.sub(r'^[0-9]+ ', "", text)

    return text

# In[51]:


data = pd.read_csv("../input/zindidd/Train.csv")[["textt", "labell"]].iloc[1000:]
data.columns = ["texts", "data_labels"]

data.data_labels = data.data_labels.replace(0, 2)   # Neutral 2, Positive 1, Negative 0
data.data_labels = data.data_labels.replace(-1, 0)


X = data.texts.values
y = data.data_labels.values

preprocessed_data, masks = preprocessing_for_bert(X, tokenizer_en, text_preprocessing_2, max_len=256)
pad = tokenizer_en.pad_token_id

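# Quick check of the remapping above (illustrative): the original labels -1/0/1 become
# 0 (negative), 2 (neutral) and 1 (positive). The order of the two replace() calls matters;
# reversing them would merge the original 0s with the remapped -1s.
print(pd.Series([-1, 0, 1]).replace(0, 2).replace(-1, 0).tolist())   # [0, 2, 1]
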
# In[52]:


kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
fold = 0

bests = []

for train_ids, val_ids in kfold.split(preprocessed_data):

    print("\n\tFOLD %d \n" % (fold))
    max_acc = -99

    X_train = get_indices(preprocessed_data, train_ids)
    y_train = get_indices(y, train_ids)
    train_masks = get_indices(masks, train_ids)

    X_val = get_indices(preprocessed_data, val_ids)
    y_val = get_indices(y, val_ids)
    val_masks = get_indices(masks, val_ids)


    # Order the validation data by length for faster validation
    X_val, y_val, val_masks = list(zip(*sorted(zip(X_val, y_val, val_masks), key=lambda x: len(x[0]))))
    X_val, y_val, val_masks = list(X_val), list(y_val), list(val_masks)


    # Convert the labels to torch.Tensor
    y_train = torch.tensor(y_train)
    y_val = torch.tensor(y_val)

    # Create the DataLoader for the training set
    train_data = BertDataset(X_train, train_masks, y_train)
    train_sampler = KSampler(train_data, batch_size)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, collate_fn=data_collator)

    # Create the DataLoader for the validation set
    val_data = BertDataset(X_val, val_masks, y_val)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size, collate_fn=data_collator)


    set_seed()  # Set seed for reproducibility
    bert_classifier, optimizer, scheduler = initialize_model(model_name=model_name_en, epochs=n_epochs, dropout=0.05)
    train(bert_classifier, train_dataloader, val_dataloader, epochs=n_epochs, evaluation=True, fold=fold, prefix="en")

    fold += 1
    bests.append(max_acc)

# In[53]:


bests


# In[54]:


data = pd.read_csv("train_data.csv")[["converted", "data_labels"]].iloc[1000:]
data.columns = ["texts", "data_labels"]

data.data_labels = data.data_labels.replace(0, 2)   # Neutral 2, Positive 1, Negative 0
data.data_labels = data.data_labels.replace(-1, 0)


X = data.texts.values
y = data.data_labels.values

preprocessed_data, masks = preprocessing_for_bert(X, tokenizer_ar, lambda x: x, max_len=256)
pad = tokenizer_ar.pad_token_id

# In[55]:


kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
fold = 0

bests = []

for train_ids, val_ids in kfold.split(preprocessed_data):

    print("\n\tFOLD %d \n" % (fold))
    max_acc = -99

    X_train = get_indices(preprocessed_data, train_ids)
    y_train = get_indices(y, train_ids)
    train_masks = get_indices(masks, train_ids)

    X_val = get_indices(preprocessed_data, val_ids)
    y_val = get_indices(y, val_ids)
    val_masks = get_indices(masks, val_ids)


    # Order the validation data by length for faster validation
    X_val, y_val, val_masks = list(zip(*sorted(zip(X_val, y_val, val_masks), key=lambda x: len(x[0]))))
    X_val, y_val, val_masks = list(X_val), list(y_val), list(val_masks)


    # Convert the labels to torch.Tensor
    y_train = torch.tensor(y_train)
    y_val = torch.tensor(y_val)

    # Create the DataLoader for the training set
    train_data = BertDataset(X_train, train_masks, y_train)
    train_sampler = KSampler(train_data, batch_size)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, collate_fn=data_collator)

    # Create the DataLoader for the validation set
    val_data = BertDataset(X_val, val_masks, y_val)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size, collate_fn=data_collator)


    set_seed()  # Set seed for reproducibility
    bert_classifier, optimizer, scheduler = initialize_model(model_name=model_name_ar, epochs=n_epochs, dropout=0)
    train(bert_classifier, train_dataloader, val_dataloader, epochs=n_epochs, evaluation=True, fold=fold, prefix="ar")

    fold += 1
    bests.append(max_acc)

# In[56]:


bests


# In[ ]:


# In[57]:


def bert_single_predict(model, test_dataloader):

    model.eval()

    all_logits = []

    for batch in tqdm(test_dataloader):

        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits)

    all_logits = torch.cat(all_logits, dim=0)

    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

# In[58]:


def bert_ensemble_predict(sentences, models, tokenizer, preprocess, truncate=True, max_len=256):

    inputs, masks = preprocessing_for_bert(sentences, tokenizer, preprocess, max_len=max_len)


    dataset = BertDataset(inputs, masks)
    sample = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sample, batch_size=128, collate_fn=data_collator)

    preds = []

    for model in models:
        preds.append(bert_single_predict(model, dataloader))

    return preds

# In[59]:


def predict_lang(lang_prefix, directory, preprocess_fn, dataset, model_name, n=1, truncate=True, max_len=256):

    print("Loading the models ....")

    global pad
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
    pad = tokenizer.pad_token_id

    lang_models = []
    for i in range(n):
        lang_models.append(torch.load(directory + "/" + lang_prefix + "best_" + str(i), map_location=device))

    print("Inference ....")

    out = bert_ensemble_predict(dataset, lang_models, tokenizer, preprocess_fn, truncate=truncate, max_len=max_len)

    out_sum = out[0]
    for i in range(1, n):
        out_sum = out[i] + out_sum

    return out_sum

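# Minimal sketch with made-up numbers (not model output): predict_lang sums the per-fold
# softmax probabilities, so taking the argmax of the sum is soft voting, equivalent to
# averaging the fold probabilities.
_fold_probs = [np.array([[0.2, 0.5, 0.3]]), np.array([[0.1, 0.3, 0.6]])]
print(sum(_fold_probs).argmax(1))   # [2]: class 2 wins on the summed probabilities
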
# In[60]:


# Sort the test set by length for faster inference
df = pd.read_csv("../input/zindidd/Test.csv")
df_converted = pd.read_csv("test_data.csv")

df["lens"] = df.textt.apply(len)
df = df.sort_values(by="lens").set_index("IDD", drop=True)
df_converted = df_converted.set_index("IDD", drop=True).loc[df.index]


# Convert to lists
test = df.textt.tolist()
test_converted = df_converted[["converted"]].converted.tolist()

# In[61]:


output = predict_lang("ar_", "./", lambda x: x, test_converted, model_name_ar, n=5, truncate=True, max_len=512)


# In[62]:


df["preds"] = output.argmax(1)

# Map the classes back to the competition labels: 0 -> -1 (negative), 2 -> 0 (neutral)
df.preds = df.preds.replace(0, -1)
df.preds = df.preds.replace(2, 0)

the_output = df.reset_index()[["IDD", "preds"]]
the_output.columns = ["ID", "label"]

the_output.to_csv("lessvalid_convvalid150.csv", index=False)

# In[63]:


# Differences between this notebook and the "Extra BERT" one:
# - Text preprocessing: version one is used here, version two in the other
# - This one uses dropout, the other does not
# - This one uses a 50-dimensional final layer, the other a 32-dimensional one
# - The seed value used for both the KFold splits and set_seed

# Suggestion: sub1 (first fold, allnote v13), sub2 (first fold, allnote v13 + no dropout),
# sub3 (first fold, allnote v13 + no dropout + preprocess two),
# sub4 (first fold, allnote v13 + no dropout + preprocess two + dim 30), sub6 (ensemble, allnote v13)

# This would help in noticing the effect of different seed values (between sub1 and sub5), and, with a
# fixed seed, the difference that dropout, preprocess one vs. two, and the last dimension make in a submission.


# In[ ]:

