Advertisement
Guest User

Untitled

a guest
Oct 17th, 2019
132
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.98 KB | None | 0 0
  1. # Loading the data
  2. data = pandas.read_csv("transcripts.csv")
  3.  
  4. # Creating a pretrained tokenizer
  5. tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
  6.  
  7. indexed_tokens = []
  8. tokens_tensor = []
  9.  
  10. for i, transcript in enumerate(data['transcript']):
  11. # Removing parenthesis and blackslashes from transcripts
  12. data['transcript'][i] = re.sub(r" ?\([^)]+\)"," ", data['transcript'][i])
  13. data['transcript'][i] = re.sub('\'',"", data['transcript'][i])
  14.  
  15. input_text = ""
  16. for index, word in enumerate(data['transcript'][i].split()):
  17. input_text=input_text +" "+word
  18. #print("hÄR ÄR DEN",input_text)
  19. if index>600:
  20. break
  21. if i==0:
  22. print(len(input_text))
  23.  
  24. #if i>10:
  25. # break
  26.  
  27. # Encoding the transcripts
  28. indexed_tokens.append(tokenizer.encode(input_text))
  29.  
  30. # Convert indexed tokens into a pytorch tensors
  31. tokens_tensor.append(torch.tensor([indexed_tokens[i]]))
  32. print(len(tokens_tensor[0][:]))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement