Advertisement
Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
# Load the transcript dataset (expects a 'transcript' column).
data = pandas.read_csv("transcripts.csv")

# Create a pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

indexed_tokens = []   # per-transcript lists of token ids
tokens_tensor = []    # per-transcript (1, seq_len) LongTensors

for i, transcript in enumerate(data['transcript']):
    # Strip parenthesised asides, e.g. "(Laughter)", then remove apostrophes.
    cleaned = re.sub(r" ?\([^)]+\)", " ", transcript)
    cleaned = re.sub(r"'", "", cleaned)
    # Write back with .at: chained indexing (data['transcript'][i] = ...)
    # raises SettingWithCopyWarning and may silently update a copy.
    data.at[i, 'transcript'] = cleaned

    # Keep at most the first 602 words (the original loop appended the word
    # at index 601 before breaking).  Build the string with one join instead
    # of quadratic repeated concatenation; a leading space is preserved to
    # match the original output exactly.
    words = cleaned.split()[:602]
    input_text = " " + " ".join(words) if words else ""
    if i == 0:
        print(len(input_text))

    # Encode the truncated transcript into GPT-2 token ids.
    indexed_tokens.append(tokenizer.encode(input_text))
    # Wrap the ids in a batch-of-one tensor for the model.
    tokens_tensor.append(torch.tensor([indexed_tokens[i]]))

print(len(tokens_tensor[0][:]))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement