Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import string
# Default input text file whose sentences get encoded.
IN_PATH = '42bin_haber/news/dunya/1.txt'

# Sentence boundary markers.
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

# Pads the encoder input, decoder input and target sequence.
PAD_TOKEN = '[PAD]'
# Stands in for out-of-vocabulary (OOV) words.
UNKNOWN_TOKEN = '[UNK]'
# Marks the start of a sequence; fed to the decoder as its initial token.
START_SEQUENCE = '[START]'
# Marks the end of a sequence.
STOP_SEQUENCE = '[STOP]'
def preprocess(freq_thresh=5, file_path=IN_PATH, sentence_len=16, crop_end=True):
    """Encode every sentence of a text file with a frequency-filtered vocabulary.

    The vocabulary is read from ``full.txt`` (relative to the current working
    directory). On each line the first whitespace-separated field is taken as
    the word and the second as an integer count; words whose count is at
    least ``freq_thresh`` are kept and numbered consecutively from 0.

    The input file is split on ``'.'``; every piece (including empty ones,
    e.g. the text after the final period) has its punctuation stripped with
    ``clear_sentence`` and is encoded with ``encode_sentence``.

    Args:
        freq_thresh: Minimum count a word needs to enter the vocabulary.
        file_path: Path of the text file to encode.
        sentence_len: Length of each encoded sentence (passed to
            ``encode_sentence`` as ``nn_input_size``).
        crop_end: Forwarded to ``encode_sentence``.

    Returns:
        A list with one encoded sentence (itself a list) per piece of text.

    Raises:
        ValueError: If the count field of a vocabulary line is not an integer.
    """
    # NOTE(review): both files are opened with the platform default encoding;
    # confirm whether an explicit encoding is needed for the corpus.
    with open('full.txt') as full:
        # Sort the de-duplicated lines: the iteration order of a set of
        # strings changes between interpreter runs (hash randomization), which
        # would otherwise assign different indices to the same word per run.
        unique_lines = sorted(set(full))

    words = []
    for line in unique_lines:
        fields = line.split()
        if len(fields) < 2:
            # Blank or incomplete line: nothing to add to the vocabulary.
            continue
        if int(fields[1]) >= freq_thresh:
            words.append(fields[0])
    word2index = {word: index for index, word in enumerate(words)}

    with open(file_path) as input_file:
        sentences = input_file.read().split('.')

    return [
        encode_sentence(
            clear_sentence(sentence),
            word2index,
            crop_end=crop_end,
            nn_input_size=sentence_len,
        )
        for sentence in sentences
    ]
- #clear_sentence("asdsad, adfg...")
def clear_sentence(sentence):
    """Return *sentence* with every ``string.punctuation`` character removed."""
    # Mapping a character to None tells str.translate to drop it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return sentence.translate(drop_punctuation)
def encode_sentence(sentence, word2index_dict, crop_end=True, nn_input_size=10):
    """Encode *sentence* as a fixed-length list of word indices and marker tokens.

    The result has ``nn_input_size`` entries: ``START_SEQUENCE`` first, then
    one entry per kept word (its value in ``word2index_dict`` looked up by the
    lower-cased word, or ``UNKNOWN_TOKEN`` when the word is missing), then
    ``STOP_SEQUENCE``, then ``PAD_TOKEN`` up to the requested length.

    Args:
        sentence: Whitespace-separated text to encode.
        word2index_dict: Mapping from lower-cased word to its index.
        crop_end: Decides which words survive when the sentence has more
            words than fit. Anything other than ``False`` keeps the leading
            words (the end is cropped); ``False`` keeps the trailing words.
        nn_input_size: Total length of the returned list, markers included,
            so at most ``nn_input_size - 2`` words are kept.

    Returns:
        A list of length ``nn_input_size`` mixing dictionary values and
        marker-token strings.
    """
    assert isinstance(sentence, str) and isinstance(word2index_dict, dict)

    words = sentence.split()
    # Two slots are reserved for the start and stop markers.
    capacity = max(nn_input_size - 2, 0)

    start_index = 0
    if crop_end is False:
        # Keep the LAST `capacity` words. Clamp at zero so a short sentence
        # is encoded in full instead of indexing from the end of the list.
        start_index = max(len(words) - capacity, 0)
    kept_words = words[start_index:start_index + capacity]

    v_encoded = [PAD_TOKEN] * nn_input_size
    v_encoded[0] = START_SEQUENCE
    v_encoded[-1] = STOP_SEQUENCE

    for position, word in enumerate(kept_words, start=1):
        word_index = word2index_dict.get(word.lower())
        v_encoded[position] = UNKNOWN_TOKEN if word_index is None else word_index

    if len(kept_words) < capacity:
        # Sentence ended early: stop right after its last word and turn the
        # final slot back into padding.
        v_encoded[len(kept_words) + 1] = STOP_SEQUENCE
        v_encoded[-1] = PAD_TOKEN

    return v_encoded
# Encode the default input file, keeping the trailing words of long
# sentences, and print one encoded sentence per line.
rety = preprocess(crop_end=False)
for encoded_sentence in rety:
    print(encoded_sentence)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement