# preprocess: encode the sentences of a text file as fixed-length token sequences.

import string

# Default corpus file read by preprocess() when no file_path is passed.
IN_PATH = '42bin_haber/news/dunya/1.txt'

# Sentence boundary markers; not referenced by the functions visible in this file.
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'
# The special tokens below are written into encoded sequences as plain strings,
# next to the vocabulary indices of ordinary words (see encode_sentence()).
PAD_TOKEN = '[PAD]' # This is used to pad the encoder input, decoder input and target sequence
UNKNOWN_TOKEN = '[UNK]' # This is used to represent out-of-vocabulary(OOV) words
START_SEQUENCE = '[START]' # This is used to indicate start of sequence, feed this to decoder as initial token
STOP_SEQUENCE = '[STOP]' # This is used to indicate end of sequence

def preprocess(freq_thresh=5, file_path=IN_PATH, sentence_len=16, crop_end=True,
               vocab_path='full.txt'):
    """Encode every sentence of a text file as a fixed-length token sequence.

    A vocabulary is built from ``vocab_path``: each line is expected to hold
    at least two whitespace-separated fields, a word followed by its integer
    frequency. Words whose frequency is at least ``freq_thresh`` receive
    consecutive indices in file order, so the mapping is identical on every
    run.

    The text in ``file_path`` is split on ``'.'``; each fragment is stripped
    of punctuation with ``clear_sentence`` and encoded with
    ``encode_sentence``. Empty fragments (e.g. the text after the final
    period) are encoded as well.

    Args:
        freq_thresh: Minimum frequency a word needs to enter the vocabulary.
        file_path: Path of the text file to encode.
        sentence_len: Length of every encoded sequence (passed to
            ``encode_sentence`` as ``nn_input_size``).
        crop_end: Passed through to ``encode_sentence``.
        vocab_path: Path of the word/frequency file.

    Returns:
        A list with one encoded sequence per fragment, in file order.

    Raises:
        ValueError: If a frequency field in the vocabulary file is not an
            integer.
    """
    # NOTE(review): both files are opened with the platform default encoding;
    # the corpus is presumably non-ASCII text -- confirm and pin the encoding.
    word2index = {}
    with open(vocab_path) as vocab_file:
        for line in vocab_file:
            fields = line.split()
            if len(fields) < 2:
                # Blank or truncated line: there is no frequency to compare.
                continue
            word = fields[0]
            if int(fields[1]) >= freq_thresh and word not in word2index:
                # Next free index; relies on file order rather than on set
                # iteration order, which varies with string hash randomisation.
                word2index[word] = len(word2index)

    with open(file_path) as input_file:
        sentences = input_file.read().split('.')

    return [
        encode_sentence(clear_sentence(sentence), word2index,
                        crop_end=crop_end, nn_input_size=sentence_len)
        for sentence in sentences
    ]

def clear_sentence(sentence):
    """Return *sentence* with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``;
    letters, digits and whitespace are left untouched.
    """
    # Mapping a code point to None makes str.translate delete that character.
    return sentence.translate(dict.fromkeys(map(ord, string.punctuation)))


def encode_sentence(sentence, word2index_dict, crop_end=True, nn_input_size=10):
    """Encode a sentence as a list of exactly ``nn_input_size`` tokens.

    The result starts with ``START_SEQUENCE``, followed by one entry per
    word (the value stored for the lower-cased word in ``word2index_dict``,
    or ``UNKNOWN_TOKEN`` when the word is missing), then ``STOP_SEQUENCE``,
    and is filled up with ``PAD_TOKEN``. Only ``nn_input_size - 2`` words
    fit; longer sentences are cropped.

    Args:
        sentence: Text to encode; words are separated by whitespace.
        word2index_dict: Mapping from lower-cased word to its index.
        crop_end: When exactly ``False``, an over-long sentence keeps its
            LAST words (the beginning is cropped). Any other value keeps
            the FIRST words (the end is cropped).
        nn_input_size: Length of the returned list.

    Returns:
        The encoded list of length ``nn_input_size``.

    Raises:
        AssertionError: If ``sentence`` is not a ``str`` or
            ``word2index_dict`` is not a ``dict``.
        IndexError: If ``nn_input_size`` is smaller than 1.
    """
    assert isinstance(sentence, str) and isinstance(word2index_dict, dict)
    words = sentence.split()

    v_encoded = [PAD_TOKEN] * nn_input_size
    v_encoded[0] = START_SEQUENCE
    v_encoded[-1] = STOP_SEQUENCE

    # Two positions are reserved for the start/stop markers.
    capacity = nn_input_size - 2
    if capacity <= 0:
        return v_encoded

    start_index = 0
    if crop_end is False:
        # Keep the last `capacity` words. Clamping at 0 matters: a negative
        # start would wrap around and pick words from the wrong end of a
        # sentence that is shorter than the window.
        start_index = max(len(words) - capacity, 0)

    window = words[start_index:start_index + capacity]
    for position, word in enumerate(window, start=1):
        index = word2index_dict.get(word.lower())
        v_encoded[position] = UNKNOWN_TOKEN if index is None else index

    if len(window) < capacity:
        # Short sentence: the stop marker goes right after the last word and
        # the final slot reverts to padding.
        v_encoded[len(window) + 1] = STOP_SEQUENCE
        v_encoded[-1] = PAD_TOKEN

    return v_encoded


# Script driver: runs at import time, encodes the default corpus keeping the
# tail of over-long sentences, and prints one encoded sequence per line.
rety = preprocess(crop_end=False)

for _encoded in rety:
    print(_encoded)