Advertisement
Guest User

preprocess

a guest
Mar 26th, 2019
93
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.37 KB | None | 0 0
  1. import string
  2.  
  3. IN_PATH = '42bin_haber/news/dunya/1.txt'
  4.  
  5. SENTENCE_START = '<s>'
  6. SENTENCE_END = '</s>'
  7. PAD_TOKEN = '[PAD]' # This is used to pad the encoder input, decoder input and target sequence
  8. UNKNOWN_TOKEN = '[UNK]' # This is used to represent out-of-vocabulary(OOV) words
  9. START_SEQUENCE = '[START]' # This is used to indicate start of sequence, feed this to decoder as initial token
  10. STOP_SEQUENCE = '[STOP]' # This is used to indicate end of sequence
  11.  
  12. def preprocess(freq_thresh=5,file_path=IN_PATH,sentence_len=16,crop_end=True):
  13.  
  14.     with open('full.txt') as full:
  15.         uniques = list(set(full))
  16.  
  17.     words = [i.split()[0] for i in uniques if int(i.split()[1]) >= freq_thresh]
  18.  
  19.     word2index = {word: index for index, word in enumerate(words)}
  20.     index2word = {index: word for index, word in enumerate(words)}
  21.  
  22.     with open(file_path) as input_file:
  23.         sentences = input_file.read().split('.')
  24.  
  25.     result = []
  26.     for i,sentence in enumerate(sentences):  #looping over sentences
  27.  
  28.         #print("\n\nSENT",sentence)
  29.         sentence = clear_sentence(sentence)
  30.         #sentence = "merhaba koşarak gel at bin"
  31.         my_enc = encode_sentence(sentence,word2index,crop_end=crop_end,nn_input_size=sentence_len)
  32.         #print(my_enc)
  33.         result.append(my_enc)
  34.         deced = [index2word.get(words) for words in my_enc]
  35.         #print("verify :",deced)
  36.  
  37.     return result
  38.     #clear_sentence("asdsad, adfg...")
  39.  
  40. def clear_sentence(sentence):
  41.  
  42.     table = str.maketrans('','',string.punctuation)
  43.     newss = sentence.translate(table)
  44.     #print(newss)
  45.     return newss
  46.  
  47.  
  48. def encode_sentence(sentence,word2index_dict,crop_end=True,nn_input_size=10):
  49.  
  50.     assert isinstance(sentence, str) and isinstance(word2index_dict,dict)
  51.     words = sentence.split()
  52.     v_encoded = [PAD_TOKEN for i in range(nn_input_size)]
  53.     v_encoded[0] = START_SEQUENCE
  54.     v_encoded[-1]= STOP_SEQUENCE
  55.  
  56.     start_index = 0
  57.     if crop_end is False:
  58.         start_index = len(words) - nn_input_size
  59.  
  60.     for counter, word in enumerate(v_encoded[1:-1]):  # looping over words of sentence
  61.         #print(counter)
  62.         try:
  63.             word_now = word2index_dict.get(words[counter + start_index].lower())
  64.             if word_now is None:
  65.                 word_now = UNKNOWN_TOKEN
  66.             #print("\nwordy",word_now)
  67.         except IndexError:
  68.             v_encoded[counter + 1] = STOP_SEQUENCE
  69.             v_encoded[-1] = PAD_TOKEN
  70.             break
  71.  
  72.         v_encoded[counter + 1] = word_now
  73.  
  74.     return v_encoded
  75.  
  76.  
  77. rety =preprocess(crop_end=False)
  78.  
  79. [print(i) for i in rety]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement