Advertisement
Th3NiKo

Linear regression - paranormal or sceptic challenge

Apr 5th, 2020
276
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.92 KB | None | 0 0
  1. #!/usr/bin/python3
  2. '''
  3. Linear regression for paranormal and sceptic challange 2.0.0
  4. In order to use train.py you need to pass two columns
  5. label   document
  6. splited by \t
  7. Commands used: xzcat, paste
  8. '''
  9.  
  10. import sys
  11. import pickle
  12. import random
  13. import collections
  14. from tokenizer import tokenize
  15.  
  16.  
  17. def train():
  18.     #Prepare
  19.     vocabulary = set()
  20.     word_to_index_mapping = {}
  21.     index_to_word_mapping = {}
  22.     word_count = collections.defaultdict(int)
  23.  
  24.     #Array x,y to use later for training process
  25.     x = []
  26.     y = []
  27.  
  28.     learning_rate = 0.0001
  29.  
  30.     #Read values from file
  31.     for line in sys.stdin:
  32.         line = line.rstrip()
  33.         fields = line.split('\t')
  34.         label = fields[0]
  35.         document = fields[1]
  36.         terms = tokenize(document)
  37.  
  38.         #Add words from document to x and label to y (we need to reconfigure label p is 1 and s is 0)
  39.         x.append(terms)
  40.         if label == "P":
  41.             y.append(1)
  42.         else:
  43.             y.append(0)
  44.  
  45.         #Update vocabulary and count how often word appear
  46.         for t in terms:
  47.             word_count[t] += 1
  48.             vocabulary.add(t)
  49.  
  50.     #Give numbers for words. Each word its own value. Indexing
  51.     ix = 1
  52.     for w in vocabulary:
  53.         word_to_index_mapping[w] = ix
  54.         index_to_word_mapping[ix] = w
  55.         ix += 1
  56.  
  57.     #Initialize weights with random values from -1.0 to 1.0 (floats)
  58.     weights = []
  59.     for ix in range(0,len(vocabulary) + 1):
  60.         weights.append(random.uniform(-1.00, 1.00))
  61.    
  62.     Loss_sum = 0.0
  63.     Loss_sum_counter = 1
  64.  
  65.     while True:
  66.         choose_random_example = random.randint(0,len(x)-1)
  67.         actual_x = x[choose_random_example] #list of words
  68.         actual_y = y[choose_random_example] #label for this set of words
  69.  
  70.         #Predict result
  71.         y_predicted = weights[0]
  72.  
  73.         #Iterate over all words in randomly choosen example
  74.         #With get u can avoid missing words and replace them with value u want
  75.         #Weights replace value doesn't matter if word is missing cause word_count will give 0
  76.         for word in actual_x:
  77.             y_predicted += weights[word_to_index_mapping.get(word,0)] * (word_count.get(word,0) / len(word_count))
  78.  
  79.         #Cost count. Check how good was our prediction
  80.         Loss = (y_predicted - actual_y) ** 2.0
  81.         print(Loss_sum_counter)
  82.         print(Loss + "\n")
  83.         Loss_sum_counter += 1
  84.  
  85.         #Update weights
  86.         delta = (y_predicted - actual_y) * learning_rate
  87.         weights[0] = weights[0] - delta
  88.         for word in actual_x:
  89.             if word in word_to_index_mapping:
  90.                 weights[word_to_index_mapping[word]] -= ((word_count[word] / len(word_count)) * delta)
  91.  
  92.         if Loss_sum_counter > 10000:
  93.             break
  94.        
  95.  
  96.     #We save only things we need for prediction
  97.     model = (weights, word_to_index_mapping, word_count)
  98.     pickle.dump(model, open("model.pkl", "wb"))
  99.  
  100.  
  101. train()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement