Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/python3
'''
Linear regression for the paranormal vs. sceptic challenge 2.0.0

Input (stdin): two tab-separated columns per line, "label<TAB>document".
A label of "P" is mapped to 1, anything else to 0.
Output: model.pkl containing (weights, word_to_index_mapping, word_count).
Commands used: xzcat, paste
'''
import sys
import pickle
import random
import collections

from tokenizer import tokenize

# Stochastic-gradient-descent hyperparameters.
LEARNING_RATE = 0.0001
ITERATIONS = 10000


def train():
    """Read labelled documents from stdin, run SGD, and pickle the model.

    Side effects: prints the iteration number and loss for every step,
    and writes the trained model to model.pkl in the working directory.
    Exits with an error message if stdin yields no usable examples.
    """
    word_to_index_mapping = {}                  # word -> weight index (1-based; 0 is the bias)
    word_count = collections.defaultdict(int)   # word -> corpus-wide frequency

    # Training data: x holds tokenized documents, y their numeric labels.
    x = []
    y = []

    # --- Read and tokenize the training corpus --------------------------
    for line in sys.stdin:
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            # Skip blank or malformed lines instead of raising IndexError.
            continue
        label, document = fields[0], fields[1]
        terms = tokenize(document)

        x.append(terms)
        y.append(1 if label == "P" else 0)

        for term in terms:
            word_count[term] += 1

    if not x:
        sys.exit("train: no training examples on stdin")

    # --- Index the vocabulary; index 0 stays reserved for the bias ------
    for index, word in enumerate(word_count, start=1):
        word_to_index_mapping[word] = index

    vocabulary_size = len(word_count)

    # --- Initialize weights uniformly in [-1, 1] (slot 0 is the bias) ---
    weights = [random.uniform(-1.00, 1.00)
               for _ in range(vocabulary_size + 1)]

    # --- Stochastic gradient descent ------------------------------------
    for iteration in range(1, ITERATIONS + 1):
        example = random.randint(0, len(x) - 1)
        actual_x = x[example]   # list of words
        actual_y = y[example]   # 0/1 label for this document

        # Prediction: bias + sum of weight * normalized term frequency.
        # Unknown words fall back to index 0, but their count is 0 so
        # they contribute nothing either way.
        y_predicted = weights[0]
        for word in actual_x:
            y_predicted += (weights[word_to_index_mapping.get(word, 0)]
                            * (word_count.get(word, 0) / vocabulary_size))

        # Squared-error loss, printed for monitoring.
        # (Original did `Loss + "\n"` — a float + str TypeError.)
        loss = (y_predicted - actual_y) ** 2.0
        print(iteration)
        print(f"{loss}\n")

        # Gradient step on the bias and on every seen word's weight.
        delta = (y_predicted - actual_y) * LEARNING_RATE
        weights[0] -= delta
        for word in actual_x:
            if word in word_to_index_mapping:
                weights[word_to_index_mapping[word]] -= (
                    (word_count[word] / vocabulary_size) * delta)

    # Persist only what prediction needs; `with` guarantees the file closes.
    model = (weights, word_to_index_mapping, word_count)
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)


if __name__ == "__main__":
    train()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement