import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re


# Read one line from an open file and append it to the list x.
# Returns the line itself, so the caller's while loop stops at EOF
# (readline() returns an empty string there). The cnt argument is unused.
def read_line(data, x, cnt):
    y = data.readline()
    if len(y) != 0:
        x.append(y)
    return y
- f = open("train_samples.txt", "r", encoding='utf-8')
- if f.mode == 'r':
- cnt = 0
- train_samples = []
- while read_line(f, train_samples, cnt):
- cnt += 1
- f = open("train_labels.txt", "r", encoding='utf-8')
- if f.mode == 'r':
- cnt = 0
- train_labels = []
- while read_line(f, train_labels, cnt):
- cnt += 1
- f = open("validation_samples.txt", "r", encoding='utf-8')
- if f.mode == 'r':
- cnt = 0
- validation_samples = []
- while read_line(f, validation_samples, cnt):
- cnt += 1
- f = open("validation_labels.txt", "r", encoding='utf-8')
- if f.mode == 'r':
- cnt = 0
- validation_labels = []
- while read_line(f, validation_labels, cnt):
- cnt += 1
- f = open("test_samples.txt", "r", encoding='utf-8')
- if f.mode == 'r':
- test_samples = []
- while read_line(f, test_samples, cnt):
- pass
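# Note (not part of the original script): each loading loop above could be
# replaced by a single readlines() call, which also closes the file; a minimal
# sketch, assuming the same file names:
#
#     with open("train_samples.txt", "r", encoding='utf-8') as fh:
#         train_samples = fh.readlines()
#
# The read_line helper loops are kept as-is so the rest of the script is unchanged.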
id_list = []

# Optional preprocessing of the training samples (left disabled):
# for idx in range(len(train_samples)):
#     train_samples[idx] = train_samples[idx].replace('$NE$', '')
#     # train_samples[idx] = re.sub('[0123456789!@+%#?,.:„”“";()]', '', train_samples[idx])
#     # train_samples[idx] = train_samples[idx].split()
#     # train_samples[idx] = list(
#     #     filter(lambda x: x != '.' and x.isdigit() == False and x not in cuvinte_stop, train_samples[idx]))

# train_labels parsing
# Keep only the label tokens: drop the leading id from every training label line.
for idx in range(len(train_labels)):
    train_labels[idx] = train_labels[idx].split()
    train_labels[idx].pop(0)

aux = []
for idx in range(len(train_labels)):
    aux.append(train_labels[idx])
train_labels = aux
# Optional preprocessing of the validation samples (left disabled):
# for idx in range(len(validation_samples)):
#     validation_samples[idx] = validation_samples[idx].replace('$NE$', '')
#     # validation_target_samples[idx] = re.sub('[0123456789!@+%#?,.:„”“";()]', '', validation_target_samples[idx])
#     # validation_target_samples[idx] = validation_target_samples[idx].split()
#     # validation_target_samples[idx] = list(filter(lambda x: x != '.' and x.isdigit() == False and x not in cuvinte_stop, validation_target_samples[idx]))
# Same id-stripping for the validation labels.
for idx in range(len(validation_labels)):
    validation_labels[idx] = validation_labels[idx].split()
    validation_labels[idx].pop(0)

aux = []
for idx in range(len(validation_labels)):
    aux.append(validation_labels[idx])
validation_labels = aux
print(len(test_samples))

# Remember each test sample's id (its first token) for the submission file.
for idx in range(len(test_samples)):
    # test_samples[idx] = test_samples[idx].replace('$NE$', '')
    line = test_samples[idx].split()
    id_list.append(line[0])
    # test_samples[idx] = re.sub('[0123456789!@+%#?,.:„”“";()]', '', test_samples[idx])
    # test_samples[idx] = test_samples[idx].split()
    # test_samples[idx] = list(filter(lambda x: x != '-' and x != '.' and x.isdigit() == False and x not in cuvinte_stop, test_samples[idx]))
# Merge the validation data into the training set before the final fit.
for idx in range(len(validation_samples)):
    train_samples.append(validation_samples[idx])
for idx in range(len(validation_labels)):
    train_labels.append(validation_labels[idx])
# Fit the TF-IDF vocabulary (word 1- to 3-grams) on the training texts,
# then reuse the same vocabulary to transform the test texts.
tf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
train_samples_tfidf = tf_vectorizer.fit_transform(train_samples)
print(train_samples_tfidf.shape)
# validation_samples_tfidf = tf_vectorizer.transform(validation_samples)
test_samples_tfidf = tf_vectorizer.transform(test_samples)
from sklearn import preprocessing
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import csv

# print(accuracy_score(predicted_labels, validation_source_labels))
print("gata")  # Romanian for "done"
# print(f1_score(predicted_labels, validation_target_labels))
# Candidate hyperparameter values for the (commented-out) manual grid search below.
C_param = 1
# svm_model = svm.LinearSVC(C=c, soft_marg)  # linear kernel
# svm_model.fit(train_samples_tfidf, np.ravel(train_labels))  # train
Loss_param = ['hinge']  # , 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'
Penalty_param = ['l2', 'l1', 'elasticnet']
Alpha_param = [0.0001, 0.001, 0.00001, 0.000001, 0.000005]
Fit_intercept_param = [True]  # , False
Max_iter_param = [1000]
Tol_param = [1e-3, 1e-4, 1e-2]
Shuffle_param = [True]  # , False
Verbose_param = [0]
Epsilon_param = [0.1, 0.5, 0.05, 0.005]
Learning_rate_param = ['optimal']
Eta0_param = [0, 1, 5, 20]
Power_t_param = [0.5, 0.1, 0.7, 1]
Average_param = [False]  # True, 5, 10, 100
# Best score and best hyperparameters found so far by the grid search.
maxim = 0
clf_maxim = 0
loss_max = 0
penalty_max = 0
alpha_max = 0
fit_intercept_max = 0
max_iter_max = 0
tol_max = 0
shuffle_max = 0
verbose_max = 0
epsilon_max = 0
learning_rate_max = 0
eta0_max = 0
power_t_max = 0
average_max = 0
nr_iteratii = 0
# Exhaustive manual grid search over the SGDClassifier hyperparameters,
# scored on the validation split (left disabled):
# for loss_param in Loss_param:
#     if nr_iteratii == 10000:
#         break
#     for penalty_param in Penalty_param:
#         if nr_iteratii == 10000:
#             break
#         for alpha_param in Alpha_param:
#             if nr_iteratii == 10000:
#                 break
#             for fit_intercept_param in Fit_intercept_param:
#                 if nr_iteratii == 10000:
#                     break
#                 for max_iter_param in Max_iter_param:
#                     if nr_iteratii == 10000:
#                         break
#                     for tol_param in Tol_param:
#                         if nr_iteratii == 10000:
#                             break
#                         for shuffle_param in Shuffle_param:
#                             if nr_iteratii == 10000:
#                                 break
#                             for verbose_param in Verbose_param:
#                                 if nr_iteratii == 10000:
#                                     break
#                                 for epsilon_param in Epsilon_param:
#                                     if nr_iteratii == 10000:
#                                         break
#                                     for learning_rate_param in Learning_rate_param:
#                                         if nr_iteratii == 10000:
#                                             break
#                                         for eta0_param in Eta0_param:
#                                             if nr_iteratii == 10000:
#                                                 break
#                                             for power_t_param in Power_t_param:
#                                                 if nr_iteratii == 10000:
#                                                     break
#                                                 for average_param in Average_param:
#                                                     if nr_iteratii == 10000:
#                                                         break
#                                                     clf = linear_model.SGDClassifier(
#                                                         loss=loss_param,
#                                                         penalty=penalty_param,
#                                                         alpha=alpha_param,
#                                                         fit_intercept=fit_intercept_param,
#                                                         max_iter=max_iter_param,
#                                                         tol=tol_param,
#                                                         shuffle=shuffle_param,
#                                                         verbose=verbose_param,
#                                                         epsilon=epsilon_param,
#                                                         learning_rate=learning_rate_param,
#                                                         eta0=eta0_param,
#                                                         power_t=power_t_param,
#                                                         average=average_param)
#                                                     clf.fit(train_samples_tfidf, np.ravel(train_labels))
#                                                     predicted_labels = clf.predict(validation_samples_tfidf)
#                                                     nr_iteratii += 1
#                                                     if nr_iteratii % 1000 == 0:
#                                                         print(nr_iteratii)
#                                                     if accuracy_score(predicted_labels, validation_labels) > maxim:
#                                                         maxim = accuracy_score(predicted_labels, validation_labels)
#                                                         clf_maxim = clf
#                                                         loss_max = loss_param
#                                                         penalty_max = penalty_param
#                                                         alpha_max = alpha_param
#                                                         fit_intercept_max = fit_intercept_param
#                                                         max_iter_max = max_iter_param
#                                                         tol_max = tol_param
#                                                         shuffle_max = shuffle_param
#                                                         verbose_max = verbose_param
#                                                         epsilon_max = epsilon_param
#                                                         learning_rate_max = learning_rate_param
#                                                         eta0_max = eta0_param
#                                                         power_t_max = power_t_param
#                                                         average_max = average_param
#                                                         print(maxim)
#
# print(maxim)
# print(loss_max)
# print(penalty_max)
# print(alpha_max)
# print(fit_intercept_max)
# print(max_iter_max)
# print(tol_max)
# print(shuffle_max)
# print(verbose_max)
# print(epsilon_max)
# print(learning_rate_max)
# print(eta0_max)
# print(power_t_max)
# print(average_max)
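# Note (a sketch, not part of the original script): the manual nested loops above
# could also be expressed with sklearn's GridSearchCV, which enumerates the
# parameter combinations and cross-validates them itself. The grid below only
# mirrors a few of the lists defined earlier:
# from sklearn.model_selection import GridSearchCV
# param_grid = {'penalty': Penalty_param, 'alpha': Alpha_param, 'tol': Tol_param}
# search = GridSearchCV(linear_model.SGDClassifier(loss='hinge'), param_grid,
#                       scoring='accuracy', cv=3)
# search.fit(train_samples_tfidf, np.ravel(train_labels))
# print(search.best_params_, search.best_score_)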
# Configurations tried earlier (kept for reference):
# clf = linear_model.SGDClassifier(loss='hinge', penalty='elasticnet', alpha=1e-05, fit_intercept=True, max_iter=1000, tol=0.01, shuffle=True, verbose=0, epsilon=0.005, learning_rate='optimal', eta0=0, power_t=0.1, average=False)
# clf = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-05, fit_intercept=True, max_iter=1000, tol=0.0001, shuffle=True, verbose=0, epsilon=0.005, learning_rate='optimal', eta0=0, power_t=0.5, average=False)
# clf = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-05, fit_intercept=True, max_iter=1000, tol=0.0001, shuffle=True, verbose=0, epsilon=0.5, learning_rate='optimal', eta0=0, power_t=0.5, average=False)

# Final model: SGDClassifier with default settings, trained on train + validation.
clf = linear_model.SGDClassifier()
clf.fit(train_samples_tfidf, np.ravel(train_labels))
predicted_labels = clf.predict(test_samples_tfidf)
# print(accuracy_score(predicted_labels, validation_labels))

# Write the submission file: one (id, predicted label) row per test sample.
with open('sample_submission.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(["id", "label"])
    for idx in range(len(predicted_labels)):
        writer.writerow([id_list[idx], predicted_labels[idx]])