import re
import fastText
import morfeusz2
import pandas as pd
import numpy as np
from tqdm import tqdm
import unicodedata
from skmultilearn.adapt import MLkNN  # used only by the alternative TF-IDF pipeline sketched below
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csc_matrix
import os.path
import operator
import time
import io


def dummy_fun(doc):
    # Identity tokenizer/preprocessor: lets TfidfVectorizer consume
    # pre-tokenized documents unchanged.
    return doc


def main():
    # End-to-end run: build the fastText train/test files, train the model,
    # predict labels for the test ads, and dump a submission CSV.
    prepare_train_data()
    prepare_test_data()
    train_model()
    test = get_test_data()
    test_data = get_test_data_ft()
    predicted = predict_test(test_data)
    # for pred in predicted:
    #     print(pred)

    with io.open('eggs.csv', 'w', encoding="utf-8") as file:
        for ad_id, labels in zip(test["id"], predicted):
            labels = labels_to_string(labels)
            file.write(str(ad_id) + "," + labels + "\n")
            print(str(ad_id) + "," + labels)
    # for labels in enumerate(results):
    #     print(labels_to_string(labels))
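    # Note (assumed submission format): eggs.csv ends up with one
    # "id,label1 label2 ..." row per test ad, without a header line.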


def get_stopwords():
    # The stopword list is Polish, so read it explicitly as UTF-8.
    with open('data/stopwords.txt', encoding='utf-8') as f:
        stopwords = f.readlines()
    stopwords = [x.strip() for x in stopwords]
    return stopwords


def get_train_data():
    train_data = pd.read_csv('data/dataninja2019_ads_train.csv')
    return train_data


def get_test_data():
    test_data = pd.read_csv('data/dataninja2019_ads_test.csv')
    return test_data


def get_test_data_ft(file_name='data/FastText_test.txt'):
    with open(file_name, 'r', encoding='utf8') as f:
        return f.readlines()


def train_model():
    # Supervised fastText with word bigrams and hierarchical softmax loss.
    model = fastText.train_supervised('data/FastText_train.txt', epoch=50, lr=0.02,
                                      wordNgrams=2, minCount=2,
                                      label='__label__', loss='hs')
    model.save_model('model.bin')
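

# A minimal evaluation sketch, assuming a held-out file in the same
# __label__ format exists (the 'data/FastText_valid.txt' path is
# hypothetical; this script never creates it). fastText's test() reports
# precision/recall at k. Not called from main().
def evaluate_model(valid_path='data/FastText_valid.txt', k=5):
    model = fastText.load_model('model.bin')
    n, precision, recall = model.test(valid_path, k)
    print('samples:', n, 'P@%d:' % k, precision, 'R@%d:' % k, recall)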


def prepare_train_data():
    # Convert the training CSV into fastText's supervised format:
    # "__label__tag ,  <tokenized title> <tokenized description>".
    stopwords = get_stopwords()
    morfeusz = morfeusz2.Morfeusz()
    X = get_train_data()[['title', 'description', 'labels']]
    with open("data/FastText_train.txt", "w", encoding='utf-8') as file:
        for index, row in tqdm(X.iterrows()):
            text = []
            # pd.notna() is the reliable missing-value check here: pandas
            # stores empty CSV fields as NaN, which is not None.
            if pd.notna(row['labels']):
                for label in str(row['labels']).split():
                    text.append(''.join(['__label__', str(label), ' , ']))
            if pd.notna(row['title']):
                text.append(tokenize(row['title'], morfeusz, stopwords))
            if pd.notna(row['description']):
                text.append(tokenize(row['description'], morfeusz, stopwords))
            text.append('\n')
            file.write(" ".join(text))
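

# Example line in data/FastText_train.txt (made-up ids and text, for
# illustration only):
#     __label__42 ,  __label__7 ,  sprzedac rower gorski stan idealny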


def prepare_test_data():
    # Same tokenization as prepare_train_data(), but without labels.
    stopwords = get_stopwords()
    morfeusz = morfeusz2.Morfeusz()
    X = get_test_data()[['title', 'description']]
    with open("data/FastText_test.txt", "w", encoding='utf8') as file:
        for index, row in tqdm(X.iterrows()):
            text = []
            if pd.notna(row['title']):
                text.append(tokenize(row['title'], morfeusz, stopwords))
            if pd.notna(row['description']):
                text.append(tokenize(row['description'], morfeusz, stopwords))
            text.append('\n')
            file.write(" ".join(text))


def predict_test(test_texts):
    model = fastText.load_model('model.bin')
    results = []
    print('test text len ', len(test_texts))
    for text in tqdm(test_texts):
        # predict() returns a (labels, probabilities) pair; keep labels only.
        tmp = model.predict(str(text).replace('\n', ''), k=5)[0]
        results.append(tmp)
    return results
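

# Variant sketch: keep the probabilities too and drop low-confidence tags.
# The 0.1 threshold is an assumption, not a value tuned in this script.
def predict_test_with_threshold(test_texts, threshold=0.1):
    model = fastText.load_model('model.bin')
    results = []
    for text in tqdm(test_texts):
        labels, probs = model.predict(str(text).replace('\n', ''), k=5)
        results.append([l for l, p in zip(labels, probs) if p >= threshold])
    return results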


def labels_to_string(labels):
    # Strip the '__label__' prefix and join the predicted tags with spaces.
    new_labels = []
    for label in labels:
        new_labels.append(str(label).replace('__label__', ''))
    return " ".join(new_labels)


def labelise(dataset):
    labelByAd = {}
    label_dict = {}
    listOfLabels = []
    iterator = 0
    # For each value with the tag 'labels' an entry is added
    # to the dictionary with the number of occurrences of the given tag.
    # If the entry contains more than one label then the string is split.
    for label in tqdm(dataset['labels'].values):
        label = str(label)  # Coerce everything (including NaN) to a string.
        # Split the string into individual labels.
        single_labels = label.split()
        listOfLabels.append(single_labels)
        labelByAd[iterator] = single_labels
        iterator = iterator + 1
        # Count occurrences; split() already covers both the single-label
        # and the multi-label case, so no separate branches are needed.
        for single in single_labels:
            if single in label_dict:
                label_dict[single] += 1
            else:
                label_dict[single] = 1

    return label_dict, labelByAd, listOfLabels
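

# Usage sketch (nothing in main() calls labelise): inspect the most
# frequent tags in the training data.
#     label_dict, label_by_ad, list_of_labels = labelise(get_train_data())
#     for tag, count in sorted(label_dict.items(),
#                              key=operator.itemgetter(1), reverse=True)[:10]:
#         print(tag, count)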


def generateY(dataset, labels):
    # Build a sparse binary indicator matrix: entry (i, j) is 1 when
    # ad i carries label j.
    X_s = []
    Y_s = []

    for i, word in tqdm(enumerate(dataset['labels'])):
        word = str(word).split()
        for j, label in enumerate(labels):
            if label in word:
                X_s.append(i)
                Y_s.append(j)
    Y = csc_matrix((np.ones(len(Y_s)), (X_s, Y_s)), dtype=np.int_)
    print(Y)
    return Y
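

# Toy example: for dataset['labels'] = ['a b', 'b'] and labels = ['a', 'b'],
# generateY produces the 2x2 indicator matrix [[1, 1], [0, 1]].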


def normalize_dataset(dataset):
    # Alternative preprocessing path for the TF-IDF pipeline: lowercase,
    # lemmatise, drop stopwords, and keep each description as a token list
    # (dummy_fun in transform_to_tfidf expects pre-tokenized input; the
    # original code also called tokenize() here with the wrong arity,
    # which would have raised a TypeError).
    stopwords = lemmatise_stopwords(get_stopwords())
    for i in tqdm(dataset.index):
        value = dataset.at[i, 'description']
        description_array = re.split(r'\W+', str(value).lower())
        description_array = lemmatise(description_array)
        description_array = remove_stopwords(stopwords, description_array)
        dataset.at[i, 'description'] = description_array
    return dataset['description']


def tokenize(text, morfeusz, stopwords):
    # Lowercase, split on non-word characters, lemmatise with Morfeusz,
    # strip Polish diacritics, and drop stopwords.
    words = re.split(r'\W+', str(text).lower())
    tokens = []
    for word in words:
        if word != ' ' and word != '':
            # [0][2][1] picks the lemma of the first Morfeusz interpretation.
            word = morfeusz.analyse(word)[0][2][1]
            word = unicodedata.normalize('NFKD', word).replace(u'ł', 'l').encode('ascii', 'ignore').decode("utf-8")
            if word not in stopwords and word != '':
                tokens.append(word)
    return " ".join(tokens)
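

# Example (behaviour assumed from Morfeusz's first-interpretation lemma):
# tokenize('Sprzedam rower górski', morfeusz, stopwords) lemmatises each
# word and strips diacritics, yielding something like 'sprzedac rower gorski'.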


# Module-level analyzer shared by lemmatise(); constructing Morfeusz is
# relatively expensive, so do it once.
morf = morfeusz2.Morfeusz()


def lemmatise(description_array):
    # Replace every word with the lemma of its first Morfeusz interpretation
    # and drop words Morfeusz cannot analyse. Building a new list avoids the
    # original's remove-while-iterating bug, and the condition now checks
    # len(result) rather than len(word), matching lemmatise_stopwords().
    lemmas = []
    for word in description_array:
        result = morf.analyse(word)
        if len(result) > 0:
            lemmas.append(result[0][2][1])
    return lemmas


def remove_stopwords(stopwords, description_array):
    # Filter with a comprehension; the original remove-while-iterating
    # loop silently skipped elements.
    return [word for word in description_array if word not in stopwords]


def lemmatise_stopwords(stopwords):
    # Lemmatise the stopword list itself so it matches lemmatised text,
    # dropping unanalysable words, then deduplicate and sort. A new list
    # again avoids mutating the input while iterating over it.
    morpheus = morfeusz2.Morfeusz()
    lemmatised = []
    for word in stopwords:
        result = morpheus.analyse(word)
        if len(result) > 0:
            lemmatised.append(result[0][2][1])
    lemmatised = sorted(set(lemmatised))
    print(lemmatised)
    return lemmatised


def transform_to_tfidf(X):
    # Vectorize pre-tokenized documents with a fixed, cached vocabulary.
    tfidf = TfidfVectorizer(
        analyzer='word',
        token_pattern=None,
        tokenizer=dummy_fun,
        preprocessor=dummy_fun,
        vocabulary=get_tfidf_vocabulary(X, 1000))
    tfidf.fit(X)
    X = tfidf.transform(X)
    return X
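

# Hedged sketch of the alternative multi-label pipeline that the otherwise
# unused MLkNN import points at; nothing in main() runs this, and k=3 is an
# assumed hyperparameter, not a value tuned anywhere in this script.
def train_mlknn_sketch():
    train = get_train_data()
    label_dict, _, _ = labelise(train)
    Y = generateY(train, list(label_dict.keys()))
    X = transform_to_tfidf(normalize_dataset(train))
    classifier = MLkNN(k=3)
    classifier.fit(X, Y)  # expects sparse feature and indicator matrices
    return classifier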


def get_tfidf_vocabulary(X, max_features):
    # Build (or load a cached copy of) a max_features-term vocabulary.
    filename = "vocabulary.npy"
    filename = "".join([str(max_features), "_", filename])
    path = "/".join(['data', filename])
    if os.path.isfile(path):
        return np.load(path)
    tfidf = TfidfVectorizer(
        analyzer='word',
        tokenizer=dummy_fun,
        preprocessor=dummy_fun,
        token_pattern=None)
    tfidf.fit(X)
    # Note: vocabulary_ maps term -> column index, and sklearn assigns
    # indices in alphabetical order, so this keeps the alphabetically last
    # terms rather than the most frequent ones.
    sorted_vocabulary = sorted(tfidf.vocabulary_.items(), key=operator.itemgetter(1), reverse=True)
    vocabulary = []
    for i, (term, _) in enumerate(sorted_vocabulary):
        vocabulary.append(str(term))
        if i + 1 >= max_features:
            break
    np.save(path, vocabulary)
    return vocabulary


if __name__ == "__main__":
    start_time = time.time()
    main()
    print("Time: ", time.time() - start_time)