Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # train_df['review_score'].value_counts()
- # 5.0 13.723
- # 1.0 6.074
- # 4.0 4.045
- # 3.0 2.444
- # 2.0 1.471
- import pickle
- import pandas as pd
- import re
- import numpy as np
- from unicodedata import normalize
- from sklearn.utils import shuffle
- from keras_preprocessing.text import text_to_word_sequence
class Nilc():
    """Text-cleaning pipeline for Portuguese review text.

    Filter rules adapted from
    https://github.com/nathanshartmann/portuguese_word_embeddings/blob/master/preprocessing.py
    """
    # Punctuation characters, escaped so they can be embedded in regex
    # character classes below.
    punctuations = re.escape('!"#%\'()*+,./:;<=>?@[\\]^_`{|}~')

    # ##### #
    # Regex #
    # ##### #
    re_remove_brackets = re.compile(r'\{.*\}')
    re_remove_html = re.compile(r'<(\/|\\)?.+?>', re.UNICODE)
    re_transform_numbers = re.compile(r'\d', re.UNICODE)
    re_transform_emails = re.compile(r'[^\s]+@[^\s]+', re.UNICODE)
    re_transform_url = re.compile(r'(http|https)://[^\s]+', re.UNICODE)
    # Several quote variants are normalized to the plain '"' character.
    re_quotes_1 = re.compile(r"(?u)(^|\W)[‘’′`']", re.UNICODE)
    re_quotes_2 = re.compile(r"(?u)[‘’`′'](\W|$)", re.UNICODE)
    re_quotes_3 = re.compile(r'(?u)[‘’`′“”]', re.UNICODE)
    re_dots = re.compile(r'(?<!\.)\.\.(?!\.)', re.UNICODE)
    re_punctuation = re.compile(r'([,";:]){2},', re.UNICODE)
    re_hiphen = re.compile(r' -(?=[^\W\d_])', re.UNICODE)
    re_tree_dots = re.compile('…', re.UNICODE)
    # Split punctuation away from adjacent words ("word!" -> "word !").
    re_punkts = re.compile(r'(\w+)([%s])([ %s])' %
                           (punctuations, punctuations), re.UNICODE)
    re_punkts_b = re.compile(r'([ %s])([%s])(\w+)' %
                             (punctuations, punctuations), re.UNICODE)
    re_punkts_c = re.compile(r'(\w+)([%s])$' % (punctuations), re.UNICODE)
    re_changehyphen = re.compile('–')
    re_doublequotes_1 = re.compile(r'(\"\")')
    re_doublequotes_2 = re.compile(r'(\'\')')
    re_trim = re.compile(r' +', re.UNICODE)
    re_nr = re.compile(r'\n\r')

    # Single-pass character normalization table.  Replaces the long chain
    # of individual str.replace calls (originally added by Kleyson); all
    # entries are independent single source characters ('ñ' expands to
    # 'não', the others map 1:1), so one translate() pass is equivalent.
    _char_map = str.maketrans({
        '\xa0': ' ', '\x93': ' ', '\x94': ' ', '\x96': ' ', '\t': ' ',
        'ñ': 'não', 'ò': 'o', 'ă': 'a', 'ä': 'a', 'è': 'é', 'ő': 'o',
        'ö': 'o', 'à': 'a', 'ì': 'i', 'å': 'a', 'ķ': 'o',
    })

    @staticmethod
    def clean_text(text):
        """Lower-case *text* and apply all the normalization rules above.

        Returns the cleaned string (never mutates the input).
        """
        text = text.lower()
        text = text.translate(Nilc._char_map)
        text = Nilc.re_nr.sub(' ', text)
        text = Nilc.re_tree_dots.sub('...', text)
        # BUGFIX: this substitution used a non-raw '\.\.\.' pattern
        # (invalid escape sequences, SyntaxWarning on Python 3.12+).
        # A literal replace removes every '...' occurrence just the same.
        text = text.replace('...', '')
        text = Nilc.re_remove_brackets.sub('', text)
        text = Nilc.re_changehyphen.sub('-', text)
        text = Nilc.re_remove_html.sub(' ', text)
        text = Nilc.re_transform_numbers.sub('0', text)
        text = Nilc.re_transform_url.sub('URL', text)
        text = Nilc.re_transform_emails.sub('EMAIL', text)
        text = Nilc.re_quotes_1.sub(r'\1"', text)
        text = Nilc.re_quotes_2.sub(r'"\1', text)
        text = Nilc.re_quotes_3.sub('"', text)
        # Drop the (now normalized) double quotes entirely.
        text = text.replace('"', '')
        text = Nilc.re_dots.sub('.', text)
        text = Nilc.re_punctuation.sub(r'\1', text)
        text = Nilc.re_hiphen.sub(' - ', text)
        text = Nilc.re_punkts.sub(r'\1 \2 \3', text)
        text = Nilc.re_punkts_b.sub(r'\1 \2 \3', text)
        text = Nilc.re_punkts_c.sub(r'\1 \2', text)
        text = Nilc.re_doublequotes_1.sub('"', text)
        text = Nilc.re_doublequotes_2.sub("'", text)
        text = Nilc.re_trim.sub(' ', text)
        return text.strip()

    def process(self, data):
        """Clean every element of *data*; returns a numpy array of str."""
        print('Starting Nilc processing.')
        return np.array([Nilc.clean_text(str(d)) for d in data])
- # -------------------------------------------------------------------
class RemoveAcentos():
    """Replace accented characters by their unaccented ASCII equivalents.

    WARNING: non-ASCII, non-alphanumeric glyphs (bullets, dashes,
    asymmetric quotes, etc.) are simply dropped by the ASCII encode step.
    """

    def process(self, data):
        """Strip accents from every element of *data*; returns a numpy array of str."""
        print('Starting RemoveAcentos processing.')
        cleaned = []
        for item in data:
            decomposed = normalize('NFKD', str(item))
            cleaned.append(decomposed.encode('ASCII', 'ignore').decode('ASCII'))
        return np.array(cleaned)
class ReplacePadroes():
    """Replace simple emoticon and control-character patterns with spaces."""

    # NOTE(review): re_eh, re_exclamacao and re_interrogacao are no longer
    # used by clean_text (their substitutions were disabled and the dead
    # commented-out code removed); the attributes are kept so any external
    # reference to them keeps working.
    re_eh = re.compile(r'\s+é\s+')
    re_feliz = re.compile(r':\)')          # happy emoticon
    re_triste = re.compile(r':\(')         # sad emoticon
    re_exclamacao = re.compile(r'!')
    re_interrogacao = re.compile(r'\?')
    re_r = re.compile(r'\r')
    re_n = re.compile(r'\n')

    @staticmethod
    def clean_text(text):
        """Blank out ':)'/':(' emoticons and CR/LF characters, then strip the ends."""
        text = ReplacePadroes.re_feliz.sub(' ', text)
        text = ReplacePadroes.re_triste.sub(' ', text)
        text = ReplacePadroes.re_r.sub(' ', text)
        text = ReplacePadroes.re_n.sub(' ', text)
        return text.strip()

    def process(self, data):
        """Apply clean_text to every element of *data*; returns a numpy array of str."""
        print('Starting ReplacePadroes processing.')
        return np.array([ReplacePadroes.clean_text(str(d)) for d in data])
class RemoveStopWords():
    """Drop stop words and non-alphabetic tokens from each text."""

    # Characters stripped out during tokenization.
    FILTER_CHARS = '!"#$%&()*+,-./:;<=>?@[\]^_`´{|}~ªº°§'

    def process(self, data, stopwords):
        """Tokenize each text, keep alphabetic non-stopword tokens, re-join with spaces."""
        print('Starting RemoveStopWords processing.')
        result = []
        for text in data:
            tokens = text_to_word_sequence(
                text, filters=RemoveStopWords.FILTER_CHARS)
            kept = [tok for tok in tokens
                    if tok not in stopwords and tok.isalpha()]
            result.append(' '.join(kept))
        return np.array(result)
class RemoveLetrasDuplicadas():
    """Remove duplicated letters (e.g. 'aa' -> 'a').

    The pairs 'rr' and 'ss', which are legitimate in Portuguese, are left
    alone; only over-long runs of r's or s's are trimmed.
    """

    # One alternation per case: any repeated char except r/s, then runs
    # of more than two r's, then runs of more than two s's.
    regex = r'([^rs])(?=\1+)|(rr)(?=r+)|(ss)(?=s+)'

    def process(self, data):
        """De-duplicate letters in every string of *data*; returns a numpy array."""
        print('Starting RemoveLetrasDuplicadas processing.')
        pattern = RemoveLetrasDuplicadas.regex
        cleaned = [re.sub(pattern, '', text, 0) for text in data]
        return np.array(cleaned)
def clean_text(texts):
    """Run the full preprocessing pipeline over an iterable of texts.

    Steps: Nilc regex cleaning -> emoticon/CRLF replacement -> stop-word
    removal -> re-tokenization -> duplicated-letter removal, then a debug
    dump of the remaining character inventory.  Returns a numpy array of
    cleaned strings.  (The RemoveAcentos step is intentionally disabled.)
    """
    # Characters stripped during tokenization (same list RemoveStopWords uses).
    FILTER_CHARS = '!"#$%&()*+,-./:;<=>?@[\]^_`´{|}~ªº°§'
    # Load stop words into a set: O(1) membership tests instead of O(n)
    # list scans per token.  Explicit encoding avoids locale-dependent
    # decoding (assumes stopwords.txt is UTF-8 — confirm with the data).
    with open('stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = {line.strip() for line in f}
    # Pre-processors
    nilc = Nilc()
    replacePadroes = ReplacePadroes()
    removeStopWords = RemoveStopWords()
    removeLetrasDuplicadas = RemoveLetrasDuplicadas()
    texts = nilc.process(list(texts))
    texts = replacePadroes.process(texts)
    texts = removeStopWords.process(texts, stopwords)
    texts = [' '.join(text_to_word_sequence(t, filters=FILTER_CHARS))
             for t in texts]
    texts = removeLetrasDuplicadas.process(texts)
    # Debug report: flags any leftover unexpected accented characters.
    list_chars(texts)
    return texts
def list_chars(texts):
    """Debug helper: print each text containing a suspicious accented
    character, then print the full set of characters seen in *texts*.

    Returns None; output goes to stdout only.
    """
    # Hoisted out of the loops (the original rebuilt this list once per
    # character) and turned into a set for O(1) membership tests.
    char_validations = {'ă', 'ñ', 'à', 'ő', 'ì', 'ò', 'è', 'ü', 'ķ', 'ö', 'å', 'ä'}
    chars = set()
    for text in texts:
        for s in text:
            if s in char_validations:
                print('{}: {}'.format(s, text))
            chars.add(s)
    print('Caracteres presentes nos textos: {}'.format(''.join(list(chars))))
def main():
    """Clean the train/test review datasets and pickle the results."""
    # Load the raw datasets
    train_df = pd.read_csv('dataset/train.csv', encoding='utf8', escapechar='\\')
    test_df = pd.read_csv('dataset/test.csv', encoding='utf8')

    #
    # Clean training data
    #
    cleaned = clean_text(train_df['review_comment_message'].astype(str))
    # Generate X,y
    dataset = pd.DataFrame(
        list(zip(cleaned, train_df['review_score'])),
        columns=['review_comment_message', 'review_score'])
    # Drop the rows whose review_score is NaN, then cast the score to int
    dataset = dataset[np.isfinite(dataset['review_score'])]
    dataset['review_score'] = dataset['review_score'].astype(int)
    dataset = shuffle(dataset)
    # Save the cleaned dataset
    with open('dataset/train.pickle', 'wb') as f:
        pickle.dump(dataset, f)

    #
    # Clean testing data
    #
    cleaned = clean_text(test_df['review_comment_message'].astype(str))
    # Generate X,y
    dataset = pd.DataFrame(
        list(zip(test_df['review_id'], cleaned)),
        columns=['review_id', 'review_comment_message'])
    # Save the cleaned dataset
    with open('dataset/test.pickle', 'wb') as f:
        pickle.dump(dataset, f)
- if __name__ == "__main__":
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement