Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # train_df['review_score'].value_counts()
- # 5.0 13.723
- # 1.0 6.074
- # 4.0 4.045
- # 3.0 2.444
- # 2.0 1.471
- import pickle
- import pandas as pd
- import re
- import numpy as np
- from unicodedata import normalize
- from sklearn.utils import shuffle
- from keras_preprocessing.text import text_to_word_sequence
class Nilc():
    """Text-cleaning pipeline for Portuguese review text.

    Filter rules adapted from
    https://github.com/nathanshartmann/portuguese_word_embeddings/blob/master/preprocessing.py
    """
    # Punctuation characters, escaped so they can be embedded in regex
    # character classes below.
    punctuations = re.escape('!"#%\'()*+,./:;<=>?@[\\]^_`{|}~')

    # ##### #
    # Regex #
    # ##### #
    re_remove_brackets = re.compile(r'\{.*\}')
    re_remove_html = re.compile(r'<(\/|\\)?.+?>', re.UNICODE)
    re_transform_numbers = re.compile(r'\d', re.UNICODE)
    re_transform_emails = re.compile(r'[^\s]+@[^\s]+', re.UNICODE)
    re_transform_url = re.compile(r'(http|https)://[^\s]+', re.UNICODE)
    # Several quote variants are normalized to the plain '"' character.
    re_quotes_1 = re.compile(r"(?u)(^|\W)[‘’′`']", re.UNICODE)
    re_quotes_2 = re.compile(r"(?u)[‘’`′'](\W|$)", re.UNICODE)
    re_quotes_3 = re.compile(r'(?u)[‘’`′“”]', re.UNICODE)
    re_dots = re.compile(r'(?<!\.)\.\.(?!\.)', re.UNICODE)
    re_punctuation = re.compile(r'([,";:]){2},', re.UNICODE)
    re_hiphen = re.compile(r' -(?=[^\W\d_])', re.UNICODE)
    re_tree_dots = re.compile('…', re.UNICODE)
    # Split punctuation away from adjacent words ("word!" -> "word !").
    re_punkts = re.compile(r'(\w+)([%s])([ %s])' %
                           (punctuations, punctuations), re.UNICODE)
    re_punkts_b = re.compile(r'([ %s])([%s])(\w+)' %
                             (punctuations, punctuations), re.UNICODE)
    re_punkts_c = re.compile(r'(\w+)([%s])$' % (punctuations), re.UNICODE)
    re_changehyphen = re.compile('–')
    re_doublequotes_1 = re.compile(r'(\"\")')
    re_doublequotes_2 = re.compile(r'(\'\')')
    re_trim = re.compile(r' +', re.UNICODE)
    re_nr = re.compile(r'\n\r')

    # Single-pass character normalization table.  Replaces the long chain
    # of individual str.replace calls (originally added by Kleyson); all
    # entries are independent single source characters ('ñ' expands to
    # 'não', the others map 1:1), so one translate() pass is equivalent.
    _char_map = str.maketrans({
        '\xa0': ' ', '\x93': ' ', '\x94': ' ', '\x96': ' ', '\t': ' ',
        'ñ': 'não', 'ò': 'o', 'ă': 'a', 'ä': 'a', 'è': 'é', 'ő': 'o',
        'ö': 'o', 'à': 'a', 'ì': 'i', 'å': 'a', 'ķ': 'o',
    })

    @staticmethod
    def clean_text(text):
        """Lower-case *text* and apply all the normalization rules above.

        Returns the cleaned string (never mutates the input).
        """
        text = text.lower()
        text = text.translate(Nilc._char_map)
        text = Nilc.re_nr.sub(' ', text)
        text = Nilc.re_tree_dots.sub('...', text)
        # BUGFIX: this substitution used a non-raw '\.\.\.' pattern
        # (invalid escape sequences, SyntaxWarning on Python 3.12+).
        # A literal replace removes every '...' occurrence just the same.
        text = text.replace('...', '')
        text = Nilc.re_remove_brackets.sub('', text)
        text = Nilc.re_changehyphen.sub('-', text)
        text = Nilc.re_remove_html.sub(' ', text)
        text = Nilc.re_transform_numbers.sub('0', text)
        text = Nilc.re_transform_url.sub('URL', text)
        text = Nilc.re_transform_emails.sub('EMAIL', text)
        text = Nilc.re_quotes_1.sub(r'\1"', text)
        text = Nilc.re_quotes_2.sub(r'"\1', text)
        text = Nilc.re_quotes_3.sub('"', text)
        # Drop the (now normalized) double quotes entirely.
        text = text.replace('"', '')
        text = Nilc.re_dots.sub('.', text)
        text = Nilc.re_punctuation.sub(r'\1', text)
        text = Nilc.re_hiphen.sub(' - ', text)
        text = Nilc.re_punkts.sub(r'\1 \2 \3', text)
        text = Nilc.re_punkts_b.sub(r'\1 \2 \3', text)
        text = Nilc.re_punkts_c.sub(r'\1 \2', text)
        text = Nilc.re_doublequotes_1.sub('"', text)
        text = Nilc.re_doublequotes_2.sub("'", text)
        text = Nilc.re_trim.sub(' ', text)
        return text.strip()

    def process(self, data):
        """Clean every element of *data*; returns a numpy array of str."""
        print('Starting Nilc processing.')
        return np.array([Nilc.clean_text(str(d)) for d in data])
- # -------------------------------------------------------------------
class RemoveAcentos():
    """Replace accented characters by their unaccented ASCII equivalents.

    WARNING: non-ASCII, non-alphanumeric glyphs (bullets, dashes,
    asymmetric quotes, etc.) are simply dropped by the ASCII encode step.
    """

    def process(self, data):
        """Strip accents from every element of *data*; returns a numpy array of str."""
        print('Starting RemoveAcentos processing.')
        cleaned = []
        for item in data:
            decomposed = normalize('NFKD', str(item))
            cleaned.append(decomposed.encode('ASCII', 'ignore').decode('ASCII'))
        return np.array(cleaned)
class ReplacePadroes():
    """Replace simple emoticon and control-character patterns with spaces."""

    # NOTE(review): re_eh, re_exclamacao and re_interrogacao are no longer
    # used by clean_text (their substitutions were disabled and the dead
    # commented-out code removed); the attributes are kept so any external
    # reference to them keeps working.
    re_eh = re.compile(r'\s+é\s+')
    re_feliz = re.compile(r':\)')          # happy emoticon
    re_triste = re.compile(r':\(')         # sad emoticon
    re_exclamacao = re.compile(r'!')
    re_interrogacao = re.compile(r'\?')
    re_r = re.compile(r'\r')
    re_n = re.compile(r'\n')

    @staticmethod
    def clean_text(text):
        """Blank out ':)'/':(' emoticons and CR/LF characters, then strip the ends."""
        text = ReplacePadroes.re_feliz.sub(' ', text)
        text = ReplacePadroes.re_triste.sub(' ', text)
        text = ReplacePadroes.re_r.sub(' ', text)
        text = ReplacePadroes.re_n.sub(' ', text)
        return text.strip()

    def process(self, data):
        """Apply clean_text to every element of *data*; returns a numpy array of str."""
        print('Starting ReplacePadroes processing.')
        return np.array([ReplacePadroes.clean_text(str(d)) for d in data])
class RemoveStopWords():
    """Drop stop words and non-alphabetic tokens from each text."""

    # Characters stripped out during tokenization.
    FILTER_CHARS = '!"#$%&()*+,-./:;<=>?@[\]^_`´{|}~ªº°§'

    def process(self, data, stopwords):
        """Tokenize each text, keep alphabetic non-stopword tokens, re-join with spaces."""
        print('Starting RemoveStopWords processing.')
        result = []
        for text in data:
            tokens = text_to_word_sequence(
                text, filters=RemoveStopWords.FILTER_CHARS)
            kept = [tok for tok in tokens
                    if tok not in stopwords and tok.isalpha()]
            result.append(' '.join(kept))
        return np.array(result)
class RemoveLetrasDuplicadas():
    """Remove duplicated letters (e.g. 'aa' -> 'a').

    The pairs 'rr' and 'ss', which are legitimate in Portuguese, are left
    alone; only over-long runs of r's or s's are trimmed.
    """

    # One alternation per case: any repeated char except r/s, then runs
    # of more than two r's, then runs of more than two s's.
    regex = r'([^rs])(?=\1+)|(rr)(?=r+)|(ss)(?=s+)'

    def process(self, data):
        """De-duplicate letters in every string of *data*; returns a numpy array."""
        print('Starting RemoveLetrasDuplicadas processing.')
        pattern = RemoveLetrasDuplicadas.regex
        cleaned = [re.sub(pattern, '', text, 0) for text in data]
        return np.array(cleaned)
def clean_text(texts):
    """Run the full preprocessing pipeline over an iterable of texts.

    Steps: Nilc regex cleaning -> emoticon/CRLF replacement -> stop-word
    removal -> re-tokenization -> duplicated-letter removal, then a debug
    dump of the remaining character inventory.  Returns a numpy array of
    cleaned strings.  (The RemoveAcentos step is intentionally disabled.)
    """
    # Characters stripped during tokenization (same list RemoveStopWords uses).
    FILTER_CHARS = '!"#$%&()*+,-./:;<=>?@[\]^_`´{|}~ªº°§'
    # Load stop words into a set: O(1) membership tests instead of O(n)
    # list scans per token.  Explicit encoding avoids locale-dependent
    # decoding (assumes stopwords.txt is UTF-8 — confirm with the data).
    with open('stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = {line.strip() for line in f}
    # Pre-processors
    nilc = Nilc()
    replacePadroes = ReplacePadroes()
    removeStopWords = RemoveStopWords()
    removeLetrasDuplicadas = RemoveLetrasDuplicadas()
    texts = nilc.process(list(texts))
    texts = replacePadroes.process(texts)
    texts = removeStopWords.process(texts, stopwords)
    texts = [' '.join(text_to_word_sequence(t, filters=FILTER_CHARS))
             for t in texts]
    texts = removeLetrasDuplicadas.process(texts)
    # Debug report: flags any leftover unexpected accented characters.
    list_chars(texts)
    return texts
def list_chars(texts):
    """Debug helper: print each text containing a suspicious accented
    character, then print the full set of characters seen in *texts*.

    Returns None; output goes to stdout only.
    """
    # Hoisted out of the loops (the original rebuilt this list once per
    # character) and turned into a set for O(1) membership tests.
    char_validations = {'ă', 'ñ', 'à', 'ő', 'ì', 'ò', 'è', 'ü', 'ķ', 'ö', 'å', 'ä'}
    chars = set()
    for text in texts:
        for s in text:
            if s in char_validations:
                print('{}: {}'.format(s, text))
            chars.add(s)
    print('Caracteres presentes nos textos: {}'.format(''.join(list(chars))))
def main():
    """Clean the train/test review datasets and pickle the results."""
    # Load the raw datasets
    train_df = pd.read_csv('dataset/train.csv', encoding='utf8', escapechar='\\')
    test_df = pd.read_csv('dataset/test.csv', encoding='utf8')

    #
    # Clean training data
    #
    cleaned = clean_text(train_df['review_comment_message'].astype(str))
    # Generate X,y
    dataset = pd.DataFrame(
        list(zip(cleaned, train_df['review_score'])),
        columns=['review_comment_message', 'review_score'])
    # Drop the rows whose review_score is NaN, then cast the score to int
    dataset = dataset[np.isfinite(dataset['review_score'])]
    dataset['review_score'] = dataset['review_score'].astype(int)
    dataset = shuffle(dataset)
    # Save the cleaned dataset
    with open('dataset/train.pickle', 'wb') as f:
        pickle.dump(dataset, f)

    #
    # Clean testing data
    #
    cleaned = clean_text(test_df['review_comment_message'].astype(str))
    # Generate X,y
    dataset = pd.DataFrame(
        list(zip(test_df['review_id'], cleaned)),
        columns=['review_id', 'review_comment_message'])
    # Save the cleaned dataset
    with open('dataset/test.pickle', 'wb') as f:
        pickle.dump(dataset, f)
- if __name__ == "__main__":
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement