Advertisement
Guest User

Untitled

a guest
Oct 22nd, 2019
100
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.42 KB | None | 0 0
  1. # train_df['review_score'].value_counts()                                                                                                                                                            
  2. # 5.0    13.723
  3. # 1.0     6.074
  4. # 4.0     4.045
  5. # 3.0     2.444
  6. # 2.0     1.471
  7.  
  8. import pickle
  9. import pandas as pd
  10. import re
  11. import numpy as np
  12. from unicodedata import normalize
  13. from sklearn.utils import shuffle
  14. from keras_preprocessing.text import text_to_word_sequence
  15.  
  16. class Nilc():
  17.     '''
  18.    Filter using rules from https://github.com/nathanshartmann/portuguese_word_embeddings/blob/master/preprocessing.py
  19.    '''
  20.  
  21.     # Punctuation list
  22.     punctuations = re.escape('!"#%\'()*+,./:;<=>?@[\\]^_`{|}~')
  23.  
  24.     # ##### #
  25.     # Regex #
  26.     # ##### #
  27.     re_remove_brackets = re.compile(r'\{.*\}')
  28.     re_remove_html = re.compile(r'<(\/|\\)?.+?>', re.UNICODE)
  29.     re_transform_numbers = re.compile(r'\d', re.UNICODE)
  30.     re_transform_emails = re.compile(r'[^\s]+@[^\s]+', re.UNICODE)
  31.     re_transform_url = re.compile(r'(http|https)://[^\s]+', re.UNICODE)
  32.     # Different quotes are used.
  33.     re_quotes_1 = re.compile(r"(?u)(^|\W)[‘’′`']", re.UNICODE)
  34.     re_quotes_2 = re.compile(r"(?u)[‘’`′'](\W|$)", re.UNICODE)
  35.     re_quotes_3 = re.compile(r'(?u)[‘’`′“”]', re.UNICODE)
  36.     re_dots = re.compile(r'(?<!\.)\.\.(?!\.)', re.UNICODE)
  37.     re_punctuation = re.compile(r'([,";:]){2},', re.UNICODE)
  38.     re_hiphen = re.compile(r' -(?=[^\W\d_])', re.UNICODE)
  39.     re_tree_dots = re.compile(u'…', re.UNICODE)
  40.     # Differents punctuation patterns are used.
  41.     re_punkts = re.compile(r'(\w+)([%s])([ %s])' %
  42.                         (punctuations, punctuations), re.UNICODE)
  43.     re_punkts_b = re.compile(r'([ %s])([%s])(\w+)' %
  44.                             (punctuations, punctuations), re.UNICODE)
  45.     re_punkts_c = re.compile(r'(\w+)([%s])$' % (punctuations), re.UNICODE)
  46.     re_changehyphen = re.compile(u'–')
  47.     re_doublequotes_1 = re.compile(r'(\"\")')
  48.     re_doublequotes_2 = re.compile(r'(\'\')')
  49.     re_trim = re.compile(r' +', re.UNICODE)
  50.     re_nr = re.compile(r'\n\r')
  51.  
  52.     @staticmethod
  53.     def clean_text(text):
  54.         """Apply all regex above to a given string."""
  55.         text = text.lower()
  56.         text = text.replace('\xa0', ' ')
  57.         text = text.replace('\x93', ' ') # Added by Kleyson
  58.         text = text.replace('\x94', ' ') # Added by Kleyson
  59.         text = text.replace('\x96', ' ') # Added by Kleyson
  60.         text = text.replace('\t', ' ') # Added by Kleyson
  61.         text = text.replace('ñ', 'não') # Added by Kleyson
  62.         text = text.replace('ò', 'o') # Added by Kleyson
  63.         text = text.replace('ă', 'a') # Added by Kleyson
  64.         text = text.replace('ä', 'a') # Added by Kleyson
  65.         text = text.replace('è', 'é') # Added by Kleyson
  66.         text = text.replace('ő', 'o') # Added by Kleyson
  67.         text = text.replace('ö', 'o') # Added by Kleyson
  68.         text = text.replace('à', 'a') # Added by Kleyson
  69.         text = text.replace('ì', 'i') # Added by Kleyson
  70.         text = text.replace('å', 'a') # Added by Kleyson
  71.         text = text.replace('ķ', 'o') # Added by Kleyson
  72.         text = Nilc.re_nr.sub(' ', text) # Added by Kleyson
  73.         text = Nilc.re_tree_dots.sub('...', text)
  74.         text = re.sub('\.\.\.', '', text)
  75.         text = Nilc.re_remove_brackets.sub('', text)
  76.         text = Nilc.re_changehyphen.sub('-', text)
  77.         text = Nilc.re_remove_html.sub(' ', text)
  78.         text = Nilc.re_transform_numbers.sub('0', text)
  79.         text = Nilc.re_transform_url.sub('URL', text)
  80.         text = Nilc.re_transform_emails.sub('EMAIL', text)
  81.         text = Nilc.re_quotes_1.sub(r'\1"', text)
  82.         text = Nilc.re_quotes_2.sub(r'"\1', text)
  83.         text = Nilc.re_quotes_3.sub('"', text)
  84.         text = re.sub('"', '', text)
  85.         text = Nilc.re_dots.sub('.', text)
  86.         text = Nilc.re_punctuation.sub(r'\1', text)
  87.         text = Nilc.re_hiphen.sub(' - ', text)
  88.         text = Nilc.re_punkts.sub(r'\1 \2 \3', text)
  89.         text = Nilc.re_punkts_b.sub(r'\1 \2 \3', text)
  90.         text = Nilc.re_punkts_c.sub(r'\1 \2', text)
  91.         text = Nilc.re_doublequotes_1.sub('\"', text)
  92.         text = Nilc.re_doublequotes_2.sub('\'', text)
  93.         text = Nilc.re_trim.sub(' ', text)
  94.         text = text.strip()
  95.         return text
  96.  
  97.     def process(self, data):
  98.  
  99.         print('Starting Nilc processing.')
  100.  
  101.         texts = [Nilc.clean_text(str(d)) for d in data]
  102.         return np.array(texts)
  103.  
  104. # -------------------------------------------------------------------
  105.  
  106.  
  107. class RemoveAcentos():
  108.     '''
  109.    Devolve cpia de uma str substituindo os caracteres acentuados pelos seus equivalentes no acentuados.
  110.    
  111.    ATENO: carateres graficos nao ASCII e nao alfa-numricos, tais como bullets, travesses,
  112.    aspas assimtricas, etc, so simplesmente removidos!
  113.    '''
  114.  
  115.     def process(self, data):
  116.  
  117.         print('Starting RemoveAcentos processing.')
  118.  
  119.         texts = [normalize('NFKD', str(d)).encode('ASCII', 'ignore').decode('ASCII') for d in data]
  120.         return np.array(texts)
  121.  
  122. class ReplacePadroes():
  123.     '''
  124.    '''
  125.  
  126.     re_eh = re.compile(r'\s\s+')
  127.     re_feliz = re.compile(r':\)')
  128.     re_triste = re.compile(r':\(')
  129.     re_exclamacao = re.compile(r'!')
  130.     re_interrogacao = re.compile(r'\?')
  131.     re_r = re.compile(r'\r')
  132.     re_n = re.compile(r'\n')
  133.  
  134.     @staticmethod
  135.     def clean_text(text):
  136.         """Apply all regex above to a given string."""
  137.  
  138.         # text = ReplacePadroes.re_eh.sub(' eh ', text)
  139.         text = ReplacePadroes.re_feliz.sub(' ', text) # ' feliz '
  140.         text = ReplacePadroes.re_triste.sub(' ', text) # ' triste '
  141.         # text = ReplacePadroes.re_exclamacao.sub(' excl ', text)
  142.         # text = ReplacePadroes.re_interrogacao.sub(' intr ', text)
  143.         text = ReplacePadroes.re_r.sub(' ', text)
  144.         text = ReplacePadroes.re_n.sub(' ', text)
  145.         text = text.strip()
  146.         return text
  147.  
  148.     def process(self, data):
  149.  
  150.         print('Starting ReplacePadroes processing.')
  151.  
  152.         texts = [ReplacePadroes.clean_text(str(d)) for d in data]
  153.         return np.array(texts)
  154.  
  155. class RemoveStopWords():
  156.     '''
  157.    Remove stop words
  158.    '''
  159.  
  160.     # Lista de caracteres utilizados para filtro
  161.     FILTER_CHARS='!"#$%&()*+,-./:;<=>?@[\]^_`´{|}~ªº°§'
  162.    
  163.     def process(self, data, stopwords):
  164.  
  165.         print('Starting RemoveStopWords processing.')
  166.  
  167.         # remove stop words from tokens
  168.         texts = [' '.join(word for word in text_to_word_sequence(d, filters=RemoveStopWords.FILTER_CHARS) if word not in stopwords and word.isalpha()) for d in data]
  169.         return np.array(texts)        
  170.  
  171. class RemoveLetrasDuplicadas():
  172.     '''
  173.    '''
  174.  
  175.     regex = r'([^rs])(?=\1+)|(rr)(?=r+)|(ss)(?=s+)'
  176.    
  177.     def process(self, data):
  178.  
  179.         print('Starting RemoveLetrasDuplicadas processing.')
  180.  
  181.         texts = [re.sub(RemoveLetrasDuplicadas.regex, '', d, 0) for d in data]
  182.  
  183.         return np.array(texts)        
  184.  
  185.  
  186. def clean_text(texts):
  187.  
  188.     # Lista de caracteres utilizados para filtro
  189.     FILTER_CHARS='!"#$%&()*+,-./:;<=>?@[\]^_`´{|}~ªº°§'
  190.  
  191.     # Carrega stopwords
  192.     with open('stopwords.txt', 'r') as f:
  193.         stopwords = list(f)
  194.     stopwords = [s.strip() for s in stopwords]
  195.  
  196.     # Pre-processors
  197.     nilc = Nilc()
  198.     replacePadroes = ReplacePadroes()
  199.     removeAcentos = RemoveAcentos()
  200.     removeStopWords = RemoveStopWords()
  201.     removeLetrasDuplicadas = RemoveLetrasDuplicadas()
  202.  
  203.     texts = [r for r in texts]
  204.     texts = nilc.process(texts)
  205.     texts = replacePadroes.process(texts)
  206.     texts = removeStopWords.process(texts, stopwords)
  207.     # texts = removeAcentos.process(texts)
  208.     texts = [' '.join(text_to_word_sequence(t, filters=FILTER_CHARS)) for t in texts]
  209.     texts = removeLetrasDuplicadas.process(texts)
  210.  
  211.     list_chars(texts)
  212.  
  213.     return texts
  214.  
  215. def list_chars(texts):
  216.    
  217.     txt = ''
  218.  
  219.     for text in texts:
  220.         for s in text:
  221.             char_validations = ['ă','ñ','à','ő','ì','ò','è','ü','ķ','ö','å','ä']
  222.             if s in char_validations:
  223.                 print('{}: {}'.format(s, text))
  224.             txt += s
  225.  
  226.     chars = set(txt)
  227.  
  228.     print('Caracteres presentes nos textos: {}'.format(''.join(list(chars))))
  229.  
  230. def main():
  231.  
  232.     # Read datasets
  233.     train_df = pd.read_csv('dataset/train.csv', encoding='utf8', escapechar='\\')
  234.     test_df = pd.read_csv('dataset/test.csv', encoding='utf8')
  235.  
  236.     #
  237.     # Clean training data
  238.     #
  239.     reviews = train_df['review_comment_message'].astype(str)
  240.     reviews = clean_text(reviews)
  241.  
  242.     # Generate X,y
  243.     dataset = pd.DataFrame(list(zip(reviews, train_df['review_score'])), columns=['review_comment_message', 'review_score'])
  244.     dataset = dataset[np.isfinite(dataset['review_score'])] # remove algumas linhas onde o review_score = NaN
  245.     dataset['review_score'] = dataset['review_score'].astype(int)
  246.     dataset = shuffle(dataset)
  247.  
  248.     # Save cleaned dataset
  249.     with open('dataset/train.pickle', 'wb') as f:
  250.         pickle.dump(dataset, f)
  251.  
  252.     #
  253.     # Clean testing data
  254.     #
  255.     reviews = test_df['review_comment_message'].astype(str)
  256.     reviews = clean_text(reviews)
  257.  
  258.     # Generate X,y
  259.     dataset = pd.DataFrame(list(zip(test_df['review_id'], reviews)), columns =['review_id', 'review_comment_message'])
  260.  
  261.     # Save cleaned dataset
  262.     with open('dataset/test.pickle', 'wb') as f:
  263.         pickle.dump(dataset, f)
  264.  
  265.  
  266. if __name__ == "__main__":
  267.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement