Guest User

Untitled

a guest
Nov 22nd, 2017
87
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.49 KB | None | 0 0
  1. """
  2. Reads a CSV file and separates the "label" and "text" columns from it.
  3. """
  4. import pandas as pd
  5. import os
  6. import numpy as np
  7. import string
  8.  
  9.  
  10. def run():
  11. """
  12. run to separate rows
  13. :return:
  14. """
  15. file_sep_dir = os.getcwd()
  16. msg_label_dir = os.getcwd() + '/msg_labels'
  17. msg_label_file = msg_label_dir + '/msg_labels_file.txt'
  18.  
  19. msg_texts_dir = os.getcwd() + '/msg_texts'
  20. msg_texts_file = msg_texts_dir + '/msg_texts_file.txt'
  21.  
  22. if not os.path.isfile(file_sep_dir + "/temp_spam_data.csv"):
  23. print('---file not found!')
  24. return None
  25.  
  26. data = pd.read_csv('temp_spam_data.csv', sep=",")
  27. labels = data[['label']]
  28. texts = data[['text']]
  29.  
  30. # turning them to 0. & 1.
  31. labels = [1. if label == 'spam' else 0. for label in labels.values]
  32. # saving labels
  33. if not os.path.isdir(msg_label_dir):
  34. os.mkdir(msg_label_dir)
  35. np.savetxt(msg_label_file, labels, fmt='%f')
  36. print('---saving labels done!')
  37.  
  38. # saving texts
  39. texts = [np.array_str(text) for text in texts.values]
  40. texts = [x.lower() for x in texts]
  41. texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
  42. texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
  43. texts = [' '.join(x.split()) for x in texts]
  44.  
  45. if not os.path.isdir(msg_texts_dir):
  46. os.mkdir(msg_texts_dir)
  47. np.savetxt(msg_texts_file, texts, delimiter=' ', fmt='%s')
  48. print('---saving texts done!')
  49. print('---all done!')
  50.  
  51.  
  52. if __name__ == '__main__':
  53. run()
Add Comment
Please, Sign In to add comment