Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- Reads rows from a CSV file and separates the "labels" and "texts" in it.
- """
- import pandas as pd
- import os
- import numpy as np
- import string
def run():
    """
    Split ``temp_spam_data.csv`` into a label file and a cleaned-text file.

    Reads ``temp_spam_data.csv`` from the current working directory (it must
    provide ``label`` and ``text`` columns) and writes:

    * ``msg_labels/msg_labels_file.txt`` -- one float per row, ``1.000000``
      when the label equals ``'spam'`` and ``0.000000`` otherwise;
    * ``msg_texts/msg_texts_file.txt`` -- one message per row, lower-cased,
      with ASCII punctuation and ASCII digits removed and every run of
      whitespace collapsed to a single space.

    Both output directories are created when they do not exist yet.

    :return: None. When the input file is missing, a message is printed and
        nothing is written.
    """
    base_dir = os.getcwd()
    csv_path = os.path.join(base_dir, 'temp_spam_data.csv')
    msg_label_dir = os.path.join(base_dir, 'msg_labels')
    msg_label_file = os.path.join(msg_label_dir, 'msg_labels_file.txt')
    msg_texts_dir = os.path.join(base_dir, 'msg_texts')
    msg_texts_file = os.path.join(msg_texts_dir, 'msg_texts_file.txt')

    if not os.path.isfile(csv_path):
        print('---file not found!')
        return None

    # Read the very same path that was just checked.
    data = pd.read_csv(csv_path, sep=",")

    # turning them to 0. & 1.
    labels = [1. if label == 'spam' else 0. for label in data['label']]

    # saving labels
    os.makedirs(msg_label_dir, exist_ok=True)
    np.savetxt(msg_label_file, labels, fmt='%f')
    print('---saving labels done!')

    # saving texts
    # Work on the cell value itself, not on the repr of a one-element array:
    # the repr spells control characters as escapes ("\n", "\t", "\xa0"),
    # and stripping the backslash afterwards left stray letters glued into
    # the words.
    unwanted = str.maketrans('', '', string.punctuation + string.digits)
    texts = [
        ' '.join(str(text).lower().translate(unwanted).split())
        for text in data['text']
    ]
    os.makedirs(msg_texts_dir, exist_ok=True)
    np.savetxt(msg_texts_file, texts, delimiter=' ', fmt='%s')
    print('---saving texts done!')
    print('---all done!')
# Script entry point: perform the split only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run()
Add Comment
Please, Sign In to add comment