Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import tensorflow as tf
- import numpy as np
- from sklearn.feature_extraction.text import TfidfVectorizer
- import os
- import io
- import string
- import requests
- import csv
- import nltk
- from zipfile import ZipFile
# TF1-style graph-mode session — presumably used later in the full script
# when evaluating ops built on the TF-IDF features (not visible in this chunk).
sess = tf.Session()
batch_size = 100     # mini-batch size, hyperparameter for later training
max_features = 1000  # vocabulary size kept by the TF-IDF vectorizer below
# Load the SMS Spam Collection dataset: reuse the cached CSV when present,
# otherwise download the UCI zip archive, parse it, and cache it locally.
# Resulting `text_data` is a list of [label, message] rows.
save_file_name = os.path.join('smsspamcollection', 'SMSSpamCollection.csv')
if os.path.isfile(save_file_name):
    text_data = []
    # newline='' lets the csv module manage line endings itself (per csv docs);
    # skip blank rows that an earlier buggy write may have left behind.
    with open(save_file_name, 'r', newline='') as temp_output_file:
        reader = csv.reader(temp_output_file)
        for row in reader:
            if row:
                text_data.append(row)
else:
    zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
    r = requests.get(zip_url)
    z = ZipFile(io.BytesIO(r.content))
    file = z.read('SMSSpamCollection')
    # Format data: the raw file is one '<label>\t<message>' record per line.
    text_data = file.decode()
    text_data = text_data.encode('ascii', errors='ignore')
    # BUG FIX: the original split on the literal characters 'n' and 't'
    # (backslashes lost in transcription); records are newline-separated
    # and fields are tab-separated.
    text_data = text_data.decode().split('\n')
    text_data = [x.split('\t') for x in text_data if len(x) >= 1]
    # Cache to CSV so the next run skips the download; create the target
    # directory first (open() in 'w' mode does not create directories).
    os.makedirs(os.path.dirname(save_file_name), exist_ok=True)
    with open(save_file_name, 'w', newline='') as temp_output_file:
        writer = csv.writer(temp_output_file)
        writer.writerows(text_data)
# Split rows into labels and message bodies. A well-formed row is
# [label, text]; guard against short/blank rows so x[1] cannot raise
# IndexError (the original indexed unconditionally). `text_data` itself
# is left unmodified.
valid_rows = [x for x in text_data if len(x) >= 2]
texts = [x[1] for x in valid_rows]
target = [x[0] for x in valid_rows]
# Binary target: 1 = spam, 0 = ham (anything that is not 'spam')
target = [1 if x == 'spam' else 0 for x in target]
# Normalize the text
texts = [x.lower() for x in texts]  # lowercase
texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]  # remove punctuation
texts = [''.join(c for c in x if c not in '0123456789') for x in texts]  # remove numbers
texts = [' '.join(x.split()) for x in texts]  # collapse extra whitespace
def tokenizer(text):
    """Split *text* into word tokens via NLTK's word tokenizer."""
    return nltk.word_tokenize(text)
# Build the TF-IDF document-term matrix: NLTK-based tokenization, English
# stop words dropped, vocabulary truncated to the `max_features` most
# frequent terms. `fit_transform` returns a scipy sparse matrix.
tfidf = TfidfVectorizer(
    max_features=max_features,
    stop_words='english',
    tokenizer=tokenizer,
)
sparse_tfidf_texts = tfidf.fit_transform(texts)
print(sparse_tfidf_texts)
Add Comment
Please, Sign In to add comment