import tensorflow as tf
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import io
import string
import requests
import csv
import nltk
from zipfile import ZipFile

# Create a TensorFlow session (TF 1.x API)
sess = tf.Session()

batch_size = 100
max_features = 1000

save_file_name = os.path.join('smsspamcollection', 'SMSSpamCollection.csv')

# If the dataset was saved on a previous run, load it from the local csv
if os.path.isfile(save_file_name):
    text_data = []
    with open(save_file_name, 'r', newline='') as temp_output_file:
        reader = csv.reader(temp_output_file)
        for row in reader:
            text_data.append(row)
else:
    # Download the zipped dataset from the UCI repository
    zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
    r = requests.get(zip_url)
    z = ZipFile(io.BytesIO(r.content))
    file = z.read('SMSSpamCollection')

    # Format data: drop non-ASCII characters, then split into lines
    # and tab-separated [label, message] pairs
    text_data = file.decode()
    text_data = text_data.encode('ascii', errors='ignore')
    text_data = text_data.decode().split('\n')
    text_data = [x.split('\t') for x in text_data if len(x) >= 1]

    # And write to csv so later runs can skip the download
    os.makedirs('smsspamcollection', exist_ok=True)  # the folder must exist before writing
    with open(save_file_name, 'w', newline='') as temp_output_file:
        writer = csv.writer(temp_output_file)
        writer.writerows(text_data)

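# Sanity check: each row of text_data should now be a [label, message] pair,
# e.g. ['ham', '...'] or ['spam', '...'] (illustrative values)
print(text_data[:2])
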
# Split into messages and labels; relabel 'spam' as 1 and 'ham' as 0
texts = [x[1] for x in text_data]
target = [x[0] for x in text_data]
target = [1 if x == 'spam' else 0 for x in target]

# Normalize the text
texts = [x.lower() for x in texts]  # lowercase
texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]  # remove punctuation
texts = [''.join(c for c in x if c not in '0123456789') for x in texts]  # remove numbers
texts = [' '.join(x.split()) for x in texts]  # trim extra whitespace

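# For example (illustrative): 'FREE entry 2 a wkly comp!!' becomes
# 'free entry a wkly comp' after the four steps above.
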
# Tokenize with NLTK; requires the 'punkt' models (run nltk.download('punkt') once if missing)
def tokenizer(text):
    words = nltk.word_tokenize(text)
    return words

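# e.g. tokenizer('win a free prize now') -> ['win', 'a', 'free', 'prize', 'now'];
# stop words such as 'a' are dropped later by TfidfVectorizer(stop_words='english')
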
tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words='english', max_features=max_features)
sparse_tfidf_texts = tfidf.fit_transform(texts)
print(sparse_tfidf_texts)
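
# A minimal follow-up sketch (the sample message below is made up): reuse the
# fitted vectorizer on unseen text. `transform` applies the already-learned
# vocabulary and idf weights without refitting.
print(sparse_tfidf_texts.shape)  # (number of messages, max_features)
new_tfidf = tfidf.transform(['winner you have been selected for a free prize'])
print(new_tfidf.nnz)  # how many vocabulary terms the new message matched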