# Source: Pastebin paste "Untitled", posted by a guest on May 24th, 2018
# (plain text, 1.67 KB). The Pastebin page header has been reduced to this
# comment so that it does not interfere with the Python code below.
import re
from distutils.version import LooseVersion as Version

import numpy as np
from nltk.corpus import stopwords
from sklearn import __version__ as sklearn_version
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
  10.  
  11. if Version(sklearn_version) < '0.18':
  12. clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
  13. else:
  14. clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
  15.  
  16. doc_stream = stream_docs(path='movie_data.csv')
  17.  
  18. stop = stopwords.words('english')
  19.  
  20. def stream_docs(path):
  21. with open(path, 'r', encoding='utf-8') as csv:
  22. next(csv) # skip header
  23. for line in csv:
  24. text, label = line[:-3], int(line[-2])
  25. yield text, label
  26.  
  27. def get_minibatch(doc_stream, size):
  28. docs, y = [], []
  29. try:
  30. for _ in range(size):
  31. text, label = next(doc_stream)
  32. docs.append(text)
  33. y.append(label)
  34. except StopIteration:
  35. return None, None
  36. return docs, y
  37.  
  38. def tokenizer(text):
  39. text = re.sub('<[^>]*>', '', text)
  40. emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
  41. text = re.sub('[\W]+', ' ', text.lower()) +\
  42. ' '.join(emoticons).replace('-', '')
  43. tokenized = [w for w in text.split() if w not in stop]
  44. return tokenized
  45.  
  46. vect = HashingVectorizer(decode_error='ignore',
  47. n_features=2**21,
  48. preprocessor=None,
  49. tokenizer=tokenizer)
  50.  
  51. classes = np.array([0, 1])
  52. for _ in range(45):
  53. X_train, y_train = get_minibatch(doc_stream, size=1000)
  54. if not X_train:
  55. break
  56. X_train = vect.transform(X_train)
  57. clf.partial_fit(X_train, y_train, classes=classes)
# (End of paste; the Pastebin comment-form footer is not part of the script.)