import numpy as np
import re
from nltk.corpus import stopwords
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

stop = stopwords.words('english')


def tokenizer(text):
    # strip HTML markup, keep emoticons, remove punctuation, drop stop words
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


def stream_docs(path):
    # yield one (review text, label) pair at a time from the CSV file
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label


def get_minibatch(doc_stream, size):
    # pull the next `size` documents from the generator;
    # return (None, None) once the stream is exhausted
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y


# stateless hashing vectorizer: no vocabulary to fit, so it works out-of-core
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

# SGDClassifier gained max_iter in scikit-learn 0.19; older versions use n_iter
if Version(sklearn_version) < '0.19':
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
else:
    clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

doc_stream = stream_docs(path='movie_data.csv')

# incremental training: 45 minibatches of 1,000 reviews each
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
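
# Minimal follow-up sketch (not part of the original paste): score the model on the
# documents still left in the stream. It assumes movie_data.csv holds more reviews than
# the 45,000 consumed by the training loop; the batch size of 5000 is illustrative.
X_test, y_test = get_minibatch(doc_stream, size=5000)
if X_test is not None:
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))
    # optionally fold the held-out batch back in with another incremental update
    clf = clf.partial_fit(X_test, y_test)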