imports ...
def calculate_gauss_width(train_set):
    """Return the average number of entries equal to 1 per feature vector.

    The script passes the result to the Gaussian kernel as its width.

    Args:
        train_set: sequence of feature vectors, each an iterable of numbers.

    Returns:
        float: the sum of all entries that equal 1, divided by the number
        of vectors in ``train_set``.

    Raises:
        ZeroDivisionError: if ``train_set`` is empty.
    """
    # Only entries equal to 1 contribute; every other value is ignored.
    ones = sum(value for row in train_set for value in row if value == 1)
    # Divide by a float so Python 2 does not truncate the average when the
    # vectors hold integers.
    return ones / float(len(train_set))
def document_features(document):
    """Encode a document as a binary bag-of-words vector.

    Args:
        document: iterable of the word tokens of one document.

    Returns:
        list of float: one entry per word in the module-level
        ``word_features``, in the same order; 1.0 if that word occurs in
        ``document`` and 0.0 otherwise.
    """
    present = set(document)
    return [1.0 if word in present else 0.0 for word in word_features]
# Numeric class label for each movie_reviews category.
m = {'pos': 1., 'neg': -1.}
# Passed as the first argument to LibSVM further down.
C = 1.
# Number of shuffled documents held out for testing; the rest are used
# for training.
NUM_TEST = 200

# One (set of words, numeric label) pair per review, in random order.
documents = [(set(movie_reviews.words(fileid)), m[category])
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Earlier ways of building the vocabulary, kept for reference.
# NOTE(review): movies_oans.data presumably caches the OaNs list computed
# by the last commented-out line below -- confirm.
#all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
#stopWordsEng = set(stopwords.words('english'))
#OaNs = [w.lower() for w in movie_reviews.words() if w.isalpha() and w.lower() not in stopwords.words('english')]

# Read the word list (one word per line) and make sure the file handle is
# closed again, even if reading fails.
with open("movies_oans.data", "r") as f:
    OaNs = f.read()
OaNs = OaNs.split("\n")
all_words = nltk.FreqDist(OaNs)
# NOTE(review): taking the first 2000 keys only yields the most frequent
# words if this NLTK version returns FreqDist keys ordered by decreasing
# frequency -- confirm against the installed NLTK.
word_features = all_words.keys()[:2000]

# Split the labels: the first NUM_TEST documents are the test set.
categories = [c for (d, c) in documents]
trainlab, testlab = categories[NUM_TEST:], categories[:NUM_TEST]
labels = Labels(trainlab)

# Split the feature vectors the same way so they stay aligned with the labels.
featuresets = [document_features(d) for (d, c) in documents]
train_set, test_set = featuresets[NUM_TEST:], featuresets[:NUM_TEST]
# Transposed before being handed to RealFeatures, so each document
# becomes a column.
feats_train = RealFeatures(array(train_set).T)
feats_test = RealFeatures(array(test_set).T)
# Kernel width: average number of features equal to 1 per training vector.
width = calculate_gauss_width(train_set)
kernel = GaussianKernel(feats_train, feats_train, width)
svm = LibSVM(C, kernel, labels)
svm.train()

# The kernel was built on (train, train). Re-initialise it with the test
# features on the right-hand side so that classify() scores the held-out
# documents; without this, feats_test is never used and the predictions
# do not correspond to testlab.
kernel.init(feats_train, feats_test)
out = svm.classify().get_labels()

# Fraction of test documents whose predicted sign differs from the label.
testerr = mean(sign(out) != testlab)
print(testerr)

# Absolute number of correctly classified test documents.
corrects = 0
for predicted, expected in zip(out, testlab):
    if sign(predicted) == sign(expected):
        corrects = corrects + 1
print(corrects)
# Not useful: the filter below removes too many words, so it is left disabled.
#def unusual_words(text):
# text_vocab = set(w.lower() for w in text if w.isalpha())
# english_vocab = set(w.lower() for w in nltk.corpus.words.words())
# unusual = text_vocab.difference(english_vocab)
# return unusual
#all_words = nltk.FreqDist(w.lower() for w in unusual_words(movie_reviews.words()))