Guest
Public paste!

Untitled

By: a guest | Mar 22nd, 2010 | Syntax: Python | Size: 2.13 KB | Hits: 121 | Expires: Never
Copy text to clipboard
  1. imports ...
  2.  
  3.  
  def calculate_gauss_width(train_set):
    """Return the average number of features equal to 1 per row of train_set.

    train_set -- non-empty sequence of rows, each an iterable of numeric
                 feature values.

    The result is always a float: the division is carried out in floating
    point so integer-valued rows are not truncated by Python 2 integer
    division.  An empty train_set raises ZeroDivisionError.
    """
    # Count every feature value that equals 1 across all rows.
    ones = sum(1 for row in train_set for value in row if value == 1)
    return float(ones) / len(train_set)
  10.  
  def document_features(document, vocabulary=None):
    """Encode a document as a binary bag-of-words vector.

    document   -- iterable of hashable tokens (the words of the document).
    vocabulary -- optional sequence of words that fixes the layout of the
                  vector; when omitted, the module-level word_features is
                  used.

    Returns a list of floats, one per vocabulary word and in vocabulary
    order: 1.0 when the word occurs in the document, 0.0 otherwise.
    """
    if vocabulary is None:
      vocabulary = word_features
    # A set makes each membership test independent of document length.
    document_words = set(document)
    return [1.0 if word in document_words else 0.0 for word in vocabulary]
  20.  
  # Numeric label assigned to each movie_reviews category.
  m = {'pos': 1., 'neg': -1.}
  C = 1.  # first argument handed to LibSVM below
  TEST_SIZE = 200  # the first TEST_SIZE shuffled documents are held out for testing

  documents = [(set(movie_reviews.words(fileid)), m[category])
               for category in movie_reviews.categories()
               for fileid in movie_reviews.fileids(category)]

  # The list above is grouped by category; shuffle so the split below mixes both.
  random.shuffle(documents)

  # Earlier ways of building the word list, kept for reference:
  #all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
  #stopWordsEng = set(stopwords.words('english'))
  #OaNs = [w.lower() for w in movie_reviews.words() if w.isalpha() and w.lower() not in stopwords.words('english')]

  # Load the word list from disk, one entry per line.  The with-block closes
  # the file, and empty entries (e.g. from a trailing newline) are dropped so
  # that '' is never counted as a word.
  with open("movies_oans.data", "r") as f:
    OaNs = [word for word in f.read().split("\n") if word]

  all_words = nltk.FreqDist(OaNs)

  # NOTE(review): this relies on FreqDist.keys() returning a sliceable list,
  # presumably ordered by decreasing frequency (older NLTK on Python 2) --
  # confirm against the installed NLTK version.
  word_features = all_words.keys()[:2000]

  categories = [c for (d, c) in documents]
  trainlab, testlab = categories[TEST_SIZE:], categories[:TEST_SIZE]
  # NOTE(review): Labels presumably expects a numpy array rather than a plain
  # list -- confirm; wrapping in array() is harmless either way.
  labels = Labels(array(trainlab))

  featuresets = [document_features(d) for (d, c) in documents]
  train_set, test_set = featuresets[TEST_SIZE:], featuresets[:TEST_SIZE]
  # Transposed so that each column holds one document's feature vector.
  feats_train = RealFeatures(array(train_set).T)
  feats_test = RealFeatures(array(test_set).T)

  width = calculate_gauss_width(train_set)
  kernel = GaussianKernel(feats_train, feats_train, width)
  svm = LibSVM(C, kernel, labels)
  svm.train()

  # Re-initialise the kernel on (train, test) before classifying; otherwise
  # classify() scores the training documents and feats_test is never used,
  # so the outputs would not line up with testlab.
  kernel.init(feats_train, feats_test)
  out = svm.classify().get_labels()
  testerr = mean(sign(out) != testlab)
  print(testerr)

  # Number of held-out documents whose predicted sign matches the true label.
  corrects = sum(1 for guess, truth in zip(sign(out), testlab)
                 if guess == sign(truth))

  print(corrects)
  66.  
  67. # Not useful: this filter removes too much.
  68. #def unusual_words(text):
  69. #  text_vocab = set(w.lower() for w in text if w.isalpha())
  70. #  english_vocab = set(w.lower() for w in nltk.corpus.words.words())
  71. #  unusual = text_vocab.difference(english_vocab)
  72. #  return unusual
  73.  
  74. #all_words = nltk.FreqDist(w.lower() for w in unusual_words(movie_reviews.words()))