import java.util.List;

/* First we define the feature functions. The Document class is defined
   elsewhere in our source; we will discuss it later (a minimal sketch also
   appears at the end of this file). The sense keyword means that what follows
   is to be considered a feature. Notice that this function and the next rely
   on the Document data structure.

   This particular function simply produces "bag of words" features. */
discrete% WordFeatures(Document d) <- {
    List words = d.getWords();
    for (int i = 0; i < words.size(); i++)
        sense words.get(i);
}

/* This function produces bigram features. Notice how the argument to
   "sense" has changed. */
discrete% BigramFeatures(Document d) <- {
    List words = d.getWords();
    for (int i = 0; i < words.size() - 1; i++)
        sense words.get(i)+"-"+words.get(i+1);
}

/* Here we define the labels our classifier can take. They are "spam" and "ham".
   Again, we rely on the Document class, which we will discuss later. */
discrete{"spam", "ham"} Label(Document d) <- { return d.getLabel(); }

/* A learned classifier; its definition comes from data. Read the next three
   lines as: define a classifier that takes data in the form of Documents,
   learns the Label we defined above, and uses the features we defined
   above. */
discrete SpamClassifier(Document d) <-
    learn Label
    using WordFeatures, BigramFeatures

    // Use a DocumentReader to load the training data
    // We will discuss the DocumentReader class below
    from new DocumentReader("data/spam/train")

    // Train for 5 rounds
    5 rounds

    // Use a NaiveBayes classifier (other options listed below)
    with new NaiveBayes()
    // with new SupportVectorMachine()
    // with new AdaBoost()
    // with new LinearThresholdUnit(0.5)
    // with new PassiveAggressive()
    // with new SparseConfidenceWeighted()
    // with new SparseMIRA()
    // with new SparseNetworkLearner()
    // with SparseAveragedPerceptron {
    //     learningRate = 0.1;
    //     thickness = 3.5;
    // }

    // Use a DocumentReader to load the testing data
    testFrom new DocumentReader("data/spam/test")

    // Give an update every 2000 documents
    progressOutput 2000
end
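
/* For reference, a minimal sketch of what the Document class used above might
   look like. This is an illustrative assumption, not the actual class from our
   source; it would live in its own Document.java file and only needs to expose
   the getWords() and getLabel() methods called by the classifiers above. */

// Document.java (hypothetical sketch)
import java.util.ArrayList;
import java.util.List;

public class Document {
    private final List<String> words;   // the document's tokens
    private final String label;         // "spam" or "ham", or null if unlabeled

    public Document(List<String> words, String label) {
        this.words = new ArrayList<String>(words);
        this.label = label;
    }

    /** Returns the tokens consumed by WordFeatures and BigramFeatures. */
    public List<String> getWords() { return words; }

    /** Returns the gold label consumed by the Label classifier. */
    public String getLabel() { return label; }
}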
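
/* Likewise, a minimal sketch of what the DocumentReader used in the "from" and
   "testFrom" clauses might look like. In LBJ, example readers implement the
   library's Parser interface (next()/reset()/close()); this sketch keeps those
   method names but is plain Java so it stands alone. The data layout it
   assumes (one whitespace-tokenized text file per document, stored under a
   "spam" or "ham" subdirectory of the path given to the constructor) is an
   assumption for illustration, not necessarily the actual on-disk format. */

// DocumentReader.java (hypothetical sketch)
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class DocumentReader {
    private final List<Document> documents = new ArrayList<Document>();
    private int index = 0;

    /** Reads every file under directory/spam and directory/ham into Documents. */
    public DocumentReader(String directory) {
        for (String label : new String[] { "spam", "ham" }) {
            File[] files = new File(directory, label).listFiles();
            if (files == null) continue;
            for (File f : files) {
                try {
                    String text = new String(Files.readAllBytes(f.toPath()));
                    List<String> words = Arrays.asList(text.split("\\s+"));
                    documents.add(new Document(words, label));
                } catch (IOException e) {
                    System.err.println("Could not read " + f + ": " + e);
                }
            }
        }
    }

    /** Returns the next Document, or null when every document has been seen. */
    public Object next() {
        return index < documents.size() ? documents.get(index++) : null;
    }

    /** Restarts iteration from the first Document (used between training rounds). */
    public void reset() { index = 0; }

    /** Nothing to release in this sketch. */
    public void close() { }
}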
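
/* Once the LBJ compiler has translated this file, SpamClassifier becomes an
   ordinary Java class that application code can call. The sketch below shows
   that usage, assuming the generated classifier exposes LBJ's standard
   discreteValue(Object) method for discrete classifiers and reusing the
   hypothetical Document sketch above. */

// ClassifyExample.java (hypothetical sketch)
import java.util.Arrays;

public class ClassifyExample {
    public static void main(String[] args) {
        // Build a tiny unlabeled Document from a few tokens.
        Document doc = new Document(
            Arrays.asList("click", "here", "for", "free", "money"), null);

        // The generated classifier returns the predicted label as a String.
        SpamClassifier classifier = new SpamClassifier();
        String prediction = classifier.discreteValue(doc);
        System.out.println("Predicted label: " + prediction);
    }
}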