import java.util.List;
/* First we define the feature functions. The Document class is defined
elsewhere in our source - we will discuss that later. The sense keyword
means that what follows is to be considered a feature. Notice that this
function and the next rely on the Document data structure.
This particular function simply produces "bag of words" features. */
discrete% WordFeatures(Document d) <- {
    // Sense every entry of d.getWords(), in list order, as its own feature.
    List tokens = d.getWords();
    int total = tokens.size();
    for (int idx = 0; idx < total; ++idx) {
        sense tokens.get(idx);
    }
}
/* This function produces bigram features. Notice how the argument to
"sense" has changed. */
discrete% BigramFeatures(Document d) <- {
    // Walk the list by the position of the second word in each adjacent pair;
    // each sensed value is "<previous>-<current>". Lists with fewer than two
    // entries produce no features.
    List tokens = d.getWords();
    for (int next = 1; next < tokens.size(); ++next) {
        sense tokens.get(next - 1) + "-" + tokens.get(next);
    }
}
/* Here we define the labels our classifier can take. They are "spam" and "ham".
Again, we rely on the Document class, which we will discuss later. */
// Labeler: declared with the value set {"spam", "ham"}; the value produced for
// a Document is whatever d.getLabel() returns.
// NOTE(review): presumably getLabel() only ever yields "spam" or "ham" - confirm
// against the Document class, which is not defined in this file.
discrete{"spam", "ham"} Label(Document d) <- { return d.getLabel(); }
/* A learned classifier; its definition comes from data. Read the next three
lines as: define a classifier which takes data in the form of Documents, and
learns the labels we have defined above as "Label" and uses the features we have
defined above. */
// SpamClassifier: a learned discrete classifier over Documents. Everything
// between "<-" and "end" configures how it is trained and evaluated.
discrete SpamClassifier(Document d) <-
// Target to learn: the Label classifier defined earlier in this file
learn Label
// Features: the output of both feature functions defined earlier in this file
using WordFeatures, BigramFeatures
// Use a DocumentReader to load the training data from "data/spam/train"
// We will discuss the DocumentReader class below (it is not defined in this file)
from new DocumentReader("data/spam/train")
// Train for 5 rounds
5 rounds
// Use a NaiveBayes classifier (other options listed below)
// NOTE(review): presumably only one "with" clause may be active at a time, so
// swap learners by commenting this line out and uncommenting one alternative - confirm.
with new NaiveBayes()
// with new SupportVectorMachine()
// with new AdaBoost()
// with new LinearThresholdUnit(0.5)
// with new PassiveAggressive()
// with new SparseConfidenceWeighted()
// with new SparseMIRA()
// with new SparseNetworkLearner()
// Alternative form that sets learner parameters in a block instead of
// calling a constructor:
// with SparseAveragedPerceptron {
// learningRate = 0.1 ;
// thickness = 3.5;
// }
// Use a DocumentReader to load the testing data from "data/spam/test"
testFrom new DocumentReader("data/spam/test")
// Give an update every 2000 documents
progressOutput 2000
// "end" closes the learning classifier expression
end