import java.io.IOException;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
public class WordMapper extends MapReduceBase
implements Mapper<Object, Text, Text, IntWritable>
{
private Text word = new Text();
private final static IntWritable ONE = new IntWritable(1);
public void map(Object key,
Text value,
OutputCollector<Text, IntWritable> output,
Reporter reporter) throws IOException
{
// Break line into words for processing
Pattern pat = Pattern.compile("(\\w+)");
Matcher mat = pat.matcher(value.toString());
while (mat.find())
{
word.set(mat.group(1).toLowerCase());
output.collect(word, ONE);
}
/*
StringTokenizer wordList = new StringTokenizer(value.toString());
while (wordList.hasMoreTokens())
{
word.set(wordList.nextToken());
output.collect(word, ONE);
}
*/
}
}