// Source: Pastebin.com

import java.io.IOException;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class WordMapper extends MapReduceBase
        implements Mapper<Object, Text, Text, IntWritable>
{
    private Text word = new Text();
    private final static IntWritable ONE = new IntWritable(1);

    public void map(Object key,
            Text value,
            OutputCollector<Text, IntWritable> output,
            Reporter reporter) throws IOException
  {
        // Break line into words for processing
    Pattern pat = Pattern.compile("(\\w+)");
    Matcher mat = pat.matcher(value.toString());
    while (mat.find())
    {
            word.set(mat.group(1).toLowerCase());
            output.collect(word, ONE);
    }
    /*
        StringTokenizer wordList = new StringTokenizer(value.toString());
        while (wordList.hasMoreTokens())
    {
            word.set(wordList.nextToken());
            output.collect(word, ONE);
        }
    */
    }
}