import java.io.IOException;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

  14. public class WordMapper extends MapReduceBase
  15.         implements Mapper<Object, Text, Text, IntWritable>
  16. {
  17.     private Text word = new Text();
  18.     private final static IntWritable ONE = new IntWritable(1);
  19.    
  20.     public void map(Object key,
  21.             Text value,
  22.             OutputCollector<Text, IntWritable> output,
  23.             Reporter reporter) throws IOException
  24.   {
  25.         // Break line into words for processing
  26.     Pattern pat = Pattern.compile("(\\w+)");
  27.     Matcher mat = pat.matcher(value.toString());
  28.     while (mat.find())
  29.     {
  30.             word.set(mat.group(1).toLowerCase());
  31.             output.collect(word, ONE);
  32.     }
  33.     /*
  34.         StringTokenizer wordList = new StringTokenizer(value.toString());
  35.         while (wordList.hasMoreTokens())
  36.     {
  37.             word.set(wordList.nextToken());
  38.             output.collect(word, ONE);
  39.         }
  40.     */
  41.     }
  42. }