import java.io.IOException; import java.util.StringTokenizer; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; public class WordMapper extends MapReduceBase implements Mapper { private Text word = new Text(); private final static IntWritable ONE = new IntWritable(1); public void map(Object key, Text value, OutputCollector output, Reporter reporter) throws IOException { // Break line into words for processing Pattern pat = Pattern.compile("(\\w+)"); Matcher mat = pat.matcher(value.toString()); while (mat.find()) { word.set(mat.group(1).toLowerCase()); output.collect(word, ONE); } /* StringTokenizer wordList = new StringTokenizer(value.toString()); while (wordList.hasMoreTokens()) { word.set(wordList.nextToken()); output.collect(word, ONE); } */ } }