Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import java.io.IOException;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
- public class WordMapper extends MapReduceBase
- implements Mapper<Object, Text, Text, IntWritable>
- {
- private Text word = new Text();
- private final static IntWritable ONE = new IntWritable(1);
- public void map(Object key,
- Text value,
- OutputCollector<Text, IntWritable> output,
- Reporter reporter) throws IOException
- {
- // Break line into words for processing
- Pattern pat = Pattern.compile("(\\w+)");
- Matcher mat = pat.matcher(value.toString());
- while (mat.find())
- {
- word.set(mat.group(1).toLowerCase());
- output.collect(word, ONE);
- }
- /*
- StringTokenizer wordList = new StringTokenizer(value.toString());
- while (wordList.hasMoreTokens())
- {
- word.set(wordList.nextToken());
- output.collect(word, ONE);
- }
- */
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement