Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package org.mdp.hadoop.cli;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
- //import org.mdp.hadoop.cli.CitationCount.CitationCountMapper;
- //import org.mdp.hadoop.cli.CitationCount.CitationCountReducer;
- /**
- * Java class to run a remote Hadoop word count job.
- *
- * Contains the main method, an inner Reducer class
- * and an inner Mapper class.
- *
- * @author Aidan
- */
- public class WordCount {
- /**
- * Use this with line.split(SPLIT_REGEX) to get fairly nice
- * word splits.
- */
- public static String SPLIT_REGEX = "[^\\p{L}]+";
- /**
- * This is the Mapper Class. This sends key-value pairs to different machines
- * based on the key.
- *
- * Remember that the generic is Mapper<InputKey, InputValue, MapKey, MapValue>
- *
- * InputKey we don't care about (a LongWritable will be passed as the input
- * file offset, but we don't care; we can also set as Object)
- *
- * InputKey will be Text: a line of the file
- *
- * MapKey will be Text: a word from the file
- *
- * MapValue will be IntWritable: a count: emit 1 for each occurrence of the word
- *
- * @author Aidan
- *
- */
- public static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable>{
- private final IntWritable one = new IntWritable(1);
- private Text word = new Text();
- /**
- * @throws InterruptedException
- *
- */
- @Override
- public void map(Object key, Text value, Context output)
- throws IOException, InterruptedException {
- //TODO implement
- // you can adapt example in
- // ref/org.mdp.hadoop.cli.CitationCount
- String[] rawWords = value.toString().split(SPLIT_REGEX);
- for(String w:rawWords){
- if(!w.isEmpty()){
- String lowercase = w.toLowerCase();
- word.set(lowercase);
- output.write(word,one);
- }
- }
- //List<String> rawWords = Arrays.asList(value.toString().split(SPLIT_REGEX));
- //List<String> uniqueWords = rawWords.stream().map(x->x.toLowerCase()).filter(x->!x.isEmpty());
- //for(String w : uniqueWords){
- //word.set(w);
- // }
- //}
- }
- }
- /**
- * This is the Reducer Class.
- *
- * This collects sets of key-value pairs with the same key on one machine.
- *
- * Remember that the generic is Reducer<MapKey, MapValue, OutputKey, OutputValue>
- *
- * MapKey will be Text: a word from the file
- *
- * MapValue will be IntWritable: a count: emit 1 for each occurrence of the word
- *
- * OutputKey will be Text: the same word
- *
- * OutputValue will be IntWritable: the final count
- *
- * @author Aidan
- *
- */
- public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
- /**
- * @throws InterruptedException
- */
- @Override
- public void reduce(Text key, Iterable<IntWritable> values,
- Context output) throws IOException, InterruptedException {
- int sum = 0;
- for(IntWritable i:values){
- sum+=i.get();
- }
- output.write(key, new IntWritable(sum));
- //TODO implement
- // you can adapt example in
- // ref/org.mdp.hadoop.cli.CitationCount
- }
- }
- /**
- * Main method that sets up and runs the job
- *
- * @param args First argument is input, second is output
- * @throws Exception
- */
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
- String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
- if (otherArgs.length != 2) {
- System.err.println("Usage: WordCount <in> <out>");
- System.exit(2);
- }
- String inputLocation = otherArgs[0];
- String outputLocation = otherArgs[1];
- Job job = Job.getInstance(new Configuration());
- FileInputFormat.setInputPaths(job, new Path(inputLocation));
- FileOutputFormat.setOutputPath(job, new Path(outputLocation));
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(IntWritable.class);
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(IntWritable.class);
- job.setMapperClass(WordCountMapper.class);
- job.setCombinerClass(WordCountReducer.class);
- job.setReducerClass(WordCountReducer.class);
- job.setJarByClass(WordCount.class);
- job.waitForCompletion(true);
- // TODO implement
- // you can adapt example in
- // ref/org.mdp.hadoop.cli.CitationCount
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement