Advertisement
Guest User

Untitled

a guest
Mar 29th, 2017
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.79 KB | None | 0 0
package org.mdp.hadoop.cli;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
//import org.mdp.hadoop.cli.CitationCount.CitationCountMapper;
//import org.mdp.hadoop.cli.CitationCount.CitationCountReducer;
  19.  
/**
 * Java class to run a remote Hadoop word-count job.
 *
 * Contains the main method, an inner Mapper class
 * and an inner Reducer class.
 *
 * @author Aidan
 */
  28. public class WordCount {
  29.  
  30. /**
  31. * Use this with line.split(SPLIT_REGEX) to get fairly nice
  32. * word splits.
  33. */
  34. public static String SPLIT_REGEX = "[^\\p{L}]+";
  35.  
  36. /**
  37. * This is the Mapper Class. This sends key-value pairs to different machines
  38. * based on the key.
  39. *
  40. * Remember that the generic is Mapper<InputKey, InputValue, MapKey, MapValue>
  41. *
  42. * InputKey we don't care about (a LongWritable will be passed as the input
  43. * file offset, but we don't care; we can also set as Object)
  44. *
  45. * InputKey will be Text: a line of the file
  46. *
  47. * MapKey will be Text: a word from the file
  48. *
  49. * MapValue will be IntWritable: a count: emit 1 for each occurrence of the word
  50. *
  51. * @author Aidan
  52. *
  53. */
  54. public static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable>{
  55.  
  56. private final IntWritable one = new IntWritable(1);
  57. private Text word = new Text();
  58.  
  59. /**
  60. * @throws InterruptedException
  61. *
  62. */
  63. @Override
  64. public void map(Object key, Text value, Context output)
  65. throws IOException, InterruptedException {
  66. //TODO implement
  67. // you can adapt example in
  68. // ref/org.mdp.hadoop.cli.CitationCount
  69.  
  70. String[] rawWords = value.toString().split(SPLIT_REGEX);
  71. for(String w:rawWords){
  72. if(!w.isEmpty()){
  73. String lowercase = w.toLowerCase();
  74. word.set(lowercase);
  75. output.write(word,one);
  76. }
  77. }
  78.  
  79. //List<String> rawWords = Arrays.asList(value.toString().split(SPLIT_REGEX));
  80.  
  81. //List<String> uniqueWords = rawWords.stream().map(x->x.toLowerCase()).filter(x->!x.isEmpty());
  82. //for(String w : uniqueWords){
  83. //word.set(w);
  84. // }
  85. //}
  86. }
  87. }
  88.  
  89. /**
  90. * This is the Reducer Class.
  91. *
  92. * This collects sets of key-value pairs with the same key on one machine.
  93. *
  94. * Remember that the generic is Reducer<MapKey, MapValue, OutputKey, OutputValue>
  95. *
  96. * MapKey will be Text: a word from the file
  97. *
  98. * MapValue will be IntWritable: a count: emit 1 for each occurrence of the word
  99. *
  100. * OutputKey will be Text: the same word
  101. *
  102. * OutputValue will be IntWritable: the final count
  103. *
  104. * @author Aidan
  105. *
  106. */
  107. public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
  108.  
  109. /**
  110. * @throws InterruptedException
  111. */
  112. @Override
  113. public void reduce(Text key, Iterable<IntWritable> values,
  114. Context output) throws IOException, InterruptedException {
  115. int sum = 0;
  116. for(IntWritable i:values){
  117. sum+=i.get();
  118. }
  119. output.write(key, new IntWritable(sum));
  120. //TODO implement
  121. // you can adapt example in
  122. // ref/org.mdp.hadoop.cli.CitationCount
  123. }
  124. }
  125.  
  126. /**
  127. * Main method that sets up and runs the job
  128. *
  129. * @param args First argument is input, second is output
  130. * @throws Exception
  131. */
  132. public static void main(String[] args) throws Exception {
  133. Configuration conf = new Configuration();
  134. String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  135. if (otherArgs.length != 2) {
  136. System.err.println("Usage: WordCount <in> <out>");
  137. System.exit(2);
  138. }
  139. String inputLocation = otherArgs[0];
  140. String outputLocation = otherArgs[1];
  141.  
  142. Job job = Job.getInstance(new Configuration());
  143.  
  144. FileInputFormat.setInputPaths(job, new Path(inputLocation));
  145. FileOutputFormat.setOutputPath(job, new Path(outputLocation));
  146.  
  147. job.setOutputKeyClass(Text.class);
  148. job.setOutputValueClass(IntWritable.class);
  149. job.setMapOutputKeyClass(Text.class);
  150. job.setMapOutputValueClass(IntWritable.class);
  151.  
  152. job.setMapperClass(WordCountMapper.class);
  153. job.setCombinerClass(WordCountReducer.class);
  154. job.setReducerClass(WordCountReducer.class);
  155.  
  156. job.setJarByClass(WordCount.class);
  157. job.waitForCompletion(true);
  158.  
  159. // TODO implement
  160. // you can adapt example in
  161. // ref/org.mdp.hadoop.cli.CitationCount
  162. }
  163. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement