Advertisement
Dundre32

Untitled

Apr 20th, 2020
118
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.07 KB | None | 0 0
  1. public static class ACReducer extends Reducer<Text,Text,Text,Text> {
  2.  
  3. // I used a hashmap which will store category as key, and a list of string where each elements will contain: the actual term, value 'A', and value 'C'
  4. // First we need to make a pair of the category with each word in it. (the hashmap guarantees that we have the distinct words)
  5. @Override
  6. protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
  7. String categoryKey = key.toString();
  8. HashSet<String> docLister = new HashSet<>();
  9. // This HashMap counts the number of documents in the category with the term (A). The term is defined in the key.
  10. HashMap<String, Long> numOfDocsWithTermT = new HashMap<String, Long>();
  11. for (Text val : values) {
  12. String currentTerm = uTerm.toString().split("\\s")[1];
  13. numOfDocsWithTermT.computeIfAbsent(currentTerm, k -> (long) 0);
  14. long numOfDocs = numOfDocsWithTermT.get(currentTerm);
  15. numOfDocs++;
  16. numOfDocsWithTermT.put(currentTerm, numOfDocs);
  17. uniqueTermDocPairs.add(new Text(val.toString()));
  18. docLister.add(val.toString().split("\\s")[0]);
  19. }
  20.  
  21.  
  22.  
  23. // Calculate values A and C
  24. long totalNumOfDocsInCat = docLister.size();
  25.  
  26. numOfDocsWithTermT.entrySet().stream().forEach(term ->
  27. {
  28. String termAC = term.getKey().toString() + " " +
  29. term.getValue() + " " +
  30. (totalNumOfDocsInCat - term.getValue());
  31.  
  32. try {
  33. context.write(new Text(categoryKey), new Text(termAC));
  34. } catch (IOException e) {
  35. e.printStackTrace();
  36. } catch (InterruptedException e) {
  37. e.printStackTrace();
  38. }
  39. }
  40. );
  41. }
  42. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement