Advertisement
Dundre32

Untitled

Apr 17th, 2020
109
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.05 KB | None | 0 0
  1.  
  2. public static class chiSquareReducer extends Reducer<Text,Text,Text,Text>{
  3. //Get the Top 200 terms with the highest chi-squares, they must appear in descending order
  4. //Also get all the terms: we will store it into a hashset in order to avoid duplicates(Will be converted later for sorting alphabetically)
  5. public HashSet<String> allTerms;
  6. public TreeMap<String, String > catsAndVals;
  7. //public HashSet<String> catsAndVals;
  8. @Override
  9. public void setup(Context context){
  10. allTerms = new HashSet<>();
  11. //catsAndVals = new TreeMap<>();
  12. catsAndVals = new TreeMap<>();
  13. }
  14. //https://stackoverflow.com/questions/38142250/java-sort-a-list-of-strings-determining-the-order-depending-on-the-contains-o
  15. @Override
  16. public void reduce(Text key, Iterable<Text> value, Context context) {
  17. String currentKey = key.toString();
  18. //Get all term, chisquare pairs into a TreeMap, we need to sort it on descending order
  19. //catsAndVals.put(key.toString(), new TreeMap<>());
  20. ArrayList<ChiResults> listOfResults = new ArrayList<>();
  21. value.forEach(v-> {
  22. String[] splitVal = v.toString().split("\\s");
  23. listOfResults.add(new ChiResults(splitVal[0], Double.parseDouble(splitVal[1])));
  24. });
  25.  
  26. //listOfResults.parallelStream().forEach(l ->
  27. /*for (ChiResults squareParams : listOfResults) {
  28. allTerms.add(squareParams.getTerm().toString());
  29. }*/
  30. listOfResults.sort(new ChiResultComparator());
  31. //Add the category and the top 200 values and terms: https://www.geeksforgeeks.org/arraylist-sublist-method-in-java-with-examples/
  32. /*
  33. Parameters: This method takes the following argument as a parameter.
  34. fromIndex – low endpoint (inclusive) of the subList
  35. toIndex – high endpoint (exclusive) of the subList
  36. */
  37. //to avoid indexing exceptions (but it might be not a problem): we will determine that the size total number of words related to the category is higher or smaller
  38. //than 200: if smaller, the last element will be set by the size of the sublist, otherwise it
  39. // will be the element with index 199 (index 200 must be defined in that case, due that the high endpoint is exclusive)
  40. // (Math.min)
  41. //all elements of the sublist will be concatenated with StringBuilder
  42. //int lastElementOfCat = listOfResults.size()>=200 ? 200 : listOfResults.size();
  43. StringBuilder resultsOfCategory = new StringBuilder();
  44. //stream ot simple foreach? maybe simple foreach: https://stackoverflow.com/questions/23218874/what-is-difference-between-collection-stream-foreach-and-collection-foreach
  45. listOfResults.subList(0, Math.min(listOfResults.size(), 200)).forEach(l->{
  46. String currentTerm = l.getTerm().toString();
  47. resultsOfCategory.append(currentTerm).append(":").append(l.getChiVal()).append(" ");
  48. //Load the top 200 terms from the current category
  49. allTerms.add(currentTerm);
  50. });
  51.  
  52.  
  53. catsAndVals.put(key.toString(), resultsOfCategory.toString());
  54. }
  55.  
  56. @Override
  57. public void cleanup(Context context) throws IOException, InterruptedException {
  58. for (Map.Entry<String, String> keyAndVal : catsAndVals.entrySet()) {
  59. context.write(new Text(keyAndVal.getKey().toString().trim()), new Text(keyAndVal.getValue().trim()));
  60. }
  61. ArrayList<String >termList = new ArrayList<>(allTerms);
  62. Collections.sort(termList);
  63. //For all appearing words: define a stringJoiner(which will be a whitespace)
  64. StringJoiner whtSpace = new StringJoiner(" ");
  65. termList.forEach(t -> whtSpace.add(t));
  66. context.write(new Text(whtSpace.toString()),new Text(""));
  67. }
  68.  
  69.  
  70. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement