Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- public static class chiSquareReducer extends Reducer<Text,Text,Text,Text>{
- //Get the Top 200 terms with the highest chi-squares, they must appear in descending order
- //Also get all the terms: we will store it into a hashset in order to avoid duplicates(Will be converted later for sorting alphabetically)
- public HashSet<String> allTerms;
- public TreeMap<String, String > catsAndVals;
- //public HashSet<String> catsAndVals;
- @Override
- public void setup(Context context){
- allTerms = new HashSet<>();
- //catsAndVals = new TreeMap<>();
- catsAndVals = new TreeMap<>();
- }
- //https://stackoverflow.com/questions/38142250/java-sort-a-list-of-strings-determining-the-order-depending-on-the-contains-o
- @Override
- public void reduce(Text key, Iterable<Text> value, Context context) {
- String currentKey = key.toString();
- //Get all term, chisquare pairs into a TreeMap, we need to sort it on descending order
- //catsAndVals.put(key.toString(), new TreeMap<>());
- ArrayList<ChiResults> listOfResults = new ArrayList<>();
- value.forEach(v-> {
- String[] splitVal = v.toString().split("\\s");
- listOfResults.add(new ChiResults(splitVal[0], Double.parseDouble(splitVal[1])));
- });
- //listOfResults.parallelStream().forEach(l ->
- /*for (ChiResults squareParams : listOfResults) {
- allTerms.add(squareParams.getTerm().toString());
- }*/
- listOfResults.sort(new ChiResultComparator());
- //Add the category and the top 200 values and terms: https://www.geeksforgeeks.org/arraylist-sublist-method-in-java-with-examples/
- /*
- Parameters: This method takes the following argument as a parameter.
- fromIndex – low endpoint (inclusive) of the subList
- toIndex – high endpoint (exclusive) of the subList
- */
- //to avoid indexing exceptions (but it might be not a problem): we will determine that the size total number of words related to the category is higher or smaller
- //than 200: if smaller, the last element will be set by the size of the sublist, otherwise it
- // will be the element with index 199 (index 200 must be defined in that case, due that the high endpoint is exclusive)
- // (Math.min)
- //all elements of the sublist will be concatenated with StringBuilder
- //int lastElementOfCat = listOfResults.size()>=200 ? 200 : listOfResults.size();
- StringBuilder resultsOfCategory = new StringBuilder();
- //stream ot simple foreach? maybe simple foreach: https://stackoverflow.com/questions/23218874/what-is-difference-between-collection-stream-foreach-and-collection-foreach
- listOfResults.subList(0, Math.min(listOfResults.size(), 200)).forEach(l->{
- String currentTerm = l.getTerm().toString();
- resultsOfCategory.append(currentTerm).append(":").append(l.getChiVal()).append(" ");
- //Load the top 200 terms from the current category
- allTerms.add(currentTerm);
- });
- catsAndVals.put(key.toString(), resultsOfCategory.toString());
- }
- @Override
- public void cleanup(Context context) throws IOException, InterruptedException {
- for (Map.Entry<String, String> keyAndVal : catsAndVals.entrySet()) {
- context.write(new Text(keyAndVal.getKey().toString().trim()), new Text(keyAndVal.getValue().trim()));
- }
- ArrayList<String >termList = new ArrayList<>(allTerms);
- Collections.sort(termList);
- //For all appearing words: define a stringJoiner(which will be a whitespace)
- StringJoiner whtSpace = new StringJoiner(" ");
- termList.forEach(t -> whtSpace.add(t));
- context.write(new Text(whtSpace.toString()),new Text(""));
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement