Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- public static class ACReducer extends Reducer<Text,Text,Text,Text> {
- // I used a hashmap which will store category as key, and a list of string where each elements will contain: the actual term, value 'A', and value 'C'
- // First we need to make a pair of the category with each word in it. (the hashmap guarantees that we have the distinct words)
- @Override
- protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
- String categoryKey = key.toString();
- HashSet<String> docLister = new HashSet<>();
- // This HashMap counts the number of documents in the category with the term (A). The term is defined in the key.
- HashMap<String, Long> numOfDocsWithTermT = new HashMap<String, Long>();
- for (Text val : values) {
- String currentTerm = uTerm.toString().split("\\s")[1];
- numOfDocsWithTermT.computeIfAbsent(currentTerm, k -> (long) 0);
- long numOfDocs = numOfDocsWithTermT.get(currentTerm);
- numOfDocs++;
- numOfDocsWithTermT.put(currentTerm, numOfDocs);
- uniqueTermDocPairs.add(new Text(val.toString()));
- docLister.add(val.toString().split("\\s")[0]);
- }
- // Calculate values A and C
- long totalNumOfDocsInCat = docLister.size();
- numOfDocsWithTermT.entrySet().stream().forEach(term ->
- {
- String termAC = term.getKey().toString() + " " +
- term.getValue() + " " +
- (totalNumOfDocsInCat - term.getValue());
- try {
- context.write(new Text(categoryKey), new Text(termAC));
- } catch (IOException e) {
- e.printStackTrace();
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- );
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement