Untitled

// Sayat implemented cosine similarity, as tf-idf prescribes, but implemented it incorrectly because he overlooked operator precedence (the product of the two norms should be in parentheses). He saw that ```cos``` decreased quality and, instead of fixing the real issue, decided to use ```dot``` instead of ```cos```, removing the whole concept of accounting for the influence of document size on term importance.
// Call site: the similarity score is taken from dot(), not from cosine().
double dotProduct = vector.dot(other);
public class SparseVector {
    public double dot(SparseVector other) {
        double product = 0;

        for (Map.Entry<Integer, Double> word : other.weights.entrySet()) {
            int wordOtherId = word.getKey();
            if(weights.containsKey(wordOtherId)) {
                product += weights.get(wordOtherId) * word.getValue();
            }
        }
        return product;
    }

    public double cosine(SparseVector other) {
        double dotProduct = other.dot(this);
        return dotProduct / this.getL2Norm() * other.getL2Norm();
    }
}
//The “inverse document frequency” part is not implemented at all; at least I don’t see it.
//As a result of not using ```cos``` and not implementing “inverse document frequency”, tf-idf is reduced to a dot product of the term frequencies inside documents. So `and`, `a` and `the` will be the most important words. This was masked by using the stop-word set from Lucene, but of course the importance of all other words is skewed too. If you implement tf-idf correctly, you don't need any stop-word set to reduce the importance of common words.
//To demonstrate it: if you remove the stop-word filter that is plugged in here, accuracy falls to 0.4:
// Wraps the current filter in a StopFilter configured with StandardAnalyzer.STOP_WORDS_SET.
filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);


// So, OK: here we call ```encode``` purely for its side effect of storing the result; the returned value itself is ignored.
public void fit(Classification train) {
    Multiset<String> features = getFeatures(train.getDocument());
    encode(features, Optional.of(train.getCategory()));
}
// Here we pass a magic empty value to a getter to avoid storing. This is the only place where this getter is used, and it is always given that magic empty value.
/**
 * Processes an HTML document; presumably returns the predicted category (the rest of the
 * body is elided in this excerpt, so the return value is not visible here).
 *
 * <p>Visible steps: the HTML is converted to text with {@code TextUtil.parseHtml}, then a
 * {@code SparseVector} is built from that text with an empty category.
 *
 * @param html the HTML source of the document
 */
public String predict(String html) {
    String text = TextUtil.parseHtml(html);
    // Optional.empty(): no category is supplied for this vector.
    SparseVector vector = getSparseVector(text, Optional.empty());
    …
}
private SparseVector getSparseVector(String text, Optional<String> category) {
    Multiset<String> features = getFeatures(text);
    return encode(features, category);
}
//Finally, the ```encode``` method: it modifies internal state (this::getFeatureVector returns a vector stored inside the instance), so it is basically an incremental save method if we pass a category, and an actual encoder if we pass a category that is not inside the category map. Pretty cool usage of Optional - to load persisted data if ```Optional category``` is not empty and to use a new one if the category is empty.
private SparseVector encode(Multiset<String> tokens, Optional<String> category) {
    SparseVector vector = category.map(this::getFeatureVector).orElse(SparseVector.create());
    for (Multiset.Entry<String> entry: tokens.entrySet()) {
        int wordId = getWordId(entry.getElement());
        double weight = entry.getCount() * 1.0 / tokens.size();
        vector.add(wordId, weight);
    }
    return vector;
}

//inefficient: putIfAbsent instead of computeIfAbsent for a non-constant value (a new SparseVector is allocated on every call, even when the key already exists)
//the result of putIfAbsent is ignored
private SparseVector getFeatureVector(String category) {
    featureStore.putIfAbsent(category, new SparseVector());
    return featureStore.get(category);
}

//inefficient: weights.containsKey + get (two lookups where a single get with a null check would do)
if(weights.containsKey(wordOtherId)) {
    product += weights.get(wordOtherId) * word.getValue();
}

// Here I am more or less ready to overlook the modifying ```get```, because it’s encapsulated and easy to track
// ConcurrentHashMap mixed with non-thread-safe code - more or less fine
// Where are putIfAbsent and computeIfAbsent here, where we need them the most?
/**
 * Assigns a stable integer id to every distinct word it is asked about.
 *
 * <p>Ids are handed out as 0, 1, 2, ... in the order words are first seen, so the id of a
 * new word equals the number of distinct words registered before it.
 */
public class WordSet {
    private final Map<String, Integer> dict = new ConcurrentHashMap<>();

    // Source of the next unused id. Fully qualified so that no new import line is required.
    private final java.util.concurrent.atomic.AtomicInteger nextId =
            new java.util.concurrent.atomic.AtomicInteger();

    /**
     * Returns the id of the word if it has already been seen. Otherwise, assigns a new id.
     *
     * <p>The lookup and the assignment happen in one atomic {@code computeIfAbsent} call, so
     * concurrent callers cannot register the same word twice or receive duplicate ids, which
     * a separate containsKey / put / get sequence would allow.
     *
     * @param s the word to look up or register
     * @return the id associated with {@code s}
     */
    public int getWordId(String s) {
        // The mapping function runs at most once per absent key, so the counter advances
        // exactly once per distinct word and ids stay dense.
        return dict.computeIfAbsent(s, word -> nextId.getAndIncrement());
    }

    /**
     * Returns the number of distinct words registered so far.
     *
     * @return the current size of the dictionary
     */
    public int size() {
        return dict.size();
    }
}