# Create "p", which holds probabilities P(word|newsgroup) =
# P(a randomly selected document from the given newsgroup contains the given word)
p = {}
train_word_counts = {}
for newsgroup in train:  # train: dict of per-newsgroup DataFrames with docID/wordID rows
    # How often each word was seen in documents of this newsgroup
    train_word_counts[newsgroup] = train[newsgroup]['wordID'].value_counts()
    # Number of documents, plus a pseudocount of +1 per word for all 53975 vocabulary words
    count_docs = train[newsgroup]['docID'].nunique() + 53975
    # P(word|newsgroup) as explained at the top, with add-one smoothing in the numerator
    p[newsgroup] = train_word_counts[newsgroup].add(1) / count_docs
    # Sort by wordID and fill the pseudo-probability 1/count_docs for unseen words
    p[newsgroup] = (pd.DataFrame({'fraction': p[newsgroup]})
                    .reindex(range(1, 53976))
                    .fillna(1 / count_docs))
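The smoothing step above can be sketched end to end on toy data. A minimal, self-contained example, assuming a hypothetical 5-word vocabulary and made-up docID/wordID rows (the real run uses 53975 words):

```python
import pandas as pd

VOCAB_SIZE = 5  # toy vocabulary size; stands in for 53975

# Toy training rows for one newsgroup: each row means "this word appears in this doc"
train_ng = pd.DataFrame({'docID':  [1, 1, 2, 2, 3],
                         'wordID': [1, 3, 1, 2, 1]})

# How many documents each word was seen in
word_counts = train_ng['wordID'].value_counts()

# Denominator: number of documents plus one pseudocount per vocabulary word
count_docs = train_ng['docID'].nunique() + VOCAB_SIZE  # 3 + 5 = 8

# Add-one smoothed P(word|newsgroup); reindex sorts by wordID and exposes
# unseen words as NaN, which fillna replaces with the pseudo-probability
p_ng = (pd.DataFrame({'fraction': word_counts.add(1) / count_docs})
        .reindex(range(1, VOCAB_SIZE + 1))
        .fillna(1 / count_docs))

print(p_ng)
# wordID 1 appears in 3 of 3 docs -> (3+1)/8 = 0.5; unseen words 4 and 5 get 1/8
```

Unseen words end up with a small but nonzero probability, so a later `math.log` call never receives 0.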
# Classify rows, which are wordID-docID combinations, each meaning
# "this word is present in this document".
# Results go in DataFrame "b": rows are documents, columns are newsgroups.
# At first each column is filled with the (log) prior of its newsgroup.
# I'm omitting the prior calculations, because my results don't change
# materially even when all priors are set to 1/20 (and logarithmized).
for row in test.itertuples(index=False):  # test: DataFrame of rows to classify
    docID = row.docID
    wordID = row.wordID
    for newsgroup in p:
        # val is the running log-probability that this document belongs
        # to this newsgroup, before accounting for the current row
        val = b.at[docID, newsgroup]
        # P(word|newsgroup)
        wordFraction = p[newsgroup].iat[wordID - 1, 0]
        # Summing logarithmized probabilities produces the same ranking
        # as multiplying plain probabilities
        b.at[docID, newsgroup] = val + math.log(wordFraction)
# For each doc, pick the newsgroup with the max value: b.idxmax(axis=1)
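The log-accumulation and final argmax can be sketched with two toy newsgroups. All names, probabilities, and IDs here are hypothetical, chosen only to make the arithmetic easy to follow:

```python
import math
import pandas as pd

# b: rows are documents, columns are newsgroups, seeded with uniform log priors
b = pd.DataFrame(math.log(0.5), index=[1, 2],
                 columns=['sci.space', 'rec.autos'])

# Toy P(word|newsgroup) values for the two words that occur in the test docs
word_probs = {'sci.space': {10: 0.4,  20: 0.01},
              'rec.autos': {10: 0.05, 20: 0.3}}

# Document 1 contains word 10; document 2 contains word 20
observations = [(1, 10), (2, 20)]
for docID, wordID in observations:
    for newsgroup in b.columns:
        # Adding log probabilities instead of multiplying plain ones
        b.at[docID, newsgroup] += math.log(word_probs[newsgroup][wordID])

# For each document, pick the newsgroup with the highest log score
predictions = b.idxmax(axis=1)
print(predictions)
# doc 1 -> sci.space (0.4 beats 0.05), doc 2 -> rec.autos (0.3 beats 0.01)
```

Working in log space avoids underflow: a real document contributes hundreds of word probabilities, and their plain product quickly rounds to 0.0 in floating point while the log sum stays well-behaved.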