Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /**
- * Constructs the prior distribution for the classifier
- *
- * @param store the message store
- */
- private void constructPrior(MessageStore store) {
- prior = new HashMap<Integer,Double>();
- IInvertedIndex[] index = store.getIndexes();
- double total = 0;
- classLen = store.getIndexes().length;
- for (int classNumber = 0 ; classNumber < store.getIndexes().length ; classNumber++) {
- IInvertedIndex ii = index[classNumber];
- total += ii.getDocumentStore().size();
- }
- for (int classNumber = 0 ; classNumber < store.getIndexes().length ; classNumber++) {
- double numDocsInClass = index[classNumber].getDocumentStore().size();
- double localPrior = numDocsInClass/total;
- prior.put(classNumber, localPrior);
- }
- }
- /**
- * Constructs the likelihood distribution for the classifier
- *
- * @param store the message store
- */
- private void constructLikelihood(MessageStore store) {
- likelihood = new HashMap<Integer,Map<Integer,Double>>();
- for (int classNumber = 0 ; classNumber < store.getIndexes().length ; classNumber++) {
- likelihood.put(classNumber, new HashMap<Integer,Double>());
- double allTermsInClass = 0;
- IInvertedIndex localIndex = store.getIndexes()[classNumber];
- ILexicon localLexicon = localIndex.getLexicon();
- for (int i = 0; i < localIndex.getDocumentStore().size(); i++) {
- allTermsInClass+=localIndex.getDocumentStore().getDocument(i).getNormalizedLength();
- }
- allTermsInClass+=globalLexicon.size();
- HashMap<Integer, Double> localLikelihood = new HashMap<>();
- for (Iterator<String> iterator =globalLexicon.iterator(); iterator.hasNext();) {
- String str = (String) iterator.next();
- int termID = localLexicon.lookup(str);
- int globalTermID = globalLexicon.lookup(str);
- if(termID == ILexicon.INVALID){
- double likelihood = Math.log(1/allTermsInClass);
- localLikelihood.put(globalTermID, likelihood);
- }else{
- PostingList pl = localIndex.getPostingList(termID);
- double Tct = 0;
- for (int i = 0; i < pl.size(); i++) {
- Tct+=pl.getPosting(i).getOccurrenceCount();
- }
- double likelihood = (Tct+1)/allTermsInClass;
- localLikelihood.put(globalTermID, likelihood);
- }
- }
- likelihood.put(classNumber, localLikelihood);
- }
- }
- /**
- * Classifies the document into one of the possible classes, given
- * its content. The returned value is the class number for the class
- * which has the highest probability given the document content.
- *
- * @param documentContent the document content (already normalized and tokenized)
- * @return the class with highest probability
- */
- public int classify (List<IToken> documentContent) {
- Sieve<Integer, Double> posterior = new Sieve<Integer, Double>(1);
- /** Prior class distribution P(c) */
- //Map<Integer,Double> prior;
- Integer[] priorClassIDs = new Integer[prior.size()];
- priorClassIDs = prior.keySet().toArray(priorClassIDs);
- for (int classNumber = 0 ; classNumber < classLen ; classNumber++) {
- double priorScore = Math.abs(prior.get(classNumber).doubleValue());
- if(priorScore > 0){
- priorScore = Math.log(priorScore);
- }
- HashMap<Integer, Double> likelihoodScores = (HashMap<Integer, Double>)likelihood.get(classNumber);
- double likelihoodScore = priorScore;
- for (Iterator<IToken> iterator = documentContent.iterator(); iterator.hasNext();) {
- IToken docTerm = (IToken) iterator.next();
- int globalTermID = globalLexicon.lookup(docTerm.getValue());
- if(globalTermID == ILexicon.INVALID){
- System.out.println("The fucking word is not in the global dictionary,");
- continue;
- }
- double subScore = Math.abs(likelihoodScores.get(globalTermID));
- if(subScore>0){
- likelihoodScore += Math.abs(Math.log(subScore));
- }
- }
- double totalClassScore = likelihoodScore;
- System.out.println(totalClassScore);
- posterior.sift(classNumber, totalClassScore);
- }
- return posterior.iterator().next().data;
- }
Advertisement
Add Comment
Please, Sign In to add comment