Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- private PostingsList getTfidfQuery(Query query) {
- /** Build query vector */
- ArrayList<Double> q = new ArrayList<>();
- HashMap<String, Integer> tkns = new HashMap<>();
- for (int i = 0; i < query.queryterm.size(); i++) {
- if (!tkns.containsKey(query.queryterm.get(i).term))
- q.add(query.queryterm.get(i).weight);
- else {
- int idx = tkns.get(query.queryterm.get(i).term);
- q.set(idx, q.get(idx) + query.queryterm.get(i).weight);
- }
- }
- ArrayList<TokenIndexData> postingsLists = getPostingsLists(query);
- HashMap<Integer, PostingsEntry> scores = new HashMap<>();
- HashMap<Integer, Double> denom = new HashMap<>();
- int i = 0;
- for (TokenIndexData pt: postingsLists) {
- PostingsList pl = pt.postingsList;
- for (PostingsEntry pe: pl) {
- double tfidf = tfidf(pe, pl);
- double score = q.get(i) * tfidf;
- if (!scores.containsKey(pe.docID)) {
- scores.put(pe.docID, new PostingsEntry(pe.docID, score));
- }
- else {
- scores.get(pe.docID).score += score;
- }
- }
- i++;
- }
- PostingsList results = new PostingsList();
- /** Normalize score */
- for (int docID: scores.keySet()) {
- PostingsEntry pe = scores.get(docID);
- pe.score /= Index.docLengths.get(pe.docID);
- results.add(pe);
- }
- Collections.sort(results);
- return results;
- }
- private double tfidf(PostingsEntry pe, PostingsList pl) {
- double tf = pe.getOccurences();
- double idf = Math.log(Index.docNames.size() / pl.size());
- double tfidf = tf * idf;
- return tfidf;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement