Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Скопируем функцию оценки качества ранжирования."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 306,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def print_standings(groundtruth_file = 'qrel_clean', answer_file = 'qrel_nnovik', should_log=True):\n",
- " q2reld = {} \n",
- " for line in open(groundtruth_file):\n",
- " qid, did = [int(x) for x in line.split()]\n",
- " if qid not in q2reld.keys():\n",
- " q2reld[qid] = set()\n",
- " q2reld[qid].add(did) \n",
- "\n",
- " q2retrd = {}\n",
- " for line in open(answer_file):\n",
- " qid, did = [int(x) for x in line.split()]\n",
- " if qid not in q2retrd.keys():\n",
- " q2retrd[qid] = []\n",
- " q2retrd[qid].append(did) \n",
- "\n",
- " N = len(q2retrd.keys())\n",
- " precision = sum([len(q2reld[q].intersection(q2retrd[q]))*1.0/len(q2retrd[q]) for q in q2retrd.keys()]) / N\n",
- " recall = sum([len(q2reld[q].intersection(q2retrd[q]))*1.0/len(q2reld[q]) for q in q2retrd.keys()]) / N\n",
- " \n",
- " if should_log:\n",
- " print(\"Mean precision: {}\\nMean recall: {}\\nMean F-measure: {}\"\\\n",
- " .format(precision, recall, 2*precision*recall/(precision+recall)))\n",
- "\n",
- " # MAP@10\n",
- " import numpy as np\n",
- "\n",
- " MAP = 0.0\n",
- " for q in q2retrd.keys():\n",
- " n_results = min(10, len(q2retrd[q]))\n",
- " avep = np.zeros(n_results)\n",
- " for i in range(n_results):\n",
- " avep[i:] += q2retrd[q][i] in q2reld[q]\n",
- " avep[i] *= (q2retrd[q][i] in q2reld[q]) / (i+1.0)\n",
- " MAP += sum(avep) / min(n_results, len(q2reld[q]))\n",
- " \n",
- " if should_log:\n",
- " print(\"MAP@10: {}\".format(MAP/N))\n",
- " \n",
- " return (precision, recall, 2*precision*recall/(precision+recall), MAP/N)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Напишем вспомогательные функции для извлечения информации о документах и запросах:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 307,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import re\n",
- "\n",
- "class Data(object):\n",
- " def __init__(self, index = 0, header = \"\", annotation = \"\"):\n",
- " self.index = index\n",
- " self.header = header\n",
- " self.annotation = annotation\n",
- " \n",
- " def __str__(self):\n",
- " return \"Data(index = %d, header = %s, annotation = %s)\" % (self.index, self.header, self.annotation)\n",
- "\n",
- "class Query(object):\n",
- " def __init__(self, index = 0, query = \"\"):\n",
- " self.index = index\n",
- " self.query = query\n",
- " \n",
- " def __str__(self):\n",
- " return \"Query(index = %d, query = %s)\" % (self.index, self.query)\n",
- " \n",
- "def read_until_new_section(data_file):\n",
- " text = \"\"\n",
- " line = \" \"\n",
- " \n",
- " while line != \"\":\n",
- " line = data_file.readline().strip()\n",
- " if re.match(\"(.)[ITABW]\", line) != None:\n",
- " break\n",
- " else:\n",
- " text += \" \" + line\n",
- "\n",
- " return text.strip(), line\n",
- " \n",
- "def parse_data_file(filename):\n",
- " data_list = []\n",
- "\n",
- " with open(filename) as data_f:\n",
- " data = None\n",
- " line = \" \"\n",
- " \n",
- " while line != \"\":\n",
- " line = data_f.readline().strip()\n",
- " if line.startswith(\".W\"):\n",
- " data.annotation, line = read_until_new_section(data_f)\n",
- " if line.startswith(\".I\"):\n",
- " if data != None: \n",
- " data_list.append(data)\n",
- " data = Data()\n",
- " data.index = int(line[3:])\n",
- " if line.startswith(\".T\"):\n",
- " data.header, line = read_until_new_section(data_f)\n",
- "\n",
- " data_list.append(data)\n",
- " \n",
- " return data_list\n",
- "\n",
- "def parse_query_file(filename):\n",
- " query_list = []\n",
- "\n",
- " with open(filename) as data_f:\n",
- " query = None\n",
- " line = \" \"\n",
- " \n",
- " while line != \"\":\n",
- " line = data_f.readline().strip()\n",
- " if line.startswith(\".W\"):\n",
- " query.query, line = read_until_new_section(data_f)\n",
- " if line.startswith(\".I\"):\n",
- " if query != None: \n",
- " query_list.append(query)\n",
- " query = Query()\n",
- " query.index = int(line[3:])\n",
- "\n",
- " query_list.append(query)\n",
- " \n",
- " return query_list"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "А также напишем вспомогательную функцию *** text_to_tokens() *** для разбора текста не лексемы:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 330,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import nltk\n",
- "from nltk.tokenize import word_tokenize\n",
- "from nltk.stem import WordNetLemmatizer\n",
- "\n",
- "from nltk.stem.porter import PorterStemmer\n",
- "from nltk.stem.snowball import SnowballStemmer\n",
- "\n",
- "from nltk.corpus import stopwords\n",
- "\n",
- "lemmatizer = WordNetLemmatizer() # Simple lemmatizer\n",
- "stemmer1 = PorterStemmer() # Snowball better than Porter\n",
- "stemmer2 = SnowballStemmer('english', False) # Snowball better than Porter\n",
- "stop_words = set(stopwords.words('english')) # Stop words set\n",
- "\n",
- "from nltk.tag import pos_tag\n",
- "import re\n",
- "import string\n",
- "\n",
- "def tokenize(text):\n",
- " # Replaces all punctuation symbols with white spaces\n",
- " #text = re.sub(r'[^\\w\\s]',' ',text)\n",
- " \n",
- " tokens = [_.strip().lower() for _ in word_tokenize(text)]\n",
- " return list(filter(lambda x: x not in string.punctuation, tokens))\n",
- "\n",
- "def lemmatize_all(tokens, lemmatizer):\n",
- " wnl = lemmatizer\n",
- " for word, tag in pos_tag(tokens):\n",
- " word = word.lower()\n",
- " if tag.startswith(\"NN\"):\n",
- " yield wnl.lemmatize(word, pos='n')\n",
- " elif tag.startswith('VB'):\n",
- " yield wnl.lemmatize(word, pos='v')\n",
- " elif tag.startswith('JJ'):\n",
- " yield wnl.lemmatize(word, pos='a')\n",
- " else:\n",
- " yield wnl.lemmatize(word)\n",
- "\n",
- "# Используем лемматизацию вместе со стеммингом, ибо так немножко лучше результаты.\n",
- "# С алгоритмом все просто:\n",
- "# 1) Бьем на токены токенайзером\n",
- "# 2) В зависимости от части речи используем лемматизацию именно для этой части речи\n",
- "# 3) Удаляем стоп слова\n",
- "# 4) Шлифуем стеммингом\n",
- "def text_to_tokens(text):\n",
- " lst = tokenize(text)\n",
- " lemmaized = lemmatize_all(lst, lemmatizer)#[lemmatizer.lemmatize(_) for _ in lst] \n",
- " wo_stop_words = filter(lambda word: word not in stop_words, lemmaized)\n",
- " return [stemmer2.stem(_) for _ in wo_stop_words]#list(wo_stop_words)#"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Класс ___ CorpusIndex ___ используется для хранения инвертированного индекса и работы с ним:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 353,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "from nltk import Text\n",
- "import itertools\n",
- "import operator\n",
- "from collections import Counter\n",
- "import textwrap\n",
- "\n",
- "from math import log\n",
- "\n",
- "\n",
- "def find_indexes(word, list_of_pages):\n",
- " pages_amount = len(list_of_pages)\n",
- " word_indexes = []\n",
- " for index in range(pages_amount):\n",
- " if word in list_of_pages[index]:\n",
- " word_indexes.append(index + 1)\n",
- " return word_indexes\n",
- "\n",
- "def calculate_frequency(list_of_pages, docs_length):\n",
- " return [(doc_len, dict(Counter(page).items())) \n",
- " for (doc_len, page) in zip(docs_length, list_of_pages)]\n",
- " \n",
- "class CorpusIndex:\n",
- " \n",
- " \"\"\"\n",
- " Build CorpusIndex from list of texts.\n",
- " \"\"\"\n",
- " @staticmethod\n",
- " def from_corpus(corpus):\n",
- " corpus = list(corpus)\n",
- " docs_length = [len(x) for x in corpus]\n",
- " average_length = sum(docs_length) / len(docs_length)\n",
- " pages_list = list(map(lambda x: text_to_tokens(x), corpus))\n",
- " unique_lemms = sorted(list(set(itertools.chain(*pages_list))))\n",
- " inverted_index = list(map(lambda lemma: (lemma, find_indexes(lemma, pages_list)), unique_lemms))\n",
- " words_frequency = calculate_frequency(pages_list, docs_length)\n",
- " return CorpusIndex(unique_lemms, inverted_index, words_frequency, average_length)\n",
- "\n",
- " @staticmethod\n",
- " def _parse_frequency_token(token):\n",
- " word, freq = token.split(\"(\")\n",
- " freq = int(freq[:-1])\n",
- " return word, freq\n",
- " \n",
- " @staticmethod\n",
- " def from_disk(path):\n",
- " with open(path, mode=\"r\") as input_file:\n",
- " # Unique lemmas\n",
- " input_file.readline()\n",
- " unique_lemmas_line = input_file.readline().strip()\n",
- " unique_lemms = unique_lemmas_line.split(\", \")\n",
- " input_file.readline()\n",
- " \n",
- " # Inverted index\n",
- " input_file.readline()\n",
- " word_index_line = input_file.readline().strip()\n",
- " inverted_index = []\n",
- " while word_index_line != \"\":\n",
- " word, indexes = word_index_line.split(\": \")\n",
- " indexes = [int(x) for x in indexes.split(\" \")]\n",
- " inverted_index.append((word, indexes))\n",
- " word_index_line = input_file.readline().strip()\n",
- " \n",
- " # Words frequency:\n",
- " input_file.readline()\n",
- " frequency_line = input_file.readline()[:-1]\n",
- " words_frequency = []\n",
- " while frequency_line != \"\":\n",
- " amount, freqs = frequency_line.split(\": \")\n",
- " amount = int(amount)\n",
- " if amount != 0:\n",
- " freqs = dict(CorpusIndex._parse_frequency_token(freq) for freq in freqs.split(\", \"))\n",
- " else:\n",
- " freqs = dict()\n",
- " words_frequency.append((amount, freqs))\n",
- " frequency_line = input_file.readline()[:-1]\n",
- " \n",
- " return CorpusIndex(unique_lemms, inverted_index, words_frequency)\n",
- " \n",
- " def __init__(self, unique_lemms, inverted_index, words_frequency, average_length):\n",
- " self.unique_lemms = unique_lemms\n",
- " # Refactor!\n",
- " self.inverted_index = dict(inverted_index)\n",
- " self.words_frequency = words_frequency\n",
- " self.average_doc_len = average_length\n",
- "\n",
- " def __eq__(self, other):\n",
- " return self.unique_lemms == other.unique_lemms \\\n",
- " and self.inverted_index == other.inverted_index \\\n",
- " and self.words_frequency == other.words_frequency\n",
- " \n",
- " def __str__(self):\n",
- " return textwrap.dedent(\"\"\"\n",
- " CorpusIndex(\n",
- " Unique lemmas: %s\n",
- " Inverted index: %s\n",
- " Words frequency: %s \n",
- " )\n",
- " \"\"\" % (self.unique_lemms, self.inverted_index, self.words_frequency))\n",
- "\n",
- " def save(self, path):\n",
- " with open(path, mode=\"w+\") as output_file:\n",
- " output_file.write(\"Unique lemmas:\\n\")\n",
- " output_file.write(\", \".join(self.unique_lemms))\n",
- " output_file.write(\"\\n\\n\")\n",
- " output_file.write(\"Inverted index:\\n\")\n",
- " output_file.write(\"\\n\".join(\"%s: %s\" % (word, \" \".join(str(i) for i in indexes)) \\\n",
- " for (word, indexes) in self.inverted_index.items()))\n",
- " output_file.write(\"\\n\\n\")\n",
- " output_file.write(\"Words frequency:\\n\")\n",
- " output_file.write(\"\\n\".join(\"%d: %s\" % (words_amount, \", \".join(\"%s(%d)\" % pair for pair in freq.items())) \\\n",
- " for (words_amount, freq) in self.words_frequency))\n",
- " output_file.write(\"\\n\")\n",
- "\n",
- " def find(self, lemma):\n",
- " result = self.inverted_index.get(lemma)\n",
- " return result if result != None else []\n",
- "\n",
- " def lemma_freq(self, lemma, doc_id):\n",
- " doc_info = self.words_frequency[doc_id - 1]\n",
- " if doc_info[0] == 0:\n",
- " return 0\n",
- " hits = doc_info[1].get(lemma)\n",
- " if hits == None:\n",
- " return 0\n",
- " return hits\n",
- " \n",
- " def search_in_index(self, query, rsv_func):\n",
- " query_tokens = text_to_tokens(query)\n",
- " indexes = list(map(self.find, query_tokens))\n",
- " match_docIds = list(set(itertools.chain(*indexes)))\n",
- " \n",
- " rsv_scores = dict(map(lambda docId: (docId, rsv_func(self, docId, query_tokens)), match_docIds))\n",
- " sorted_rsv_scores = sorted(rsv_scores.items(), key=operator.itemgetter(1))\n",
- " sorted_rsv_scores.reverse()\n",
- "\n",
- " return list(map(lambda x: x[0], sorted_rsv_scores))\n",
- " \n",
- " def get_avg_index_len(self):\n",
- " index_len_sum = sum([len(x[1]) for x in self.inverted_index.items()])\n",
- " return index_len_sum / len(self.inverted_index)\n",
- " \n",
- " def get_max_index_len(self):\n",
- " index_len_max = max([len(x[1]) for x in self.inverted_index.items()])\n",
- " return index_len_max\n",
- " \n",
- " def print_statistics(self):\n",
- " dict_len = len(self.unique_lemms)\n",
- " print(\"Length of the dictionary: %s\" % dict_len)\n",
- " print(\"Average list of word's positions length: %s\" % self.get_avg_index_len())\n",
- " print(\"Max list of word's positions length: %s\" % self.get_max_index_len())"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Добавим функцию для создания различных версий RSV(q,d) функции:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 332,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def idf(corpusIndex, lemma):\n",
- " indexes = corpusIndex.inverted_index.get(lemma)\n",
- " docs_amount = 0\n",
- " if indexes != None:\n",
- " docs_amount = len(indexes)\n",
- " N = len(corpusIndex.words_frequency)\n",
- " Nt = docs_amount\n",
- " return log(1.0 + (N - Nt + 0.5) / (Nt + 0.5))\n",
- "\n",
- "def idf_simple(corpusIndex, lemma):\n",
- " indexes = corpusIndex.inverted_index.get(lemma)\n",
- " docs_amount = 0\n",
- " if indexes != None:\n",
- " docs_amount = len(indexes)\n",
- " N = len(corpusIndex.words_frequency)\n",
- " Nt = docs_amount\n",
- " return log(N / Nt)\n",
- "\n",
- "def construct_rsv_func(k1, b,\n",
- " idf_func = idf, \n",
- " norm_rsv = False,\n",
- " use_tfq = False, k2 = 100):\n",
- " def rsv(corpusIndex, doc_id, query_lemmas):\n",
- " score, idf_sum = 0.0, 0.0\n",
- " Ld = float(corpusIndex.words_frequency[doc_id - 1][0])\n",
- " _L_ = corpusIndex.average_doc_len\n",
- " for lemma in set(query_lemmas):\n",
- " if norm_rsv:\n",
- " idf_sum += idf(corpusIndex, lemma)\n",
- "\n",
- " f_td = corpusIndex.lemma_freq(lemma, doc_id)\n",
- " f_tq = query_lemmas.count(lemma)\n",
- " if f_td == 0:\n",
- " continue\n",
- "\n",
- " addition = idf_func(corpusIndex, lemma) * f_td * (k1 + 1) / (k1 * ((1 - b) + b * Ld / _L_) + f_td)\n",
- " if use_tfq:\n",
- " addition *= (k2 + 1) * f_tq / (k2 + f_tq)\n",
- " score += addition\n",
- "\n",
- " if norm_rsv:\n",
- " score /= idf_sum\n",
- "\n",
- " return score\n",
- " \n",
- " return rsv"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Функция для замера оценок получивсшегося ранжирования:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 354,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def test_ranking(corpus, rsv_func, extra_log_info, logging=True):\n",
- " disk_corpus = corpus\n",
- "\n",
- " query_list = parse_query_file(\"cran.qry\")\n",
- " #print(\"\\n\\n\".join(str(x) for x in query_list))\n",
- "\n",
- " search_results = map(lambda q: (q.index, disk_corpus.search_in_index(q.query, rsv_func)[:10]), query_list)\n",
- " with open(\"qrel_nnovik\", mode=\"w+\") as results_f:\n",
- " index = 1\n",
- " for (_, resutls) in search_results:\n",
- " for docId in resutls:\n",
- " results_f.write(\"%s %s\\n\" % (index, docId))\n",
- " index += 1\n",
- "\n",
- " result = print_standings(should_log = logging)\n",
- " return result"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Протестируем вначале наш индекс вместе с самой базовой BM25:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 355,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "*** Ranking for k1 = 1.2, b = 0.75 (headers) ***\n",
- "Mean precision: 0.24666666666666678\n",
- "Mean recall: 0.3609997558574267\n",
- "Mean F-measure: 0.2930772645786639\n",
- "MAP@10: 0.28114384115786245\n",
- "Length of the dictionary: 1523\n",
- "Average list of word's positions length: 7.115561391989495\n",
- "Max list of word's positions length: 358\n",
- "\n",
- "\n",
- "*** Ranking for k1 = 1.2, b = 0.75 (annotations) ***\n",
- "Mean precision: 0.2942222222222223\n",
- "Mean recall: 0.42721996967640197\n",
- "Mean F-measure: 0.3484620396960764\n",
- "MAP@10: 0.3662481089555164\n",
- "Length of the dictionary: 6851\n",
- "Average list of word's positions length: 12.559772296015181\n",
- "Max list of word's positions length: 713\n"
- ]
- }
- ],
- "source": [
- "data_list = parse_data_file(\"cran.all.1400\")\n",
- "rsv = construct_rsv_func(k1 = 1.2, b = 0.75)\n",
- "\n",
- "headers_list = map(lambda x: x.header, data_list)\n",
- "corpus = CorpusIndex.from_corpus(headers_list)\n",
- "#corpus.save(\"test\")\n",
- "print(\"*** Ranking for k1 = 1.2, b = 0.75 (headers) ***\")\n",
- "ranking = test_ranking(corpus, rsv, \"headers\")\n",
- "corpus.print_statistics()\n",
- "print(\"\\n\")\n",
- "\n",
- "annotations_list = map(lambda x: x.annotation, data_list)\n",
- "corpus = CorpusIndex.from_corpus(annotations_list)\n",
- "print(\"*** Ranking for k1 = 1.2, b = 0.75 (annotations) ***\")\n",
- "ranking = test_ranking(corpus, rsv, \"annotations\")\n",
- "corpus.print_statistics()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 356,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "data_list = parse_data_file(\"cran.all.1400\")\n",
- "annotations_list = map(lambda x: x.annotation, data_list)\n",
- "corpus = CorpusIndex.from_corpus(annotations_list)\n",
- "#headers_list = map(lambda x: x.header, data_list)\n",
- "#corpus = CorpusIndex.from_corpus(headers_list)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "В дальнейшем будем использовать инвертированный индекс, построенный по аннотациям (не по заголовкам)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 337,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Best f-measure: %s ((1.9, 0.7), 0.3560820379208156)\n",
- "Best MAP@10: %s ((2.0, 0.7), 0.37541323031270102)\n"
- ]
- }
- ],
- "source": [
- "from itertools import product\n",
- "\n",
- "range_k1 = [_ / 100.0 for _ in range(120, 201, 10)]\n",
- "range_b = [_ / 100.0 for _ in range(0, 101, 10)]\n",
- "\n",
- "best_f_measure = ((), 0.0)\n",
- "best_map_10 = ((), 0.0)\n",
- "for (k1, b) in product(range_k1, range_b):\n",
- " rsv = construct_rsv_func(k1, b)\n",
- " ranking = test_ranking(corpus, rsv, \"grid search\", False)\n",
- " if (ranking[2] > best_f_measure[1]):\n",
- " best_f_measure = ((k1, b), ranking[2])\n",
- " if (ranking[3] > best_map_10[1]):\n",
- " best_map_10 = ((k1, b), ranking[3])\n",
- " \n",
- "print(\"Best f-measure: %s\", str(best_f_measure))\n",
- "print(\"Best MAP@10: %s\", str(best_map_10))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Из полученных результатов видно, что наилучшая ___f-measure___ достигается при $k1 = 1.9, b = 0.7$. \n",
- "\n",
- "Такой $k1(=1.9)$ говорит о том, что оптимальный ранг сильно зависит от того, насколько часто терм запроса встречается в документах.\n",
- "\n",
- "Такой $b(=0.7)$ говорит о том, что есть корелляция между весом терма запроса для данного документа и длиной документа. То есть, вес терма запроса для данного документа существенно зависит от длины документа."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 338,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "best_k1 = 1.9\n",
- "best_b = 0.7"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 339,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Mean precision: 0.30177777777777776\n",
- "Mean recall: 0.4345521020216618\n",
- "Mean F-measure: 0.3561940681056661\n",
- "MAP@10: 0.3744629405671733\n"
- ]
- }
- ],
- "source": [
- "rsv = construct_rsv_func(best_k1, best_b, idf_func = idf_simple)\n",
- "ranking = test_ranking(corpus, rsv, \"New idf\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "При достаточно большом кол-ве текстов две формулы вычисления $IDF$ отличаются минимально. \n",
- "Поэтому и результат поменялся незначительно."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 343,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Mean precision: 0.3017777777777778\n",
- "Mean recall: 0.4342187686883285\n",
- "Mean F-measure: 0.3560820379208156\n",
- "MAP@10: 0.37417475714565657\n"
- ]
- }
- ],
- "source": [
- "rsv = construct_rsv_func(best_k1, best_b, norm_rsv = True)\n",
- "ranking = test_ranking(corpus, rsv, \"Norm RSV\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "collapsed": true
- },
- "source": [
- "Поскольку сумма IDF термов запроса одинакова для всех RSV(q, d), то нормирование по этой сумме не должно повлиять на результат (однако, меняет его минимально)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 341,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "*** Ranking for k2 = 0 (annotations) ***\n",
- "Mean precision: 0.3017777777777778\n",
- "Mean recall: 0.4342187686883285\n",
- "Mean F-measure: 0.3560820379208156\n",
- "MAP@10: 0.37417475714565657\n",
- "*** Ranking for k2 = 5 (annotations) ***\n",
- "Mean precision: 0.3031111111111111\n",
- "Mean recall: 0.4371427706123304\n",
- "Mean F-measure: 0.3579929377904864\n",
- "MAP@10: 0.37369987682315736\n",
- "*** Ranking for k2 = 10 (annotations) ***\n",
- "Mean precision: 0.30266666666666664\n",
- "Mean recall: 0.4368958570320835\n",
- "Mean F-measure: 0.35760009056986974\n",
- "MAP@10: 0.37282727247277503\n",
- "*** Ranking for k2 = 50 (annotations) ***\n",
- "Mean precision: 0.30266666666666664\n",
- "Mean recall: 0.4368958570320835\n",
- "Mean F-measure: 0.35760009056986974\n",
- "MAP@10: 0.3737362230900591\n",
- "*** Ranking for k2 = 100 (annotations) ***\n",
- "Mean precision: 0.30266666666666664\n",
- "Mean recall: 0.4368958570320835\n",
- "Mean F-measure: 0.35760009056986974\n",
- "MAP@10: 0.3736447327902354\n",
- "*** Ranking for k2 = 500 (annotations) ***\n",
- "Mean precision: 0.30266666666666664\n",
- "Mean recall: 0.4368958570320835\n",
- "Mean F-measure: 0.35760009056986974\n",
- "MAP@10: 0.3736447327902354\n",
- "*** Ranking for k2 = 1000 (annotations) ***\n",
- "Mean precision: 0.30266666666666664\n",
- "Mean recall: 0.4368958570320835\n",
- "Mean F-measure: 0.35760009056986974\n",
- "MAP@10: 0.3736447327902354\n"
- ]
- }
- ],
- "source": [
- "k2_values = [0, 5, 10, 50, 100, 500, 1000]\n",
- "\n",
- "for _k2 in k2_values:\n",
- " rsv = construct_rsv_func(best_k1, best_b, use_tfq = True, k2 = _k2)\n",
- " print(\"*** Ranking for k2 = %s (annotations) ***\" % _k2)\n",
- " ranking = test_ranking(corpus, rsv, \"Use TFQ\", True)\n",
- " if (ranking[2] > best_f_measure[1]):\n",
- " best_f_measure = ((k1, b), ranking[2])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "collapsed": true
- },
- "source": [
- "k2 = 5 оказался наилучшим вариантом для улучшения ___f-measure___. Это означает, что:\n",
- "1. у нас имеются запросы с повторящимися термами;\n",
- "2. если терм запроса повторяется несколько раз, то это должно это значит, что ранг данного документа должен быть немножечко увеличен для лучших результатов"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.1"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment