  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "Скопируем функцию оценки качества ранжирования."
  8. ]
  9. },
  10. {
  11. "cell_type": "code",
  12. "execution_count": 306,
  13. "metadata": {
  14. "collapsed": true
  15. },
  16. "outputs": [],
  17. "source": [
  18. "def print_standings(groundtruth_file = 'qrel_clean', answer_file = 'qrel_nnovik', should_log=True):\n",
  19. " q2reld = {} \n",
  20. " for line in open(groundtruth_file):\n",
  21. " qid, did = [int(x) for x in line.split()]\n",
  22. " if qid not in q2reld.keys():\n",
  23. " q2reld[qid] = set()\n",
  24. " q2reld[qid].add(did) \n",
  25. "\n",
  26. " q2retrd = {}\n",
  27. " for line in open(answer_file):\n",
  28. " qid, did = [int(x) for x in line.split()]\n",
  29. " if qid not in q2retrd.keys():\n",
  30. " q2retrd[qid] = []\n",
  31. " q2retrd[qid].append(did) \n",
  32. "\n",
  33. " N = len(q2retrd.keys())\n",
  34. " precision = sum([len(q2reld[q].intersection(q2retrd[q]))*1.0/len(q2retrd[q]) for q in q2retrd.keys()]) / N\n",
  35. " recall = sum([len(q2reld[q].intersection(q2retrd[q]))*1.0/len(q2reld[q]) for q in q2retrd.keys()]) / N\n",
  36. " \n",
  37. " if should_log:\n",
  38. " print(\"Mean precision: {}\\nMean recall: {}\\nMean F-measure: {}\"\\\n",
  39. " .format(precision, recall, 2*precision*recall/(precision+recall)))\n",
  40. "\n",
  41. " # MAP@10\n",
  42. " import numpy as np\n",
  43. "\n",
  44. " MAP = 0.0\n",
  45. " for q in q2retrd.keys():\n",
  46. " n_results = min(10, len(q2retrd[q]))\n",
  47. " avep = np.zeros(n_results)\n",
  48. " for i in range(n_results):\n",
  49. " avep[i:] += q2retrd[q][i] in q2reld[q]\n",
  50. " avep[i] *= (q2retrd[q][i] in q2reld[q]) / (i+1.0)\n",
  51. " MAP += sum(avep) / min(n_results, len(q2reld[q]))\n",
  52. " \n",
  53. " if should_log:\n",
  54. " print(\"MAP@10: {}\".format(MAP/N))\n",
  55. " \n",
  56. " return (precision, recall, 2*precision*recall/(precision+recall), MAP/N)"
  57. ]
  58. },
  59. {
  60. "cell_type": "markdown",
  61. "metadata": {},
  62. "source": [
  63. "Напишем вспомогательные функции для извлечения информации о документах и запросах:"
  64. ]
  65. },
  66. {
  67. "cell_type": "code",
  68. "execution_count": 307,
  69. "metadata": {
  70. "collapsed": true
  71. },
  72. "outputs": [],
  73. "source": [
  74. "import re\n",
  75. "\n",
  76. "class Data(object):\n",
  77. " def __init__(self, index = 0, header = \"\", annotation = \"\"):\n",
  78. " self.index = index\n",
  79. " self.header = header\n",
  80. " self.annotation = annotation\n",
  81. " \n",
  82. " def __str__(self):\n",
  83. " return \"Data(index = %d, header = %s, annotation = %s)\" % (self.index, self.header, self.annotation)\n",
  84. "\n",
  85. "class Query(object):\n",
  86. " def __init__(self, index = 0, query = \"\"):\n",
  87. " self.index = index\n",
  88. " self.query = query\n",
  89. " \n",
  90. " def __str__(self):\n",
  91. " return \"Query(index = %d, query = %s)\" % (self.index, self.query)\n",
  92. " \n",
  93. "def read_until_new_section(data_file):\n",
  94. " text = \"\"\n",
  95. " line = \" \"\n",
  96. " \n",
  97. " while line != \"\":\n",
  98. " line = data_file.readline().strip()\n",
  99. " if re.match(\"(.)[ITABW]\", line) != None:\n",
  100. " break\n",
  101. " else:\n",
  102. " text += \" \" + line\n",
  103. "\n",
  104. " return text.strip(), line\n",
  105. " \n",
  106. "def parse_data_file(filename):\n",
  107. " data_list = []\n",
  108. "\n",
  109. " with open(filename) as data_f:\n",
  110. " data = None\n",
  111. " line = \" \"\n",
  112. " \n",
  113. " while line != \"\":\n",
  114. " line = data_f.readline().strip()\n",
  115. " if line.startswith(\".W\"):\n",
  116. " data.annotation, line = read_until_new_section(data_f)\n",
  117. " if line.startswith(\".I\"):\n",
  118. " if data != None: \n",
  119. " data_list.append(data)\n",
  120. " data = Data()\n",
  121. " data.index = int(line[3:])\n",
  122. " if line.startswith(\".T\"):\n",
  123. " data.header, line = read_until_new_section(data_f)\n",
  124. "\n",
  125. " data_list.append(data)\n",
  126. " \n",
  127. " return data_list\n",
  128. "\n",
  129. "def parse_query_file(filename):\n",
  130. " query_list = []\n",
  131. "\n",
  132. " with open(filename) as data_f:\n",
  133. " query = None\n",
  134. " line = \" \"\n",
  135. " \n",
  136. " while line != \"\":\n",
  137. " line = data_f.readline().strip()\n",
  138. " if line.startswith(\".W\"):\n",
  139. " query.query, line = read_until_new_section(data_f)\n",
  140. " if line.startswith(\".I\"):\n",
  141. " if query != None: \n",
  142. " query_list.append(query)\n",
  143. " query = Query()\n",
  144. " query.index = int(line[3:])\n",
  145. "\n",
  146. " query_list.append(query)\n",
  147. " \n",
  148. " return query_list"
  149. ]
  150. },
  151. {
  152. "cell_type": "markdown",
  153. "metadata": {},
  154. "source": [
  155. "А также напишем вспомогательную функцию *** text_to_tokens() *** для разбора текста не лексемы:"
  156. ]
  157. },
  158. {
  159. "cell_type": "code",
  160. "execution_count": 330,
  161. "metadata": {
  162. "collapsed": true
  163. },
  164. "outputs": [],
  165. "source": [
  166. "import nltk\n",
  167. "from nltk.tokenize import word_tokenize\n",
  168. "from nltk.stem import WordNetLemmatizer\n",
  169. "\n",
  170. "from nltk.stem.porter import PorterStemmer\n",
  171. "from nltk.stem.snowball import SnowballStemmer\n",
  172. "\n",
  173. "from nltk.corpus import stopwords\n",
  174. "\n",
  175. "lemmatizer = WordNetLemmatizer() # Simple lemmatizer\n",
  176. "stemmer1 = PorterStemmer() # Snowball better than Porter\n",
  177. "stemmer2 = SnowballStemmer('english', False) # Snowball better than Porter\n",
  178. "stop_words = set(stopwords.words('english')) # Stop words set\n",
  179. "\n",
  180. "from nltk.tag import pos_tag\n",
  181. "import re\n",
  182. "import string\n",
  183. "\n",
  184. "def tokenize(text):\n",
  185. " # Replaces all punctuation symbols with white spaces\n",
  186. " #text = re.sub(r'[^\\w\\s]',' ',text)\n",
  187. " \n",
  188. " tokens = [_.strip().lower() for _ in word_tokenize(text)]\n",
  189. " return list(filter(lambda x: x not in string.punctuation, tokens))\n",
  190. "\n",
  191. "def lemmatize_all(tokens, lemmatizer):\n",
  192. " wnl = lemmatizer\n",
  193. " for word, tag in pos_tag(tokens):\n",
  194. " word = word.lower()\n",
  195. " if tag.startswith(\"NN\"):\n",
  196. " yield wnl.lemmatize(word, pos='n')\n",
  197. " elif tag.startswith('VB'):\n",
  198. " yield wnl.lemmatize(word, pos='v')\n",
  199. " elif tag.startswith('JJ'):\n",
  200. " yield wnl.lemmatize(word, pos='a')\n",
  201. " else:\n",
  202. " yield wnl.lemmatize(word)\n",
  203. "\n",
  204. "# Используем лемматизацию вместе со стеммингом, ибо так немножко лучше результаты.\n",
  205. "# С алгоритмом все просто:\n",
  206. "# 1) Бьем на токены токенайзером\n",
  207. "# 2) В зависимости от части речи используем лемматизацию именно для этой части речи\n",
  208. "# 3) Удаляем стоп слова\n",
  209. "# 4) Шлифуем стеммингом\n",
  210. "def text_to_tokens(text):\n",
  211. " lst = tokenize(text)\n",
  212. " lemmaized = lemmatize_all(lst, lemmatizer)#[lemmatizer.lemmatize(_) for _ in lst] \n",
  213. " wo_stop_words = filter(lambda word: word not in stop_words, lemmaized)\n",
  214. " return [stemmer2.stem(_) for _ in wo_stop_words]#list(wo_stop_words)#"
  215. ]
  216. },
  217. {
  218. "cell_type": "markdown",
  219. "metadata": {},
  220. "source": [
  221. "Класс ___ CorpusIndex ___ используется для хранения инвертированного индекса и работы с ним:"
  222. ]
  223. },
  224. {
  225. "cell_type": "code",
  226. "execution_count": 353,
  227. "metadata": {
  228. "collapsed": true
  229. },
  230. "outputs": [],
  231. "source": [
  232. "from nltk import Text\n",
  233. "import itertools\n",
  234. "import operator\n",
  235. "from collections import Counter\n",
  236. "import textwrap\n",
  237. "\n",
  238. "from math import log\n",
  239. "\n",
  240. "\n",
  241. "def find_indexes(word, list_of_pages):\n",
  242. " pages_amount = len(list_of_pages)\n",
  243. " word_indexes = []\n",
  244. " for index in range(pages_amount):\n",
  245. " if word in list_of_pages[index]:\n",
  246. " word_indexes.append(index + 1)\n",
  247. " return word_indexes\n",
  248. "\n",
  249. "def calculate_frequency(list_of_pages, docs_length):\n",
  250. " return [(doc_len, dict(Counter(page).items())) \n",
  251. " for (doc_len, page) in zip(docs_length, list_of_pages)]\n",
  252. " \n",
  253. "class CorpusIndex:\n",
  254. " \n",
  255. " \"\"\"\n",
  256. " Build CorpusIndex from list of texts.\n",
  257. " \"\"\"\n",
  258. " @staticmethod\n",
  259. " def from_corpus(corpus):\n",
  260. " corpus = list(corpus)\n",
  261. " docs_length = [len(x) for x in corpus]\n",
  262. " average_length = sum(docs_length) / len(docs_length)\n",
  263. " pages_list = list(map(lambda x: text_to_tokens(x), corpus))\n",
  264. " unique_lemms = sorted(list(set(itertools.chain(*pages_list))))\n",
  265. " inverted_index = list(map(lambda lemma: (lemma, find_indexes(lemma, pages_list)), unique_lemms))\n",
  266. " words_frequency = calculate_frequency(pages_list, docs_length)\n",
  267. " return CorpusIndex(unique_lemms, inverted_index, words_frequency, average_length)\n",
  268. "\n",
  269. " @staticmethod\n",
  270. " def _parse_frequency_token(token):\n",
  271. " word, freq = token.split(\"(\")\n",
  272. " freq = int(freq[:-1])\n",
  273. " return word, freq\n",
  274. " \n",
  275. " @staticmethod\n",
  276. " def from_disk(path):\n",
  277. " with open(path, mode=\"r\") as input_file:\n",
  278. " # Unique lemmas\n",
  279. " input_file.readline()\n",
  280. " unique_lemmas_line = input_file.readline().strip()\n",
  281. " unique_lemms = unique_lemmas_line.split(\", \")\n",
  282. " input_file.readline()\n",
  283. " \n",
  284. " # Inverted index\n",
  285. " input_file.readline()\n",
  286. " word_index_line = input_file.readline().strip()\n",
  287. " inverted_index = []\n",
  288. " while word_index_line != \"\":\n",
  289. " word, indexes = word_index_line.split(\": \")\n",
  290. " indexes = [int(x) for x in indexes.split(\" \")]\n",
  291. " inverted_index.append((word, indexes))\n",
  292. " word_index_line = input_file.readline().strip()\n",
  293. " \n",
  294. " # Words frequency:\n",
  295. " input_file.readline()\n",
  296. " frequency_line = input_file.readline()[:-1]\n",
  297. " words_frequency = []\n",
  298. " while frequency_line != \"\":\n",
  299. " amount, freqs = frequency_line.split(\": \")\n",
  300. " amount = int(amount)\n",
  301. " if amount != 0:\n",
  302. " freqs = dict(CorpusIndex._parse_frequency_token(freq) for freq in freqs.split(\", \"))\n",
  303. " else:\n",
  304. " freqs = dict()\n",
  305. " words_frequency.append((amount, freqs))\n",
  306. " frequency_line = input_file.readline()[:-1]\n",
  307. " \n",
  308. " return CorpusIndex(unique_lemms, inverted_index, words_frequency)\n",
  309. " \n",
  310. " def __init__(self, unique_lemms, inverted_index, words_frequency, average_length):\n",
  311. " self.unique_lemms = unique_lemms\n",
  312. " # Refactor!\n",
  313. " self.inverted_index = dict(inverted_index)\n",
  314. " self.words_frequency = words_frequency\n",
  315. " self.average_doc_len = average_length\n",
  316. "\n",
  317. " def __eq__(self, other):\n",
  318. " return self.unique_lemms == other.unique_lemms \\\n",
  319. " and self.inverted_index == other.inverted_index \\\n",
  320. " and self.words_frequency == other.words_frequency\n",
  321. " \n",
  322. " def __str__(self):\n",
  323. " return textwrap.dedent(\"\"\"\n",
  324. " CorpusIndex(\n",
  325. " Unique lemmas: %s\n",
  326. " Inverted index: %s\n",
  327. " Words frequency: %s \n",
  328. " )\n",
  329. " \"\"\" % (self.unique_lemms, self.inverted_index, self.words_frequency))\n",
  330. "\n",
  331. " def save(self, path):\n",
  332. " with open(path, mode=\"w+\") as output_file:\n",
  333. " output_file.write(\"Unique lemmas:\\n\")\n",
  334. " output_file.write(\", \".join(self.unique_lemms))\n",
  335. " output_file.write(\"\\n\\n\")\n",
  336. " output_file.write(\"Inverted index:\\n\")\n",
  337. " output_file.write(\"\\n\".join(\"%s: %s\" % (word, \" \".join(str(i) for i in indexes)) \\\n",
  338. " for (word, indexes) in self.inverted_index.items()))\n",
  339. " output_file.write(\"\\n\\n\")\n",
  340. " output_file.write(\"Words frequency:\\n\")\n",
  341. " output_file.write(\"\\n\".join(\"%d: %s\" % (words_amount, \", \".join(\"%s(%d)\" % pair for pair in freq.items())) \\\n",
  342. " for (words_amount, freq) in self.words_frequency))\n",
  343. " output_file.write(\"\\n\")\n",
  344. "\n",
  345. " def find(self, lemma):\n",
  346. " result = self.inverted_index.get(lemma)\n",
  347. " return result if result != None else []\n",
  348. "\n",
  349. " def lemma_freq(self, lemma, doc_id):\n",
  350. " doc_info = self.words_frequency[doc_id - 1]\n",
  351. " if doc_info[0] == 0:\n",
  352. " return 0\n",
  353. " hits = doc_info[1].get(lemma)\n",
  354. " if hits == None:\n",
  355. " return 0\n",
  356. " return hits\n",
  357. " \n",
  358. " def search_in_index(self, query, rsv_func):\n",
  359. " query_tokens = text_to_tokens(query)\n",
  360. " indexes = list(map(self.find, query_tokens))\n",
  361. " match_docIds = list(set(itertools.chain(*indexes)))\n",
  362. " \n",
  363. " rsv_scores = dict(map(lambda docId: (docId, rsv_func(self, docId, query_tokens)), match_docIds))\n",
  364. " sorted_rsv_scores = sorted(rsv_scores.items(), key=operator.itemgetter(1))\n",
  365. " sorted_rsv_scores.reverse()\n",
  366. "\n",
  367. " return list(map(lambda x: x[0], sorted_rsv_scores))\n",
  368. " \n",
  369. " def get_avg_index_len(self):\n",
  370. " index_len_sum = sum([len(x[1]) for x in self.inverted_index.items()])\n",
  371. " return index_len_sum / len(self.inverted_index)\n",
  372. " \n",
  373. " def get_max_index_len(self):\n",
  374. " index_len_max = max([len(x[1]) for x in self.inverted_index.items()])\n",
  375. " return index_len_max\n",
  376. " \n",
  377. " def print_statistics(self):\n",
  378. " dict_len = len(self.unique_lemms)\n",
  379. " print(\"Length of the dictionary: %s\" % dict_len)\n",
  380. " print(\"Average list of word's positions length: %s\" % self.get_avg_index_len())\n",
  381. " print(\"Max list of word's positions length: %s\" % self.get_max_index_len())"
  382. ]
  383. },
  384. {
  385. "cell_type": "markdown",
  386. "metadata": {},
  387. "source": [
  388. "Добавим функцию для создания различных версий RSV(q,d) функции:"
  389. ]
  390. },
  391. {
  392. "cell_type": "code",
  393. "execution_count": 332,
  394. "metadata": {
  395. "collapsed": true
  396. },
  397. "outputs": [],
  398. "source": [
  399. "def idf(corpusIndex, lemma):\n",
  400. " indexes = corpusIndex.inverted_index.get(lemma)\n",
  401. " docs_amount = 0\n",
  402. " if indexes != None:\n",
  403. " docs_amount = len(indexes)\n",
  404. " N = len(corpusIndex.words_frequency)\n",
  405. " Nt = docs_amount\n",
  406. " return log(1.0 + (N - Nt + 0.5) / (Nt + 0.5))\n",
  407. "\n",
  408. "def idf_simple(corpusIndex, lemma):\n",
  409. " indexes = corpusIndex.inverted_index.get(lemma)\n",
  410. " docs_amount = 0\n",
  411. " if indexes != None:\n",
  412. " docs_amount = len(indexes)\n",
  413. " N = len(corpusIndex.words_frequency)\n",
  414. " Nt = docs_amount\n",
  415. " return log(N / Nt)\n",
  416. "\n",
  417. "def construct_rsv_func(k1, b,\n",
  418. " idf_func = idf, \n",
  419. " norm_rsv = False,\n",
  420. " use_tfq = False, k2 = 100):\n",
  421. " def rsv(corpusIndex, doc_id, query_lemmas):\n",
  422. " score, idf_sum = 0.0, 0.0\n",
  423. " Ld = float(corpusIndex.words_frequency[doc_id - 1][0])\n",
  424. " _L_ = corpusIndex.average_doc_len\n",
  425. " for lemma in set(query_lemmas):\n",
  426. " if norm_rsv:\n",
  427. " idf_sum += idf(corpusIndex, lemma)\n",
  428. "\n",
  429. " f_td = corpusIndex.lemma_freq(lemma, doc_id)\n",
  430. " f_tq = query_lemmas.count(lemma)\n",
  431. " if f_td == 0:\n",
  432. " continue\n",
  433. "\n",
  434. " addition = idf_func(corpusIndex, lemma) * f_td * (k1 + 1) / (k1 * ((1 - b) + b * Ld / _L_) + f_td)\n",
  435. " if use_tfq:\n",
  436. " addition *= (k2 + 1) * f_tq / (k2 + f_tq)\n",
  437. " score += addition\n",
  438. "\n",
  439. " if norm_rsv:\n",
  440. " score /= idf_sum\n",
  441. "\n",
  442. " return score\n",
  443. " \n",
  444. " return rsv"
  445. ]
  446. },
  447. {
  448. "cell_type": "markdown",
  449. "metadata": {},
  450. "source": [
  451. "Функция для замера оценок получивсшегося ранжирования:"
  452. ]
  453. },
  454. {
  455. "cell_type": "code",
  456. "execution_count": 354,
  457. "metadata": {
  458. "collapsed": true
  459. },
  460. "outputs": [],
  461. "source": [
  462. "def test_ranking(corpus, rsv_func, extra_log_info, logging=True):\n",
  463. " disk_corpus = corpus\n",
  464. "\n",
  465. " query_list = parse_query_file(\"cran.qry\")\n",
  466. " #print(\"\\n\\n\".join(str(x) for x in query_list))\n",
  467. "\n",
  468. " search_results = map(lambda q: (q.index, disk_corpus.search_in_index(q.query, rsv_func)[:10]), query_list)\n",
  469. " with open(\"qrel_nnovik\", mode=\"w+\") as results_f:\n",
  470. " index = 1\n",
  471. " for (_, resutls) in search_results:\n",
  472. " for docId in resutls:\n",
  473. " results_f.write(\"%s %s\\n\" % (index, docId))\n",
  474. " index += 1\n",
  475. "\n",
  476. " result = print_standings(should_log = logging)\n",
  477. " return result"
  478. ]
  479. },
  480. {
  481. "cell_type": "markdown",
  482. "metadata": {},
  483. "source": [
  484. "Протестируем вначале наш индекс вместе с самой базовой BM25:"
  485. ]
  486. },
  487. {
  488. "cell_type": "code",
  489. "execution_count": 355,
  490. "metadata": {},
  491. "outputs": [
  492. {
  493. "name": "stdout",
  494. "output_type": "stream",
  495. "text": [
  496. "*** Ranking for k1 = 1.2, b = 0.75 (headers) ***\n",
  497. "Mean precision: 0.24666666666666678\n",
  498. "Mean recall: 0.3609997558574267\n",
  499. "Mean F-measure: 0.2930772645786639\n",
  500. "MAP@10: 0.28114384115786245\n",
  501. "Length of the dictionary: 1523\n",
  502. "Average list of word's positions length: 7.115561391989495\n",
  503. "Max list of word's positions length: 358\n",
  504. "\n",
  505. "\n",
  506. "*** Ranking for k1 = 1.2, b = 0.75 (annotations) ***\n",
  507. "Mean precision: 0.2942222222222223\n",
  508. "Mean recall: 0.42721996967640197\n",
  509. "Mean F-measure: 0.3484620396960764\n",
  510. "MAP@10: 0.3662481089555164\n",
  511. "Length of the dictionary: 6851\n",
  512. "Average list of word's positions length: 12.559772296015181\n",
  513. "Max list of word's positions length: 713\n"
  514. ]
  515. }
  516. ],
  517. "source": [
  518. "data_list = parse_data_file(\"cran.all.1400\")\n",
  519. "rsv = construct_rsv_func(k1 = 1.2, b = 0.75)\n",
  520. "\n",
  521. "headers_list = map(lambda x: x.header, data_list)\n",
  522. "corpus = CorpusIndex.from_corpus(headers_list)\n",
  523. "#corpus.save(\"test\")\n",
  524. "print(\"*** Ranking for k1 = 1.2, b = 0.75 (headers) ***\")\n",
  525. "ranking = test_ranking(corpus, rsv, \"headers\")\n",
  526. "corpus.print_statistics()\n",
  527. "print(\"\\n\")\n",
  528. "\n",
  529. "annotations_list = map(lambda x: x.annotation, data_list)\n",
  530. "corpus = CorpusIndex.from_corpus(annotations_list)\n",
  531. "print(\"*** Ranking for k1 = 1.2, b = 0.75 (annotations) ***\")\n",
  532. "ranking = test_ranking(corpus, rsv, \"annotations\")\n",
  533. "corpus.print_statistics()"
  534. ]
  535. },
  536. {
  537. "cell_type": "code",
  538. "execution_count": 356,
  539. "metadata": {
  540. "collapsed": true
  541. },
  542. "outputs": [],
  543. "source": [
  544. "data_list = parse_data_file(\"cran.all.1400\")\n",
  545. "annotations_list = map(lambda x: x.annotation, data_list)\n",
  546. "corpus = CorpusIndex.from_corpus(annotations_list)\n",
  547. "#headers_list = map(lambda x: x.header, data_list)\n",
  548. "#corpus = CorpusIndex.from_corpus(headers_list)"
  549. ]
  550. },
  551. {
  552. "cell_type": "markdown",
  553. "metadata": {},
  554. "source": [
  555. "В дальнейшем будем использовать инвертированный индекс, построенный по аннотациям (не по заголовкам)."
  556. ]
  557. },
  558. {
  559. "cell_type": "code",
  560. "execution_count": 337,
  561. "metadata": {},
  562. "outputs": [
  563. {
  564. "name": "stdout",
  565. "output_type": "stream",
  566. "text": [
  567. "Best f-measure: %s ((1.9, 0.7), 0.3560820379208156)\n",
  568. "Best MAP@10: %s ((2.0, 0.7), 0.37541323031270102)\n"
  569. ]
  570. }
  571. ],
  572. "source": [
  573. "from itertools import product\n",
  574. "\n",
  575. "range_k1 = [_ / 100.0 for _ in range(120, 201, 10)]\n",
  576. "range_b = [_ / 100.0 for _ in range(0, 101, 10)]\n",
  577. "\n",
  578. "best_f_measure = ((), 0.0)\n",
  579. "best_map_10 = ((), 0.0)\n",
  580. "for (k1, b) in product(range_k1, range_b):\n",
  581. " rsv = construct_rsv_func(k1, b)\n",
  582. " ranking = test_ranking(corpus, rsv, \"grid search\", False)\n",
  583. " if (ranking[2] > best_f_measure[1]):\n",
  584. " best_f_measure = ((k1, b), ranking[2])\n",
  585. " if (ranking[3] > best_map_10[1]):\n",
  586. " best_map_10 = ((k1, b), ranking[3])\n",
  587. " \n",
  588. "print(\"Best f-measure: %s\", str(best_f_measure))\n",
  589. "print(\"Best MAP@10: %s\", str(best_map_10))"
  590. ]
  591. },
  592. {
  593. "cell_type": "markdown",
  594. "metadata": {},
  595. "source": [
  596. "Из полученных результатов видно, что наилучшая ___f-measure___ достигается при $k1 = 1.9, b = 0.7$. \n",
  597. "\n",
  598. "Такой $k1(=1.9)$ говорит о том, что оптимальный ранг сильно зависит от того, насколько часто терм запроса встречается в документах.\n",
  599. "\n",
  600. "Такой $b(=0.7)$ говорит о том, что есть корелляция между весом терма запроса для данного документа и длиной документа. То есть, вес терма запроса для данного документа существенно зависит от длины документа."
  601. ]
  602. },
  603. {
  604. "cell_type": "code",
  605. "execution_count": 338,
  606. "metadata": {
  607. "collapsed": true
  608. },
  609. "outputs": [],
  610. "source": [
  611. "best_k1 = 1.9\n",
  612. "best_b = 0.7"
  613. ]
  614. },
  615. {
  616. "cell_type": "code",
  617. "execution_count": 339,
  618. "metadata": {},
  619. "outputs": [
  620. {
  621. "name": "stdout",
  622. "output_type": "stream",
  623. "text": [
  624. "Mean precision: 0.30177777777777776\n",
  625. "Mean recall: 0.4345521020216618\n",
  626. "Mean F-measure: 0.3561940681056661\n",
  627. "MAP@10: 0.3744629405671733\n"
  628. ]
  629. }
  630. ],
  631. "source": [
  632. "rsv = construct_rsv_func(best_k1, best_b, idf_func = idf_simple)\n",
  633. "ranking = test_ranking(corpus, rsv, \"New idf\")"
  634. ]
  635. },
  636. {
  637. "cell_type": "markdown",
  638. "metadata": {},
  639. "source": [
  640. "При достаточно большом кол-ве текстов две формулы вычисления $IDF$ отличаются минимально. \n",
  641. "Поэтому и результат поменялся незначительно."
  642. ]
  643. },
  644. {
  645. "cell_type": "code",
  646. "execution_count": 343,
  647. "metadata": {},
  648. "outputs": [
  649. {
  650. "name": "stdout",
  651. "output_type": "stream",
  652. "text": [
  653. "Mean precision: 0.3017777777777778\n",
  654. "Mean recall: 0.4342187686883285\n",
  655. "Mean F-measure: 0.3560820379208156\n",
  656. "MAP@10: 0.37417475714565657\n"
  657. ]
  658. }
  659. ],
  660. "source": [
  661. "rsv = construct_rsv_func(best_k1, best_b, norm_rsv = True)\n",
  662. "ranking = test_ranking(corpus, rsv, \"Norm RSV\")"
  663. ]
  664. },
  665. {
  666. "cell_type": "markdown",
  667. "metadata": {
  668. "collapsed": true
  669. },
  670. "source": [
  671. "Поскольку сумма IDF термов запроса одинакова для всех RSV(q, d), то нормирование по этой сумме не должно повлиять на результат (однако, меняет его минимально)."
  672. ]
  673. },
  674. {
  675. "cell_type": "code",
  676. "execution_count": 341,
  677. "metadata": {},
  678. "outputs": [
  679. {
  680. "name": "stdout",
  681. "output_type": "stream",
  682. "text": [
  683. "*** Ranking for k2 = 0 (annotations) ***\n",
  684. "Mean precision: 0.3017777777777778\n",
  685. "Mean recall: 0.4342187686883285\n",
  686. "Mean F-measure: 0.3560820379208156\n",
  687. "MAP@10: 0.37417475714565657\n",
  688. "*** Ranking for k2 = 5 (annotations) ***\n",
  689. "Mean precision: 0.3031111111111111\n",
  690. "Mean recall: 0.4371427706123304\n",
  691. "Mean F-measure: 0.3579929377904864\n",
  692. "MAP@10: 0.37369987682315736\n",
  693. "*** Ranking for k2 = 10 (annotations) ***\n",
  694. "Mean precision: 0.30266666666666664\n",
  695. "Mean recall: 0.4368958570320835\n",
  696. "Mean F-measure: 0.35760009056986974\n",
  697. "MAP@10: 0.37282727247277503\n",
  698. "*** Ranking for k2 = 50 (annotations) ***\n",
  699. "Mean precision: 0.30266666666666664\n",
  700. "Mean recall: 0.4368958570320835\n",
  701. "Mean F-measure: 0.35760009056986974\n",
  702. "MAP@10: 0.3737362230900591\n",
  703. "*** Ranking for k2 = 100 (annotations) ***\n",
  704. "Mean precision: 0.30266666666666664\n",
  705. "Mean recall: 0.4368958570320835\n",
  706. "Mean F-measure: 0.35760009056986974\n",
  707. "MAP@10: 0.3736447327902354\n",
  708. "*** Ranking for k2 = 500 (annotations) ***\n",
  709. "Mean precision: 0.30266666666666664\n",
  710. "Mean recall: 0.4368958570320835\n",
  711. "Mean F-measure: 0.35760009056986974\n",
  712. "MAP@10: 0.3736447327902354\n",
  713. "*** Ranking for k2 = 1000 (annotations) ***\n",
  714. "Mean precision: 0.30266666666666664\n",
  715. "Mean recall: 0.4368958570320835\n",
  716. "Mean F-measure: 0.35760009056986974\n",
  717. "MAP@10: 0.3736447327902354\n"
  718. ]
  719. }
  720. ],
  721. "source": [
  722. "k2_values = [0, 5, 10, 50, 100, 500, 1000]\n",
  723. "\n",
  724. "for _k2 in k2_values:\n",
  725. " rsv = construct_rsv_func(best_k1, best_b, use_tfq = True, k2 = _k2)\n",
  726. " print(\"*** Ranking for k2 = %s (annotations) ***\" % _k2)\n",
  727. " ranking = test_ranking(corpus, rsv, \"Use TFQ\", True)\n",
  728. " if (ranking[2] > best_f_measure[1]):\n",
  729. " best_f_measure = ((k1, b), ranking[2])"
  730. ]
  731. },
  732. {
  733. "cell_type": "markdown",
  734. "metadata": {
  735. "collapsed": true
  736. },
  737. "source": [
  738. "k2 = 5 оказался наилучшим вариантом для улучшения ___f-measure___. Это означает, что:\n",
  739. "1. у нас имеются запросы с повторящимися термами;\n",
  740. "2. если терм запроса повторяется несколько раз, то это должно это значит, что ранг данного документа должен быть немножечко увеличен для лучших результатов"
  741. ]
  742. }
  743. ],
  744. "metadata": {
  745. "kernelspec": {
  746. "display_name": "Python 3",
  747. "language": "python",
  748. "name": "python3"
  749. },
  750. "language_info": {
  751. "codemirror_mode": {
  752. "name": "ipython",
  753. "version": 3
  754. },
  755. "file_extension": ".py",
  756. "mimetype": "text/x-python",
  757. "name": "python",
  758. "nbconvert_exporter": "python",
  759. "pygments_lexer": "ipython3",
  760. "version": "3.6.1"
  761. }
  762. },
  763. "nbformat": 4,
  764. "nbformat_minor": 2
  765. }