Advertisement
Guest User

Untitled

a guest
Aug 2nd, 2015
188
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.39 KB | None | 0 0
  # Example of an inverted index: each (stemmed) term maps to the sorted
  # ids of the documents that contain it.
  inverted_index = {
    "new" => [1, 4],          # These are document ids.
    "home" => [1, 2, 3, 4],   # The key is stemmed and some stop-words
    "sale" => [1, 2, 3, 4],   # are being removed.
    "top" => [1],
    "forecast" => [1],
    "rise" => [2, 4],
    "juli" => [2, 3, 4],
    "increas" => [3]
  }
  11.  
  module Rankrb
    # One searchable document: an id, the raw text body and the rank most
    # recently assigned to it. Tokens are derived from the body on demand
    # through Rankrb::Tokenizer, so they always reflect the current body.
    class Document
      attr_accessor :id, :body, :rank

      # params - Hash of optional attributes:
      #          :id   (default nil)
      #          :body (default '')
      #          :rank (default nil)
      def initialize(params={})
        @id   = params.fetch(:id)   { nil }
        @body = params.fetch(:body) { '' }
        @rank = params.fetch(:rank) { nil }
      end

      # Number of characters in the tokenized body once the tokens are
      # joined by single spaces.
      def length
        joined = tokens.join(' ')
        joined.size
      end

      # True when the body contains the token that +term+ normalizes to.
      def include?(term)
        doc_tokens = tokens
        doc_tokens.include?(term_to_token(term))
      end

      # How many times the token for +term+ occurs in the body.
      def term_freq(term)
        doc_tokens = tokens
        needle = term_to_token(term)
        doc_tokens.count { |token| token == needle }
      end

      # Every token of the body, in order, duplicates included.
      def tokens
        tokenizer = Rankrb::Tokenizer.new(@body)
        tokenizer.tokenize
      end

      # The distinct tokens of the body, in order of first appearance.
      def uniq_tokens
        tokens.uniq
      end

      private

      # Runs +term+ through the same tokenizer as the body and keeps the
      # first resulting token (nil when nothing survives tokenization).
      def term_to_token(term)
        Rankrb::Tokenizer.new(term).tokenize.first
      end
    end
  end
  48.  
  module Rankrb
    # The same tokenizer should be used for document
    # tokenization and query tokenization to ensure that
    # the same terms are being searched and returned.
    class Tokenizer
      # Matches every character that is NOT whitespace, alphanumeric, or a
      # Han / Katakana / Hiragana / Hangul character. The backslashes are
      # essential: without them the class would list the literal letters
      # "s", "p", "{", "A", ... and strip nearly all of the input.
      NON_TOKEN_CHARS = /[^\s\p{Alnum}\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}]/

      attr_accessor :str
      attr_reader :tokens

      # str - the text to tokenize (default '').
      #
      # Stop-words and stemming language are read from Rankrb.configuration
      # at construction time.
      def initialize(str='')
        @str = str
        @tokens = []
        @stopwords = Rankrb.configuration.stopwords
        @lang = Rankrb.configuration.language
      end

      # Splits the text into normalized tokens:
      #   1. drop punctuation/symbols (they are removed, not replaced by a
      #      space, so "top-sales" becomes "topsales"),
      #   2. downcase and split on whitespace,
      #   3. discard stop-words,
      #   4. stem each remaining word with Lingua.stemmer.
      #
      # Returns the Array of tokens, which is also stored in #tokens.
      # A nil +str+ is treated as an empty string.
      def tokenize
        @tokens = @str.to_s
                      .gsub(NON_TOKEN_CHARS, '')
                      .downcase
                      .split
                      .reject { |token| @stopwords.include?(token) }
                      .map { |word| Lingua.stemmer(word, :language => @lang) }
      end
    end
  end
  75.  
  module Rankrb
    # A set of documents together with the query used to rank them.
    # Documents are expected to respond to #length, #include?(term),
    # #term_freq(term) and #rank=.
    class Collection
      attr_accessor :query, :docs

      # BM25 tuning values used for every key the caller does not supply.
      BM25_DEFAULTS = {:k => 1.2, :b => 0.75, :delta => 1.0}.freeze

      # params - Hash of optional attributes:
      #          :docs  - Array of documents (default [])
      #          :query - query String (default nil)
      def initialize(params={})
        @docs = params.fetch(:docs, [])
        @query = params.fetch(:query, nil)
      end

      # Removes every document equal to +doc+. Returns the docs Array.
      def remove_doc(doc)
        @docs.delete_if { |curr_doc| curr_doc == doc }
      end

      # Number of documents that contain +term+.
      def containing_term(term)
        @docs.count { |doc| doc.include?(term) }
      end

      # Average document length as a Float; 0.0 for an empty collection.
      # Float division is deliberate: integer division would truncate the
      # average and distort the length normalization in #bm25.
      def avg_dl
        return 0.0 if @docs.empty?
        @docs.map(&:length).inject(0, :+).to_f / total_docs
      end

      # Number of documents in the collection.
      def total_docs
        @docs.size
      end

      # Inverse document frequency of +term+:
      #   ln((N - n + 0.5) / (n + 0.5))
      # where N is the number of documents and n the number containing
      # the term. Negative when more than half the documents contain it.
      def idf(term)
        matching = containing_term(term)
        Math.log((total_docs - matching + 0.5) / (matching + 0.5))
      end

      # Scores every document against the query with BM25+ and stores the
      # score in each document's #rank.
      #
      # params - Hash of tuning values; missing keys fall back to
      #          BM25_DEFAULTS:
      #          :k     - term-frequency saturation
      #          :b     - strength of the document-length normalization
      #          :delta - lower bound added to the term-frequency component
      #                   before it is weighted by IDF
      #
      # Returns a new Array of the documents sorted by ascending rank.
      # NOTE(review): ascending order puts the best match last - confirm
      # that callers expect this.
      def bm25(params={:k => 1.2, :b => 0.75, :delta => 1.0})
        settings = BM25_DEFAULTS.merge(params)
        @k = settings[:k]
        @b = settings[:b]
        @delta = settings[:delta]

        # Everything below is independent of the individual document, so it
        # is computed once instead of once per (document, term) pair.
        query_terms = @query.to_s.split
        mean_length = avg_dl.to_f
        term_idf = {}
        query_terms.each { |term| term_idf[term] ||= idf(term) }

        @docs.each do |doc|
          length_ratio = mean_length.zero? ? 0.0 : doc.length / mean_length
          normalizer = @k * (1 - @b + @b * length_ratio)

          doc.rank = query_terms.inject(0) do |score, term|
            dtf = doc.term_freq(term)
            denominator = dtf + normalizer
            saturation = denominator.zero? ? 0.0 : dtf * (@k + 1) / denominator
            # delta belongs inside the IDF product; added outside it would
            # shift every document equally and never affect the ordering.
            score + term_idf[term] * (saturation + @delta)
          end
        end

        @docs.sort { |left, right| left.rank <=> right.rank }
      end
    end
  end
  135.  
  module Rankrb
    # Maps each token to the sorted ids of the documents containing it and
    # can persist that mapping as JSON.
    class InvertedIndex
      attr_accessor :collection, :iidx

      # params - Hash of optional attributes:
      #          :collection - the collection to index (a new, empty
      #                        Rankrb::Collection when omitted)
      #          :index_file - path of the JSON file used by #commit!
      #                        (default 'db/index.json')
      def initialize(params={})
        # Block form so the default collection is only built when needed.
        @collection = params.fetch(:collection) { Rankrb::Collection.new }
        @index_file = params.fetch(:index_file, 'db/index.json')
        @iidx = {}
      end

      # Adds every document of the collection to the index.
      # Calling it again does not duplicate ids.
      #
      # Returns the index Hash (token => sorted Array of document ids).
      def build
        @collection.docs.each do |doc|
          doc.uniq_tokens.each do |token|
            (@iidx[token] ||= []) << doc.id
          end
        end
        # Now sort the document ids and return the inverted index!
        @iidx.each_key { |token| @iidx[token] = @iidx[token].uniq.sort }
        @iidx
      end

      # Removes +doc+ from the index and from the collection.
      #
      # Tokens are de-duplicated first: once a token's last id is removed
      # its key is deleted, so visiting the same token twice would hit a
      # missing entry. Tokens that were never indexed are skipped.
      #
      # Returns the index Hash.
      def remove_doc(doc)
        doc.tokens.uniq.each do |token|
          postings = @iidx[token]
          next unless postings
          # Remove the document id, then drop the key if
          # there are no more docs.
          postings.delete(doc.id)
          @iidx.delete(token) if postings.empty?
        end
        # Once all tokens have been removed,
        # remove the document from the collection.
        @collection.remove_doc(doc)
        @iidx
      end

      # Returns a sorted array of the ids of all documents containing at
      # least one token of +str+ (tokenized with Rankrb::Tokenizer).
      def find(str)
        Rankrb::Tokenizer.new(str)
          .tokenize
          .flat_map { |token| @iidx[token] || [] }
          .uniq
          .sort
      end

      # Define query_and and query_or.
      #
      # Both take an array of words and look each one up with #find;
      # query_and intersects the per-word results, query_or unions them.
      # An empty word array yields [].
      { 'and' => :&, 'or' => :| }.each do |op, operator|
        define_method("query_#{op}") do |word_ary|
          word_ary.map { |word| find(word) }.inject(operator) || []
        end
      end

      # Writes the index to the index file as JSON.
      #
      # When the file already exists its contents are merged with the
      # in-memory index first; for a token present in both, the in-memory
      # id list replaces the stored one (Hash#merge semantics).
      # The file's directory must already exist.
      def commit!
        index = @iidx
        if File.exist?(@index_file)
          # Merge the new tokens
          index = JSON.parse(File.read(@index_file)).merge(@iidx)
        end
        # Always serialize as JSON so the next commit can parse the file.
        File.open(@index_file, 'w') { |f| f.write(index.to_json) }
      end

    end
  end
  215.  
  # Example usage: index four short documents, then search them.
  bodies = [
    "new home sales top forecasts",
    "home sales rise in july",
    "increase in home sales in july",
    "july new home sales rise"
  ]
  # Ids are assigned 1..4 in the order the bodies are listed.
  d1, d2, d3, d4 = bodies.each_with_index.map { |text, position|
    Rankrb::Document.new(body: text, id: position + 1)
  }
  coll = Rankrb::Collection.new(docs: [d1, d2, d3, d4])
  index = Rankrb::InvertedIndex.new(collection: coll)
  index.build # Inverted-index gets built and stored into @iidx

  index.find('top sales') # => [1, 2, 3, 4]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement