Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Example inverted index: each stemmed token maps to the sorted ids
# of the documents containing it. (The original ended with a stray
# `})`, which is a syntax error for a bare hash literal.)
inverted_index = {
  "new"      => [1, 4],       # These are document ids.
  "home"     => [1, 2, 3, 4], # The key is stemmed and some stop-words
  "sale"     => [1, 2, 3, 4], # are being removed.
  "top"      => [1],
  "forecast" => [1],
  "rise"     => [2, 4],
  "juli"     => [2, 3, 4],
  "increas"  => [3]
}
module Rankrb
  # A single searchable document: an id, the raw body text, and a
  # mutable rank that gets assigned during scoring.
  class Document
    attr_accessor :id, :body, :rank

    def initialize(params = {})
      @id   = params.fetch(:id, nil)
      @body = params.fetch(:body, '')
      @rank = params.fetch(:rank, nil)
    end

    # Length (in characters) of the normalized, tokenized body.
    def length
      tokens.join(' ').size
    end

    # True when the stemmed form of +term+ occurs in the body.
    def include?(term)
      tokens.include?(term_to_token(term))
    end

    # Occurrence count of the stemmed +term+ in the body.
    def term_freq(term)
      tokens.count(term_to_token(term))
    end

    # Stemmed, stop-word-filtered tokens of the body.
    def tokens
      Rankrb::Tokenizer.new(@body).tokenize
    end

    # Tokens with duplicates removed (used when building the index).
    def uniq_tokens
      tokens.uniq
    end

    private

    # Runs a raw term through the same tokenizer pipeline as document
    # bodies, so queries and documents agree on normalization.
    def term_to_token(term)
      Rankrb::Tokenizer.new(term).tokenize.first
    end
  end
end
module Rankrb
  # The same tokenizer should be used for document
  # tokenization and query tokenization to ensure that
  # the same terms are being searched and returned.
  class Tokenizer
    attr_accessor :str
    attr_reader :tokens

    def initialize(str = '')
      @str       = str
      @tokens    = []
      @stopwords = Rankrb.configuration.stopwords
      @lang      = Rankrb.configuration.language
    end

    # Strips punctuation, downcases, splits on whitespace, removes
    # stop-words, then stems each remaining word.
    # Returns the token array (also memoized in @tokens).
    def tokenize
      # Keep whitespace, alphanumerics and CJK/Hangul characters; strip
      # everything else. NOTE: the pasted original had lost its regex
      # escapes ("sp{Alnum}" instead of "\s\p{Alnum}"), which made it
      # delete literal 's', 'p', braces and the class names from input.
      regex = /[^\s\p{Alnum}\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}]/
      @tokens = @str.gsub(regex, '')
                    .downcase
                    .split
                    .delete_if { |token| @stopwords.include?(token) }
                    .map { |w| Lingua.stemmer(w, :language => @lang) }
      @tokens
    end
  end
end
module Rankrb
  # A collection of documents plus the current query string;
  # implements BM25+ scoring over its documents.
  class Collection
    attr_accessor :query, :docs

    def initialize(params = {})
      @docs  = params.fetch(:docs, [])
      @query = params.fetch(:query, nil)
      # Singleton alias so `coll.docs << doc` keeps working even if
      # the underlying storage changes.
      def @docs.<<(arg)
        self.push arg
      end
    end

    def remove_doc(doc)
      @docs.delete_if { |curr_doc| curr_doc == doc }
    end

    # Number of documents containing the (stemmed) term.
    def containing_term(term)
      @docs.count { |doc| doc.include?(term) }
    end

    # Average document length, as a Float. The original used integer
    # division, truncating the average and skewing every BM25 score.
    def avg_dl
      return 0.0 if @docs.empty?
      @docs.map(&:length).inject(:+).to_f / total_docs
    end

    def total_docs
      @docs.size
    end

    # Inverse document frequency with the standard +0.5 smoothing.
    def idf(term)
      numerator   = total_docs - containing_term(term) + 0.5
      denominator = containing_term(term) + 0.5
      Math.log(numerator / denominator)
    end

    # Scores every document against @query with BM25+ (k, b, delta)
    # and returns the documents ranked best-first.
    # (The original sorted ascending, returning the worst match first.)
    def bm25(params = { :k => 1.2, :b => 0.75, :delta => 1.0 })
      @k     = params[:k]
      @b     = params[:b]
      @delta = params[:delta]
      @docs.each do |doc|
        score = 0
        query_terms = @query.split
        query_terms.each do |term|
          dtf = doc.term_freq(term)
          numerator   = dtf * (@k + 1)
          denominator = dtf + @k * (1 - @b + @b * (doc.length / avg_dl))
          score += idf(term) * (numerator / denominator) + @delta
        end
        doc.rank = score
      end
      @docs.sort { |a, b| b.rank <=> a.rank }
    end
  end
end
- module Rankrb
- class InvertedIndex
- attr_accessor :collection, :iidx
- def initialize(params={})
- @collection = params.fetch(:collection, Rankrb::Collection.new)
- @index_file = 'db/index.json'
- @iidx = Hash.new
- end
- def build
- @collection.docs.each do |doc|
- # Make the inverted index hash
- doc.uniq_tokens.each do |token|
- if @iidx[token]
- @iidx[token] << doc.id
- else
- @iidx[token] = [doc.id]
- end
- end
- end
- # Now sort the document ids and return the inverted index!
- @iidx.each {|k, v| @iidx[k] = v.sort}
- end
- def remove_doc(doc)
- doc.tokens.each do |token|
- # Remove the document id
- @iidx[token].delete(doc.id)
- # Then remove the key from the hash if
- # there are no more docs.
- @iidx.delete(token) if @iidx[token].empty?
- end
- # Once all tokens have been removed,
- # remove the document from the collection.
- @collection.remove_doc(doc)
- @iidx
- end
- # Returns an array of document ids.
- def find(str)
- Rankrb::Tokenizer.new(str)
- .tokenize
- .map {|token| @iidx[token]}
- .compact
- .flatten
- .uniq
- .sort
- end
- # Define query_or and query_and methods.
- %w(and or).each do |op|
- define_method("query_#{op}") do |word_ary|
- doc_ids = Array.new
- word_ary.each {|word| doc_ids << find(word) }
- case op
- when 'and'
- symbol = :&
- when 'or'
- symbol = :|
- end
- doc_ids.inject(symbol)
- end
- end
- def commit!
- if File.exist?(@index_file)
- file = File.read @index_file
- # Merge the new tokens
- index = JSON.parse(file).merge(@iidx)
- File.open(@index_file, 'w+') { |f| f.write(index.to_json) }
- else
- # Create & write to file for the first time
- File.open(@index_file, 'w') { |f| f.write(@iidx) }
- end
- end
- end
- end
# Build a tiny corpus, index it, and run a sample query.
docs = [
  Rankrb::Document.new(body: "new home sales top forecasts", id: 1),
  Rankrb::Document.new(body: "home sales rise in july", id: 2),
  Rankrb::Document.new(body: "increase in home sales in july", id: 3),
  Rankrb::Document.new(body: "july new home sales rise", id: 4)
]
coll = Rankrb::Collection.new docs: docs
index = Rankrb::InvertedIndex.new collection: coll
index.build                # Inverted-index gets built and stored into @iidx
index.find('top sales')    # => [1, 2, 3, 4]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement