Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Example inverted index: each stemmed token maps to the sorted ids
# of the documents containing it. (The original ended with a stray
# `})`, which is a syntax error for a bare hash literal.)
inverted_index = {
  "new"      => [1, 4],       # These are document ids.
  "home"     => [1, 2, 3, 4], # The key is stemmed and some stop-words
  "sale"     => [1, 2, 3, 4], # are being removed.
  "top"      => [1],
  "forecast" => [1],
  "rise"     => [2, 4],
  "juli"     => [2, 3, 4],
  "increas"  => [3]
}
module Rankrb
  # A single searchable document: an id, the raw body text, and a
  # mutable rank that gets assigned during scoring.
  class Document
    attr_accessor :id, :body, :rank

    def initialize(params = {})
      @id   = params.fetch(:id, nil)
      @body = params.fetch(:body, '')
      @rank = params.fetch(:rank, nil)
    end

    # Length (in characters) of the normalized, tokenized body.
    def length
      tokens.join(' ').size
    end

    # True when the stemmed form of +term+ occurs in the body.
    def include?(term)
      tokens.include?(term_to_token(term))
    end

    # Occurrence count of the stemmed +term+ in the body.
    def term_freq(term)
      tokens.count(term_to_token(term))
    end

    # Stemmed, stop-word-filtered tokens of the body.
    def tokens
      Rankrb::Tokenizer.new(@body).tokenize
    end

    # Tokens with duplicates removed (used when building the index).
    def uniq_tokens
      tokens.uniq
    end

    private

    # Runs a raw term through the same tokenizer pipeline as document
    # bodies, so queries and documents agree on normalization.
    def term_to_token(term)
      Rankrb::Tokenizer.new(term).tokenize.first
    end
  end
end
module Rankrb
  # The same tokenizer should be used for document
  # tokenization and query tokenization to ensure that
  # the same terms are being searched and returned.
  class Tokenizer
    attr_accessor :str
    attr_reader :tokens

    def initialize(str = '')
      @str       = str
      @tokens    = []
      @stopwords = Rankrb.configuration.stopwords
      @lang      = Rankrb.configuration.language
    end

    # Strips punctuation, downcases, splits on whitespace, removes
    # stop-words, then stems each remaining word.
    # Returns the token array (also memoized in @tokens).
    def tokenize
      # Keep whitespace, alphanumerics and CJK/Hangul characters; strip
      # everything else. NOTE: the pasted original had lost its regex
      # escapes ("sp{Alnum}" instead of "\s\p{Alnum}"), which made it
      # delete literal 's', 'p', braces and the class names from input.
      regex = /[^\s\p{Alnum}\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}]/
      @tokens = @str.gsub(regex, '')
                    .downcase
                    .split
                    .delete_if { |token| @stopwords.include?(token) }
                    .map { |w| Lingua.stemmer(w, :language => @lang) }
      @tokens
    end
  end
end
module Rankrb
  # A collection of documents plus the current query string;
  # implements BM25+ scoring over its documents.
  class Collection
    attr_accessor :query, :docs

    def initialize(params = {})
      @docs  = params.fetch(:docs, [])
      @query = params.fetch(:query, nil)
      # Singleton alias so `coll.docs << doc` keeps working even if
      # the underlying storage changes.
      def @docs.<<(arg)
        self.push arg
      end
    end

    def remove_doc(doc)
      @docs.delete_if { |curr_doc| curr_doc == doc }
    end

    # Number of documents containing the (stemmed) term.
    def containing_term(term)
      @docs.count { |doc| doc.include?(term) }
    end

    # Average document length, as a Float. The original used integer
    # division, truncating the average and skewing every BM25 score.
    def avg_dl
      return 0.0 if @docs.empty?
      @docs.map(&:length).inject(:+).to_f / total_docs
    end

    def total_docs
      @docs.size
    end

    # Inverse document frequency with the standard +0.5 smoothing.
    def idf(term)
      numerator   = total_docs - containing_term(term) + 0.5
      denominator = containing_term(term) + 0.5
      Math.log(numerator / denominator)
    end

    # Scores every document against @query with BM25+ (k, b, delta)
    # and returns the documents ranked best-first.
    # (The original sorted ascending, returning the worst match first.)
    def bm25(params = { :k => 1.2, :b => 0.75, :delta => 1.0 })
      @k     = params[:k]
      @b     = params[:b]
      @delta = params[:delta]
      @docs.each do |doc|
        score = 0
        query_terms = @query.split
        query_terms.each do |term|
          dtf = doc.term_freq(term)
          numerator   = dtf * (@k + 1)
          denominator = dtf + @k * (1 - @b + @b * (doc.length / avg_dl))
          score += idf(term) * (numerator / denominator) + @delta
        end
        doc.rank = score
      end
      @docs.sort { |a, b| b.rank <=> a.rank }
    end
  end
end
- module Rankrb
- class InvertedIndex
- attr_accessor :collection, :iidx
- def initialize(params={})
- @collection = params.fetch(:collection, Rankrb::Collection.new)
- @index_file = 'db/index.json'
- @iidx = Hash.new
- end
- def build
- @collection.docs.each do |doc|
- # Make the inverted index hash
- doc.uniq_tokens.each do |token|
- if @iidx[token]
- @iidx[token] << doc.id
- else
- @iidx[token] = [doc.id]
- end
- end
- end
- # Now sort the document ids and return the inverted index!
- @iidx.each {|k, v| @iidx[k] = v.sort}
- end
- def remove_doc(doc)
- doc.tokens.each do |token|
- # Remove the document id
- @iidx[token].delete(doc.id)
- # Then remove the key from the hash if
- # there are no more docs.
- @iidx.delete(token) if @iidx[token].empty?
- end
- # Once all tokens have been removed,
- # remove the document from the collection.
- @collection.remove_doc(doc)
- @iidx
- end
- # Returns an array of document ids.
- def find(str)
- Rankrb::Tokenizer.new(str)
- .tokenize
- .map {|token| @iidx[token]}
- .compact
- .flatten
- .uniq
- .sort
- end
- # Define query_or and query_and methods.
- %w(and or).each do |op|
- define_method("query_#{op}") do |word_ary|
- doc_ids = Array.new
- word_ary.each {|word| doc_ids << find(word) }
- case op
- when 'and'
- symbol = :&
- when 'or'
- symbol = :|
- end
- doc_ids.inject(symbol)
- end
- end
- def commit!
- if File.exist?(@index_file)
- file = File.read @index_file
- # Merge the new tokens
- index = JSON.parse(file).merge(@iidx)
- File.open(@index_file, 'w+') { |f| f.write(index.to_json) }
- else
- # Create & write to file for the first time
- File.open(@index_file, 'w') { |f| f.write(@iidx) }
- end
- end
- end
- end
# Build a tiny corpus, index it, and run a sample query.
docs = [
  Rankrb::Document.new(body: "new home sales top forecasts", id: 1),
  Rankrb::Document.new(body: "home sales rise in july", id: 2),
  Rankrb::Document.new(body: "increase in home sales in july", id: 3),
  Rankrb::Document.new(body: "july new home sales rise", id: 4)
]
coll = Rankrb::Collection.new docs: docs
index = Rankrb::InvertedIndex.new collection: coll
index.build                # Inverted-index gets built and stored into @iidx
index.find('top sales')    # => [1, 2, 3, 4]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement