Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- require 'nokogiri'
- require 'activesupport'
- include ActiveSupport::CoreExtensions::Hash
- require 'progressbar'
- require 'term/ansicolor'
- require 'ruby-prof'
# Mix the Term::ANSIColor helpers into String so that string literals respond
# to colour methods such as `green` and `red`, which the output code in this
# file calls. `send` is used so the mix-in also works on Ruby versions where
# Module#include is private.
String.send(:include, Term::ANSIColor)
# Interactive keyword search over a pre-built inverted index.
#
# Construction reads the document catalogue from ../db/index/catalogue.yml and
# the inverted index from ../db/index/inverted_index.xml; both paths are
# relative, so they resolve against the current working directory.
#
# Output is coloured with String#green / String#red, so the Term::ANSIColor
# mix-in on String must be loaded before any query is answered.
class Pythia
  # index maps a keyword (String) to an array of [document_id, weight] pairs.
  # url_store is exposed for callers but is never assigned inside this class.
  attr_accessor :url_store, :index

  # Loads the catalogue and the inverted index, printing timing information.
  def initialize
    puts "Initializing Pythia Search Engine....."
    start = Time.now
    @document_list = YAML.load_file('../db/index/catalogue.yml')
    puts "loading index"
    @index = load_inverted_index
    puts "Time elapsed to Initialize Pythia Search Engine: #{Time.now - start} seconds"
    @results = {}
  end

  # Entry point: runs the interactive prompt.
  def start
    search
  end

  # Reads queries line by line and answers each one until input is exhausted.
  #
  # Kernel#gets returns nil at end of input (Ctrl-D, closed pipe); the loop
  # stops there instead of raising NoMethodError on nil.
  def search
    puts "Welcome to Pythia"
    puts "Ask anything:"
    loop do
      line = gets
      break if line.nil?

      # NOTE(review): the original carried a disabled lemmatisation step here
      # (query.map { |word| word.lematize }); it is still not applied.
      results(line.chomp.split)
    end
  end

  # Scores every document that matches any keyword and prints the ranking.
  #
  # @param query [Array<String>] the keywords of one query
  def results(query)
    start = Time.now
    query.each do |keyword|
      postings = postings_for(keyword)
      if postings.empty?
        puts "Nothing Found for the word: #{keyword.to_s.green}"
      else
        postings.each { |posting| find_docs_weight(posting) }
      end
      puts " "
    end
    print_results(query)
    puts "Time elapsed to print results is #{Time.now - start} \n "
    puts "Ask something else..."
  end

  # Adds one posting's weight to the running score of its document.
  #
  # @param element [Array] a [document_id, weight] pair from the index
  # @return [Hash] the accumulated scores, keyed by catalogue entry
  def find_docs_weight(element)
    doc_id = find_document_id(element[0])
    # to_f on the stored value mirrors the original accumulation exactly.
    @results[doc_id] = @results.fetch(doc_id, 0.0).to_f + element[1].to_f
    @results
  end

  # Prints the accumulated scores, highest weight first, then resets them so
  # the next query starts from an empty score table.
  #
  # @param query [Array<String>] the keywords, echoed back in the summary line
  def print_results(query)
    puts "#{'We found:'.green}#{@results.size} #{'pages for your query:'.green} #{query.join(" ")}"
    @results.sort_by { |_title, weight| weight }.reverse_each do |title, weight|
      puts "#{'Title:'.green} #{title} #{'Weight:'.red} #{weight} "
    end
    clear_results
  end

  # Discards the scores accumulated for the current query.
  def clear_results
    @results = {}
  end

  # Looks up the catalogue entry for a document id taken from the index.
  #
  # @param num [#to_i] the document id as stored in the index
  # @return the catalogue entry, or nil when the id is unknown
  def find_document_id(num)
    @document_list[num.to_i]
  end

  # Parses the inverted index XML into a keyword => postings hash.
  #
  # Each `lemma` node is converted with Hash.from_xml; its "name" becomes the
  # key and each "document" entry contributes an [id, weight] pair. A lemma
  # with a single document yields a Hash rather than an Array, and a lemma
  # without documents yields nil; both shapes are normalised below.
  #
  # @return [Hash{String => Array<Array>}] a hash whose default block creates
  #   an empty posting list for unknown keys
  def load_inverted_index
    start = Time.now
    # Block form guarantees the file handle is closed once parsing is done.
    doc = File.open("../db/index/inverted_index.xml", "r") do |file|
      puts "Time elapsed to load the xml file is #{Time.now - start} seconds"
      start = Time.now
      parsed = Nokogiri::XML.parse(file)
      puts "Time elapsed parse the xml file with nokogiri is #{Time.now - start} seconds"
      parsed
    end

    dictionary = Hash.new { |hash, key| hash[key] = [] }
    doc.search('//lemma').each do |node|
      lemma = Hash.from_xml(node.to_xml)['lemma']
      documents = lemma['document']
      # Not Array(documents): that would explode a single Hash into pairs.
      documents = [documents] unless documents.is_a?(Array)
      documents.compact.each do |document|
        dictionary[lemma['name'].to_s] << [document['id'], document['weight']]
      end
    end
    dictionary
  end

  private

  # Returns the postings for a keyword without touching the index.
  # Reading @index[keyword] directly would trigger the index's default block
  # and permanently insert an empty entry for every unknown query word.
  def postings_for(keyword)
    @index.key?(keyword) ? Array(@index[keyword]) : []
  end
end
# Build the engine (this loads the catalogue and the inverted index) and hand
# control to its interactive prompt.
pythia = Pythia.new
pythia.start
Add Comment
Please, Sign In to add comment