Guest User

Untitled

a guest
Jan 13th, 2018
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.95 KB | None | 0 0
  1. require 'nokogiri'
  2. require 'activesupport'
  3. include ActiveSupport::CoreExtensions::Hash
  4. require 'progressbar'
  5. require 'term/ansicolor'
  6. require 'ruby-prof'
  7.  
  8. class String
  9. include Term::ANSIColor
  10. end
  11.  
  12. class Pythia
  13.  
  14. attr_accessor :url_store, :index
  15.  
  16. def initialize
  17. puts "Initializing Pythia Search Engine....."
  18. start = Time.now
  19. @document_list = YAML.load_file('../db/index/catalogue.yml')
  20. @index = {}
  21. puts "loading index"
  22. @index = load_inverted_index
  23. puts "Time elapsed to Initialize Pythia Search Engine: #{Time.now - start} seconds"
  24. @results = {}
  25. end
  26.  
  27. def start
  28. search
  29. end
  30.  
  31. def search
  32. puts "Welcome to Pythia"
  33. puts "Ask anything:"
  34. while true
  35. query = gets.chomp.split
  36. #query = query.map{|word| word.lematize}
  37. results(query)
  38. end
  39. end
  40.  
  41. def results(query)
  42. start = Time.now
  43. query.each do |keyword|
  44. str = "#{keyword}"
  45. docs_ids = @index[keyword]
  46. if !(docs_ids == [])
  47. docs_ids.each do |obj|
  48. find_docs_weight(obj)
  49. end
  50. else
  51. puts "Nothing Found for the word: #{str.green}"
  52. end
  53. puts " "
  54. end
  55. print_results(query)
  56. puts "Time elapsed to print results is #{Time.now - start} \n "
  57. puts "Ask something else..."
  58. end
  59.  
  60. def find_docs_weight(element)
  61. doc_id = find_document_id(element[0])
  62. doc_weight = element[1]
  63. if @results.keys.include?(doc_id)
  64. @results[doc_id] = @results[doc_id].to_f + doc_weight.to_f
  65. else
  66. @results[doc_id] = doc_weight.to_f
  67. end
  68. @results
  69. end
  70.  
  71. def print_results(query)
  72. puts "#{'We found:'.green}#{@results.size} #{'pages for your query:'.green} #{query.join(" ")}"
  73. @results.sort_by {|key, value| value}.reverse.each do |elem|
  74. puts "#{'Title:'.green} #{elem[0]} #{'Weight:'.red} #{elem[1]} "
  75. end
  76. clear_results
  77. end
  78.  
  79. def clear_results
  80. @results = {}
  81. end
  82.  
  83. def find_document_id(num)
  84. @document_list[num.to_i]
  85. end
  86.  
  87. def load_inverted_index
  88.  
  89. start = Time.now
  90. f = File.open("../db/index/inverted_index.xml","r")
  91. puts "Time elapsed to load the xml file is #{Time.now - start} seconds"
  92.  
  93. start = Time.now
  94. doc = Nokogiri::XML.parse(f)
  95. puts "Time elapsed parse the xml file with nokogiri is #{Time.now - start} seconds"
  96.  
  97. my_table = doc.search('//lemma').map{ |e| Hash.from_xml(e.to_xml)['lemma']; }
  98.  
  99. dictionary = Hash.new{ |h, k| h[k] = [] }
  100. my_table.each do |value|
  101. if value["document"].is_a? Array
  102. value["document"].each do |element|
  103. doc_id = element["id"]
  104. weight = element["weight"]
  105. dictionary["#{value["name"]}"] << [doc_id, weight]
  106. end
  107. else
  108. doc_id = value["document"]["id"]
  109. weight = value["document"]["weight"]
  110. dictionary["#{value["name"]}"] << [doc_id, weight]
  111. end
  112. end
  113. dictionary
  114. end
  115.  
  116. end
  117.  
  118. Pythia.new.start
Add Comment
Please, Sign In to add comment