Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Maps a free-form keyword onto the best candidate from a list of known terms.
#
# Matching is attempted in stages, from strict to fuzzy; the first stage that
# produces a candidate wins. Every result is a Hash of the form
# `{ :match => String-or-nil, :match_type => Symbol }` where `:match_type` is
# one of `:empty`, `:exact_match`, `:sub_match` or `:closest_match`.
module Normalizer
  # Synonym table used by {replace_synonyms}. Several entries are
  # bidirectional ('se' <-> 'special edition'); replace_synonyms keeps track of
  # what it already produced so that it does not translate a phrase straight
  # back. Multi-word keys are matched against pairs of adjacent words.
  @terms = {
    'SE' => 'special edition',
    'se' => 'special edition',
    'special edition' => 'se',
    'Special Edition' => 'se',
    'Limited Edition' => 'le',
    'limited edition' => 'le',
    'HD' => 'heavy duty',
    'WT' => 'work truck',
    'Work Truck' => 'wt',
    'ann' => 'anniversary',
    'editio n' => 'edition',
    'anniversary' => 'ann',
    'technology' => 'tech',
    'tech' => 'technology',
    'limited' => 'ltd',
    'ltd' => 'limited'
  }.freeze

  # Replaces known words / two-word phrases in +str+ with their synonym.
  #
  # Lookups are case-sensitive. A two-word phrase takes precedence over a
  # single-word lookup of its first word. The result is re-joined with single
  # spaces, so runs of whitespace in the input are collapsed.
  #
  # @param str [String, nil] text to rewrite
  # @return [String, nil] rewritten text; nil or '' is returned unchanged
  def self.replace_synonyms(str)
    return str if str.nil? || str == ''

    words = str.split(' ')
    rewritten = []
    produced = [] # replacements emitted so far
    index = 0
    # Index-based walk: a phrase hit consumes two input words, which cannot be
    # expressed with each_with_index without mutating the array mid-iteration.
    while index < words.length
      word = words[index]
      pair = "#{word} #{words[index + 1]}"
      if @terms.key?(pair) && !produced.include?(pair)
        # Skip the phrase when it is itself the output of an earlier
        # replacement, so a bidirectional entry is not reversed.
        rewritten << @terms[pair]
        produced << @terms[pair]
        index += 2
      elsif @terms.key?(word) && !produced.include?(@terms[word])
        rewritten << @terms[word]
        produced << @terms[word]
        index += 1
      else
        rewritten << word
        index += 1
      end
    end
    rewritten.join(' ')
  end

  # Finds the entry of +terms+ that best corresponds to +keyword+.
  #
  # Stages, in order:
  # 1. exact matches (after synonym replacement, verbatim, case-insensitive,
  #    ignoring '-' and '/') => :exact_match
  # 2. shortest term that contains the keyword => :sub_match
  # 3. multi-word terms whose first two words occur in the keyword, weighed
  #    against the term sharing the most words with the keyword
  #    => :sub_match (or :exact_match via post_process)
  # 4. the same two-word test with spaces removed from the keyword / the term
  #    => :sub_match
  # 5. longest term contained in the keyword => :sub_match
  # 6. term whose String#sum checksum is nearest to the keyword's
  #    => :closest_match when the difference is below 99, else :sub_match
  #
  # @param keyword [String, nil] user supplied text
  # @param terms [Array<String>, nil] candidate terms
  # @return [Hash] `{ :match => ..., :match_type => ... }`; `:match` is nil
  #   only for the :empty result (blank keyword or no terms)
  def self.normalize(keyword, terms)
    if keyword.nil? || keyword == '' || terms.nil? || terms.empty?
      return { :match => nil, :match_type => :empty }
    end

    downcased_keyword = keyword.downcase
    synonym_keyword = replace_synonyms(downcased_keyword)

    # --- Stage 1: exact matches -------------------------------------------
    exact = terms.find { |term| term.downcase == synonym_keyword }
    return { :match => exact, :match_type => :exact_match } if exact
    return { :match => keyword, :match_type => :exact_match } if terms.include?(keyword)

    exact = terms.find { |term| term.downcase == downcased_keyword }
    return { :match => exact, :match_type => :exact_match } if exact

    squashed_keyword = downcased_keyword.gsub('-', '').gsub('/', '')
    exact = terms.find { |term| squashed_keyword == term.gsub('/', '').gsub('-', '').downcase }
    return { :match => exact, :match_type => :exact_match } if exact

    # --- Stage 2: keyword is a substring of a term (shortest term wins) ----
    containing = terms.select { |term| term.downcase.include?(downcased_keyword) }
    return { :match => containing.min_by(&:size), :match_type => :sub_match } unless containing.empty?

    spaced_keyword = downcased_keyword.gsub('-', ' ')
    containing = terms.select { |term| term.downcase.include?(spaced_keyword) }
    return { :match => containing.min_by(&:size), :match_type => :sub_match } unless containing.empty?

    # --- Stage 3: first two words of a term both occur in the keyword ------
    keyword_words = synonym_keyword.split(' ')
    pair_match = terms.select { |term| term.include?(' ') && leading_words_in?(term, downcased_keyword) }
                      .max_by(&:length)
    if pair_match
      # Prefer the term sharing the most words with the keyword, but only
      # when it is longer than the two-word match.
      overlap_term = best_word_overlap(keyword_words, terms)
      chosen = overlap_term && overlap_term.length > pair_match.length ? overlap_term : pair_match
      return post_process(keyword, { :match => chosen, :match_type => :sub_match }, terms)
    end

    # --- Stage 4a: same test against the keyword with spaces removed -------
    keyword_without_spaces = downcased_keyword.gsub(' ', '')
    pair_match = terms.select { |term| leading_words_in?(term, keyword_without_spaces) }.max_by(&:length)
    return { :match => pair_match, :match_type => :sub_match } if pair_match

    # --- Stage 4b: first two keyword words occur in a space-less term ------
    first_word, second_word = downcased_keyword.split(' ').first(2).map { |word| word.gsub('-', '') }
    # A one-word keyword has no second word to test; skip the stage.
    if second_word
      candidates = terms.select do |term|
        next false unless term.include?(' ')

        squashed_term = term.gsub(' ', '').downcase
        squashed_term.include?(first_word) && squashed_term.include?(second_word)
      end
      squashed_match = candidates.max_by(&:length)
      return { :match => squashed_match, :match_type => :sub_match } if squashed_match
    end

    # --- Stage 5: term is a substring of the keyword (longest term wins) ---
    # First pass is case-sensitive but treats '-' as a space.
    dashless_keyword = keyword.gsub('-', ' ')
    contained = terms.select { |term| dashless_keyword.include?(term.gsub('-', ' ')) }
    unless contained.empty?
      longest = contained.max_by(&:size)
      overlap_term = best_word_overlap(keyword_words, terms)
      chosen = overlap_term && longest.length < overlap_term.length ? overlap_term : longest
      return { :match => chosen, :match_type => :sub_match }
    end

    contained = terms.select { |term| downcased_keyword.include?(term.downcase) }
    return { :match => contained.max_by(&:size), :match_type => :sub_match } unless contained.empty?

    # --- Stage 6: last resort, nearest String#sum checksum -----------------
    keyword_sum = keyword.sum
    nearest = terms.min_by { |term| (term.sum - keyword_sum).abs }
    match_type = (nearest.sum - keyword_sum).abs < 99 ? :closest_match : :sub_match
    { :match => nearest, :match_type => match_type }
  end

  # Upgrades +match+ to an :exact_match when the matched term is one of
  # +terms+ and its checksum (String#sum, downcased, '-' turned into a space)
  # equals the checksum of the downcased keyword with '-' removed.
  #
  # @param keyword [String] the keyword originally passed to normalize
  # @param match [Hash] candidate result (`:match`, `:match_type`)
  # @param terms [Array<String>] candidate terms
  # @return [Hash] an :exact_match result, or +match+ itself unchanged
  def self.post_process(keyword, match, terms)
    if terms.include?(match[:match]) &&
       keyword.downcase.gsub('-', '').sum == match[:match].downcase.gsub('-', ' ').sum
      { :match => match[:match], :match_type => :exact_match }
    else
      match
    end
  end

  # True when +term+ has at least two words and both of its first two words
  # (downcased, dashes removed) occur somewhere in +haystack+.
  def self.leading_words_in?(term, haystack)
    first, second = term.split(' ')
    return false if second.nil?

    haystack.include?(first.downcase.gsub('-', '')) && haystack.include?(second.downcase.gsub('-', ''))
  end

  # Returns the term (first one on ties) whose words, downcased with '-'
  # treated as a space, include the most entries of +keyword_words+;
  # nil when no term shares a word with the keyword.
  def self.best_word_overlap(keyword_words, terms)
    best_term = nil
    best_count = 0
    terms.each do |term|
      term_words = term.downcase.gsub('-', ' ').split(' ')
      shared = keyword_words.count { |word| term_words.include?(word) }
      if shared > best_count
        best_count = shared
        best_term = term
      end
    end
    best_term
  end

  private_class_method :leading_words_in?, :best_word_overlap
end
Add Comment
Please, Sign In to add comment