Advertisement
Vilmar

infer_spaces_ruby

Mar 20th, 2014
132
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Ruby 2.42 KB | None | 0 0
  1. #!/usr/bin/ruby
  2. #encoding: UTF-8
  3. Encoding::default_internal = "UTF-8"
  4. Encoding::default_external = "UTF-8"
  5.  
  6. require 'active_support/core_ext'
  7.  
  # Build the word-cost table from dictionary.txt (one word per line).
  #
  # @count is the number of lines in the dictionary. It is taken from the
  # lines actually read rather than from an external `wc -l`, so the file is
  # read only once, no subprocess is needed, and a final line without a
  # trailing newline is still counted.
  #
  # @wordcost maps each lowercased word to
  # Math.log((rank + 1) * Math.log(@count)), where rank is the zero-based line
  # index, so words listed earlier get a lower cost. When a word appears more
  # than once, the cost of its first occurrence is kept.
  dictionary_words = File.readlines("dictionary.txt").map(&:chomp)

  @count = dictionary_words.size
  @wordcost = {}

  dictionary_words.each_with_index do |word, rank|
    @wordcost[word.mb_chars.downcase.to_s] ||= Math.log((rank + 1) * Math.log(@count))
  end
  19.  
  # Splits a run-together string into words by dynamic programming.
  #
  # For every prefix length i, the cheapest segmentation is the minimum, over
  # every possible last-word length k, of cost[i - k] plus the cost of that
  # last word in @wordcost. Substrings that are not keys of @wordcost cost
  # Float::INFINITY.
  #
  # Reads @wordcost (lowercased word => cost) and @count, both set up at file
  # level. Side effect: every single character of the input that is not yet a
  # key of @wordcost is added with cost Math.log(@count * Math.log(@count)),
  # so a one-character word is always available as a fallback.
  #
  # Case folding relies on String#mb_chars from ActiveSupport.
  #
  # @param s [String] text to segment; one trailing line terminator is dropped
  # @return [String] the characters of s with a single space between the
  #   chosen words ("" for empty input)
  def infer_spaces(s)
    sentence = s.chomp

    # cost[i]     - cheapest total cost of segmenting sentence[0...i]
    # last_len[i] - length of the final word in that cheapest segmentation
    cost = [0]
    last_len = [0]

    (1..sentence.length).each do |i|
      single = sentence[i - 1].mb_chars.downcase.to_s
      # Evaluated lazily so the fallback cost is only computed when needed.
      @wordcost[single] ||= Math.log(@count * Math.log(@count))

      best_cost, best_len = cheapest_last_word(sentence, cost, i)
      cost << best_cost
      last_len << best_len
    end

    # Walk back from the end, peeling off the recorded last word each time.
    # The lengths were fixed during the forward pass, so nothing has to be
    # recomputed here.
    words = []
    i = sentence.length
    while i > 0
      k = last_len[i]
      words << sentence[i - k...i]
      i -= k
    end

    words.reverse.join(" ")
  end

  # Finds the cheapest way to end a segmentation of sentence[0...i].
  #
  # @param sentence [String] the text being segmented
  # @param cost [Array<Numeric>] cost[j] must be filled in for every j < i
  # @param i [Integer] prefix length, at least 1
  # @return [Array(Numeric, Integer)] the minimal total cost and the length of
  #   the last word that achieves it (the shorter word wins a cost tie)
  def cheapest_last_word(sentence, cost, i)
    candidates = (1..i).map do |len|
      word = sentence[i - len...i].mb_chars.downcase.to_s
      [cost[i - len] + @wordcost.fetch(word, Float::INFINITY), len]
    end
    candidates.min
  end
  59.  
  # Classifies a string (in this file, a single character) with a one-letter tag.
  #
  # The classes are tried in the order below and the first match wins. The
  # patterns are unanchored, so for a longer string the first class that
  # matches ANY of its characters is returned.
  #
  #   "P" - contains a punctuation character
  #   "D" - contains a digit
  #   "F" - contains an ASCII Latin letter (A-Z or a-z)
  #   "U" - contains an uppercase letter (in practice a non-ASCII one, because
  #         ASCII uppercase letters are already caught by "F")
  #   "R" - anything else
  #
  # The Latin-letter class is spelled /[A-Za-z]/: the range A-z would also
  # cover the ASCII characters that sit between "Z" and "a"
  # ([ \ ] ^ _ and the backtick).
  #
  # @param string [String] text to classify
  # @return [String] one of "P", "D", "F", "U", "R"
  def char_type(string)
    case string
    when /[[:punct:]]/ then "P"
    when /[[:digit:]]/ then "D"
    when /[A-Za-z]/    then "F"
    when /[[:upper:]]/ then "U"
    else "R"
    end
  end
  74.  
  # Converts one line of text into a "vertical" listing, one character per row.
  #
  # The line is lowercased and segmented with infer_spaces; a space is then
  # inserted into the original (case-preserved) text at every index where the
  # segmented text has one, and the result is split on whitespace into words.
  #
  # Each character of each word becomes one tab-separated row:
  #
  #   <char> TAB <lowercased char> TAB <char_type tag> TAB <index in word>|<word length>
  #
  # The argument is not modified, so frozen strings are accepted.
  #
  # Case folding relies on String#mb_chars from ActiveSupport.
  #
  # @param s [String] input line; one trailing line terminator is ignored
  # @return [String] the rows joined with "\n" ("" when there are no words)
  def test_to_vert(s)
    sentence = s.chomp
    segmented = infer_spaces(sentence.mb_chars.downcase.to_s)

    # Indices come from the segmented string, which already contains the
    # earlier spaces, so inserting left to right keeps positions aligned.
    chars = sentence.chars
    segmented.each_char.with_index do |ch, idx|
      chars.insert(idx, " ") if ch == " "
    end

    rows = chars.join.split.flat_map do |word|
      word.each_char.with_index.map do |letter, pos|
        lowered = letter.mb_chars.downcase.to_s
        "#{letter}\t#{lowered}\t#{char_type(letter)}\t#{pos}|#{word.length}"
      end
    end

    rows.join("\n")
  end
  106.  
  # Convert test.txt line by line into test_ruby_vert.txt.
  #
  # An empty input line is written as a single newline; every other line is
  # written as its test_to_vert listing followed by a blank line.
  #
  # The block form of File.open closes the output file even when
  # test_to_vert raises, and File.foreach reads the input one line at a time.
  File.open('test_ruby_vert.txt', 'w') do |file|
    File.foreach("test.txt") do |line|
      if line.chomp.empty?
        file.write("\n")
      else
        file.write(test_to_vert(line))
        file.write("\n\n")
      end
    end
  end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement