Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/ruby
#encoding: UTF-8

# Use UTF-8 both for data read from files and for strings held in memory.
Encoding.default_internal = "UTF-8"
Encoding.default_external = "UTF-8"

# Supplies String#mb_chars, used below for multibyte-aware downcasing.
require 'active_support/core_ext'

# Build the word-cost table from dictionary.txt (one word per line).
#
# @count    - number of lines in the dictionary.
# @wordcost - maps each lowercased word to log((rank + 1) * log(@count)),
#             where rank is the word's 0-based line number, so words nearer
#             the top of the file are cheaper. When a word occurs more than
#             once, its first (cheapest) entry is kept.
#
# The line count is taken from the lines already read rather than from an
# external `wc -l` call, so the script needs no shell utility, reads the file
# only once, and still counts a final line that lacks a trailing newline.
dictionary = File.readlines("dictionary.txt")
@count = dictionary.size
@wordcost = {}
dictionary.each_with_index do |line, rank|
  word = line.chomp.mb_chars.downcase.to_s
  @wordcost[word] ||= Math.log((rank + 1) * Math.log(@count))
end
# Splits text written without spaces into words, choosing the segmentation
# with the lowest total word cost (dynamic programming over prefixes).
#
# Reads the top-level @wordcost table (word => cost) and @count. As a side
# effect, every single character of the input that is missing from @wordcost
# is added to it with the fallback cost log(@count * log(@count)), so each
# one-character word always has a cost.
#
# @param s [String] text to segment; one trailing line terminator is ignored
# @return [String] the characters of s (minus the line terminator) with a
#   single space between the inferred words
# @raise [RuntimeError] if the backward pass disagrees with the forward pass
def infer_spaces(s)
  sent = s.chomp
  # cost[i] is the cheapest cost of segmenting the first i characters.
  cost = [0]

  # Cheapest way to end a word just before position i: tries every word
  # length from 1 to i and returns [total cost, word length]. Substrings that
  # are not in @wordcost cost infinity. Pairs are compared as arrays, so a
  # tie on cost goes to the shorter word.
  best_match = lambda do |i|
    (1..i).map do |len|
      word = sent[(i - len)...i].mb_chars.downcase.to_s
      [cost[i - len] + @wordcost.fetch(word, Float::INFINITY), len]
    end.min
  end

  # Forward pass: fill in cost[1..sent.length].
  (1..sent.length).each do |i|
    # Give the one-character word ending at i a finite cost if it has none.
    @wordcost[sent[i - 1].mb_chars.downcase.to_s] ||= Math.log(@count * Math.log(@count))
    c, _len = best_match.call(i)
    cost << c
  end

  # Backward pass: recover the chosen words, starting from the end.
  out = []
  i = sent.length
  while i > 0
    c, k = best_match.call(i)
    raise "Something went wrong" if c != cost[i]
    out << sent[(i - k)...i]
    i -= k
  end
  out.reverse.join(" ")
end
# Classifies a character for the vertical output format.
#
# The patterns are unanchored, so for a longer string the first category (in
# the order listed) that matches any of its characters wins.
#
# @param string [String] text to classify
# @return [String] "P" for punctuation, "D" for a digit, "F" for an ASCII
#   Latin letter, "U" for any other uppercase letter, "R" for everything else
def char_type(string)
  case string
  when /[[:punct:]]/ then "P"
  when /[[:digit:]]/ then "D"
  # Two explicit ranges: a single A-z range would also cover the six ASCII
  # characters that sit between "Z" and "a" ([ \ ] ^ _ `).
  when /[A-Za-z]/ then "F"
  when /[[:upper:]]/ then "U"
  else "R"
  end
end
# Converts one sentence into "vertical" format: one output line per
# non-whitespace character of the original text.
#
# Word boundaries come from infer_spaces run on the lowercased sentence. The
# spaces it returns are inserted into the original-case text at the same
# indices (this assumes lowercasing keeps the number of characters
# unchanged), and the result is split on whitespace into words.
#
# Each output line has four tab-separated fields: the character, its
# lowercase form, its char_type code, and "position|word length" where
# position is the character's 0-based index within its word.
#
# @param s [String] the sentence; one trailing line terminator is ignored and
#   the argument itself is not modified
# @return [String] the output lines joined with "\n" (no trailing newline)
def test_to_vert(s)
  # Non-destructive chomp: leaves the caller's string alone and also works
  # when that string is frozen.
  sentence = s.chomp
  segmented = infer_spaces(sentence.mb_chars.downcase.to_s)

  # Copy each space of the segmented text into the original-case text.
  chars = sentence.split("")
  segmented.split("").each_with_index do |ch, idx|
    chars.insert(idx, " ") if ch == " "
  end

  rows = chars.join.split.flat_map do |word|
    word.split("").each_with_index.map do |letter, pos|
      [
        letter,
        letter.mb_chars.downcase.to_s,
        char_type(letter),
        "#{pos}|#{word.length}"
      ].join("\t")
    end
  end
  rows.join("\n")
end
# Convert test.txt line by line and write the result to test_ruby_vert.txt.
# An empty input line becomes a single newline; any other line becomes its
# vertical form followed by a blank line.
#
# The block form of File.open closes the output file even when test_to_vert
# raises part-way through, and File.foreach reads the input one line at a
# time instead of loading the whole file first.
File.open('test_ruby_vert.txt', 'w') do |file|
  File.foreach("test.txt") do |line|
    if line.chomp.empty?
      file.write("\n")
    else
      file.write(test_to_vert(line))
      file.write("\n\n")
    end
  end
end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement