# Untitled

# Very (very) naive word segmentation algorithm for Chinese
# (or any language with similar characteristics; works at the
# character level).
class Partitioner
  # Hash whose keys are the known n-grams (every value is +true+).
  attr_reader :ngrams

  # +ngrams+    Enumerable list of known n-grams (Strings)
  # +lookahead+ maximum number of characters considered for a single
  #             token (expected to be an Integer); values below 1 are
  #             treated as 1 by #partition
  def initialize(ngrams, lookahead = 6)
    @lookahead = lookahead
    @ngrams = {}
    ngrams.each { |ng| @ngrams[ng] = true }
  end

  # Greedy longest-match segmentation. Goes from beginning to end, each
  # time taking the longest run of leading characters (at most
  # +lookahead+ of them) that is a known n-gram. A single character is
  # always accepted as a fallback, so unknown text degrades to
  # one-character tokens.
  #
  # +text+ String to segment; it is split with String#chars.
  #
  # Returns an Array of Strings which, joined together, give back +text+.
  def partition(text)
    chars = text.chars
    # A window smaller than 1 could never consume a character and would
    # loop forever, so fall back to single-character matching.
    max_len = [@lookahead, 1].max
    result = []
    pos = 0
    # Walk an index instead of re-slicing the remaining text after every
    # token, which would copy the tail each time.
    while pos < chars.size
      # Never look past the end of the text.
      len = [max_len, chars.size - pos].min
      len -= 1 until len == 1 || ngrams[chars[pos, len].join]
      result << chars[pos, len].join
      pos += len
    end
    result
  end
end