#!/usr/bin/env ruby
#
# Data structures and processing of documents to be indexed, e.g. wikipedia
# pages. ok, everything is wikipedia-specific. :-)
#
# This file can be executed for various sorts of testing (see bottom)
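#
# A quick illustrative run (this just mirrors the examples in the banner at the
# bottom of this file; test/obama.xml is the sample page mentioned there):
#   cat test/obama.xml | lib/page.rb 'puts title'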

require File.dirname(__FILE__)+'/common'
require 'ostruct'
require 'digest/md5'
require 'rubygems'
gem 'facets'
require 'xml/libxml'
require File.dirname(__FILE__)+'/../sentbreaker/sentbreaker'
require File.dirname(__FILE__)+'/semrep'

# $sentence_breaker = SentenceBreaker.new

$index_parser = Object.new
def $index_parser.method_missing(*args)
  raise "Error, no index parser was ever specified, please set $index_parser"
end
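# $index_parser is expected to respond to parse_dep and parser_name (see
# Sentence#parse! below).  When this file is run as a script it gets replaced
# with a DRb proxy, roughly:
#   $index_parser = DRbObject.new(nil, "druby://localhost:9000")  # uri is illustrative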

# "abstract" class
class Page
  attr_accessor :title, :sections, :id
  attr_accessor :body
end

class WikiPage < Page
  attr_accessor :xmldoc
  attr_accessor :categories_etc  # just text

  def initialize(pagestr)
    p = XML::Parser.new
    p.string = pagestr
    @xmldoc = p.parse
    parse_basics!
  end

  def break_sentences!
    sections.each { |sec| sec.break_sentences! }
  end

  # xml/wiki parse
  def parse_basics!
    parse_title!
    @id = Digest::MD5.hexdigest(title)

    # narrow down if appropriate
    coretext = rawtext
    @body = coretext
    if rawtext =~ / \[\[Category: /imx
      coretext = $~.pre_match
      @categories_etc = $~[0] + $~.post_match
    end
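    # e.g. (illustrative): for rawtext ending in "...early career.\n[[Category:Presidents]]\n",
    # coretext keeps everything before the first [[Category: tag, and
    # @categories_etc keeps the tag plus whatever follows it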

    # sections
    section_headers = []
    /^\s* = .*[^=].* = \s*$ /x.matches(coretext) do |match|
      section_headers << SectionHeader.create_from_match(match)
    end
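    # (a matching header line looks like "== Early life ==" or "=== Education ===";
    #  the [^=] in the middle keeps a bare run of "=" signs from counting as a
    #  header.  titles here are illustrative)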
    dummy_start = SectionHeader.new(:begin=>0, :end=>-1, :title => "ARTICLE_START")
    section_headers.unshift( dummy_start )
    # ppy section_headers

    # segment the text
    @sections = []
    section_headers.each_with_index do |sh, i|
      section_end = section_headers[i+1] ? (section_headers[i+1].begin - 1) : -1
      text = coretext[(sh.end+1) .. section_end] || ""
      @sections << Section.new(
        self,
        :text => text,
        :text_for_indexer => WikiPage.wiki_cleanup_smallscale(WikiPage.wiki_cleanup_largescale(text)),
        # :header => sh,
        :title => sh.title,
        :title_for_indexer => WikiPage.wiki_cleanup_smallscale(sh.title),
        :depth => (sh.orig_str[/=+/] || OpenStruct.new).size
      )
    end

    # normalize depths: 1 depth for the top siblings, same as ARTICLE_START
    @sections[0].depth = 1
    mindepth = @sections[1..-1].map{|s| s.depth}.min || 0
    @sections[1..-1].each{|s| s.depth = s.depth - mindepth + 1}
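    # (illustratively: if an article only ever uses "===" headers, raw depth 3,
    #  they all get shifted so the shallowest siblings sit at depth 1,
    #  alongside ARTICLE_START)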

    make_section_tree!
    dfs_walk(@section_root) do |path, sec|
      sec.title_path = ((path[1..-1] || []) + [sec]).map{|s| s.title}.join(" -> ")
      sec.title_path_for_indexer = WikiPage.wiki_cleanup_smallscale(sec.title_path)
    end
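    # (so, illustratively, a "=== Education ===" section nested under
    #  "== Early life ==" gets the title_path "Early life -> Education")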

    # extract out non-core-textual sections?
    # non_core_section_headers = section_headers.select do |sh|
    #   sh.title =~ /^ (other websites | see also | references) $/ix
    # end
    # puts section_headers.map{|sh| sh.title}
  end

  def dfs_walk(node)
    _dfs_walk([],node) { |p,n| yield [p,n] }
  end

  def _dfs_walk(path, node)
    yield [path,node]
    node.children.each do |c|
      _dfs_walk(path+[node], c) { |p,n| yield [p,n] }
    end
  end

  def make_section_tree!
    @section_root = Section.new(self, :title=>"SECTION_ROOT", :depth=>0)
    sections.each_with_index do |sec,i|
      sections_before = [@section_root] + (sections[0...i] || [])
      parent = sections_before.reverse.find {|s| s.depth <= sec.depth-1}
      if ! parent
        raise "impossible to not find parent now"
      end
      parent.children << sec
    end
    # could do better depth normalization here if we wanted
  end
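  # (the resulting tree looks like, illustratively:
  #    SECTION_ROOT         depth 0
  #      ARTICLE_START      depth 1
  #      Early life         depth 1
  #        Education        depth 2
  #  i.e. every section's parent is the nearest earlier section that is
  #  strictly shallower)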

  class Section
    attr_accessor :id, :text, :text_for_indexer, :title, :title_for_indexer
    attr_accessor :sentences
    attr_accessor :children, :depth, :title_path, :title_path_for_indexer

    def initialize(page, hash={})
      fill_attrs! hash
      @id = Digest::MD5.hexdigest(page.id + title)
      @sentences = []
      @children = []
    end

    def break_sentences!
      # return (@sentences = []) if $evil_global_dont_sentence_break
      text_first_clean = WikiPage.wiki_cleanup_largescale(text || "")
      text_first_clean = WikiPage.wiki_cleanup_smallscale(text_first_clean)  # super clean
      lines = (text_first_clean || "").split("\n")
      # if newlines can happen in the middle of sentences, we'd want to
      # strategically join back together lines right here.  this seems to
      # happen sometimes but not often, so let's not worry about it.
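      # a purely illustrative, commented-out sketch of what that joining might
      # look like -- glue a line onto the previous one when the previous line
      # doesn't end in sentence-final punctuation:
      #   lines = lines.inject([]) do |acc, line|
      #     if acc.last && acc.last !~ /[.!?]\s*$/
      #       acc[-1] = acc.last + " " + line
      #     else
      #       acc << line
      #     end
      #     acc
      #   end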
      @sentences = lines.map do |line|
        # line = WikiPage.wiki_cleanup_largescale(line)
        $sentence_breaker.break(line).map do |sent_text|
          Sentence.new(sent_text, self)
        end
      end.flatten
    end
  end

  class SectionHeader
    attr_accessor :title, :begin, :end, :orig_str

    def initialize(hash={})
      @orig_str = ""
      @title = ""
      fill_attrs! hash
    end

    def self.create_from_match(match)
      sh = SectionHeader.new
      sh.instance_eval do
        @orig_str = match[0]
        raw_name = @orig_str[ /^\s*=+ ([^=] .* [^=] ) =+ \s*$/x, 1]
        if raw_name
          @title = raw_name.strip.gsub(/\s{2,99}/, ' ')
          # leave in [[ ]] markup...
        end
        @begin = match.begin(0)
        @end = match.end(0)
      end
      sh
    end
  end

  # This function should only do cleanups that are prerequisite for the
  # sentence breaker.  Therefore, don't do any small-scale cleanups that could
  # just as well be done in-sentence, e.g. anchor text cleanup -- we'd
  # potentially like to use those later.
  def self.wiki_cleanup_largescale(s)
    # infoboxes are complex: {{ .. \n|key=val.. \n|key2=val2 .. \n}}\n
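    # e.g. (illustrative -- this is the shape the pattern below removes entirely):
    #   {{Infobox officeholder
    #   | name  = Some Name
    #   | order = 44th
    #   }}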
    s = s.gsub(/ \{\{ [^\n]* \n
                 (\| [^\n]* \n )+
                 \}\} /mx, '')
    # more tables, sloppier regex
    s = s.gsub(/^\s* \{\| \s* class="wikitable" .*? \n\|\} /mx, '')
    # really sloppy, yikes
    s = s.gsub(/^\s* \{\| .*? \n\|\} /mx, '')
    s = s.gsub(/<!-- .*? -->/mx, '')
    s = s.gsub(/ <ref [^>]* \/> /mx, '')
    s = s.gsub(/ <ref .*? <\/ref> /mx, '')
    s = s.gsub(/ \{\{ cite [^}]* \}\}/mx, '')
    # s = s.gsub(/ \{\{ [^|]* \| ([^}]*) \}\}/mx, '\1')
    s = s.gsub(/ \{\{ ([^}]*) \}\}/mx, '')
    s = s.gsub(/ < [^>]* > /x, '')  # arbitrary html or html-like tags
    # join together anchor texts that are spanning multiple lines
    s = s.gsub(/ \[\[ [^\]]* \n [^\]]* \]\] /mx) { $~.to_s.gsub("\n"," ") }
  end

  def self.wiki_cleanup_smallscale(s)
    s = s.gsub(/'''/,"").gsub(/''/,"")
    s = s.gsub(/ \[\[ ([^\|\[\]]*) \]\] /x, '\1')        # anchor text of wiki link
    s = s.gsub(/ \[\[ [^\]]* \| ([^\]]+) \]\] /x,'\1')   # anchor text of wiki link
    s = s.gsub(/ \[http:[^\s]+ \s* ([^\]]*) \] /x, '\1') # web links
    s = s.gsub(/ ^\*+ \s* /x, '')                        # a list
  end
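
  # (illustratively, wiki_cleanup_smallscale turns
  #  "[[Barack Obama|Obama]] visited ''Chicago''" into "Obama visited Chicago")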

  class Sentence
    attr_accessor :text, :text_for_parser, :semrep, :id

    def initialize(text, section)
      @text = text
      @text_for_parser = get_text_for_parser(text)
      @id = Digest::MD5.hexdigest(section.id + text)
    end

    # in a smarter world, save the output in a standoff-y way to exploit it for
    # name/coref resolution
    def get_text_for_parser(wiki_text)
      s = WikiPage.wiki_cleanup_smallscale(wiki_text)
    end

    def parse!(no_lex=false)
      # puts @text_for_parser
      triples = $index_parser.parse_dep @text_for_parser
      semrep_class = eval( $index_parser.parser_name + "SemRep" )
      @semrep = semrep_class.new(triples, no_lex)
    end
  end

  def parse_title!
    titleelt = @xmldoc.find('//title').each{|t| break t}
    return unless titleelt
    c = titleelt.child
    return unless c
    @title = c.content
  end

  def rawtext
    return @rawtext if @rawtext
    textelt = @xmldoc.find('//text').each{|t| break t}
    return (@rawtext = "") unless textelt
    textnode = textelt.child
    return (@rawtext = "") unless textnode
    @rawtext = textnode.content
  end
end

if __FILE__ == $0
  banner_msg = <<-EOS
testing: put the code per page on the cmdline
e.g. (./w here is "bzcat enwiki...xml.bz2" or "cat test/obama.xml")
examples ...
test page title extraction:
  ./w | lib/page.rb 'puts title'
test section extraction:
  ./w | lib/page.rb 'sections.each{|sec| puts "DOC: #\{self.title\} -- SEC: #\{sec.title\}"}'
test sentence extraction and cleanup:
  ./w | lib/page.rb -s 'puts "*** #\{text_for_parser\}"'
test sentence parsing/semrepping:
  ./w | lib/page.rb -s 'parse!'
view minipar triples per sentence:
  ./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.parse_triples'
view sempairs per sentence semrep:
  ./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.sempairs'
view sempairs per sentence semrep, without lex lookup:
  ./w | lib/page.rb -s 'parse! true; puts text_for_parser; puts semrep.sempairs'
options ...
  EOS

  STDOUT.sync = true
  require File.dirname(__FILE__)+'/wikidump'
  require 'trollop'
  require 'drb'  # for DRbObject below

  opts = Trollop::options do
    banner banner_msg
    opt :page, "run code per page", :default => true
    opt :sentence, "run code per sentence", :default => false
    opt :parser, "parser to use (as a drb uri)", :default => $miniq_conf['index_parser']
  end
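  # (an illustrative invocation, pointing at a specific parser service:
  #    ./w | lib/page.rb --parser druby://localhost:9000 -s 'parse!'
  #  where the drb uri is whatever the index parser is actually serving on)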

  $index_parser = DRbObject.new(nil, opts[:parser])

  cmd = ARGV.join(" ")
  cmd = 'puts "*** #{text}\n=== #{text_for_parser}"' if cmd == "" && opts[:sentence]
  cmd = 'puts "-- #{title}"' if cmd == "" && opts[:page]

  # require 'unprof'
  if opts[:sentence]
    WikiDump.yield_pages $stdin do |page|
      page.break_sentences!
      page.sections.each do |sec|
        sec.sentences.each do |sent|
          sent.instance_eval { eval cmd }
        end
      end
    end
  elsif opts[:page]
    WikiDump.yield_pages $stdin do |page|
      page.instance_eval { eval cmd }
    end
  else
    Trollop::die "Illegal options"
  end
end