#!/usr/bin/env ruby
#
# Data structures and processing of documents to be indexed, e.g. wikipedia
# pages. ok, everything is wikipedia-specific. :-)
#
# This file can be executed for various sorts of testing (see bottom)
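#
# A quick illustrative run (this just mirrors the examples in the banner at the
# bottom of this file; test/obama.xml is the sample page mentioned there):
#   cat test/obama.xml | lib/page.rb 'puts title'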

require File.dirname(__FILE__)+'/common'
require 'ostruct'
require 'digest/md5'
require 'rubygems'
gem 'facets'
require 'xml/libxml'
require File.dirname(__FILE__)+'/../sentbreaker/sentbreaker'
require File.dirname(__FILE__)+'/semrep'

# $sentence_breaker = SentenceBreaker.new

$index_parser = Object.new
def $index_parser.method_missing(*args)
  raise "Error, no index parser was ever specified, please set $index_parser"
end
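# $index_parser is expected to respond to parse_dep and parser_name (see
# Sentence#parse! below).  When this file is run as a script it gets replaced
# with a DRb proxy, roughly:
#   $index_parser = DRbObject.new(nil, "druby://localhost:9000")  # uri is illustrative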

# "abstract" class
class Page
  attr_accessor :title, :sections, :id
  attr_accessor :body
end

class WikiPage < Page
  attr_accessor :xmldoc
  attr_accessor :categories_etc  # just text

  def initialize(pagestr)
    p = XML::Parser.new
    p.string = pagestr
    @xmldoc = p.parse
    parse_basics!
  end

  def break_sentences!
    sections.each { |sec| sec.break_sentences! }
  end

  # xml/wiki parse
  def parse_basics!
    parse_title!
    @id = Digest::MD5.hexdigest(title)

    # narrow down if appropriate
    coretext = rawtext
    @body = coretext
    if rawtext =~ / \[\[Category: /imx
      coretext = $~.pre_match
      @categories_etc = $~[0] + $~.post_match
    end
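    # e.g. (illustrative): for rawtext ending in "...early career.\n[[Category:Presidents]]\n",
    # coretext keeps everything before the first [[Category: tag, and
    # @categories_etc keeps the tag plus whatever follows it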

    # sections
    section_headers = []
    /^\s* = .*[^=].* = \s*$ /x.matches(coretext) do |match|
      section_headers << SectionHeader.create_from_match(match)
    end
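    # (a matching header line looks like "== Early life ==" or "=== Education ===";
    #  the [^=] in the middle keeps a bare run of "=" signs from counting as a
    #  header.  titles here are illustrative)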
    dummy_start = SectionHeader.new(:begin=>0, :end=>-1, :title => "ARTICLE_START")
    section_headers.unshift( dummy_start )
    # ppy section_headers

    # segment the text
    @sections = []
    section_headers.each_with_index do |sh, i|
      section_end = section_headers[i+1] ? (section_headers[i+1].begin - 1) : -1
      text = coretext[(sh.end+1) .. section_end] || ""
      @sections << Section.new(
        self,
        :text => text,
        :text_for_indexer => WikiPage.wiki_cleanup_smallscale(WikiPage.wiki_cleanup_largescale(text)),
        # :header => sh,
        :title => sh.title,
        :title_for_indexer => WikiPage.wiki_cleanup_smallscale(sh.title),
        :depth => (sh.orig_str[/=+/] || OpenStruct.new).size
      )
    end

    # normalize depths: 1 depth for the top siblings, same as ARTICLE_START
    @sections[0].depth = 1
    mindepth = @sections[1..-1].map{|s| s.depth}.min || 0
    @sections[1..-1].each{|s| s.depth = s.depth - mindepth + 1}
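    # (illustratively: if an article only ever uses "===" headers, raw depth 3,
    #  they all get shifted so the shallowest siblings sit at depth 1,
    #  alongside ARTICLE_START)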

    make_section_tree!
    dfs_walk(@section_root) do |path, sec|
      sec.title_path = ((path[1..-1] || []) + [sec]).map{|s| s.title}.join(" -> ")
      sec.title_path_for_indexer = WikiPage.wiki_cleanup_smallscale(sec.title_path)
    end
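    # (so, illustratively, a "=== Education ===" section nested under
    #  "== Early life ==" gets the title_path "Early life -> Education")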

    # extract out non-core-textual sections?
    # non_core_section_headers = section_headers.select do |sh|
    #   sh.title =~ /^ (other websites | see also | references) $/ix
    # end
    # puts section_headers.map{|sh| sh.title}
  end

  def dfs_walk(node)
    _dfs_walk([],node) { |p,n| yield [p,n] }
  end

  def _dfs_walk(path, node)
    yield [path,node]
    node.children.each do |c|
      _dfs_walk(path+[node], c) { |p,n| yield [p,n] }
    end
  end

  def make_section_tree!
    @section_root = Section.new(self, :title=>"SECTION_ROOT", :depth=>0)
    sections.each_with_index do |sec,i|
      sections_before = [@section_root] + (sections[0...i] || [])
      parent = sections_before.reverse.find {|s| s.depth <= sec.depth-1}
      if ! parent
        raise "impossible to not find parent now"
      end
      parent.children << sec
    end
    # could do better depth normalization here if we wanted
  end
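  # (the resulting tree looks like, illustratively:
  #    SECTION_ROOT         depth 0
  #      ARTICLE_START      depth 1
  #      Early life         depth 1
  #        Education        depth 2
  #  i.e. every section's parent is the nearest earlier section that is
  #  strictly shallower)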

  class Section
    attr_accessor :id, :text, :text_for_indexer, :title, :title_for_indexer
    attr_accessor :sentences
    attr_accessor :children, :depth, :title_path, :title_path_for_indexer

    def initialize(page, hash={})
      fill_attrs! hash
      @id = Digest::MD5.hexdigest(page.id + title)
      @sentences = []
      @children = []
    end

    def break_sentences!
      # return (@sentences = []) if $evil_global_dont_sentence_break
      text_first_clean = WikiPage.wiki_cleanup_largescale(text || "")
      text_first_clean = WikiPage.wiki_cleanup_smallscale(text_first_clean)  # super clean
      lines = (text_first_clean || "").split("\n")
      # if newlines can happen in the middle of sentences, we'd want to
      # strategically join back together lines right here.  this seems to
      # happen sometimes but not often, so let's not worry about it.
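      # a purely illustrative, commented-out sketch of what that joining might
      # look like -- glue a line onto the previous one when the previous line
      # doesn't end in sentence-final punctuation:
      #   lines = lines.inject([]) do |acc, line|
      #     if acc.last && acc.last !~ /[.!?]\s*$/
      #       acc[-1] = acc.last + " " + line
      #     else
      #       acc << line
      #     end
      #     acc
      #   end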
      @sentences = lines.map do |line|
        # line = WikiPage.wiki_cleanup_largescale(line)
        $sentence_breaker.break(line).map do |sent_text|
          Sentence.new(sent_text, self)
        end
      end.flatten
    end
  end

  class SectionHeader
    attr_accessor :title, :begin, :end, :orig_str

    def initialize(hash={})
      @orig_str = ""
      @title = ""
      fill_attrs! hash
    end

    def self.create_from_match(match)
      sh = SectionHeader.new
      sh.instance_eval do
        @orig_str = match[0]
        raw_name = @orig_str[ /^\s*=+ ([^=] .* [^=] ) =+ \s*$/x, 1]
        if raw_name
          @title = raw_name.strip.gsub(/\s{2,99}/, ' ')
          # leave in [[ ]] markup...
        end
        @begin = match.begin(0)
        @end = match.end(0)
      end
      sh
    end
  end

  # This function should only do cleanups that are prerequisite for the
  # sentence breaker.  Therefore, don't do any small-scale cleanups that could
  # just as well be done in-sentence, e.g. anchor text cleanup -- we'd
  # potentially like to use those later.
  def self.wiki_cleanup_largescale(s)
    # infoboxes are complex: {{ .. \n|key=val.. \n|key2=val2 .. \n}}\n
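    # e.g. (illustrative -- this is the shape the pattern below removes entirely):
    #   {{Infobox officeholder
    #   | name  = Some Name
    #   | order = 44th
    #   }}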
    s = s.gsub(/ \{\{ [^\n]* \n
                 (\| [^\n]* \n )+
                 \}\} /mx, '')
    # more tables, sloppier regex
    s = s.gsub(/^\s* \{\| \s* class="wikitable" .*? \n\|\} /mx, '')
    # really sloppy, yikes
    s = s.gsub(/^\s* \{\| .*? \n\|\} /mx, '')
    s = s.gsub(/<!-- .*? -->/mx, '')
    s = s.gsub(/ <ref [^>]* \/> /mx, '')
    s = s.gsub(/ <ref .*? <\/ref> /mx, '')
    s = s.gsub(/ \{\{ cite [^}]* \}\}/mx, '')
    # s = s.gsub(/ \{\{ [^|]* \| ([^}]*) \}\}/mx, '\1')
    s = s.gsub(/ \{\{ ([^}]*) \}\}/mx, '')
    s = s.gsub(/ < [^>]* > /x, '')  # arbitrary html or html-like tags
    # join together anchor texts that are spanning multiple lines
    s = s.gsub(/ \[\[ [^\]]* \n [^\]]* \]\] /mx) { $~.to_s.gsub("\n"," ") }
  end

  def self.wiki_cleanup_smallscale(s)
    s = s.gsub(/'''/,"").gsub(/''/,"")
    s = s.gsub(/ \[\[ ([^\|\[\]]*) \]\] /x, '\1')        # anchor text of wiki link
    s = s.gsub(/ \[\[ [^\]]* \| ([^\]]+) \]\] /x,'\1')   # anchor text of wiki link
    s = s.gsub(/ \[http:[^\s]+ \s* ([^\]]*) \] /x, '\1') # web links
    s = s.gsub(/ ^\*+ \s* /x, '')                        # a list
  end
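
  # (illustratively, wiki_cleanup_smallscale turns
  #  "[[Barack Obama|Obama]] visited ''Chicago''" into "Obama visited Chicago")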

  class Sentence
    attr_accessor :text, :text_for_parser, :semrep, :id

    def initialize(text, section)
      @text = text
      @text_for_parser = get_text_for_parser(text)
      @id = Digest::MD5.hexdigest(section.id + text)
    end

    # in a smarter world, save the output in a standoff-y way to exploit it for
    # name/coref resolution
    def get_text_for_parser(wiki_text)
      s = WikiPage.wiki_cleanup_smallscale(wiki_text)
    end

    def parse!(no_lex=false)
      # puts @text_for_parser
      triples = $index_parser.parse_dep @text_for_parser
      semrep_class = eval( $index_parser.parser_name + "SemRep" )
      @semrep = semrep_class.new(triples, no_lex)
    end
  end

  def parse_title!
    titleelt = @xmldoc.find('//title').each{|t| break t}
    return unless titleelt
    c = titleelt.child
    return unless c
    @title = c.content
  end

  def rawtext
    return @rawtext if @rawtext
    textelt = @xmldoc.find('//text').each{|t| break t}
    return (@rawtext = "") unless textelt
    textnode = textelt.child
    return (@rawtext = "") unless textnode
    @rawtext = textnode.content
  end
end

if __FILE__ == $0
  banner_msg = <<-EOS
testing: put the code per page on the cmdline
e.g. (./w here is "bzcat enwiki...xml.bz2" or "cat test/obama.xml")
examples ...
test page title extraction:
  ./w | lib/page.rb 'puts title'
test section extraction:
  ./w | lib/page.rb 'sections.each{|sec| puts "DOC: #\{self.title\} -- SEC: #\{sec.title\}"}'
test sentence extraction and cleanup:
  ./w | lib/page.rb -s 'puts "*** #\{text_for_parser\}"'
test sentence parsing/semrepping:
  ./w | lib/page.rb -s 'parse!'
view minipar triples per sentence:
  ./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.parse_triples'
view sempairs per sentence semrep:
  ./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.sempairs'
view sempairs per sentence semrep, without lex lookup:
  ./w | lib/page.rb -s 'parse! true; puts text_for_parser; puts semrep.sempairs'
options ...
  EOS

  STDOUT.sync = true
  require File.dirname(__FILE__)+'/wikidump'
  require 'trollop'
  require 'drb'  # for DRbObject below

  opts = Trollop::options do
    banner banner_msg
    opt :page, "run code per page", :default => true
    opt :sentence, "run code per sentence", :default => false
    opt :parser, "parser to use (as a drb uri)", :default => $miniq_conf['index_parser']
  end
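  # (an illustrative invocation, pointing at a specific parser service:
  #    ./w | lib/page.rb --parser druby://localhost:9000 -s 'parse!'
  #  where the drb uri is whatever the index parser is actually serving on)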

  $index_parser = DRbObject.new(nil, opts[:parser])

  cmd = ARGV.join(" ")
  cmd = 'puts "*** #{text}\n=== #{text_for_parser}"' if cmd == "" && opts[:sentence]
  cmd = 'puts "-- #{title}"' if cmd == "" && opts[:page]

  # require 'unprof'
  if opts[:sentence]
    WikiDump.yield_pages $stdin do |page|
      page.break_sentences!
      page.sections.each do |sec|
        sec.sentences.each do |sent|
          sent.instance_eval { eval cmd }
        end
      end
    end
  elsif opts[:page]
    WikiDump.yield_pages $stdin do |page|
      page.instance_eval { eval cmd }
    end
  else
    Trollop::die "Illegal options"
  end
end