#!/usr/bin/env ruby
#
# Data structures and processing of documents to be indexed, e.g. wikipedia
# pages. ok, everything is wikipedia-specific. :-)
#
# This file can be executed for various sorts of testing (see bottom)


require File.dirname(__FILE__)+'/common'

require 'ostruct'
require 'digest/md5'
require 'rubygems'
gem 'facets'
require 'xml/libxml'

require File.dirname(__FILE__)+'/../sentbreaker/sentbreaker'
require File.dirname(__FILE__)+'/semrep'

# $sentence_breaker = SentenceBreaker.new
$index_parser = Object.new
def $index_parser.method_missing(*args)
  raise "Error, no index parser was ever specified, please set $index_parser"
end

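# callers are expected to replace the stub with a real parser object; e.g. the
# command-line driver at the bottom of this file does
#   $index_parser = DRbObject.new(nil, opts[:parser])
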
  26. # "abstract" class
  27. class Page
  28. attr_accessor :title, :sections, :id
  29. attr_accessor :body
  30. end
  31.  
  32. class WikiPage < Page
  33. attr_accessor :xmldoc
  34. attr_accessor :categories_etc # just text
  35.  
  def initialize(pagestr)
    p = XML::Parser.new
    p.string = pagestr
    @xmldoc = p.parse
    parse_basics!
  end

  def break_sentences!
    sections.each { |sec| sec.break_sentences! }
  end

  # xml/wiki parse
  def parse_basics!
    parse_title!
    @id = Digest::MD5.hexdigest(title)

    # narrow down if appropriate
    coretext = rawtext
    @body = coretext
    if rawtext =~ / \[\[Category: /imx
      coretext = $~.pre_match
      @categories_etc = $~[0] + $~.post_match
    end

    # sections
    section_headers = []
    /^\s* = .*[^=].* = \s*$ /x.matches(coretext) do |match|
      section_headers << SectionHeader.create_from_match(match)
    end
    dummy_start = SectionHeader.new(:begin => 0, :end => -1, :title => "ARTICLE_START")
    section_headers.unshift(dummy_start)
    # ppy section_headers
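    # e.g. a line "== Early life ==" yields a header titled "Early life";
    # the dummy header makes any lead text before the first real header come
    # out as a section of its own, titled ARTICLE_START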

    # segment the text
    @sections = []
    section_headers.each_with_index do |sh, i|
      section_end = section_headers[i+1] ? (section_headers[i+1].begin - 1) : -1
      text = coretext[(sh.end+1) .. section_end] || ""
      @sections << Section.new(
        self,
        :text => text,
        :text_for_indexer => WikiPage.wiki_cleanup_smallscale(WikiPage.wiki_cleanup_largescale(text)),
        # :header => sh,
        :title => sh.title,
        :title_for_indexer => WikiPage.wiki_cleanup_smallscale(sh.title),
        :depth => (sh.orig_str[/=+/] || "").size  # depth = number of '='s; 0 for the dummy
      )
    end

    # normalize depths: 1 depth for the top siblings, same as ARTICLE_START
    @sections[0].depth = 1
    mindepth = @sections[1..-1].map{|s| s.depth}.min || 0
    @sections[1..-1].each{|s| s.depth = s.depth - mindepth + 1}
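    # e.g. if the shallowest real headers are "===" (depth 3), every real
    # section shifts down by 2, landing at depth 1 alongside ARTICLE_START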

    make_section_tree!
    dfs_walk(@section_root) do |path, sec|
      sec.title_path = ((path[1..-1] || []) + [sec]).map{|s| s.title}.join(" -> ")
      sec.title_path_for_indexer = WikiPage.wiki_cleanup_smallscale(sec.title_path)
    end
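    # (so e.g. a depth-2 "Origins" under a depth-1 "History" gets title_path
    # "History -> Origins"; SECTION_ROOT itself is dropped from the path)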

    # extract out non-core-textual sections?
    # non_core_section_headers = section_headers.select do |sh|
    #   sh.title =~ /^ (other websites | see also | references) $/ix
    # end
    # puts section_headers.map{|sh| sh.title}
  end

  def dfs_walk(node)
    _dfs_walk([], node) { |p,n| yield [p,n] }
  end

  def _dfs_walk(path, node)
    yield [path, node]
    node.children.each do |c|
      _dfs_walk(path+[node], c) { |p,n| yield [p,n] }
    end
  end

  def make_section_tree!
    @section_root = Section.new(self, :title => "SECTION_ROOT", :depth => 0)
    sections.each_with_index do |sec, i|
      sections_before = [@section_root] + (sections[0...i] || [])
      # the parent is the nearest preceding section that is strictly shallower
      parent = sections_before.reverse.find {|s| s.depth <= sec.depth - 1}
      raise "impossible to not find parent now" unless parent
      parent.children << sec
    end
    # could do better depth normalization here if we wanted
  end

  class Section
    attr_accessor :id, :text, :text_for_indexer, :title, :title_for_indexer
    attr_accessor :sentences
    attr_accessor :children, :depth, :title_path, :title_path_for_indexer

    def initialize(page, hash={})
      fill_attrs! hash
      @id = Digest::MD5.hexdigest(page.id + title)
      @sentences = []
      @children = []
    end

    def break_sentences!
      # return (@sentences = []) if $evil_global_dont_sentence_break

      text_first_clean = WikiPage.wiki_cleanup_largescale(text || "")
      text_first_clean = WikiPage.wiki_cleanup_smallscale(text_first_clean)  # super clean
      lines = (text_first_clean || "").split("\n")

      # if newlines can happen in the middle of sentences, we'd want to
      # strategically join back together lines right here. this seems to
      # happen sometimes but not often, so let's not worry about it.

      @sentences = lines.map do |line|
        # line = WikiPage.wiki_cleanup_largescale(line)
        $sentence_breaker.break(line).map do |sent_text|
          Sentence.new(sent_text, self)
        end
      end.flatten
    end

  end

  class SectionHeader
    attr_accessor :title, :begin, :end, :orig_str

    def initialize(hash={})
      @orig_str = ""
      @title = ""
      fill_attrs! hash
    end

    def self.create_from_match(match)
      sh = SectionHeader.new
      sh.instance_eval do
        @orig_str = match[0]
        raw_name = @orig_str[ /^\s*=+ ([^=] .* [^=] ) =+ \s*$/x, 1 ]
        if raw_name
          @title = raw_name.strip.gsub(/\s{2,99}/, ' ')
          # leave in [[ ]] markup...
        end
        @begin = match.begin(0)
        @end = match.end(0)
      end
      sh
    end
  end

  # this function should only do cleanups that are a prerequisite for the
  # sentence breaker.
  # therefore, don't do any small-scale cleanups that could be done
  # in-sentence, e.g. anchor text cleanup. we'd like to use those potentially...
  #
  def self.wiki_cleanup_largescale(s)
    # infoboxes are complex: {{ .. \n|key=val.. \n|key2=val2 .. \n}}\n
    s = s.gsub(/ \{\{ [^\n]* \n
                 (\| [^\n]* \n )+
                 \}\} /mx, '')
    # more tables, sloppier regex
    s = s.gsub(/^\s* \{\| \s* class="wikitable" .*? \n\|\} /mx, '')
    # really sloppy, yikes
    s = s.gsub(/^\s* \{\| .*? \n\|\} /mx, '')

    s = s.gsub(/<!-- .*? -->/mx, '')
    s = s.gsub(/ <ref [^>]* \/> /mx, '')
    s = s.gsub(/ <ref .*? <\/ref> /mx, '')
    s = s.gsub(/ \{\{ cite [^}]* \}\}/mx, '')
    # s = s.gsub(/ \{\{ [^|]* \| ([^}]*) \}\}/mx, '\1')
    s = s.gsub(/ \{\{ ([^}]*) \}\}/mx, '')
    s = s.gsub(/ < [^>]* > /x, '')  # arbitrary html or html-like tags
    # join together anchor texts that span multiple lines
    s = s.gsub(/ \[\[ [^\]]* \n [^\]]* \]\] /mx) { $~.to_s.gsub("\n", " ") }
  end
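
  # a rough illustration (hypothetical input) of what the large-scale pass
  # strips; comments, refs, and templates disappear entirely:
  #   WikiPage.wiki_cleanup_largescale("Foo<!-- note -->bar<ref>Smith 2001</ref> baz")
  #   # => "Foobar baz"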

  def self.wiki_cleanup_smallscale(s)
    s = s.gsub(/'''/, "").gsub(/''/, "")
    s = s.gsub(/ \[\[ ([^\|\[\]]*) \]\] /x, '\1')        # anchor text of wiki link
    s = s.gsub(/ \[\[ [^\]]* \| ([^\]]+) \]\] /x, '\1')  # anchor text of piped wiki link
    s = s.gsub(/ \[http:[^\s]+ \s* ([^\]]*) \] /x, '\1') # web links
    s = s.gsub(/ ^\*+ \s* /x, '')                        # a list
  end
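
  # e.g. (hypothetical input):
  #   WikiPage.wiki_cleanup_smallscale("'''[[Barack Obama|Obama]]''' won")
  #   # => "Obama won"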

  class Sentence
    attr_accessor :text, :text_for_parser, :semrep, :id

    def initialize(text, section)
      @text = text
      @text_for_parser = get_text_for_parser(text)
      @id = Digest::MD5.hexdigest(section.id + text)
    end

    # in a smarter world, save the output in a standoff-y way to exploit it for
    # name/coref resolution
    def get_text_for_parser(wiki_text)
      WikiPage.wiki_cleanup_smallscale(wiki_text)
    end

    def parse!(no_lex=false)
      # puts @text_for_parser
      triples = $index_parser.parse_dep @text_for_parser
      # e.g. a parser_name of "Minipar" would resolve to a MiniparSemRep
      # class (see semrep.rb)
      semrep_class = eval( $index_parser.parser_name + "SemRep" )
      @semrep = semrep_class.new(triples, no_lex)
    end
  end

  def parse_title!
    # each{break} is a grab-the-first-node idiom
    titleelt = @xmldoc.find('//title').each{|t| break t}
    return unless titleelt
    c = titleelt.child
    return unless c
    @title = c.content
  end

  def rawtext
    return @rawtext if @rawtext
    textelt = @xmldoc.find('//text').each{|t| break t}
    return (@rawtext = "") unless textelt
    textnode = textelt.child
    return (@rawtext = "") unless textnode
    @rawtext = textnode.content
  end

end
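
# a minimal usage sketch (assuming `pagestr` holds one <page>...</page> XML
# blob, as produced by WikiDump.yield_pages in the driver below):
#   page = WikiPage.new(pagestr)
#   page.break_sentences!
#   page.sections.each { |sec| puts sec.title_path }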


if __FILE__ == $0

  banner_msg = <<-EOS
testing: put the code to run per page on the cmdline
e.g. (./w here is "bzcat enwiki...xml.bz2" or "cat test/obama.xml")

examples ...

test page title extraction:
  ./w | lib/page.rb 'puts title'

test section extraction:
  ./w | lib/page.rb 'sections.each{|sec| puts "DOC: #\{self.title\} -- SEC: #\{sec.title\}"}'

test sentence extraction and cleanup:
  ./w | lib/page.rb -s 'puts "*** #\{text_for_parser\}"'

test sentence parsing/semrepping:
  ./w | lib/page.rb -s 'parse!'

view minipar triples per sentence:
  ./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.parse_triples'

view sempairs per sentence semrep:
  ./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.sempairs'

view sempairs per sentence semrep, without lex lookup:
  ./w | lib/page.rb -s 'parse! true; puts text_for_parser; puts semrep.sempairs'

options ...
  EOS

  STDOUT.sync = true
  require File.dirname(__FILE__)+'/wikidump'
  require 'trollop'
  require 'drb'  # for DRbObject below

  opts = Trollop::options do
    banner banner_msg
    opt :page, "run code per page", :default => true
    opt :sentence, "run code per sentence", :default => false
    opt :parser, "parser to use (as a drb uri)", :default => $miniq_conf['index_parser']
  end

  # replace the method_missing stub from the top of the file with a real
  # (remote) parser
  $index_parser = DRbObject.new(nil, opts[:parser])

  cmd = ARGV.join(" ")
  cmd = 'puts "*** #{text}\n=== #{text_for_parser}"' if cmd == "" && opts[:sentence]
  cmd = 'puts "-- #{title}"' if cmd == "" && opts[:page]

  # require 'unprof'

  if opts[:sentence]
    WikiDump.yield_pages $stdin do |page|
      page.break_sentences!
      page.sections.each do |sec|
        sec.sentences.each do |sent|
          sent.instance_eval { eval cmd }
        end
      end
    end
  elsif opts[:page]
    WikiDump.yield_pages $stdin do |page|
      page.instance_eval { eval cmd }
    end
  else
    Trollop::die "Illegal options"
  end

end