Guest User

Untitled

a guest
Apr 30th, 2018
237
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.03 KB | None | 0 0
  1. # Tagiter - Old School HTML Parser
  2. # Copyright (c) 2000 Ⴗnyasu <nyasu@osk.3web.ne.jp>
  3.  
  4. # = TagIterator (aka Tagiter)
  5. #
  6. # Quickly iterate through tagged markup documents like HTML and XML.
  7. # TagIterator is good for quick and dirty web scraping.
  8. #
  9. # This simple HTML/XHTML cascading parser may well be the first Ruby
  10. # HTML/XML scanner ever. It dates back to at least 2000. Of course there
  11. # are a number of much better options now, so it is no longer very useful.
  12. # But it's still worth remembering. It has been saved as a Gist for posterity.
  13. #
  14. # == Usage
  15. #
  16. # # sample html
  17. # stext = <<-EOF
  18. # <body> This is a test...
  19. # <sub> S1 </sub> <sub> S2 </sub>
  20. # <DL>
  21. # <DT> A1
  22. # <DT> A2
  23. # <DT> A3
  24. # </DL>
  25. # <DL>
  26. # <DT> B1
  27. # <DT> B2
  28. # <DT> B3
  29. # </DL>
  30. # <NEST>
  31. # <P ALIGN="R">TOP</P>
  32. # <NEST>
  33. # <P>SECOND</P>
  34. # <OL>
  35. # <LI>C1
  36. # <LI>C2
  37. # <LI>C3
  38. # <LI>C4
  39. # </OL>
  40. # </NEST>
  41. # <OL>
  42. # <LI>D1
  43. # <LI>D2
  44. # <LI>D3
  45. # <LI>D4
  46. # </OL>
  47. # </NEST>
  48. # </body>
  49. # EOF
  50. #
  51. # a = TagIterator.new(stext)
  52. # a.first("body") do |y|
  53. # y.nth("dl",2) do |dl|
  54. # dl.enumtag("dt") do |t|
  55. # puts t.text.strip
  56. # end
  57. # end
  58. # y.first("nest") do |n|
  59. # n.first("p") do |c|
  60. # print c.text, ' '
  61. # puts c.attributes.collect{ |k,v| "#{k}=#{v}" }
  62. # end.next("nest") do |m|
  63. # m.first("p") do |c|
  64. # puts c.text
  65. # end.next("ol") do |o|
  66. # o.enumtag("li") do |i| puts i.text.strip end
  67. # end
  68. # end.next("ol") do |o|
  69. # o.enumtag("li") do |i| puts i.text.strip end
  70. # end
  71. # end
  72. # end
  73. # a.each_block("sub") do |y|
  74. # puts y.text.strip
  75. # end
  76. #
  77. # _produces_
  78. #
  79. # B1
  80. # B2
  81. # B3
  82. # TOP align=R
  83. # SECOND
  84. # C1
  85. # C2
  86. # C3
  87. # C4
  88. # D1
  89. # D2
  90. # D3
  91. # D4
  92. # S1
  93. # S2
  94. #
  95.  
  # TagIterator scans a tagged-markup string (HTML/XML style) with regular
  # expressions and hands the region enclosed by a tag to a block as a new
  # TagIterator, so lookups can be nested ("cascaded").
  #
  # Tag names passed to the public methods are spliced into regular
  # expressions unescaped, so they act as regexp fragments, not literals.
  class TagIterator

    # One name=value pair inside an opening tag. The value is either
    # double-quoted, single-quoted, or a bare run of non-space characters;
    # exactly one of the three value captures is non-nil per match.
    ATTRIBUTE_PATTERN = /(\w+)=(?:"([^"]*)"|'([^']*)'|([^\s"']\S*))/

    # The markup this iterator covers (for a yielded block: the tag's content).
    attr_reader :text
    # Regexp option used for every tag search. An Integer is used as regexp
    # flags, nil/false means case-sensitive, any other value means ignore-case.
    attr_accessor :option
    # Tag name this iterator was created for (nil for a top-level iterator).
    attr_reader :tag
    # Hash of attribute name (lower-case) => value for the opening tag.
    attr_reader :attributes

    private

    # text::       the markup to scan; must be a String.
    # tag::        tag name this region belongs to, if any.
    # attributes:: attribute hash; it gains a case-insensitive #[] lookup.
    #
    # Raises RuntimeError when +text+ is not a String.
    def initialize(text, tag = nil, attributes = {})
      raise RuntimeError, "Only String accepted" unless text.is_a?(String)
      @text = text
      @option = "pi"
      @tag = tag
      @attributes = attributes
      # Keys are stored lower-case (see parse_attribute), so fold the
      # requested name the same way before looking it up.
      def @attributes.[](aname)
        super aname.downcase
      end
    end

    # Translates @option into Integer flags for Regexp.new. Older Rubies
    # treated any truthy non-Integer option (such as the default "pi") as
    # "ignore case"; Ruby 3.2+ rejects unknown option strings, so the legacy
    # meaning is reproduced here explicitly.
    def regexp_flags
      case @option
      when Integer then @option
      when nil, false then 0
      else Regexp::IGNORECASE
      end
    end

    # Index of the '<' that starts +element+ at or after +st+, or nil.
    # The name must be followed by whitespace or '>'.
    def find_element(element, st = 0)
      rex = Regexp.new('<(\s|\n)*' + element + '(\s|\n|>)', regexp_flags)
      @text.index(rex, st)
    end

    # Builds a Hash of lower-cased attribute names to values from the inside
    # of an opening tag. Surrounding quotes are removed; text inside a quoted
    # value is never mistaken for another attribute.
    def parse_attribute(attstr)
      attstr.scan(ATTRIBUTE_PATTERN).each_with_object({}) do |(name, dq, sq, bare), attrs|
        attrs[name.downcase] = dq || sq || bare
      end
    end

    # Finds the next opening +tag+ at or after +st+.
    # Returns [index just past its '>', text between '<' and '>'], or nil
    # when the tag is absent or is never terminated by '>'.
    def find_opentag(tag, st = 0)
      s = find_element(tag, st)
      return nil unless s

      r = @text.index('>', s)
      return nil unless r

      return r + 1, @text[s + 1..r - 1]
    end

    # Index of the last character before the tag that closes the element
    # whose content starts at +st+, or nil when there is no such closer.
    # Without +opentag+ the closer is "</tag>"; with +opentag+ the element
    # +tag+ itself acts as the closer of +opentag+ elements.
    def find_closetag(tag, st, opentag = nil)
      if opentag
        close_at = find_element(tag, st)
        open_end, = find_opentag(opentag, st)
      else
        close_at = find_element('/\s*' + tag, st)
        open_end, = find_opentag(tag, st)
      end
      return nil unless close_at

      # Another opening tag ends at or before the closer, so that closer
      # belongs to a nested element: skip the nested element, then look for
      # our own closer after it. (+2 steps past the nested closer's '<'.)
      if open_end && open_end <= close_at
        nested_end = find_closetag(tag, open_end, opentag)
        return nested_end && find_closetag(tag, nested_end + 2, opentag)
      end

      close_at - 1
    end

    # For unclosed list-style tags (<li>, <dt>): index of the last character
    # before the next +tag+ opening at or after +st+, or nil.
    def find_closeenumtag(tag, st = 0)
      s = find_element(tag, st)
      s -= 1 if s
      s
    end
    alias_method :find_openenumtag, :find_opentag

    # End index of the block whose content starts at +s+; -1 (meaning
    # "through the end of the text") when no closing tag exists.
    def block_end(tag, s, closetag)
      e = closetag ? find_closetag(closetag, s, tag) : find_closetag(tag, s)
      e || -1
    end

    # Index of the '>' that terminates the closing tag following +e+, or the
    # text length when the block ran to the end of the text.
    def resume_point(e)
      return @text.length if e < 0

      @text.index('>', e + 1) || @text.length
    end

    public

    # Yields the content of the +n+-th (1-based) non-overlapping +tag+ block
    # as a new iterator carrying the tag name and its attributes.
    # closetag:: optional element name that closes +tag+ instead of "</tag>".
    #
    # Returns an iterator over the text after that block (empty when nothing
    # follows). Raises RuntimeError when +n+ is nil or the tag is not found.
    def nth(tag, n, closetag = nil)
      raise RuntimeError, "nth: number not specified" unless n

      s = e = t = 0 # declared here so they outlive the loop block
      d = nil
      1.upto(n) do |i|
        s, d = find_opentag(tag, t)
        raise RuntimeError, "tag(#{tag}) not found at(#{i})" unless s

        e = block_end(tag, s, closetag)
        t = resume_point(e)
      end
      yield self.class.new(@text[s..e], tag, parse_attribute(d))
      self.class.new(@text[t + 1..-1] || "")
    end

    # Same as nth(tag, 1, ...). Also available as #next, which reads well
    # when chained on the remainder returned by a previous call.
    def first(tag, *arg, &block)
      nth(tag, 1, *arg, &block)
    end
    alias_method :next, :first

    # Yields every successive +tag+ block in document order.
    # Returns an iterator over the text after the last block.
    # Raises RuntimeError when the tag does not occur at all.
    def each_block(tag, closetag = nil)
      t = 0
      s, d = find_opentag(tag)
      raise RuntimeError, "tag(#{tag}) not found" unless s

      while s
        e = block_end(tag, s, closetag)
        yield self.class.new(@text[s..e], tag, parse_attribute(d))
        # After an unclosed block t is the text length, so the next search
        # finds nothing and the loop ends.
        t = resume_point(e)
        s, d = find_opentag(tag, t)
      end
      self.class.new(@text[t + 1..-1] || "")
    end

    # Array of the iterators each_block would yield for the same arguments.
    def collect(*arg)
      blocks = []
      each_block(*arg) { |blk| blocks.push blk }
      blocks
    end

    # Yields each item of a run of unclosed tags (<li>, <dt>, ...): an item
    # extends from its opening tag to the next opening +tag+ or to the end
    # of the text.
    def enumtag(tag)
      s, d = find_openenumtag(tag)
      while s
        e = find_closeenumtag(tag, s) || -1
        yield self.class.new(@text[s..e], tag, parse_attribute(d))
        s, d = find_openenumtag(tag, s)
      end
    end

    # Array of the iterators enumtag would yield.
    def enumcollect(tag)
      items = []
      enumtag(tag) { |item| items.push item }
      items
    end

    # Yields the receiver itself.
    def for_this
      yield self
    end

    # Returns the block nth would yield instead of yielding it.
    def get_nth(*arg)
      found = nil
      nth(*arg) { |blk| found = blk }
      found
    end

    # Returns the block first would yield instead of yielding it.
    def get_first(*arg)
      found = nil
      first(*arg) { |blk| found = blk }
      found
    end

    # True when an opening +tag+ occurs at or after +st+.
    def tagexist?(tag, st = 0)
      find_element(tag, st) ? true : false
    end

    # First token of the first complete "<...>" in the text, or nil when
    # there is no '<' or it is never closed by '>'.
    def tagnext
      s = @text.index("<")
      return nil unless s

      e = @text.index(">", s)
      return nil unless e

      @text[s..e].scan(/[^<>\s]+/)[0]
    end

    # Iterator over the text following the +n+-th +tag+ block.
    def nth_tailer(tag, n)
      nth(tag, n) { }
    end

  end
  262.  
  263. # Test
  264.  
  # Self-test: executed only when this file is run directly as a script.
  if __FILE__ == $PROGRAM_NAME

    require 'test/unit'

    # Walks a sample document with TagIterator and checks the collected
    # strings against the expected sequence.
    class TC_TagIterator < Test::Unit::TestCase

      # Sample markup: two <sub> blocks, two <DL> lists and nested <NEST>
      # sections containing <P> and <OL> elements.
      STEXT = <<-EOS
<body> This is a test...
<sub> S1 </sub> <sub> S2 </sub>
<DL>
<DT> A1
<DT> A2
<DT> A3
</DL>
<DL>
<DT> B1
<DT> B2
<DT> B3
</DL>
<NEST>
<P ALIGN="R">TOP</P>
<NEST>
<P>SECOND</P>
<OL>
<LI>C1
<LI>C2
<LI>C3
<LI>C4
</OL>
</NEST>
<OL>
<LI>D1
<LI>D2
<LI>D3
<LI>D4
</OL>
</NEST>
</body>
      EOS

      # Builds an iterator over STEXT, gathers text and attributes through
      # nth/first/next/enumtag/each_block, and compares the result.
      def test_all
        iterator = nil
        assert_nothing_raised { iterator = TagIterator.new(STEXT) }

        found = []
        assert_nothing_raised do
          iterator.first("body") do |body|
            # second <DL> list: B1..B3
            body.nth("dl", 2) do |list|
              list.enumtag("dt") { |term| found << term.text.strip }
            end

            body.first("nest") do |outer|
              after_p = outer.first("p") do |para|
                found << para.text
                found.concat(para.attributes.collect { |k, v| "#{k}=#{v}" })
              end

              # inner <NEST>: SECOND, then C1..C4
              after_inner = after_p.next("nest") do |inner|
                rest = inner.first("p") { |para| found << para.text }
                rest.next("ol") do |ordered|
                  ordered.enumtag("li") { |item| found << item.text.strip }
                end
              end

              # <OL> following the inner <NEST>: D1..D4
              after_inner.next("ol") do |ordered|
                ordered.enumtag("li") { |item| found << item.text.strip }
              end
            end
          end

          iterator.each_block("sub") { |sub| found << sub.text.strip }
        end

        expected = %w[B1 B2 B3 TOP align=R SECOND C1 C2 C3 C4 D1 D2 D3 D4 S1 S2]
        assert_equal(expected, found)
      end

    end

  end
Add Comment
Please, Sign In to add comment