Guest User

Screen Scraping with XQuery (eXist)

a guest
Feb 22nd, 2011
269
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
XML 3.56 KB | None | 0 0
  1. declare default element namespace "http://www.w3.org/1999/xhtml";
  2.  
  3. declare option exist:serialize "method=xhtml media-type=text/html omit-xml-declaration=yes indent=yes";
  4.  
  5. let $source-url := 'http://www.edrants.com/list-of-independent-alternatives-to-closed-borders-bookstores/'
  6. let $page := httpclient:get(xs:anyURI($source-url), false(), ())/httpclient:body/*
  7. let $locations := $page//div[@id eq 'post-16691']//b[starts-with(., 'Borders Bookstore #')]/parent::p
  8. return
  9.  
  10. <html>
  11. <head><title>Borders and Alternatives</title></head>
  12. <body>
  13.     <h1>Borders and Alternatives</h1>
  14.     <p>Extracted from <a href="{$source-url}">{$source-url}</a></p>
  15.     <table>
  16.     <thead><tr><th>Name</th><th>Address</th></tr></thead>
  17.     <tbody>{
  18.         for $location in $locations
  19.         (: first grab the Borders name and address :)
  20.         let $borders-name := $location/b[1]
  21.         let $borders-address := $borders-name/following-sibling::text()[1]
  22.         let $borders-entry := <tr><td>{$borders-name/text()}</td><td>{normalize-space($borders-address)}</td></tr>
  23.         (: then grab the alternative bookstores listed after each Borders in the list:)
  24.         let $alternative-entries :=
  25.             for $br in ($borders-address/following-sibling::br)[position() gt 1][not(name(following-sibling::node()[2]) = 'strike')]
  26.             return
  27.                 (: tests for text-only address entries, and checks to make sure the address contains numbers :)
  28.                 if ($br/following-sibling::node()[1] instance of text() and matches($br/following-sibling::node()[1], '\d')) then
  29.                     let $full-address := $br/following-sibling::node()[1]
  30.                     let $name := normalize-space(substring-before($full-address, ', '))
  31.                     let $address := normalize-space(substring-before(substring-after($full-address, ', '), ' ('))
  32.                     return <tr><td>{$name}</td><td>{$address}</td></tr>
  33.                 (: tests for miscoded HTML, where there is an open <a> tag and no closed </a> :)
  34.                 else if (name($br/following-sibling::node()[2]) = 'a' and $br/following-sibling::node()[2]/br) then
  35.                     let $address1 :=
  36.                         let $full-address := $br/following-sibling::node()[2]/text()[1]
  37.                         let $name := substring-before($full-address, ', ')
  38.                         let $address := substring-after($full-address, ', ')
  39.                         return <tr><td>{$name}</td><td>{substring-before($address, ' (')}</td></tr>
  40.                     let $address2 :=
  41.                         let $full-address := $br/following-sibling::node()[2]/text()[2]
  42.                         let $name := substring-before($full-address, ', ')
  43.                         let $address := substring-after($full-address, ', ')
  44.                         return <tr><td>{$name}</td><td>{substring-before($address, ' (')}</td></tr>
  45.                     return ($address1, $address2)
  46.                 (: tests for address entries that contain links to the store website :)
  47.                 else if (name($br/following-sibling::node()[2]) = 'a') then
  48.                     let $name := $br/following-sibling::node()[2]
  49.                     let $address := replace(normalize-space($name/following-sibling::text()[1]), '^,\s', '')
  50.                     return <tr><td>{$name/text()}</td><td>{substring-before($address, ' (')}</td></tr>
  51.                 (: do not return any lines that don't match the above patterns :)
  52.                 else ()
  53.         return ($borders-entry, $alternative-entries)
  54.     }</tbody>
  55.     </table>
  56. </body>
  57. </html>
Advertisement
Add Comment
Please, Sign In to add comment