Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
(:
 : Scrapes a blog post listing Borders bookstore locations and the independent
 : bookstores suggested as alternatives to each one, and renders the result as
 : a two-column (Name, Address) XHTML table.
 :
 : NOTE(review): the exist: and httpclient: prefixes are used without a namespace
 : declaration or module import in this file; presumably the processor predeclares
 : them. Confirm against the target eXist-db version.
 :)
declare default element namespace "http://www.w3.org/1999/xhtml";
declare option exist:serialize "method=xhtml media-type=text/html omit-xml-declaration=yes indent=yes";

(:~
 : Normalizes whitespace in an address and drops a trailing parenthetical note.
 :
 : The note is only cut when it is actually present. A bare substring-before()
 : returns the empty string when the delimiter is missing, which would blank
 : out every address that has no parenthetical.
 :
 : @param $address raw address text, possibly empty
 : @return the whitespace-normalized address without the text from ' (' onward
 :)
declare function local:strip-note($address as xs:string?) as xs:string {
    let $clean := normalize-space($address)
    return
        if (contains($clean, ' (')) then
            substring-before($clean, ' (')
        else
            $clean
};

(:~
 : Builds one table row from a store name and a raw address.
 :
 : @param $name store name, emitted as given
 : @param $address raw address, cleaned with local:strip-note()
 : @return a tr element with a name cell and an address cell
 :)
declare function local:row($name as xs:string?, $address as xs:string?) as element() {
    <tr><td>{$name}</td><td>{local:strip-note($address)}</td></tr>
};

(:~
 : Builds one table row from a single "Name, Address" text entry.
 :
 : The text is split on the first ', ' after whitespace normalization, so the
 : split behaves the same way for every kind of entry. If there is no ', ' in
 : the text, both cells come out empty, as substring-before() and
 : substring-after() both return the empty string in that case.
 :
 : @param $raw the raw text of the entry, possibly empty
 : @return a tr element for the entry
 :)
declare function local:text-entry($raw as xs:string?) as element() {
    let $clean := normalize-space($raw)
    return local:row(substring-before($clean, ', '), substring-after($clean, ', '))
};

let $source-url := 'http://www.edrants.com/list-of-independent-alternatives-to-closed-borders-bookstores/'
let $page := httpclient:get(xs:anyURI($source-url), false(), ())/httpclient:body/*
(: each location is the paragraph whose bold heading starts with 'Borders Bookstore #' :)
let $locations := $page//div[@id eq 'post-16691']//b[starts-with(., 'Borders Bookstore #')]/parent::p
return
    <html>
        <head><title>Borders and Alternatives</title></head>
        <body>
            <h1>Borders and Alternatives</h1>
            <p>Extracted from <a href="{$source-url}">{$source-url}</a></p>
            <table>
                <thead><tr><th>Name</th><th>Address</th></tr></thead>
                <tbody>{
                    for $location in $locations
                    (: first grab the Borders name and address :)
                    let $borders-name := $location/b[1]
                    let $borders-address := $borders-name/following-sibling::text()[1]
                    let $borders-entry := <tr><td>{$borders-name/text()}</td><td>{normalize-space($borders-address)}</td></tr>
                    (: then grab the alternative bookstores listed after each Borders in the list.
                       The first br after the Borders address is skipped, and so is any br whose
                       second following sibling node is a strike element. :)
                    let $alternative-entries :=
                        for $br in ($borders-address/following-sibling::br)[position() gt 1][not(name(following-sibling::node()[2]) = 'strike')]
                        (: the two nodes right after the br decide which pattern the entry follows :)
                        let $first := $br/following-sibling::node()[1]
                        let $second := $br/following-sibling::node()[2]
                        return
                            (: text-only address entries; the address must contain a digit :)
                            if ($first instance of text() and matches($first, '\d')) then
                                local:text-entry($first)
                            (: miscoded HTML, where there is an open <a> tag and no closed </a>,
                               so the a element holds a br and two text entries :)
                            else if (name($second) = 'a' and $second/br) then
                                (local:text-entry($second/text()[1]), local:text-entry($second/text()[2]))
                            (: entries whose name is a link to the store website; the address is the
                               text after the link, minus its leading comma :)
                            else if (name($second) = 'a') then
                                let $address := replace(normalize-space($second/following-sibling::text()[1]), '^,\s', '')
                                return local:row(string-join($second/text(), ''), $address)
                            (: do not return any lines that don't match the above patterns :)
                            else ()
                    return ($borders-entry, $alternative-entries)
                }</tbody>
            </table>
        </body>
    </html>
Advertisement
Add Comment
Please, Sign In to add comment