Advertisement
Guest User

Groovy Shell script - download all Lurk articles HTML

a guest
Feb 19th, 2012
29
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Groovy 2.09 KB | None | 0 0
  1. import groovy.grape.Grape                        
  2. import groovyx.net.http.*
  3. Grape.grab(group:'org.codehaus.groovy.modules.http-builder', module:'http-builder')
  4.  
  // HTTP client rooted at the wiki; it is handed to every getByRegex/getContent call below.
  // Deliberately left undeclared (no `def`) so it lives in the script binding.
  http = new HTTPBuilder("http://www.lurkmore.to/")
  6.  
  7.  
  /**
   * Requests tehUri with GET and scans the response body line by line,
   * adding capture group 1 of every regex match to collect.
   *
   * @param http    HTTPBuilder-style client used to issue the request
   * @param tehUri  URI to fetch
   * @param regex   pattern applied to each line; patterns without a capture
   *                group contribute nothing, because only group(1) is read
   * @param collect collection receiving the captured strings (mutated in place)
   */
  def getByRegex(http, tehUri, regex, collect) {
      println "Requesting: " + tehUri
      http.request(Method.GET, ContentType.TEXT) {
          uri = tehUri

          headers.'User-Agent' = 'Mozilla/5.0 Ubuntu/8.10 Firefox/3.0.4'
          headers.'Accept-Charset' = 'UTF-8'
          response.success = { resp ->
              // withReader decodes as UTF-8 and closes the stream when the closure
              // exits, even on an exception.
              resp.entity.content.withReader('UTF-8') { reader ->
                  // eachLine visits every line, including the first one, and stops
                  // at end of stream without ever handing a null line to the matcher.
                  reader.eachLine { line ->
                      def matcher = (line =~ regex)
                      while (matcher.find()) {
                          if (matcher.groupCount() > 0) {
                              collect.add(matcher.group(1))
                          }
                      }
                  }
              }
          }
      }
  }
  28.  
  /**
   * Requests tehUri with GET and returns the whole response body as a String.
   *
   * @param http   HTTPBuilder-style client used to issue the request
   * @param tehUri URI to fetch
   * @return the body text decoded as UTF-8 (the value produced by the success
   *         handler, which http.request is expected to pass back to the caller)
   */
  def getContent(http, tehUri) {
      println "Requesting: " + tehUri
      http.request(Method.GET, ContentType.TEXT) {
          uri = tehUri
          headers.'User-Agent' = 'Mozilla/5.0 Ubuntu/8.10 Firefox/3.0.4'
          headers.'Accept-Charset' = 'UTF-8'

          response.success = { resp ->
              // Decode explicitly as UTF-8, matching the Accept-Charset header sent
              // above; the bare `.text` property would use the platform default
              // charset and garble non-ASCII pages on non-UTF-8 systems.
              return resp.entity.content.getText('UTF-8')
          }
      }
  }
  39.  
  40.  
  // ---------------------------------------------------------------------------
  // Main script: discover the AllPages index pages, collect every article URL
  // they link to, then save each article's HTML under ./lurk/.
  // Top-level variables are intentionally left undeclared (no `def`) so they
  // stay in the script binding.
  // ---------------------------------------------------------------------------

  // Step 1: links to the index sub-pages listed on Special:AllPages
  // (the \u escapes spell the wiki's localized "Special" namespace name).
  indexesURLs = new HashSet<String>()
  getByRegex(http, "http://www.lurkmore.to/\u0421\u043b\u0443\u0436\u0435\u0431\u043d\u0430\u044f:AllPages", /<a\s+href="(\/index.php\?title\=[^&]+&amp;from=[^&]+&amp;to=[^\"]+)\"/, indexesURLs)

  // The set removed duplicates; switch to a list for indexed access.
  indexesURLs = new ArrayList<String>(indexesURLs)

  println "Found indexes URLs: " + indexesURLs.size()

  // Step 2: article links found on each index sub-page.
  // The captured hrefs are HTML-escaped, so &amp; is turned back into & first.
  pagesUrls = new HashSet<String>()
  indexesURLs.each {
      getByRegex(http, "http://www.lurkmore.to" + it.replace("&amp;", "&"), /<td\s+width="33%".*<a\s+href="(\/[^\"]+)\"\s+title=\"/, pagesUrls)
  }

  pagesUrls = new ArrayList<String>(pagesUrls)
  println "Found pages URLs: " + pagesUrls.size()

  // Step 3: download every article.
  new File("lurk").mkdir()

  // Position in pagesUrls to resume from (set to 0 for a full download).
  // NOTE(review): the list order comes from HashSet iteration, so the offset
  // is only meaningful if that order is the same as in the interrupted run.
  startIndex = 595

  // Progress counter; starts at the resume offset so the printed number is the
  // absolute position in pagesUrls and can be reused as the next startIndex.
  i = startIndex

  if (startIndex < pagesUrls.size()) {
      // subList's upper bound is exclusive, so this covers startIndex..size-1.
      pagesUrls.subList(startIndex, pagesUrls.size()).each { pageUrl ->
          try {
              println "URL: " + i++
              // File name: the decoded URL path with '/' and '\' replaced by '_'.
              def file = new File("lurk/" + java.net.URLDecoder.decode(pageUrl, "UTF-8").replaceAll("[\\/\\\\]", "_") + ".html")
              def content = getContent(http, "http://www.lurkmore.to" + pageUrl.replace("&amp;", "&"))
              // write() creates or truncates the file, so re-running the script
              // does not append a second copy; UTF-8 keeps non-ASCII text intact.
              file.write(content, "UTF-8")
          } catch (Exception e) {
              // Best effort: report the failure and carry on with the next page.
              e.printStackTrace()
          }
      }
  } else {
      println "Nothing to download: start index " + startIndex + " is beyond the " + pagesUrls.size() + " URLs found"
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement