Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import groovy.grape.Grape
- import groovyx.net.http.*
// Ask Grape to resolve the http-builder module; this runs before the client is constructed.
Grape.grab(group: 'org.codehaus.groovy.modules.http-builder', module: 'http-builder')

// HTTPBuilder client passed to every request helper in this script.
http = new HTTPBuilder('http://www.lurkmore.to/')
/**
 * Issues a GET request for {@code tehUri} and scans the response body line by line,
 * adding the first capture group of every match of {@code regex} to {@code collect}.
 *
 * @param http    HTTPBuilder client used to issue the request
 * @param tehUri  absolute URI to request
 * @param regex   pattern applied to each line of the body; a match only contributes a
 *                value when the pattern defines at least one capture group
 * @param collect collection that receives the captured strings (mutated in place)
 */
def getByRegex(http, tehUri, regex, collect) {
    println "Requesting: "+tehUri
    http.request(Method.GET, ContentType.TEXT) {
        uri = tehUri
        headers.'User-Agent' = 'Mozilla/5.0 Ubuntu/8.10 Firefox/3.0.4'
        headers.'Accept-Charset' = 'UTF-8'
        response.success = { resp ->
            // withReader decodes the body as UTF-8 and closes the stream when done.
            resp.entity.content.withReader('UTF-8') { reader ->
                // eachLine visits every line exactly once, including the first one,
                // and never hands a null line to the matcher.
                reader.eachLine { line ->
                    def matcher = (line =~ regex)
                    while (matcher.find()) {
                        if (matcher.groupCount() > 0) {
                            collect.add(matcher.group(1))
                        }
                    }
                }
            }
        }
    }
}
/**
 * Issues a GET request for {@code tehUri} and yields the response body as text.
 *
 * The value comes from the success handler; per HTTPBuilder's documented behaviour,
 * {@code request} hands the handler's result back to the caller.
 *
 * @param http   HTTPBuilder client used to issue the request
 * @param tehUri absolute URI to request
 * @return the response body decoded as UTF-8
 */
def getContent(http, tehUri) {
    println "Requesting: "+tehUri;
    http.request(Method.GET, ContentType.TEXT) {
        uri = tehUri
        headers.'User-Agent' = 'Mozilla/5.0 Ubuntu/8.10 Firefox/3.0.4'
        headers.'Accept-Charset' = 'UTF-8'
        response.success = { resp ->
            // Decode explicitly as UTF-8 (the charset requested above) instead of the
            // platform default, which would garble non-ASCII text on other locales.
            return resp.entity.content.getText('UTF-8')
        }
    }
}
// Stage 1: collect the links to the paginated "AllPages" index listings.
// A set is used while crawling so duplicate links are stored only once.
indexesURLs = new HashSet<String>()
// The '&' separators are presumably HTML-escaped as '&amp;' in the page markup
// (TODO confirm against a live response), so the pattern accepts both spellings.
getByRegex(http, "http://www.lurkmore.to/\u0421\u043b\u0443\u0436\u0435\u0431\u043d\u0430\u044f:AllPages", /<a\s+href="(\/index.php\?title\=[^&]+&(?:amp;)?from=[^&]+&(?:amp;)?to=[^\"]+)\"/, indexesURLs)
indexesURLs = new java.util.ArrayList<String>(indexesURLs)
println "Found indexes URLs: "+indexesURLs.size()

// Stage 2: visit every index listing and collect the article links it contains.
pagesUrls = new HashSet<String>()
indexesURLs.each { indexUrl ->
    // Un-escape '&amp;' so the query string is valid when requested; the original
    // replace("&","&") was a no-op.
    getByRegex(http, "http://www.lurkmore.to"+indexUrl.replace("&amp;", "&"), /<td\s+width="33%".*<a\s+href="(\/[^\"]+)\"\s+title=\"/, pagesUrls)
}
pagesUrls = new java.util.ArrayList<String>(pagesUrls)
println "Found pages URLs: "+pagesUrls.size()
// Stage 3: download every collected page into the "lurk" directory.
new File("lurk").mkdir()

// Index of the first page to fetch. 595 is carried over from the original script
// (presumably to resume an interrupted run — TODO confirm; use 0 for a full crawl).
def startIndex = 595
def totalPages = pagesUrls.size()
// Clamp the start so a list shorter than startIndex yields an empty slice
// instead of throwing.
def remainingUrls = pagesUrls.subList(Math.min(startIndex, totalPages), totalPages)

remainingUrls.eachWithIndex { pageUrl, counter ->
    try {
        println "URL: "+counter
        // Build a flat file name: decode the URL and replace path separators.
        def fileName = java.net.URLDecoder.decode(pageUrl, "UTF-8").replaceAll("[\\/\\\\]","_")
        def file = new File("lurk/"+fileName+".html")
        // Un-escape '&amp;' before requesting; the original replace("&", "&") was a no-op.
        def content = getContent(http, "http://www.lurkmore.to"+pageUrl.replace("&amp;", "&"))
        // write() creates or truncates the file, so re-running the script does not
        // append a second copy of the page; UTF-8 is used rather than the platform charset.
        file.write(content, "UTF-8")
    } catch(Exception e) {
        // Best effort: report the failing page and continue with the next one.
        System.err.println "Failed to download: "+pageUrl
        e.printStackTrace()
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement