package jcrawler;

import java.io.*;
import java.net.*;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Stack;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * @author Jan Kuboschek
 *
 * Basic crawler class to easily and quickly interact with one website.
 * Override "doAction(String URL, String content)" to process the content further
 * (e.g. store it, parse it). A minimal example subclass is sketched at the end
 * of this file.
 *
 * + Does not attempt to fetch images.
 * + Identifies itself as JCrawler/1.0 http://kuboschek.de/jan
 * + Does not take robots.txt into consideration. Should/could be added in the future.
 *
 * The concept allows for multi-threading of crawlers. All class instances share
 * the processed and queued lists of links.
 *
 * Instead of keeping track of processed links and queued links within the object,
 * a JDBC connection could be established to store links in a database.
 *
 * Currently limited to one website at a time; however, it could be expanded upon
 * by adding an externalLinks stack and adding to it as appropriate.
 *
 * JCrawler is intended to be used to quickly generate XML sitemaps or parse websites
 * for your desired information. It's lightweight.
 */
public abstract class JCrawler extends Thread
{
    static private Stack<String> internalLinks;
    // Potentially, we could write a link class that contains the link and the time
    // the link was retrieved, and use that class as the LinkedList type.
    // Note: LinkedList is not synchronized; with several concurrent crawlers a
    // thread-safe collection (or the JDBC approach mentioned above) would be safer.
    static private LinkedList<String> processedLinks;
    static private String currentDomain;
    static private int numberObjects;
    static final private String identity = "JCrawler/1.0 http://kuboschek.de/jan";
    private final int connectionRetries;
    // volatile so that pause/resume calls from other threads are visible to the crawl loop
    private volatile boolean pause;
    /**************************************************
     * Construct a new JCrawler object. Requires an object constructed with a URL first.
     **************************************************/
    JCrawler()
    {
        if (internalLinks == null || currentDomain == null)
        {
            System.out.println("Error: First JCrawler object must be constructed with a URL.");
            System.exit(0);
        }
        connectionRetries = 3;
        numberObjects++;
        pause = false;
        crawl();
    }
    /**************************************************
     * Construct a new JCrawler object with a start URL.
     * If an object with a URL has already been constructed,
     * this behaves like the default constructor.
     **************************************************/
    JCrawler(String URL)
    {
        if (internalLinks == null)
        {
            internalLinks = new Stack<String>();
            processedLinks = new LinkedList<String>();
            currentDomain = getDomainFromURL(URL);
            numberObjects = 1;
        }
        else
            numberObjects++;
        connectionRetries = 3;
        internalLinks.add(URL);
        pause = false;
        crawl();
    }
    /**************************************************
     * Override to customize the crawler, e.g. to parse contents, build XML site maps, etc.
     *
     * Makes the current URL and the URL's content available to the method.
     **************************************************/
    abstract void doAction(String URL, String content);
    /**************************************************
     * Pauses the crawler.
     **************************************************/
    public void pauseCrawler() { pause = true; }

    /**************************************************
     * Resumes the crawler.
     **************************************************/
    public void resumeCrawler() { pause = false; }

    /**************************************************
     * Returns the list of processed links.
     **************************************************/
    public List<String> getProcessedLinks() { return processedLinks; }
    /**************************************************
     * Checks if the crawler is paused. If not, checks if the crawler has been idle for too long.
     * If not, fetches a link from the stack, reads its content, pushes new links onto the stack
     * and calls doAction (an abstract method to do something with the content).
     *
     * crawl() is final as it is the heart of the class' functionality. crawl() should not be
     * altered by any classes that extend JCrawler; override e.g. the exception methods instead
     * if it's really necessary.
     *************************************************/
    private final void crawl()
    {
        (new Thread() {
            @Override
            public void run() {
                int idleTime = 0;
                while (true)
                {
                    if (idleTime == 5)
                    {
                        System.out.println("Stopping thread. Queue has been empty for "+idleTime+" seconds.");
                        return;
                    }
                    else
                    {
                        if (!pause)
                        {
                            idleTime++;
                            if (internalLinks.size() > 0)
                            {
                                idleTime = 0;
                                String URL = internalLinks.pop();
                                String content = "";
                                if (!processedLinks.contains(URL))
                                {
                                    content = getWebsiteContent(URL);
                                    if (!content.equals(""))
                                    {
                                        doAction(URL, content);
                                        try {
                                            List<String> links = HTMLUtils.extractLinks(content);
                                            for (String link : links)
                                            {
                                                link = processLink(link, URL);
                                                if (!link.equals("") && getDomainFromURL(link).equalsIgnoreCase(currentDomain) && !internalLinks.contains(link))
                                                    internalLinks.push(link);
                                            }
                                        } catch (IOException ex) {
                                            Logger.getLogger(JCrawler.class.getName()).log(Level.SEVERE, null, ex);
                                        }
                                    }
                                    // Record the URL only once; pushing it unconditionally
                                    // would fill processedLinks with duplicates.
                                    processedLinks.push(URL);
                                }
                            }
                            else
                            {
                                try {
                                    Thread.sleep(1000);
                                } catch (InterruptedException ex) {
                                    Logger.getLogger(JCrawler.class.getName()).log(Level.SEVERE, null, ex);
                                }
                                if (numberObjects > 1)
                                    System.out.println("Queue currently empty...waiting...");
                                else
                                {
                                    printProcessedLinks();
                                    System.out.println("Queue empty...quitting.");
                                    return;
                                }
                            }
                        }
                        else
                        {
                            // Sleep while paused instead of busy-spinning on the console.
                            System.out.println("Paused...waiting to resume...");
                            try {
                                Thread.sleep(1000);
                            } catch (InterruptedException ex) {
                                Logger.getLogger(JCrawler.class.getName()).log(Level.SEVERE, null, ex);
                            }
                        }
                    }
                }
            }
        }).start();
    }
    /**************************************************
     * Fetches the content of URL and returns it; otherwise returns a blank
     * string and prints an appropriate error message.
     *
     * Like crawl(), getWebsiteContent() is a core method of the JCrawler class and should
     * not be altered. Therefore, it is final.
     **************************************************/
    private final String getWebsiteContent(String URL)
    {
        boolean noError = false;
        int i = 0;
        try
        {
            URL myURL = new URL(URL);
            HttpURLConnection.setFollowRedirects(false);
            while ( (i < connectionRetries) && (noError == false) )
            {
                noError = true;
                i++;
                try
                {
                    // Open a fresh connection per attempt and check the response code
                    // before reading. (The original opened the reader first and then
                    // reopened the connection, so it checked the wrong response.)
                    HttpURLConnection myConn = (HttpURLConnection)myURL.openConnection();
                    myConn.setInstanceFollowRedirects(false);
                    // Identify JCrawler as such; must be set before the request is sent.
                    myConn.setRequestProperty("User-Agent", identity);
                    // getContentType() may be null; treat that as non-image content.
                    String contentType = myConn.getContentType();
                    if ( (myConn.getResponseCode() == 200) && (contentType == null || !contentType.startsWith("image")) )
                    {
                        BufferedReader in = new BufferedReader(new InputStreamReader(myConn.getInputStream()));
                        // Use a StringBuilder; repeated string concatenation is quadratic.
                        StringBuilder buffer = new StringBuilder();
                        String inputLine;
                        while ((inputLine = in.readLine()) != null)
                        {
                            buffer.append(inputLine).append("\n");
                        }
                        in.close();
                        System.out.println("Opening "+URL+"... Server response "+myConn.getResponseCode()+". OK.");
                        return buffer.toString();
                    }
                    else
                    {
                        System.out.println("Opening "+URL+"... Server response "+myConn.getResponseCode()+", Content type "+contentType+". PAGE NOT INDEXED.");
                        return "";
                    }
                }
                catch (SocketTimeoutException e)
                {
                    System.out.println("Request timed out. Retrying ... "+i+" of "+connectionRetries+".");
                    Thread.sleep(2000);
                    noError = false;
                }
                catch (NoRouteToHostException e)
                {
                    System.out.println("Lost internet connection. Retrying ... "+i+" of "+connectionRetries+".");
                    Thread.sleep(5000);
                    noError = false;
                }
                catch (UnknownHostException e)
                {
                    System.out.println("Unknown host. Retrying ... "+i+" of "+connectionRetries+".");
                    Thread.sleep(2000);
                    noError = false;
                }
                catch (ConnectException e)
                {
                    System.out.println("Connection refused. Retrying ... "+i+" of "+connectionRetries+".");
                    Thread.sleep(2000);
                    noError = false;
                }
                catch (java.io.FileNotFoundException e)
                {
                    System.out.println("File not found. Skipping "+URL+" ...");
                    return "";
                }
            }
        }
        catch (MalformedURLException e)
        {
            System.out.println("Malformed URL...");
            noError = false;
        }
        catch (java.io.IOException e)
        {
            System.out.println("Bad input... ");
        }
        catch (InterruptedException e)
        {
            System.out.println("Interrupted exception... ");
        }
        System.out.println("Giving up. Skipping "+URL+" ...");
        return "";
    }
    /**************************************************
     * Returns the processed link, or a blank string if the link is null or matches
     * an exception. Also normalizes the link.
     *
     * Method should not be altered. Alter normalizeLink() or stripExceptions() instead.
     **************************************************/
    private final String processLink(String link, String URL)
    {
        if (link == null)
            return "";
        if (stripExceptions(link))
            return normalizeLink(link, URL);
        else
            return "";
    }
    /**************************************************
     * Returns the normalized link. A relative link is turned into an absolute link.
     **************************************************/
    public static String normalizeLink(String link, String URL)
    {
        link = link.replace("www.", "");
        if (link.endsWith("/"))
            link = link.substring(0, link.length()-1);
        if (!link.startsWith("http://"))
        {
            // Resolve "../" by stripping one path segment from the URL per occurrence.
            // (Note: the base is reduced to the site root below, so "../" links end up
            // resolved against the domain rather than the true parent directory.)
            while (link.startsWith("../"))
            {
                String[] temp = URL.split("/");
                String newURL = "";
                for (int i = 0; i < temp.length-2; i++)
                    newURL += temp[i]+"/";
                URL = newURL;
                link = link.substring(3);
            }
            URL = getDomainFromURL(URL); // get the base URL
            if (!link.startsWith("/"))
            {
                if (link.contains("."))
                    return URL+"/"+link;
                else
                    return URL+"/"+link+"/";
            }
            else
            {
                return URL+link;
            }
        }
        else
            return link;
    }
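    // Illustrative results of normalizeLink() (not part of the original paste;
    // assuming the site root is http://foo.com):
    //   normalizeLink("/bar", "http://foo.com/a")      -> "http://foo.com/bar"
    //   normalizeLink("page.html", "http://foo.com/a") -> "http://foo.com/page.html"
    //   normalizeLink("about", "http://foo.com")       -> "http://foo.com/about/"
    // Note that plain relative links are resolved against the site root rather than
    // the directory of the referring page.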
    /**************************************************
     * Returns false if the link matches an exception, otherwise returns true.
     *
     * An alternative solution would be to move the exceptions into a file to easily add
     * and remove exceptions to and from the list instead of hard-coding them.
     **************************************************/
    private boolean stripExceptions(String link)
    {
        if (link.contains("mailto:"))
            return false;
        if (link.contains("ads"))
            return false;
        if (link.equals("/"))
            return false;
        if (link.contains("#"))
            return false;
        if (link.contains("javascript:"))
            return false;
        return true;
    }
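    // Example filter decisions (illustrative, not part of the original paste):
    //   stripExceptions("mailto:jan@example.com") -> false (skipped)
    //   stripExceptions("#top")                   -> false (skipped)
    //   stripExceptions("/contact.html")          -> true  (crawled)
    // Note that the substring test on "ads" also rejects legitimate links such as
    // "/downloads/", so the hard-coded list is deliberately coarse.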
    /**************************************************
     * Returns the domain from a URL.
     * E.g. http://www.foo.com/bar => http://foo.com ("www." is stripped).
     **************************************************/
    public static String getDomainFromURL(String URL)
    {
        try
        {
            URL = URL.replace("www.", "");
            String[] partialURL = URL.split("/", 4);
            URL = partialURL[0]+"/"+partialURL[1]+"/"+partialURL[2];
        }
        catch (ArrayIndexOutOfBoundsException e)
        {
            System.out.println("Out of bounds exception in URL.");
        }
        return URL;
    }
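    // Worked example (illustrative): getDomainFromURL("http://www.foo.com/bar/baz")
    // first strips "www." and then keeps the first three "/"-separated parts:
    //   "http://foo.com/bar/baz" -> ["http:", "", "foo.com", "bar/baz"] -> "http://foo.com"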
    /**************************************************
     * Prints out the list of processed links.
     **************************************************/
    public void printProcessedLinks()
    {
        Iterator<String> x = processedLinks.iterator();
        System.out.print("Processed links: ");
        while (x.hasNext())
            System.out.print(x.next()+"|");
        System.out.println();
    }
}
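/**************************************************
 * Minimal usage sketch (not part of the original paste). It assumes the HTMLUtils
 * class in this package provides the extractLinks() method used by crawl() above;
 * "PrintCrawler" and the start URL are hypothetical names chosen for illustration.
 **************************************************/
class PrintCrawler extends JCrawler
{
    PrintCrawler(String URL) { super(URL); }

    @Override
    void doAction(String URL, String content)
    {
        // Do something useful with the fetched page; here we only report it.
        System.out.println("Fetched " + URL + " (" + content.length() + " characters)");
    }

    public static void main(String[] args)
    {
        // The first instance must be constructed with a URL; it starts crawling
        // immediately from its constructor.
        new PrintCrawler("http://example.com");
    }
}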