package jcrawler;

import java.io.*;
import java.net.*;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Stack;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * @author Jan Kuboschek
 *
 * Basic crawler class to easily and quickly interact with one website.
 * Override "doAction(String URL, String content)" to process the content further
 * (e.g. store it, parse it). A minimal example subclass is sketched at the end
 * of this file.
 *
 * + Does not attempt to fetch images.
 * + Identifies itself as JCrawler/1.0 http://kuboschek.de/jan
 * + Does not take robots.txt into consideration. Should/could be added in the future.
 *
 * The concept allows for multi-threading of crawlers. All class instances share
 * the processed and queued lists of links.
 *
 * Instead of keeping track of processed links and queued links within the object,
 * a JDBC connection could be established to store links in a database.
 *
 * Currently limited to one website at a time; however, it could be expanded upon
 * by adding an externalLinks stack and adding to it as appropriate.
 *
 * JCrawler is intended to be used to quickly generate XML sitemaps or parse websites
 * for your desired information. It's lightweight.
 */
public abstract class JCrawler extends Thread
{
    static private Stack<String> internalLinks;
    // Potentially, we could write a link class that contains the link and the time
    // the link was retrieved, and use that class as the LinkedList type.
    // Note: LinkedList is not synchronized; with several concurrent crawlers a
    // thread-safe collection (or the JDBC approach mentioned above) would be safer.
    static private LinkedList<String> processedLinks;
    static private String currentDomain;
    static private int numberObjects;
    static final private String identity = "JCrawler/1.0 http://kuboschek.de/jan";
    private final int connectionRetries;
    // volatile so that pause/resume calls from other threads are visible to the crawl loop
    private volatile boolean pause;
    /**************************************************
     * Construct a new JCrawler object. Requires an object constructed with a URL first.
     **************************************************/
    JCrawler()
    {
        if (internalLinks == null || currentDomain == null)
        {
            System.out.println("Error: First JCrawler object must be constructed with a URL.");
            System.exit(0);
        }
        connectionRetries = 3;
        numberObjects++;
        pause = false;
        crawl();
    }
    /**************************************************
     * Construct a new JCrawler object with a start URL.
     * If an object with a URL has already been constructed,
     * this behaves like the default constructor.
     **************************************************/
    JCrawler(String URL)
    {
        if (internalLinks == null)
        {
            internalLinks = new Stack<String>();
            processedLinks = new LinkedList<String>();
            currentDomain = getDomainFromURL(URL);
            numberObjects = 1;
        }
        else
            numberObjects++;
        connectionRetries = 3;
        internalLinks.add(URL);
        pause = false;
        crawl();
    }
    /**************************************************
     * Override to customize the crawler, e.g. to parse contents, build XML site maps, etc.
     *
     * Makes the current URL and the URL's content available to the method.
     **************************************************/
    abstract void doAction(String URL, String content);
    /**************************************************
     * Pauses the crawler.
     **************************************************/
    public void pauseCrawler() { pause = true; }

    /**************************************************
     * Resumes the crawler.
     **************************************************/
    public void resumeCrawler() { pause = false; }

    /**************************************************
     * Returns the list of processed links.
     **************************************************/
    public List<String> getProcessedLinks() { return processedLinks; }
    /**************************************************
     * Checks if the crawler is paused. If not, checks if the crawler has been idle for too long.
     * If not, fetches a link from the stack, reads its content, pushes new links onto the stack
     * and calls doAction (an abstract method to do something with the content).
     *
     * crawl() is final as it is the heart of the class' functionality. crawl() should not be
     * altered by any classes that extend JCrawler; override e.g. the exception methods instead
     * if it's really necessary.
     *************************************************/
    private final void crawl()
    {
        (new Thread() {
            @Override
            public void run() {
                int idleTime = 0;
                while (true)
                {
                    if (idleTime == 5)
                    {
                        System.out.println("Stopping thread. Queue has been empty for "+idleTime+" seconds.");
                        return;
                    }
                    else
                    {
                        if (!pause)
                        {
                            idleTime++;
                            if (internalLinks.size() > 0)
                            {
                                idleTime = 0;
                                String URL = internalLinks.pop();
                                String content = "";
                                if (!processedLinks.contains(URL))
                                {
                                    content = getWebsiteContent(URL);
                                    if (!content.equals(""))
                                    {
                                        doAction(URL, content);
                                        try {
                                            List<String> links = HTMLUtils.extractLinks(content);
                                            for (String link : links)
                                            {
                                                link = processLink(link, URL);
                                                if (!link.equals("") && getDomainFromURL(link).equalsIgnoreCase(currentDomain) && !internalLinks.contains(link))
                                                    internalLinks.push(link);
                                            }
                                        } catch (IOException ex) {
                                            Logger.getLogger(JCrawler.class.getName()).log(Level.SEVERE, null, ex);
                                        }
                                    }
                                    // Record the URL only once; pushing it unconditionally
                                    // would fill processedLinks with duplicates.
                                    processedLinks.push(URL);
                                }
                            }
                            else
                            {
                                try {
                                    Thread.sleep(1000);
                                } catch (InterruptedException ex) {
                                    Logger.getLogger(JCrawler.class.getName()).log(Level.SEVERE, null, ex);
                                }
                                if (numberObjects > 1)
                                    System.out.println("Queue currently empty...waiting...");
                                else
                                {
                                    printProcessedLinks();
                                    System.out.println("Queue empty...quitting.");
                                    return;
                                }
                            }
                        }
                        else
                        {
                            // Sleep while paused instead of busy-spinning on the console.
                            System.out.println("Paused...waiting to resume...");
                            try {
                                Thread.sleep(1000);
                            } catch (InterruptedException ex) {
                                Logger.getLogger(JCrawler.class.getName()).log(Level.SEVERE, null, ex);
                            }
                        }
                    }
                }
            }
        }).start();
    }
    /**************************************************
     * Fetches the content of URL and returns it; otherwise returns a blank
     * string and prints an appropriate error message.
     *
     * Like crawl(), getWebsiteContent() is a core method of the JCrawler class and should
     * not be altered. Therefore, it is final.
     **************************************************/
    private final String getWebsiteContent(String URL)
    {
        boolean noError = false;
        int i = 0;
        try
        {
            URL myURL = new URL(URL);
            HttpURLConnection.setFollowRedirects(false);
            while ( (i < connectionRetries) && (noError == false) )
            {
                noError = true;
                i++;
                try
                {
                    // Open a fresh connection per attempt and check the response code
                    // before reading. (The original opened the reader first and then
                    // reopened the connection, so it checked the wrong response.)
                    HttpURLConnection myConn = (HttpURLConnection)myURL.openConnection();
                    myConn.setInstanceFollowRedirects(false);
                    // Identify JCrawler as such; must be set before the request is sent.
                    myConn.setRequestProperty("User-Agent", identity);
                    // getContentType() may be null; treat that as non-image content.
                    String contentType = myConn.getContentType();
                    if ( (myConn.getResponseCode() == 200) && (contentType == null || !contentType.startsWith("image")) )
                    {
                        BufferedReader in = new BufferedReader(new InputStreamReader(myConn.getInputStream()));
                        // Use a StringBuilder; repeated string concatenation is quadratic.
                        StringBuilder buffer = new StringBuilder();
                        String inputLine;
                        while ((inputLine = in.readLine()) != null)
                        {
                            buffer.append(inputLine).append("\n");
                        }
                        in.close();
                        System.out.println("Opening "+URL+"... Server response "+myConn.getResponseCode()+". OK.");
                        return buffer.toString();
                    }
                    else
                    {
                        System.out.println("Opening "+URL+"... Server response "+myConn.getResponseCode()+", Content type "+contentType+". PAGE NOT INDEXED.");
                        return "";
                    }
                }
                catch (SocketTimeoutException e)
                {
                    System.out.println("Request timed out. Retrying ... "+i+" of "+connectionRetries+".");
                    Thread.sleep(2000);
                    noError = false;
                }
                catch (NoRouteToHostException e)
                {
                    System.out.println("Lost internet connection. Retrying ... "+i+" of "+connectionRetries+".");
                    Thread.sleep(5000);
                    noError = false;
                }
                catch (UnknownHostException e)
                {
                    System.out.println("Unknown host. Retrying ... "+i+" of "+connectionRetries+".");
                    Thread.sleep(2000);
                    noError = false;
                }
                catch (ConnectException e)
                {
                    System.out.println("Connection refused. Retrying ... "+i+" of "+connectionRetries+".");
                    Thread.sleep(2000);
                    noError = false;
                }
                catch (java.io.FileNotFoundException e)
                {
                    System.out.println("File not found. Skipping "+URL+" ...");
                    return "";
                }
            }
        }
        catch (MalformedURLException e)
        {
            System.out.println("Malformed URL...");
            noError = false;
        }
        catch (java.io.IOException e)
        {
            System.out.println("Bad input... ");
        }
        catch (InterruptedException e)
        {
            System.out.println("Interrupted exception... ");
        }
        System.out.println("Giving up. Skipping "+URL+" ...");
        return "";
    }
    /**************************************************
     * Returns the processed link, or a blank string if the link is null or matches
     * an exception. Also normalizes the link.
     *
     * Method should not be altered. Alter normalizeLink() or stripExceptions() instead.
     **************************************************/
    private final String processLink(String link, String URL)
    {
        if (link == null)
            return "";
        if (stripExceptions(link))
            return normalizeLink(link, URL);
        else
            return "";
    }
    /**************************************************
     * Returns the normalized link. A relative link is turned into an absolute link.
     **************************************************/
    public static String normalizeLink(String link, String URL)
    {
        link = link.replace("www.", "");
        if (link.endsWith("/"))
            link = link.substring(0, link.length()-1);
        if (!link.startsWith("http://"))
        {
            // Resolve "../" by stripping one path segment from the URL per occurrence.
            // (Note: the base is reduced to the site root below, so "../" links end up
            // resolved against the domain rather than the true parent directory.)
            while (link.startsWith("../"))
            {
                String[] temp = URL.split("/");
                String newURL = "";
                for (int i = 0; i < temp.length-2; i++)
                    newURL += temp[i]+"/";
                URL = newURL;
                link = link.substring(3);
            }
            URL = getDomainFromURL(URL); // get the base URL
            if (!link.startsWith("/"))
            {
                if (link.contains("."))
                    return URL+"/"+link;
                else
                    return URL+"/"+link+"/";
            }
            else
            {
                return URL+link;
            }
        }
        else
            return link;
    }
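    // Illustrative results of normalizeLink() (not part of the original paste;
    // assuming the site root is http://foo.com):
    //   normalizeLink("/bar", "http://foo.com/a")      -> "http://foo.com/bar"
    //   normalizeLink("page.html", "http://foo.com/a") -> "http://foo.com/page.html"
    //   normalizeLink("about", "http://foo.com")       -> "http://foo.com/about/"
    // Note that plain relative links are resolved against the site root rather than
    // the directory of the referring page.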
    /**************************************************
     * Returns false if the link matches an exception, otherwise returns true.
     *
     * An alternative solution would be to move the exceptions into a file to easily add
     * and remove exceptions to and from the list instead of hard-coding them.
     **************************************************/
    private boolean stripExceptions(String link)
    {
        if (link.contains("mailto:"))
            return false;
        if (link.contains("ads"))
            return false;
        if (link.equals("/"))
            return false;
        if (link.contains("#"))
            return false;
        if (link.contains("javascript:"))
            return false;
        return true;
    }
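    // Example filter decisions (illustrative, not part of the original paste):
    //   stripExceptions("mailto:jan@example.com") -> false (skipped)
    //   stripExceptions("#top")                   -> false (skipped)
    //   stripExceptions("/contact.html")          -> true  (crawled)
    // Note that the substring test on "ads" also rejects legitimate links such as
    // "/downloads/", so the hard-coded list is deliberately coarse.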
    /**************************************************
     * Returns the domain from a URL.
     * E.g. http://www.foo.com/bar => http://foo.com ("www." is stripped).
     **************************************************/
    public static String getDomainFromURL(String URL)
    {
        try
        {
            URL = URL.replace("www.", "");
            String[] partialURL = URL.split("/", 4);
            URL = partialURL[0]+"/"+partialURL[1]+"/"+partialURL[2];
        }
        catch (ArrayIndexOutOfBoundsException e)
        {
            System.out.println("Out of bounds exception in URL.");
        }
        return URL;
    }
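    // Worked example (illustrative): getDomainFromURL("http://www.foo.com/bar/baz")
    // first strips "www." and then keeps the first three "/"-separated parts:
    //   "http://foo.com/bar/baz" -> ["http:", "", "foo.com", "bar/baz"] -> "http://foo.com"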
    /**************************************************
     * Prints out the list of processed links.
     **************************************************/
    public void printProcessedLinks()
    {
        Iterator<String> x = processedLinks.iterator();
        System.out.print("Processed links: ");
        while (x.hasNext())
            System.out.print(x.next()+"|");
        System.out.println();
    }
}
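/**************************************************
 * Minimal usage sketch (not part of the original paste). It assumes the HTMLUtils
 * class in this package provides the extractLinks() method used by crawl() above;
 * "PrintCrawler" and the start URL are hypothetical names chosen for illustration.
 **************************************************/
class PrintCrawler extends JCrawler
{
    PrintCrawler(String URL) { super(URL); }

    @Override
    void doAction(String URL, String content)
    {
        // Do something useful with the fetched page; here we only report it.
        System.out.println("Fetched " + URL + " (" + content.length() + " characters)");
    }

    public static void main(String[] args)
    {
        // The first instance must be constructed with a URL; it starts crawling
        // immediately from its constructor.
        new PrintCrawler("http://example.com");
    }
}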