JCrawler.java
package jcrawler;

import java.io.*;
import java.net.*;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Stack;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * @author Jan Kuboschek
 *
 * Basic crawler class to easily and quickly interact with one website.
 * Override "doAction(String URL, String content)" to process the content further
 * (e.g. store it, parse it).
 *
 * + Does not attempt to fetch images.
 * + Identifies itself as JCrawler/1.0 http://kuboschek.de/jan
 * + Does not take robots.txt into consideration. Support should/could be added in the future.
 *
 * The concept allows for multi-threading of crawlers. All class instances share
 * the processed and queued lists of links.
 *
 * Instead of keeping track of processed links and queued links within the object,
 * a JDBC connection could be established to store links in a database.
 *
 * Currently limited to one website at a time; however, this could be expanded upon
 * by adding an externalLinks stack and adding to it as appropriate.
 *
 * JCrawler is intended to be used to quickly generate XML sitemaps or parse websites
 * for your desired information. It's lightweight.
 */
public abstract class JCrawler extends Thread
{
    static private Stack<String> internalLinks;

    // Potentially, we could write a link class that contains the link and a time the link was
    // retrieved and use that class as LinkedList type.
    // Note: Stack is synchronized, but LinkedList is not; with several crawler threads
    // running at once, access to processedLinks should be synchronized as well.
    static private LinkedList<String> processedLinks;
    static private String currentDomain;
    static private int numberObjects;

    static final private String identity = "JCrawler/1.0 http://kuboschek.de/jan";

    private final int connectionRetries;

    // volatile so the crawl thread sees pause/resume calls made from other threads
    private volatile boolean pause;


    /**************************************************
     * Construct a new JCrawler object. Requires an object constructed with URL first.
     **************************************************/
    JCrawler()
    {
        if (internalLinks == null || currentDomain == null)
        {
            System.out.println("Error: First JCrawler object must be constructed with a URL.");
            System.exit(1); // non-zero exit status to signal the error
        }
        connectionRetries = 3;
        numberObjects++;
        pause = false;
        crawl();
    }

    /**************************************************
     * Construct a new JCrawler object with parameter.
     * If an object with parameter has already been constructed,
     * treats this as the default constructor.
     **************************************************/
    JCrawler(String URL)
    {
        if (internalLinks == null)
        {
            internalLinks = new Stack<String>();
            processedLinks = new LinkedList<String>();
            currentDomain = getDomainFromURL(URL);
            numberObjects = 1;
        }
        else
            numberObjects++;
        connectionRetries = 3;
        internalLinks.add(URL);
        pause = false;
        crawl();
    }

    /**************************************************
     * Override to customize the crawler, e.g. to parse contents, build XML site maps, etc.
     *
     * Makes the current URL and the URL's content available to the method.
     **************************************************/
    abstract void doAction(String URL, String content);

    /**************************************************
     * Pauses the crawler.
     **************************************************/
    public void pauseCrawler() { pause = true; }

    /**************************************************
     * Resumes the crawler.
     **************************************************/
    public void resumeCrawler() { pause = false; }

    /**************************************************
     * Returns the list of processed links.
     **************************************************/
    public List<String> getProcessedLinks() { return processedLinks; }

    /**************************************************
     * Checks if the crawler is paused. If not, checks if the crawler has been idle for too long.
     * If not, fetches a link from the stack, reads its content, pushes new links onto the stack
     * and calls doAction (an abstract method to do something with the content).
     *
     * crawl() is made final as it is the heart of the class' functionality. crawl() should not be
     * altered by any classes that potentially extend JCrawler; override e.g. the exception methods
     * instead if it's really necessary.
     *************************************************/
    private final void crawl()
    {
        (new Thread() {
            @Override
            public void run() {
                int idleTime = 0; // consecutive seconds the queue has been empty
                while (true)
                {
                    if (idleTime == 5)
                    {
                        System.out.println("Stopping thread. Queue has been empty for "+idleTime+" seconds.");
                        return;
                    }
                    else
                    {
                        if (!pause)
                        {
                            idleTime++;
                            if (internalLinks.size() > 0)
                            {
                                idleTime = 0;
                                String URL = internalLinks.pop();
                                String content = "";
                                if (!processedLinks.contains(URL))
                                {
                                    content = getWebsiteContent(URL);
                                    if (!content.equals(""))
                                    {
                                        doAction(URL, content);
                                        try {
                                            List<String> links = HTMLUtils.extractLinks(content);
                                            for (String link : links)
                                            {
                                                link = processLink(link, URL);
                                                if (!link.equals("") && (getDomainFromURL(link).equalsIgnoreCase(currentDomain)) && (!internalLinks.contains(link)))
                                                    internalLinks.push(link);
                                            }
                                        } catch (IOException ex) {
                                            Logger.getLogger(JCrawler.class.getName()).log(Level.SEVERE, null, ex);
                                        }
                                    }
                                    // Record the URL as processed exactly once.
                                    processedLinks.push(URL);
                                }
                            }
                            else
                            {
                                try {
                                    Thread.sleep(1000);
                                } catch (InterruptedException ex) {
                                    Logger.getLogger(JCrawler.class.getName()).log(Level.SEVERE, null, ex);
                                }
                                if (numberObjects > 1)
                                    System.out.println("Queue currently empty...waiting...");
                                else
                                {
                                    printProcessedLinks();
                                    System.out.println("Queue empty...quitting.");
                                    return;
                                }
                            }
                        }
                        else
                        {
                            System.out.println("Paused...waiting to resume...");
                            try {
                                // Sleep while paused instead of busy-looping.
                                Thread.sleep(1000);
                            } catch (InterruptedException ex) {
                                Logger.getLogger(JCrawler.class.getName()).log(Level.SEVERE, null, ex);
                            }
                        }
                    }
                }
            }
        }).start();
    }

    /**************************************************
     * Fetches the content of URL and returns it; otherwise returns a blank
     * string and prints an appropriate error message.
     *
     * Like crawl(), getWebsiteContent() is a core method of the JCrawler class and should
     * not be altered. Therefore, final.
     **************************************************/
    private final String getWebsiteContent(String URL)
    {
        boolean noError = false;
        int i = 0;
        try
        {
            URL myURL = new URL(URL);
            HttpURLConnection myConn;
            BufferedReader in;
            HttpURLConnection.setFollowRedirects(false);
            while ( (i < connectionRetries) && (noError == false) )
            {
                noError = true;
                i++;
                try
                {
                    // Open a fresh connection for each attempt and check the
                    // response code and content type before reading the body.
                    myConn = (HttpURLConnection)myURL.openConnection();
                    myConn.setInstanceFollowRedirects(false);
                    // Identify JCrawler as such.
                    myConn.setRequestProperty("User-Agent", identity);

                    if ( (myConn.getResponseCode() == 200)
                            && (myConn.getContentType() == null || !myConn.getContentType().startsWith("image")) )
                    {
                        in = new BufferedReader(new InputStreamReader(myConn.getInputStream()));
                        StringBuilder buffer = new StringBuilder();
                        String inputLine;

                        while ((inputLine = in.readLine()) != null)
                        {
                            buffer.append(inputLine).append("\n");
                        }
                        in.close();
                        System.out.println("Opening "+URL+"... Server response "+myConn.getResponseCode()+". OK. ");
                        return buffer.toString();
                    }
                    else
                    {
                        System.out.println("Opening "+URL+"... Server response "+myConn.getResponseCode()+", Content type "+myConn.getContentType()+". PAGE NOT INDEXED. ");
                        return "";
                    }
                }
                catch (SocketTimeoutException e)
                {
                    System.out.println("Request timed out. Retrying ... "+i+" of "+connectionRetries+".");
                    Thread.sleep(2000);
                    noError = false;
                }
                catch (NoRouteToHostException e)
                {
                    System.out.println("Lost internet connection. Retrying ... "+i+" of "+connectionRetries+".");
                    Thread.sleep(5000);
                    noError = false;
                }
                catch (UnknownHostException e)
                {
                    System.out.println("Unknown host. Retrying ... "+i+" of "+connectionRetries+".");
                    Thread.sleep(2000);
                    noError = false;
                }
                catch (ConnectException e)
                {
                    System.out.println("Connection refused. Retrying ... "+i+" of "+connectionRetries+".");
                    Thread.sleep(2000);
                    noError = false;
                }
                catch (java.io.FileNotFoundException e)
                {
                    System.out.println("File not found... ");
                }
            }
        }
        catch (MalformedURLException e)
        {
            System.out.println("Malformed URL...");
            noError = false;
        }
        catch (java.io.IOException e)
        {
            System.out.println("Bad input... ");
        }
        catch (InterruptedException e)
        {
            System.out.println("Interrupted exception... ");
        }

        System.out.println("Could not fetch "+URL+". Skipping ...");
        return "";
    }

    /**************************************************
     * Returns the processed link, or a blank string if the link is null or matches an exception.
     * Also normalizes the link.
     *
     * Method should not be altered. Alter normalizeLink() or stripExceptions() instead.
     **************************************************/
    private final String processLink(String link, String URL)
    {
        if (link == null)
            return "";
        if (stripExceptions(link))
            return normalizeLink(link, URL);
        else
            return "";
    }

    /**************************************************
     * Returns the normalized link. A relative link is turned into an absolute link.
     * E.g. normalizeLink("../foo.html", "http://site.com/a/b.html") => "http://site.com/foo.html"
     **************************************************/
    public static String normalizeLink(String link, String URL)
    {
        link = link.replace("www.", "");
        if (link.endsWith("/"))
            link = link.substring(0, (link.length()-1));
        if (!link.startsWith("http://") && !link.startsWith("https://"))
        {
            String base;
            if (link.startsWith("../"))
            {
                // Start from the URL's directory and climb one level per "../".
                int cut = URL.lastIndexOf('/');
                base = (cut > "http://".length()) ? URL.substring(0, cut) : URL;
                while (link.startsWith("../"))
                {
                    cut = base.lastIndexOf('/');
                    if (cut > "http://".length()) // never climb above the domain
                        base = base.substring(0, cut);
                    link = link.substring(3, link.length());
                }
            }
            else
                base = getDomainFromURL(URL); // resolve other relative links against the site root

            if (link.startsWith("/") == false)
            {
                if (link.contains("."))      // looks like a file
                    return base+"/"+link;
                else                         // looks like a directory
                    return base+"/"+link+"/";
            }
            else
                return base+link;
        }
        else
            return link;
    }

    /**************************************************
     * Returns false if the link matches an exception, otherwise returns true.
     *
     * An alternative solution would be to move the exceptions into a file to easily add and remove
     * exceptions to and from the list instead of hard-coding them here.
     **************************************************/
    private boolean stripExceptions(String link)
    {
        if (link.contains("mailto:"))
            return false;
        if (link.contains("ads"))   // note: broad match, also catches e.g. "downloads"
            return false;
        if (link.equals("/"))
            return false;
        if (link.contains("#"))
            return false;
        if (link.contains("javascript:"))
            return false;
        return true;
    }

    /**************************************************
     * Returns the domain from URL, with "www." stripped.
     * E.g. http://www.foo.com/bar => http://foo.com
     **************************************************/
    public static String getDomainFromURL(String URL)
    {
        try
        {
            URL = URL.replace("www.", "");
            String partialURL[] = URL.split("/", 4);
            URL = partialURL[0]+"/"+partialURL[1]+"/"+partialURL[2];
        }
        catch (ArrayIndexOutOfBoundsException e)
        {
            System.out.println("Out of bounds exception in URL: "+URL);
        }
        return URL;
    }

    /**************************************************
     * Prints out the list of processed links.
     **************************************************/
    public void printProcessedLinks()
    {
        Iterator<String> x = processedLinks.iterator();
        System.out.print("Processed links: ");
        while (x.hasNext())
            System.out.println(x.next()+"|");
    }
}
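
Usage: since JCrawler is abstract, a subclass supplies doAction(). Below is a minimal
sketch of such a subclass, not part of the original source. The start URL is a
hypothetical placeholder, and the HTMLUtils.extractLinks() helper referenced by
crawl() is assumed to be on the classpath.

package jcrawler;

// Minimal example crawler: prints each fetched page instead of storing it.
public class PrintingCrawler extends JCrawler
{
    PrintingCrawler(String URL) { super(URL); }

    @Override
    void doAction(String URL, String content)
    {
        // Called once per successfully fetched page; parse or store content here.
        System.out.println("Fetched "+URL+" ("+content.length()+" chars)");
    }

    public static void main(String[] args)
    {
        // The first crawler must be constructed with a URL; further no-argument
        // instances would share the same queue and crawl in parallel.
        new PrintingCrawler("http://example.com");
    }
}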