WebCrawler Java fork/join

a guest
Jan 18th, 2012
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.regex.*;

public class Crawler {
    // Matches href attributes, ignoring fragment-only, mailto:, javascript: and location. links.
    private static final Pattern pattern1 =
        Pattern.compile( "(?i)href\\s*=\\s*(\"|\\')/?((?!#.*|/\\B|mailto:|location\\.|javascript:)[^\"\']+)(\"|\')" );
    // URLs already seen, keyed by their external form, so no page is crawled twice.
    private final ConcurrentMap<String,URL> sitesVisited = new ConcurrentHashMap<String,URL>( );
    private final ForkJoinPool pool = new ForkJoinPool( 10 );
    // This member MUST be volatile: it is written by the crawl thread and read by the printer task.
    private volatile boolean done = false;
    // Bounded queue feeding the single printer thread started in crawl( ).
    private final BlockingQueue<Tuple<Long,Tuple<URL,Integer>>> printerQ =
        new ArrayBlockingQueue<Tuple<Long,Tuple<URL,Integer>>>( 20 );

    private final int MAX_DEPTH = 3;

    private static class Tuple<T1,T2> {
        Tuple( T1 first,T2 second ) {
            this.first = first;
            this.second = second;
        }
        final T1 first;
        final T2 second;
    }

    // Fetches the document at the given URL and returns its HTML,
    // or null if the content is not HTML or the fetch fails.
    private String fetch( URL url ) throws IOException {
        HttpURLConnection urlc = ( HttpURLConnection ) url.openConnection( );
        urlc.setRequestMethod( "GET" );
        urlc.setConnectTimeout( 5000 );
        urlc.setRequestProperty( "Content-type","text/html" );
        urlc.connect( );
        try {
            if( urlc.getContentType( ) != null && urlc.getContentType( ).contains( "html" ) ) {
                BufferedReader reader = new BufferedReader( new InputStreamReader( urlc.getInputStream( ) ) );
                StringBuilder sb = new StringBuilder( );
                String inputLine;
                while( ( inputLine = reader.readLine( ) ) != null )
                    sb.append( inputLine );
                reader.close( );
                return sb.toString( );
            }
            return null;
        }
        catch( Exception e ) {
            System.out.println( "Fetch of document " + url + " failed" );
            return null;
        }
        finally {
            urlc.disconnect( );
        }
    }

    // Extracts the absolute http(s) links from the HTML; relative links matched by the
    // pattern are skipped, since this method has no base URL to resolve them against.
    private Collection<URL> extractLinks( String html ) throws MalformedURLException {
        if( html == null ) return Collections.emptyList( );
        Collection<URL> urls = new ArrayList<URL>( );
        Matcher matcher = pattern1.matcher( html );
        while( matcher.find( ) ) {
            String link = matcher.group( 2 );
            if( ! link.startsWith( "http" ) )
                continue;
            urls.add( new URL( link ) );
        }
        return urls;
    }

    /**
     * For more information see http://gee.cs.oswego.edu/dl/papers/fj.pdf
     * and http://www.coopsoft.com/ar/ForkJoinArticle.html
     */
    @SuppressWarnings("serial")
    private class CrawlerTask extends RecursiveTask<Collection<URL>> {
        private final Tuple<URL,Integer> msg;

        CrawlerTask( Tuple<URL,Integer> msg ) {
            this.msg = msg;
        }

        @Override
        protected Collection<URL> compute( ) {
            Collection<URL> urls = new ArrayList<URL>( );
            try {
                // Output and collect only if not the root of the search
                if( msg.second > 0 ) {
                    printerQ.put( new Tuple<Long,Tuple<URL,Integer>>( Thread.currentThread().getId( ),msg ) );
                    urls.add( msg.first );
                }
                // Fork one subtask per unseen link (up to MAX_DEPTH), then join them all and merge the results.
                Collection<CrawlerTask> forks = new ArrayList<CrawlerTask>( );
                String html = fetch( msg.first );
                for( URL url : extractLinks( html ) ) {
                    if( sitesVisited.putIfAbsent( url.toExternalForm( ),url ) == null && msg.second < MAX_DEPTH ) {
                        CrawlerTask task = new CrawlerTask( new Tuple<URL,Integer>( url,msg.second + 1 ) );
                        forks.add( task );
                        task.fork( );
                    }
                }

                for( CrawlerTask task : forks )
                    urls.addAll( task.join( ) );
                return urls;
            }
            catch( InterruptedException e ) {
                Thread.currentThread( ).interrupt( );
                return Collections.emptyList( );
            }
            catch( IOException e ) {
                return Collections.emptyList( );
            }
        }
    }

    public static void main( String ... args ) throws Exception {
        //new Crawler( ).crawl( "http://localhost:9080/Crawler_Files/test.html" );
        Collection<URL> urls = new Crawler( ).crawl( "http://news.google.com/" );
        System.out.println( "Crawling ended with " + urls.size( ) + " URLs" );
    }

    public Collection<URL> crawl( String strURL ) throws IOException, InterruptedException {
        // Single printer thread: consumes crawl notifications from printerQ and logs them.
        ExecutorService es = Executors.newFixedThreadPool( 1 );
        es.submit(
            new Callable<Void>( ) {
                @Override
                public Void call( ) throws Exception {
                    do {
                        // Poll with a timeout (instead of take( )) so the loop can re-check the
                        // done flag and terminate once the queue is drained; a blocked take( )
                        // would never return after the crawl finishes and would keep the JVM alive.
                        Tuple<Long, Tuple<URL, Integer>> msg = printerQ.poll( 1, TimeUnit.SECONDS );
                        if( msg != null )
                            System.out.println(
                                    String.format(  "URL %s crawled by agent %d. Depth is %d",
                                                    msg.second.first,msg.first,msg.second.second ) );
                    } while( ! done || ! printerQ.isEmpty( ) );
                    return null;
                }
            } );

        CrawlerTask task = new CrawlerTask( new Tuple<URL,Integer>( new URL( strURL ),0 ) );
        pool.submit( task );
        Collection<URL> urls = task.join( );
        // Signal the printer thread that crawling is over and let it finish draining the queue.
        done = true;
        es.shutdown( );
        return urls;
    }
}
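
The Javadoc above points to Doug Lea's fork/join paper. For readers new to the framework, here is a minimal, self-contained sketch of the same fork-then-join recursion that CrawlerTask relies on: each task either does its work directly or splits it, forks the subtasks, and joins their results. It is illustrative only and not part of the original paste; SumTask and THRESHOLD are names invented for the example.

import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RecursiveTask;

public class SumTask extends RecursiveTask<Long> {
    private static final int THRESHOLD = 1000;
    private final long[] data;
    private final int from, to;             // half-open range [from, to)

    SumTask( long[] data, int from, int to ) {
        this.data = data;
        this.from = from;
        this.to = to;
    }

    @Override
    protected Long compute( ) {
        if( to - from <= THRESHOLD ) {      // small enough: sum sequentially
            long sum = 0;
            for( int i = from; i < to; i++ )
                sum += data[ i ];
            return sum;
        }
        int mid = ( from + to ) >>> 1;
        SumTask left  = new SumTask( data, from, mid );
        SumTask right = new SumTask( data, mid, to );
        left.fork( );                       // schedule the left half asynchronously
        long rightSum = right.compute( );   // work on the right half in this thread
        return left.join( ) + rightSum;     // wait for the forked half and combine
    }

    public static void main( String[] args ) {
        long[] data = new long[ 10000 ];
        for( int i = 0; i < data.length; i++ )
            data[ i ] = i;
        long sum = new ForkJoinPool( ).invoke( new SumTask( data, 0, data.length ) );
        System.out.println( "Sum = " + sum ); // prints 49995000
    }
}

CrawlerTask follows the same shape, except that the "split" step is discovering links in the fetched page and the combined result is the set of crawled URLs.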