Bug fix to web crawler

a guest
Jan 20th, 2012
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.regex.*;

public class Crawler {
    // Maximum link depth followed from the root URL
    private final int MAX_DEPTH = 3;
    private static final String LINK_TAG_PATTERN =
        "(?i)(?:<\\s*a)\\s+(?:href)\\s*=\\s*(['\"])\\s*(?!(mailto|javascript):)(https?://)?([^\"']+)\\1";
    private static final String BASE_TAG_PATTERN =
        "(?i)<\\s*base(?:\\s*target=([\"'])[^\"']+\\1)?\\s+href=\\s*(['\"])\\s*(https?://[^\"']+)\\2";

    private static final Pattern reLink = Pattern.compile( LINK_TAG_PATTERN );
    private static final Pattern reBase = Pattern.compile( BASE_TAG_PATTERN );
    private final boolean stayInDomain;
    private final URL rootURL;
    private final String rootDomain;
    // Concurrency related members
    private final BlockingQueue<Tuple<Long,Tuple<URL,Integer>>> printerQ =
        new ArrayBlockingQueue<Tuple<Long,Tuple<URL,Integer>>>( 20 );
    private final ConcurrentMap<String,URL> sitesVisited =
        new ConcurrentHashMap<String,URL>( );
    private final ForkJoinPool pool = new ForkJoinPool( 20 );
    // This member MUST be volatile so the crawl thread's update is visible to the printer thread
    private volatile boolean done = false;

    // Simple immutable pair used to pass (URL, depth) and (thread id, message) tuples around
    private static class Tuple<T1,T2> {
        Tuple( T1 first,T2 second ) {
            this.first = first;
            this.second = second;
        }
        final T1 first;
        final T2 second;
    }

    public Crawler( String rootURL ) throws MalformedURLException {
        this( rootURL,false );
    }

    public Crawler( String rootURL,boolean stayInDomain ) throws MalformedURLException {
        this.rootURL = new URL( rootURL );
        // Keep only the last two labels of the host, e.g. news.google.com -> google.com
        rootDomain = this.rootURL.getHost( ).replaceAll( "^(?:\\p{ASCII}+\\.)?(\\p{ASCII}+\\.\\p{ASCII}+)$","$1" );
        this.stayInDomain = stayInDomain;
    }

    private String fetch( URL url ) throws IOException {
        HttpURLConnection urlc = ( HttpURLConnection ) url.openConnection( );
        urlc.setRequestMethod( "GET" );
        urlc.setConnectTimeout( 5000 );
        // Ask the server for HTML; "Accept" is the appropriate header for a GET request
        urlc.setRequestProperty( "Accept","text/html" );
        urlc.connect( );
        try {
            // getContentType( ) can return null, so guard against it before checking for HTML
            String contentType = urlc.getContentType( );
            if( contentType != null && contentType.contains( "html" ) ) {
                BufferedReader reader = new BufferedReader( new InputStreamReader( urlc.getInputStream( ) ) );
                StringBuilder sb = new StringBuilder( );
                String inputLine;
                while( ( inputLine = reader.readLine( ) ) != null )
                    sb.append( inputLine );
                reader.close( );
                return sb.toString( );
            }
            return null;
        }
        catch( Exception e ) {
            System.out.println( "Fetch of document " + url + " failed" );
            return null;
        }
        finally {
            urlc.disconnect( );
        }
    }

    private Collection<URL> extractLinks( String html,String defaultBase )
        throws MalformedURLException {
        if( html == null )
            return Collections.emptyList( );

        // Honour a <base href="..."> tag if present, otherwise use the caller's base
        String base = defaultBase;
        Matcher matcher = reBase.matcher( html );
        if( matcher.find( ) && matcher.group( 3 ) != null )
            base = matcher.group( 3 );
        Collection<URL> urls = new ArrayList<URL>( );
        matcher = reLink.matcher( html );
        while( matcher.find( ) ) {
            String protocol = matcher.group( 3 );   // "http://" or "https://" when the link is absolute
            String link = matcher.group( 4 );
            if( protocol == null || ! protocol.startsWith( "http" ) )
                link = base + link;                 // relative link: resolve against the base
            else
                link = protocol + link;
            URL url = new URL( link );
            if( url.getHost( ).endsWith( rootDomain ) || ! stayInDomain )
                urls.add( url );
        }
        return urls;
    }

    @SuppressWarnings("serial")
    private class CrawlerTask extends RecursiveTask<Collection<URL>> {
        private final Tuple<URL,Integer> msg;
        private final String base;

        CrawlerTask( Tuple<URL,Integer> msg ) {
            this.msg = msg;
            String strURL = msg.first.toExternalForm( );
            int idx = strURL.lastIndexOf( "/" );
            if( idx != -1 )
                base = strURL.substring( 0,idx + 1 );
            else
                base = strURL;
        }

        @Override
        protected Collection<URL> compute( ) {
            Collection<URL> urls = new ArrayList<URL>( );
            try {
                // Output and collect only if not the root of the search
                if( msg.second > 0 ) {
                    printerQ.put( new Tuple<Long,Tuple<URL,Integer>>( Thread.currentThread( ).getId( ),msg ) );
                    urls.add( msg.first );
                }
                Collection<CrawlerTask> forks = new ArrayList<CrawlerTask>( );
                String html = fetch( msg.first );
                for( URL url : extractLinks( html,base ) ) {
                    // Fork a subtask only for URLs not seen before and within the depth limit
                    if( sitesVisited.putIfAbsent( url.toExternalForm( ),url ) == null && msg.second < MAX_DEPTH ) {
                        CrawlerTask task = new CrawlerTask( new Tuple<URL,Integer>( url,msg.second + 1 ) );
                        forks.add( task );
                        task.fork( );
                    }
                }

                for( CrawlerTask task : forks )
                    urls.addAll( task.join( ) );
                return urls;
            }
            catch( InterruptedException e ) {
                Thread.currentThread( ).interrupt( );
                return Collections.emptyList( );
            }
            catch( IOException e ) {
                return Collections.emptyList( );
            }
        }
    }

    public static void main( String ... args ) throws Exception {
        // The root URL is passed to the constructor; crawl( ) takes no arguments
        Collection<URL> urls = new Crawler( "http://news.google.com/" ).crawl( );
        System.out.println( "Crawling ended with " + urls.size( ) + " URLs" );
    }

    public Collection<URL> crawl( ) throws IOException, InterruptedException {
        // Single printer thread that drains the queue until the crawl is done and the queue is empty
        ExecutorService es = Executors.newFixedThreadPool( 1 );
        es.submit(
            new Callable<Void>( ) {
                @Override
                public Void call( ) throws Exception {
                    int counter = 0;
                    do {
                        Tuple<Long,Tuple<URL,Integer>> msg;
                        msg = printerQ.poll( 1000,TimeUnit.MILLISECONDS );
                        if( msg != null ) {
                            System.out.println(
                                String.format( "%5d %s crawled by thread %d. Depth is %d",
                                               counter++,msg.second.first,msg.first,msg.second.second ) );
                        }
                    } while( ! done || ! printerQ.isEmpty( ) );
                    return null;
                }
            } );

        CrawlerTask task = new CrawlerTask( new Tuple<URL,Integer>( rootURL,0 ) );
        pool.submit( task );
        Collection<URL> urls = task.join( );
        pool.shutdown( );
        done = true;
        es.shutdown( );
        return urls;
    }
}
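
For reference, a minimal usage sketch of the stay-in-domain overload of the constructor; the seed URL is only an example, and the Crawler class above is assumed to be compiled on the classpath.

import java.net.URL;
import java.util.Collection;

public class CrawlerDemo {
    public static void main( String ... args ) throws Exception {
        // Passing true restricts the crawl to hosts ending with the seed's domain
        Crawler crawler = new Crawler( "http://news.google.com/",true );
        Collection<URL> urls = crawler.crawl( );
        for( URL url : urls )
            System.out.println( url );
    }
}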