Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.io.*;
- import java.net.*;
- import java.util.*;
- import java.util.concurrent.*;
- import java.util.regex.*;
/**
 * A concurrent web crawler.
 *
 * <p>Starting from a root URL, pages are fetched and their anchor links
 * extracted with regular expressions; each newly discovered link is crawled
 * as a {@link RecursiveTask} on a {@link ForkJoinPool}, up to a fixed depth.
 * Visited URLs are de-duplicated through a {@link ConcurrentMap}, and a
 * dedicated printer thread drains a {@link BlockingQueue} of progress
 * messages so crawl threads never block on console output.
 */
public class Crawler {

    /** Maximum link depth to follow below the root URL. */
    private static final int MAX_DEPTH = 3;

    // Matches <a href="..."> tags; skips mailto: and javascript: schemes.
    // Group 3 is the "http://"/"https://" prefix (null for relative links),
    // group 4 the remainder of the href.
    private static final String LINK_TAG_PATTERN =
        "(?i)(?:<\\s*a)\\s+(?:href)\\s*=\\s*(['\"])\\s*(?!(mailto|javascript):)(https?://)?([^\"']+)\\1";
    // Matches an optional <base href="..."> tag; group 3 is the absolute base URL.
    private static final String BASE_TAG_PATTERN =
        "(?i)<\\s*base(?:\\s*target=([\"'])[^\"']+\\1)?\\s+href=\\s*(['\"])\\s*(https?://[^\"']+)\\2";

    // Patterns are compiled once and reused across all crawl tasks.
    private static final Pattern reLink = Pattern.compile( LINK_TAG_PATTERN );
    private static final Pattern reBase = Pattern.compile( BASE_TAG_PATTERN );

    private final boolean stayInDomain;
    private final URL rootURL;
    private final String rootDomain;

    // Concurrency related members
    private final BlockingQueue<Tuple<Long,Tuple<URL,Integer>>> printerQ =
        new ArrayBlockingQueue<Tuple<Long,Tuple<URL,Integer>>>( 20 );
    private final ConcurrentMap<String,URL> sitesVisited =
        new ConcurrentHashMap<String,URL>( );
    private final ForkJoinPool pool = new ForkJoinPool( 20 );
    // volatile: written by crawl() on the caller thread, read by the printer
    // thread's loop condition — guarantees the printer sees the shutdown flag.
    private volatile boolean done = false;

    /** Minimal immutable two-element pair. */
    private static class Tuple<T1,T2> {
        final T1 first;
        final T2 second;

        Tuple( T1 first, T2 second ) {
            this.first = first;
            this.second = second;
        }
    }

    /**
     * Creates a crawler that may leave the root URL's domain.
     *
     * @param rootURL the absolute URL where crawling starts
     * @throws MalformedURLException if {@code rootURL} is not a valid URL
     */
    public Crawler( String rootURL ) throws MalformedURLException {
        this( rootURL, false );
    }

    /**
     * Creates a crawler.
     *
     * @param rootURL      the absolute URL where crawling starts
     * @param stayInDomain if {@code true}, only links whose host ends with the
     *                     root's registered domain are followed
     * @throws MalformedURLException if {@code rootURL} is not a valid URL
     */
    public Crawler( String rootURL, boolean stayInDomain ) throws MalformedURLException {
        this.rootURL = new URL( rootURL );
        // Strip a single leading label (e.g. "www.") so "news.example.com"
        // and "example.com" compare as the same domain.
        rootDomain = this.rootURL.getHost( ).replaceAll(
            "^(?:\\p{ASCII}+\\.)?(\\p{ASCII}+\\.\\p{ASCII}+)$", "$1" );
        this.stayInDomain = stayInDomain;
    }

    /**
     * Downloads the document at {@code url}.
     *
     * @return the page body as a single string (newlines dropped), or
     *         {@code null} if the response is not HTML or the fetch fails
     */
    private String fetch( URL url ) throws IOException {
        HttpURLConnection urlc = ( HttpURLConnection ) url.openConnection( );
        urlc.setRequestMethod( "GET" );
        urlc.setConnectTimeout( 5000 );
        urlc.setReadTimeout( 5000 );
        // BUG FIX: "Content-type" describes a request body, which a GET does
        // not have; "Accept" is the header that asks the server for HTML.
        urlc.setRequestProperty( "Accept", "text/html" );
        urlc.connect( );
        try {
            String contentType = urlc.getContentType( );  // may be null
            if( contentType != null && contentType.contains( "html" ) ) {
                StringBuilder sb = new StringBuilder( );
                // try-with-resources: the reader (and underlying stream) is
                // closed even if readLine() throws mid-download.
                try( BufferedReader reader = new BufferedReader(
                        new InputStreamReader( urlc.getInputStream( ), "UTF-8" ) ) ) {
                    String inputLine;
                    while( ( inputLine = reader.readLine( ) ) != null )
                        sb.append( inputLine );
                }
                return sb.toString( );
            }
            return null;
        }
        catch( Exception e ) {
            // Best-effort crawl: a single failed page must not stop the run.
            System.out.println( "Fetch of document " + url + " failed" );
            return null;
        }
        finally {
            urlc.disconnect( );
        }
    }

    /**
     * Extracts all anchor links from {@code html}, resolving relative links
     * against the page's {@code <base href>} (if present) or
     * {@code defaultBase}.
     *
     * @param html        the page body, or {@code null} (yields an empty list)
     * @param defaultBase base used for relative links when the page declares
     *                    no {@code <base>} tag
     * @return the resolved links, filtered by the stay-in-domain rule
     */
    private Collection<URL> extractLinks( String html, String defaultBase )
            throws MalformedURLException {
        if( html == null )
            return Collections.emptyList( );

        String base = defaultBase;
        Matcher matcher = reBase.matcher( html );
        if( matcher.find( ) && matcher.group( 3 ) != null )
            base = matcher.group( 3 );

        Collection<URL> urls = new ArrayList<URL>( );
        matcher = reLink.matcher( html );
        while( matcher.find( ) ) {
            String protocol = matcher.group( 3 );
            String link = matcher.group( 4 );
            if( protocol == null || ! protocol.startsWith( "http" ) )
                link = base + link;        // relative link: resolve against base
            else
                link = protocol + link;    // absolute link: reattach its scheme
            try {
                URL url = new URL( link );
                if( url.getHost( ).endsWith( rootDomain ) || ! stayInDomain )
                    urls.add( url );
            }
            catch( MalformedURLException e ) {
                // BUG FIX: one malformed href used to abort link extraction
                // for the entire page; skip just the offending link instead.
            }
        }
        return urls;
    }

    /**
     * One fork/join unit of work: report this page, fetch it, and fork a
     * child task for every unvisited link while under {@link #MAX_DEPTH}.
     * Returns all URLs visited in this subtree.
     */
    @SuppressWarnings("serial")
    private class CrawlerTask extends RecursiveTask<Collection<URL>> {
        private final Tuple<URL,Integer> msg;   // (url, depth)
        private final String base;              // default base for relative links

        CrawlerTask( Tuple<URL,Integer> msg ) {
            this.msg = msg;
            // Default base = the URL up to and including its last '/'.
            String strURL = msg.first.toExternalForm( );
            int idx = strURL.lastIndexOf( "/" );
            if( idx != -1 )
                base = strURL.substring( 0, idx + 1 );
            else
                base = strURL;
        }

        @Override
        protected Collection<URL> compute( ) {
            Collection<URL> urls = new ArrayList<URL>( );
            try {
                // Output and collect only if not the root of the search
                if( msg.second > 0 ) {
                    printerQ.put( new Tuple<Long,Tuple<URL,Integer>>(
                        Thread.currentThread( ).getId( ), msg ) );
                    urls.add( msg.first );
                }
                Collection<CrawlerTask> forks = new ArrayList<CrawlerTask>( );
                String html = fetch( msg.first );
                for( URL url : extractLinks( html, base ) ) {
                    // putIfAbsent() == null <=> this thread claimed the URL
                    // first; only the claimer may fork a task for it.
                    if( sitesVisited.putIfAbsent( url.toExternalForm( ), url ) == null
                            && msg.second < MAX_DEPTH ) {
                        CrawlerTask task =
                            new CrawlerTask( new Tuple<URL,Integer>( url, msg.second + 1 ) );
                        forks.add( task );
                        task.fork( );
                    }
                }
                for( CrawlerTask task : forks )
                    urls.addAll( task.join( ) );
                return urls;
            }
            catch( InterruptedException e ) {
                Thread.currentThread( ).interrupt( );  // preserve interrupt status
                return Collections.emptyList( );
            }
            catch( IOException e ) {
                return Collections.emptyList( );
            }
        }
    }

    public static void main( String ... args ) throws Exception {
        // BUG FIX: the original called a nonexistent no-arg constructor and
        // passed the URL to crawl(), which takes no arguments. The root URL
        // goes to the constructor; an optional command-line argument now
        // overrides the historical default.
        String root = args.length > 0 ? args[0] : "http://news.google.com/";
        Collection<URL> urls = new Crawler( root ).crawl( );
        System.out.println( "Crawling ended with " + urls.size( ) + " URLs" );
    }

    /**
     * Crawls from the root URL down to {@link #MAX_DEPTH}, printing each
     * visited page as it is reported.
     *
     * @return every URL visited (excluding the root itself)
     */
    public Collection<URL> crawl( ) throws IOException, InterruptedException {
        // Single printer thread: drains printerQ until crawling is done AND
        // the queue is empty, so no progress message is lost at shutdown.
        ExecutorService es = Executors.newFixedThreadPool( 1 );
        es.submit(
            new Callable<Void>( ) {
                @Override
                public Void call( ) throws Exception {
                    int counter = 0;
                    do {
                        // Poll with a timeout so the loop can re-check 'done'.
                        Tuple<Long,Tuple<URL,Integer>> msg =
                            printerQ.poll( 1000, TimeUnit.MILLISECONDS );
                        if( msg != null ) {
                            System.out.println(
                                String.format( "%5d %s crawled by thread %d. Depth is %d",
                                    counter++, msg.second.first, msg.first, msg.second.second ) );
                        }
                    } while( ! done || ! printerQ.isEmpty( ) );
                    return null;
                }
            } );

        CrawlerTask task = new CrawlerTask( new Tuple<URL,Integer>( rootURL, 0 ) );
        pool.submit( task );
        Collection<URL> urls = task.join( );   // blocks until the whole tree is done
        pool.shutdown( );
        done = true;   // volatile write: releases the printer thread's loop
        es.shutdown( );
        return urls;
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement