Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.io.*;
- import java.net.*;
- import java.util.*;
- import java.util.concurrent.*;
- import java.util.regex.*;
/**
 * A concurrent web crawler.
 *
 * <p>Starting from a root URL, pages are fetched and their anchor links
 * extracted with regular expressions; each newly discovered link is crawled
 * as a {@link RecursiveTask} on a {@link ForkJoinPool}, up to a fixed depth.
 * Visited URLs are de-duplicated through a {@link ConcurrentMap}, and a
 * dedicated printer thread drains a {@link BlockingQueue} of progress
 * messages so crawl threads never block on console output.
 */
public class Crawler {

    /** Maximum link depth to follow below the root URL. */
    private static final int MAX_DEPTH = 3;

    // Matches <a href="..."> tags; skips mailto: and javascript: schemes.
    // Group 3 is the "http://"/"https://" prefix (null for relative links),
    // group 4 the remainder of the href.
    private static final String LINK_TAG_PATTERN =
        "(?i)(?:<\\s*a)\\s+(?:href)\\s*=\\s*(['\"])\\s*(?!(mailto|javascript):)(https?://)?([^\"']+)\\1";
    // Matches an optional <base href="..."> tag; group 3 is the absolute base URL.
    private static final String BASE_TAG_PATTERN =
        "(?i)<\\s*base(?:\\s*target=([\"'])[^\"']+\\1)?\\s+href=\\s*(['\"])\\s*(https?://[^\"']+)\\2";

    // Patterns are compiled once and reused across all crawl tasks.
    private static final Pattern reLink = Pattern.compile( LINK_TAG_PATTERN );
    private static final Pattern reBase = Pattern.compile( BASE_TAG_PATTERN );

    private final boolean stayInDomain;
    private final URL rootURL;
    private final String rootDomain;

    // Concurrency related members
    private final BlockingQueue<Tuple<Long,Tuple<URL,Integer>>> printerQ =
        new ArrayBlockingQueue<Tuple<Long,Tuple<URL,Integer>>>( 20 );
    private final ConcurrentMap<String,URL> sitesVisited =
        new ConcurrentHashMap<String,URL>( );
    private final ForkJoinPool pool = new ForkJoinPool( 20 );
    // volatile: written by crawl() on the caller thread, read by the printer
    // thread's loop condition — guarantees the printer sees the shutdown flag.
    private volatile boolean done = false;

    /** Minimal immutable two-element pair. */
    private static class Tuple<T1,T2> {
        final T1 first;
        final T2 second;

        Tuple( T1 first, T2 second ) {
            this.first = first;
            this.second = second;
        }
    }

    /**
     * Creates a crawler that may leave the root URL's domain.
     *
     * @param rootURL the absolute URL where crawling starts
     * @throws MalformedURLException if {@code rootURL} is not a valid URL
     */
    public Crawler( String rootURL ) throws MalformedURLException {
        this( rootURL, false );
    }

    /**
     * Creates a crawler.
     *
     * @param rootURL      the absolute URL where crawling starts
     * @param stayInDomain if {@code true}, only links whose host ends with the
     *                     root's registered domain are followed
     * @throws MalformedURLException if {@code rootURL} is not a valid URL
     */
    public Crawler( String rootURL, boolean stayInDomain ) throws MalformedURLException {
        this.rootURL = new URL( rootURL );
        // Strip a single leading label (e.g. "www.") so "news.example.com"
        // and "example.com" compare as the same domain.
        rootDomain = this.rootURL.getHost( ).replaceAll(
            "^(?:\\p{ASCII}+\\.)?(\\p{ASCII}+\\.\\p{ASCII}+)$", "$1" );
        this.stayInDomain = stayInDomain;
    }

    /**
     * Downloads the document at {@code url}.
     *
     * @return the page body as a single string (newlines dropped), or
     *         {@code null} if the response is not HTML or the fetch fails
     */
    private String fetch( URL url ) throws IOException {
        HttpURLConnection urlc = ( HttpURLConnection ) url.openConnection( );
        urlc.setRequestMethod( "GET" );
        urlc.setConnectTimeout( 5000 );
        urlc.setReadTimeout( 5000 );
        // BUG FIX: "Content-type" describes a request body, which a GET does
        // not have; "Accept" is the header that asks the server for HTML.
        urlc.setRequestProperty( "Accept", "text/html" );
        urlc.connect( );
        try {
            String contentType = urlc.getContentType( );  // may be null
            if( contentType != null && contentType.contains( "html" ) ) {
                StringBuilder sb = new StringBuilder( );
                // try-with-resources: the reader (and underlying stream) is
                // closed even if readLine() throws mid-download.
                try( BufferedReader reader = new BufferedReader(
                        new InputStreamReader( urlc.getInputStream( ), "UTF-8" ) ) ) {
                    String inputLine;
                    while( ( inputLine = reader.readLine( ) ) != null )
                        sb.append( inputLine );
                }
                return sb.toString( );
            }
            return null;
        }
        catch( Exception e ) {
            // Best-effort crawl: a single failed page must not stop the run.
            System.out.println( "Fetch of document " + url + " failed" );
            return null;
        }
        finally {
            urlc.disconnect( );
        }
    }

    /**
     * Extracts all anchor links from {@code html}, resolving relative links
     * against the page's {@code <base href>} (if present) or
     * {@code defaultBase}.
     *
     * @param html        the page body, or {@code null} (yields an empty list)
     * @param defaultBase base used for relative links when the page declares
     *                    no {@code <base>} tag
     * @return the resolved links, filtered by the stay-in-domain rule
     */
    private Collection<URL> extractLinks( String html, String defaultBase )
            throws MalformedURLException {
        if( html == null )
            return Collections.emptyList( );

        String base = defaultBase;
        Matcher matcher = reBase.matcher( html );
        if( matcher.find( ) && matcher.group( 3 ) != null )
            base = matcher.group( 3 );

        Collection<URL> urls = new ArrayList<URL>( );
        matcher = reLink.matcher( html );
        while( matcher.find( ) ) {
            String protocol = matcher.group( 3 );
            String link = matcher.group( 4 );
            if( protocol == null || ! protocol.startsWith( "http" ) )
                link = base + link;        // relative link: resolve against base
            else
                link = protocol + link;    // absolute link: reattach its scheme
            try {
                URL url = new URL( link );
                if( url.getHost( ).endsWith( rootDomain ) || ! stayInDomain )
                    urls.add( url );
            }
            catch( MalformedURLException e ) {
                // BUG FIX: one malformed href used to abort link extraction
                // for the entire page; skip just the offending link instead.
            }
        }
        return urls;
    }

    /**
     * One fork/join unit of work: report this page, fetch it, and fork a
     * child task for every unvisited link while under {@link #MAX_DEPTH}.
     * Returns all URLs visited in this subtree.
     */
    @SuppressWarnings("serial")
    private class CrawlerTask extends RecursiveTask<Collection<URL>> {
        private final Tuple<URL,Integer> msg;   // (url, depth)
        private final String base;              // default base for relative links

        CrawlerTask( Tuple<URL,Integer> msg ) {
            this.msg = msg;
            // Default base = the URL up to and including its last '/'.
            String strURL = msg.first.toExternalForm( );
            int idx = strURL.lastIndexOf( "/" );
            if( idx != -1 )
                base = strURL.substring( 0, idx + 1 );
            else
                base = strURL;
        }

        @Override
        protected Collection<URL> compute( ) {
            Collection<URL> urls = new ArrayList<URL>( );
            try {
                // Output and collect only if not the root of the search
                if( msg.second > 0 ) {
                    printerQ.put( new Tuple<Long,Tuple<URL,Integer>>(
                        Thread.currentThread( ).getId( ), msg ) );
                    urls.add( msg.first );
                }
                Collection<CrawlerTask> forks = new ArrayList<CrawlerTask>( );
                String html = fetch( msg.first );
                for( URL url : extractLinks( html, base ) ) {
                    // putIfAbsent() == null <=> this thread claimed the URL
                    // first; only the claimer may fork a task for it.
                    if( sitesVisited.putIfAbsent( url.toExternalForm( ), url ) == null
                            && msg.second < MAX_DEPTH ) {
                        CrawlerTask task =
                            new CrawlerTask( new Tuple<URL,Integer>( url, msg.second + 1 ) );
                        forks.add( task );
                        task.fork( );
                    }
                }
                for( CrawlerTask task : forks )
                    urls.addAll( task.join( ) );
                return urls;
            }
            catch( InterruptedException e ) {
                Thread.currentThread( ).interrupt( );  // preserve interrupt status
                return Collections.emptyList( );
            }
            catch( IOException e ) {
                return Collections.emptyList( );
            }
        }
    }

    public static void main( String ... args ) throws Exception {
        // BUG FIX: the original called a nonexistent no-arg constructor and
        // passed the URL to crawl(), which takes no arguments. The root URL
        // goes to the constructor; an optional command-line argument now
        // overrides the historical default.
        String root = args.length > 0 ? args[0] : "http://news.google.com/";
        Collection<URL> urls = new Crawler( root ).crawl( );
        System.out.println( "Crawling ended with " + urls.size( ) + " URLs" );
    }

    /**
     * Crawls from the root URL down to {@link #MAX_DEPTH}, printing each
     * visited page as it is reported.
     *
     * @return every URL visited (excluding the root itself)
     */
    public Collection<URL> crawl( ) throws IOException, InterruptedException {
        // Single printer thread: drains printerQ until crawling is done AND
        // the queue is empty, so no progress message is lost at shutdown.
        ExecutorService es = Executors.newFixedThreadPool( 1 );
        es.submit(
            new Callable<Void>( ) {
                @Override
                public Void call( ) throws Exception {
                    int counter = 0;
                    do {
                        // Poll with a timeout so the loop can re-check 'done'.
                        Tuple<Long,Tuple<URL,Integer>> msg =
                            printerQ.poll( 1000, TimeUnit.MILLISECONDS );
                        if( msg != null ) {
                            System.out.println(
                                String.format( "%5d %s crawled by thread %d. Depth is %d",
                                    counter++, msg.second.first, msg.first, msg.second.second ) );
                        }
                    } while( ! done || ! printerQ.isEmpty( ) );
                    return null;
                }
            } );

        CrawlerTask task = new CrawlerTask( new Tuple<URL,Integer>( rootURL, 0 ) );
        pool.submit( task );
        Collection<URL> urls = task.join( );   // blocks until the whole tree is done
        pool.shutdown( );
        done = true;   // volatile write: releases the printer thread's loop
        es.shutdown( );
        return urls;
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement