import java.io.*;
import java.net.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.regex.*;

public class Crawler {
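    // Matches href="..." or href='...' attributes, case-insensitively,
    // skipping fragment-only, mailto:, location. and javascript: targets;
    // group 2 captures the link itself.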
    private static final Pattern pattern1 =
        Pattern.compile( "(?i)href\\s*=\\s*(\"|\\')/?((?!#.*|/\\B|mailto:|location\\.|javascript:)[^\"\']+)(\"|\')" );
    // De-duplicates URLs across fork/join worker threads; the key is the
    // external form of the URL.
    private final ConcurrentMap<String,URL> sitesVisited = new ConcurrentHashMap<String,URL>( );
    private final ForkJoinPool pool = new ForkJoinPool( 10 );
    // This member MUST be volatile: it is written by the crawler thread and
    // read by the printer thread.
    private volatile boolean done = false;
    // ( worker thread id, ( url, depth ) ) messages for the printer thread.
    private final BlockingQueue<Tuple<Long,Tuple<URL,Integer>>> printerQ =
        new ArrayBlockingQueue<Tuple<Long,Tuple<URL,Integer>>>( 20 );
    private static final int MAX_DEPTH = 3;
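    // Minimal immutable pair, used both for ( url, depth ) and for the
    // printer messages that wrap it with the crawling thread's id.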
    private static class Tuple<T1,T2> {
        Tuple( T1 first,T2 second ) {
            this.first = first;
            this.second = second;
        }
        final T1 first;
        final T2 second;
    }
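    // Downloads the document at url and returns its HTML as a single string,
    // or null if the response is not HTML or the fetch fails.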
    private String fetch( URL url ) throws IOException {
        HttpURLConnection urlc = ( HttpURLConnection ) url.openConnection( );
        urlc.setRequestMethod( "GET" );
        urlc.setConnectTimeout( 5000 );
        urlc.setReadTimeout( 5000 );
        // "Accept" is the correct request header here; "Content-type"
        // describes a request body, which a GET does not have.
        urlc.setRequestProperty( "Accept","text/html" );
        urlc.connect( );
        try {
            String contentType = urlc.getContentType( );
            if( contentType != null && contentType.contains( "html" ) ) {
                BufferedReader reader = new BufferedReader( new InputStreamReader( urlc.getInputStream( ) ) );
                try {
                    StringBuilder sb = new StringBuilder( );
                    String inputLine;
                    while( ( inputLine = reader.readLine( ) ) != null )
                        sb.append( inputLine );
                    return sb.toString( );
                }
                finally {
                    reader.close( );
                }
            }
            return null;
        }
        catch( Exception e ) {
            System.out.println( "Fetch of document " + url + " failed" );
            return null;
        }
        finally {
            urlc.disconnect( );
        }
    }
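    // Scans the HTML for href attributes and returns the absolute http(s)
    // links found; relative and malformed links are skipped.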
    private Collection<URL> extractLinks( String html ) {
        if( html == null ) return Collections.emptyList( );
        Collection<URL> urls = new ArrayList<URL>( );
        Matcher matcher = pattern1.matcher( html );
        while( matcher.find( ) ) {
            String link = matcher.group( 2 );
            if( ! link.startsWith( "http" ) )
                continue;
            try {
                urls.add( new URL( link ) );
            }
            catch( MalformedURLException e ) {
                // One bad link should not abort the rest of the page.
            }
        }
        return urls;
    }
    /**
     * For more information see http://gee.cs.oswego.edu/dl/papers/fj.pdf
     * and http://www.coopsoft.com/ar/ForkJoinArticle.html
     */
    @SuppressWarnings("serial")
    private class CrawlerTask extends RecursiveTask<Collection<URL>> {
        private final Tuple<URL,Integer> msg;
        CrawlerTask( Tuple<URL,Integer> msg ) {
            this.msg = msg;
        }
        @Override
        protected Collection<URL> compute( ) {
            Collection<URL> urls = new ArrayList<URL>( );
            try {
                // Report and collect only if not the root of the search.
                if( msg.second > 0 ) {
                    printerQ.put( new Tuple<Long,Tuple<URL,Integer>>( Thread.currentThread( ).getId( ),msg ) );
                    urls.add( msg.first );
                }
                Collection<CrawlerTask> forks = new ArrayList<CrawlerTask>( );
                String html = fetch( msg.first );
                for( URL url : extractLinks( html ) ) {
                    // putIfAbsent claims the URL atomically, so each URL is
                    // crawled by exactly one task.
                    if( sitesVisited.putIfAbsent( url.toExternalForm( ),url ) == null && msg.second < MAX_DEPTH ) {
                        CrawlerTask task = new CrawlerTask( new Tuple<URL,Integer>( url,msg.second + 1 ) );
                        forks.add( task );
                        task.fork( );
                    }
                }
                // Fork all children first, then join, so siblings run in parallel.
                for( CrawlerTask task : forks )
                    urls.addAll( task.join( ) );
                return urls;
            }
            catch( InterruptedException e ) {
                Thread.currentThread( ).interrupt( );
                return Collections.emptyList( );
            }
            catch( IOException e ) {
                return Collections.emptyList( );
            }
        }
    }
    public static void main( String ... args ) throws Exception {
        //new Crawler( ).crawl( "http://localhost:9080/Crawler_Files/test.html" );
        Collection<URL> urls = new Crawler( ).crawl( "http://news.google.com/" );
        System.out.println( "Crawling ended with " + urls.size( ) + " URLs" );
    }
    public Collection<URL> crawl( String strURL ) throws IOException, InterruptedException {
        ExecutorService es = Executors.newFixedThreadPool( 1 );
        es.submit(
            new Callable<Void>( ) {
                @Override
                public Void call( ) throws Exception {
                    // Poll with a timeout instead of take( ): a bare take( )
                    // would block forever once the crawl is done and the
                    // queue is empty, and the printer thread would never exit.
                    while( ! done || ! printerQ.isEmpty( ) ) {
                        Tuple<Long,Tuple<URL,Integer>> msg = printerQ.poll( 100,TimeUnit.MILLISECONDS );
                        if( msg == null )
                            continue;
                        System.out.println(
                            String.format( "URL %s crawled by agent %d. Depth is %d",
                                msg.second.first,msg.first,msg.second.second ) );
                    }
                    return null;
                }
            } );
        URL rootURL = new URL( strURL );
        // Mark the root as visited so links back to it are not re-crawled.
        sitesVisited.putIfAbsent( rootURL.toExternalForm( ),rootURL );
        CrawlerTask task = new CrawlerTask( new Tuple<URL,Integer>( rootURL,0 ) );
        pool.submit( task );
        Collection<URL> urls = task.join( );
        done = true;
        es.shutdown( );
        return urls;
    }
}
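For reference, here is a minimal sketch of driving the crawler from a separate class, assuming the Crawler source above is compiled on the same classpath; the seed URL is just a placeholder.

import java.net.URL;
import java.util.Collection;

public class CrawlerDemo {
    public static void main( String ... args ) throws Exception {
        // The seed is an arbitrary example; any absolute http(s) URL works.
        Collection<URL> urls = new Crawler( ).crawl( "http://example.com/" );
        System.out.println( urls.size( ) + " URLs found:" );
        for( URL url : urls )
            System.out.println( url );
    }
}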