Advertisement
Guest User

Untitled

a guest
May 27th, 2017
58
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 2.63 KB | None | 0 0
  1. import java.net.*;
  2. import java.io.*;
  3. import java.util.regex.*;
  4. import java.util.ArrayList;
  5. import java.util.Arrays;
  6. import java.util.List;
  7.  
  8. public class WebCrawler {
  9.  
  10.     public static void main( String[] argv )
  11.     {
  12.         //regex setup
  13.         ArrayList<String> inputs = new ArrayList<String>();
  14.         String re = "http(s)*://(\\w+\\.)*(\\w+)" ;
  15.         Pattern pat = Pattern.compile( re );
  16.         String re2 = "(../)+index.html" ;
  17.         Pattern pat2 = Pattern.compile( re2 );
  18.  
  19.  
  20.         String urlName = null;
  21.         if( argv.length < 1 )
  22.             {
  23.                 System.out.print( "Enter URL (w/protocol) => " );
  24.                 BufferedReader cin = new BufferedReader( new InputStreamReader( System.in ));
  25.             try {
  26.                 urlName = cin.readLine();
  27.             } catch (IOException e) {
  28.                 System.err.println( "IO Error.  Exiting." );
  29.                 System.exit( 1 );
  30.             }
  31.         }
  32.         else {
  33.             urlName = argv[0];
  34.         }//end get URL
  35.  
  36.         try {
  37.  
  38.         URL url = new URL( urlName );
  39.         String file = slurp( url.openStream() );
  40.         //System.out.println(file);
  41.  
  42.         List<String> inputlist = Arrays.asList(file);
  43.  
  44.         for( String s : inputlist )
  45.         {
  46.             Matcher matcher = pat.matcher( s );
  47.  
  48.             //System.out.println( "\nregex: " + re );
  49.             //System.out.println( "input: " + s );
  50.             //System.out.println( "----------------------------------" );
  51.  
  52.             while( matcher.find() )
  53.             {
  54.                 System.out.println( "This link is: " + matcher.group() );
  55.  
  56.                     // Note on groupings:
  57.                     // 0 is the entire string.  Groupings can be pulled out.  They are
  58.                     // simply enumerated, even nested parentheses, by counting the
  59.                     // opening left parenthesis (start at 1).
  60.             }
  61.  
  62.             Matcher matcher2 = pat2.matcher( s );
  63.  
  64.             while( matcher2.find() )
  65.             {
  66.                 System.out.println( "This link is: " + matcher2.group() );
  67.  
  68.                     // Note on groupings:
  69.                     // 0 is the entire string.  Groupings can be pulled out.  They are
  70.                     // simply enumerated, even nested parentheses, by counting the
  71.                     // opening left parenthesis (start at 1).
  72.             }
  73.         }
  74.  
  75.         } catch( MalformedURLException e ) {
  76.             System.err.println( "Poorly formed URL:" );
  77.             System.out.println( e );
  78.             System.exit( 2 );
  79.         } catch( UnknownHostException e ) {
  80.             System.err.println( "Unknown Host:" );
  81.             System.out.println( e );
  82.             System.exit( 3 );
  83.         } catch( IOException e ) {
  84.             System.err.println( ":" );
  85.             System.out.println( e );
  86.             System.exit( 4 );
  87.         } // try
  88.     } //end main
  89.  
  90.     public static String slurp (InputStream in) throws IOException
  91.     {
  92.         int n;
  93.         StringBuffer out = new StringBuffer();
  94.         byte[] b = new byte[4096];
  95.         n = in.read( b );
  96.         while( n != -1 )
  97.         {
  98.             out.append(new String(b, 0, n));
  99.             n = in.read( b );
  100.         }
  101.         return out.toString();
  102.     }   // slurp
  103.  
  104.  
  105. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement