Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.net.*;
- import java.io.*;
- import java.util.regex.*;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.List;
/**
 * Minimal command-line link finder: downloads a single page and prints every
 * absolute http/https link (scheme and host only) and every relative
 * "../index.html"-style link found in its text.
 *
 * Usage: {@code java WebCrawler [url]}. When no argument is given the URL is
 * read from standard input.
 *
 * Exit codes: 1 = could not read the URL from stdin, 2 = malformed URL,
 * 3 = unknown host, 4 = any other I/O failure.
 */
public class WebCrawler {

    /** Absolute link: http or https scheme followed by a dotted host name (hyphens allowed). */
    private static final Pattern ABSOLUTE_LINK =
        Pattern.compile( "https?://([\\w-]+\\.)*[\\w-]+" );

    /** Relative link that climbs one or more directories to an index.html (dots are literal). */
    private static final Pattern PARENT_INDEX_LINK =
        Pattern.compile( "(\\.\\./)+index\\.html" );

    /**
     * Program entry point.
     *
     * @param argv optional single element: the URL (including protocol) to scan
     */
    public static void main( String[] argv )
    {
        String urlName = null;
        if( argv.length < 1 )
        {
            System.out.print( "Enter URL (w/protocol) => " );
            BufferedReader cin = new BufferedReader( new InputStreamReader( System.in ));
            try {
                urlName = cin.readLine();
            } catch (IOException e) {
                System.err.println( "IO Error. Exiting." );
                System.exit( 1 );
            }
        }
        else {
            urlName = argv[0];
        }//end get URL

        try {
            URL url = new URL( urlName );
            String page;
            InputStream in = url.openStream();
            try {
                page = slurp( in );
            } finally {
                // Always release the connection, even when reading fails.
                in.close();
            }
            printMatches( ABSOLUTE_LINK, page );
            printMatches( PARENT_INDEX_LINK, page );
        } catch( MalformedURLException e ) {
            System.err.println( "Poorly formed URL:" );
            System.err.println( e );
            System.exit( 2 );
        } catch( UnknownHostException e ) {
            System.err.println( "Unknown Host:" );
            System.err.println( e );
            System.exit( 3 );
        } catch( IOException e ) {
            System.err.println( "IO Error:" );
            System.err.println( e );
            System.exit( 4 );
        } // try
    } //end main

    /**
     * Prints one "This link is: ..." line to stdout for every match of
     * {@code pattern} in {@code text}, in order of appearance.
     */
    private static void printMatches( Pattern pattern, String text )
    {
        Matcher matcher = pattern.matcher( text );
        while( matcher.find() )
        {
            // group() with no argument is the whole match (group 0).
            System.out.println( "This link is: " + matcher.group() );
        }
    }

    /**
     * Reads the stream to its end and returns the content as text.
     *
     * The bytes are collected first and decoded once, so a multi-byte
     * character that straddles a 4096-byte read boundary is not corrupted.
     * The content is decoded as UTF-8. The stream is not closed here; that
     * is left to the caller.
     *
     * @param in stream to drain
     * @return the full stream content decoded as UTF-8 (empty string for an empty stream)
     * @throws IOException if reading from the stream fails
     */
    public static String slurp (InputStream in) throws IOException
    {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] chunk = new byte[4096];
        int n;
        while( (n = in.read( chunk )) != -1 )
        {
            collected.write( chunk, 0, n );
        }
        return collected.toString( "UTF-8" );
    } // slurp
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement