Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import java.io.*;
import java.net.URL;
- public class WebCrawler {
- Pair[] Visit;
- private int count; // number of elements used in the Visit array
- Queue<String> WorkList;
- // N is the maximum number of websites to visit before stopping
- public WebCrawler(int N) {
- Visit = new Pair[N];
- //System.out.println(Visit[0]);
- count = 0;
- WorkList = new Queue<String>();
- }
- public String getPage( String url ) {
- try {
- BufferedReader br = new BufferedReader(
- new InputStreamReader(new URL(url).openStream()));
- StringBuffer sb = new StringBuffer();
- for( ; ; ) {
- String line = br.readLine();
- if (line == null) break;
- sb.append(line);
- sb.append("\n");
- }
- return sb.toString();
- } catch( Exception e ) {
- return null;
- }
- }
- // Beginning with the starting URL, visit websites adding them to the
- // Visit array if not there already.
- // The visit strategy described in Assignment 4 MUST be followed,
- // otherwise there is danger of non-terminating loops.
- public void Crawl( String startingURL ) {
- Pair n = new Pair(startingURL);
- WorkList.enqueue(startingURL);
- count = 0;
- Visit[count] = n;
- while (WorkList.isEmpty() != true) {
- // Begin crawling with the startingURL
- //String s = WorkList.dequeue();
- }
- // Prevent program from revisiting previously visited websites (preventing infinite loops)
- // "Crawl" through the returned HTML from previous getPage function and find all valid URLs
- // Add to the Visit array if not there already
- }
- // Outputs the web addresses (URLs) and counts for the m most popular
- // websites in the Visit array.
- public void PrintTopSites( int m ) {
- // fix me
- }
- }
Add Comment
Please, Sign In to add comment