Advertisement
Guest User

Untitled

a guest
Jan 21st, 2018
53
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 2.87 KB | None | 0 0
  1. package Prak9;
  2.  
  3. import java.io.FileWriter;
  4. import java.io.InputStream;
  5. import java.io.InputStreamReader;
  6. import java.io.Reader;
  7. import java.io.StringWriter;
  8. import java.io.Writer;
  9. import java.net.URI;
  10. import java.net.URL;
  11. import java.net.URLConnection;
  12. import java.util.ArrayList;
  13. import java.util.HashSet;
  14. import java.util.List;
  15. import java.util.Set;
  16. import java.util.regex.Matcher;
  17. import java.util.regex.Pattern;
  18.  
  /**
   * Minimal breadth-first link crawler.
   *
   * <p>Starting from {@link #startURL}, every page is downloaded once, scanned for absolute
   * {@code http}, {@code https}, {@code ftp} and {@code file} URLs, and each URL that has not been
   * seen before is appended to the work list. Progress is reported on {@code System.out}.
   *
   * <p>The crawl is iterative: it neither recurses per page nor retries a failing URL, so it
   * terminates as soon as every discovered URL has been attempted once. There is no page limit;
   * on the open web the work list can grow without bound.
   *
   * <p>The {@link #out} writer is stored by the constructor but nothing in this class writes to it.
   */
  public class Crawl {
      /** Matches absolute http(s), ftp and file URLs embedded in page text. Compiled once. */
      private static final Pattern URL_PATTERN = Pattern.compile(
              "\\b(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]");

      /** Charset used to decode every downloaded page. */
      private static final String PAGE_CHARSET = "iso-8859-1";

      /** URL the crawl starts from. */
      String startURL;
      /** Writer handed in by the caller; kept for callers, not used by the crawl itself. */
      Writer out;

      /** Every URL discovered so far, in discovery order; doubles as the work queue. */
      private final List<String> foundUrls = new ArrayList<String>();
      /** Same contents as {@link #foundUrls}, kept for constant-time duplicate checks. */
      private final Set<String> knownUrls = new HashSet<String>();

      /**
       * Creates a crawler.
       *
       * @param startURL absolute URL of the first page to fetch
       * @param out      writer associated with this crawl (stored, currently not written to)
       */
      public Crawl(String startURL, Writer out) {
          this.startURL = startURL;
          this.out = out;
      }

      /**
       * Runs the crawl from {@link #startURL} until no unvisited URL is left.
       *
       * <p>Each call starts from scratch. For every page the method prints a trace line with the
       * page's index in the discovery list, and after a successful download the current URL
       * followed by all URLs still waiting to be fetched. A page that cannot be fetched is reported
       * and skipped; it does not abort the crawl and is not retried.
       *
       * @throws Exception declared for compatibility with existing callers; per-page failures are
       *                   handled internally
       */
      public void start() throws Exception {
          foundUrls.clear();
          knownUrls.clear();
          remember(startURL);

          // foundUrls grows while we iterate, so walk it by index instead of with an iterator.
          for (int index = 0; index < foundUrls.size(); index++) {
              String url = foundUrls.get(index);
              System.out.println("Bin im try  " + index);
              try {
                  String content = fetch(url);
                  System.out.println("2");
                  linkFilter(content);
                  for (int pending = index; pending < foundUrls.size(); pending++) {
                      System.out.println(foundUrls.get(pending));
                  }
              } catch (Exception e) {
                  // Best effort: one unreachable or malformed URL must not stop the whole crawl.
                  System.out.println("Hat nicht funktioniert");
                  System.err.println(url + ": " + e);
              }
          }
      }

      /**
       * Downloads the resource behind {@code url} and decodes it with {@link #PAGE_CHARSET}.
       *
       * @param url absolute URL to download
       * @return the decoded content
       * @throws java.io.IOException         if the URL cannot be opened or read
       * @throws java.net.URISyntaxException if {@code url} is not a syntactically valid URI
       */
      private static String fetch(String url) throws java.io.IOException, java.net.URISyntaxException {
          StringBuilder page = new StringBuilder();
          // try-with-resources closes stream and reader even when reading fails half-way.
          try (InputStream binIn = new URI(url).toURL().openStream();
               Reader in = new InputStreamReader(binIn, PAGE_CHARSET)) {
              char[] buffer = new char[8192];
              int read;
              while ((read = in.read(buffer)) >= 0) {
                  page.append(buffer, 0, read);
              }
          }
          return page.toString();
      }

      /**
       * Extracts every URL matched by {@link #URL_PATTERN} from {@code content} and queues the ones
       * that have not been seen before.
       *
       * @param content text of a downloaded page
       */
      private void linkFilter(String content) {
          Matcher matcher = URL_PATTERN.matcher(content);
          while (matcher.find()) {
              remember(matcher.group());
          }
      }

      /** Appends {@code url} to the work list unless it has been discovered already. */
      private void remember(String url) {
          if (knownUrls.add(url)) {
              foundUrls.add(url);
          }
      }

      /**
       * Crawls starting at {@code https://stackoverflow.com}.
       *
       * @param args ignored
       * @throws Exception if the crawl itself fails
       */
      public static void main(String[] args) throws Exception {
          Writer writer = new StringWriter();
          Crawl crawl = new Crawl("https://stackoverflow.com", writer);
          crawl.start();
      }

  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement