Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package Prak9;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
- public class Crawl {
- String startURL;
- Writer out;
- // private Set<String> visitedUrls = new HashSet<>();
- private List<String> foundUrls = new ArrayList<String>();
- private List<String> visitedUrls = new ArrayList<String>();
- public Crawl(String startURL, Writer out) {
- this.startURL = startURL;
- this.out = out;
- }
- public void start() throws Exception {
- // URL url = new URL(this.startURL ) ;
- // URLConnection con = url.openConnection();
- // InputStream in = con.getInputStream();
- //
- // Reader reader = new InputStreamReader(in);
- // System.out.println("1");
- boolean breaker = true;
- int n = 0;
- int o = 0;
- while (breaker) {
- foundUrls.add(startURL);
- try {
- URI uri = new URI(foundUrls.get(n));
- System.out.println("Bin im try " + n);
- InputStream binIn = uri.toURL().openStream();
- Reader in = new InputStreamReader(binIn, "iso-8859-1");
- String stringContent = "";
- StringBuilder sb = new StringBuilder();
- int i = in.read();
- while (i >= 0) {
- sb.append((char) i);
- i = in.read();
- }
- in.close();
- System.out.println("2");
- stringContent = sb.toString();
- linkFilter(stringContent);
- breaker = false;
- } catch (Exception e) {
- System.out.println("Hat nicht funktioniert");
- n = n + 1;
- }
- }
- }
- private void linkFilter(String content) throws Exception {
- String regex = "\\b(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]";
- Pattern pattern = Pattern.compile(regex);
- Matcher matcher = pattern.matcher(content);
- int y = 0;
- while (matcher.find()) {
- // System.out.println(matcher.group(0));
- if (!this.foundUrls.contains(matcher.group(0)) && !this.visitedUrls.contains(matcher.group(0))) {
- // System.out.println(this.foundUrls .contains(matcher.group(0)));
- this.foundUrls.add(matcher.group(0));
- y++;
- }
- }
- for (int x = 0; x <= this.foundUrls.size() - 1; x++) {
- System.out.println(this.foundUrls.get(x));
- }
- visitedUrls.add(startURL);
- foundUrls.remove(startURL);
- startURL = foundUrls.get(0);
- // visitedUrls.add(foundUrls.get(0));
- // for(int p =0;p<visitedUrls.size()-1;p++)
- // {System.out.println("#######################"+visitedUrls.get(p));}
- start();
- }
- public static void main(String[] args) throws Exception {
- Writer writer = new StringWriter();
- Crawl crawl = new Crawl("https://stackoverflow.com", writer);
- crawl.start();
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement