// SimpleWebCrawler

package crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Field;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SimpleWebCrawler implements WebCrawler {

    private Downloader downloader;
    private Map<String, Page> pages = new HashMap<>();
    private Map<String, Image> images = new HashMap<>();
    private List<String> hasOriginal = new ArrayList<>();

    public SimpleWebCrawler(Downloader downloader) {
        this.downloader = new URLDownloader(downloader);
    }

    @Override
    public Page crawl(String url, int depth) throws IOException {
        depth--;
        url = decodeEntities(url);
        InputStream inputStream;
        try {
            inputStream = this.downloader.download(url);
        } catch (IOException ioe) {
            System.out.println(ioe);
            return getPage(url);
        }

        Scanner s = new Scanner(inputStream).useDelimiter("\\A");
        String content = s.hasNext() ? s.next() : "";

        String title = getTitleFromHTML(content);

        Page page = getPage(url, title);

        List<String> images = getImagesFromHTML(content);
        for (String image : images) {
            String[] res = doUrl(url, image);
            InputStream is = new URL(res[0]).openStream();

            String[] parts = image.split("/");
            String last_part = parts[parts.length-1];

            FileOutputStream fos = new FileOutputStream(last_part);

            byte[] buffer = new byte[8 * 1024];
            int bytesRead;
            while ((bytesRead = is.read(buffer)) != -1) {
                fos.write(buffer, 0, bytesRead);
            }
            is.close();
            fos.close();

            page.addImage(getImage(res[0], last_part));
        }

        List<String> links = getLinksFromHTML(content);

        for (String origLink : links) {
            int pos = content.indexOf(origLink);
            if (content.charAt(pos-1) == ' ' &&
                    content.charAt(pos-2) == '-' &&
                    content.charAt(pos-3) == '-' &&
                    content.charAt(pos-4) == '!' &&
                    content.charAt(pos-5) == '<') {
                continue;
            }

            String link = getUrl(origLink);
            String[] res = doUrl(url, link);
            link = res[0];
            String original = res[1];

            try {
                String urlTitle = getUrlTitle(origLink);
                original = original+urlTitle;
            } catch (IllegalStateException e) {
            }

            try {
                String urlTarget = getUrlTarget(origLink);
                original = original+urlTarget;
            } catch (IllegalStateException e) {
            }

            if (depth > 0) {
                if (!original.equals(link))
                    page.addLink(this.crawl(link, depth));
                else {
                    this.addLink(page, this.crawl(link, depth));
                }
            } else {
                if (!original.equals(link))
                    page.addLink(getPage(link));
                else {
                    this.addLink(page, getPage(link));
                }
            }
        }
        return page;
    }

    private String[] doUrl(String url, String link) {
        String original = link;
        link = link.replace(" ", "");
        String[] parts = url.split("/");
        String last_part = parts[parts.length-1];

        if (link.startsWith("#")) {
            link = url;
            original = link+"#";
        } else if (link.contains("#")) {
            link = link.split("#")[0];
            original = link+"#";
        }

        if (!link.startsWith("http") && !link.startsWith("https")) { // absolute link
            String url_part;
            if (url.contains(".html") || url.contains(".zip") || url.contains(".xhtml")) {
                url_part = url.replace(last_part, "");
            } else {
                 url_part = url.replace(last_part + "/", "");
            }
            if (link.startsWith("../")) {
                int i = (url.contains(".html") || url.contains(".zip") || url.contains(".xhtml")) ? 2 : 1;
                while (link.indexOf("../") == 0) {
                    link = link.substring(3, link.length());
                    url_part = url_part.replace(parts[parts.length - i] + "/", "");
                    i++;
                }
                link = url_part + link;
            } else if (link.equals("..")) {
                link = url_part.replace(parts[parts.length - 1] + "/", "");
            } else if (link.startsWith("//")) {
                link = "http:"+link;
            } else if (link.startsWith("/")) {
                link = parts[0]+"//"+parts[2]+link;
            } else {
                if (url.contains(".html") || url.contains(".zip") || url.contains(".xhtml")) {
                    link = url_part + link;
                    original = url_part + original;
                } else {
                    link = url + link;
                    original = url + original;
                }
            }
        }

        return new String[] {link, original};
    }

    private String getTitleFromHTML(String html) {
        Pattern p = Pattern.compile("<title>(.*?)</title>");
        Matcher m = p.matcher(html);
        m.find();
        return m.group(1);
    }

    private List<String> getLinksFromHTML(String html) {
        List<String> links = new ArrayList<>();
        Pattern p = Pattern.compile("<a.*?href.*?=.*?\"(.*?)\".*?>", Pattern.DOTALL);
        Matcher m = p.matcher(html);
        while(m.find()) {
            links.add(m.group(0));
        }
        return links;
    }

    private String getUrl(String url) {
        Pattern p = Pattern.compile("<a.*?href.*?=.*?\"(.*?)\".*?>", Pattern.DOTALL);
        Matcher m = p.matcher(url);
        m.find();
        return m.group(1);
    }

    private String getUrlTitle(String url) throws IllegalStateException {
        Pattern p = Pattern.compile("title=\"(.*?)\"", Pattern.DOTALL);
        Matcher m = p.matcher(url);
        m.find();
        return m.group(1);
    }

    private String getUrlTarget(String url) throws IllegalStateException {
        Pattern p = Pattern.compile("target=\"(.*?)\"", Pattern.DOTALL);
        Matcher m = p.matcher(url);
        m.find();
        return m.group(1);
    }

    private List<String> getImagesFromHTML(String html) {
        List<String> images = new ArrayList<>();
        Pattern p = Pattern.compile("<img.*?src.*?=.*?\"(.*?)\".*?>", Pattern.DOTALL);
        Matcher m = p.matcher(html);
        while(m.find()) {
            images.add(m.group(1));
        }
        return images;
    }

    private Page getPage(String url, String title) {
        url = decodeEntities(url.replace(" ", ""));
        title = decodeEntities(title);
        if (this.pages.containsKey(url)) {
            Page p = this.pages.get(url);
            if (p.getTitle().equals("") && !title.equals("")) {
                try {
                    Field f = p.getClass().getDeclaredField("title");
                    f.setAccessible(true);
                    f.set(p, title);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            return p;
        } else {
            this.pages.put(url, new Page(url, title));
            return this.pages.get(url);
        }
    }

    private Page getPage(String url) {
        return getPage(url, "");
    }

    private Image getImage(String url, String file) {
        url = decodeEntities(url);
        if (this.images.containsKey(url)) {
            return this.images.get(url);
        } else {
            this.images.put(url, new Image(url, file));
            return this.images.get(url);
        }
    }

    private void addLink(Page p1, Page p2) {
        if (!hasOriginal.contains(p2.getUrl())) {
            hasOriginal.add(p2.getUrl());
            p1.addLink(p2);
            return;
        }
        if(!p1.getLinks().contains(p2)) {
            p1.addLink(p2);
        }
        if(p2.getUrl().endsWith(".zip") || p2.getUrl().endsWith(".rar") || p2.getUrl().endsWith(".pdf")) {
            p1.addLink(p2);
        }
    }

    private String decodeEntities(String html) {
        html = html.replace("&lt;", "<");
        html = html.replace("&gt;", ">");
        html = html.replace("&amp;", "&");
        html = html.replace("&mdash;", "—");
        html = html.replace("&nbsp;", " ");
        html = html.replace("\r\n", "");
        return html;
    }
}