Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package crawler;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.lang.reflect.Field;
- import java.net.URL;
- import java.util.*;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- public class SimpleWebCrawler implements WebCrawler {
- private Downloader downloader;
- private Map<String, Page> pages = new HashMap<>();
- private Map<String, Image> images = new HashMap<>();
- private List<String> hasOriginal = new ArrayList<>();
- public SimpleWebCrawler(Downloader downloader) {
- this.downloader = new URLDownloader(downloader);
- }
- @Override
- public Page crawl(String url, int depth) throws IOException {
- depth--;
- url = decodeEntities(url);
- InputStream inputStream;
- try {
- inputStream = this.downloader.download(url);
- } catch (IOException ioe) {
- System.out.println(ioe);
- return getPage(url);
- }
- Scanner s = new Scanner(inputStream).useDelimiter("\\A");
- String content = s.hasNext() ? s.next() : "";
- String title = getTitleFromHTML(content);
- Page page = getPage(url, title);
- List<String> images = getImagesFromHTML(content);
- for (String image : images) {
- String[] res = doUrl(url, image);
- InputStream is = new URL(res[0]).openStream();
- String[] parts = image.split("/");
- String last_part = parts[parts.length-1];
- FileOutputStream fos = new FileOutputStream(last_part);
- byte[] buffer = new byte[8 * 1024];
- int bytesRead;
- while ((bytesRead = is.read(buffer)) != -1) {
- fos.write(buffer, 0, bytesRead);
- }
- is.close();
- fos.close();
- page.addImage(getImage(res[0], last_part));
- }
- List<String> links = getLinksFromHTML(content);
- for (String origLink : links) {
- int pos = content.indexOf(origLink);
- if (content.charAt(pos-1) == ' ' &&
- content.charAt(pos-2) == '-' &&
- content.charAt(pos-3) == '-' &&
- content.charAt(pos-4) == '!' &&
- content.charAt(pos-5) == '<') {
- continue;
- }
- String link = getUrl(origLink);
- String[] res = doUrl(url, link);
- link = res[0];
- String original = res[1];
- try {
- String urlTitle = getUrlTitle(origLink);
- original = original+urlTitle;
- } catch (IllegalStateException e) {
- }
- try {
- String urlTarget = getUrlTarget(origLink);
- original = original+urlTarget;
- } catch (IllegalStateException e) {
- }
- if (depth > 0) {
- if (!original.equals(link))
- page.addLink(this.crawl(link, depth));
- else {
- this.addLink(page, this.crawl(link, depth));
- }
- } else {
- if (!original.equals(link))
- page.addLink(getPage(link));
- else {
- this.addLink(page, getPage(link));
- }
- }
- }
- return page;
- }
- private String[] doUrl(String url, String link) {
- String original = link;
- link = link.replace(" ", "");
- String[] parts = url.split("/");
- String last_part = parts[parts.length-1];
- if (link.startsWith("#")) {
- link = url;
- original = link+"#";
- } else if (link.contains("#")) {
- link = link.split("#")[0];
- original = link+"#";
- }
- if (!link.startsWith("http") && !link.startsWith("https")) { // absolute link
- String url_part;
- if (url.contains(".html") || url.contains(".zip") || url.contains(".xhtml")) {
- url_part = url.replace(last_part, "");
- } else {
- url_part = url.replace(last_part + "/", "");
- }
- if (link.startsWith("../")) {
- int i = (url.contains(".html") || url.contains(".zip") || url.contains(".xhtml")) ? 2 : 1;
- while (link.indexOf("../") == 0) {
- link = link.substring(3, link.length());
- url_part = url_part.replace(parts[parts.length - i] + "/", "");
- i++;
- }
- link = url_part + link;
- } else if (link.equals("..")) {
- link = url_part.replace(parts[parts.length - 1] + "/", "");
- } else if (link.startsWith("//")) {
- link = "http:"+link;
- } else if (link.startsWith("/")) {
- link = parts[0]+"//"+parts[2]+link;
- } else {
- if (url.contains(".html") || url.contains(".zip") || url.contains(".xhtml")) {
- link = url_part + link;
- original = url_part + original;
- } else {
- link = url + link;
- original = url + original;
- }
- }
- }
- return new String[] {link, original};
- }
- private String getTitleFromHTML(String html) {
- Pattern p = Pattern.compile("<title>(.*?)</title>");
- Matcher m = p.matcher(html);
- m.find();
- return m.group(1);
- }
- private List<String> getLinksFromHTML(String html) {
- List<String> links = new ArrayList<>();
- Pattern p = Pattern.compile("<a.*?href.*?=.*?\"(.*?)\".*?>", Pattern.DOTALL);
- Matcher m = p.matcher(html);
- while(m.find()) {
- links.add(m.group(0));
- }
- return links;
- }
- private String getUrl(String url) {
- Pattern p = Pattern.compile("<a.*?href.*?=.*?\"(.*?)\".*?>", Pattern.DOTALL);
- Matcher m = p.matcher(url);
- m.find();
- return m.group(1);
- }
- private String getUrlTitle(String url) throws IllegalStateException {
- Pattern p = Pattern.compile("title=\"(.*?)\"", Pattern.DOTALL);
- Matcher m = p.matcher(url);
- m.find();
- return m.group(1);
- }
- private String getUrlTarget(String url) throws IllegalStateException {
- Pattern p = Pattern.compile("target=\"(.*?)\"", Pattern.DOTALL);
- Matcher m = p.matcher(url);
- m.find();
- return m.group(1);
- }
- private List<String> getImagesFromHTML(String html) {
- List<String> images = new ArrayList<>();
- Pattern p = Pattern.compile("<img.*?src.*?=.*?\"(.*?)\".*?>", Pattern.DOTALL);
- Matcher m = p.matcher(html);
- while(m.find()) {
- images.add(m.group(1));
- }
- return images;
- }
- private Page getPage(String url, String title) {
- url = decodeEntities(url.replace(" ", ""));
- title = decodeEntities(title);
- if (this.pages.containsKey(url)) {
- Page p = this.pages.get(url);
- if (p.getTitle().equals("") && !title.equals("")) {
- try {
- Field f = p.getClass().getDeclaredField("title");
- f.setAccessible(true);
- f.set(p, title);
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- return p;
- } else {
- this.pages.put(url, new Page(url, title));
- return this.pages.get(url);
- }
- }
- private Page getPage(String url) {
- return getPage(url, "");
- }
- private Image getImage(String url, String file) {
- url = decodeEntities(url);
- if (this.images.containsKey(url)) {
- return this.images.get(url);
- } else {
- this.images.put(url, new Image(url, file));
- return this.images.get(url);
- }
- }
- private void addLink(Page p1, Page p2) {
- if (!hasOriginal.contains(p2.getUrl())) {
- hasOriginal.add(p2.getUrl());
- p1.addLink(p2);
- return;
- }
- if(!p1.getLinks().contains(p2)) {
- p1.addLink(p2);
- }
- if(p2.getUrl().endsWith(".zip") || p2.getUrl().endsWith(".rar") || p2.getUrl().endsWith(".pdf")) {
- p1.addLink(p2);
- }
- }
- private String decodeEntities(String html) {
- html = html.replace("<", "<");
- html = html.replace(">", ">");
- html = html.replace("&", "&");
- html = html.replace("—", "—");
- html = html.replace(" ", " ");
- html = html.replace("\r\n", "");
- return html;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment