Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.math.BigInteger;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Command-line crawler that walks the pages of a single host breadth-first,
 * starting from a URL given as the first program argument (or typed in when no
 * argument is supplied), and stores the HTML pages it receives in a directory
 * named after the host.
 *
 * <p>Pages are fetched with hand-written HTTP/1.0 GET requests over a plain
 * socket on port 80. All crawl state lives in static fields, so the class
 * supports a single crawl per JVM run.
 */
public class TrackMe {

    private static final boolean AUTO_FLUSH_OFF = false;

    /** Anchor tag whose first attribute is {@code href}; group 1 is the link target. */
    private static final String REGEX_TO_MATCH_HTML_LINKS = "<a href=\"([^\"]*)\"[^>]*>";

    /** Image tag whose first attribute is {@code src}; group 1 is the image address. */
    private static final String REGEX_TO_MATCH_IMG_TAGS = "<img src=\"([^\"]*)\"[^>]*>";

    // Compiled once: the same two patterns are applied to every downloaded page.
    private static final Pattern HTML_LINK_PATTERN = Pattern.compile(REGEX_TO_MATCH_HTML_LINKS);
    private static final Pattern IMG_TAG_PATTERN = Pattern.compile(REGEX_TO_MATCH_IMG_TAGS);

    private static final int STATUS_CODE_OK = 200;
    private static final int STATUS_CODE_NOT_FOUND = 404;

    /** Depth limit used by {@link #main(String[])}; the start page has depth 0. */
    private static final int DEFAULT_MAX_DEPTH = 4;

    private static final int HTTP_PORT = 80;

    private static final List<String> HTML_FILE_EXTENSIONS = Arrays.asList("html", "htm");
    private static final List<String> IMAGE_FILE_EXTENSIONS = Arrays.asList("gif", "jpeg");

    /** URL of the page currently being processed; relative links are resolved against it. */
    private static String currentURL;
    /** Host of the start URL; only links to this host are followed. */
    private static String host;
    private static String inputAddress;
    private static URL inputURL;
    /** Links found on pages at this depth are no longer enqueued. */
    private static int maxDepth = DEFAULT_MAX_DEPTH;

    private static Queue<String> linksToVisit = new LinkedList<String>();
    private static List<String> visitedLinks = new ArrayList<String>();
    /** Depth of every URL that has ever been enqueued, keyed by absolute URL. */
    private static Map<String, Integer> addressToHeight = new HashMap<String, Integer>();

    /** Shared reader for console input, created on first use. */
    private static Scanner consoleInput;

    /**
     * Program entry point: creates the output directory for the start URL's
     * host and crawls that host down to {@link #DEFAULT_MAX_DEPTH} levels.
     *
     * @param args optional; {@code args[0]} is the start URL
     * @throws IOException if a page cannot be fetched or a file cannot be written
     */
    public static void main(String[] args) throws IOException {
        File directory = createDirectory(args);
        breadthFirstSearch(DEFAULT_MAX_DEPTH, directory);
    }

    /**
     * Crawls breadth-first from the start address and finally prints every
     * visited address (addresses that answered 404 are left out).
     *
     * @param levels    maximum link depth to follow; the start page is depth 0
     * @param directory directory that receives the downloaded files
     */
    private static void breadthFirstSearch(int levels, File directory) throws IOException {
        maxDepth = levels;
        readyVariablesForBFS();
        while (!linksToVisit.isEmpty()) {
            String pageURLString = linksToVisit.poll();
            currentURL = pageURLString;
            visitedLinks.add(pageURLString);
            enqueueLinksFromPageOnURL(new URL(pageURLString), pageURLString, directory);
        }
        for (String visitedLink : visitedLinks) {
            System.out.println(visitedLink);
        }
    }

    /**
     * Fetches one page. On a 200 response the page is stored and its links are
     * enqueued; on a 404 response the address is removed from the visited list.
     * Any other response leaves the address in the visited list untouched.
     */
    private static void enqueueLinksFromPageOnURL(URL urlOfPage, String pageURLString, File directory)
            throws IOException {
        String pageContent = downloadHTMLOfPage(urlOfPage);
        if (doesHTTPResponseHaveStatusCode(pageContent, STATUS_CODE_OK)) {
            downloadFileToDirectory(pageContent, pageURLString, directory, urlOfPage);
            enqueueLinks(extractLinks(pageContent));
        } else if (doesHTTPResponseHaveStatusCode(pageContent, STATUS_CODE_NOT_FOUND)) {
            visitedLinks.remove(pageURLString);
        }
    }

    /**
     * Stores the fetched resource according to the file extension of its
     * address. Resources with any other extension are not stored.
     */
    private static void downloadFileToDirectory(String pageContent, String pageURLString, File directory,
            URL urlOfPage) throws IOException {
        String fileNameWithExtension = getFileNameWithExtension(pageURLString);
        String fileExtension = getFileExtension(fileNameWithExtension);
        if (HTML_FILE_EXTENSIONS.contains(fileExtension)) {
            downloadHTMLFileToDirectory(pageContent, fileNameWithExtension, directory, fileExtension);
        } else if (IMAGE_FILE_EXTENSIONS.contains(fileExtension)) {
            downloadImageFileToDirectory(fileNameWithExtension, directory, urlOfPage);
        }
    }

    /**
     * Placeholder: image resources are recognised but not saved.
     *
     * <p>NOTE(review): an implementation must read the response as raw bytes;
     * the text-based {@link #downloadHTMLOfPage(URL)} path decodes the stream
     * as characters and would corrupt binary data.
     */
    private static void downloadImageFileToDirectory(
            String fileNameWithExtension, File directory, URL urlOfPage) {
        // Intentionally empty.
    }

    /**
     * Writes the body of an HTTP response to a new file in {@code directory}.
     * If the file name is already taken, a numbered variant is tried until an
     * unused name is found.
     */
    private static void downloadHTMLFileToDirectory(String pageContent,
            String fileNameWithExtension, File directory, String fileExtension) throws IOException {
        String candidateName = fileNameWithExtension;
        File htmlFile = new File(directory, candidateName);
        while (!htmlFile.createNewFile()) {
            candidateName = getAnotherFileName(candidateName, fileExtension);
            htmlFile = new File(directory, candidateName);
        }
        // Only the body is stored; the status line and headers are dropped.
        writeContentToFile(extractContentToBeWritten(pageContent), htmlFile);
    }

    /**
     * Returns everything after the first empty line of the response, i.e. the
     * part that follows the HTTP headers. Each line is terminated with
     * {@code "\n"}. Returns an empty string when there is no empty line.
     */
    private static String extractContentToBeWritten(String pageContent) {
        StringBuilder content = new StringBuilder();
        boolean hasReachedContentToBeWritten = false;
        Scanner lines = new Scanner(pageContent);
        try {
            while (lines.hasNextLine()) {
                String line = lines.nextLine();
                if (hasReachedContentToBeWritten) {
                    content.append(line).append('\n');
                } else if (line.isEmpty()) {
                    hasReachedContentToBeWritten = true;
                }
            }
        } finally {
            lines.close();
        }
        return content.toString();
    }

    /**
     * Writes {@code content} to the given file, replacing what it held before.
     * The writer is closed even when writing fails.
     */
    private static void writeContentToFile(String content, File fileToBeWrittenTo) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(fileToBeWrittenTo));
        try {
            writer.write(content);
        } finally {
            writer.close();
        }
    }

    /**
     * Produces the next candidate name for a file whose name is taken:
     * {@code index.html -> index1.html}, {@code index9.html -> index10.html}.
     * A trailing run of ASCII digits in the base name is treated as a counter
     * and incremented; without one, {@code 1} is appended.
     */
    private static String getAnotherFileName(String fileNameWithExtension, String fileExtension) {
        String baseName = getFileNameWithoutExtension(fileNameWithExtension);
        int counterStart = baseName.length();
        while (counterStart > 0 && isAsciiDigit(baseName.charAt(counterStart - 1))) {
            counterStart--;
        }
        if (counterStart == baseName.length()) {
            return baseName + "1." + fileExtension;
        }
        // BigInteger so that arbitrarily long digit runs cannot overflow.
        BigInteger counter = new BigInteger(baseName.substring(counterStart));
        return baseName.substring(0, counterStart) + counter.add(BigInteger.ONE) + "." + fileExtension;
    }

    private static boolean isAsciiDigit(char c) {
        return c >= '0' && c <= '9';
    }

    /** Returns the name up to the last dot, or the whole name if it has no dot. */
    private static String getFileNameWithoutExtension(String fileNameWithExtension) {
        int lastDotIndex = fileNameWithExtension.lastIndexOf('.');
        return lastDotIndex < 0 ? fileNameWithExtension : fileNameWithExtension.substring(0, lastDotIndex);
    }

    /** Returns the text after the last dot, or an empty string if the name has no dot. */
    private static String getFileExtension(String fileNameWithExtension) {
        int lastDotIndex = fileNameWithExtension.lastIndexOf('.');
        return lastDotIndex < 0 ? "" : fileNameWithExtension.substring(lastDotIndex + 1);
    }

    /** Returns the part of the address after its last forward slash. */
    private static String getFileNameWithExtension(String pageURLString) {
        return pageURLString.substring(pageURLString.lastIndexOf('/') + 1);
    }

    /**
     * Tells whether the status line (first line) of a raw HTTP response carries
     * the given status code. The code is read as the second whitespace-separated
     * field, e.g. {@code 200} in {@code "HTTP/1.0 200 OK"}. An empty or
     * malformed response yields {@code false}.
     */
    private static boolean doesHTTPResponseHaveStatusCode(String pageContent, int statusCode) {
        int endOfStatusLine = pageContent.indexOf('\n');
        String statusLine = endOfStatusLine < 0 ? pageContent : pageContent.substring(0, endOfStatusLine);
        String[] fields = statusLine.trim().split("\\s+");
        return fields.length >= 2 && fields[1].equals(Integer.toString(statusCode));
    }

    /** Enqueues every link of the list that qualifies for a visit. */
    private static void enqueueLinks(List<String> linksOnPage) {
        for (String link : linksOnPage) {
            enqueueLink(link);
        }
    }

    /**
     * Resolves one link target found on the current page and enqueues it when
     * it should be visited. Links that cannot be turned into a URL (for
     * instance an unknown scheme) are skipped.
     *
     * @param linkText raw value of an {@code href} or {@code src} attribute
     */
    private static void enqueueLink(String linkText) {
        String linkString = stripFragment(linkText.trim());
        try {
            String absoluteURL = getAbsoluteURLFromRelativeURL(linkString);
            if (shouldLinkBeVisited(absoluteURL)) {
                linksToVisit.add(absoluteURL);
                addressToHeight.put(absoluteURL, addressToHeight.get(currentURL) + 1);
            }
        } catch (MalformedURLException ignored) {
            // One unusable link must not abort the whole crawl.
        }
    }

    /** Drops a trailing {@code #fragment}; it addresses a position inside the same resource. */
    private static String stripFragment(String link) {
        int hashIndex = link.indexOf('#');
        return hashIndex < 0 ? link : link.substring(0, hashIndex);
    }

    /**
     * Resolves a link against the address of the current page. Absolute links
     * pass through; root-relative ({@code /a.html}) and document-relative
     * ({@code img/a.gif}) links are completed.
     *
     * @throws MalformedURLException if the link cannot be parsed as a URL
     */
    private static String getAbsoluteURLFromRelativeURL(String relativeURL) throws MalformedURLException {
        return new URL(new URL(currentURL), relativeURL).toString();
    }

    /**
     * A link is visited when it has never been enqueued before, the current
     * page is above the depth limit, and the link points to the start host.
     */
    private static boolean shouldLinkBeVisited(String linkString) throws MalformedURLException {
        // addressToHeight has a key for every address ever enqueued, so this one
        // lookup covers pending and visited links, including those that returned 404.
        if (addressToHeight.containsKey(linkString)) {
            return false;
        }
        if (addressToHeight.get(currentURL) >= maxDepth) {
            return false;
        }
        return doesLinkPointToSameHost(linkString);
    }

    /** Compares the link's host with the start host, ignoring case. */
    private static boolean doesLinkPointToSameHost(String linkString) throws MalformedURLException {
        return new URL(linkString).getHost().equalsIgnoreCase(host);
    }

    /** Seeds the queue and the depth map with the start address at depth 0. */
    private static void readyVariablesForBFS() {
        linksToVisit.add(inputAddress);
        addressToHeight.put(inputAddress, 0);
        currentURL = inputAddress;
    }

    /** Returns the anchor targets of the page followed by its image addresses. */
    private static List<String> extractLinks(String pageContent) {
        List<String> linksOnPage = new ArrayList<String>();
        linksOnPage.addAll(extractHREFLinksFromHTML(pageContent));
        linksOnPage.addAll(extractIMGLinksFromHTML(pageContent));
        return linksOnPage;
    }

    /** Returns the {@code src} value of every image tag matched on the page. */
    private static List<String> extractIMGLinksFromHTML(String pageContent) {
        return collectCapturedAddresses(IMG_TAG_PATTERN, pageContent);
    }

    /** Returns the {@code href} value of every anchor tag matched on the page. */
    private static List<String> extractHREFLinksFromHTML(String pageContent) {
        return collectCapturedAddresses(HTML_LINK_PATTERN, pageContent);
    }

    /** Collects capture group 1 of every match of {@code pattern} in document order. */
    private static List<String> collectCapturedAddresses(Pattern pattern, String pageContent) {
        List<String> addresses = new ArrayList<String>();
        Matcher matcher = pattern.matcher(pageContent);
        while (matcher.find()) {
            addresses.add(matcher.group(1));
        }
        return addresses;
    }

    /**
     * Requests the page from the start host and returns the complete raw
     * response (status line, headers and body) with lines joined by
     * {@code "\n"}. The socket is closed even when the exchange fails.
     */
    private static String downloadHTMLOfPage(URL urlOfPage) throws IOException {
        Socket clientSocket = connectToServer();
        try {
            BufferedReader inFromServer =
                    new BufferedReader(new InputStreamReader(clientSocket.getInputStream()));
            PrintWriter outPrintWriter =
                    new PrintWriter(new OutputStreamWriter(clientSocket.getOutputStream()), AUTO_FLUSH_OFF);
            sendGETRequest(outPrintWriter, urlOfPage);
            return getServerResponse(inFromServer);
        } finally {
            clientSocket.close();
        }
    }

    /** Reads the response line by line until the server closes the stream. */
    private static String getServerResponse(BufferedReader inFromServer) throws IOException {
        StringBuilder pageContent = new StringBuilder();
        String line;
        while ((line = inFromServer.readLine()) != null) {
            pageContent.append(line).append('\n');
        }
        return pageContent.toString();
    }

    /** Sends an HTTP/1.0 GET request for the URL's path and query, then flushes. */
    private static void sendGETRequest(PrintWriter outPrintWriter, URL urlOfPage) throws IOException {
        String requestTarget = urlOfPage.getFile();
        if (requestTarget.isEmpty()) {
            // "http://host" has an empty path; the request line still needs a target.
            requestTarget = "/";
        }
        outPrintWriter.print("GET " + requestTarget + " HTTP/1.0\r\n");
        outPrintWriter.print("Host: " + host + "\r\n");
        outPrintWriter.print("Accept: text/plain, text/html, text/*\r\n");
        outPrintWriter.print("\r\n");
        outPrintWriter.flush();
    }

    /** Opens a socket to the start host on port 80; terminates the program on failure. */
    private static Socket connectToServer() {
        Socket clientSocket = null;
        try {
            clientSocket = new Socket(host, HTTP_PORT);
        } catch (UnknownHostException e) {
            System.out.println("The IP Address of " + host + " could not be determined. Program terminated.");
            System.exit(1);
        } catch (IOException e) {
            System.out.println("The connection to the remote server can't be made. Please try to run the program again. Program terminated.");
            System.exit(1);
        }
        return clientSocket;
    }

    /**
     * Obtains a valid start URL (from {@code args} or the console), records its
     * host and creates the output directory named after that host.
     */
    private static File createDirectory(String[] args) {
        getURLFromUser(args);
        getURLObjectFromUserInputAddress();
        host = inputURL.getHost();
        return createDirectory(host);
    }

    /**
     * Creates the directory {@code name} relative to the working directory. An
     * already existing directory is reused; if it can neither be created nor
     * found, the program terminates.
     */
    private static File createDirectory(String name) {
        File directory = new File(name);
        if (!directory.mkdir() && !directory.isDirectory()) {
            System.out.println("The directory " + name + " could not be created. Program terminated.");
            System.exit(1);
        }
        return directory;
    }

    /** Parses the entered address, asking again until it forms a valid URL. */
    private static void getURLObjectFromUserInputAddress() {
        while (true) {
            try {
                inputURL = new URL(inputAddress);
                return;
            } catch (MalformedURLException e) {
                System.out.println("The URL entered is not parseable or contains an unsupported protocol. Please enter another URL -");
                inputAddress = getInput();
            }
        }
    }

    /** Takes the start address from {@code args[0]}, or asks for it when absent. */
    private static void getURLFromUser(String[] args) {
        if (args.length > 0) {
            inputAddress = args[0];
        } else {
            System.out.println("No URL entered. Please enter a URL -");
            inputAddress = getInput();
        }
    }

    /** Reads one line from standard input. */
    private static String getInput() {
        // One shared Scanner: a second Scanner on System.in could miss input
        // that the first one has already buffered.
        if (consoleInput == null) {
            consoleInput = new Scanner(System.in);
        }
        return consoleInput.nextLine();
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement