Advertisement
shubhamgoyal

CS2105 Ass. 1 - 1

Sep 27th, 2012
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 13.44 KB | None | 0 0
  1. import java.io.BufferedReader;
  2. import java.io.BufferedWriter;
  3. import java.io.File;
  4. import java.io.FileWriter;
  5. import java.io.IOException;
  6. import java.io.InputStreamReader;
  7. import java.io.OutputStreamWriter;
  8. import java.io.PrintWriter;
  9. import java.net.MalformedURLException;
  10. import java.net.Socket;
  11. import java.net.URL;
  12. import java.net.UnknownHostException;
  13. import java.util.ArrayList;
  14. import java.util.Arrays;
  15. import java.util.HashMap;
  16. import java.util.Iterator;
  17. import java.util.LinkedList;
  18. import java.util.Queue;
  19. import java.util.Scanner;
  20. import java.util.regex.Matcher;
  21. import java.util.regex.Pattern;
  22.  
  23.  
  24. public class TrackMe {
  25.    
  26.     private static final boolean AUTO_FLUSH_OFF = false;
  27.     private static final String REGEX_TO_MATCH_HTML_LINKS = "<a href=\"(.*?)\">";
  28.     private static final String REGEX_TO_MATCH_IMG_TAGS = "<img src=\"(.*?)\"( /|/)>";
  29.     private static final Integer STATUS_CODE_OK = 200;
  30.     private static final Integer STATUS_CODE_NOT_FOUND = 404;
  31.    
  32.     private static final String[] HTML_FILE_EXTENSIONS = {"html", "htm"};
  33.     private static final String[] IMAGE_FILE_EXTENSIONS = {"gif", "jpeg"};
  34.    
  35.     private static String currentURL;
  36.     private static String host;
  37.     private static String protocol;
  38.     private static String inputAddress;
  39.     private static URL inputURL;
  40.     private static LinkedList<String> linksToVisit = new LinkedList<String>();
  41.     private static ArrayList<String> visitedLinks = new ArrayList<String>();
  42.     private static HashMap<String, Integer> addressToHeight = new HashMap<String, Integer>();
  43.    
  44.     public static void main(String args[]) throws IOException {
  45.         File directory = createDirectory(args);
  46.         BreadthFirstSearch(4, directory);
  47.     }
  48.  
  49.     private static void BreadthFirstSearch(int levels, File directory) throws IOException {
  50.         readyVariablesForBFS();
  51.         while(!linksToVisit.isEmpty()) {
  52.             String pageURLString = linksToVisit.poll();
  53.             currentURL = pageURLString;
  54.             visitedLinks.add(pageURLString);
  55.             URL urlOfPage = new URL(pageURLString);
  56.             enqueueLinksFromPageOnURL(urlOfPage, pageURLString, directory);
  57.         }
  58.         for (Iterator<String> iterator = visitedLinks.iterator(); iterator.hasNext();) {
  59.             String type = (String) iterator.next();
  60.             System.out.println(type);
  61.         }
  62.     }
  63.  
  64.     private static void enqueueLinksFromPageOnURL(URL urlOfPage, String pageURLString, File directory) throws IOException {
  65.         String pageContent = downloadHTMLOfPage(urlOfPage);
  66.         if (doesHTTPResponseHaveStatusCode(pageContent, STATUS_CODE_OK)) {
  67.             downloadFileToDirectory(pageContent, pageURLString, directory, urlOfPage);
  68.             ArrayList<String> linksOnPage = extractLinks(pageContent);
  69.             enqueueLinks(linksOnPage);
  70.         }
  71.         else if (doesHTTPResponseHaveStatusCode(pageContent, STATUS_CODE_NOT_FOUND)){
  72.             visitedLinks.remove(pageURLString);
  73.         }
  74.     }
  75.  
  76.     private static void downloadFileToDirectory(String pageContent, String pageURLString, File directory, URL urlOfPage) throws IOException {
  77.         String fileNameWithExtension = getFileNameWithExtension(pageURLString);
  78.         String fileExtension = getFileExtension(fileNameWithExtension);
  79.         if (Arrays.asList(HTML_FILE_EXTENSIONS).contains(fileExtension)) {
  80.             downloadHTMLFileToDirectory(pageContent, fileNameWithExtension, directory, fileExtension);
  81.         } else if (Arrays.asList(IMAGE_FILE_EXTENSIONS).contains(fileExtension)) {
  82.             downloadImageFileToDirectory(fileNameWithExtension, directory, urlOfPage);
  83.         }
  84.     }
  85.  
  86.     private static void downloadImageFileToDirectory(
  87.             String fileNameWithExtension, File directory, URL urlOfPage) {
  88.        
  89.     }
  90.  
  91.     private static void downloadHTMLFileToDirectory(String pageContent,
  92.             String fileNameWithExtension, File directory, String fileExtension) throws IOException {
  93.         File htmlFile = new File(directory, fileNameWithExtension);
  94.         if (!htmlFile.createNewFile()) {
  95.             String newFileName = getAnotherFileName(fileNameWithExtension, fileExtension);
  96.             downloadHTMLFileToDirectory(pageContent, newFileName, directory, fileExtension);
  97.         }
  98.         else {
  99.             String content = extractContentToBeWritten(pageContent);
  100.             writeContentToFile(pageContent, htmlFile);
  101.         }
  102.     }
  103.  
  104.     private static String extractContentToBeWritten(String pageContent) {
  105.         Scanner sc = new Scanner(pageContent);
  106.         String content = "";
  107.         boolean hasReachedContentToBeWritten = false;
  108.         while (sc.hasNext()) {
  109.             String line = sc.nextLine();
  110.             if(hasReachedContentToBeWritten) {
  111.                 content = content + line + "\n";
  112.             }
  113.             if(line.isEmpty()) {
  114.                 hasReachedContentToBeWritten = true;
  115.             }
  116.         }
  117.         return content;
  118.     }
  119.  
  120.     private static void writeContentToFile(String content, File fileToBeWrittenTo) throws IOException {
  121.         FileWriter fw = new FileWriter(fileToBeWrittenTo);
  122.         BufferedWriter bw = new BufferedWriter(fw);
  123.         PrintWriter pw = new PrintWriter(bw);
  124.         pw.print(content);
  125.         pw.close();
  126.         bw.close();
  127.         fw.close();
  128.     }
  129.  
  130.     private static String getAnotherFileName(String fileNameWithExtension, String fileExtension) {
  131.         String fileNameWithoutExtension = getFileNameWithoutExtension(fileNameWithExtension);
  132.         int indexBeforeDuplicateNumber = -1;
  133.         for (int i = fileNameWithoutExtension.length() - 1; i >= 0; i --) {
  134.             char c = fileNameWithoutExtension.charAt(i);
  135.             if((c < '0') || (c > '9')) {
  136.                 if(i == (fileNameWithoutExtension.length() - 1)) {
  137.                     return fileNameWithoutExtension + "1." + fileExtension;
  138.                 } else {
  139.                     indexBeforeDuplicateNumber = i;
  140.                     break;
  141.                 }
  142.             }
  143.         }
  144.         int duplicateNumber = Integer.parseInt(fileNameWithoutExtension.substring(indexBeforeDuplicateNumber + 1));
  145.         return fileNameWithoutExtension.substring(0, indexBeforeDuplicateNumber + 1) + Integer.toString(duplicateNumber + 1) + "." + fileExtension;
  146.     }
  147.  
  148.     private static String getFileNameWithoutExtension(String fileNameWithExtension) {
  149.         int lastDotIndex = fileNameWithExtension.lastIndexOf('.');
  150.         return fileNameWithExtension.substring(0, lastDotIndex);
  151.     }
  152.  
  153.     private static String getFileExtension(String fileNameWithExtension) {
  154.         int lastDotIndex = fileNameWithExtension.lastIndexOf('.');
  155.         return fileNameWithExtension.substring(lastDotIndex + 1);
  156.     }
  157.  
  158.     private static String getFileNameWithExtension(String pageURLString) {
  159.         int lastForwardSlashIndex = pageURLString.lastIndexOf('/');
  160.         return pageURLString.substring(lastForwardSlashIndex + 1);
  161.     }
  162.  
  163.     private static boolean doesHTTPResponseHaveStatusCode(String pageContent, Integer statusCode) {
  164.         Scanner pageContentParser = new Scanner(pageContent);
  165.         String lineContainingStatusCode = pageContentParser.nextLine();
  166.         if(lineContainingStatusCode.contains(statusCode.toString())) {
  167.             return true;
  168.         }
  169.         else {
  170.             return false;
  171.         }
  172.     }
  173.  
  174.     private static void enqueueLinks(ArrayList<String> linksOnPage) throws MalformedURLException {
  175.         for (Iterator<String> iterator = linksOnPage.iterator(); iterator.hasNext();) {
  176.             String linkText = (String) iterator.next();
  177.             enqueueLink(linkText);
  178.         }
  179.     }
  180.  
  181.     private static void enqueueLink(String linkText) throws MalformedURLException {
  182.         String linkString;
  183.         if (linkText.contains("href")) {
  184.             linkString = linkText.substring(9, linkText.length() - 2).trim();
  185.         }
  186.         else {
  187.             linkString = linkText.substring(10, linkText.length() - 3).trim();
  188.         }
  189.         if (!doesLinkContainProtocolInformation(linkString)) {
  190.             linkString = getAbsoluteURLFromRelativeURL(linkString);
  191.         }
  192.         if(shouldLinkBeVisited(linkString)) {
  193.             linksToVisit.add(linkString);
  194.             addressToHeight.put(linkString, addressToHeight.get(currentURL) + 1);
  195.         }
  196.     }
  197.  
  198.     private static String getAbsoluteURLFromRelativeURL(String relativeURL) {
  199.         if(doesLinkContainAbsolutePath(relativeURL)) {
  200.             String absoluteURL = protocol + "://" + host + relativeURL;
  201.             return absoluteURL;
  202.         }
  203.         else {
  204.             String urlUptoDirectory = getPathUptoDirectoryOfFileOnServer(currentURL);
  205.             String absoluteURL = urlUptoDirectory + "/" + relativeURL;
  206.             return absoluteURL;
  207.         }
  208.     }
  209.  
  210.     private static boolean doesLinkContainAbsolutePath(String relativeURL) {
  211.         String regex = "/(.*?)/";
  212.         Pattern p = Pattern.compile(regex);
  213.         Matcher m = p.matcher(relativeURL);
  214.         if(!m.find()) {
  215.             return false;
  216.         }
  217.         else {
  218.             String s = m.group(0);
  219.             s = s.substring(0, s.length() - 1);
  220.             return getPathUptoDirectoryOfFileOnServer(currentURL).contains(s);
  221.         }
  222.     }
  223.  
  224.     private static String getPathUptoDirectoryOfFileOnServer(String url) {
  225.         int indexOfLastSlash = findIndexOfLastForwardSlash(url);
  226.         String urlUptoFile = url.substring(0, indexOfLastSlash);
  227.         return urlUptoFile;
  228.     }
  229.  
  230.     private static int findIndexOfLastForwardSlash(String url) {
  231.         for (int i = url.length() - 1; i >= 0; i--) {
  232.             char c = url.charAt(i);
  233.             if (c == '/') {
  234.                 return i;
  235.             }
  236.         }
  237.         return Integer.MIN_VALUE;
  238.     }
  239.  
  240.     private static boolean doesLinkContainProtocolInformation(String linkString) {
  241.         if (linkString.contains("://")) {
  242.             return true;
  243.         }
  244.         else {
  245.             return false;
  246.         }
  247.     }
  248.  
  249.     private static boolean shouldLinkBeVisited(String linkString) throws MalformedURLException {
  250.         if (linksToVisit.contains(linkString)) {
  251.             return false;
  252.         } else if (visitedLinks.contains(linkString)) {
  253.             return false;
  254.         } else if (addressToHeight.get(currentURL) == 4) {
  255.             return false;
  256.         } else if (doesLinkPointToSameHost(linkString)) {
  257.             return true;
  258.         } else {
  259.             return false;
  260.         }
  261.     }
  262.  
  263.     private static boolean doesLinkPointToSameHost(String linkString) throws MalformedURLException {
  264.         URL linkURL = new URL(linkString);
  265.         String resourceHost = linkURL.getHost();
  266.         if (resourceHost.equals(host)) {
  267.             return true;
  268.         }
  269.         else {
  270.             return false;
  271.         }
  272.     }
  273.  
  274.     private static void readyVariablesForBFS() {
  275.         linksToVisit.add(inputAddress);
  276.         addressToHeight.put(inputAddress, 0);
  277.         currentURL = inputAddress;
  278.     }
  279.    
  280.     private static ArrayList<String> extractLinks(String pageContent) {
  281.         ArrayList<String> hrefLinksOnPage = extractHREFLinksFromHTML(pageContent);
  282.         ArrayList<String> imgLinksOnPage = extractIMGLinksFromHTML(pageContent);
  283.         ArrayList<String> linksOnPage = new ArrayList<String>();
  284.         linksOnPage.addAll(hrefLinksOnPage);
  285.         linksOnPage.addAll(imgLinksOnPage);
  286.         return linksOnPage;
  287.     }
  288.  
  289.     private static ArrayList<String> extractIMGLinksFromHTML(String pageContent) {
  290.         Pattern imgLinksPattern = Pattern.compile(REGEX_TO_MATCH_IMG_TAGS);
  291.         Matcher imgLinksMatcher = imgLinksPattern.matcher(pageContent);
  292.         ArrayList<String> imgLinksOnPage = new ArrayList<String>();
  293.         while(imgLinksMatcher.find()) {
  294.             imgLinksOnPage.add(imgLinksMatcher.group());
  295.         }
  296.         return imgLinksOnPage;
  297.     }
  298.  
  299.     private static ArrayList<String> extractHREFLinksFromHTML(String pageContent) {
  300.         Pattern htmlLinksPattern = Pattern.compile(REGEX_TO_MATCH_HTML_LINKS);
  301.         Matcher htmlLinksMatcher = htmlLinksPattern.matcher(pageContent);
  302.         ArrayList<String> hrefLinksOnPage = new ArrayList<String>();
  303.         while(htmlLinksMatcher.find()) {
  304.             hrefLinksOnPage.add(htmlLinksMatcher.group());
  305.         }
  306.         return hrefLinksOnPage;
  307.     }
  308.  
  309.     private static String downloadHTMLOfPage(URL urlOfPage) throws IOException {
  310.         Socket clientSocket = connectToServer();
  311.         BufferedReader inFromServer = new BufferedReader(new InputStreamReader(clientSocket.getInputStream()));
  312.         OutputStreamWriter outWriter = new OutputStreamWriter(clientSocket.getOutputStream());
  313.         PrintWriter outPrintWriter = new PrintWriter(outWriter, AUTO_FLUSH_OFF);
  314.         sendGETRequest(outPrintWriter, urlOfPage);
  315.         String pageContent = getServerResponse(inFromServer);
  316.         clientSocket.close();
  317.         return pageContent;
  318.     }
  319.        
  320.     private static String getServerResponse(BufferedReader inFromServer) throws IOException {
  321.         String pageContent = "", line;
  322.         while ((line = inFromServer.readLine()) != null) {
  323.             pageContent = pageContent + line + "\n";
  324.         }
  325.         return pageContent;
  326.     }
  327.  
  328.     private static void sendGETRequest(PrintWriter outPrintWriter, URL urlOfPage) throws IOException {
  329.         outPrintWriter.print("GET " + urlOfPage.getFile() + " HTTP/1.0\r\n");
  330.         outPrintWriter.print("Host: "+ host +"\r\n");
  331.         outPrintWriter.print("Accept: text/plain, text/html, text/*\r\n");
  332.         outPrintWriter.print("\r\n");
  333.         outPrintWriter.flush();    
  334.     }
  335.  
  336.     private static Socket connectToServer() {
  337.         Socket clientSocket = null;
  338.         try {
  339.             clientSocket = new Socket(host, 80);
  340.         } catch (UnknownHostException e) {
  341.             System.out.println("The IP Address of " + host + " could not be determined. Program terminated.");
  342.             System.exit(1);
  343.         } catch (IOException e) {
  344.             System.out.println("The connection to the remote server can't be made. Please try to run the program again. Program terminated.");
  345.             System.exit(1);
  346.         }
  347.         return clientSocket;
  348.     }
  349.  
  350.     private static File createDirectory(String[] args) {
  351.         getURLFromUser(args);
  352.         getURLObjectFromUserInputAddress();
  353.         protocol = inputURL.getProtocol();
  354.         host = inputURL.getHost();
  355.          return createDirectory(host);
  356.     }
  357.  
  358.     private static File createDirectory(String name) {
  359.         File directory = new File(name);
  360.         directory.mkdir();
  361.         return directory;
  362.     }
  363.  
  364.     private static void getURLObjectFromUserInputAddress() {
  365.         try {
  366.             inputURL = new URL(inputAddress);
  367.         } catch (MalformedURLException e) {
  368.             System.out.println("The URL entered is not parseable or contaisn an unsupported protocol. Please enter another URL -");
  369.             inputAddress = getInput();
  370.             getURLObjectFromUserInputAddress();
  371.         }
  372.     }
  373.  
  374.     private static void getURLFromUser(String[] args) {
  375.         try {
  376.             inputAddress = args[0];
  377.         } catch (ArrayIndexOutOfBoundsException e) {
  378.             System.out.println("No URL entered. Please enter a URL -");
  379.             inputAddress = getInput();
  380.         }
  381.     }
  382.  
  383.     private static String getInput() {
  384.         Scanner sc = new Scanner(System.in);
  385.         return sc.nextLine();
  386.     }
  387. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement