// Untitled

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.jsoup.nodes.Element;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Fetches web pages with jsoup and stores their text in files below
 * {@code Crawler/JavaWikiBook/}.
 *
 * <p>The CSS selectors used ({@code div.mw-parser-output}, {@code div.mw-content-ltr})
 * suggest the target is a MediaWiki site, presumably the Java Wikibook.
 * NOTE(review): confirm against the caller.
 */
public class WebCrawl {
    private static final String USER_AGENT =
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0";
    /** Directory below which one sub-directory per crawled page is created. */
    private static final String OUTPUT_ROOT = "Crawler/JavaWikiBook/";

    // List of URLs exposed through getLinks(); nothing in this class adds to it.
    private List<String> links = new LinkedList<String>();
    // The page most recently fetched by crawl().
    private Document htmlDocument;
    // Link text -> absolute URL, filled by crawlerStageOne().
    private Map<String, String> urlMap = new HashMap<String, String>();

    /**
     * Downloads {@code url} and writes its text to files.
     *
     * <p>The text of the first {@code <p>} inside {@code div.mw-parser-output} and of the
     * {@code <p>} siblings following it (up to the next {@code <h2>}) is appended to
     * {@code Crawler/JavaWikiBook/<title>/<title>.txt}. For every {@code <h2>} in that div,
     * the text of all sibling elements up to the next {@code <h2>} is appended to
     * {@code Crawler/JavaWikiBook/<title>/<heading>.txt}. Titles have {@code "/"} replaced
     * by {@code "-"}; headings additionally have {@code "[edit]"} removed.
     *
     * <p>Failures (I/O error, non-HTML response, missing content div, directory that cannot
     * be created) are reported on standard output and the method returns without throwing.
     *
     * @param url address of the page to fetch
     */
    public void crawl(String url) {
        try {
            Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
            Document htmlDocument = connection.get();
            this.htmlDocument = htmlDocument;
            if (connection.response().statusCode() == 200) { // 200 is the HTTP OK status code
                System.out.println("\n**Visiting** Received web page at " + url);
            }
            if (!isHtml(connection)) {
                System.out.println("**Failure** Retrieved something other than HTML");
                return;
            }

            // "/" would be interpreted as a path separator in the file names built below.
            String title = htmlDocument.title().replace("/", "-");
            Element dataOnPage = htmlDocument.select("div.mw-parser-output").first();
            if (dataOnPage == null) {
                // first() returns null when the selector matches nothing.
                System.out.println("**Failure** No div.mw-parser-output found at " + url);
                return;
            }

            File directory = new File(OUTPUT_ROOT + title);
            // mkdirs() also creates missing parent directories, which mkdir() would not.
            if (!directory.exists() && !directory.mkdirs()) {
                System.out.println("**Failure** Could not create directory " + directory);
                return;
            }

            // Introduction: the first paragraph and the paragraphs following it.
            // An absent paragraph yields an empty introduction.
            String intro = textUntilNextH2(dataOnPage.select("p").first(), true);
            writeDataIntoFile(intro, OUTPUT_ROOT + title + "/" + title + ".txt");
            System.out.println(title);
            System.out.println(intro);

            // One file per <h2> section, holding the text of everything up to the next <h2>.
            for (Element heading : dataOnPage.select("h2")) {
                String sectionTitle = heading.text().replace("/", "-").replace("[edit]", "");
                String sectionText = textUntilNextH2(heading.nextElementSibling(), false);
                writeDataIntoFile(sectionText, OUTPUT_ROOT + title + "/" + sectionTitle + ".txt");
                System.out.println(sectionTitle);
                System.out.println(sectionText);
            }
        } catch (IOException ioe) {
            // We were not successful in our HTTP request
            System.out.println("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! " + ioe);
        }
    }

    /**
     * Downloads {@code url} and records the links found inside {@code div.mw-content-ltr}.
     *
     * <p>Links are ignored until one whose text equals "preface" (case-insensitive) is seen;
     * that link and every later link with non-empty text is stored as text -> absolute URL.
     * Entries accumulate across calls because the same map instance is reused and returned.
     *
     * @param url address of the page to fetch
     * @return the internal link map, or {@code null} when the response is not HTML; on an
     *         I/O error or a missing content div the map is returned without new entries
     */
    public Map<String,String> crawlerStageOne(String url) {
        try {
            Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
            Document htmlDocument = connection.get();
            if (connection.response().statusCode() == 200) { // 200 is the HTTP OK status code
                System.out.println("\n**Visiting** Received web page at " + url);
            }
            if (!isHtml(connection)) {
                System.out.println("**Failure** Retrieved something other than HTML");
                return null;
            }

            Element contentDiv = htmlDocument.select("div.mw-content-ltr").first();
            if (contentDiv == null) {
                System.out.println("**Failure** No div.mw-content-ltr found at " + url);
                return urlMap;
            }

            boolean foundPreface = false;
            for (Element link : contentDiv.getElementsByTag("a")) {
                String text = link.text();
                if (text.equalsIgnoreCase("preface")) {
                    foundPreface = true;
                }
                if (!text.isEmpty() && foundPreface) {
                    String target = link.absUrl("href");
                    System.out.println("Title :" + text + "\tURL :" + target);
                    urlMap.put(text, target);
                }
            }
        } catch (IOException exp) {
            System.out.println(exp);
        }
        return urlMap;
    }

    /**
     * Returns the list of links held by this crawler.
     *
     * @return the internal list itself, not a copy
     */
    public List<String> getLinks() {
        return this.links;
    }

    /** Tells whether the response of an executed connection declares an HTML content type. */
    private static boolean isHtml(Connection connection) {
        String contentType = connection.response().contentType();
        // The header may be absent, in which case contentType is null.
        return contentType != null && contentType.contains("text/html");
    }

    /**
     * Joins the text of {@code start} and its following siblings, one element per line,
     * stopping before the first {@code <h2>} or at the end of the sibling list.
     *
     * @param start first element to consider; may be {@code null}
     * @param paragraphsOnly when {@code true} only {@code <p>} elements contribute text
     * @return the collected text, each contributing element followed by a newline
     */
    private static String textUntilNextH2(Element start, boolean paragraphsOnly) {
        StringBuilder text = new StringBuilder();
        // Strings are compared with equals(); == would compare object identity.
        for (Element current = start;
             current != null && !"h2".equals(current.nodeName());
             current = current.nextElementSibling()) {
            if (!paragraphsOnly || "p".equals(current.nodeName())) {
                text.append(current.text()).append('\n');
            }
        }
        return text.toString();
    }

    /**
     * Appends {@code data} to the file {@code fileName}, creating the file if needed.
     *
     * @param data text to append
     * @param fileName path of the target file
     * @return {@code true} on success, {@code false} if an I/O error occurred (the error is
     *         printed to standard output)
     */
    private boolean writeDataIntoFile(String data, String fileName) {
        // try-with-resources closes the writer even when write() throws.
        // NOTE(review): FileWriter uses the platform default charset.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(fileName, true))) {
            writer.write(data);
            return true;
        } catch (IOException exp) {
            System.out.println("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! "+ exp);
            return false;
        }
    }
}