Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
- public class WebCrawl {
- private static final String USER_AGENT =
- "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0";
- private List<String> links = new LinkedList<String>(); // Just a list of URLs
- private Document htmlDocument; // This is our web page, or in other words, our document
- private Map<String, String> urlMap = new HashMap();
- public void crawl(String url)
- {
- try
- {
- Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
- Document htmlDocument = (Document) connection.get();
- this.htmlDocument = htmlDocument;
- if(connection.response().statusCode() == 200) // 200 is the HTTP OK status code
- // indicating that everything is great.
- {
- System.out.println("\n**Visiting** Received web page at " + url);
- }
- if(!connection.response().contentType().contains("text/html"))
- {
- System.out.println("**Failure** Retrieved something other than HTML");
- return;
- }
- String title = htmlDocument.title();
- if (title.contains("/"))
- title = title.replace("/","-");
- Element dataOnPage = htmlDocument.select("div.mw-parser-output").first();
- StringBuilder sb = new StringBuilder();
- Element eleInDiv = dataOnPage.select("p").first();
- sb.append(eleInDiv.text());
- Element nextPar = eleInDiv.nextElementSibling();
- while(nextPar!= null && nextPar.nodeName() != "h2"){
- if(nextPar.nodeName() == "p")
- sb.append(nextPar.text()+"\n");
- nextPar = nextPar.nextElementSibling();
- }
- File directory = new File("Crawler/JavaWikiBook/"+title);
- if(! directory.exists())
- directory.mkdir();
- writeDataIntoFile(sb.toString(),"Crawler/JavaWikiBook/"+title+"/"+title+".txt");
- System.out.println(title);
- System.out.println(sb.toString());
- Elements allh2 = dataOnPage.select("h2");
- for(Element eachH2 : allh2){
- String title2 = eachH2.text();
- if (title2.contains("/"))
- title2 = title2.replace("/","-");
- if(title2.contains("[edit]"))
- title2 = title2.replace("[edit]","");
- Element nextP = eachH2.nextElementSibling();
- StringBuffer stringBuffer = new StringBuffer();
- while (nextP != null && nextP.nodeName() != "h2"){
- stringBuffer.append(nextP.text()+"\n");
- nextP = nextP.nextElementSibling();
- }
- writeDataIntoFile(stringBuffer.toString(),"Crawler/JavaWikiBook/"+title+"/"+title2+".txt");
- System.out.println(title2);
- System.out.println(stringBuffer.toString());
- }
- // return dataOnPage.text();
- }
- catch(IOException ioe)
- {
- // We were not successful in our HTTP request
- System.out.println("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
- return;
- }
- }
- public Map<String,String> crawlerStageOne(String url)
- {
- try {
- Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
- Document htmlDocument = (Document) connection.get();
- if(connection.response().statusCode() == 200) // 200 is the HTTP OK status code
- // indicating that everything is great.
- {
- System.out.println("\n**Visiting** Received web page at " + url);
- }
- if(!connection.response().contentType().contains("text/html"))
- {
- System.out.println("**Failure** Retrieved something other than HTML");
- return null;
- }
- Element contentDiv = htmlDocument.select("div.mw-content-ltr").first();
- Elements javaLinks = contentDiv.getElementsByTag("a");
- boolean foundPreface = false;
- for(Element link: javaLinks)
- {
- if(link.text().equalsIgnoreCase("preface"))
- foundPreface = true;
- if(!link.text().isEmpty() && foundPreface)
- {
- System.out.println("Title :"+link.text()+"\tURL :"+link.absUrl("href"));
- urlMap.put(link.text(),link.absUrl("href"));
- }
- }
- }catch (IOException exp){
- System.out.println(exp);
- }
- return urlMap;
- }
- public List<String> getLinks()
- {
- return this.links;
- }
- private boolean writeDataIntoFile(String data, String fileName){
- try{
- BufferedWriter writer = new BufferedWriter(new FileWriter(fileName,true));
- //System.out.println("Data: "+data);
- writer.write(data);
- writer.close();
- return true;
- }catch (IOException exp){
- System.out.println("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! "+ exp);
- return false;
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement