Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- * To change this template, choose Tools | Templates
- * and open the template in the editor.
- */
- package crawler;
- import crawler.Main;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.net.URL;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.logging.Level;
- import java.util.logging.Logger;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- /**
- *
- * @author syncsys
- */
- public class Crawler implements Runnable {
- private static final String patternString = "[_A-Za-z0-9-]+(\\.[_A-Za-z0-9-]+)*@[A-Za-z0-9]+(\\.[A-Za-z0-9]+)*(\\.[A-Za-z]{2,})";
- private volatile String url;
- private volatile String nonProcessedLinkFromDB = null;
- private void crawl(String url) {
- //get url from db or use supplied url
- //use boolean "first time" to check if its first time or sufficient urls are in db.
- BufferedReader bf = null;
- try {
- URL target = new URL(url);
- bf = new BufferedReader(
- new InputStreamReader(target.openStream())
- );
- // System.out.println("debug ========= 1");
- StringBuilder html = new StringBuilder();
- String inputLine;
- while ((inputLine = bf.readLine()) != null) {
- html.append(inputLine);
- }
- List emailList = new ArrayList( getEmailList(html.toString()) );
- List linkList = new ArrayList( getLinkList(html.toString(), url) );
- System.out.println("Just worked on --------- "+ url);
- // boolean markedLinkAsProcessedBoolean = new BasicDAO().markLinkAsProcesed(url);
- // if(markedLinkAsProcessedBoolean){
- //// System.out.println("Link marked processed in db "+url);
- // }
- Main.processedLinksCount++;
- putEmailsInDB(emailList);
- putLinksInDB(linkList);
- // System.out.println("debug ========= 2");
- } catch (IOException ex) {
- new Logging().logError(ex.toString());
- new BasicDAO().deleteLink(url);
- } catch (Exception ex) {
- new Logging().logError(ex.toString());
- new BasicDAO().deleteLink(url);
- }finally{
- if(bf !=null){
- try {
- bf.close();
- // System.out.println("debug ========= 3");
- } catch (IOException ex) {
- new Logging().logError(ex.toString());
- }
- }
- // System.out.println("debug ========= 4");
- synchronized(Crawler.class){
- nonProcessedLinkFromDB = getNonProcessedLinkFromDB();
- boolean markedLinkAsProcessedBoolean = new BasicDAO().markLinkAsProcesed(url);
- if(markedLinkAsProcessedBoolean){
- // System.out.println("Link marked processed in db "+url);
- }
- }
- // System.out.println("fetched non-processed link from db: ++++++++++++++++ "+ nonProcessedLinkFromDB);
- crawl(nonProcessedLinkFromDB);
- // System.out.println("nonePlinkfromDB is ++++++++++++++++++++++++++" + nonProcessedLinkFromDB);
- // System.out.println("debug ========= 5");
- nonProcessedLinkFromDB= null;
- }
- /// String line = "kj asdkfj a;sdlfkj <p>[email protected]</p> asdkfja sdlfkj [email protected] ads";
- }
- private List getLinkList(String html, String url) {
- Document doc = Jsoup.parse(html);
- Elements bodies = doc.select("body");
- List linkList = new ArrayList();
- for(Element body : bodies ){
- Elements aTags = body.getElementsByTag("a");
- for (Element a: aTags){
- String link = a.attr("href");
- if ( !(link.startsWith("#"))
- &&
- !(link.contains("()"))
- &&
- !(link.endsWith(".jpg"))
- &&
- !(link.endsWith(".jpeg"))
- &&
- !(link.endsWith(".png"))
- &&
- !(link.endsWith(".gif")) ){
- if( link.startsWith("/") ){
- link = url+link;
- }
- linkList.add(link);
- //put link in db
- }
- }
- }
- return linkList;
- }
- private List getEmailList(String html) {
- Pattern p = Pattern.compile(patternString);
- Matcher m = p.matcher(html);
- List emailList = new ArrayList();
- while(m.find()){
- emailList.add(m.group());
- Main.nonUniqueEmailsCount++;
- }
- return emailList;
- }
- private String getNonProcessedLinkFromDB() {
- return ( new BasicDAO().getNonProcessedLink() );
- }
- private void putEmailsInDB(List emailList) {
- new BasicDAO().insertEmail(emailList);
- }
- private void putLinksInDB(List linkList) {
- new BasicDAO().insertLinks(linkList);
- }
- @Override
- public void run() {
- if(url != null){
- crawl(url);
- }else{
- // crawl();
- }
- }
- public Crawler(String url){
- this.url = url;
- }
- public Crawler(){
- this.url = null;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment