Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package crawler;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.util.ArrayList;
- import java.util.Iterator;
- import java.util.List;
- import java.util.Scanner;
- import java.util.logging.Level;
- import java.util.logging.Logger;
- import crawler.BasicDAO;
- /*
- * To change this template, choose Tools | Templates
- * and open the template in the editor.
- */
- /**
- *
- * @author syncsys
- */
- public class Main {
- public static volatile long processedLinksCount = 0;
- public static volatile long nonUniqueEmailsCount = 0;
- public static volatile long UniqueEmailsCount = 0;
- private static Integer requiredThreadCount;
- private static Integer intervalToShowUpdate; // in minuts
- private static boolean resumeWithSavedLinksBoolean = false;
- // run crawler once with given url
- // run threads
- public static void main(String [] args){
- String initialUrl = null;
- BasicDAO dao = new BasicDAO();
- if(!dao.connectionAvailable()){
- return;
- }
- Scanner scanner = new Scanner(System.in);
- System.out.println("Are you running this application first time? (y/n) ");
- String firstTime = scanner.nextLine();
- if (firstTime.toLowerCase().equals("y")){
- if(!dao.createTables()){
- System.out.println("This is not your first time. The database already exists.");
- System.out.println("Hit ctrl + c and next time choose type n instead of y");
- }
- }
- System.out.println("Would you like to resume processing saved links (y) or start with a new one (n) : (y/n): ");
- String resumeWithSavedLinks = scanner.nextLine();
- if(resumeWithSavedLinks.toLowerCase().equals("y")){
- resumeWithSavedLinksBoolean = true;
- }else{
- System.out.println("Type url to crawl (e.g http://yahoo.com) : ");
- initialUrl = scanner.nextLine();
- }
- System.out.println("Type number of Threads to use : ");
- requiredThreadCount = Integer.parseInt(scanner.nextLine()) ;
- System.out.println("Type time interval for which you want stats to be updated and shown to you (minuts) : ");
- intervalToShowUpdate = Integer.parseInt(scanner.nextLine()) ;
- scanner.close();
- List<Thread> threadList = new ArrayList<Thread>();
- int runningThreadCount = 0;
- while(true){
- runningThreadCount = threadList.size();
- if(runningThreadCount < requiredThreadCount){
- int difference = requiredThreadCount - runningThreadCount;
- for (int i = 0; i < difference; i++) {
- try {
- // if(resumeWithSavedLinksBoolean){
- Thread thread = new Thread(new Crawler("https://www.google.com.pk/?gws_rd=cr&ei=-q8vUqqNDIny4QTLlYCwAQ#q=pakistan"/*new BasicDAO().getNonProcessedLink()*/));
- System.out.println("resume with saved link true");
- // }else{
- // thread = new Thread(new Crawler(initialUrl));
- // resumeWithSavedLinksBoolean = true;
- // System.out.println("resume with saved links false");
- // }
- thread.start();
- System.out.println("thread stared");
- threadList.add(thread);
- System.out.println("thread added to arraylist");
- Thread.sleep(20000);
- } catch (Exception ex) {
- new Logging().logError(ex.toString());
- }
- }
- }
- if (threadList.size() > 0){
- Iterator it = threadList.iterator();
- while (it.hasNext()){
- Thread t = (Thread)it.next();
- if(t.isAlive()){
- System.out.println("thread alive");
- }else{
- System.out.println("Thread state : "+t.getState());
- System.out.println("thread dead");
- // t.start();
- // System.out.println("thread restarted");
- // System.out.println("Thread state : "+t.getState());
- }
- }
- }
- try {
- Thread.sleep(intervalToShowUpdate*60*1000);
- } catch (InterruptedException ex) {
- new Logging().logError(ex.toString());
- }
- System.out.println("=====================================================");
- System.out.println("---Pages--------------------");
- System.out.println("Fetched links so far: \t \t " + getLinksCountFromDB());
- System.out.println("Processed pages so far: \t \t " + processedLinksCount);
- System.out.println("---Emails--------------------");
- System.out.println("Emains fetched so far (non-unique) : \t \t " + nonUniqueEmailsCount);
- System.out.println("Emains fetched so far (unique) : \t \t " + getEmailCountFromDB());
- System.out.println("---Threads--------------------");
- System.out.println("Threads required : \t \t " + requiredThreadCount);
- System.out.println("Threads running currently : \t \t " + runningThreadCount);
- System.out.println("Thread list size (debuggin mode) : \t \t " + threadList.size());
- System.out.println("---Others--------------------");
- System.out.println("Interval to update stats : \t \t " + intervalToShowUpdate+" minuts");
- System.out.println("=====================================================");
- }
- // new Thread(new Runnable(){
- // @Override
- // public void run() {
- // System.out.println("blah");
- // }
- // }).start();
- }
- private static long getEmailCountFromDB(){
- return 1;
- }
- private static long getLinksCountFromDB(){
- return 1;
- }
- }
- =========================
- /*
- * To change this template, choose Tools | Templates
- * and open the template in the editor.
- */
- package crawler;
- import crawler.Main;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.net.URL;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.logging.Level;
- import java.util.logging.Logger;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- /**
- *
- * @author syncsys
- */
- public class Crawler implements Runnable {
- private static final String patternString = "[_A-Za-z0-9-]+(\\.[_A-Za-z0-9-]+)*@[A-Za-z0-9]+(\\.[A-Za-z0-9]+)*(\\.[A-Za-z]{2,})";
- private volatile String url;
- private void crawl(String url) {
- //get url from db or use supplied url
- //use boolean "first time" to check if its first time or sufficient urls are in db.
- BufferedReader bf = null;
- try {
- URL target = new URL(url);
- bf = new BufferedReader(
- new InputStreamReader(target.openStream())
- );
- // System.out.println("debug ========= 1");
- StringBuilder html = new StringBuilder();
- String inputLine;
- while ((inputLine = bf.readLine()) != null) {
- html.append(inputLine);
- }
- List emailList = new ArrayList( getEmailList(html.toString()) );
- List linkList = new ArrayList( getLinkList(html.toString(), url) );
- System.out.println("Just worked on --------- "+ url);
- if(new BasicDAO().markLinkAsProcesed(url)){
- System.out.println("Link marked processed in db "+url);
- }
- Main.processedLinksCount++;
- putEmailsInDB(emailList);
- putLinksInDB(linkList);
- // System.out.println("debug ========= 2");
- } catch (IOException ex) {
- new Logging().logError(ex.toString());
- new BasicDAO().deleteLink(url);
- } catch (Exception ex) {
- new Logging().logError(ex.toString());
- new BasicDAO().deleteLink(url);
- }finally{
- if(bf !=null){
- try {
- bf.close();
- // System.out.println("debug ========= 3");
- } catch (IOException ex) {
- new Logging().logError(ex.toString());
- }
- }
- // System.out.println("debug ========= 4");
- String nonProcessedLinkFromDB = null;
- nonProcessedLinkFromDB = getNonProcessedLinkFromDB();
- System.out.println("fetched non-processed link from db: ++++++++++++++++ "+ nonProcessedLinkFromDB);
- crawl(nonProcessedLinkFromDB);
- // System.out.println("nonePlinkfromDB is ++++++++++++++++++++++++++" + nonProcessedLinkFromDB);
- // System.out.println("debug ========= 5");
- nonProcessedLinkFromDB= null;
- }
- /// String line = "kj asdkfj a;sdlfkj <p>[email protected]</p> asdkfja sdlfkj [email protected] ads";
- }
- private List getLinkList(String html, String url) {
- Document doc = Jsoup.parse(html);
- Elements bodies = doc.select("body");
- List linkList = new ArrayList();
- for(Element body : bodies ){
- Elements aTags = body.getElementsByTag("a");
- for (Element a: aTags){
- String link = a.attr("href");
- if ( !(link.startsWith("#"))
- &&
- !(link.contains("()")) ){
- if( link.startsWith("/") ){
- link = url+link;
- }
- linkList.add(link);
- //put link in db
- }
- }
- }
- return linkList;
- }
- private List getEmailList(String html) {
- Pattern p = Pattern.compile(patternString);
- Matcher m = p.matcher(html);
- List emailList = new ArrayList();
- while(m.find()){
- emailList.add(m.group());
- Main.nonUniqueEmailsCount++;
- }
- return emailList;
- }
- private String getNonProcessedLinkFromDB() {
- return ( new BasicDAO().getNonProcessedLink() );
- }
- private void putEmailsInDB(List emailList) {
- new BasicDAO().insertEmail(emailList);
- }
- private void putLinksInDB(List linkList) {
- new BasicDAO().insertLinks(linkList);
- }
- @Override
- public void run() {
- if(url != null){
- crawl(url);
- }else{
- // crawl();
- }
- }
- public Crawler(String url){
- this.url = url;
- }
- public Crawler(){
- this.url = null;
- }
- }
- =========================
- /*
- * To change this template, choose Tools | Templates
- * and open the template in the editor.
- */
- package crawler;
- import java.sql.Connection;
- import java.sql.DriverManager;
- import java.sql.PreparedStatement;
- import java.sql.ResultSet;
- import java.sql.SQLException;
- import java.util.List;
- import java.util.logging.Level;
- import java.util.logging.Logger;
- /**
- *
- * @author syncsys
- */
- public class BasicDAO {
- private static final String DBUser = "postgres";
- private static final String DBName = "postgres";
- private static final String DBPass= "abc";
- public boolean connectionAvailable(){
- System.out.println("------------------ PostgreSQL "
- + "JDBC Connection Testing ------------------");
- try {
- Class.forName("org.postgresql.Driver");
- } catch (ClassNotFoundException e) {
- System.out.println("PostgreSQL JDBC Driver not found in application directory ");
- e.printStackTrace();
- return false;
- }
- System.out.println("PostgreSQL JDBC Driver Registered!");
- Connection connection = null;
- try {
- connection = DriverManager.getConnection(
- "jdbc:postgresql://127.0.0.1:5432/"+DBName, DBUser,
- DBPass);
- } catch (SQLException e) {
- System.out.println("Connection Failed! Check output console");
- e.printStackTrace();
- return false;
- }
- if (connection != null) {
- System.out.println("------------------ Database Connectivity : OK ------------------");
- return true;
- } else {
- System.out.println("Failed to make connection!");
- }
- return false;
- }
- public boolean createTables(){
- try {
- Class.forName("org.postgresql.Driver");
- } catch (ClassNotFoundException e) {
- System.out.println("PostgreSQL JDBC Driver not found in application directory ");
- e.printStackTrace();
- return false;
- }
- Connection connection = null;
- try {
- connection = DriverManager.getConnection(
- "jdbc:postgresql://127.0.0.1:5432/"+DBName, DBUser,
- DBPass);
- } catch (SQLException e) {
- System.out.println("Connection Failed! Check output console");
- e.printStackTrace();
- return false;
- }
- if (connection != null) {
- try {
- PreparedStatement pStmt = connection.prepareStatement(
- "create table links ("
- + "id bigserial PRIMARY KEY,"
- + "href text UNIQUE,"
- + "processed boolean"
- + ")"
- );
- pStmt.execute();
- pStmt.close();
- PreparedStatement pStmt2 = connection.prepareStatement(
- "create table emails ("
- + "id bigserial PRIMARY KEY,"
- + "email VARCHAR(255) UNIQUE"
- + ")"
- );
- pStmt2.execute();
- pStmt2.close();
- connection.close();
- System.out.println("tables created succesfully -------------------");
- return true;
- } catch (SQLException ex) {
- new Logging().logError(ex.toString());
- }
- } else {
- System.out.println("Failed to make connection!");
- }
- return false;
- }
- public void insertEmail(List emailList){
- try {
- Class.forName("org.postgresql.Driver");
- } catch (ClassNotFoundException e) {
- System.out.println("PostgreSQL JDBC Driver not found in application directory ");
- e.printStackTrace();
- }
- Connection connection = null;
- try {
- connection = DriverManager.getConnection(
- "jdbc:postgresql://127.0.0.1:5432/"+DBName, DBUser,
- DBPass);
- } catch (SQLException e) {
- new Logging().logError(e.toString());
- }
- if (connection != null) {
- try {
- if (emailList.size() > 0){
- for(Object email : emailList){
- try{
- PreparedStatement pStmt = connection.prepareStatement(
- "Insert into emails("
- + "email"
- + ")"
- + " Values ("
- + "?"
- + ")"
- );
- pStmt.setString(1, email.toString());
- pStmt.execute();
- if(pStmt != null){
- pStmt.close();
- }
- } catch (SQLException ex) {
- Logger.getLogger(BasicDAO.class.getName()).log(Level.SEVERE, null, ex);
- }
- }
- }
- connection.close();
- } catch (SQLException ex) {
- Logger.getLogger(BasicDAO.class.getName()).log(Level.SEVERE, null, ex);
- }
- } else {
- new Logging().logError("Failed to make db connection");
- }
- }
- public void insertLinks(List linkList){
- try {
- Class.forName("org.postgresql.Driver");
- } catch (ClassNotFoundException e) {
- new Logging().logError(e.toString());
- }
- Connection connection = null;
- try {
- connection = DriverManager.getConnection(
- "jdbc:postgresql://127.0.0.1:5432/"+DBName, DBUser,
- DBPass);
- } catch (SQLException e) {
- new Logging().logError(e.toString());
- }
- if (connection != null) {
- try {
- if (linkList.size() > 0){
- for(Object link : linkList){
- try{
- PreparedStatement pStmt = connection.prepareStatement(
- "Insert into links("
- + "href,"
- + "processed"
- + ")"
- + " Values ("
- + "?,?"
- + ")"
- );
- pStmt.setString(1, link.toString());
- pStmt.setBoolean(2, false);
- pStmt.execute();
- if(pStmt != null) {pStmt.close();}
- } catch (SQLException ex) {
- new Logging().logError(ex.toString());
- }
- }
- }
- if (connection != null) {
- connection.close();
- }
- } catch (SQLException ex) {
- Logger.getLogger(BasicDAO.class.getName()).log(Level.SEVERE, null, ex);
- } finally{
- if (connection != null) {
- try {
- connection.close();
- } catch (SQLException ex) {
- new Logging().logError(ex.toString());
- }
- }
- }
- } else {
- new Logging().logError("Failed to make db connection");
- }
- }
- public String getNonProcessedLink(){
- try {
- Class.forName("org.postgresql.Driver");
- } catch (ClassNotFoundException e) {
- new Logging().logError(e.toString());
- }
- Connection connection = null;
- try {
- connection = DriverManager.getConnection(
- "jdbc:postgresql://127.0.0.1:5432/"+DBName, DBUser,
- DBPass);
- } catch (SQLException e) {
- new Logging().logError(e.toString());
- }
- if (connection != null) {
- try {
- PreparedStatement pStmt = connection.prepareStatement(
- "SELECT href from links where processed = false limit 1"
- );
- ResultSet rs = null;
- rs = pStmt.executeQuery();
- String href = null;
- while(rs.next()){
- href = rs.getString("href");
- }
- rs.close();
- pStmt.close();
- if(connection != null) {
- connection.close();
- }
- //System.out.println("debug ===============---------------================== d: "+ href);
- return href;
- } catch (SQLException ex) {
- new Logging().logError(ex.toString());
- }
- } else {
- new Logging().logError("Failed to make db connection");
- }
- return null;
- }
- public boolean markLinkAsProcesed(String link){
- try {
- Class.forName("org.postgresql.Driver");
- } catch (ClassNotFoundException e) {
- new Logging().logError(e.toString());
- }
- Connection connection = null;
- try {
- connection = DriverManager.getConnection(
- "jdbc:postgresql://127.0.0.1:5432/"+DBName, DBUser,
- DBPass);
- } catch (SQLException e) {
- new Logging().logError(e.toString());
- }
- if (connection != null) {
- try {
- //System.out.println("debug +++++++++++++++++++++++++ 1");
- PreparedStatement pStmt = connection.prepareStatement(
- "update links set processed = ? where href = ?"
- );
- //System.out.println("debug +++++++++++++++++++++++++ 2");
- pStmt.setBoolean(1, true);
- pStmt.setString(2, link);
- //System.out.println("debug +++++++++++++++++++++++++ 3" +pStmt.toString());
- pStmt.execute();
- // System.out.println("link turned true in db ================================== " + link);
- System.out.println("The query was: "+pStmt.toString());
- if(pStmt != null) {pStmt.close();}
- return true;
- // System.out.println("debug +++++++++++++++++++++++++ 4");
- } catch (SQLException ex) {
- // System.out.println(ex.toString());
- // ex.printStackTrace();
- new Logging().logError(ex.toString());
- } finally{
- if (connection != null) {
- try {
- connection.close();
- } catch (SQLException ex) {
- new Logging().logError(ex.toString());
- }
- }
- }
- } else {
- new Logging().logError("Failed to make db connection");
- }
- return false;
- }
- public String deleteLink(String link){
- try {
- Class.forName("org.postgresql.Driver");
- } catch (ClassNotFoundException e) {
- new Logging().logError(e.toString());
- }
- Connection connection = null;
- try {
- connection = DriverManager.getConnection(
- "jdbc:postgresql://127.0.0.1:5432/"+DBName, DBUser,
- DBPass);
- } catch (SQLException e) {
- new Logging().logError(e.toString());
- }
- if (connection != null) {
- try {
- PreparedStatement pStmt = connection.prepareStatement(
- "delete from links where href = ?"
- );
- pStmt.setString(1, link);
- pStmt.execute();
- if(pStmt != null) {pStmt.close();}
- //System.out.println("deleted +++++++++++++++++++++++++ " + link);
- } catch (SQLException ex) {
- // System.out.println(ex.toString());
- // ex.printStackTrace();
- new Logging().logError(ex.toString());
- } finally{
- if (connection != null) {
- try {
- connection.close();
- } catch (SQLException ex) {
- new Logging().logError(ex.toString());
- }
- }
- }
- } else {
- new Logging().logError("Failed to make db connection");
- }
- return null;
- }
- }
- ==========================
- /*
- * To change this template, choose Tools | Templates
- * and open the template in the editor.
- */
- package crawler;
- import java.io.BufferedWriter;
- import java.io.File;
- import java.io.FileWriter;
- import java.io.IOException;
/**
 * Minimal error log that appends messages to a text file.
 *
 * @author syncsys
 */
public class Logging {
    /** File the messages are appended to; relative to the working directory by default. */
    volatile static File errorFile = new File("errors.txt");

    /**
     * Appends one message to the error file, followed by {@code "\r\n"} and the platform line
     * separator.
     *
     * <p>The method is synchronized because it is called from several crawler threads that
     * all append to the same file. If the file cannot be written, the problem and the
     * original message are printed to standard error; the method never throws and never
     * calls itself.
     *
     * @param error text to record
     */
    public static synchronized void logError(String error) {
        BufferedWriter bw = null;
        try {
            bw = new BufferedWriter(new FileWriter(errorFile, true));
            bw.write(error + "\r\n");
            // extra separator so entries stay visually apart in the file
            bw.newLine();
        } catch (IOException e) {
            // Logging this failure through logError would hit the same unwritable file and
            // recurse until the stack overflows, so fall back to stderr.
            System.err.println("Could not write to " + errorFile + ": " + e);
            System.err.println("Unlogged error: " + error);
        } finally {
            if (bw != null) {
                try {
                    // close() also flushes the buffered text
                    bw.close();
                } catch (IOException e) {
                    System.err.println("Could not close " + errorFile + ": " + e);
                }
            }
        }
    }
}
- ==========================
Advertisement
Add Comment
Please, Sign In to add comment