Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.io.FileInputStream;
- import java.sql.Connection;
- import java.sql.DriverManager;
- import java.sql.PreparedStatement;
- import java.sql.ResultSet;
- import java.sql.Statement;
- import java.text.SimpleDateFormat;
- import java.util.*;
- public class DB_Class
- {
- private String MySQL_IP = null;
- private String MySQL_PORT = null;
- private String MySQL_DATABASE = null;
- // private String MySQL_TABLE = null;
- private String MySQL_USERNAME = null;
- private String MySQL_PASSWORD = null;
- private Connection conn = null;
- private PreparedStatement Crawler_prestate = null;
- private PreparedStatement newsdata_prestate = null;
- private PreparedStatement crawlerlog_prestate = null;
- private PreparedStatement parsed_prestate = null;
- public DB_Class()
- {
- getDBConfig();
- }
- private void getDBConfig()
- {
- try
- {
- Properties props = new Properties();
- props.load(new FileInputStream("db_config.ini"));
- MySQL_IP = props.getProperty("MySQL_IP");
- MySQL_PORT = props.getProperty("MySQL_PORT");
- MySQL_DATABASE = props.getProperty("MySQL_DATABASE");
- // MySQL_TABLE = props.getProperty("MySQL_TABLE");
- MySQL_USERNAME = props.getProperty("MySQL_USERNAME");
- MySQL_PASSWORD = props.getProperty("MySQL_PASSWORD");
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void conn_DB()
- {
- try
- {
- String driver = "com.mysql.jdbc.Driver";
- String url = "jdbc:mysql://" + MySQL_IP + ":" + MySQL_PORT + "/" + MySQL_DATABASE + "?useUnicode=true&characterEncoding=UTF8";
- Class.forName(driver);
- conn = DriverManager.getConnection(url, MySQL_USERNAME, MySQL_PASSWORD);
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void Server_Initail_URL_list()
- {
- try
- {
- String initail_sql = "UPDATE url_list SET crawler_id=?, crawler_ip=?, cost_time=?, update_time=?";
- PreparedStatement prestate = conn.prepareStatement(initail_sql);
- prestate.setString(1, null);
- prestate.setString(2, null);
- prestate.setString(3, null);
- prestate.setString(4, null);
- prestate.executeUpdate();
- prestate.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void Server_Initail_Cralwer_Status()
- {
- try
- {
- String initail_sql = "DELETE FROM crawler_status";
- PreparedStatement prestate = conn.prepareStatement(initail_sql);
- prestate.executeUpdate();
- prestate.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public HashMap<String, String> Server_Select_Crawl_URL(String crawler_ip)
- {
- HashMap<String, String> return_map = new HashMap<String, String>();
- try
- {
- String select_sql = "SELECT id, site_id, sitename, type, url FROM url_list WHERE site_id NOT IN (SELECT DISTINCT site_id FROM url_list WHERE crawler_ip=?) AND crawler_ip IS null ORDER BY update_time LIMIT 1";
- PreparedStatement prestate = conn.prepareStatement(select_sql);
- prestate.setString(1, crawler_ip);
- ResultSet result = prestate.executeQuery();
- while(result.next())
- {
- return_map.put("id", result.getString("id"));
- return_map.put("site_id", result.getString("site_id"));
- return_map.put("sitename", result.getString("sitename"));
- return_map.put("type", result.getString("type"));
- return_map.put("url", result.getString("url"));
- }
- result.close();
- prestate.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- return return_map;
- }
- public void Server_Register_Update_URL_List_CrawlerIP(
- String id,
- String crawler_ip,
- String crawler_id)
- {
- try
- {
- String update_sql = "UPDATE url_list SET crawler_ip=?, crawler_id=? WHERE id=?";
- PreparedStatement prestate = conn.prepareStatement(update_sql);
- prestate.setString(1, crawler_ip);
- prestate.setString(2, crawler_id);
- prestate.setString(3, id);
- prestate.executeUpdate();
- prestate.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void Server_Register_Update_Crawler_Status(
- String crawler_id,
- String crawler_ip,
- String crawler_pid,
- String url)
- {
- try
- {
- Calendar rightNow = Calendar.getInstance();
- SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");//存入DB的時間格式
- String check_sql = "SELECT count(*) check_sql FROM crawler_status WHERE crawler_id=?";
- String insert_sql = "INSERT INTO crawler_status (crawler_id, crawler_ip, pid, url, last_update_time, status) VALUES(?, ?, ?, ?, ?, ?)";
- String update_sql = "UPDATE crawler_status SET last_update_time=?, url=?, status=? WHERE crawler_id=?";
- PreparedStatement prestate = conn.prepareStatement(check_sql);
- prestate.setString(1, crawler_id);
- ResultSet result = prestate.executeQuery();
- result.next();
- int check_string = result.getInt("check_sql");
- if(check_string == 0)
- {
- prestate = conn.prepareStatement(insert_sql);
- prestate.setString(1, crawler_id);
- prestate.setString(2, crawler_ip);
- prestate.setString(3, crawler_pid);
- prestate.setString(4, url);
- prestate.setString(5, fmt.format(rightNow.getTime()));
- prestate.setString(6, "green");
- }
- else
- {
- prestate = conn.prepareStatement(update_sql);
- prestate.setString(1, fmt.format(rightNow.getTime()));
- prestate.setString(2, url);
- prestate.setString(3, "green");
- prestate.setString(4, crawler_id);
- }
- result.close();
- prestate.executeUpdate();
- prestate.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void Server_Done_Update_URL_List_CrawlerIP(
- String crawler_ip,
- String crawler_id,
- String cost_time)
- {
- try
- {
- Calendar rightNow = Calendar.getInstance();
- SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");//存入DB的時間格式
- String update_sql = "UPDATE url_list SET crawler_ip=?, crawler_id=?, cost_time=?, update_time=? WHERE crawler_id=?";
- PreparedStatement prestate = conn.prepareStatement(update_sql);
- prestate.setString(1, null);
- prestate.setString(2, null);
- prestate.setString(3, cost_time);
- prestate.setString(4, fmt.format(rightNow.getTime()));
- prestate.setString(5, crawler_id);
- prestate.executeUpdate();
- prestate.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void Server_Update_Crawler_Status(String crawler_id)
- {
- try
- {
- Calendar rightNow = Calendar.getInstance();
- SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");//存入DB的時間格式
- String update_sql = "UPDATE crawler_status SET last_update_time=? WHERE crawler_id=?";
- PreparedStatement prestate = conn.prepareStatement(update_sql);
- prestate.setString(1, fmt.format(rightNow.getTime()));
- prestate.setString(2, crawler_id);
- prestate.executeUpdate();
- prestate.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void Server_Check_Crawler_Status()
- {
- try
- {
- Calendar rightNow = Calendar.getInstance();
- SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- rightNow.add(Calendar.HOUR_OF_DAY, -1);
- String check_sql = "UPDATE crawler_status SET status=? WHERE last_update_time<?";
- PreparedStatement prestate = conn.prepareStatement(check_sql);
- prestate.setString(1, "red");
- prestate.setString(2, fmt.format(rightNow.getTime()));
- prestate.executeUpdate();
- prestate.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void Server_Clear_Dead_Crawler()
- {
- try
- {
- String delete_sql = "UPDATE url_list SET crawler_id=?, crawler_ip=? WHERE crawler_id IN (SELECT crawler_id FROM crawler_status WHERE status=?)";
- PreparedStatement prestate = conn.prepareStatement(delete_sql);
- prestate.setString(1, null);
- prestate.setString(2, null);
- prestate.setString(3, "red");
- prestate.executeUpdate();
- prestate.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public ResultSet Cawler_SelectURL_List(String site_id)
- {
- ResultSet result = null;
- try
- {
- String check_sql = "SELECT url, sitename, type FROM url_list WHERE site_id=?";
- PreparedStatement prestate = conn.prepareStatement(check_sql);
- prestate.setString(1, site_id);
- result = prestate.executeQuery();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- return result;
- }
- public String Cawler_Check_Newsdata_HTML(String keyURL)
- {
- String check_string = "";
- try
- {
- String check_sql = "";
- check_sql = "SELECT count(*) check_news FROM newsdata_html WHERE url='" + keyURL + "'";
- Statement stmt = conn.createStatement();
- ResultSet result = stmt.executeQuery(check_sql);
- while(result.next())
- check_string = result.getString("check_news");
- result.close();
- stmt.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- return check_string;
- }
- public String Cawler_Check_Newsdata(String keyURL)
- {
- String check_string = "";
- try
- {
- String check_sql = "";
- check_sql = "SELECT count(*) check_news FROM newsdata WHERE url='" + keyURL + "'";
- Statement stmt = conn.createStatement();
- ResultSet result = stmt.executeQuery(check_sql);
- while(result.next())
- check_string = result.getString("check_news");
- result.close();
- stmt.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- return check_string;
- }
- public void Crawler_SetUpdate_preStmt()
- {
- try
- {
- String update_newsdata_html_sql = "INSERT INTO newsdata_html (site_id, parsed, url, sitename, title, author, type, rawtime, html) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)";
- Crawler_prestate = conn.prepareStatement(update_newsdata_html_sql);
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void Crawler_AddBatch(String site_id, Page_Element PE)
- {
- try
- {
- if(Cawler_Check_Newsdata_HTML(PE.getURL()).equals("0") && Cawler_Check_Newsdata(PE.getURL()).equals("0"))//再次判斷
- {
- Crawler_prestate.setString(1, site_id);
- Crawler_prestate.setString(2, "No");
- Crawler_prestate.setString(3, PE.getURL());
- Crawler_prestate.setString(4, PE.getSitename());
- Crawler_prestate.setString(5, PE.getTitle());
- Crawler_prestate.setString(6, PE.getAuthor());
- Crawler_prestate.setString(7, PE.getType());
- Crawler_prestate.setString(8, PE.getRawTime());
- Crawler_prestate.setString(9, PE.getHTML());
- Crawler_prestate.addBatch();
- }
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void Crawler_ExecuteBatch()
- {
- try
- {
- Crawler_prestate.executeBatch();
- Crawler_prestate.clearBatch();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public String Parser_Local_SelectLastID(String crawler_id)
- {
- String return_string = "0";
- try
- {
- String select_sql = "SELECT id FROM newsdata_html WHERE site_id=? and parsed='No' order by id DESC limit 1";
- PreparedStatement prestate = conn.prepareStatement(select_sql);
- prestate.setString(1, crawler_id);
- ResultSet result = prestate.executeQuery();
- while(result.next())
- return_string = result.getString("id");
- prestate.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- return return_string;
- }
- public ResultSet Parser_Local_SelectHTML(String crawler_id, String html_id)
- {
- ResultSet result = null;
- try
- {
- String select_sql = "SELECT * FROM newsdata_html WHERE site_id=? and id>? and parsed='No' limit 10000";
- PreparedStatement prestate = conn.prepareStatement(select_sql);
- prestate.setString(1, crawler_id);
- prestate.setString(2, html_id);
- result = prestate.executeQuery();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- return result;
- }
- public String Parser_Server_SelectLastID()
- {
- String return_string = "0";
- try
- {
- String select_sql = "SELECT id FROM newsdata_html WHERE parsed='No' order by id DESC limit 1";
- PreparedStatement prestate = conn.prepareStatement(select_sql);
- ResultSet result = prestate.executeQuery();
- while(result.next())
- return_string = result.getString("id");
- prestate.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- return return_string;
- }
- public ResultSet Parser_Server_SelectHTML(String html_id)
- {
- ResultSet result = null;
- try
- {
- String select_sql = "SELECT * FROM newsdata_html WHERE id>? and parsed='No' limit 10000";
- // String select_sql = "SELECT * FROM newsdata_html WHERE id>? limit 10000";
- PreparedStatement prestate = conn.prepareStatement(select_sql);
- prestate.setString(1, html_id);
- result = prestate.executeQuery();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- return result;
- }
- public String Parser_CheckNews(String keyURL)
- {
- String check_string = "";
- try
- {
- String check_sql = "";
- check_sql = "SELECT count(*) check_news FROM newsdata WHERE url=?";
- PreparedStatement prestate = conn.prepareStatement(check_sql);
- prestate.setString(1, keyURL);
- ResultSet result = prestate.executeQuery();
- while(result.next())
- check_string = result.getString("check_news");
- result.close();
- prestate.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- return check_string;
- }
- public void Parser_SetUpdate_preStmt()
- {
- try
- {
- String update_newsdata_sql = "INSERT INTO newsdata (url, title, time, rawtime, content, sitename, author, crawltime, type, html) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
- String update_crawler_lost_log_sql = "INSERT INTO crawler_lost_log (sitename, url, time, lost, black_list) VALUES(?, ?, ?, ?, ?)";
- String parsed_sql = "UPDATE newsdata_html SET parsed='Yes' WHERE id=?";
- newsdata_prestate = conn.prepareStatement(update_newsdata_sql);
- crawlerlog_prestate = conn.prepareStatement(update_crawler_lost_log_sql);
- parsed_prestate = conn.prepareStatement(parsed_sql);
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void Parser_AddBatch(Page_Element PE, Get_Crawl_Pattern GCP, Time_Process_Class TPC)
- {
- try
- {
- if(Parser_CheckNews(PE.getURL()).equals("0"))//再次判斷
- {
- StringBuilder sb = new StringBuilder();
- newsdata_prestate.setString(1, PE.getURL());
- newsdata_prestate.setString(2, PE.getTitle());
- if(PE.getTime() == null)
- newsdata_prestate.setString(3, PE.getCrawlTime());
- else
- newsdata_prestate.setString(3, TPC.time(PE.getTime(), GCP.getTimeZone()));
- newsdata_prestate.setString(4, PE.getRawTime());
- newsdata_prestate.setString(5, PE.getContent());
- newsdata_prestate.setString(6, PE.getSitename());
- if(PE.getAuthor() == null)
- newsdata_prestate.setString(7, PE.getSitename());
- else
- newsdata_prestate.setString(7, PE.getAuthor());
- newsdata_prestate.setString(8, PE.getCrawlTime());
- newsdata_prestate.setString(9, PE.getType());
- newsdata_prestate.setString(10, PE.getHTML());
- newsdata_prestate.addBatch();
- if(PE.getTitle() == null)
- sb.append("title ");
- if(PE.getRawTime() == null)
- sb.append("time ");
- if(PE.getContent() == null)
- sb.append("content ");
- if(PE.getAuthor() == null)
- sb.append("author ");
- if(PE.getType() == null)
- sb.append("type ");
- if(sb.length() > 0)
- {
- String check_lostLog = Check_LostLog(PE.getURL());
- if(check_lostLog.equals("0"))
- {
- crawlerlog_prestate.setString(1, PE.getSitename());
- crawlerlog_prestate.setString(2, PE.getURL());
- crawlerlog_prestate.setString(3, PE.getCrawlTime());
- crawlerlog_prestate.setString(4, sb.toString());
- crawlerlog_prestate.setString(5, "0");
- crawlerlog_prestate.addBatch();
- }
- }
- parsed_prestate.setString(1, PE.getID());
- parsed_prestate.addBatch();
- }
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void Parser_ExecuteBatch()
- {
- try
- {
- newsdata_prestate.executeBatch();
- crawlerlog_prestate.executeBatch();
- parsed_prestate.executeBatch();
- newsdata_prestate.clearBatch();
- crawlerlog_prestate.clearBatch();
- parsed_prestate.clearBatch();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public String Check_LostLog(String URL)
- {
- String check_string = "";
- try
- {
- String check_sql = "SELECT count(*) check_lost FROM crawler_lost_log WHERE url='" + URL + "'";
- Statement stmt = conn.createStatement();
- ResultSet result = stmt.executeQuery(check_sql);
- while(result.next())
- check_string = result.getString("check_lost");
- result.close();
- stmt.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- return check_string;
- }
- public ResultSet getCrawl_Pattern(String site_id)
- {
- ResultSet result = null;
- try
- {
- String select_sql = "SELECT * FROM site_pattern WHERE site_id=?";
- PreparedStatement prestate = conn.prepareStatement(select_sql);
- prestate.setString(1, site_id);
- result = prestate.executeQuery();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- return result;
- }
- public void close_DB()//
- {
- try
- {
- conn.close();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- }
Add Comment
Please, Sign In to add comment