Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
package thepioneer;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
- public class Scraper {
- // true = uploads to database
- // false = Creates files in directory
- static boolean upload_mysql = true;
- static int startYear = 2011;
- static int endYear = 2020;
- static int startMonth = 1;
- static int endMonth = 12;
- // DO NOT CHANGE
- static Connection conn = null;
- static Statement stmt = null;
- static int lastPage;
- // Information for MYSQL Server
- // DO NOT CHANGE
- static final String JDBC_DRIVER = "com.mysql.jdbc.Driver";
- // CHANGE ME
- static final String ip = "localhost";
- static final String port = "3306";
- static final String database_name = "thepioneer";
- static final String username = "root";
- static final String password = "";
- public static void main(String[] args) throws SQLException {
- try {
- scrape();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public static void scrape() throws IOException, SQLException {
- ArrayList<String> al = new ArrayList<String>();
- int counter = 1;
- // int startYear = sY;
- // int endYear = eY;
- if (upload_mysql) {
- createDatabase();
- createTables();
- }
- for (int i = startYear; i <= endYear; i++) {
- for (int j = startMonth; j <= endMonth; j++) {
- Document doc = Jsoup
- .connect("https://www.dailypioneer.com/searchlist.php?yr=" + i + "&mn=" + j + "&page=").get();
- try {
- Element last = doc.select("div.pagingList").select("ul").select("li").last();
- Elements lastPg = last.select("a");
- // System.out.println(last);
- String lastInt = lastPg.attr("id");
- setLastPage(Integer.parseInt(lastInt));
- } catch (Exception e) {
- }
- for (int k = 1; k < lastPage; k++) {
- System.out.println("Year: " + i + " Month: " + j + " Pages: " + k);
- // System.out.println(lastPageNum);
- // System.out.println("Year: " + i + " Month: " + j);
- // System.out.println("https://www.dailypioneer.com/searchlist.php?yr=" + i +
- // "&mn=" + j + "&page=");
- doc = Jsoup
- .connect("https://www.dailypioneer.com/searchlist.php?yr=" + i + "&mn=" + j + "&page=" + k)
- .get();
- Elements highLightedNews = doc.select("div.highLightedNews").select("ul.list-unstyled").select("li")
- .select("a");
- Elements innerNewsList = doc.select("div.innerNewsList").select("a");
- for (Element e : highLightedNews) {
- if (!e.attr("abs:href").contains("author"))
- al.add(e.attr("abs:href"));
- }
- for (Element e : innerNewsList) {
- if (!e.attr("abs:href").contains("author"))
- al.add(e.attr("abs:href"));
- }
- System.out.println(al.size());
- for (String s : al) {
- System.out.println(s);
- doc = Jsoup.connect(s).get();
- //String body = doc.select("div.newsDetailedContent").text();
- //System.out.println(body);
- if (upload_mysql == true) {
- // Uploads articles to database
- Connection myConn = DriverManager
- .getConnection("jdbc:mysql://localhost:3306/" + database_name, username, password);
- Statement myStmt = myConn.createStatement();
- String newTitle = doc.title().replace("'", "''");
- String newURL = s.replace("'", "''");
- String site = "dailypioneer.com";
- String newsInfo = doc.select("div.newsInfo").first().text();
- String author = newsInfo.substring(newsInfo.indexOf('|') + 2, newsInfo.length());
- String newAuthor = author.replace("'", "''");
- String date = newsInfo.substring(0, newsInfo.indexOf('|'));
- String newDate = date.replace("'", "''");
- // String country = "India";
- // String newCity = city.replace("'", "''");
- String body = doc.select("div.newsDetailedContent").text();
- //System.out.println(body);
- String newContent = body.replace("'", "''");
- String sql = "insert into `" + i + "` "
- + " (id, Title, URL, Site, Author, Date, Content)" + " values ('"
- + counter + "', '" + newTitle + "', '" + newURL + "', '" + site + "', '" + newAuthor
- + "', '" + newDate + "', '" + newContent + "')";
- try {
- myStmt.executeUpdate(sql);
- } catch (SQLException sqlEx) {
- System.out.println("Error uploading article: " + counter);
- } finally{
- /*This block should be added to your code
- * You need to release the resources like connections
- */
- if(conn!=null)
- conn.close();
- }
- System.out.println("Article " + counter + " Succesfully Uploaded!");
- counter++;
- }
- }
- al.clear();
- }
- }
- }
- }
- public static void setLastPage(int x) {
- lastPage = x;
- }
- /**
- * Function: Creates database
- **/
- public static void createDatabase() {
- String DB_URL = "jdbc:mysql://" + ip + ":" + port + "/";
- try {
- // STEP 2: Register JDBC driver
- Class.forName("com.mysql.jdbc.Driver");
- // STEP 3: Open a connection
- System.out.println("Connecting to database...");
- conn = DriverManager.getConnection(DB_URL, username, password);
- // STEP 4: Execute a query
- System.out.println("Creating database...");
- stmt = conn.createStatement();
- String sql = "CREATE DATABASE IF NOT EXISTS " + database_name.toUpperCase();
- stmt.executeUpdate(sql);
- System.out.println("Database created successfully...");
- } catch (SQLException se) {
- // Handle errors for JDBC
- se.printStackTrace();
- } catch (Exception ex) {
- // Handle errors for Class.forName
- ex.printStackTrace();
- }
- }
- /**
- * Function: Creates tables in database
- **/
- /**
- * Function: Creates tables in database
- **/
- public static void createTables() {
- String DB_URL = "jdbc:mysql://" + ip + ":" + port + "/" + database_name;
- try {
- // STEP 2: Register JDBC driver
- Class.forName("com.mysql.jdbc.Driver");
- // STEP 3: Open a connection
- System.out.println("Connecting to database...");
- conn = DriverManager.getConnection(DB_URL, username, password);
- for (int i = startYear; i <= endYear; i++) {
- // STEP 4: Execute a query
- System.out.println("Creating table in given database...");
- stmt = conn.createStatement();
- String sql = "CREATE TABLE IF NOT EXISTS `" + Integer.toString(i).toUpperCase() + "` "
- + "(`id` int(11) DEFAULT NULL," + "`Title` longtext DEFAULT NULL,"
- + "`URL` longtext DEFAULT NULL," + "`Site` text DEFAULT NULL," + "`Author` text DEFAULT NULL,"
- + "`Date` text DEFAULT NULL," + "`Content` longtext DEFAULT NULL)";
- stmt.executeUpdate(sql);
- System.out.println("Created table in given database...");
- }
- } catch (SQLException se) {
- // Handle errors for JDBC
- se.printStackTrace();
- } catch (Exception ex) {
- // Handle errors for Class.forName
- ex.printStackTrace();
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement