Advertisement
Guest User

The Pioneer Scraper

a guest
Feb 20th, 2020
320
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.08 KB | None | 0 0
  1. package thepioneer;
  2.  
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
  14.  
  15. public class Scraper {
  16.  
  17. // true = uploads to database
  18. // false = Creates files in directory
  19. static boolean upload_mysql = true;
  20. static int startYear = 2011;
  21. static int endYear = 2020;
  22. static int startMonth = 1;
  23. static int endMonth = 12;
  24. // DO NOT CHANGE
  25. static Connection conn = null;
  26. static Statement stmt = null;
  27. static int lastPage;
  28.  
  29. // Information for MYSQL Server
  30. // DO NOT CHANGE
  31. static final String JDBC_DRIVER = "com.mysql.jdbc.Driver";
  32. // CHANGE ME
  33. static final String ip = "localhost";
  34. static final String port = "3306";
  35. static final String database_name = "thepioneer";
  36. static final String username = "root";
  37. static final String password = "";
  38.  
  39. public static void main(String[] args) throws SQLException {
  40.  
  41. try {
  42. scrape();
  43. } catch (IOException e) {
  44. e.printStackTrace();
  45. }
  46.  
  47. }
  48.  
  49. public static void scrape() throws IOException, SQLException {
  50. ArrayList<String> al = new ArrayList<String>();
  51.  
  52. int counter = 1;
  53.  
  54. // int startYear = sY;
  55. // int endYear = eY;
  56.  
  57. if (upload_mysql) {
  58. createDatabase();
  59. createTables();
  60. }
  61.  
  62. for (int i = startYear; i <= endYear; i++) {
  63. for (int j = startMonth; j <= endMonth; j++) {
  64. Document doc = Jsoup
  65. .connect("https://www.dailypioneer.com/searchlist.php?yr=" + i + "&mn=" + j + "&page=").get();
  66. try {
  67. Element last = doc.select("div.pagingList").select("ul").select("li").last();
  68. Elements lastPg = last.select("a");
  69.  
  70. // System.out.println(last);
  71.  
  72. String lastInt = lastPg.attr("id");
  73.  
  74. setLastPage(Integer.parseInt(lastInt));
  75. } catch (Exception e) {
  76.  
  77. }
  78.  
  79.  
  80. for (int k = 1; k < lastPage; k++) {
  81.  
  82. System.out.println("Year: " + i + " Month: " + j + " Pages: " + k);
  83.  
  84. // System.out.println(lastPageNum);
  85.  
  86. // System.out.println("Year: " + i + " Month: " + j);
  87. // System.out.println("https://www.dailypioneer.com/searchlist.php?yr=" + i +
  88. // "&mn=" + j + "&page=");
  89.  
  90. doc = Jsoup
  91. .connect("https://www.dailypioneer.com/searchlist.php?yr=" + i + "&mn=" + j + "&page=" + k)
  92. .get();
  93.  
  94. Elements highLightedNews = doc.select("div.highLightedNews").select("ul.list-unstyled").select("li")
  95. .select("a");
  96. Elements innerNewsList = doc.select("div.innerNewsList").select("a");
  97.  
  98. for (Element e : highLightedNews) {
  99. if (!e.attr("abs:href").contains("author"))
  100. al.add(e.attr("abs:href"));
  101. }
  102.  
  103. for (Element e : innerNewsList) {
  104. if (!e.attr("abs:href").contains("author"))
  105. al.add(e.attr("abs:href"));
  106. }
  107.  
  108. System.out.println(al.size());
  109.  
  110. for (String s : al) {
  111. System.out.println(s);
  112. doc = Jsoup.connect(s).get();
  113. //String body = doc.select("div.newsDetailedContent").text();
  114. //System.out.println(body);
  115.  
  116. if (upload_mysql == true) {
  117.  
  118. // Uploads articles to database
  119. Connection myConn = DriverManager
  120. .getConnection("jdbc:mysql://localhost:3306/" + database_name, username, password);
  121.  
  122. Statement myStmt = myConn.createStatement();
  123.  
  124. String newTitle = doc.title().replace("'", "''");
  125.  
  126. String newURL = s.replace("'", "''");
  127.  
  128. String site = "dailypioneer.com";
  129.  
  130. String newsInfo = doc.select("div.newsInfo").first().text();
  131.  
  132. String author = newsInfo.substring(newsInfo.indexOf('|') + 2, newsInfo.length());
  133. String newAuthor = author.replace("'", "''");
  134.  
  135. String date = newsInfo.substring(0, newsInfo.indexOf('|'));
  136. String newDate = date.replace("'", "''");
  137.  
  138. // String country = "India";
  139.  
  140. // String newCity = city.replace("'", "''");
  141.  
  142. String body = doc.select("div.newsDetailedContent").text();
  143. //System.out.println(body);
  144. String newContent = body.replace("'", "''");
  145.  
  146. String sql = "insert into `" + i + "` "
  147. + " (id, Title, URL, Site, Author, Date, Content)" + " values ('"
  148. + counter + "', '" + newTitle + "', '" + newURL + "', '" + site + "', '" + newAuthor
  149. + "', '" + newDate + "', '" + newContent + "')";
  150.  
  151. try {
  152. myStmt.executeUpdate(sql);
  153. } catch (SQLException sqlEx) {
  154. System.out.println("Error uploading article: " + counter);
  155. } finally{
  156. /*This block should be added to your code
  157. * You need to release the resources like connections
  158. */
  159. if(conn!=null)
  160. conn.close();
  161. }
  162.  
  163. System.out.println("Article " + counter + " Succesfully Uploaded!");
  164. counter++;
  165. }
  166. }
  167.  
  168. al.clear();
  169.  
  170. }
  171. }
  172. }
  173. }
  174.  
  175. public static void setLastPage(int x) {
  176. lastPage = x;
  177. }
  178.  
  179. /**
  180. * Function: Creates database
  181. **/
  182. public static void createDatabase() {
  183. String DB_URL = "jdbc:mysql://" + ip + ":" + port + "/";
  184.  
  185. try {
  186. // STEP 2: Register JDBC driver
  187. Class.forName("com.mysql.jdbc.Driver");
  188.  
  189. // STEP 3: Open a connection
  190. System.out.println("Connecting to database...");
  191. conn = DriverManager.getConnection(DB_URL, username, password);
  192.  
  193. // STEP 4: Execute a query
  194. System.out.println("Creating database...");
  195. stmt = conn.createStatement();
  196.  
  197. String sql = "CREATE DATABASE IF NOT EXISTS " + database_name.toUpperCase();
  198. stmt.executeUpdate(sql);
  199. System.out.println("Database created successfully...");
  200. } catch (SQLException se) {
  201. // Handle errors for JDBC
  202. se.printStackTrace();
  203. } catch (Exception ex) {
  204. // Handle errors for Class.forName
  205. ex.printStackTrace();
  206. }
  207. }
  208.  
  209. /**
  210. * Function: Creates tables in database
  211. **/
  212. /**
  213. * Function: Creates tables in database
  214. **/
  215. public static void createTables() {
  216.  
  217. String DB_URL = "jdbc:mysql://" + ip + ":" + port + "/" + database_name;
  218.  
  219. try {
  220. // STEP 2: Register JDBC driver
  221. Class.forName("com.mysql.jdbc.Driver");
  222.  
  223. // STEP 3: Open a connection
  224. System.out.println("Connecting to database...");
  225. conn = DriverManager.getConnection(DB_URL, username, password);
  226.  
  227. for (int i = startYear; i <= endYear; i++) {
  228.  
  229. // STEP 4: Execute a query
  230. System.out.println("Creating table in given database...");
  231. stmt = conn.createStatement();
  232.  
  233. String sql = "CREATE TABLE IF NOT EXISTS `" + Integer.toString(i).toUpperCase() + "` "
  234. + "(`id` int(11) DEFAULT NULL," + "`Title` longtext DEFAULT NULL,"
  235. + "`URL` longtext DEFAULT NULL," + "`Site` text DEFAULT NULL," + "`Author` text DEFAULT NULL,"
  236. + "`Date` text DEFAULT NULL," + "`Content` longtext DEFAULT NULL)";
  237.  
  238. stmt.executeUpdate(sql);
  239. System.out.println("Created table in given database...");
  240.  
  241. }
  242. } catch (SQLException se) {
  243. // Handle errors for JDBC
  244. se.printStackTrace();
  245. } catch (Exception ex) {
  246. // Handle errors for Class.forName
  247. ex.printStackTrace();
  248. }
  249.  
  250. }
  251.  
  252. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement