Index154

JSoup SCP title fetcher

May 31st, 2020 (edited)
116
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 7.26 KB | None | 0 0
  1. //This is a simple program in Java using JSoup to grab a list of text from the SCP Wiki website and save it to a local text file
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.Scanner;

import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
  12.  
  13. public class JSoupSCP {
  14.    
  15.     //Create important global variables
  16.     static int maxSCP = 5010; //This value determines the final SCP we write to our file at the end. Any titles that come after it are discarded
  17.     static int listSize = (int) (1000 * (1 + Math.floor(maxSCP / 1000))); //Used to determine the length of our required arrays based on maxSCP
  18.     static int bigSize = 0; //bigSize denotes the point at which the method getSCPs last left off so the individual results from each call
  19.                             //can be combined without any gaps or overlap. Starts at 0 for the first execution
  20.     static String[] bigList = new String[listSize]; //bigList is the combined list of all titles we fetch from each getSCPs method call
  21.     static String[] oldbigList = new String[listSize]; //oldbigList ist the same as bigList but for the results from the latest saved txt file
  22.    
  23.     public static void main(String[] args) throws IOException {
  24.        
  25.         System.out.println("Number of articles to fetch: " + listSize);
  26.        
  27.         //Set the path for our resulting text-file
  28.         String path = "E:\\User\\Downloads\\SCP titles.txt";
  29.        
  30.         //getSCPs is designed to fetch titles from only one page, so we call it once for every series of SCPs, passing the number of SCPs to fetch and
  31.         //also the page number (empty for the first series)
  32.         getSCPs("", 999);
  33.         getSCPs("-2", 1000);
  34.         getSCPs("-3", 1000);
  35.         getSCPs("-4", 1000);
  36.         getSCPs("-5", 1000);
  37.         getSCPs("-6", 1000);
  38.        
  39.         System.out.println("");
  40.        
  41.         //Check if the file in our specified path already exists. If it does, read the file and compare its contents to our bigList
  42.         File txtFile = new File(path);
  43.         if(txtFile.exists()) {
  44.             //readFromFile reads the contents of the existing txt file which will be overwritten at the end of the program
  45.             readFromFile(path);
  46.             //compareLists compares the data from the file to the newly fetched data and points out any changes
  47.             compareLists();
  48.         }
  49.        
  50.         //saveToFile saves the fetched titles into a local txt file for further use, the path of which is specified here
  51.         saveToFile(path);
  52.        
  53.     }
  54.    
  55.     //getSCPs is the most important part of the program because it finds and organizes the data we want from the internet
  56.     public static void getSCPs(String page, int length) throws IOException {
  57.        
  58.         System.out.println("Fetching data from http://www.scp-wiki.net/scp-series" + page + "...");
  59.        
  60.         //First fetch text from the SCP Wiki, the targeted page being determined by the value passed from the main when calling the method
  61.         //This code is copied from an online post to avoid error code 500 which suddenly started appearing
  62.         org.jsoup.Connection con = Jsoup.connect("http://www.scp-wiki.net/scp-series" + page).userAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21");
  63.         con.timeout(180000).ignoreHttpErrors(true).followRedirects(true);
  64.         Response resp = con.execute();
  65.         Document doc = null;
  66.         if (resp.statusCode() == 200) {
  67.             doc = con.get();
  68.         }
  69.        
  70.         //This is the way I used to do it. It stopped working consistently for some reason
  71.         //Document doc = Jsoup.connect("http://www.scp-wiki.net/scp-series" + page).get();
  72.        
  73.         //Only use text from the html class "content-panel standalone series"
  74.         Elements tags = doc.getElementsByClass("content-panel standalone series");
  75.         //Only fetch elements with the tag "ul" (unordered list) within this class. These contain the individual SCPs as list items (tagged as "li")
  76.         tags = tags.get(0).getElementsByTag("ul");
  77.        
  78.         //Create a new array with the length we need for our titles (passed over from the main)
  79.         String[] list = new String[length];
  80.        
  81.         //Now pass over the "li" elements from the elements array "tags" to the new array "list"
  82.         for(int i = 0, totalSize = 0; i + 1 < tags.size(); i++) {
  83.             Elements tags2 = tags.get(i + 1).getElementsByTag("li");
  84.             int size = tags2.size();
  85.             //Each "ul" tag contains 100 SCPs (99 for the first batch of series 1) in the form of "li" elements. This is why we need two loops:
  86.             //The main loop goes through each "ul" element from "tags" and the second loop saves all the "li" elements within into our new array
  87.             for (int a = 0; a < size; a++) {
  88.                 list[a + totalSize] = tags2.get(a).text() + ";;"; //We also add ";;" to the end of each line for easier parsing later
  89.             }
  90.                    
  91.             //Update "totalSize" with the amount of list elements we extracted in the current loop so the next loop can continue where this one left off
  92.             totalSize = totalSize + size;
  93.         }
  94.        
  95.         System.out.println("Cleaning up...");
  96.        
  97.         //This loop here is used to remove excessive or problematic characters and lines so our end result is consistent and only contains what we want
  98.         //It also saves any lines from the list array that aren't faulty into our final array bigList
  99.         for(int i = 0; i < list.length; i++) {
  100.             if(list[i] != null) {
  101.                 if(list[i].contains(" - ")) {
  102.                     list[i] = list[i].substring(list[i].indexOf(" - ")+1);
  103.                     if(list[i].startsWith("- ")) {
  104.                         list[i] = list[i].substring(2);
  105.                     }
  106.                     list[i] = list[i].replace("\"", "'");
  107.                 }
  108.                 bigList[i + bigSize] = list[i];
  109.             }
  110.            
  111.         }
  112.        
  113.         //Update the bigSize variable so the program knows which array index we stopped at
  114.         //The next time we call getSCPs it will save new elements to indexes past this point
  115.         bigSize = bigSize + list.length;
  116.        
  117.     }
  118.    
  119.     //For further use of the data we gathered we write it into a txt file
  120.     public static void saveToFile(String path) throws IOException {
  121.        
  122.         //Initiate the FileWriter with the designated file path from our main
  123.         FileWriter f = new FileWriter(path);
  124.         BufferedWriter bW = new BufferedWriter(f);
  125.        
  126.         System.out.println("Saving data to file...");
  127.        
  128.         //Go through our bigList array and write down all lines until we hit the maxSCP number
  129.         for(int i = 0; i < maxSCP; i++) {
  130.             bW.write(bigList[i]);
  131.             bW.newLine();
  132.         }
  133.        
  134.         bW.close();
  135.        
  136.         System.out.println("File overwritten successfully!");
  137.        
  138.     }
  139.    
  140.     //Get the lines from the existing txt file and save them into the array oldBigList
  141.     public static void readFromFile(String path) throws IOException {
  142.        
  143.         //Start the scanner with our specified path from the main
  144.         File f = new File(path);
  145.         Scanner s = new Scanner(f);
  146.        
  147.         System.out.println("Reading file content...");
  148.        
  149.         //Go through the lines and save them into the array
  150.         for(int i = 0; s.hasNext(); i++) {
  151.             oldbigList[i] = s.nextLine();
  152.         }
  153.        
  154.         s.close();
  155.        
  156.     }
  157.    
  158.     //Compare bigList to oldBigList and point out any differences
  159.     public static void compareLists() {
  160.        
  161.         System.out.println("Comparing new data to old file...");
  162.        
  163.         //Compare the elements of both arrays in order and give out an alert, if an entry's title has changed
  164.         for(int i = 0; i < maxSCP; i++) {
  165.             if(bigList[i].equals(oldbigList[i]) ) {}
  166.             else {
  167.                 System.out.println("Changes to SCP " + (i + 1) + " detected!");
  168.             }
  169.         }
  170.        
  171.         System.out.println("");
  172.        
  173.     }
  174.    
  175. }
Add Comment
Please, Sign In to add comment