Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// A simple Java program that uses jsoup to fetch the list of SCP article titles from the SCP Wiki and save them to a local text file.
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Scanner;

import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
- public class JSoupSCP {
- //Create important global variables
- static int maxSCP = 5010; //This value determines the final SCP we write to our file at the end. Any titles that come after it are discarded
- static int listSize = (int) (1000 * (1 + Math.floor(maxSCP / 1000))); //Used to determine the length of our required arrays based on maxSCP
- static int bigSize = 0; //bigSize denotes the point at which the method getSCPs last left off so the individual results from each call
- //can be combined without any gaps or overlap. Starts at 0 for the first execution
- static String[] bigList = new String[listSize]; //bigList is the combined list of all titles we fetch from each getSCPs method call
- static String[] oldbigList = new String[listSize]; //oldbigList ist the same as bigList but for the results from the latest saved txt file
- public static void main(String[] args) throws IOException {
- System.out.println("Number of articles to fetch: " + listSize);
- //Set the path for our resulting text-file
- String path = "E:\\User\\Downloads\\SCP titles.txt";
- //getSCPs is designed to fetch titles from only one page, so we call it once for every series of SCPs, passing the number of SCPs to fetch and
- //also the page number (empty for the first series)
- getSCPs("", 999);
- getSCPs("-2", 1000);
- getSCPs("-3", 1000);
- getSCPs("-4", 1000);
- getSCPs("-5", 1000);
- getSCPs("-6", 1000);
- System.out.println("");
- //Check if the file in our specified path already exists. If it does, read the file and compare its contents to our bigList
- File txtFile = new File(path);
- if(txtFile.exists()) {
- //readFromFile reads the contents of the existing txt file which will be overwritten at the end of the program
- readFromFile(path);
- //compareLists compares the data from the file to the newly fetched data and points out any changes
- compareLists();
- }
- //saveToFile saves the fetched titles into a local txt file for further use, the path of which is specified here
- saveToFile(path);
- }
- //getSCPs is the most important part of the program because it finds and organizes the data we want from the internet
- public static void getSCPs(String page, int length) throws IOException {
- System.out.println("Fetching data from http://www.scp-wiki.net/scp-series" + page + "...");
- //First fetch text from the SCP Wiki, the targeted page being determined by the value passed from the main when calling the method
- //This code is copied from an online post to avoid error code 500 which suddenly started appearing
- org.jsoup.Connection con = Jsoup.connect("http://www.scp-wiki.net/scp-series" + page).userAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21");
- con.timeout(180000).ignoreHttpErrors(true).followRedirects(true);
- Response resp = con.execute();
- Document doc = null;
- if (resp.statusCode() == 200) {
- doc = con.get();
- }
- //This is the way I used to do it. It stopped working consistently for some reason
- //Document doc = Jsoup.connect("http://www.scp-wiki.net/scp-series" + page).get();
- //Only use text from the html class "content-panel standalone series"
- Elements tags = doc.getElementsByClass("content-panel standalone series");
- //Only fetch elements with the tag "ul" (unordered list) within this class. These contain the individual SCPs as list items (tagged as "li")
- tags = tags.get(0).getElementsByTag("ul");
- //Create a new array with the length we need for our titles (passed over from the main)
- String[] list = new String[length];
- //Now pass over the "li" elements from the elements array "tags" to the new array "list"
- for(int i = 0, totalSize = 0; i + 1 < tags.size(); i++) {
- Elements tags2 = tags.get(i + 1).getElementsByTag("li");
- int size = tags2.size();
- //Each "ul" tag contains 100 SCPs (99 for the first batch of series 1) in the form of "li" elements. This is why we need two loops:
- //The main loop goes through each "ul" element from "tags" and the second loop saves all the "li" elements within into our new array
- for (int a = 0; a < size; a++) {
- list[a + totalSize] = tags2.get(a).text() + ";;"; //We also add ";;" to the end of each line for easier parsing later
- }
- //Update "totalSize" with the amount of list elements we extracted in the current loop so the next loop can continue where this one left off
- totalSize = totalSize + size;
- }
- System.out.println("Cleaning up...");
- //This loop here is used to remove excessive or problematic characters and lines so our end result is consistent and only contains what we want
- //It also saves any lines from the list array that aren't faulty into our final array bigList
- for(int i = 0; i < list.length; i++) {
- if(list[i] != null) {
- if(list[i].contains(" - ")) {
- list[i] = list[i].substring(list[i].indexOf(" - ")+1);
- if(list[i].startsWith("- ")) {
- list[i] = list[i].substring(2);
- }
- list[i] = list[i].replace("\"", "'");
- }
- bigList[i + bigSize] = list[i];
- }
- }
- //Update the bigSize variable so the program knows which array index we stopped at
- //The next time we call getSCPs it will save new elements to indexes past this point
- bigSize = bigSize + list.length;
- }
- //For further use of the data we gathered we write it into a txt file
- public static void saveToFile(String path) throws IOException {
- //Initiate the FileWriter with the designated file path from our main
- FileWriter f = new FileWriter(path);
- BufferedWriter bW = new BufferedWriter(f);
- System.out.println("Saving data to file...");
- //Go through our bigList array and write down all lines until we hit the maxSCP number
- for(int i = 0; i < maxSCP; i++) {
- bW.write(bigList[i]);
- bW.newLine();
- }
- bW.close();
- System.out.println("File overwritten successfully!");
- }
- //Get the lines from the existing txt file and save them into the array oldBigList
- public static void readFromFile(String path) throws IOException {
- //Start the scanner with our specified path from the main
- File f = new File(path);
- Scanner s = new Scanner(f);
- System.out.println("Reading file content...");
- //Go through the lines and save them into the array
- for(int i = 0; s.hasNext(); i++) {
- oldbigList[i] = s.nextLine();
- }
- s.close();
- }
- //Compare bigList to oldBigList and point out any differences
- public static void compareLists() {
- System.out.println("Comparing new data to old file...");
- //Compare the elements of both arrays in order and give out an alert, if an entry's title has changed
- for(int i = 0; i < maxSCP; i++) {
- if(bigList[i].equals(oldbigList[i]) ) {}
- else {
- System.out.println("Changes to SCP " + (i + 1) + " detected!");
- }
- }
- System.out.println("");
- }
- }
Add Comment
Please, Sign In to add comment