Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- java-bbcNews-scraper
- ---------
- import java.io.*;
- import java.util.Scanner;
- import org.jsoup.*;
- import org.jsoup.nodes.*;
- import java.util.regex.Pattern;
- import java.util.regex.Matcher;
- public class Scraper{
- public static void main (String[] args){
- // initialize primitive objects
- int choice;
- String finalHOutput;
- String finalSOutput;
- String finalDOutput;
- String output;
- String selection = "http://www.bbc.co.uk/news/uk/"; // given default value
- // initialize complex objects
- HeadlineRegex headlineRegex = new HeadlineRegex();
- SummaryRegex summaryRegex = new SummaryRegex();
- DateRegex dateRegex = new DateRegex();
- RegexVisitor visitor = new RegexVisitor();
- // take user input
- System.out.println("What kind of news story would you like to scrape?\n"
- + "Press: 1 for UK, 2 for Business,\n 3 for Politics, "
- + "4 for Tech,\n 5 for Science, or 6 for Health... ");
- Scanner reader = new Scanner(System.in);
- choice = reader.nextInt();
- // alter selection to fit user choice
- switch (choice){
- case (1):
- selection = "http://www.bbc.co.uk/news/uk/";
- break;
- case (2):
- selection = "http://bbc.co.uk/news/business/";
- break;
- case (3):
- selection = "http://bbc.co.uk/news/politics/";
- break;
- case (4):
- selection = "http://bbc.co.uk/news/technology/";
- break;
- case (5):
- selection = "http://bbc.co.uk/news/science_and_environment/";
- break;
- case (6):
- selection = "http://bbc.co.uk/news/health/";
- break;
- }
- // create new object of Document class and scrape HTML to populate it
- Document doc = null;
- try {
- doc = Jsoup.connect(selection).get();
- } catch (IOException ioe) {
- ioe.printStackTrace();
- }
- // send the HTML input to the visitor
- String input = doc.toString();
- visitor.setInput(input);
- // visit the regular expression
- visitor.visit(headlineRegex);
- finalHOutput = visitor.getFinalHOutput();
- visitor.visit(summaryRegex);
- finalSOutput = visitor.getFinalSOutput();
- visitor.visit(dateRegex);
- finalDOutput = visitor.getFinalDOutput();
- visitor.createOutput(finalHOutput, finalSOutput, finalDOutput);
- // get the output from the visitor
- output = visitor.getOutput();
- System.out.println(output + "\n");
- reader.close();
- }
- }
- // Allow visit-able objects to accept visitor
- public interface Visitable{
- public void accept(Visitor visitor);
- }
- // plan out the visitor's route
- public interface Visitor{
- // parse headline
- public String visit(HeadlineRegex headlineRegex);
- // parse summary
- public String visit(SummaryRegex summaryRegex);
- // parse date
- public String visit(DateRegex dateRegex);
- }
- //concrete element
- public class HeadlineRegex implements Visitable{
- // initialize variables
- private String finalHOutput;
- private Pattern headline = Pattern.compile("<span class=\"title-link__title-text\">[A-Za-z0-9 -;:.,!\"'/$]*</span>");
- // accept the visitor
- public void accept(Visitor visitor){
- visitor.visit(this);
- }
- // refine regular expression
- public String refineHOutput(String hOutput) {
- // requires StringBuilder object as Strings are immutable in Java
- StringBuilder sbHOutput = new StringBuilder(hOutput);
- sbHOutput.delete(0, 163);
- finalHOutput = sbHOutput.toString();
- finalHOutput = finalHOutput.replace("</span>]", "");
- return finalHOutput;
- }
- // getters
- public Pattern getHeadline() {
- return headline;
- }
- public String getFinalHOutput() {
- return finalHOutput;
- }
- }
- // concrete element
- public class SummaryRegex implements Visitable {
- // initialize variables
- private String finalSOutput;
- private Pattern summary = Pattern.compile("<p class=\"buzzard__summary\">[A-Za-z0-9 -;:.,!\\\"'/$]*</p>");
- // accept the visitor
- public void accept(Visitor visitor){
- visitor.visit(this);
- }
- // refine regular expression
- public String refineSOutput(String sOutput) {
- // requires StringBuilder object as Strings are immutable in Java
- StringBuilder sbSOutput = new StringBuilder(sOutput);
- sbSOutput.delete(0, 143);
- finalSOutput = sbSOutput.toString();
- finalSOutput = finalSOutput.replace("</p>]", "");
- return finalSOutput;
- }
- // getters
- public Pattern getSummary() {
- return summary;
- }
- public String getFinalSOutput() {
- return finalSOutput;
- }
- }
- // concrete element
- public class DateRegex implements Visitable {
- // initialize variables
- private String finalDOutput;
- private Pattern date = Pattern.compile("data-datetime=\"[A-Za-z0-9 -;:.,!\"'/$]*\">");
- // accept the visitor
- public void accept(Visitor visitor){
- visitor.visit(this);
- }
- // refine regular expression
- public String refineDOutput(String dOutput) {
- // requires StringBuilder object as Strings are immutable in Java
- StringBuilder sbDOutput = new StringBuilder(dOutput);
- sbDOutput.delete(0, 114);
- finalDOutput = sbDOutput.toString();
- finalDOutput = finalDOutput.replace("\">]", "");
- return finalDOutput;
- }
- // getters
- public Pattern getDate() {
- return date;
- }
- public String getFinalDOutput() {
- return finalDOutput;
- }
- }
- public class RegexVisitor implements Visitor {
- private String input;
- private String output;
- private String hOutput;
- private String finalHOutput;
- private String sOutput;
- private String finalSOutput;
- private String dOutput;
- private String finalDOutput;
- private boolean hFinds;
- private boolean sFinds;
- private boolean dFinds;
- // collect data about the regular expressions
- // use this data to return a matching String
- public String visit(HeadlineRegex headlineRegex) {
- Pattern headline = headlineRegex.getHeadline();
- Matcher hMatcher = headline.matcher(input);
- // find sub-string that contains pattern
- hFinds = hMatcher.find();
- if (hFinds == true ) {
- hOutput = hMatcher.toString();
- } else {
- hOutput = "Error";
- }
- // refine output to remove Object info and tags
- headlineRegex.refineHOutput(hOutput);
- finalHOutput = headlineRegex.getFinalHOutput();
- // return String
- return finalHOutput;
- }
- public String visit(SummaryRegex summaryRegex){
- Pattern summary = summaryRegex.getSummary();
- Matcher sMatcher = summary.matcher(input);
- // find sub-string that contains pattern
- sFinds = sMatcher.find();
- if (sFinds == true ) {
- sOutput = sMatcher.toString();
- } else {
- sOutput = "Error";
- }
- // refine output to remove Object info and tags
- summaryRegex.refineSOutput(sOutput);
- finalSOutput = summaryRegex.getFinalSOutput();
- return finalSOutput;
- }
- public String visit(DateRegex dateRegex){
- Pattern date = dateRegex.getDate();
- Matcher dMatcher = date.matcher(input);
- // find sub-string that contains pattern
- dFinds = dMatcher.find();
- if (dFinds == true ) {
- dOutput = dMatcher.toString();
- } else {
- dOutput = "Error";
- }
- // refine output to remove object info and tags
- dateRegex.refineDOutput(dOutput);
- finalDOutput = dateRegex.getFinalDOutput();
- return finalDOutput;
- }
- // concatenate the outputs
- public String createOutput(String finalHOutput, String finalSOutput, String finalDOutput){
- output = "\n" + finalHOutput + "\n" + finalSOutput + "\n" + finalDOutput;
- return output;
- }
- // getters and setters
- // collect input
- public String getInput() {
- return input;
- }
- public void setInput(String input) {
- this.input = input;
- }
- public String getFinalHOutput() {
- return finalHOutput;
- }
- public String getFinalSOutput() {
- return finalSOutput;
- }
- public String getFinalDOutput() {
- return finalDOutput;
- }
- public String getOutput() {
- return output;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement