Advertisement
Guest User

Untitled

a guest
Apr 25th, 2018
105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.66 KB | None | 0 0
  1. java-bbcNews-scraper
  2. ---------
  3. import java.io.*;
  4. import java.util.Scanner;
  5. import org.jsoup.*;
  6. import org.jsoup.nodes.*;
  7. import java.util.regex.Pattern;
  8. import java.util.regex.Matcher;
  9.  
  10.  
  11. public class Scraper{
  12. public static void main (String[] args){
  13. // initialize primitive objects
  14. int choice;
  15. String finalHOutput;
  16. String finalSOutput;
  17. String finalDOutput;
  18. String output;
  19. String selection = "http://www.bbc.co.uk/news/uk/"; // given default value
  20.  
  21. // initialize complex objects
  22. HeadlineRegex headlineRegex = new HeadlineRegex();
  23. SummaryRegex summaryRegex = new SummaryRegex();
  24. DateRegex dateRegex = new DateRegex();
  25. RegexVisitor visitor = new RegexVisitor();
  26.  
  27. // take user input
  28. System.out.println("What kind of news story would you like to scrape?\n"
  29. + "Press: 1 for UK, 2 for Business,\n 3 for Politics, "
  30. + "4 for Tech,\n 5 for Science, or 6 for Health... ");
  31. Scanner reader = new Scanner(System.in);
  32. choice = reader.nextInt();
  33.  
  34. // alter selection to fit user choice
  35. switch (choice){
  36. case (1):
  37. selection = "http://www.bbc.co.uk/news/uk/";
  38. break;
  39. case (2):
  40. selection = "http://bbc.co.uk/news/business/";
  41. break;
  42. case (3):
  43. selection = "http://bbc.co.uk/news/politics/";
  44. break;
  45. case (4):
  46. selection = "http://bbc.co.uk/news/technology/";
  47. break;
  48. case (5):
  49. selection = "http://bbc.co.uk/news/science_and_environment/";
  50. break;
  51. case (6):
  52. selection = "http://bbc.co.uk/news/health/";
  53. break;
  54. }
  55.  
  56. // create new object of Document class and scrape HTML to populate it
  57. Document doc = null;
  58. try {
  59. doc = Jsoup.connect(selection).get();
  60. } catch (IOException ioe) {
  61. ioe.printStackTrace();
  62. }
  63.  
  64. // send the HTML input to the visitor
  65. String input = doc.toString();
  66. visitor.setInput(input);
  67.  
  68. // visit the regular expression
  69. visitor.visit(headlineRegex);
  70. finalHOutput = visitor.getFinalHOutput();
  71. visitor.visit(summaryRegex);
  72. finalSOutput = visitor.getFinalSOutput();
  73. visitor.visit(dateRegex);
  74. finalDOutput = visitor.getFinalDOutput();
  75. visitor.createOutput(finalHOutput, finalSOutput, finalDOutput);
  76.  
  77. // get the output from the visitor
  78. output = visitor.getOutput();
  79.  
  80. System.out.println(output + "\n");
  81. reader.close();
  82. }
  83. }
  84.  
  85. // Allow visit-able objects to accept visitor
  86. public interface Visitable{
  87. public void accept(Visitor visitor);
  88. }
  89.  
  90. // plan out the visitor's route
  91. public interface Visitor{
  92. // parse headline
  93. public String visit(HeadlineRegex headlineRegex);
  94. // parse summary
  95. public String visit(SummaryRegex summaryRegex);
  96. // parse date
  97. public String visit(DateRegex dateRegex);
  98. }
  99.  
  100.  
  101. //concrete element
  102. public class HeadlineRegex implements Visitable{
  103.  
  104. // initialize variables
  105. private String finalHOutput;
  106. private Pattern headline = Pattern.compile("<span class=\"title-link__title-text\">[A-Za-z0-9 -;:.,!\"'/$]*</span>");
  107.  
  108. // accept the visitor
  109. public void accept(Visitor visitor){
  110. visitor.visit(this);
  111. }
  112.  
  113. // refine regular expression
  114. public String refineHOutput(String hOutput) {
  115. // requires StringBuilder object as Strings are immutable in Java
  116. StringBuilder sbHOutput = new StringBuilder(hOutput);
  117. sbHOutput.delete(0, 163);
  118. finalHOutput = sbHOutput.toString();
  119. finalHOutput = finalHOutput.replace("</span>]", "");
  120. return finalHOutput;
  121. }
  122.  
  123. // getters
  124. public Pattern getHeadline() {
  125. return headline;
  126. }
  127.  
  128. public String getFinalHOutput() {
  129. return finalHOutput;
  130. }
  131.  
  132.  
  133. }
  134.  
  135. // concrete element
  136. public class SummaryRegex implements Visitable {
  137.  
  138. // initialize variables
  139. private String finalSOutput;
  140. private Pattern summary = Pattern.compile("<p class=\"buzzard__summary\">[A-Za-z0-9 -;:.,!\\\"'/$]*</p>");
  141.  
  142.  
  143. // accept the visitor
  144. public void accept(Visitor visitor){
  145. visitor.visit(this);
  146. }
  147.  
  148. // refine regular expression
  149. public String refineSOutput(String sOutput) {
  150. // requires StringBuilder object as Strings are immutable in Java
  151. StringBuilder sbSOutput = new StringBuilder(sOutput);
  152. sbSOutput.delete(0, 143);
  153. finalSOutput = sbSOutput.toString();
  154. finalSOutput = finalSOutput.replace("</p>]", "");
  155. return finalSOutput;
  156. }
  157.  
  158. // getters
  159. public Pattern getSummary() {
  160. return summary;
  161. }
  162.  
  163. public String getFinalSOutput() {
  164. return finalSOutput;
  165. }
  166.  
  167. }
  168.  
  169. // concrete element
  170. public class DateRegex implements Visitable {
  171.  
  172. // initialize variables
  173. private String finalDOutput;
  174. private Pattern date = Pattern.compile("data-datetime=\"[A-Za-z0-9 -;:.,!\"'/$]*\">");
  175.  
  176.  
  177. // accept the visitor
  178. public void accept(Visitor visitor){
  179. visitor.visit(this);
  180. }
  181.  
  182. // refine regular expression
  183. public String refineDOutput(String dOutput) {
  184. // requires StringBuilder object as Strings are immutable in Java
  185. StringBuilder sbDOutput = new StringBuilder(dOutput);
  186. sbDOutput.delete(0, 114);
  187. finalDOutput = sbDOutput.toString();
  188. finalDOutput = finalDOutput.replace("\">]", "");
  189. return finalDOutput;
  190. }
  191.  
  192. // getters
  193. public Pattern getDate() {
  194. return date;
  195. }
  196.  
  197. public String getFinalDOutput() {
  198. return finalDOutput;
  199. }
  200.  
  201. }
  202.  
  203. public class RegexVisitor implements Visitor {
  204. private String input;
  205. private String output;
  206. private String hOutput;
  207. private String finalHOutput;
  208. private String sOutput;
  209. private String finalSOutput;
  210. private String dOutput;
  211. private String finalDOutput;
  212. private boolean hFinds;
  213. private boolean sFinds;
  214. private boolean dFinds;
  215.  
  216. // collect data about the regular expressions
  217. // use this data to return a matching String
  218. public String visit(HeadlineRegex headlineRegex) {
  219. Pattern headline = headlineRegex.getHeadline();
  220. Matcher hMatcher = headline.matcher(input);
  221. // find sub-string that contains pattern
  222. hFinds = hMatcher.find();
  223. if (hFinds == true ) {
  224. hOutput = hMatcher.toString();
  225. } else {
  226. hOutput = "Error";
  227. }
  228. // refine output to remove Object info and tags
  229. headlineRegex.refineHOutput(hOutput);
  230. finalHOutput = headlineRegex.getFinalHOutput();
  231.  
  232. // return String
  233. return finalHOutput;
  234. }
  235.  
  236. public String visit(SummaryRegex summaryRegex){
  237. Pattern summary = summaryRegex.getSummary();
  238. Matcher sMatcher = summary.matcher(input);
  239. // find sub-string that contains pattern
  240. sFinds = sMatcher.find();
  241. if (sFinds == true ) {
  242. sOutput = sMatcher.toString();
  243. } else {
  244. sOutput = "Error";
  245. }
  246. // refine output to remove Object info and tags
  247. summaryRegex.refineSOutput(sOutput);
  248. finalSOutput = summaryRegex.getFinalSOutput();
  249. return finalSOutput;
  250. }
  251.  
  252. public String visit(DateRegex dateRegex){
  253. Pattern date = dateRegex.getDate();
  254. Matcher dMatcher = date.matcher(input);
  255. // find sub-string that contains pattern
  256. dFinds = dMatcher.find();
  257. if (dFinds == true ) {
  258. dOutput = dMatcher.toString();
  259. } else {
  260. dOutput = "Error";
  261. }
  262. // refine output to remove object info and tags
  263. dateRegex.refineDOutput(dOutput);
  264. finalDOutput = dateRegex.getFinalDOutput();
  265. return finalDOutput;
  266. }
  267.  
  268. // concatenate the outputs
  269. public String createOutput(String finalHOutput, String finalSOutput, String finalDOutput){
  270. output = "\n" + finalHOutput + "\n" + finalSOutput + "\n" + finalDOutput;
  271. return output;
  272. }
  273.  
  274. // getters and setters
  275. // collect input
  276. public String getInput() {
  277. return input;
  278. }
  279.  
  280. public void setInput(String input) {
  281. this.input = input;
  282. }
  283.  
  284. public String getFinalHOutput() {
  285. return finalHOutput;
  286. }
  287.  
  288. public String getFinalSOutput() {
  289. return finalSOutput;
  290. }
  291.  
  292. public String getFinalDOutput() {
  293. return finalDOutput;
  294. }
  295.  
  296. public String getOutput() {
  297. return output;
  298. }
  299.  
  300. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement