Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
/**
 * A minimal model of an HTML page that exposes the hyperlink targets and the
 * lower-cased, space-separated words found in its markup.
 *
 * <p>Parsing is deliberately naive: links are recognised by the literal text
 * {@code a href} followed by a double-quoted value, and words are whatever
 * remains after stripping {@code <...>} tags and splitting on single spaces.
 * Single-quoted or unquoted {@code href} values are not recognised.
 */
public class Page {

    /** Literal text that marks the start of an anchor's href attribute. */
    private static final String LINK_MARKER = "a href";

    private String html;
    private List<String> links;
    private List<String> words;

    /** Creates a page with empty content. */
    public Page() {
        this("");
    }

    /**
     * Creates a page from the given markup.
     *
     * @param html the raw HTML; {@code null} is treated as empty content
     */
    public Page(String html) {
        setHTML(html);
    }

    /**
     * Replaces the page content and re-extracts its links and words.
     *
     * @param html the raw HTML; {@code null} is treated as empty content
     */
    public void setHTML(String html) {
        this.html = (html == null) ? "" : html;
        this.links = extractLinks(this.html);
        this.words = extractWords(this.html);
    }

    /**
     * Returns the number of links found in the page.
     *
     * @return the link count
     */
    public int numLinks() {
        return links.size();
    }

    /**
     * Returns the link at the given position, in document order.
     *
     * @param index zero-based position of the link
     * @return the link target exactly as written between the quotes
     * @throws IndexOutOfBoundsException if {@code index} is out of range
     */
    public String getLink(int index) {
        return links.get(index);
    }

    /**
     * Returns a copy of all link targets, in document order.
     *
     * @return a new array of link targets; never {@code null}
     */
    public String[] getLinks() {
        return links.toArray(new String[0]);
    }

    /**
     * Returns a copy of all words found in the page, lower-cased.
     *
     * @return a new array of words; never {@code null}
     */
    public String[] getWords() {
        return words.toArray(new String[0]);
    }

    /**
     * Strips tags from the markup, splits the remainder on single spaces and
     * lower-cases every token.
     */
    private static List<String> extractWords(String html) {
        // Attempt to remove HTML tags (reluctant match, so each tag is removed separately).
        String text = html.replaceAll("\\<.*?\\>", "");
        String[] tokens = text.split(" ");
        List<String> result = new ArrayList<String>(tokens.length);
        for (String token : tokens) {
            // Reassigning the loop variable would not change the array, so the
            // lower-cased value is stored explicitly. Locale.ROOT keeps the
            // result independent of the JVM's default locale.
            result.add(token.toLowerCase(Locale.ROOT));
        }
        return result;
    }

    /**
     * Collects the double-quoted value that follows each {@code a href}
     * occurrence. Scanning stops at the first occurrence that is not followed
     * by a complete quoted value; links found before that point are kept.
     */
    private static List<String> extractLinks(String html) {
        List<String> found = new ArrayList<String>();
        int searchFrom = 0;
        while (true) {
            int marker = html.indexOf(LINK_MARKER, searchFrom);
            if (marker < 0) {
                break;
            }
            int openQuote = html.indexOf('"', marker);
            if (openQuote < 0) {
                // No quoted value follows; bail out rather than let a -1 index
                // rewind the scan to the start of the document.
                break;
            }
            int closeQuote = html.indexOf('"', openQuote + 1);
            if (closeQuote < 0) {
                break;
            }
            found.add(html.substring(openQuote + 1, closeQuote));
            // Always moves forward, so the loop is guaranteed to terminate.
            searchFrom = closeQuote + 1;
        }
        return found;
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement