Advertisement
Guest User

JCrawler.java

a guest
May 29th, 2010
503
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 1.56 KB | None | 0 0
  1. package jcrawler;
  2.  
  3. import java.io.IOException;
  4. import java.io.StringReader;
  5. import java.util.List;
  6. import java.util.ArrayList;
  7.  
  8. import javax.swing.text.html.parser.ParserDelegator;
  9. import javax.swing.text.html.HTMLEditorKit.ParserCallback;
  10. import javax.swing.text.html.HTML.Tag;
  11. import javax.swing.text.html.HTML.Attribute;
  12. import javax.swing.text.MutableAttributeSet;
  13.  
  14. public class HTMLUtils {
  15.   private HTMLUtils() {}
  16.  
  17.   public static List<String> extractLinks(String buffer) throws IOException {
  18.     final ArrayList<String> list = new ArrayList<String>();
  19.     StringReader myStringReader = new StringReader(buffer);
  20.     ParserDelegator parserDelegator = new ParserDelegator();
  21.     ParserCallback parserCallback = new ParserCallback() {
  22.             @Override
  23.       public void handleText(final char[] data, final int pos) { }
  24.             @Override
  25.       public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
  26.         if (tag == Tag.A) {
  27.           String address = (String) attribute.getAttribute(Attribute.HREF);
  28.           list.add(address);
  29.         }
  30.       }
  31.             @Override
  32.       public void handleEndTag(Tag t, final int pos) {  }
  33.             @Override
  34.       public void handleSimpleTag(Tag t, MutableAttributeSet a, final int pos) { }
  35.             @Override
  36.       public void handleComment(final char[] data, final int pos) { }
  37.             @Override
  38.       public void handleError(final java.lang.String errMsg, final int pos) { }
  39.     };
  40.     parserDelegator.parse(myStringReader, parserCallback, true);
  41.     return list;
  42.   }
  43. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement