Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.io.IOException;
- import java.util.Iterator;
- import java.util.LinkedHashMap;
- import java.util.LinkedHashSet;
- import java.util.Map;
- import java.util.Set;
- import java.util.concurrent.TimeUnit;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- import com.google.common.base.Stopwatch;
- public class TestAnchor {
- private static final String TOC_ANCHOR = "div[id=toc] ul>li a[href^=#]:not([href=#])";
- private static final String PLAIN_ANCHOR_A_TAG = "a[href^=#]:not([href=#])";
- private static final int MAX_ANCHOR_LINKS = 500;
- // only <div id="bodyContent"> section
- //private static final String PATTERN_BODY_ROOT = "div[id=bodyContent]";
- public static Map<String, String> parseHTML(String url) throws IOException {
- Map<String, String> anchorContents = new LinkedHashMap<String, String>();
- Document doc = Jsoup.connect(url).get();
- Elements rootElements = doc.getAllElements(); // select(PATTERN_BODY_ROOT).first();
- for (Element rootElement : rootElements) {
- if (rootElement == null) { continue; }
- Set<String> anchors = getAnchors(rootElement);
- if (anchors.isEmpty())
- return anchorContents;
- StringBuilder remaining = new StringBuilder(rootElement.toString());
- Iterator<String> it = anchors.iterator();
- String current = it.next();
- while (it.hasNext() && remaining.length() > 0) {
- String next = it.next();
- anchorContents.put(
- current,
- getContentBetweenAnchor(remaining, current, next,
- "span", "id"));
- current = next;
- }
- // last one
- String lastTxt = Jsoup.parse(remaining.toString()).text();
- if (lastTxt.length() > 0) {
- anchorContents.put(current, lastTxt);
- }
- }
- return anchorContents;
- }
- public static Set<String> getAnchors(Element rootElement) {
- Set<String> anchors = new LinkedHashSet<String>() {
- private static final long serialVersionUID = 1L;
- @Override
- public boolean add(String e) {
- if (size() >= MAX_ANCHOR_LINKS)
- return false;
- return super.add(e);
- }
- };
- getAnchorsImpl(rootElement, TOC_ANCHOR, anchors);
- if (anchors.isEmpty()) {
- // no toc anchor found, then use
- getAnchorsImpl(rootElement, PLAIN_ANCHOR_A_TAG, anchors);
- }
- return anchors;
- }
- public static void getAnchorsImpl(Element rootElement,
- String anchorPattern, Set<String> anchors) {
- Elements elements = rootElement.select(anchorPattern);
- if (!elements.isEmpty()) {
- for (Element element : elements) {
- String href = element.attr("href");
- anchors.add(href.substring(1));
- }
- }
- }
- public static String getContentBetweenAnchor(StringBuilder remaining,
- String anchor1, String anchor2, String anchorElement,
- String anchorAttribute) throws IOException {
- StringBuilder sb = new StringBuilder();
- // the first group is the anchor text
- sb.append(matchAnchorRegexStr(anchor1, anchorElement, true))
- // the second group is the text between these 2 anchors
- .append("(.*)")
- // the third group is the remaing text
- .append("(")
- .append(matchAnchorRegexStr(anchor2, anchorElement, false))
- .append(".*)");
- System.out.println(sb);
- Matcher matcher = Pattern.compile(sb.toString(),
- Pattern.DOTALL | Pattern.MULTILINE).matcher(remaining);
- String matchedText = "";
- if (matcher.find()) {
- String anchorText = Jsoup.parse(matcher.group(1)).text();
- matchedText = anchorText + " "
- + Jsoup.parse(matcher.group(2)).text();
- String newRemaining = matcher.group(3);
- remaining.setLength(0);
- remaining.append(newRemaining);
- }
- return matchedText;
- }
- public static String matchAnchorRegexStr(String anchor1,
- String anchorElement, boolean cpatureAnchorText) {
- StringBuilder sb = new StringBuilder().append("<")
- .append(anchorElement).append("[^>]*").append("\s*")
- .append("(?:"|')?").append(anchor1).append("(?:'|")?[^>]*>");
- if (cpatureAnchorText) {
- sb.append("([^<]*)");
- } else {
- sb.append("[^<]*");
- }
- return sb.append("</").append(anchorElement).append(">").toString();
- }
- public static void main(String[] args) throws Exception {
- Stopwatch stopwatch = Stopwatch.createStarted();
- String url = "http://en.wikipedia.org/wiki/Baked_potato";
- Map<String, String> anchorContents = parseHTML(url);
- System.out.println(anchorContents);
- System.out.println("Took " + stopwatch.elapsed(TimeUnit.MILLISECONDS));
- stopwatch.stop();
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement