Advertisement
Guest User

Untitled

a guest
Apr 23rd, 2014
33
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.20 KB | None | 0 0
  1. import java.io.IOException;
  2. import java.util.Iterator;
  3. import java.util.LinkedHashMap;
  4. import java.util.LinkedHashSet;
  5. import java.util.Map;
  6. import java.util.Set;
  7. import java.util.concurrent.TimeUnit;
  8. import java.util.regex.Matcher;
  9. import java.util.regex.Pattern;
  10.  
  11. import org.jsoup.Jsoup;
  12. import org.jsoup.nodes.Document;
  13. import org.jsoup.nodes.Element;
  14. import org.jsoup.select.Elements;
  15.  
  16. import com.google.common.base.Stopwatch;
  17.  
  18. public class TestAnchor {
  19. private static final String TOC_ANCHOR = "div[id=toc] ul>li a[href^=#]:not([href=#])";
  20. private static final String PLAIN_ANCHOR_A_TAG = "a[href^=#]:not([href=#])";
  21.  
  22. private static final int MAX_ANCHOR_LINKS = 500;
  23. // only <div id="bodyContent"> section
  24. //private static final String PATTERN_BODY_ROOT = "div[id=bodyContent]";
  25.  
  26. public static Map<String, String> parseHTML(String url) throws IOException {
  27. Map<String, String> anchorContents = new LinkedHashMap<String, String>();
  28.  
  29. Document doc = Jsoup.connect(url).get();
  30. Elements rootElements = doc.getAllElements(); // select(PATTERN_BODY_ROOT).first();
  31. for (Element rootElement : rootElements) {
  32. if (rootElement == null) { continue; }
  33. Set<String> anchors = getAnchors(rootElement);
  34. if (anchors.isEmpty())
  35. return anchorContents;
  36. StringBuilder remaining = new StringBuilder(rootElement.toString());
  37.  
  38. Iterator<String> it = anchors.iterator();
  39. String current = it.next();
  40. while (it.hasNext() && remaining.length() > 0) {
  41. String next = it.next();
  42. anchorContents.put(
  43. current,
  44. getContentBetweenAnchor(remaining, current, next,
  45. "span", "id"));
  46. current = next;
  47. }
  48. // last one
  49. String lastTxt = Jsoup.parse(remaining.toString()).text();
  50. if (lastTxt.length() > 0) {
  51. anchorContents.put(current, lastTxt);
  52. }
  53. }
  54. return anchorContents;
  55. }
  56.  
  57. public static Set<String> getAnchors(Element rootElement) {
  58. Set<String> anchors = new LinkedHashSet<String>() {
  59. private static final long serialVersionUID = 1L;
  60.  
  61. @Override
  62. public boolean add(String e) {
  63. if (size() >= MAX_ANCHOR_LINKS)
  64. return false;
  65. return super.add(e);
  66. }
  67. };
  68. getAnchorsImpl(rootElement, TOC_ANCHOR, anchors);
  69. if (anchors.isEmpty()) {
  70. // no toc anchor found, then use
  71. getAnchorsImpl(rootElement, PLAIN_ANCHOR_A_TAG, anchors);
  72. }
  73. return anchors;
  74. }
  75.  
  76. public static void getAnchorsImpl(Element rootElement,
  77. String anchorPattern, Set<String> anchors) {
  78. Elements elements = rootElement.select(anchorPattern);
  79. if (!elements.isEmpty()) {
  80. for (Element element : elements) {
  81. String href = element.attr("href");
  82. anchors.add(href.substring(1));
  83. }
  84. }
  85. }
  86.  
  87. public static String getContentBetweenAnchor(StringBuilder remaining,
  88. String anchor1, String anchor2, String anchorElement,
  89. String anchorAttribute) throws IOException {
  90. StringBuilder sb = new StringBuilder();
  91. // the first group is the anchor text
  92. sb.append(matchAnchorRegexStr(anchor1, anchorElement, true))
  93. // the second group is the text between these 2 anchors
  94. .append("(.*)")
  95. // the third group is the remaing text
  96. .append("(")
  97. .append(matchAnchorRegexStr(anchor2, anchorElement, false))
  98. .append(".*)");
  99.  
  100. System.out.println(sb);
  101. Matcher matcher = Pattern.compile(sb.toString(),
  102. Pattern.DOTALL | Pattern.MULTILINE).matcher(remaining);
  103. String matchedText = "";
  104. if (matcher.find()) {
  105. String anchorText = Jsoup.parse(matcher.group(1)).text();
  106. matchedText = anchorText + " "
  107. + Jsoup.parse(matcher.group(2)).text();
  108. String newRemaining = matcher.group(3);
  109. remaining.setLength(0);
  110. remaining.append(newRemaining);
  111. }
  112. return matchedText;
  113. }
  114.  
  115. public static String matchAnchorRegexStr(String anchor1,
  116. String anchorElement, boolean cpatureAnchorText) {
  117. StringBuilder sb = new StringBuilder().append("<")
  118. .append(anchorElement).append("[^>]*").append("\s*")
  119. .append("(?:"|')?").append(anchor1).append("(?:'|")?[^>]*>");
  120. if (cpatureAnchorText) {
  121. sb.append("([^<]*)");
  122. } else {
  123. sb.append("[^<]*");
  124. }
  125. return sb.append("</").append(anchorElement).append(">").toString();
  126. }
  127.  
  128. public static void main(String[] args) throws Exception {
  129. Stopwatch stopwatch = Stopwatch.createStarted();
  130. String url = "http://en.wikipedia.org/wiki/Baked_potato";
  131. Map<String, String> anchorContents = parseHTML(url);
  132. System.out.println(anchorContents);
  133. System.out.println("Took " + stopwatch.elapsed(TimeUnit.MILLISECONDS));
  134. stopwatch.stop();
  135.  
  136. }
  137.  
  138. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement