Advertisement
Guest User

Extract Hyperlinks

a guest
Jun 1st, 2018
253
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.76 KB | None | 0 0
  1. import java.util.ArrayList;
  2. import java.util.List;
  3. import java.util.Scanner;
  4. import java.util.regex.Matcher;
  5. import java.util.regex.Pattern;
  6.  
  /**
   * Reads HTML text from standard input and prints the {@code href} value of
   * every hyperlink (an {@code <a ...>...</a>} element), one value per line, in
   * the order the links appear.
   *
   * <p>Input is consumed line by line until a line equal to {@code END} is read
   * or the input is exhausted, whichever comes first.
   *
   * <p>Limitation: the opening tag is taken to end at the first {@code >}, so a
   * {@code >} inside a quoted attribute value that precedes {@code href} hides
   * that link.
   *
   * <p>Created with IntelliJ IDEA by LAPD, 28.5.2018 10:37.
   */
  public class _16ExtractHyperlinks {

      /** Input line that terminates reading. */
      private static final String END_MARKER = "END";

      /**
       * A complete anchor element. The tag name must be exactly {@code a}
       * followed by whitespace, so {@code <abbr>}, {@code <article>} and the
       * attribute-less {@code <a>} are not treated as links. Group 1 is the
       * attribute text of the opening tag.
       */
      private static final Pattern ANCHOR = Pattern.compile("<a(\\s[^>]*)>[\\s\\S]*?</a>");

      /**
       * The {@code href} attribute inside an opening tag. The look-behind rejects
       * names that merely end in "href" (for example {@code data-href}). Groups
       * 1, 2 and 3 hold a double-quoted, single-quoted and unquoted value
       * respectively; exactly one of them is non-null after a match.
       */
      private static final Pattern HREF = Pattern.compile(
              "(?<![\\w-])href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s>]*))");

      /**
       * Program entry point: reads the text from {@code System.in} and writes the
       * extracted URLs to {@code System.out}.
       *
       * @param args command-line arguments (unused)
       */
      public static void main(String[] args) {
          Scanner console = new Scanner(System.in);

          for (String url : extractUrls(readUntilEnd(console))) {
              System.out.println(url);
          }
      }

      /**
       * Collects input lines up to (not including) the {@code END} marker.
       *
       * @param console scanner to read from
       * @return the lines read, each terminated by {@code '\n'}
       */
      private static String readUntilEnd(Scanner console) {
          StringBuilder text = new StringBuilder();

          // hasNextLine() guards against input that ends without the END marker,
          // which would otherwise make nextLine() throw NoSuchElementException.
          while (console.hasNextLine()) {
              String line = console.nextLine();
              if (END_MARKER.equals(line)) {
                  break;
              }
              // Keep the line break so attributes split across lines stay
              // separated by whitespace instead of being fused together.
              text.append(line).append('\n');
          }

          return text.toString();
      }

      /**
       * Finds every anchor element in {@code html} and returns its {@code href}
       * value. Anchors without an {@code href} attribute are skipped.
       *
       * @param html text to scan
       * @return the href values in document order, surrounding quotes removed
       */
      private static List<String> extractUrls(String html) {
          List<String> urls = new ArrayList<>();
          Matcher anchor = ANCHOR.matcher(html);

          while (anchor.find()) {
              // Only the opening tag is searched, so "href=" appearing in the
              // link text is never mistaken for the attribute.
              Matcher href = HREF.matcher(anchor.group(1));
              if (!href.find()) {
                  continue;
              }

              String value = href.group(1);
              if (value == null) {
                  value = href.group(2);
              }
              if (value == null) {
                  value = href.group(3);
              }

              // A quoted value may span several input lines; the line breaks
              // added while reading are not part of the URL.
              urls.add(value.replace("\n", ""));
          }

          return urls;
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement