Advertisement
Guest User

Untitled

a guest
Nov 29th, 2018
193
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 1.64 KB | None | 0 0
  1. import java.util.Scanner;
  2. import java.util.regex.Matcher;
  3. import java.util.regex.Pattern;
  4.  
  /**
   * Reads one line of HTML-like text from standard input and prints its title and the
   * plain-text content of its body.
   *
   * <p>Output is one of:
   * <ul>
   *   <li>{@code Title not matched!} when no {@code <title>...</title>} pair is found;</li>
   *   <li>{@code Title: ...} followed by {@code Body not matched!} when no
   *       {@code <body>...</body>} pair is found;</li>
   *   <li>{@code Title: ...} followed by {@code Content: ...} otherwise.</li>
   * </ul>
   */
  public class HTMLParser {
      /** Captures the text up to the FIRST closing title tag (reluctant quantifier). */
      private static final Pattern TITLE_PATTERN = Pattern.compile("<title>(?<title>.+?)</title>");

      /** Captures everything between the opening body tag and the LAST closing body tag. */
      private static final Pattern BODY_PATTERN = Pattern.compile("<body>(?<body>.+)</body>");

      /** Matches a tag; quoted attribute values may themselves contain {@code >}. */
      private static final Pattern TAG_PATTERN =
              Pattern.compile("<(\"[^\"]*?\"|'[^']*?'|[^'\">])*>");

      /** Matches a run of one or more whitespace characters. */
      private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

      /**
       * Program entry point: parses a single input line and prints the title and body text.
       *
       * @param args command-line arguments (unused)
       */
      public static void main(String[] args) {
          // The scanner wraps System.in, so it is deliberately not closed here.
          Scanner scanner = new Scanner(System.in);
          // Treat missing input as an empty line instead of failing with NoSuchElementException.
          String line = scanner.hasNextLine() ? scanner.nextLine() : "";

          Matcher titleMatcher = TITLE_PATTERN.matcher(line);
          if (!titleMatcher.find()) {
              System.out.println("Title not matched!");
              return;
          }
          System.out.println("Title: " + titleMatcher.group("title"));

          Matcher bodyMatcher = BODY_PATTERN.matcher(line);
          if (!bodyMatcher.find()) {
              System.out.println("Body not matched!");
              return;
          }
          System.out.println("Content: " + extractText(bodyMatcher.group("body")));
      }

      /**
       * Reduces raw body markup to a single line of plain text.
       *
       * @param body the markup found between the body tags
       * @return the text with tags removed and whitespace collapsed to single spaces
       */
      private static String extractText(String body) {
          // Each tag becomes a space so that words separated only by markup stay separated.
          String text = TAG_PATTERN.matcher(body).replaceAll(" ");
          // The input encodes line breaks as the two literal characters '\' and 'n'.
          text = text.replace("\\n", " ");
          return WHITESPACE_PATTERN.matcher(text).replaceAll(" ").trim();
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement