Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.util.Scanner;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
/**
 * Reads a single line of HTML from standard input and prints its title and
 * the text of its body with every tag removed.
 *
 * <p>Output format:
 * <pre>
 * Title: &lt;text between &lt;title&gt; and &lt;/title&gt;&gt;
 * Content: &lt;body text, tags stripped, whitespace collapsed&gt;
 * </pre>
 * If the title or the body cannot be found, a "not matched" message is
 * printed instead and processing stops.
 */
public class HTMLParser {

    /** Captures everything between {@code <title>} and the last {@code </title>} (greedy). */
    private static final Pattern TITLE_PATTERN = Pattern.compile("<title>(?<title>.+)</title>");

    /** Captures everything between {@code <body>} and the last {@code </body>} (greedy). */
    private static final Pattern BODY_PATTERN = Pattern.compile("<body>(?<body>.+)</body>");

    /**
     * Matches one tag. Quoted attribute values are consumed as a unit, so a
     * {@code >} inside single or double quotes does not terminate the tag.
     */
    private static final Pattern TAG_PATTERN =
            Pattern.compile("<(\"[^\"]*?\"|'[^']*?'|[^'\">])*>");

    /**
     * Program entry point.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);
        // Treat completely empty input as an empty line instead of letting
        // nextLine() throw NoSuchElementException.
        String line = scanner.hasNextLine() ? scanner.nextLine() : "";

        String title = firstGroup(TITLE_PATTERN, "title", line);
        if (title == null) {
            System.out.println("Title not matched!");
            return;
        }
        System.out.println("Title: " + title);

        // The content is printed as a single string; tags nested inside the
        // body are ignored.
        String body = firstGroup(BODY_PATTERN, "body", line);
        if (body == null) {
            System.out.println("Body not matched!");
            return;
        }

        System.out.println("Content: " + toPlainText(body));
    }

    /**
     * Returns the named group of the first match of {@code pattern} in
     * {@code input}, or {@code null} when the pattern does not match.
     */
    private static String firstGroup(Pattern pattern, String groupName, String input) {
        Matcher matcher = pattern.matcher(input);
        return matcher.find() ? matcher.group(groupName) : null;
    }

    /**
     * Strips tags from {@code html}, replaces the literal two-character
     * sequence backslash + 'n' with a space, collapses runs of whitespace
     * into one space and trims the result.
     */
    private static String toPlainText(String html) {
        // Replace every tag in ONE pass. Replacing tag literals one by one on
        // the already-modified string can corrupt a later tag that happens to
        // contain an earlier tag's text inside a quoted attribute, leaving
        // that later tag in the output.
        String text = TAG_PATTERN.matcher(html).replaceAll(" ");

        // Clear "\n" escape sequences (literal backslash followed by 'n').
        text = text.replace("\\n", " ");

        // Collapse more than one whitespace character into a single space.
        return text.replaceAll("\\s+", " ").trim();
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement