Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- * Write a program to extract all hyperlinks (<href=…>) from a given text.
- * The text comes from the console on a variable number of lines and ends with the command "END".
- * Print at the console the href values in the text. The input text is standard HTML code.
- * It may hold many tags and can be formatted in many different forms (with or without whitespace).
- * The <a> elements may have many attributes, not only href.
- * You should extract only the values of the href attributes of all <a> elements.
- * The input will be a well-formed HTML fragment (all tags and attributes will be correctly closed).
- * Attribute values will never hold tags and hyperlinks, e.g. "<img alt='<a href="hello">' />" is invalid.
- * Commented links are also extracted.
- * The number of input lines will be in the range [1 ... 100].
- * Print at the console the href values in the text, each at a separate line, in the order they come from the input.
- */
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- public class ExtractHyperlinks_Regex {
- public static void main(String[] args) throws IOException {
- StringBuilder htmlText = new StringBuilder();
- BufferedReader reader =
- new BufferedReader(new InputStreamReader(System.in));
- String inputLine = reader.readLine();
- while (!inputLine.equals("END")) {
- htmlText.append(inputLine);
- inputLine = reader.readLine();
- }
- Pattern hyperlinkPatt = Pattern.compile("<a(?:[^>]+?)href\\s*=\\s*(\"|'|\\s?)(.+?)\\1(?=\\s|>)");
- Matcher match = hyperlinkPatt.matcher(htmlText);
- while (match.find()) {
- System.out.println(match.group(2));
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement