ellapt

T14.25.ExtractHTMLtext

Feb 3rd, 2013
54
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.97 KB | None | 0 0
  1. using System;
  2. using System.IO;
  3. using System.Text.RegularExpressions;
  4.  
  /// <summary>
  /// Console program that reads the HTML file <c>..\..\test.htm</c> line by line
  /// and prints every run of text found outside the angle-bracket tags.
  /// The title is not treated specially: it is printed only because it is
  /// text located between tags, like any other fragment.
  /// </summary>
  class ExtractHTMLtext
  {
      /// <summary>Path of the HTML file to read, relative to the working directory.</summary>
      private const string InputPath = @"..\..\test.htm";

      /// <summary>
      /// Matches a run of characters containing neither '&lt;' nor '&gt;' that is
      /// preceded by the start of the line or by '&gt;', and followed by '&lt;'
      /// or by the end of the line. Built once and reused for every line.
      /// </summary>
      private static readonly Regex TextFragmentPattern =
          new Regex(@"(?<=^|>)[^><]+?(?=<|$)");

      /// <summary>
      /// Prints a short banner, then writes each non-blank text fragment of the
      /// input file on its own console line, followed by a final empty line.
      /// </summary>
      static void Main()
      {
          Console.WriteLine("Extract from HTML file its title (if available),\nand its body text without the HTML tags\n");

          Console.WriteLine(@"The original text file is named '..\..\test.htm'");

          Console.WriteLine("\nPlease, open this file to check the result.");

          // Create the reader inside the using statement so that it is disposed
          // on every exit path and is not reachable after disposal.
          using (StreamReader reader = new StreamReader(InputPath))
          {
              string line;
              while ((line = reader.ReadLine()) != null)
              {
                  foreach (Match fragment in TextFragmentPattern.Matches(line))
                  {
                      // Indentation before a tag also satisfies the pattern;
                      // skip such whitespace-only fragments so that only
                      // actual text is printed.
                      if (string.IsNullOrWhiteSpace(fragment.Value))
                      {
                          continue;
                      }

                      Console.WriteLine(fragment.Value);
                  }
              }
          }

          Console.WriteLine();
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment