Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Rough timing comparison of two ways to pull link targets out of one HTML page:
// a regular expression versus HtmlAgilityPack's XPath query support.
// Each approach runs once and is timed with a Stopwatch; the elapsed milliseconds
// are written to the console (regex first, HtmlAgilityPack second).
// NOTE(review): a single un-warmed run includes one-time costs (regex construction,
// JIT), so treat the numbers as indicative only.

//download html
string html;
using (WebClient wb = new WebClient())
{
    // WebClient is IDisposable; release it as soon as the page text is in memory.
    html = wb.DownloadString("http://www.stackoverflow.com");
}

// regex test
// The pattern captures the value of every href attribute into group 1, whether the
// value is quoted (single or double quotes) or unquoted. It is not restricted to
// <a> elements, so it can report hrefs that the XPath query below does not.
Stopwatch regexStopwatch = Stopwatch.StartNew();
List<string> ListOfURLs = new List<string>();
for (Match m = Regex.Match(html, "href\\s*=\\s*(?:[\"'](?<1>[^\"']*)[\"']|(?<1>\\S+))", RegexOptions.IgnoreCase);
     m.Success;
     m = m.NextMatch())
{
    string urlChecked = m.Groups[1].Value.Trim();

    // Skip empty attribute values such as href="".
    if (!string.IsNullOrEmpty(urlChecked))
    {
        ListOfURLs.Add(urlChecked);
    }
}
regexStopwatch.Stop();
Console.WriteLine(regexStopwatch.ElapsedMilliseconds);

// agility test
Stopwatch agilityStopwatch = Stopwatch.StartNew();
HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.OptionFixNestedTags = true;
htmlDoc.LoadHtml(html);

// SelectNodes returns null (not an empty collection) when no node matches the
// XPath expression, so guard before running LINQ over the result.
HtmlAgilityPack.HtmlNodeCollection anchorNodes = htmlDoc.DocumentNode.SelectNodes("//a[@href]");
List<string> linkList = anchorNodes == null
    ? new List<string>()
    : anchorNodes
        .Select(n => n.GetAttributeValue("href", string.Empty).Trim())
        .Where(link => !string.IsNullOrEmpty(link))
        .ToList();
agilityStopwatch.Stop();
Console.WriteLine(agilityStopwatch.ElapsedMilliseconds);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement