Advertisement
Guest User

Untitled

a guest
Aug 14th, 2018
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.32 KB | None | 0 0
  /// <summary>
  /// Downloads the sitemap at <paramref name="sitemapUrl"/> and yields every
  /// URL found in its text.
  /// </summary>
  /// <param name="sitemapUrl">Address of the sitemap document to download.</param>
  /// <returns>
  /// The well-formed absolute URIs found in the sitemap text, in document order.
  /// Nothing is yielded when the downloaded text is null, empty or whitespace.
  /// </returns>
  /// <remarks>
  /// This is an iterator method: the download and the scan only run once the
  /// returned sequence is enumerated, and they run again on each enumeration.
  /// </remarks>
  public IEnumerable<Uri> GetSitemapUrls(Uri sitemapUrl)
  {
      var sitemapText = GetSitemapText(sitemapUrl);

      if (string.IsNullOrWhiteSpace(sitemapText))
      {
          yield break;
      }

      // The escapes (\b, \., \f\n\r\t\v, \]) are required: without the backslashes
      // the pattern demands literal 'b' characters around the URL and never matches.
      const string urlPattern = @"\b(?:https?://|www\.)[^ \f\n\r\t\v\]]+\b";

      // The static Regex.Matches overload uses the framework's regex cache, so the
      // pattern is not re-parsed (or re-compiled) on every call.
      foreach (Match match in Regex.Matches(sitemapText, urlPattern, RegexOptions.IgnoreCase))
      {
          // The match may run into trailing markup (e.g. "</loc"); trim it once.
          var candidate = CleanUriString(match.Value);

          if (string.IsNullOrEmpty(candidate))
          {
              continue;
          }

          // A bare "www.example.com" has no scheme; new Uri(string) would throw
          // UriFormatException for it, so make it absolute first.
          if (candidate.StartsWith("www.", StringComparison.OrdinalIgnoreCase))
          {
              candidate = "http://" + candidate;
          }

          Uri uri;
          if (Uri.IsWellFormedUriString(candidate, UriKind.Absolute)
              && Uri.TryCreate(candidate, UriKind.Absolute, out uri))
          {
              yield return uri;
          }
      }
  }
  23.  
  /// <summary>
  /// Downloads the resource at <paramref name="sitemapUri"/> and returns its
  /// content as a string, decoded as UTF-8.
  /// </summary>
  /// <param name="sitemapUri">Address of the sitemap to download.</param>
  /// <returns>The downloaded text.</returns>
  string GetSitemapText(Uri sitemapUri)
  {
      // WebClient is IDisposable; dispose it so it is not leaked on every call.
      using (var client = new WebClient { Encoding = System.Text.Encoding.UTF8 })
      {
          return client.DownloadString(sitemapUri);
      }
  }
  33.  
  /// <summary>
  /// Trims a candidate URI string at the first character that is not allowed
  /// in a URI, discarding that character and everything after it.
  /// </summary>
  /// <param name="dirtyUriString">Raw text that starts with a URI and may carry trailing junk.</param>
  /// <returns>
  /// The leading part of <paramref name="dirtyUriString"/> up to (not including)
  /// the first illegal character. The input is returned unchanged when it is
  /// null or empty, contains no illegal character, or when its very first
  /// character is illegal (so a non-empty input is never reduced to an empty string).
  /// </returns>
  string CleanUriString(string dirtyUriString)
  {
      // RFC 3986 unreserved + reserved characters, plus '%' so percent-encoded
      // sequences such as "%20" are not treated as the end of the URI.
      const string legalCharacters =
          @"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=%";

      if (string.IsNullOrEmpty(dirtyUriString))
      {
          return dirtyUriString;
      }

      for (var index = 0; index < dirtyUriString.Length; index++)
      {
          if (legalCharacters.IndexOf(dirtyUriString[index]) >= 0)
          {
              continue;
          }

          // Stop at the FIRST illegal character; scanning on and overwriting the
          // result would leave earlier illegal characters in the returned string.
          return index > 0 ? dirtyUriString.Substring(0, index) : dirtyUriString;
      }

      return dirtyUriString;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement