Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/// <summary>
/// Matches absolute http/https URLs and scheme-less "www." hosts inside arbitrary text.
/// The match runs up to the next whitespace character or closing square bracket.
/// </summary>
private static readonly Regex SitemapUrlRegex = new Regex(
    @"\b(?:https?://|www\.)[^ \f\n\r\t\v\]]+\b",
    RegexOptions.Compiled | RegexOptions.IgnoreCase);

/// <summary>
/// Downloads the sitemap at <paramref name="sitemapUrl"/> and lazily yields every
/// URL found in its text.
/// </summary>
/// <param name="sitemapUrl">Location of the sitemap document to scan.</param>
/// <returns>
/// The absolute, well-formed URLs found in the sitemap text, in document order.
/// Nothing is yielded when the downloaded text is null, empty or whitespace.
/// </returns>
/// <remarks>
/// This is an iterator: the download and the scan only start when the result is enumerated.
/// </remarks>
public IEnumerable<Uri> GetSitemapUrls(Uri sitemapUrl)
{
    var sitemapText = GetSitemapText(sitemapUrl);
    if (string.IsNullOrWhiteSpace(sitemapText))
    {
        yield break;
    }

    foreach (Match match in SitemapUrlRegex.Matches(sitemapText))
    {
        // The regex match can run into trailing markup (e.g. "</loc>"); trim it off.
        var candidate = CleanUriString(match.Value);
        if (string.IsNullOrEmpty(candidate))
        {
            continue;
        }

        // Scheme-less "www." matches are not absolute URIs; new Uri(string) would throw
        // on them, so give them an explicit scheme first.
        if (candidate.StartsWith("www.", StringComparison.OrdinalIgnoreCase))
        {
            candidate = "http://" + candidate;
        }

        Uri parsed;
        if (Uri.IsWellFormedUriString(candidate, UriKind.Absolute)
            && Uri.TryCreate(candidate, UriKind.Absolute, out parsed))
        {
            yield return parsed;
        }
    }
}
/// <summary>
/// Downloads the resource at <paramref name="sitemapUri"/> and returns its content
/// as a string decoded as UTF-8.
/// </summary>
/// <param name="sitemapUri">Address of the sitemap to download.</param>
/// <returns>The downloaded text.</returns>
string GetSitemapText(Uri sitemapUri)
{
    // WebClient is IDisposable; dispose it deterministically instead of leaking it.
    using (var wc = new WebClient { Encoding = System.Text.Encoding.UTF8 })
    {
        return wc.DownloadString(sitemapUri);
    }
}
/// <summary>
/// Returns the leading part of <paramref name="dirtyUriString"/> that consists only of
/// characters allowed in a URI, cutting the string at the first character that is not.
/// </summary>
/// <param name="dirtyUriString">A URL candidate that may carry trailing junk such as markup.</param>
/// <returns>
/// The prefix up to, but not including, the first illegal character; the whole input when
/// every character is legal; an empty string when the input is null or empty.
/// </returns>
string CleanUriString(string dirtyUriString)
{
    // RFC 3986 unreserved and reserved characters, plus '%' so that percent-encoded
    // octets (e.g. "%20") are not treated as junk.
    const string legalCharacters =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=%";

    if (string.IsNullOrEmpty(dirtyUriString))
    {
        return string.Empty;
    }

    for (var i = 0; i < dirtyUriString.Length; i++)
    {
        // Stop at the FIRST illegal character so nothing illegal survives in the result.
        if (legalCharacters.IndexOf(dirtyUriString[i]) < 0)
        {
            return dirtyUriString.Substring(0, i);
        }
    }

    return dirtyUriString;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement