Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Guards the check-then-add on _uriCollection, which runs concurrently on thread-pool threads.
private readonly object _uriCollectionLock = new object();

/// <summary>
/// Thread-pool entry point: crawls one page and queues every newly discovered link for crawling.
/// </summary>
/// <param name="o">The <see cref="Uri"/> of the page to crawl, boxed as the work-item state.</param>
/// <remarks>
/// Failures are logged rather than thrown. <c>_eventWaitHandle</c> is set when the outstanding
/// work count (<c>_threadCount</c>) drops back to zero.
/// </remarks>
protected void CrawlUri(object o)
{
    // Count this work item, then share the code path used for links queued below.
    Interlocked.Increment(ref _threadCount);
    CrawlCountedUri(o);
}

/// <summary>
/// Crawls a page whose work item has ALREADY been added to <c>_threadCount</c>.
/// </summary>
/// <param name="o">The <see cref="Uri"/> of the page to crawl, boxed as the work-item state.</param>
private void CrawlCountedUri(object o)
{
    try
    {
        Uri uri = (Uri)o;
        foreach (Match match in _regex.Matches(GetWebResponse(uri)))
        {
            // Skip links that cannot be resolved instead of abandoning the rest of the page.
            Uri newUri;
            if (!Uri.TryCreate(uri, match.Value, out newUri))
            {
                continue;
            }

            // Contains + Add must be atomic, otherwise two threads can both add (and crawl) the same Uri.
            bool isNew;
            lock (_uriCollectionLock)
            {
                isNew = !_uriCollection.Contains(newUri);
                if (isNew)
                {
                    _uriCollection.Add(newUri);
                }
            }

            if (isNew)
            {
                // Count the child BEFORE it is queued. If it were counted only when it starts
                // running, this thread could finish first, see the count hit zero and signal
                // completion while children are still waiting in the pool.
                // NOTE(review): this queues CrawlCountedUri directly instead of _waitCallback,
                // assuming _waitCallback simply wraps CrawlUri — confirm.
                Interlocked.Increment(ref _threadCount);
                try
                {
                    ThreadPool.QueueUserWorkItem(new WaitCallback(CrawlCountedUri), newUri);
                }
                catch
                {
                    // The child will never run, so take its count back before propagating.
                    Interlocked.Decrement(ref _threadCount);
                    throw;
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Crawling is best-effort: record the failure and let the remaining work items continue.
        System.Diagnostics.Trace.TraceError("CrawlUri failed for '{0}': {1}", o, ex);
    }
    finally
    {
        // Use the value returned by the atomic decrement; re-reading the field afterwards
        // could observe a count changed by another thread.
        if (Interlocked.Decrement(ref _threadCount) == 0)
        {
            _eventWaitHandle.Set();
        }
    }
}
// The crawl starts from the scheme-and-authority part of the incoming request URL.
string authority = context.Request.Url.GetLeftPart(UriPartial.Authority);
Uri root = new Uri(authority);

// Hand the root Uri to the thread pool; crawling proceeds from there on pool threads.
ThreadPool.QueueUserWorkItem(_waitCallback, root);

// TEMP SOLUTION: pause for 5 seconds before blocking on the wait handle.
Thread.Sleep(5000);
_eventWaitHandle.WaitOne();

// Serve the XML sitemap.
context.Response.ContentType = "text/xml";
string sitemapXml = GetXml().OuterXml;
context.Response.Write(sitemapXml);
/// <summary>
/// Crawls pages with asynchronous HTTP requests, following every link matched by
/// <c>regex</c>, and collects the distinct URIs that were discovered.
/// </summary>
public class CrawlUriTool
{
    // NOTE(review): nothing in the visible code assigns this field; it has to be set
    // before CrawlUri is called or the response callback cannot extract links — confirm where.
    private Regex regex;

    // Number of requests started whose callback has not finished yet.
    private int pendingRequests;

    // Distinct URIs discovered so far; guarded by uriCollectionSync while the crawl runs.
    private List<Uri> uriCollection;
    private readonly object uriCollectionSync = new object();

    // Signalled when pendingRequests drops back to zero.
    private ManualResetEvent crawlCompletedEvent;

    /// <summary>
    /// Crawls starting at <paramref name="uri"/> and blocks until every started request has completed.
    /// </summary>
    /// <param name="uri">The page to start crawling from.</param>
    /// <returns>The URIs discovered through links; the starting URI itself is not added up front.</returns>
    public List<Uri> CrawlUri(Uri uri)
    {
        this.pendingRequests = 0;
        this.uriCollection = new List<Uri>();
        this.crawlCompletedEvent = new ManualResetEvent(false);
        this.StartUriCrawl(uri);
        this.crawlCompletedEvent.WaitOne();
        return this.uriCollection;
    }

    /// <summary>
    /// Counts a new pending request and begins it asynchronously.
    /// </summary>
    /// <param name="uri">The page to request.</param>
    /// <remarks>
    /// If the request cannot be created or started, the pending count is released again
    /// and the exception is rethrown to the caller.
    /// </remarks>
    private void StartUriCrawl(Uri uri)
    {
        Interlocked.Increment(ref this.pendingRequests);
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
            request.BeginGetResponse(this.UriCrawlCallback, request);
        }
        catch
        {
            // No callback will ever run for this request, so its count must be released here;
            // otherwise pendingRequests never reaches zero and CrawlUri waits forever.
            this.CompleteRequest();
            throw;
        }
    }

    /// <summary>
    /// Completion callback for a request started by <see cref="StartUriCrawl"/>: reads the
    /// response, records each new link and starts crawling it.
    /// </summary>
    /// <param name="asyncResult">Async result whose state is the originating <see cref="HttpWebRequest"/>.</param>
    private void UriCrawlCallback(IAsyncResult asyncResult)
    {
        HttpWebRequest request = null;
        try
        {
            request = (HttpWebRequest)asyncResult.AsyncState;

            string responseText;
            Uri baseUri;

            // Dispose the response as soon as its text is read so the underlying
            // connection is released before further requests are started.
            using (HttpWebResponse response = (HttpWebResponse)request.EndGetResponse(asyncResult))
            {
                baseUri = response.ResponseUri;
                responseText = this.GetTextFromResponse(response); // not included
            }

            foreach (Match match in this.regex.Matches(responseText))
            {
                // Ignore links that cannot be resolved or that HttpWebRequest cannot fetch
                // (mailto:, ftp:, javascript:, ...).
                Uri newUri;
                if (!Uri.TryCreate(baseUri, match.Value, out newUri) || !IsHttpUri(newUri))
                {
                    continue;
                }

                bool isNew = false;
                lock (this.uriCollectionSync)
                {
                    if (!this.uriCollection.Contains(newUri))
                    {
                        this.uriCollection.Add(newUri);
                        isNew = true;
                    }
                }

                // Start the request outside the lock so other callbacks are not blocked on it.
                if (isNew)
                {
                    try
                    {
                        this.StartUriCrawl(newUri);
                    }
                    catch (Exception exception)
                    {
                        // One link that cannot be requested must not stop the rest of the page.
                        System.Diagnostics.Trace.TraceError("Could not start crawl of '{0}': {1}", newUri, exception);
                    }
                }
            }
        }
        catch (Exception exception)
        {
            // This runs on a callback thread, so nothing upstream can catch an escaping
            // exception; log it and treat the page as yielding no links.
            System.Diagnostics.Trace.TraceError(
                "Crawl of '{0}' failed: {1}",
                request == null ? null : request.RequestUri,
                exception);
        }
        finally
        {
            this.CompleteRequest();
        }
    }

    /// <summary>
    /// Releases one pending request and signals completion when it was the last one.
    /// </summary>
    private void CompleteRequest()
    {
        if (Interlocked.Decrement(ref this.pendingRequests) == 0)
        {
            this.crawlCompletedEvent.Set();
        }
    }

    /// <summary>
    /// Returns true when <paramref name="uri"/> uses the http or https scheme.
    /// </summary>
    private static bool IsHttpUri(Uri uri)
    {
        return uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps;
    }
}
Add Comment
Please, Sign In to add comment